1 static char rcsid[] = "$Id: substring.c 223080 2020-09-13 14:20:25Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "substring.h"
7 #include <stdlib.h>
8 #include <string.h>
9 #include <ctype.h>
10 #include <math.h> /* For log and exp */
11
12 #include "assert.h"
13 #include "mem.h"
14 #include "maxent_hr.h"
15 #include "complement.h"
16 #include "genome128_hr.h"
17 #include "mapq.h"
18 #include "comp.h"
19 #include "splice.h"
20 #include "sense.h"
21 #include "simplepair.h" /* For Simplepair_new_out */
22
23
24 /* Designed to allow 1 match to offset 1 mismatch. To handle 2 matches vs 2 mismatches, penalize for multiple mismatches */
25 #define TRIM_MATCH_SCORE 1
26 #define TRIM_MISMATCH_SCORE_LAST -1 /* Requires 1 match to compensate */
27 #define TRIM_MISMATCH_SCORE_MULT -4 /* Requires 4 matches to compensate */
28
29 #define KNOWN_SPLICESITE_EDGE 1 /* Cannot be 0; otherwise will hit found splice sites */
30
31 #define CONSISTENT_TEXT "consistent"
32 #define TRANSLOCATION_TEXT "translocation"
33 #define INVERSION_TEXT "inversion"
34 #define SCRAMBLE_TEXT "scramble"
35
36 #define MIN_EXON_LENGTH 20
37
38
39 #ifdef CHECK_ASSERTIONS
40 #define CHECK_NMISMATCHES 1
41 #endif
42
43
44 #ifdef DEBUG
45 #define debug(x) x
46 #else
47 #define debug(x)
48 #endif
49
50 /* mark_mismatches */
51 #ifdef DEBUG1
52 #define debug1(x) x
53 #else
54 #define debug1(x)
55 #endif
56
57 /* Substring_new */
58 #ifdef DEBUG2
59 #define debug2(x) x
60 #else
61 #define debug2(x)
62 #endif
63
64 /* Substring_overlap_p and Substring_insert_length */
65 #ifdef DEBUG3
66 #define debug3(x) x
67 #else
68 #define debug3(x)
69 #endif
70
71
72 /* splice site probs */
73 #ifdef DEBUG4
74 #define debug4(x) x
75 #else
76 #define debug4(x)
77 #endif
78
79
80 /* Substring_convert_to_pairs */
81 #ifdef DEBUG6
82 #define debug6(x) x
83 #else
84 #define debug6(x)
85 #endif
86
87
88 /* contains known splicesite */
89 #ifdef DEBUG7
90 #define debug7(x) x
91 #else
92 #define debug7(x)
93 #endif
94
95
96 /* trimming. may also want to turn on DEBUG8 in pair.c */
97 #ifdef DEBUG8
98 #define debug8(x) x
99 #else
100 #define debug8(x)
101 #endif
102
103 /* bad_stretch_p */
104 #ifdef DEBUG9
105 #define debug9(x) x
106 #else
107 #define debug9(x)
108 #endif
109
110
111 /* Substring_set_alt */
112 #ifdef DEBUG10
113 #define debug10(x) x
114 #else
115 #define debug10(x)
116 #endif
117
118 /* Substring_circularpos */
119 #ifdef DEBUG12
120 #define debug12(x) x
121 #else
122 #define debug12(x)
123 #endif
124
125
126 #define ACCEPTABLE_TRIM 3
127 #define END_SPLICESITE_PROB_CLOSE 0.95
128 #define END_SPLICESITE_PROB_FAR 0.90
129 #define MAX_NCONSECUTIVE_CLOSE 6
130 #define MAX_NCONSECUTIVE_FAR 20 /* Needs to be generous to find splices */
131
132
133
134 #define LOG_99 -0.01005033585
135 #define LOG_01 -4.605170186
136
137 #if 0
138 /* Switches on 5 consecutive mismatches */
139 #define LOG_99_9999 -0.01015034085
140 #define LOG_99_0001 -9.220390708
141 #define LOG_25_0001 -10.59663473
142 #define LOG_25_9999 -1.386394366
143 #define LOG_01_9999 -4.605270191
144 #define LOG_01_0001 -13.81551056
145 #define LOG_75_0001 -9.498022444
146 #define LOG_75_9999 -0.2877820775
147 #endif
148
149 #if 0
150 #define LOG_99_999 -0.01105083619
151 #define LOG_99_001 -6.917805615
152 #define LOG_25_001 -8.29404964
153 #define LOG_25_999 -1.387294861
154 #define LOG_01_999 -4.606170686
155 #define LOG_01_001 -11.51292546
156 #define LOG_75_001 -7.195437351
157 #define LOG_75_999 -0.2886825728
158 #endif
159
160 /* Switches on 4 consecutive mismatches */
161 #define LOG_99_99 -0.02010067171
162 #define LOG_99_01 -4.615220522
163 #define LOG_25_01 -5.991464547
164 #define LOG_25_99 -1.396344697
165 #define LOG_01_99 -4.615220522
166 #define LOG_01_01 -9.210340372
167 #define LOG_75_01 -4.892852258
168 #define LOG_75_99 -0.2977324083
169
170
171 static bool print_nsnpdiffs_p;
172 static bool print_snplabels_p;
173 static bool show_refdiff_p;
174
175 static IIT_T snps_iit;
176 static int *snps_divint_crosstable;
177
178 static Genome_T genomebits;
179 static Genome_T genomebits_alt;
180
181 static Univ_IIT_T chromosome_iit;
182 static Univcoord_T genomelength;
183 static double genomesize; /* For BLAST E-value */
184 static int nchromosomes;
185 static int circular_typeint;
186
187 static bool novelsplicingp;
188 static bool knownsplicingp;
189 static Outputtype_T output_type;
190
191 static Mode_T mode;
192
193 static bool maskedp;
194
195 static char complCode[128] = COMPLEMENT_LC;
196
197 static char *
make_complement_inplace(char * sequence,unsigned int length)198 make_complement_inplace (char *sequence, unsigned int length) {
199 char temp;
200 unsigned int i, j;
201
202 for (i = 0, j = length-1; i < length/2; i++, j--) {
203 temp = complCode[(int) sequence[i]];
204 sequence[i] = complCode[(int) sequence[j]];
205 sequence[j] = temp;
206 }
207 if (i == j) {
208 sequence[i] = complCode[(int) sequence[i]];
209 }
210
211 return sequence;
212 }
213
214
215
216 #define T Substring_T
217
218 struct T {
219 /* Maintaining this pointer so we don't have to pass in the values
220 for Substring_display_prep based on plusp. Caller needs to keep
221 these values until all computation is finished. */
222 Compress_T query_compress; /* Pointer to the associated query_compress */
223
224
225 int nmismatches_bothdiff; /* Over region left after trimming */
226 int nmismatches_refdiff; /* Over region left after trimming */
227 /* nsnpdiffs = nmismatches_bothdiff - nmismatches_refdiff */
228
229 int nmatches_plus_spliced_trims;
230 int nmatches_to_trims;
231
232 int ref_nmatches_plus_spliced_trims;
233 int ref_nmatches_to_trims;
234
235 /* int trim_querystart; */
236 /* int trim_queryend; */
237 int mandatory_trim_querystart; /* Resulting from unalias, plus SOFT_CLIPS_AVOID_CIRCULARIZATION */
238 int mandatory_trim_queryend; /* Resulting from alias, plus SOFT_CLIPS_AVOID_CIRCULARIZATION */
239 int start_amb_length;
240 int end_amb_length;
241
242 bool trim_querystart_splicep;
243 bool trim_queryend_splicep;
244
245 Chrnum_T chrnum;
246 Univcoord_T chroffset;
247 Univcoord_T chrhigh;
248 Chrpos_T chrlength;
249
250 Univcoord_T left; /* for plus: alignstart - querystart(orig). for
251 minus: alignend - (querylength -
252 queryend(orig)). Set when substring is created
253 or made unambiguous, and remains constant */
254
255 Univcoord_T genomicstart; /* For region corresponding to entire querylength (if extrapolated) */
256 Univcoord_T genomicend;
257
258 Endtype_T start_endtype;
259 Endtype_T end_endtype;
260
261 int querystart_pretrim; /* From original invocation, before trimming */
262 int queryend_pretrim; /* From original invocation, before trimming */
263 int querystart; /* For part that aligns to genome, post-trim */
264 int queryend;
265
266 int querystart_chrbound; /* Bound on alignment if this were substring1 */
267 int queryend_chrbound; /* Bound on alignment if this were substringN */
268
269 int amb_splice_pos; /* Used for ambiguous substrings. Based on left, not querystart */
270 int querylength;
271
272 Univcoord_T alignstart_trim; /* For part that aligns to genome, excluding part that is trimmed (post-trim) */
273 Univcoord_T alignend_trim;
274
275 bool plusp;
276 int genestrand;
277
278 char *genomic_bothdiff; /* In same direction as query. NULL if same
279 as query. Has dashes outside of
280 querystart..(queryend-1) for indels and
281 splices. Has lowercase to indicate
282 trimmed regions, terminal regions,
283 deletions, splice dinucleotides, and
284 mismatches from ref and alt. Use for
285 MAPQ computations and scoring. */
286
287 char *genomic_refdiff; /* Same as above, but lowercase for
288 mismatches from ref only. For
289 non-SNP-tolerant alignment, this is just
290 a pointer to genomic_bothdiff. Use for
291 NM and MD computations. */
292
293 float mapq_loglik;
294
295 int sensedir;
296
297 Univcoord_T splicecoord_D;
298 int splicesitesD_knowni; /* Needed for intragenic_splice_p in stage1hr.c */
299
300 int siteD_pos;
301 double siteD_prob;
302
303 /* for shortexon (always use *_1 for acceptor and *_2 for donor) */
304 /* for donor/acceptor: the ambiguous position */
305 Univcoord_T splicecoord_A;
306 int splicesitesA_knowni;
307
308 int siteA_pos;
309 double siteA_prob;
310
311 /* Note: For DNA fusions, use both splicecoord_D and splicecoord_A */
312 int siteN_pos;
313
314
315 int alts_ncoords;
316 Univcoord_T *alts_coords;
317 int *alts_knowni;
318 int *alts_nmismatches;
319 int *alts_ref_nmismatches;
320 double *alts_probs;
321
322 Endtype_T amb_type; /* Ambiguous DONs or ACCs */
323 };
324
325
326 void
Substring_alias_circular(T this)327 Substring_alias_circular (T this) {
328 Chrpos_T chrlength;
329 int i;
330
331 if (this == NULL) {
332 /* Skip */
333
334 } else if (this->alts_ncoords > 0) {
335 /* Cannot rely on this->left, which is 0. Let mandatory_trim_querystart and mandatory_trim_queryend be 0 */
336 chrlength = this->chrlength;
337 for (i = 0; i < this->alts_ncoords; i++) {
338 this->alts_coords[i] += chrlength;
339 }
340
341 } else {
342 chrlength = this->chrlength;
343
344 if (this->left + this->querylength > this->chroffset + chrlength) {
345 debug2(printf("For alias, this->left %u + this->querylength %d > offset %u + chrlength %u\n",
346 this->left,this->querylength,this->chroffset,chrlength));
347 if (this->plusp == true) {
348 this->mandatory_trim_queryend = (this->left + this->querylength) - (this->chroffset + chrlength);
349 debug2(printf("For alias, plusp true, setting mandatory_trim_queryend to be %d\n",this->mandatory_trim_queryend));
350 assert(this->mandatory_trim_queryend >= 0);
351 } else {
352 this->mandatory_trim_querystart = (this->left + this->querylength) - (this->chroffset + chrlength);
353 debug2(printf("For alias, plusp false, setting mandatory_trim_querystart to be %d\n",this->mandatory_trim_querystart));
354 assert(this->mandatory_trim_querystart >= 0);
355 }
356 }
357
358 this->left += chrlength;
359 this->genomicstart += chrlength;
360 this->genomicend += chrlength;
361 this->alignstart_trim += chrlength;
362 this->alignend_trim += chrlength;
363 this->splicecoord_D += chrlength;
364 this->splicecoord_A += chrlength;
365 }
366
367 return;
368 }
369
370
371 void
Substring_unalias_circular(T this)372 Substring_unalias_circular (T this) {
373 Chrpos_T chrlength;
374 int i;
375
376 if (this == NULL) {
377 /* Skip */
378
379 } else if (this->alts_ncoords > 0) {
380 /* Cannot rely on this->left, which is 0. Let mandatory_trim_querystart and mandatory_trim_queryend be 0 */
381 chrlength = this->chrlength;
382 for (i = 0; i < this->alts_ncoords; i++) {
383 this->alts_coords[i] -= chrlength;
384 }
385
386 } else {
387 chrlength = this->chrlength;
388
389 if (this->left < this->chroffset + chrlength) {
390 debug2(printf("For unalias, this->left %u < chroffset %u + chrlength %d\n",
391 this->left,this->chroffset,chrlength));
392 if (this->plusp == true) {
393 this->mandatory_trim_querystart = (this->chroffset + chrlength) - this->left;
394 debug2(printf("For unalias, plusp true, setting mandatory_trim_querystart to be %d\n",this->mandatory_trim_querystart));
395 assert(this->mandatory_trim_querystart >= 0);
396 } else {
397 this->mandatory_trim_queryend = (this->chroffset + chrlength) - this->left;
398 debug2(printf("For unalias, plusp false, setting mandatory_trim_queryend to be %d\n",this->mandatory_trim_queryend));
399 assert(this->mandatory_trim_queryend >= 0);
400 }
401 }
402
403 this->left -= chrlength;
404 this->genomicstart -= chrlength;
405 this->genomicend -= chrlength;
406 this->alignstart_trim -= chrlength;
407 this->alignend_trim -= chrlength;
408 this->splicecoord_D -= chrlength;
409 this->splicecoord_A -= chrlength;
410 }
411
412 return;
413 }
414
415
416
417 static void
fill_w_dashes(char * string,int start,int end)418 fill_w_dashes (char *string, int start, int end) {
419 int i;
420
421 for (i = start; i < end; i++) {
422 string[i] = '-';
423 }
424 return;
425 }
426
427 static void
fill_w_stars(char * string,int start,int end)428 fill_w_stars (char *string, int start, int end) {
429 int i;
430
431 for (i = start; i < end; i++) {
432 string[i] = '*';
433 }
434 return;
435 }
436
437
438
439 void
Substring_free(T * old)440 Substring_free (T *old) {
441 if (*old) {
442 debug2(printf("Freeing substring %p\n",*old));
443 if ((*old)->alts_ncoords > 0) {
444 FREE_OUT((*old)->alts_coords);
445 FREE_OUT((*old)->alts_knowni);
446 FREE_OUT((*old)->alts_nmismatches);
447 FREE_OUT((*old)->alts_ref_nmismatches);
448 FREE_OUT((*old)->alts_probs);
449 }
450 if ((*old)->genomic_bothdiff != NULL) {
451 if ((*old)->genomic_refdiff != (*old)->genomic_bothdiff) {
452 FREE_OUT((*old)->genomic_refdiff);
453 }
454 FREE_OUT((*old)->genomic_bothdiff);
455 }
456
457 FREE_OUT(*old);
458 }
459 return;
460 }
461
462
463 void
Substring_list_gc(List_T * old)464 Substring_list_gc (List_T *old) {
465 List_T p;
466 T substring;
467
468 for (p = *old; p != NULL; p = p->rest) {
469 substring = (Substring_T) p->first;
470 Substring_free(&substring);
471 }
472 /* List_free(&(*old)); -- allocated by Listpool_push */
473 return;
474 }
475
476
477
478 bool
Substring_contains_p(T this,int querypos)479 Substring_contains_p (T this, int querypos) {
480 if (querypos >= this->querystart && querypos < this->queryend) {
481 return true;
482 } else {
483 return false;
484 }
485 }
486
487
488 int
Substring_compare(T substring1,T substring2,int alias1,int alias2,Chrpos_T chrlength1,Chrpos_T chrlength2)489 Substring_compare (T substring1, T substring2, int alias1, int alias2, Chrpos_T chrlength1, Chrpos_T chrlength2) {
490 Univcoord_T alignstart1, alignend1, alignstart2, alignend2;
491
492 if (substring1 == NULL && substring2 == NULL) {
493 return 0;
494 } else if (substring1 == NULL) {
495 return -1;
496 } else if (substring2 == NULL) {
497 return +1;
498 } else {
499 alignstart1 = substring1->alignstart_trim;
500 alignend1 = substring1->alignend_trim;
501 if (alias1 < 0) {
502 alignstart1 += chrlength1;
503 alignend1 += chrlength1;
504 }
505
506 alignstart2 = substring2->alignstart_trim;
507 alignend2 = substring2->alignend_trim;
508 if (alias2 < 0) {
509 alignstart2 += chrlength2;
510 alignend2 += chrlength2;
511 }
512
513 if (alignstart1 < alignstart2) {
514 return -1;
515 } else if (alignstart1 > alignstart2) {
516 return +1;
517 } else if (alignend1 < alignend2) {
518 return -1;
519 } else if (alignend1 > alignend2) {
520 return +1;
521 } else {
522 return 0;
523 }
524 }
525 }
526
527
528 bool
Substring_equal_p(T substring1,T substring2)529 Substring_equal_p (T substring1, T substring2) {
530 Univcoord_T low1, high1, low2, high2;
531
532 if (substring1->plusp == true) {
533 low1 = substring1->alignstart_trim;
534 high1 = substring1->alignend_trim;
535 } else {
536 low1 = substring1->alignend_trim;
537 high1 = substring1->alignstart_trim;
538 }
539 if (high1 > 0) {
540 high1 -= 1;
541 }
542
543 if (substring2->plusp == true) {
544 low2 = substring2->alignstart_trim;
545 high2 = substring2->alignend_trim;
546 } else {
547 low2 = substring2->alignend_trim;
548 high2 = substring2->alignstart_trim;
549 }
550 if (high2 > 0) {
551 high2 -= 1;
552 }
553
554 debug3(printf("Checking equivalence between %u..%u and %u..%u",low1,high1,low2,high2));
555
556 if (low1 == low2 && high1 == high2) {
557 return true;
558 } else {
559 return false;
560 }
561 }
562
563
564 bool
Substring_overlap_p(T substring1,T substring2)565 Substring_overlap_p (T substring1, T substring2) {
566 Univcoord_T low1, high1, low2, high2;
567
568 if (substring1->plusp == true) {
569 low1 = substring1->alignstart_trim;
570 high1 = substring1->alignend_trim;
571 } else {
572 low1 = substring1->alignend_trim;
573 high1 = substring1->alignstart_trim;
574 }
575 if (high1 > 0) {
576 high1 -= 1;
577 }
578
579 if (substring2->plusp == true) {
580 low2 = substring2->alignstart_trim;
581 high2 = substring2->alignend_trim;
582 } else {
583 low2 = substring2->alignend_trim;
584 high2 = substring2->alignstart_trim;
585 }
586 if (high2 > 0) {
587 high2 -= 1;
588 }
589
590 debug3(printf("Checking overlap between %u..%u and %u..%u",low1,high1,low2,high2));
591
592 if (high2 < low1) {
593 debug3(printf(" => no because %u < %u\n",high2,low1));
594 return false;
595 } else if (low2 > high1) {
596 debug3(printf(" => no because %u > %u\n",low2,high1));
597 return false;
598 } else {
599 debug3(printf(" => yes\n"));
600 return true;
601 }
602 }
603
604
605 Chrpos_T
Substring_insert_length(int * pair_relationship,T substring5,T substring3)606 Substring_insert_length (int *pair_relationship, T substring5, T substring3) {
607 Univcoord_T low, high, low5, high5, low3, high3;
608
609 if (substring5->plusp == true) {
610 low5 = substring5->genomicstart;
611 high5 = substring5->genomicend;
612 } else {
613 low5 = substring5->genomicend;
614 high5 = substring5->genomicstart;
615 }
616
617 if (substring3->plusp == true) {
618 low3 = substring3->genomicstart;
619 high3 = substring3->genomicend;
620 } else {
621 low3 = substring3->genomicend;
622 high3 = substring3->genomicstart;
623 }
624
625 if (low5 < low3) {
626 low = low5;
627 } else {
628 low = low3;
629 }
630
631 if (high5 > high3) {
632 high = high5;
633 } else {
634 high = high3;
635 }
636
637 if (low5 < low3 && high5 < high3) {
638 *pair_relationship = +1;
639 } else if (low3 < low5 && high3 < high5) {
640 *pair_relationship = -1;
641 } else if (substring5->plusp == true && substring3->plusp == true) {
642 *pair_relationship = +1;
643 } else if (substring5->plusp == false && substring3->plusp == false) {
644 *pair_relationship = -1;
645 } else {
646 *pair_relationship = 0;
647 }
648
649 debug3(printf("Returning %u - %u. pair_relationship %d\n",high,low,*pair_relationship));
650 return high - low;
651 }
652
653
654 bool
Substring_overlap_point_trimmed_p(T substring,Univcoord_T endpos)655 Substring_overlap_point_trimmed_p (T substring, Univcoord_T endpos) {
656 Univcoord_T low, high;
657
658 if (substring == NULL) {
659 return false;
660 }
661
662 if (substring->plusp == true) {
663 low = substring->alignstart_trim;
664 high = substring->alignend_trim;
665 if (high > 0) {
666 high -= 1;
667 }
668 debug3(printf("Checking overlap between plus %u..%u and %u",low,high,endpos));
669 } else {
670 low = substring->alignend_trim;
671 high = substring->alignstart_trim;
672 if (high > 0) {
673 high -= 1;
674 }
675 debug3(printf("Checking overlap between minus %u..%u and %u",low,high,endpos));
676 }
677
678
679 if (endpos < low) {
680 debug3(printf(" => no because %u < %u\n",endpos,low));
681 return false;
682 } else if (endpos > high) {
683 debug3(printf(" => no because %u > %u\n",endpos,high));
684 return false;
685 } else {
686 debug3(printf(" => yes\n"));
687 return true;
688 }
689 }
690
691
692 Univcoord_T
Substring_overlap_segment_trimmed(T substring1,T substring2)693 Substring_overlap_segment_trimmed (T substring1, T substring2) {
694 Univcoord_T maxlow, minhigh;
695 Univcoord_T low1, high1, low2, high2;
696
697 if (substring1->plusp == true) {
698 low1 = substring1->alignstart_trim;
699 high1 = substring1->alignend_trim;
700 } else {
701 low1 = substring1->alignend_trim;
702 high1 = substring1->alignstart_trim;
703 }
704 if (high1 > 0) {
705 high1 -= 1;
706 }
707
708 if (substring2->plusp == true) {
709 low2 = substring2->alignstart_trim;
710 high2 = substring2->alignend_trim;
711 } else {
712 low2 = substring2->alignend_trim;
713 high2 = substring2->alignstart_trim;
714 }
715 if (high2 > 0) {
716 high2 -= 1;
717 }
718
719 debug3(printf("Checking overlap between %u..%u and %u..%u",low1,high1,low2,high2));
720
721 if (high2 < low1) {
722 debug3(printf(" => no because %u < %u\n",high2,low1));
723 return 0;
724 } else if (low2 > high1) {
725 debug3(printf(" => no because %u > %u\n",low2,high1));
726 return 0;
727 } else {
728 maxlow = (low1 > low2) ? low1 : low2;
729 minhigh = (high1 < high2) ? high1 : high2;
730 debug3(printf(" => yes. maxlow %llu, minhigh %llu. returning %llu\n",
731 maxlow,minhigh,maxlow + (minhigh - maxlow)/2));
732 return maxlow + (minhigh - maxlow)/2;
733 }
734 }
735
736
737 static void
mark_mismatches_cmet_gsnap(char * gbuffer,char * query,int start,int end,int genestrand)738 mark_mismatches_cmet_gsnap (char *gbuffer, char *query, int start, int end, int genestrand) {
739 int i;
740
741 debug1(printf("\n"));
742 debug1(printf("query: %s\n",query));
743 debug1(printf("genome: %s\n",gbuffer));
744 debug1(printf("count: "));
745
746 if (genestrand == +2) {
747 for (i = start; i < end; i++) {
748 if (gbuffer[i] == 'G' && query[i] == 'A') {
749 debug1(printf("."));
750 gbuffer[i] = '.';
751 } else if (query[i] != gbuffer[i]) {
752 debug1(printf("x"));
753 assert(gbuffer[i] != OUTOFBOUNDS);
754 gbuffer[i] = (char) tolower(gbuffer[i]);
755 } else {
756 debug1(printf("*"));
757 }
758 }
759
760 } else {
761 for (i = start; i < end; i++) {
762 if (gbuffer[i] == 'C' && query[i] == 'T') {
763 debug1(printf("."));
764 gbuffer[i] = '.';
765 } else if (query[i] != gbuffer[i]) {
766 debug1(printf("x"));
767 assert(gbuffer[i] != OUTOFBOUNDS);
768 gbuffer[i] = (char) tolower(gbuffer[i]);
769 } else {
770 debug1(printf("*"));
771 }
772 }
773 }
774
775 return;
776 }
777
778
779 static void
mark_mismatches_cmet_sam(char * gbuffer,char * query,int start,int end,int genestrand)780 mark_mismatches_cmet_sam (char *gbuffer, char *query, int start, int end, int genestrand) {
781 int i;
782
783 debug1(printf("query: %s\n",query));
784 debug1(printf("genome: %s\n",gbuffer));
785 debug1(printf("count: "));
786
787 if (genestrand == +2) {
788 for (i = start; i < end; i++) {
789 if (gbuffer[i] == 'G' && query[i] == 'A') {
790 debug1(printf("."));
791 #if 0
792 /* Want to show mismatches */
793 gbuffer[i] = 'A'; /* Avoids showing mismatches in MD and NM strings */
794 #endif
795 } else if (query[i] != gbuffer[i]) {
796 debug1(printf("x"));
797 assert(gbuffer[i] != OUTOFBOUNDS);
798 gbuffer[i] = (char) tolower(gbuffer[i]);
799 } else {
800 debug1(printf("*"));
801 }
802 }
803
804 } else {
805 for (i = start; i < end; i++) {
806 if (gbuffer[i] == 'C' && query[i] == 'T') {
807 debug1(printf("."));
808 #if 0
809 /* Want to show mismatches */
810 gbuffer[i] = 'T'; /* Avoids showing mismatches in MD and NM strings */
811 #endif
812 } else if (query[i] != gbuffer[i]) {
813 debug1(printf("x"));
814 assert(gbuffer[i] != OUTOFBOUNDS);
815 gbuffer[i] = (char) tolower(gbuffer[i]);
816 } else {
817 debug1(printf("*"));
818 }
819 }
820 }
821
822 return;
823 }
824
825
826
827 static void
mark_mismatches_atoi_gsnap(char * gbuffer,char * query,int start,int end,int genestrand)828 mark_mismatches_atoi_gsnap (char *gbuffer, char *query, int start, int end, int genestrand) {
829 int i;
830
831 debug1(printf("query: %s\n",query));
832 debug1(printf("genome: %s\n",gbuffer));
833 debug1(printf("count: "));
834
835 if (genestrand == +2) {
836 for (i = start; i < end; i++) {
837 if (gbuffer[i] == 'T' && query[i] == 'C') {
838 debug1(printf("."));
839 gbuffer[i] = '.';
840 } else if (query[i] != gbuffer[i]) {
841 debug1(printf("x"));
842 assert(gbuffer[i] != OUTOFBOUNDS);
843 gbuffer[i] = (char) tolower(gbuffer[i]);
844 } else {
845 debug1(printf("*"));
846 }
847 }
848
849 } else {
850 for (i = start; i < end; i++) {
851 if (gbuffer[i] == 'A' && query[i] == 'G') {
852 debug1(printf("."));
853 gbuffer[i] = '.';
854 } else if (query[i] != gbuffer[i]) {
855 debug1(printf("x"));
856 assert(gbuffer[i] != OUTOFBOUNDS);
857 gbuffer[i] = (char) tolower(gbuffer[i]);
858 } else {
859 debug1(printf("*"));
860 }
861 }
862 }
863
864 return;
865 }
866
867
868
869 static void
mark_mismatches_atoi_sam(char * gbuffer,char * query,int start,int end,int genestrand)870 mark_mismatches_atoi_sam (char *gbuffer, char *query, int start, int end, int genestrand) {
871 int i;
872
873 debug1(printf("query: %s\n",query));
874 debug1(printf("genome: %s\n",gbuffer));
875 debug1(printf("count: "));
876
877 if (genestrand == +2) {
878 for (i = start; i < end; i++) {
879 if (gbuffer[i] == 'T' && query[i] == 'C') {
880 debug1(printf("."));
881 #if 0
882 /* Want to show mismatches */
883 gbuffer[i] = 'C'; /* Avoids showing mismatches in MD and NM strings */
884 #endif
885 } else if (query[i] != gbuffer[i]) {
886 debug1(printf("x"));
887 assert(gbuffer[i] != OUTOFBOUNDS);
888 gbuffer[i] = (char) tolower(gbuffer[i]);
889 } else {
890 debug1(printf("*"));
891 }
892 }
893
894 } else {
895 for (i = start; i < end; i++) {
896 if (gbuffer[i] == 'A' && query[i] == 'G') {
897 debug1(printf("."));
898 #if 0
899 /* Want to show mismatches */
900 gbuffer[i] = 'G'; /* Avoids showing mismatches in MD and NM strings */
901 #endif
902 } else if (query[i] != gbuffer[i]) {
903 debug1(printf("x"));
904 assert(gbuffer[i] != OUTOFBOUNDS);
905 gbuffer[i] = (char) tolower(gbuffer[i]);
906 } else {
907 debug1(printf("*"));
908 }
909 }
910 }
911
912 return;
913 }
914
915
916 static void
mark_mismatches_ttoc_gsnap(char * gbuffer,char * query,int start,int end,int genestrand)917 mark_mismatches_ttoc_gsnap (char *gbuffer, char *query, int start, int end, int genestrand) {
918 int i;
919
920 debug1(printf("query: %s\n",query));
921 debug1(printf("genome: %s\n",gbuffer));
922 debug1(printf("count: "));
923
924 if (genestrand == +2) {
925 for (i = start; i < end; i++) {
926 if (gbuffer[i] == 'A' && query[i] == 'G') {
927 debug1(printf("."));
928 gbuffer[i] = '.';
929 } else if (query[i] != gbuffer[i]) {
930 debug1(printf("x"));
931 assert(gbuffer[i] != OUTOFBOUNDS);
932 gbuffer[i] = (char) tolower(gbuffer[i]);
933 } else {
934 debug1(printf("*"));
935 }
936 }
937
938 } else {
939 for (i = start; i < end; i++) {
940 if (gbuffer[i] == 'T' && query[i] == 'C') {
941 debug1(printf("."));
942 gbuffer[i] = '.';
943 } else if (query[i] != gbuffer[i]) {
944 debug1(printf("x"));
945 assert(gbuffer[i] != OUTOFBOUNDS);
946 gbuffer[i] = (char) tolower(gbuffer[i]);
947 } else {
948 debug1(printf("*"));
949 }
950 }
951 }
952
953 return;
954 }
955
956
957
958 static void
mark_mismatches_ttoc_sam(char * gbuffer,char * query,int start,int end,int genestrand)959 mark_mismatches_ttoc_sam (char *gbuffer, char *query, int start, int end, int genestrand) {
960 int i;
961
962 debug1(printf("query: %s\n",query));
963 debug1(printf("genome: %s\n",gbuffer));
964 debug1(printf("count: "));
965
966 if (genestrand == +2) {
967 for (i = start; i < end; i++) {
968 if (gbuffer[i] == 'A' && query[i] == 'G') {
969 debug1(printf("."));
970 #if 0
971 /* Want to show mismatches */
972 gbuffer[i] = 'G'; /* Avoids showing mismatches in MD and NM strings */
973 #endif
974 } else if (query[i] != gbuffer[i]) {
975 debug1(printf("x"));
976 assert(gbuffer[i] != OUTOFBOUNDS);
977 gbuffer[i] = (char) tolower(gbuffer[i]);
978 } else {
979 debug1(printf("*"));
980 }
981 }
982
983 } else {
984 for (i = start; i < end; i++) {
985 if (gbuffer[i] == 'T' && query[i] == 'C') {
986 debug1(printf("."));
987 #if 0
988 /* Want to show mismatches */
989 gbuffer[i] = 'C'; /* Avoids showing mismatches in MD and NM strings */
990 #endif
991 } else if (query[i] != gbuffer[i]) {
992 debug1(printf("x"));
993 assert(gbuffer[i] != OUTOFBOUNDS);
994 gbuffer[i] = (char) tolower(gbuffer[i]);
995 } else {
996 debug1(printf("*"));
997 }
998 }
999 }
1000
1001 return;
1002 }
1003
1004
1005
1006 /* The result variable should be an existing genomic_bothdiff or genomic_refdiff, and then reassigned to that */
1007 static char *
embellish_genomic(char * result,char * genomic_diff,char * query,int querystart,int queryend,int querylength,int mandatory_trim_querystart_in,int mandatory_trim_queryend_in,int extraleft,int extraright,bool plusp,int genestrand)1008 embellish_genomic (char *result, char *genomic_diff, char *query, int querystart, int queryend, int querylength,
1009 int mandatory_trim_querystart_in, int mandatory_trim_queryend_in, int extraleft, int extraright,
1010 bool plusp, int genestrand) {
1011 int i, j, k;
1012 int mandatory_trim_querystart, mandatory_trim_queryend;
1013
1014 debug1(printf("Entered embellish_genomic with querystart %d, queryend %d, querylength %d, mandatory_trim_querystart %d, mandatory_trim_queryend %d\n",
1015 querystart,queryend,querylength,mandatory_trim_querystart_in,mandatory_trim_queryend_in));
1016 debug1(printf("genomic_diff %s\n",genomic_diff));
1017
1018 assert(mandatory_trim_querystart_in >= 0);
1019 assert(mandatory_trim_queryend_in >= 0);
1020 assert(mandatory_trim_querystart_in < querylength);
1021 assert(mandatory_trim_queryend_in < querylength);
1022
1023 if (plusp == true) {
1024 mandatory_trim_querystart = mandatory_trim_querystart_in;
1025 mandatory_trim_queryend = mandatory_trim_queryend_in;
1026 } else {
1027 mandatory_trim_querystart = mandatory_trim_queryend_in;
1028 mandatory_trim_queryend = mandatory_trim_querystart_in;
1029 }
1030
1031 if (result == NULL) {
1032 result = (char *) MALLOC_OUT((querylength+1) * sizeof(char));
1033 }
1034 result[querylength] = '\0';
1035 #if 0
1036 for (i = 0; i < querylength; i++) {
1037 result[i] = '?';
1038 }
1039 #endif
1040
1041 /* Add aligned region with lower-case diffs, surrounded by dashes */
1042 fill_w_dashes(result,0,querystart);
1043 fill_w_stars(result,0,mandatory_trim_querystart);
1044
1045 /* Don't need to know adj anymore, because each substring has its own left */
1046 debug1(printf("Copying from genomic_diff[%d] to result[%d] for a length of %d - %d\n",querystart,querystart,queryend,querystart));
1047 strncpy(&(result[querystart]),&(genomic_diff[querystart]),queryend-querystart);
1048 debug1(printf("A g1: %s (%d..%d) extraleft:%d extraright:%d\n",result,querystart,queryend,extraleft,extraright));
1049
1050 if (mode == STANDARD) {
1051 /* Skip */
1052 } else if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
1053 mark_mismatches_cmet_gsnap(result,query,querystart,queryend,genestrand);
1054 } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
1055 mark_mismatches_atoi_gsnap(result,query,querystart,queryend,genestrand);
1056 } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
1057 mark_mismatches_ttoc_gsnap(result,query,querystart,queryend,genestrand);
1058 } else {
1059 abort();
1060 }
1061
1062 fill_w_dashes(result,queryend,querylength);
1063 fill_w_stars(result,querylength - mandatory_trim_queryend,querylength);
1064 debug1(printf("B g1: %s\n",result));
1065
1066 /* Add terminal ends as lower-case. Previously stopped at i >= 0 */
1067 for (k = 0, i = j = querystart - 1; k < extraleft && i >= mandatory_trim_querystart /*&& j >= 0*/; k++, i--, j--) {
1068 result[i] = (char) tolower(genomic_diff[j]);
1069 /* printf("k=%d i=%d result[i]=%c\n",k,i,result[i]); */
1070 assert(result[i] == 'a' || result[i] == 'c' || result[i] == 'g' || result[i] == 't' || result[i] == 'n' || result[i] == '*');
1071 }
1072
1073 /* Add terminal ends as lower-case. Previously stopped at i < querylength */
1074 for (k = 0, i = j = queryend; k < extraright && i < querylength - mandatory_trim_queryend /*&& j < genomiclength*/; k++, i++, j++) {
1075 result[i] = (char) tolower(genomic_diff[j]);
1076 /* printf("k=%d i=%d result[i]=%c\n",k,i,result[i]); */
1077 }
1078 debug1(printf("C g1: %s\n",result));
1079
1080 return result;
1081 }
1082
1083
1084 static char *
embellish_genomic_sam(char * result,char * genomic_diff,char * query,int querystart,int queryend,int querylength,int mandatory_trim_querystart_in,int mandatory_trim_queryend_in,int extraleft,int extraright,bool plusp,int genestrand)1085 embellish_genomic_sam (char *result, char *genomic_diff, char *query, int querystart, int queryend, int querylength,
1086 int mandatory_trim_querystart_in, int mandatory_trim_queryend_in, int extraleft, int extraright,
1087 bool plusp, int genestrand) {
1088 int i, j, k;
1089 int start, end;
1090 int mandatory_trim_querystart, mandatory_trim_queryend;
1091
1092 assert(mandatory_trim_querystart_in >= 0);
1093 assert(mandatory_trim_queryend_in >= 0);
1094
1095 if (plusp == true) {
1096 mandatory_trim_querystart = mandatory_trim_querystart_in;
1097 mandatory_trim_queryend = mandatory_trim_queryend_in;
1098 } else {
1099 mandatory_trim_querystart = mandatory_trim_queryend_in;
1100 mandatory_trim_queryend = mandatory_trim_querystart_in;
1101 }
1102
1103
1104 if (result == NULL) {
1105 result = (char *) MALLOC_OUT((querylength+1) * sizeof(char));
1106 }
1107 result[querylength] = '\0';
1108
1109 if ((start = querystart - extraleft) < 0) {
1110 start = 0;
1111 }
1112 if ((end = queryend + extraright) > querylength) {
1113 end = querylength;
1114 }
1115
1116 fill_w_dashes(result,0,start);
1117 fill_w_stars(result,0,mandatory_trim_querystart);
1118 strncpy(&(result[start]),&(genomic_diff[start]),end-start);
1119 fill_w_dashes(result,end,querylength);
1120 fill_w_stars(result,querylength - mandatory_trim_queryend,querylength);
1121
1122 if (mode == STANDARD) {
1123 /* Skip */
1124 } else if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
1125 mark_mismatches_cmet_sam(result,query,querystart,queryend,genestrand);
1126 } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
1127 mark_mismatches_atoi_sam(result,query,querystart,queryend,genestrand);
1128 } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
1129 mark_mismatches_ttoc_sam(result,query,querystart,queryend,genestrand);
1130 } else {
1131 abort();
1132 }
1133
1134 /* Add terminal ends as lower-case. Previously stopped at i >= 0 */
1135 for (k = 0, i = querystart-1, j = querystart-1; i >= mandatory_trim_querystart /*&& j >= 0*/; k++, i--, j--) {
1136 if (query[i] == genomic_diff[j]) {
1137 result[i] = genomic_diff[j];
1138 } else {
1139 result[i] = (char) tolower(genomic_diff[j]);
1140 }
1141 /* printf("k=%d i=%d j=%d result[i]=%c\n",k,i,j,result[i]); */
1142 }
1143
1144 /* Add terminal ends as lower-case. Previously stopped at i < querylength */
1145 for (k = 0, i = queryend, j = queryend; i < querylength - mandatory_trim_queryend /*&& j < genomiclength*/; k++, i++, j++) {
1146 if (query[i] == genomic_diff[j]) {
1147 result[i] = genomic_diff[j];
1148 } else {
1149 result[i] = (char) tolower(genomic_diff[j]);
1150 }
1151 }
1152
1153 return result;
1154 }
1155
1156
1157 /* Still takes left as a parameter */
1158 int
Substring_trim_qstart_nosplice(int * nmismatches,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T left,Univcoord_T chroffset,int pos5,int pos3,int querylength,bool plusp,int genestrand)1159 Substring_trim_qstart_nosplice (int *nmismatches, int *mismatch_positions_alloc,
1160 Compress_T query_compress, Univcoord_T left, Univcoord_T chroffset,
1161 int pos5, int pos3, int querylength, bool plusp, int genestrand) {
1162 int score = 0;
1163 int trimpos = pos5, pos, prevpos, i;
1164 int *mismatch_positions = mismatch_positions_alloc;
1165 int alignlength = pos3 - pos5;
1166 Univcoord_T univdiagonal;
1167
1168
1169 debug8(printf("Entered Substring_trim_qstart_nosplice with pos5 %d, pos3 %d, mismatch_scores %d/%d, match_score %d\n",
1170 pos5,pos3,TRIM_MISMATCH_SCORE_LAST,TRIM_MISMATCH_SCORE_MULT,TRIM_MATCH_SCORE));
1171 debug8(printf("Calling Genome_mismatches_right_trim with left %u, pos5 %d, pos3 %d\n",left,pos5,pos3));
1172 *nmismatches = Genome_mismatches_right_trim(mismatch_positions,/*max_mismatches*/alignlength,
1173 /*ome*/genomebits,/*ome_alt*/genomebits_alt,query_compress,
1174 left,pos5,pos3,plusp,genestrand);
1175 debug8(printf("%d mismatches:",*nmismatches));
1176 debug8(
1177 for (i = 0; i <= *nmismatches; i++) {
1178 printf(" %d",mismatch_positions[i]);
1179 }
1180 printf("\n");
1181 );
1182
1183 if (*nmismatches == 0) {
1184 return pos5;
1185 }
1186
1187 /* Advance until we get to a good region */
1188 pos = prevpos = pos3;
1189 i = 0;
1190 while (i < (*nmismatches) /*- 1*/ && score <= 0) {
1191 pos = mismatch_positions[i]; /* position at the mismatch */
1192 score += (prevpos - pos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_MULT;
1193 if (score < 0) {
1194 /* Distal matches did not compensate for the mismatch, so advance to here */
1195 debug8(printf("Advance qstart pos %d, score %d => 0, pos %d\n",pos,score,pos));
1196 score = 0;
1197 } else {
1198 debug8(printf("Advance qstart pos %d, score %d, pos %d\n",pos,score,pos));
1199 }
1200 prevpos = pos; /* On the mismatch */
1201 i++;
1202 }
1203
1204 /* Now trim */
1205 trimpos = prevpos = pos;
1206 score = 0;
1207 debug8(printf("Trim qend pos %d, score %d, trimpos %d\n",pos,score,trimpos));
1208 while (i < *nmismatches) {
1209 pos = mismatch_positions[i];
1210 score += (prevpos - pos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_MULT;
1211 if (score >= 0) {
1212 trimpos = pos;
1213 score = 0;
1214 }
1215 debug8(printf("Trim qstart pos %d, score %d, trimpos %d\n",pos,score,trimpos));
1216 prevpos = pos;
1217 i++;
1218 }
1219
1220 score += (prevpos - pos5 - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_LAST;
1221 if (score >= 0) {
1222 trimpos = pos5 - 1; /* At the "mismatch" */
1223 /* score = 0; */
1224 }
1225
1226 if (novelsplicingp == false && trimpos == 0 &&
1227 left + (Univcoord_T) querylength >= chroffset + (Univcoord_T) querylength) {
1228 /* For DNA-seq, if still within chromosome, take the initial mismatch at the beginning of the read */
1229 debug8(printf("Backing up 1 bp to start => trimpos %d\n",0));
1230 return 0;
1231 } else {
1232 debug8(printf("Final qstart pos %d => trimpos %d\n",pos,trimpos+1));
1233 return trimpos + 1; /* One position after the mismatch */
1234 }
1235 }
1236
1237
1238 /* Still takes left as a parameter */
1239 int
Substring_trim_qend_nosplice(int * nmismatches,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T left,Univcoord_T chrhigh,int pos5,int pos3,int querylength,bool plusp,int genestrand)1240 Substring_trim_qend_nosplice (int *nmismatches, int *mismatch_positions_alloc,
1241 Compress_T query_compress, Univcoord_T left, Univcoord_T chrhigh,
1242 int pos5, int pos3, int querylength, bool plusp, int genestrand) {
1243 int score = 0;
1244 int trimpos = pos3, pos, prevpos, i;
1245 int *mismatch_positions = mismatch_positions_alloc;
1246 int alignlength = pos3 - pos5;
1247
1248
1249 debug8(printf("Entered Substring_trim_qend_nosplice with pos5 %d, pos3 %d, mismatch_scores %d/%d, match_score %d\n",
1250 pos5,pos3,TRIM_MISMATCH_SCORE_LAST,TRIM_MISMATCH_SCORE_MULT,TRIM_MATCH_SCORE));
1251 debug8(printf("Calling Genome_mismatches_left_trim with left %u, pos5 %d, pos3 %d\n",left,pos5,pos3));
1252 *nmismatches = Genome_mismatches_left_trim(mismatch_positions,/*max_mismatches*/alignlength,
1253 /*ome*/genomebits,/*ome_alt*/genomebits_alt,query_compress,
1254 left,pos5,pos3,plusp,genestrand);
1255
1256 debug8(printf("%d mismatches:",*nmismatches));
1257 debug8(
1258 for (i = 0; i <= *nmismatches; i++) {
1259 printf(" %d",mismatch_positions[i]);
1260 }
1261 printf("\n");
1262 );
1263
1264 if (*nmismatches == 0) {
1265 return pos3;
1266 }
1267
1268 /* Advance until we get to a good region */
1269 pos = prevpos = pos5;
1270 i = 0;
1271 while (i < (*nmismatches) /*- 1*/ && score <= 0) {
1272 pos = mismatch_positions[i]; /* position at the mismatch */
1273 score += (pos - prevpos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_MULT;
1274 if (score < 0) {
1275 /* Distal matches did not compensate for the mismatch, so advance to here */
1276 debug8(printf("Advance qend pos %d, score %d => 0, pos %d\n",pos,score,pos));
1277 score = 0;
1278 } else {
1279 debug8(printf("Advance qend pos %d, score %d, pos %d\n",pos,score,pos));
1280 }
1281 prevpos = pos; /* On the mismatch */
1282 i++;
1283 }
1284
1285 /* Now trim */
1286 trimpos = prevpos = pos;
1287 score = 0;
1288 while (i < *nmismatches) {
1289 pos = mismatch_positions[i];
1290 score += (pos - prevpos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_MULT;
1291 if (score >= 0) {
1292 trimpos = pos;
1293 score = 0;
1294 }
1295 debug8(printf("Trim qend pos %d, score %d, trimpos %d\n",pos,score,trimpos));
1296 prevpos = pos;
1297 i++;
1298 }
1299
1300 score += (pos3 - prevpos - 1)*TRIM_MATCH_SCORE + TRIM_MISMATCH_SCORE_LAST;
1301 if (score >= 0) {
1302 trimpos = pos3;
1303 /* score = 0; */
1304 }
1305
1306 if (novelsplicingp == false && trimpos == querylength - 1 &&
1307 left + (Univcoord_T) querylength <= chrhigh) {
1308 /* For DNA-seq, if still within the chromosome, take the final mismatch at end of read */
1309 debug8(printf("Advancing 1 bp to end => trimpos %d\n",pos3));
1310 return querylength;
1311 } else {
1312 debug8(printf("Final qend pos %d => trimpos %d\n",pos,trimpos));
1313 return trimpos; /* qend is outside the region */
1314 }
1315 }
1316
1317
1318 /* Returns true if splice was found, false otherwise. If true, result
1319 is the number of spliceends. If false, result is the single trim
1320 position */
1321 bool
Substring_trimmed_qstarts(int * result,Splicetype_T * splicetype,int ** ambig_qstarts,double ** ambig_probs_5,Univcoord_T univdiagonal,int qend,int querylength,bool plusp,int genestrand,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T chroffset,int sensedir)1322 Substring_trimmed_qstarts (int *result, Splicetype_T *splicetype, int **ambig_qstarts, double **ambig_probs_5,
1323 Univcoord_T univdiagonal, int qend, int querylength,
1324 bool plusp, int genestrand, int *mismatch_positions_alloc, Compress_T query_compress,
1325 Univcoord_T chroffset, int sensedir) {
1326 Univcoord_T left;
1327 int trimpos, pos5;
1328 int nspliceends;
1329 int nmismatches;
1330
1331
1332 debug8(printf("\n***Entered Substring_trimmed_qstarts with univdiagonal %u, qend %d..%d, plusp %d, sensedir %d\n",
1333 univdiagonal-chroffset,0,qend,plusp,sensedir));
1334
1335 left = univdiagonal - (Univcoord_T) querylength;
1336 pos5 = (univdiagonal >= (Univcoord_T) querylength) ? 0 : (int) -left;
1337
1338 trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
1339 left,chroffset,pos5,/*pos3*/qend,querylength,plusp,genestrand);
1340 debug8(printf("trimpos %d (relative to %d)\n",trimpos,/*qstart*/0));
1341
1342 if (trimpos >= qend) {
1343 debug8(printf("trimpos %d >= qend %d, so returning -1\n",trimpos,qend));
1344 *result = -1;
1345 return false;
1346
1347 } else if (novelsplicingp == false) {
1348 /* Keep given trim */
1349 *result = trimpos;
1350 return false;
1351
1352 } else if (trimpos == pos5) {
1353 /* Keep given trim */
1354 *result = trimpos;
1355 return false;
1356
1357 } else if (trimpos <= pos5 + ACCEPTABLE_TRIM) {
1358 /* Accept splice if probability is high */
1359 debug8(printf("Accepting splice if probability is high\n"));
1360 if ((nspliceends = Splice_trim_novel_spliceends_5(&(*splicetype),&(*ambig_qstarts),&(*ambig_probs_5),
1361 left,/*qstart*/pos5,qend,
1362 mismatch_positions_alloc,nmismatches,chroffset,
1363 END_SPLICESITE_PROB_CLOSE,MAX_NCONSECUTIVE_CLOSE,
1364 plusp,sensedir)) == 0) {
1365 *result = trimpos; /* was pos5, but can lead to false positive mismatches at end */
1366 return false;
1367 } else {
1368 *result = nspliceends;
1369 return true;
1370 }
1371
1372 } else {
1373 if ((nspliceends = Splice_trim_novel_spliceends_5(&(*splicetype),&(*ambig_qstarts),&(*ambig_probs_5),
1374 left,/*qstart*/trimpos,qend,
1375 mismatch_positions_alloc,nmismatches,chroffset,
1376 END_SPLICESITE_PROB_FAR,MAX_NCONSECUTIVE_FAR,
1377 plusp,sensedir)) == 0) {
1378 *result = trimpos;
1379 return false;
1380
1381 } else {
1382 *result = nspliceends;
1383 return true;
1384 }
1385 }
1386 }
1387
1388
1389 bool
Substring_qstart_trim(int * trimpos,Splicetype_T * splicetype,double * ambig_prob_qstart,Univcoord_T univdiagonal,int pos3,int querylength,bool plusp,int genestrand,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T chroffset,int sensedir)1390 Substring_qstart_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_qstart,
1391 Univcoord_T univdiagonal, int pos3, int querylength, bool plusp, int genestrand,
1392 int *mismatch_positions_alloc, Compress_T query_compress,
1393 Univcoord_T chroffset, int sensedir) {
1394 Univcoord_T left;
1395 int pos5;
1396 int nspliceends;
1397 int nmismatches;
1398 int *ambig_qstarts;
1399 double *ambig_probs_5;
1400
1401 debug8(printf("\n***Entered Substring_qstart_trim with univdiagonal %u, pos3 %d, plusp %d, sensedir %d\n",
1402 univdiagonal,pos3,plusp,sensedir));
1403
1404 left = univdiagonal - (Univcoord_T) querylength;
1405 pos5 = (univdiagonal >= (Univcoord_T) querylength) ? 0 : (int) -left;
1406
1407 *trimpos = Substring_trim_qstart_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
1408 left,chroffset,pos5,pos3,querylength,plusp,genestrand);
1409 debug8(printf("trimpos %d (relative to %d)\n",*trimpos,pos5));
1410
1411 if (*trimpos >= pos3) {
1412 debug8(printf("trimpos %d >= pos3 %d, so returning -1\n",*trimpos,pos3));
1413 *trimpos = -1;
1414 return false;
1415
1416 } else if (novelsplicingp == false) {
1417 /* Keep given trim */
1418 *ambig_prob_qstart = 0.0;
1419 return false;
1420
1421 } else if (*trimpos == pos5) {
1422 /* Keep given trim */
1423 *ambig_prob_qstart = 0.0;
1424 return false;
1425
1426 } else if (*trimpos <= pos5 + ACCEPTABLE_TRIM) {
1427 /* Accept splice if probability is high */
1428 debug8(printf("Accepting splice if probability is high\n"));
1429 if ((nspliceends = Splice_trim_novel_spliceends_5(&(*splicetype),&ambig_qstarts,&ambig_probs_5,
1430 left,/*qstart*/pos5,/*qend*/pos3,
1431 mismatch_positions_alloc,nmismatches,chroffset,
1432 END_SPLICESITE_PROB_CLOSE,MAX_NCONSECUTIVE_CLOSE,
1433 plusp,sensedir)) == 0) {
1434 /* Keep given trim */
1435 /* *trimpos = pos5; -- can lead to false positive mismatches at end */
1436 *ambig_prob_qstart = 0.0;
1437 return false;
1438
1439 } else {
1440 /* TODO: Make sure this is the farthest one */
1441 *trimpos = ambig_qstarts[0];
1442 *ambig_prob_qstart = ambig_probs_5[0];
1443 FREE(ambig_qstarts);
1444 FREE(ambig_probs_5);
1445 return true;
1446 }
1447
1448 } else {
1449 if ((nspliceends = Splice_trim_novel_spliceends_5(&(*splicetype),&ambig_qstarts,&ambig_probs_5,
1450 left,/*qstart*/(*trimpos),/*qend*/pos3,
1451 mismatch_positions_alloc,nmismatches,chroffset,
1452 END_SPLICESITE_PROB_FAR,MAX_NCONSECUTIVE_FAR,
1453 plusp,sensedir)) == 0) {
1454 /* Keep given trim */
1455 *ambig_prob_qstart = 0.0;
1456 return false;
1457
1458 } else {
1459 /* TODO: Make sure this is the farthest one */
1460 *trimpos = ambig_qstarts[0];
1461 *ambig_prob_qstart = ambig_probs_5[0];
1462 FREE(ambig_qstarts);
1463 FREE(ambig_probs_5);
1464 return true;
1465 }
1466 }
1467 }
1468
1469
1470 /* Returns true if splice was found, false otherwise. If true, result
1471 is the number of spliceends. If false, result is the single trim
1472 position */
1473 bool
Substring_trimmed_qends(int * result,Splicetype_T * splicetype,int ** ambig_qends,double ** ambig_probs_3,Univcoord_T univdiagonal,int qstart,int querylength,bool plusp,int genestrand,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T chroffset,Univcoord_T chrhigh,int sensedir)1474 Substring_trimmed_qends (int *result, Splicetype_T *splicetype, int **ambig_qends, double **ambig_probs_3,
1475 Univcoord_T univdiagonal, int qstart, int querylength,
1476 bool plusp, int genestrand, int *mismatch_positions_alloc, Compress_T query_compress,
1477 Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir) {
1478 Univcoord_T left;
1479 int trimpos, pos3;
1480 int nspliceends;
1481 int nmismatches;
1482
1483 debug8(printf("\n***Entered Substring_trimmed_qends with univdiagonal %u, qstart %d..%d, plusp %d, sensedir %d\n",
1484 univdiagonal-chroffset,qstart,querylength,plusp,sensedir));
1485
1486 left = univdiagonal - (Univcoord_T) querylength;
1487 pos3 = (univdiagonal <= genomelength) ? querylength : (int) (genomelength - left);
1488
1489 trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
1490 left,chrhigh,/*pos5*/qstart,pos3,querylength,plusp,genestrand);
1491 debug8(printf("trimpos %d (relative to %d)\n",trimpos,/*qend*/querylength));
1492
1493 if (trimpos <= qstart) {
1494 debug8(printf("trimpos %d <= qstart %d, so returning -1\n",trimpos,qstart));
1495 *result = -1;
1496 return false;
1497
1498 } else if (novelsplicingp == false) {
1499 /* Keep given trim */
1500 *result = trimpos;
1501 return false;
1502
1503 } else if (trimpos == pos3) {
1504 /* Keep given trim */
1505 *result = trimpos;
1506 return false;
1507
1508 } else if (trimpos >= pos3 - ACCEPTABLE_TRIM) {
1509 /* Accept splice if probability is high */
1510 debug8(printf("Accepting splice if probability is high\n"));
1511 if ((nspliceends = Splice_trim_novel_spliceends_3(&(*splicetype),&(*ambig_qends),&(*ambig_probs_3),
1512 left,qstart,/*qend*/pos3,querylength,
1513 mismatch_positions_alloc,nmismatches,chroffset,
1514 END_SPLICESITE_PROB_CLOSE,MAX_NCONSECUTIVE_CLOSE,
1515 plusp,sensedir)) == 0) {
1516 *result = trimpos; /* was pos3, but can lead to false positive mismatches at end */
1517 return false;
1518 } else {
1519 *result = nspliceends;
1520 return true;
1521 }
1522
1523 } else {
1524 if ((nspliceends = Splice_trim_novel_spliceends_3(&(*splicetype),&(*ambig_qends),&(*ambig_probs_3),
1525 left,qstart,/*qend*/trimpos,querylength,
1526 mismatch_positions_alloc,nmismatches,chroffset,
1527 END_SPLICESITE_PROB_FAR,MAX_NCONSECUTIVE_FAR,
1528 plusp,sensedir)) == 0) {
1529 *result = trimpos;
1530 return false;
1531 } else {
1532 *result = nspliceends;
1533 return true;
1534 }
1535 }
1536 }
1537
1538
1539 bool
Substring_qend_trim(int * trimpos,Splicetype_T * splicetype,double * ambig_prob_qend,Univcoord_T univdiagonal,int pos5,int querylength,bool plusp,int genestrand,int * mismatch_positions_alloc,Compress_T query_compress,Univcoord_T chroffset,Univcoord_T chrhigh,int sensedir)1540 Substring_qend_trim (int *trimpos, Splicetype_T *splicetype, double *ambig_prob_qend,
1541 Univcoord_T univdiagonal, int pos5, int querylength, bool plusp, int genestrand,
1542 int *mismatch_positions_alloc, Compress_T query_compress,
1543 Univcoord_T chroffset, Univcoord_T chrhigh, int sensedir) {
1544 Univcoord_T left;
1545 int pos3;
1546 int nspliceends;
1547 int nmismatches;
1548 int *ambig_qends;
1549 double *ambig_probs_3;
1550
1551 debug8(printf("\n***Entered Substring_qend_trim with univdiagonal %u, pos5 %d, plusp %d, sensedir %d\n",
1552 univdiagonal,pos5,plusp,sensedir));
1553
1554 left = univdiagonal - (Univcoord_T) querylength;
1555 pos3 = (univdiagonal <= genomelength) ? querylength : (int) (genomelength - left);
1556
1557 *trimpos = Substring_trim_qend_nosplice(&nmismatches,mismatch_positions_alloc,query_compress,
1558 left,chrhigh,pos5,pos3,querylength,plusp,genestrand);
1559 debug8(printf("trimpos %d (relative to %d)\n",*trimpos,pos3));
1560
1561 if (*trimpos <= pos5) {
1562 debug8(printf("trimpos %d <= pos5 %d, so returning -1\n",*trimpos,pos5));
1563 *trimpos = -1;
1564 return false;
1565
1566 } else if (novelsplicingp == false) {
1567 /* Keep given trim */
1568 *ambig_prob_qend = 0.0;
1569 return false;
1570
1571 } else if (*trimpos == pos3) {
1572 /* Keep given trim */
1573 *ambig_prob_qend = 0.0;
1574 return false;
1575
1576 } else if (*trimpos >= pos3 - ACCEPTABLE_TRIM) {
1577 /* Accept splice if probability is high */
1578 debug8(printf("Accepting splice if probability is high\n"));
1579 if ((nspliceends = Splice_trim_novel_spliceends_3(&(*splicetype),&ambig_qends,&ambig_probs_3,
1580 left,/*qstart*/pos5,/*qend*/pos3,querylength,
1581 mismatch_positions_alloc,nmismatches,chroffset,
1582 END_SPLICESITE_PROB_CLOSE,MAX_NCONSECUTIVE_CLOSE,
1583 plusp,sensedir)) == 0) {
1584 /* Keep given trim */
1585 /* *trimpos = pos3; -- can lead to false positive mismatches at end */
1586 *ambig_prob_qend = 0.0;
1587 return false;
1588 } else {
1589 /* TODO: Make sure this is the farthest one */
1590 *trimpos = ambig_qends[0];
1591 *ambig_prob_qend = ambig_probs_3[0];
1592 FREE(ambig_qends);
1593 FREE(ambig_probs_3);
1594 return true;
1595 }
1596
1597 } else {
1598 if ((nspliceends = Splice_trim_novel_spliceends_3(&(*splicetype),&ambig_qends,&ambig_probs_3,
1599 left,/*qstart*/pos5,/*qend*/*trimpos,querylength,
1600 mismatch_positions_alloc,nmismatches,chroffset,
1601 END_SPLICESITE_PROB_FAR,MAX_NCONSECUTIVE_FAR,
1602 plusp,sensedir)) == 0) {
1603 /* Keep given trim */
1604 *ambig_prob_qend = 0.0;
1605 return false;
1606 } else {
1607 /* TODO: Make sure this is the farthest one */
1608 *trimpos = ambig_qends[0];
1609 *ambig_prob_qend = ambig_probs_3[0];
1610 FREE(ambig_qends);
1611 FREE(ambig_probs_3);
1612 return true;
1613 }
1614 }
1615 }
1616
1617
1618
1619 /* Modified from Substring_new to return nmatches_plus_spliced_trims only */
1620 int
Substring_compute_nmatches(int * ref_nmatches_plus_spliced_trims,Univcoord_T left,int querystart,int queryend,int querylength,bool plusp,int genestrand,Compress_T query_compress,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool splice_querystart_p,bool splice_queryend_p,bool chrnum_fixed_p)1621 Substring_compute_nmatches (int *ref_nmatches_plus_spliced_trims, Univcoord_T left,
1622 int querystart, int queryend, int querylength,
1623 bool plusp, int genestrand, Compress_T query_compress,
1624 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
1625 bool splice_querystart_p, bool splice_queryend_p, bool chrnum_fixed_p) {
1626 int nmatches_plus_spliced_trims, nmismatches_bothdiff, nmismatches_refdiff, amb_length;
1627 Univcoord_T alignstart, alignend;
1628 int outofbounds_start, outofbounds_end;
1629 int trim_at_querystart, trim_at_queryend;
1630
1631
1632 debug2(printf("\n***Entered Substring_compute_nmatches with left %u, chr %d, genome %u..%u, query %d..%d, plusp %d, splice_querystart_p %d, splice_queryend_p %d\n",
1633 left,chrnum,left+querystart-chroffset,left+queryend-chroffset,querystart,queryend,plusp,splice_querystart_p,splice_queryend_p));
1634
1635 #if 0
1636 if (plusp == true) {
1637 genomicstart = left;
1638 genomicend = left + querylength;
1639 } else {
1640 genomicstart = left + querylength;
1641 genomicend = left;
1642 }
1643 #endif
1644
1645 trim_at_querystart = querystart;
1646 trim_at_queryend = querylength - queryend;
1647
1648 if (chrnum_fixed_p == true) {
1649 /* Chromosomal information determined by the middle diagonal */
1650 if (plusp) {
1651 alignstart = left + trim_at_querystart;
1652 alignend = left + (querylength - trim_at_queryend);
1653
1654 if (alignstart >= chroffset) {
1655 outofbounds_start = 0;
1656 } else {
1657 outofbounds_start = chroffset - alignstart;
1658 }
1659 if (alignend <= chrhigh) {
1660 outofbounds_end = 0;
1661 } else {
1662 outofbounds_end = alignend - chrhigh;
1663 }
1664
1665 } else {
1666 alignstart = left + (querylength - trim_at_querystart);
1667 alignend = left + trim_at_queryend;
1668
1669 if (alignend >= chroffset) {
1670 outofbounds_end = 0;
1671 } else {
1672 outofbounds_end = chroffset - alignend;
1673 }
1674 if (alignstart <= chrhigh) {
1675 outofbounds_start = 0;
1676 } else {
1677 outofbounds_start = alignstart - chrhigh;
1678 }
1679 }
1680
1681 } else {
1682 /* Determine chromosomal information */
1683 if (plusp) {
1684 alignstart = left + trim_at_querystart;
1685 alignend = left + (querylength - trim_at_queryend);
1686
1687 if (alignend < chroffset) {
1688 /* Need to recompute chromosome bounds (unexpected since chroffset should be based on left) */
1689 debug2(printf("Plus: recomputing chromosomal bounds\n"));
1690 chrnum = Univ_IIT_get_one(chromosome_iit,alignstart,alignstart);
1691 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1692
1693 } else if (alignstart >= chrhigh) {
1694 /* Need to recompute chromosome bounds */
1695 debug2(printf("Plus: recomputing chromosomal bounds\n"));
1696 chrnum = Univ_IIT_get_one(chromosome_iit,alignstart,alignstart);
1697 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1698 }
1699
1700 if (alignend <= chrhigh) {
1701 /* Alignment is within the chromosome */
1702 debug2(printf("alignend <= chrhigh, so alignment is within the chromosome\n"));
1703 outofbounds_start = outofbounds_end = 0;
1704 } else {
1705 /* Alignment straddles the high bound of the chromosome */
1706 outofbounds_start = chrhigh - alignstart;
1707 outofbounds_end = alignend - chrhigh;
1708 if (outofbounds_start >= outofbounds_end) {
1709 /* Keep in existing chromosome */
1710 outofbounds_start = 0;
1711 } else if (++chrnum > nchromosomes) {
1712 /* Went past the end of the genome */
1713 return -1;
1714 } else {
1715 /* Move to next chromosome */
1716 debug2(printf("Moving to next chromosome\n"));
1717 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1718 if (alignend <= chrhigh) {
1719 /* Alignment is within the new chromosome */
1720 debug2(printf("alignend <= new chrhigh %u, so alignment is within the new chromosome\n",chrhigh));
1721 outofbounds_end = 0;
1722 } else {
1723 outofbounds_end = alignend - chrhigh;
1724 }
1725 }
1726 }
1727
1728 } else {
1729 alignstart = left + (querylength - trim_at_querystart);
1730 alignend = left + trim_at_queryend;
1731
1732 if (alignstart < chroffset) {
1733 /* Need to recompute chromosome bounds (unexpected since chroffset should be based on left) */
1734 debug2(printf("Minus: Recomputing chromosomal bounds\n"));
1735 chrnum = Univ_IIT_get_one(chromosome_iit,alignend,alignend);
1736 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1737
1738 } else if (alignend >= chrhigh) {
1739 /* Need to recompute chromosome bounds */
1740 debug2(printf("Minus: Recomputing chromosomal bounds\n"));
1741 chrnum = Univ_IIT_get_one(chromosome_iit,alignend,alignend);
1742 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1743 }
1744
1745 if (alignstart <= chrhigh) {
1746 /* Alignment is within the chromosome */
1747 debug2(printf("alignstart <= chrhigh, so alignment is within the chromosome\n"));
1748 outofbounds_start = outofbounds_end = 0;
1749 } else {
1750 /* Alignment straddles the high bound of the chromosome */
1751 outofbounds_start = alignstart - chrhigh;
1752 outofbounds_end = chrhigh - alignend;
1753 if (outofbounds_end >= outofbounds_start) {
1754 /* Keep in existing chromosome */
1755 outofbounds_end = 0;
1756 } else if (++chrnum > nchromosomes) {
1757 /* Went past the end of the genome */
1758 return -1;
1759 } else {
1760 /* Move to next chromosome */
1761 debug2(printf("Moving to next chromosome\n"));
1762 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,circular_typeint);
1763 if (alignstart <= chrhigh) {
1764 debug2(printf("alignstart <= new chrhigh %u, so alignment is within the new chromosome\n",chrhigh));
1765 outofbounds_start = 0;
1766 } else {
1767 outofbounds_start = alignstart - chrhigh;
1768 }
1769 }
1770 }
1771 }
1772 }
1773
1774 /* outofbounds values are relative to preliminary alignstart and alignend */
1775 trim_at_querystart += outofbounds_start;
1776 trim_at_queryend += outofbounds_end;
1777
1778 querystart = trim_at_querystart;
1779 queryend = querylength - trim_at_queryend;
1780 if (querystart >= queryend) {
1781 /* Can happen if multiple mismatches result in trimming down to 0 */
1782 return -1;
1783
1784 } else if (plusp == true) {
1785 debug2(printf("Counting mismatches from querystart %d to queryend %d\n",querystart,queryend));
1786 nmismatches_bothdiff =
1787 Genome_count_mismatches_substring(&nmismatches_refdiff,genomebits,genomebits_alt,query_compress,left,
1788 /*pos5*/querystart,/*pos3*/queryend,/*plusp*/true,genestrand);
1789 } else {
1790 debug2(printf("Counting mismatches from querystart %d to queryend %d\n",querystart,queryend));
1791 nmismatches_bothdiff =
1792 Genome_count_mismatches_substring(&nmismatches_refdiff,genomebits,genomebits_alt,query_compress,left,
1793 /*pos5*/querylength - queryend,/*pos3*/querylength - querystart,
1794 /*plusp*/false,genestrand);
1795 }
1796
1797 /* Needs to be identical to code in Substring_nmatches. Don't need to handle alts_substring, though. */
1798 amb_length = 0;
1799 if (splice_querystart_p == true) {
1800 amb_length += trim_at_querystart;
1801 }
1802 if (splice_queryend_p == true) {
1803 amb_length += trim_at_queryend;
1804 }
1805 nmatches_plus_spliced_trims = amb_length + (queryend - querystart) - nmismatches_bothdiff;
1806 *ref_nmatches_plus_spliced_trims = amb_length + (queryend - querystart) - nmismatches_refdiff;
1807
1808 debug2(printf("Substring_compute_nmatches returning %d matches\n",nmatches_plus_spliced_trims));
1809 return nmatches_plus_spliced_trims;
1810 }
1811
1812
1813 /* Want querylength and not querylength_adj */
1814 /* If orig_nmismatches < 0, then this procedure (re-)computes the value */
1815
1816 /* For trimming: querystart and queryend should be used as a basis for
1817 trim_novel_spliceends (which search to both sides of the point, but
1818 not for trim_querystart and trim_queryend, which search only
1819 inwards. For those procedures, should use querystart of 0 and
1820 queryend of querylength, and the results are absolute, not relative
1821 to the given querystart and queryend */
1822
1823 /* Assumes that chrnum is determined correctly, even for straddles, from middle diagonal */
1824 T
Substring_new(int nmismatches,int ref_nmismatches,Univcoord_T left,int querystart,int queryend,int querylength,bool plusp,int genestrand,Compress_T query_compress,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,bool splice_querystart_p,Splicetype_T splicetype_querystart,double ambig_prob_querystart,bool splice_queryend_p,Splicetype_T splicetype_queryend,double ambig_prob_queryend,int sensedir)1825 Substring_new (int nmismatches, int ref_nmismatches, Univcoord_T left,
1826 int querystart, int queryend, int querylength, bool plusp, int genestrand,
1827 Compress_T query_compress, Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
1828 bool splice_querystart_p, Splicetype_T splicetype_querystart, double ambig_prob_querystart,
1829 bool splice_queryend_p, Splicetype_T splicetype_queryend, double ambig_prob_queryend,
1830 int sensedir) {
1831 T new;
1832 int trim_querystart, trim_queryend;
1833 Univcoord_T alignstart, alignend;
1834 int outofbounds_start, outofbounds_end;
1835
1836
1837 debug2(printf("\n***Entered Substring_new with left %u, chr %d, genome %d..%d, query %d..%d, plusp %d, sensedir %d, splice_querystart_p %d, splice_queryend_p %d\n",
1838 left,chrnum,(int) (left+querystart-chroffset),(int) (left+queryend-chroffset),querystart,queryend,plusp,sensedir,splice_querystart_p,splice_queryend_p));
1839
1840 /* assert(queryend > querystart); -- Assertion does not hold. Sometimes queryend == querystart */
1841
1842 new = (T) MALLOC_OUT(sizeof(*new));
1843 debug2(printf("substring %p:\n",new));
1844
1845 /* Required values if we abort and invoke Substring_free early */
1846 new->alts_ncoords = 0;
1847 new->genomic_bothdiff = (char *) NULL;
1848 new->genomic_refdiff = (char *) NULL;
1849
1850 new->querylength = querylength;
1851
1852 /* These values are required by substring_trim_novel_spliceends */
1853 new->left = left;
1854 new->plusp = plusp;
1855 new->genestrand = genestrand;
1856 new->query_compress = query_compress;
1857
1858 if (plusp == true) {
1859 new->genomicstart = left;
1860 new->genomicend = left + querylength;
1861
1862 if (left + querylength > chroffset + querylength) {
1863 new->querystart_chrbound = 0;
1864 } else {
1865 new->querystart_chrbound = chroffset - left;
1866 }
1867 if (left + querylength < chrhigh) {
1868 new->queryend_chrbound = querylength;
1869 } else {
1870 new->queryend_chrbound = chrhigh - left;
1871 }
1872
1873 } else {
1874 new->genomicstart = left + querylength;
1875 new->genomicend = left;
1876
1877 if (left + querylength < chrhigh) {
1878 new->querystart_chrbound = 0;
1879 } else {
1880 new->querystart_chrbound = (left + querylength) - chrhigh;
1881 }
1882 if (left + querylength > chroffset + querylength) {
1883 new->queryend_chrbound = querylength;
1884 } else {
1885 new->queryend_chrbound = (left + querylength) - chroffset;
1886 }
1887 }
1888
1889 new->trim_querystart_splicep = splice_querystart_p;
1890 new->trim_queryend_splicep = splice_queryend_p;
1891
1892 new->querystart_pretrim = querystart;
1893 new->queryend_pretrim = queryend;
1894
1895 trim_querystart = querystart;
1896 trim_queryend = querylength - queryend;
1897
1898 new->sensedir = sensedir;
1899
1900 new->amb_splice_pos = 0;
1901 new->splicecoord_D = new->splicecoord_A = 0;
1902 new->siteD_pos = new->siteA_pos = new->siteN_pos = 0;
1903 new->siteD_prob = new->siteA_prob = 0.0;
1904
1905 if (splice_querystart_p == false) {
1906 new->start_amb_length = 0;
1907 new->amb_type = END;
1908 new->start_endtype = END;
1909
1910 } else if (splicetype_querystart == DONOR || splicetype_querystart == ANTIDONOR) {
1911 new->start_amb_length = querystart;
1912 new->amb_type = DON;
1913 new->start_endtype = DON;
1914 new->siteD_prob = ambig_prob_querystart;
1915
1916 } else {
1917 new->start_amb_length = querystart;
1918 new->amb_type = ACC;
1919 new->start_endtype = ACC;
1920 new->siteA_prob = ambig_prob_querystart;
1921 }
1922
1923 if (splice_queryend_p == false) {
1924 new->end_amb_length = 0;
1925 new->amb_type = END;
1926 new->end_endtype = END;
1927
1928 } else if (splicetype_queryend == DONOR || splicetype_queryend == ANTIDONOR) {
1929 new->end_amb_length = querylength - queryend;
1930 new->amb_type = DON;
1931 new->end_endtype = DON;
1932 new->siteD_prob = ambig_prob_queryend;
1933
1934 } else {
1935 new->end_amb_length = querylength - queryend;
1936 new->amb_type = ACC;
1937 new->end_endtype = ACC;
1938 new->siteA_prob = ambig_prob_queryend;
1939 }
1940
1941
1942 /* Chromosomal information determined by the middle diagonal */
1943 if (plusp) {
1944 alignstart = left + trim_querystart;
1945 alignend = left + (querylength - trim_queryend);
1946
1947 #if 1
1948 /* Needed to protect against substrings on the wrong chromosome */
1949 if (alignend < chroffset || alignstart >= chrhigh) {
1950 debug2(printf("Substring fails because alignend %u < chroffset %u or alignstart %u >= chrhigh %u\n",
1951 alignend,chroffset,alignstart,chrhigh));
1952 Substring_free(&new);
1953 return (T) NULL;
1954 }
1955 #endif
1956
1957 if (alignstart >= chroffset) {
1958 outofbounds_start = 0;
1959 } else {
1960 outofbounds_start = chroffset - alignstart;
1961 }
1962 if (alignend <= chrhigh) {
1963 outofbounds_end = 0;
1964 } else {
1965 outofbounds_end = alignend - chrhigh;
1966 }
1967
1968 } else {
1969 alignstart = left + (querylength - trim_querystart);
1970 alignend = left + trim_queryend;
1971
1972 #if 1
1973 /* Needed to protect against substrings on the wrong chromosome */
1974 if (alignstart < chroffset || alignend >= chrhigh) {
1975 debug2(printf("Substring fails because alignstart %u < chroffset %u or alignend %u >= chrhigh %u\n",
1976 alignstart,chroffset,alignend,chrhigh));
1977 Substring_free(&new);
1978 return (T) NULL;
1979 }
1980 #endif
1981
1982 if (alignend >= chroffset) {
1983 outofbounds_end = 0;
1984 } else {
1985 outofbounds_end = chroffset - alignend;
1986 }
1987 if (alignstart <= chrhigh) {
1988 outofbounds_start = 0;
1989 } else {
1990 outofbounds_start = alignstart - chrhigh;
1991 }
1992 }
1993
1994 /* outofbounds values are relative to preliminary alignstart and alignend */
1995 trim_querystart += outofbounds_start;
1996 trim_queryend += outofbounds_end;
1997
1998 #if 0
1999 if (trim_querystart + trim_queryend >= querylength) {
2000 debug2(printf("Substring fails because trim_querystart %d + trim_queryend %d >= querylength %d\n",
2001 trim_querystart,trim_queryend,querylength));
2002 Substring_free(&new);
2003 return (T) NULL;
2004 }
2005 #endif
2006
2007 #ifdef DEBUG2
2008 printf("Outofbounds start %d, outofbounds end %d\n",outofbounds_start,outofbounds_end);
2009 if (outofbounds_start > 0) {
2010 printf("Out of bounds: Revising trim_querystart to be %d\n",trim_querystart);
2011 }
2012 if (outofbounds_end > 0) {
2013 printf("Out of bounds: Revising trim_queryend to be %d\n",trim_queryend);
2014 }
2015 printf("Got trims of %d and %d\n",trim_querystart,trim_queryend);
2016 #endif
2017
2018
2019 debug2(printf("\n***chrnum %d (chroffset %u, chrhigh %u), plusp %d\n",chrnum,chroffset,chrhigh,plusp));
2020 new->chrnum = chrnum;
2021 new->chroffset = chroffset;
2022 new->chrhigh = chrhigh;
2023 new->chrlength = chrlength;
2024
2025 /* mandatory_trim_querystart and mandatory_trim_queryend also set during aliasing for circular chromosomes */
2026 /* These calculations are equivalent to those for querystart_chrbound and queryend_chrbound */
2027 if (left + querylength < new->chroffset + querylength) {
2028 new->mandatory_trim_querystart = new->chroffset - left;
2029 assert(new->mandatory_trim_querystart >= 0);
2030 } else {
2031 new->mandatory_trim_querystart = 0;
2032 }
2033 if (left + querylength >= new->chrhigh) {
2034 new->mandatory_trim_queryend = (left + querylength) - new->chrhigh;
2035 assert(new->mandatory_trim_queryend >= 0);
2036 } else {
2037 new->mandatory_trim_queryend = 0;
2038 }
2039 debug2(printf("mandatory_trim_querystart %d, mandatory_trim_queryend %d\n",new->mandatory_trim_querystart,new->mandatory_trim_queryend));
2040
2041
2042 if (querystart != trim_querystart || queryend != querylength - trim_queryend) {
2043 nmismatches = -1; /* Need to recalculate, because of change in querystart */
2044 }
2045 new->querystart = trim_querystart;
2046 new->queryend = querylength - trim_queryend;
2047 debug2(printf("querystart %d, queryend %d\n",new->querystart,new->queryend));
2048
2049 assert(new->querystart >= 0);
2050 assert(new->querystart <= querylength);
2051 assert(new->queryend >= 0);
2052 assert(new->queryend <= querylength);
2053
2054 /* Compute coordinates */
2055 if (new->queryend <= new->querystart) {
2056 /* Can happen if multiple mismatches result in trimming down to 0 */
2057 Substring_free(&new);
2058 return (T) NULL;
2059
2060 } else if (plusp == true) {
2061 new->alignstart_trim = left + new->querystart;
2062 new->alignend_trim = left + new->queryend;
2063
2064 debug2(printf("left is %u\n",new->left));
2065 debug2(printf("genomicstart is %u, genomicend is %u\n",new->genomicstart,new->genomicend));
2066 debug2(printf("querylength is %d, alignstart is %u, alignend is %u\n",querylength,new->alignstart_trim,new->alignend_trim));
2067 assert(new->alignstart_trim >= chroffset);
2068 assert(new->alignend_trim <= chrhigh);
2069
2070 debug2(printf("Counting mismatches from querystart %d to queryend %d\n",new->querystart,new->queryend));
2071 if (nmismatches >= 0) {
2072 debug7(printf("Checking mismatches at %u from querystart %d to queryend %d\n",left - chroffset,new->querystart,new->queryend));
2073 debug7(printf("%d vs %d\n",nmismatches,
2074 Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2075 /*pos5*/new->querystart,/*pos3*/new->queryend,
2076 /*plusp*/true,genestrand)));
2077 #ifdef CHECK_NMISMATCHES
2078 assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2079 /*pos5*/new->querystart,/*pos3*/new->queryend,
2080 /*plusp*/true,genestrand));
2081 #endif
2082 } else {
2083 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2084 /*pos5*/new->querystart,/*pos3*/new->queryend,
2085 /*plusp*/true,genestrand);
2086 }
2087
2088 } else {
2089 new->alignstart_trim = left + (querylength - new->querystart);
2090 new->alignend_trim = left + (querylength - new->queryend);
2091
2092 debug2(printf("left is %u\n",new->left));
2093 debug2(printf("genomicstart is %u, genomicend is %u\n",new->genomicstart,new->genomicend));
2094 debug2(printf("querylength is %d, alignstart is %u, alignend is %u\n",querylength,new->alignstart_trim,new->alignend_trim));
2095 assert(new->alignstart_trim <= chrhigh);
2096 assert(new->alignend_trim >= chroffset);
2097
2098 if (nmismatches >= 0) {
2099 debug7(printf("Checking mismatches at %u from querystart %d to queryend %d\n",left - chroffset,new->querystart,new->queryend));
2100 debug7(printf("%d vs %d\n",nmismatches,
2101 Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2102 /*pos5*/querylength - new->queryend,/*pos3*/querylength - new->querystart,
2103 /*plusp*/false,genestrand)));
2104 #ifdef CHECK_NMISMATCHES
2105 assert(nmismatches == Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2106 /*pos5*/querylength - new->queryend,/*pos3*/querylength - new->querystart,
2107 /*plusp*/false,genestrand));
2108 #endif
2109 } else {
2110 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,left,
2111 /*pos5*/querylength - new->queryend,/*pos3*/querylength - new->querystart,
2112 /*plusp*/false,genestrand);
2113 }
2114 }
2115
2116 new->nmismatches_bothdiff = nmismatches;
2117 new->nmismatches_refdiff = ref_nmismatches;
2118
2119 new->nmatches_to_trims = (new->queryend - new->querystart) - new->nmismatches_bothdiff;
2120 new->nmatches_plus_spliced_trims = new->nmatches_to_trims + new->start_amb_length + new->end_amb_length;
2121
2122 new->ref_nmatches_to_trims = (new->queryend - new->querystart) - new->nmismatches_refdiff;
2123 new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims + new->start_amb_length + new->end_amb_length;
2124
2125 debug2(printf("nmatches_to_trims %d = queryend %d - querystart %d - nmismatches_bothdiff %d\n",
2126 new->nmatches_to_trims,new->queryend,new->querystart,new->nmismatches_bothdiff));
2127 assert(new->nmatches_to_trims >= 0);
2128
2129 new->alts_ncoords = 0;
2130 new->alts_coords = (Univcoord_T *) NULL;
2131 new->alts_knowni = (int *) NULL;
2132 new->alts_nmismatches = (int *) NULL;
2133 new->alts_ref_nmismatches = (int *) NULL;
2134 new->alts_probs = (double *) NULL;
2135
2136 debug2(printf("Substring_new returning %d matches\n",new->nmatches_plus_spliced_trims));
2137 #if 0
2138 debug2(printf("** Returning substring %p, query %d..%d, trim %d..%d, nmatches_plus_spliced_trims %d, nmismatches_refdiff %d, nmismatches_bothdiff %d, amb_lengths %d and %d\n",
2139 new,new->querystart,new->queryend,trim_querystart,trim_queryend,nmatches_plus_spliced_trims,
2140 new->nmismatches_refdiff,new->nmismatches_bothdiff,Substring_start_amb_length(new),Substring_end_amb_length(new)));
2141 #endif
2142
2143 assert(new->nmismatches_bothdiff >= 0);
2144 return new;
2145 }
2146
2147
2148 /* Copied from Stage3end_new_substitution */
2149 T
Substring_extend_anchor_querystart(T anchor,T substring,int * mismatch_positions_alloc,Compress_T query_compress)2150 Substring_extend_anchor_querystart (T anchor, T substring, int *mismatch_positions_alloc,
2151 Compress_T query_compress) {
2152 T new_anchor;
2153 int qstart, qend, nmismatches, ref_nmismatches;
2154 bool splice_querystart_p, splice_queryend_p;
2155 Splicetype_T splicetype_querystart, splicetype_queryend;
2156 double ambig_prob_querystart, ambig_prob_queryend;
2157
2158 if (substring->querystart != 0) {
2159 /* Keep previously computed bounds for anchor */
2160 return anchor;
2161
2162 } else if (anchor->plusp == true) {
2163 qend = anchor->queryend;
2164 splice_querystart_p = Substring_qstart_trim(&qstart,&splicetype_querystart,&ambig_prob_querystart,
2165 /*univdiagonal*/anchor->left + anchor->querylength,/*pos3*/qend,
2166 anchor->querylength,/*plusp*/true,anchor->genestrand,
2167 mismatch_positions_alloc,query_compress,
2168 anchor->chroffset,anchor->sensedir);
2169 /* printf("Substring_extend_anchor_querystart, plus, yields splicep %d, qstart %d, prob %f\n",
2170 splice_querystart_p,qstart,ambig_prob_querystart); */
2171
2172 if (qstart < 0) {
2173 return anchor;
2174
2175 } else if (qend <= qstart) {
2176 return anchor;
2177
2178 } else {
2179 if (anchor->end_endtype == END) {
2180 splice_queryend_p = false;
2181 } else if (anchor->end_endtype == DON) {
2182 splice_queryend_p = true;
2183 splicetype_queryend = DONOR;
2184 ambig_prob_queryend = anchor->siteD_prob;
2185 } else {
2186 splice_queryend_p = true;
2187 splicetype_queryend = ACCEPTOR;
2188 ambig_prob_queryend = anchor->siteA_prob;
2189 }
2190
2191 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,anchor->left,
2192 /*pos5*/qstart,/*pos3*/qend,
2193 /*plusp*/true,anchor->genestrand);
2194 if ((new_anchor = Substring_new(nmismatches,ref_nmismatches,anchor->left,/*querystart*/qstart,/*queryend*/qend,
2195 anchor->querylength,/*plusp*/true,anchor->genestrand,query_compress,
2196 anchor->chrnum,anchor->chroffset,anchor->chrhigh,anchor->chrlength,
2197 splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
2198 splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
2199 anchor->sensedir)) == NULL) {
2200 return anchor;
2201 } else {
2202 Substring_free(&anchor);
2203 return new_anchor;
2204 }
2205 }
2206
2207 } else {
2208 qstart = anchor->querylength - anchor->queryend;
2209 splice_querystart_p = Substring_qend_trim(&qend,&splicetype_querystart,&ambig_prob_querystart,
2210 /*univdiagonal*/anchor->left + anchor->querylength,/*pos5*/qstart,
2211 anchor->querylength,/*plusp*/false,anchor->genestrand,
2212 mismatch_positions_alloc,query_compress,
2213 anchor->chroffset,anchor->chrhigh,anchor->sensedir);
2214 /* printf("Substring_extend_anchor_querystart, minus, yields splicep %d, qend %d, prob %f\n",
2215 splice_querystart_p,qend,ambig_prob_querystart); */
2216
2217 if (qend < 0) {
2218 return anchor;
2219
2220 } else if (qend <= qstart) {
2221 return anchor;
2222
2223 } else {
2224 if (anchor->end_endtype == END) {
2225 splice_queryend_p = false;
2226 } else if (anchor->end_endtype == DON) {
2227 splice_queryend_p = true;
2228 splicetype_queryend = ANTIDONOR;
2229 ambig_prob_queryend = anchor->siteD_prob;
2230 } else {
2231 splice_queryend_p = true;
2232 splicetype_queryend = ANTIACCEPTOR;
2233 ambig_prob_queryend = anchor->siteA_prob;
2234 }
2235
2236 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,anchor->left,
2237 /*pos5*/qstart,/*pos3*/qend,
2238 /*plusp*/false,anchor->genestrand);
2239 if ((new_anchor = Substring_new(nmismatches,ref_nmismatches,anchor->left,
2240 /*querystart*/anchor->querylength - qend,
2241 /*queryend*/anchor->querylength - qstart,
2242 anchor->querylength,/*plusp*/false,anchor->genestrand,query_compress,
2243 anchor->chrnum,anchor->chroffset,anchor->chrhigh,anchor->chrlength,
2244 splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
2245 splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
2246 anchor->sensedir)) == NULL) {
2247 return anchor;
2248 } else {
2249 Substring_free(&anchor);
2250 return new_anchor;
2251 }
2252 }
2253 }
2254 }
2255
2256
2257 /* Copied from Stage3end_new_substitution */
2258 T
Substring_extend_anchor_queryend(T anchor,T substring,int * mismatch_positions_alloc,Compress_T query_compress)2259 Substring_extend_anchor_queryend (T anchor, T substring, int *mismatch_positions_alloc,
2260 Compress_T query_compress) {
2261 T new_anchor;
2262 int qstart, qend, nmismatches, ref_nmismatches;
2263 bool splice_querystart_p, splice_queryend_p;
2264 Splicetype_T splicetype_querystart, splicetype_queryend;
2265 double ambig_prob_querystart, ambig_prob_queryend;
2266
2267 if (substring->queryend != substring->querylength) {
2268 /* Keep previously computed bounds for anchor */
2269 return anchor;
2270
2271 } else if (anchor->plusp == true) {
2272 qstart = anchor->querystart;
2273 splice_queryend_p = Substring_qend_trim(&qend,&splicetype_queryend,&ambig_prob_queryend,
2274 /*univdiagonal*/anchor->left + anchor->querylength,/*pos5*/qstart,
2275 anchor->querylength,/*plusp*/true,anchor->genestrand,
2276 mismatch_positions_alloc,query_compress,
2277 anchor->chroffset,anchor->chrhigh,anchor->sensedir);
2278 /* printf("Substring_extend_anchor_queryend, plus, yields splicep %d, qend %d, prob %f\n",
2279 splice_queryend_p,qend,ambig_prob_queryend); */
2280
2281 if (qend < 0) {
2282 return anchor;
2283
2284 } else if (qend <= qstart) {
2285 return anchor;
2286
2287 } else {
2288 if (anchor->start_endtype == END) {
2289 splice_querystart_p = false;
2290 } else if (anchor->start_endtype == DON) {
2291 splice_querystart_p = true;
2292 splicetype_querystart = DONOR;
2293 ambig_prob_querystart = anchor->siteD_prob;
2294 } else {
2295 splice_querystart_p = true;
2296 splicetype_querystart = ACCEPTOR;
2297 ambig_prob_querystart = anchor->siteA_prob;
2298 }
2299
2300 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,anchor->left,
2301 /*pos5*/qstart,/*pos3*/qend,/*plusp*/true,anchor->genestrand);
2302 if ((new_anchor = Substring_new(nmismatches,ref_nmismatches,anchor->left,/*querystart*/qstart,/*queryend*/qend,
2303 anchor->querylength,/*plusp*/true,anchor->genestrand,query_compress,
2304 anchor->chrnum,anchor->chroffset,anchor->chrhigh,anchor->chrlength,
2305 splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
2306 splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
2307 anchor->sensedir)) == NULL) {
2308 return anchor;
2309 } else {
2310 Substring_free(&anchor);
2311 return new_anchor;
2312 }
2313 }
2314
2315 } else {
2316 qend = anchor->querylength - anchor->querystart;
2317 splice_queryend_p = Substring_qstart_trim(&qstart,&splicetype_queryend,&ambig_prob_queryend,
2318 /*univdiagonal*/anchor->left + anchor->querylength,
2319 /*pos3*/qend,substring->querylength,
2320 /*plusp*/false,anchor->genestrand,
2321 mismatch_positions_alloc,query_compress,
2322 anchor->chroffset,anchor->sensedir);
2323 /*printf("Substring_extend_anchor_queryend, minus, yields splicep %d, qstart %d, prob %f\n",
2324 splice_queryend_p,qstart,ambig_prob_queryend); */
2325
2326 if (qstart < 0) {
2327 return anchor;
2328
2329 } else if (qend <= qstart) {
2330 return anchor;
2331
2332 } else {
2333 if (anchor->start_endtype == END) {
2334 splice_querystart_p = false;
2335 } else if (anchor->start_endtype == DON) {
2336 splice_querystart_p = true;
2337 splicetype_querystart = ANTIDONOR;
2338 ambig_prob_querystart = anchor->siteD_prob;
2339 } else {
2340 splice_querystart_p = true;
2341 splicetype_querystart = ANTIACCEPTOR;
2342 ambig_prob_querystart = anchor->siteA_prob;
2343 }
2344
2345 nmismatches = Genome_count_mismatches_substring(&ref_nmismatches,genomebits,genomebits_alt,query_compress,anchor->left,
2346 /*pos5*/qstart,/*pos3*/qend,/*plusp*/false,anchor->genestrand);
2347 if ((new_anchor = Substring_new(nmismatches,ref_nmismatches,anchor->left,
2348 /*querystart*/anchor->querylength - qend,
2349 /*queryend*/anchor->querylength - qstart,
2350 anchor->querylength,/*plusp*/false,anchor->genestrand,query_compress,
2351 anchor->chrnum,anchor->chroffset,anchor->chrhigh,anchor->chrlength,
2352 splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
2353 splice_queryend_p,splicetype_queryend,ambig_prob_queryend,
2354 anchor->sensedir)) == NULL) {
2355 return anchor;
2356 } else {
2357 Substring_free(&anchor);
2358 return new_anchor;
2359 }
2360 }
2361 }
2362 }
2363
2364
2365 #if 0
2366 /* No longer used */
2367 /* Used when converting pairs to substrings. Assumes nmismatches and
2368 sensedir are correct, and no trimming is needed */
2369 T
2370 Substring_new_simple (int nmismatches, Univcoord_T left, int querystart, int queryend, int querylength,
2371 bool plusp, int genestrand, Compress_T query_compress,
2372 Endtype_T start_endtype, Endtype_T end_endtype,
2373 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
2374 int sensedir) {
2375 T new;
2376
2377
2378 debug2(printf("\n***Entered Substring_new_simple with left %u, chr %d, genome %u..%u, query %d..%d, plusp %d\n",
2379 left,chrnum,left+querystart-chroffset,left+queryend-chroffset,querystart,queryend,plusp));
2380
2381 /* assert(queryend > querystart); -- Assertion does not hold. Sometimes queryend == querystart */
2382 new = (T) MALLOC_OUT(sizeof(*new));
2383 debug2(printf("substring %p:\n",new));
2384
2385
2386 /* Required values if we abort and invoke Substring_free early */
2387 new->alts_ncoords = 0;
2388 new->genomic_bothdiff = (char *) NULL;
2389 new->genomic_refdiff = (char *) NULL;
2390
2391 new->start_endtype = start_endtype;
2392 new->end_endtype = end_endtype;
2393
2394 /* Revised later for the first and last substrings */
2395 new->querystart_pretrim = querystart;
2396 new->queryend_pretrim = queryend;
2397
2398 new->querylength = querylength;
2399
2400 /* These values are required by substring_trim_novel_spliceends */
2401 new->left = left;
2402 new->plusp = plusp;
2403 new->genestrand = genestrand;
2404 new->query_compress = query_compress;
2405
2406 if (plusp == true) {
2407 new->genomicstart = left;
2408 new->genomicend = left + querylength;
2409
2410 if (left + querylength >= chroffset + querylength) {
2411 new->querystart_chrbound = 0;
2412 } else {
2413 new->querystart_chrbound = chroffset - left;
2414 }
2415 if (left + querylength < chrhigh) {
2416 new->queryend_chrbound = querylength;
2417 } else {
2418 new->queryend_chrbound = chrhigh - left;
2419 }
2420
2421 } else {
2422 new->genomicstart = left + querylength;
2423 new->genomicend = left;
2424
2425 if (left + querylength < chrhigh) {
2426 new->querystart_chrbound = 0;
2427 } else {
2428 new->querystart_chrbound = (left + querylength) - chrhigh;
2429 }
2430 if (left + querylength > chroffset + querylength) {
2431 new->queryend_chrbound = querylength;
2432 } else {
2433 new->queryend_chrbound = (left + querylength) - chroffset;
2434 }
2435 }
2436
2437 new->amb_splice_pos = 0;
2438
2439 new->splicecoord_D = new->splicecoord_A = 0;
2440 new->siteD_pos = new->siteA_pos = new->siteN_pos = 0;
2441
2442 new->siteD_prob = new->siteA_prob = 0.0;
2443
2444 new->trim_querystart_splicep = new->trim_queryend_splicep = false;
2445 new->start_amb_length = new->end_amb_length = 0;
2446 new->sensedir = sensedir;
2447
2448 /* Assume coordinates are all within the given chromosome */
2449 debug2(printf("\n***chrnum %d (chroffset %u, chrhigh %u), plusp %d\n",chrnum,chroffset,chrhigh,plusp));
2450 new->chrnum = chrnum;
2451 new->chroffset = chroffset;
2452 new->chrhigh = chrhigh;
2453 new->chrlength = chrlength;
2454
2455 /* mandatory_trim_querystart and mandatory_trim_queryend also set during aliasing for circular chromosomes */
2456 if (left < new->chroffset) {
2457 new->mandatory_trim_querystart = new->chroffset - left;
2458 assert(new->mandatory_trim_querystart >= 0);
2459 } else {
2460 new->mandatory_trim_querystart = 0;
2461 }
2462 if (left + querylength >= new->chrhigh) {
2463 new->mandatory_trim_queryend = (left + querylength) - new->chrhigh;
2464 assert(new->mandatory_trim_queryend >= 0);
2465 } else {
2466 new->mandatory_trim_queryend = 0;
2467 }
2468 debug2(printf("mandatory_trim_querystart %d, mandatory_trim_queryend %d\n",new->mandatory_trim_querystart,new->mandatory_trim_queryend));
2469
2470 new->querystart = querystart;
2471 new->queryend = queryend;
2472 debug2(printf("querystart %d, queryend %d\n",new->querystart,new->queryend));
2473
2474 assert(new->querystart >= 0);
2475 assert(new->querystart <= querylength);
2476 assert(new->queryend >= 0);
2477 assert(new->queryend <= querylength);
2478
2479 /* Compute coordinates */
2480 if (plusp == true) {
2481 new->alignstart_trim = left + new->querystart;
2482 new->alignend_trim = left + new->queryend;
2483
2484 debug2(printf("left is %u\n",new->left));
2485 debug2(printf("genomicstart is %u, genomicend is %u\n",new->genomicstart,new->genomicend));
2486 debug2(printf("querylength is %d, alignstart is %u, alignend is %u\n",querylength,new->alignstart_trim,new->alignend_trim));
2487 assert(new->alignstart_trim >= chroffset);
2488 assert(new->alignend_trim <= chrhigh);
2489
2490 } else {
2491 new->alignstart_trim = left + (querylength - new->querystart);
2492 new->alignend_trim = left + (querylength - new->queryend);
2493
2494 debug2(printf("left is %u\n",new->left));
2495 debug2(printf("genomicstart is %u, genomicend is %u\n",new->genomicstart,new->genomicend));
2496 debug2(printf("querylength is %d, alignstart is %u, alignend is %u\n",querylength,new->alignstart_trim,new->alignend_trim));
2497 assert(new->alignstart_trim <= chrhigh);
2498 assert(new->alignend_trim >= chroffset);
2499 }
2500
2501
2502 new->nmismatches_bothdiff = nmismatches;
2503 new->nmismatches_refdiff = nmismatches;
2504 /* new->nmatches_plus_spliced_trims = (new->queryend - new->querystart) - new->nmismatches_bothdiff; */
2505
2506 new->nmatches_plus_spliced_trims = new->nmatches_to_trims = (new->queryend - new->querystart) - new->nmismatches_bothdiff;
2507 new->ref_nmatches_plus_spliced_trims = new->ref_nmatches_to_trims = (new->queryend - new->querystart) - new->nmismatches_refdiff;
2508
2509 new->alts_ncoords = 0;
2510 new->alts_coords = (Univcoord_T *) NULL;
2511 new->alts_knowni = (int *) NULL;
2512 new->alts_nmismatches = (int *) NULL;
2513 new->alts_ref_nmismatches = (int *) NULL;
2514 new->alts_probs = (double *) NULL;
2515 new->amb_type = END;
2516
2517 debug2(printf("** Returning substring %p, query %d..%d, trim %d..%d, nmismatches %d\n",
2518 new,new->querystart,new->queryend,querystart,querylength - queryend,new->nmismatches_bothdiff));
2519
2520 assert(new->nmismatches_bothdiff >= 0);
2521
2522 if (new->querystart >= new->queryend) {
2523 /* Can happen if multiple mismatches result in trimming down to 0 */
2524 Substring_free(&new);
2525 return (T) NULL;
2526 } else {
2527 return new;
2528 }
2529 }
2530 #endif
2531
2532
2533 static Univcoord_T
Univcoord_max(Univcoord_T * list,int n)2534 Univcoord_max (Univcoord_T *list, int n) {
2535 Univcoord_T m = list[0];
2536 int i;
2537
2538 for (i = 1; i < n; i++) {
2539 if (list[i] > m) {
2540 m = list[i];
2541 }
2542 }
2543
2544 return m;
2545 }
2546
2547 static Univcoord_T
Univcoord_min(Univcoord_T * list,int n)2548 Univcoord_min (Univcoord_T *list, int n) {
2549 Univcoord_T m = list[0];
2550 int i;
2551
2552 for (i = 1; i < n; i++) {
2553 if (list[i] < m) {
2554 m = list[i];
2555 }
2556 }
2557
2558 return m;
2559 }
2560
2561 static int
int_min(int * list,int n)2562 int_min (int *list, int n) {
2563 int m = list[0];
2564 int i;
2565
2566 for (i = 1; i < n; i++) {
2567 if (list[i] < m) {
2568 m = list[i];
2569 }
2570 }
2571
2572 return m;
2573 }
2574
2575
2576 T
Substring_new_alts_D(int querystart,int queryend,int splice_pos,int querylength,bool plusp,int genestrand,Compress_T query_compress,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Univcoord_T * alts_coords,int * alts_knowni,int * alts_nmismatches,int * alts_ref_nmismatches,double * alts_probs,int alts_ncoords,double alts_common_prob,bool substring1p)2577 Substring_new_alts_D (int querystart, int queryend, int splice_pos, int querylength,
2578 bool plusp, int genestrand, Compress_T query_compress,
2579 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
2580 Univcoord_T *alts_coords, int *alts_knowni, int *alts_nmismatches,
2581 int *alts_ref_nmismatches, double *alts_probs,
2582 int alts_ncoords, double alts_common_prob, bool substring1p) {
2583 T new = (T) MALLOC_OUT(sizeof(*new));
2584 #ifdef DEBUG2
2585 int i;
2586 #endif
2587
2588 debug2(printf("Entered Substring_new_alts_D with chrnum %d (chroffset %u, chrhigh %u), %d..%d, splice_pos %d, querylength %d, plusp %d\n",
2589 chrnum,chroffset,chrhigh,querystart,queryend,splice_pos,querylength,plusp));
2590 debug2(printf("alts_common_prob %f\n",alts_common_prob));
2591
2592 new->chrnum = chrnum;
2593 new->chroffset = chroffset;
2594 new->chrhigh = chrhigh;
2595 new->chrlength = chrlength;
2596
2597 new->left = 0;
2598 if (plusp == true) {
2599 new->genomicstart = Univcoord_max(alts_coords,alts_ncoords);
2600 new->genomicend = Univcoord_min(alts_coords,alts_ncoords);
2601 } else {
2602 new->genomicstart = Univcoord_min(alts_coords,alts_ncoords);
2603 new->genomicend = Univcoord_max(alts_coords,alts_ncoords);
2604 }
2605
2606 new->querystart_chrbound = 0;
2607 new->queryend_chrbound = querylength;
2608
2609 new->start_endtype = END;
2610 new->end_endtype = END;
2611
2612 new->querystart = new->querystart_pretrim = querystart;
2613 new->queryend = new->queryend_pretrim = queryend;
2614 new->querylength = querylength;
2615
2616 new->alignstart_trim = 0;
2617 new->alignend_trim = 0;
2618
2619 new->plusp = plusp;
2620 new->genestrand = genestrand;
2621 new->query_compress = query_compress;
2622
2623 new->siteD_prob = 0.0;
2624 new->siteA_prob = alts_common_prob;
2625
2626 new->nmismatches_bothdiff = int_min(alts_nmismatches,alts_ncoords);
2627 new->nmismatches_refdiff = int_min(alts_ref_nmismatches,alts_ncoords);
2628 debug2(printf("nmismatches_bothdiff due to alts_nmismatches is %d\n",new->nmismatches_bothdiff));
2629
2630 /* Works because new->querystart == querystart and new->queryend == queryend */
2631 /* new->nmatches_plus_spliced_trims = (queryend - querystart) - new->nmismatches_bothdiff; */
2632
2633 new->genomic_bothdiff = (char *) NULL;
2634 new->genomic_refdiff = (char *) NULL;
2635
2636 if (substring1p == true) {
2637 debug2(printf("substring1p is true, so setting amb_lengths to be %d and %d\n",queryend,0));
2638 if (alts_common_prob >= 0.9) {
2639 /* Treat as a substring */
2640 new->nmatches_to_trims = new->nmatches_plus_spliced_trims = queryend - new->nmismatches_bothdiff;
2641 new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = queryend - new->nmismatches_refdiff;
2642 new->start_amb_length = new->end_amb_length = 0;
2643 } else {
2644 /* Treat as ambiguous splice site */
2645 new->nmatches_to_trims = new->nmatches_plus_spliced_trims = 0;
2646 new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = 0;
2647 new->start_amb_length = queryend - new->nmismatches_bothdiff; /* This is the amb length for the entire Stage3end_T object */
2648 new->end_amb_length = 0;
2649 }
2650 } else {
2651 debug2(printf("substring1p is false, so setting amb_lengths to be %d and %d\n",0,querylength - querystart));
2652 if (alts_common_prob >= 0.9) {
2653 /* Treat as a substring */
2654 new->nmatches_to_trims = new->nmatches_plus_spliced_trims = querylength - querystart - new->nmismatches_bothdiff;
2655 new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = querylength - querystart - new->nmismatches_refdiff;
2656 new->start_amb_length = new->end_amb_length = 0;
2657 } else {
2658 /* Treat as ambiguous splice site */
2659 new->nmatches_to_trims = new->nmatches_plus_spliced_trims = 0;
2660 new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = 0;
2661 new->start_amb_length = 0;
2662 new->end_amb_length = querylength - querystart - new->nmismatches_bothdiff; /* This is the amb length for the entire Stage3end_T object */
2663 }
2664 }
2665 new->mandatory_trim_querystart = 0;
2666 new->mandatory_trim_queryend = 0;
2667 new->trim_querystart_splicep = new->trim_queryend_splicep = false;
2668 /* new->start_amb_length = new->end_amb_length = 0; -- Set above */
2669
2670 #ifdef DEBUG2
2671 for (i = 0; i < alts_ncoords; i++) {
2672 printf("%u %f\n",alts_coords[i],alts_probs[i]);
2673 }
2674 #endif
2675 new->alts_ncoords = alts_ncoords;
2676 new->alts_coords = alts_coords;
2677 new->alts_knowni = alts_knowni;
2678 new->alts_nmismatches = alts_nmismatches;
2679 new->alts_ref_nmismatches = alts_ref_nmismatches;
2680 new->alts_probs = alts_probs;
2681 new->amb_splice_pos = splice_pos;
2682 new->amb_type = DON;
2683
2684 assert(new->nmismatches_bothdiff >= 0);
2685
2686 if (new->querystart >= new->queryend) {
2687 /* Can happen if multiple mismatches result in trimming down to 0 */
2688 Substring_free(&new);
2689 return (T) NULL;
2690 } else {
2691 return new;
2692 }
2693 }
2694
2695 T
Substring_new_alts_A(int querystart,int queryend,int splice_pos,int querylength,bool plusp,int genestrand,Compress_T query_compress,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Univcoord_T * alts_coords,int * alts_knowni,int * alts_nmismatches,int * alts_ref_nmismatches,double * alts_probs,int alts_ncoords,double alts_common_prob,bool substring1p)2696 Substring_new_alts_A (int querystart, int queryend, int splice_pos, int querylength,
2697 bool plusp, int genestrand, Compress_T query_compress,
2698 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
2699 Univcoord_T *alts_coords, int *alts_knowni, int *alts_nmismatches,
2700 int *alts_ref_nmismatches, double *alts_probs,
2701 int alts_ncoords, double alts_common_prob, bool substring1p) {
2702 T new = (T) MALLOC_OUT(sizeof(*new));
2703 #ifdef DEBUG2
2704 int i;
2705 #endif
2706
2707 debug2(printf("Entered Substring_new_alts_A with chrnum %d (chroffset %u, chrhigh %u), %d..%d, splice_pos %d, querylength %d, plusp %d\n",
2708 chrnum,chroffset,chrhigh,querystart,queryend,splice_pos,querylength,plusp));
2709 debug2(printf("alts_common_prob %f\n",alts_common_prob));
2710
2711 new->chrnum = chrnum;
2712 new->chroffset = chroffset;
2713 new->chrhigh = chrhigh;
2714 new->chrlength = chrlength;
2715
2716 new->left = 0;
2717 if (plusp == true) {
2718 new->genomicstart = Univcoord_max(alts_coords,alts_ncoords);
2719 new->genomicend = Univcoord_min(alts_coords,alts_ncoords);
2720 } else {
2721 new->genomicstart = Univcoord_min(alts_coords,alts_ncoords);
2722 new->genomicend = Univcoord_max(alts_coords,alts_ncoords);
2723 }
2724
2725 new->querystart_chrbound = 0;
2726 new->queryend_chrbound = querylength;
2727
2728 new->start_endtype = END;
2729 new->end_endtype = END;
2730
2731 new->querystart = new->querystart_pretrim = querystart;
2732 new->queryend = new->queryend_pretrim = queryend;
2733 new->querylength = querylength;
2734
2735 new->alignstart_trim = 0;
2736 new->alignend_trim = 0;
2737
2738 new->plusp = plusp;
2739 new->genestrand = genestrand;
2740 new->query_compress = query_compress;
2741
2742 new->siteA_prob = 0.0;
2743 new->siteD_prob = alts_common_prob;
2744
2745 new->nmismatches_bothdiff = int_min(alts_nmismatches,alts_ncoords);
2746 new->nmismatches_refdiff = int_min(alts_ref_nmismatches,alts_ncoords);
2747 debug2(printf("nmismatches_bothdiff due to alts_nmismatches is %d\n",new->nmismatches_bothdiff));
2748
2749 /* Works because new->querystart == querystart and new->queryend == queryend */
2750 /* new->nmatches_plus_spliced_trims = (queryend - querystart) - new->nmismatches_bothdiff; */
2751
2752 new->genomic_bothdiff = (char *) NULL;
2753 new->genomic_refdiff = (char *) NULL;
2754
2755 if (substring1p == true) {
2756 debug2(printf("substring1p is true, so setting trims to be %d and %d\n",querystart,0));
2757 if (alts_common_prob >= 0.9) {
2758 /* Treat as a substring */
2759 new->nmatches_to_trims = new->nmatches_plus_spliced_trims = queryend - new->nmismatches_bothdiff;
2760 new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = queryend - new->nmismatches_refdiff;
2761 new->start_amb_length = new->end_amb_length = 0;
2762 } else {
2763 /* Treat as ambiguous splice site */
2764 new->nmatches_to_trims = new->nmatches_plus_spliced_trims = 0;
2765 new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = 0;
2766 new->start_amb_length = queryend - new->nmismatches_bothdiff; /* This is the amb length for the entire Stage3end_T object */
2767 new->end_amb_length = 0;
2768 }
2769 } else {
2770 debug2(printf("substring1p is false, so setting trims to be %d and %d\n",0,querylength - queryend));
2771 new->nmatches_to_trims = querylength - querystart - new->nmismatches_bothdiff;
2772 new->ref_nmatches_to_trims = querylength - querystart - new->nmismatches_refdiff;
2773 if (alts_common_prob >= 0.9) {
2774 /* Treat as a substring */
2775 new->nmatches_to_trims = new->nmatches_plus_spliced_trims = querylength - querystart - new->nmismatches_bothdiff;
2776 new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = querylength - querystart - new->nmismatches_refdiff;
2777 new->start_amb_length = new->end_amb_length = 0;
2778 } else {
2779 /* Treat as ambiguous splice site */
2780 new->nmatches_to_trims = new->nmatches_plus_spliced_trims = 0;
2781 new->ref_nmatches_to_trims = new->ref_nmatches_plus_spliced_trims = 0;
2782 new->start_amb_length = 0;
2783 new->end_amb_length = querylength - querystart - new->nmismatches_bothdiff; /* This is the amb length for the entire Stage3end_T object */
2784 }
2785 }
2786 new->mandatory_trim_querystart = 0;
2787 new->mandatory_trim_queryend = 0;
2788 new->trim_querystart_splicep = new->trim_queryend_splicep = false;
2789
2790 #ifdef DEBUG2
2791 for (i = 0; i < alts_ncoords; i++) {
2792 printf("%u %f\n",alts_coords[i],alts_probs[i]);
2793 }
2794 #endif
2795
2796 new->alts_ncoords = alts_ncoords;
2797 new->alts_coords = alts_coords;
2798 new->alts_knowni = alts_knowni;
2799 new->alts_nmismatches = alts_nmismatches;
2800 new->alts_ref_nmismatches = alts_ref_nmismatches;
2801 new->alts_probs = alts_probs;
2802 new->amb_splice_pos = splice_pos;
2803 new->amb_type = ACC;
2804
2805 assert(new->nmismatches_bothdiff >= 0);
2806
2807 if (new->querystart >= new->queryend) {
2808 /* Can happen if multiple mismatches result in trimming down to 0 */
2809 Substring_free(&new);
2810 return (T) NULL;
2811 } else {
2812 return new;
2813 }
2814 }
2815
2816
2817 Univcoord_T
Substring_set_alt(double * donor_prob,double * acceptor_prob,Univcoord_T * genomicstart,Univcoord_T * genomicend,T this,int bingoi)2818 Substring_set_alt (double *donor_prob, double *acceptor_prob, Univcoord_T *genomicstart, Univcoord_T *genomicend,
2819 T this, int bingoi) {
2820
2821 #ifdef DEBUG10
2822 printf("Entered Substring_set_alt for %d..%d. plusp %d, ",this->querystart,this->queryend,this->plusp);
2823 if (this->amb_type == DON) {
2824 printf("type DON\n");
2825 } else {
2826 printf("type ACC\n");
2827 }
2828 #endif
2829
2830 assert(this->alts_probs != NULL);
2831
2832 this->nmismatches_bothdiff = this->alts_nmismatches[bingoi];
2833 this->nmismatches_refdiff = this->alts_ref_nmismatches[bingoi];
2834
2835 if (this->plusp == true) {
2836 if (this->amb_type == DON) {
2837 *acceptor_prob = this->siteA_prob;
2838 *donor_prob = this->siteD_prob = this->alts_probs[bingoi];
2839 this->splicecoord_D = this->alts_coords[bingoi];
2840 this->splicesitesD_knowni = this->alts_knowni[bingoi];
2841 this->left = this->splicecoord_D - this->amb_splice_pos;
2842 } else {
2843 *donor_prob = this->siteD_prob;
2844 *acceptor_prob = this->siteA_prob = this->alts_probs[bingoi];
2845 this->splicecoord_A = this->alts_coords[bingoi];
2846 this->splicesitesA_knowni = this->alts_knowni[bingoi];
2847 this->left = this->splicecoord_A - this->amb_splice_pos;
2848 }
2849
2850 debug10(printf("left %u [%u]\n",this->left,this->left - this->chroffset));
2851 *genomicstart = this->genomicstart = this->left;
2852 *genomicend = this->genomicend = this->left + this->querylength;
2853
2854 debug10(printf("genomicstart %u [%u]\n",*genomicstart,*genomicstart - this->chroffset));
2855 debug10(printf("genomicend %u [%u]\n",*genomicend,*genomicend - this->chroffset));
2856
2857 this->alignstart_trim = this->genomicstart + this->querystart;
2858 this->alignend_trim = this->genomicstart + this->queryend;
2859 /* this->nmatches_plus_spliced_trims = (this->alignend_trim - this->alignstart_trim) - this->nmismatches_bothdiff; */
2860
2861 debug10(printf("querypos %d..%d, alignstart is %u (%u), alignend is %u (%u), genomicstart is %u, genomicend is %u\n",
2862 this->querystart,this->queryend,this->alignstart_trim,this->alignstart_trim - this->chroffset,
2863 this->alignend_trim,this->alignend_trim - this->chroffset,this->genomicstart,this->genomicend));
2864
2865 } else {
2866 if (this->amb_type == DON) {
2867 *acceptor_prob = this->siteA_prob;
2868 *donor_prob = this->siteD_prob = this->alts_probs[bingoi];
2869 this->splicecoord_D = this->alts_coords[bingoi];
2870 this->splicesitesD_knowni = this->alts_knowni[bingoi];
2871 /* this->left = this->splicecoord_D - (this->querylength - this->amb_splice_pos); */
2872 this->left = this->splicecoord_D - this->amb_splice_pos; /* amb_splice_pos is based on left */
2873 debug10(printf("left %u [%u] = %u - (%d - %d)\n",
2874 this->left,this->left - this->chroffset,this->splicecoord_D - this->chroffset,
2875 this->querylength,this->amb_splice_pos));
2876 } else {
2877 *donor_prob = this->siteD_prob;
2878 *acceptor_prob = this->siteA_prob = this->alts_probs[bingoi];
2879 this->splicecoord_A = this->alts_coords[bingoi];
2880 this->splicesitesA_knowni = this->alts_knowni[bingoi];
2881 /* this->left = this->splicecoord_A - (this->querylength - this->amb_splice_pos); */
2882 this->left = this->splicecoord_A - this->amb_splice_pos; /* amb_splice_pos is based on left */
2883 debug10(printf("left %u [%u] = %u - (%d - %d)\n",
2884 this->left,this->left - this->chroffset,this->splicecoord_A - this->chroffset,
2885 this->querylength,this->amb_splice_pos));
2886 }
2887
2888 *genomicend = this->genomicend = this->left;
2889 *genomicstart = this->genomicstart = this->left + this->querylength;
2890 debug10(printf("genomicstart %u [%u]\n",*genomicstart,*genomicstart - this->chroffset));
2891 debug10(printf("genomicend %u [%u]\n",*genomicend,*genomicend - this->chroffset));
2892
2893 this->alignend_trim = this->genomicstart - this->queryend;
2894 this->alignstart_trim = this->genomicstart - this->querystart;
2895 /* this->nmatches_plus_spliced_trims = (this->alignstart_trim - this->alignend_trim) - this->nmismatches_bothdiff; */
2896
2897 debug10(printf("querypos %d..%d, alignstart is %u (%u), alignend is %u (%u), genomicstart is %u, genomicend is %u\n",
2898 this->querystart,this->queryend,this->alignstart_trim,this->alignstart_trim - this->chroffset,
2899 this->alignend_trim,this->alignend_trim - this->chroffset,this->genomicstart,this->genomicend));
2900 }
2901
2902 this->start_amb_length = this->end_amb_length = 0;
2903 this->amb_type = END;
2904
2905 this->nmatches_plus_spliced_trims = this->nmatches_to_trims = this->queryend - this->querystart - this->nmismatches_bothdiff;
2906 this->ref_nmatches_plus_spliced_trims = this->ref_nmatches_to_trims = this->queryend - this->querystart - this->nmismatches_refdiff;
2907
2908 this->alts_ncoords = 0; /* To signify that alts have been resolved */
2909 FREE_OUT(this->alts_coords);
2910 FREE_OUT(this->alts_knowni);
2911 FREE_OUT(this->alts_nmismatches);
2912 FREE_OUT(this->alts_ref_nmismatches);
2913 FREE_OUT(this->alts_probs);
2914
2915 return this->left;
2916 }
2917
2918
2919 float
Substring_compute_mapq(T this,char * quality_string)2920 Substring_compute_mapq (T this, char *quality_string) {
2921 int mapq_start, mapq_end;
2922 float best_loglik, loglik;
2923 Univcoord_T left, splicecoord;
2924 int i;
2925
2926 /* mapq */
2927 mapq_start = this->querystart;
2928 mapq_end = this->queryend;
2929
2930 /* It appears from simulated reads that it is better not to trim in
2931 computing MAPQ. The correct mapping then tends to be selected
2932 with a higher MAPQ score. */
2933 /* But if all ends are terminals, then terminal parts should not be
2934 included in MAPQ scoring */
2935
2936 if (this->nmismatches_bothdiff == 0) { /* was this->exactp == true */
2937 /* this->mapq_loglik = MAPQ_loglik_exact(quality_string,0,querylength); */
2938 this->mapq_loglik = 0.0;
2939
2940 } else if (this->alts_ncoords > 0) {
2941 if (this->plusp == true) {
2942 splicecoord = this->alts_coords[0];
2943 left = splicecoord - this->amb_splice_pos;
2944 best_loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2945 this->query_compress,left,mapq_start,mapq_end,
2946 this->querylength,quality_string,/*plusp*/true,this->genestrand);
2947 for (i = 1; i < this->alts_ncoords; i++) {
2948 splicecoord = this->alts_coords[i];
2949 left = splicecoord - this->amb_splice_pos;
2950 if ((loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2951 this->query_compress,left,mapq_start,mapq_end,
2952 this->querylength,quality_string,/*plusp*/true,this->genestrand)) > best_loglik) {
2953 best_loglik = loglik;
2954 }
2955 }
2956 } else {
2957 splicecoord = this->alts_coords[0];
2958 left = splicecoord - (this->querylength - this->amb_splice_pos);
2959 best_loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2960 this->query_compress,left,mapq_start,mapq_end,
2961 this->querylength,quality_string,/*plusp*/false,this->genestrand);
2962 for (i = 1; i < this->alts_ncoords; i++) {
2963 splicecoord = this->alts_coords[i];
2964 left = splicecoord - (this->querylength - this->amb_splice_pos);
2965 if ((loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2966 this->query_compress,left,mapq_start,mapq_end,
2967 this->querylength,quality_string,/*plusp*/false,this->genestrand)) > best_loglik) {
2968 best_loglik = loglik;
2969 }
2970 }
2971 }
2972
2973 this->mapq_loglik = best_loglik;
2974
2975 } else {
2976 debug2(printf("trim_querystart %d, trim_queryend %d, mapq_start = %d, mapq_end = %d\n",
2977 this->querystart,this->querylength - this->queryend,mapq_start,mapq_end));
2978 this->mapq_loglik = MAPQ_loglik(/*ome*/genomebits,/*ome_alt*/genomebits_alt,
2979 this->query_compress,this->left,mapq_start,mapq_end,
2980 this->querylength,quality_string,this->plusp,this->genestrand);
2981 debug2(printf("Substring %u..%u gets loglik %f\n",this->genomicstart - this->chroffset,
2982 this->genomicend - this->chroffset,this->mapq_loglik));
2983 }
2984
2985 return this->mapq_loglik;
2986 }
2987
2988
2989 /* Sets genomic_bothdiff, genomic_refdiff, and nmismatches_refdiff */
2990 int
Substring_display_prep(T this,char * queryuc_ptr,int querylength,int extraleft,int extraright,Genome_T genome)2991 Substring_display_prep (T this, char *queryuc_ptr, int querylength,
2992 int extraleft, int extraright, Genome_T genome) {
2993
2994 #if defined(HAVE_ALLOCA)
2995 char *genomic_diff, *gbuffer;
2996 #else
2997 char *genomic_diff, *gbuffer;
2998 #endif
2999
3000 /* printf("left:%u nmismatches:%d extraleft:%d extraright:%d\n",
3001 this->left - this->chroffset,this->nmismatches_bothdiff,extraleft,extraright); */
3002
3003 if (output_type == M8_OUTPUT && snps_iit == NULL && maskedp == false) {
3004 /* Print procedures do not refer to genomic sequence */
3005 this->genomic_refdiff = this->genomic_bothdiff = (char *) NULL;
3006 this->nmismatches_refdiff = this->nmismatches_bothdiff;
3007
3008 } else if (this->nmismatches_bothdiff == 0 && extraleft == 0 && extraright == 0 &&
3009 snps_iit == NULL && maskedp == false) {
3010 /* Print procedures will refer to query sequence */
3011 this->genomic_refdiff = this->genomic_bothdiff = (char *) NULL;
3012 this->nmismatches_refdiff = 0;
3013
3014 } else if (this->plusp == true) {
3015 /* Used to be this->genomiclength, but doesn't work for large insertions */
3016 #if defined(HAVE_ALLOCA)
3017 gbuffer = (char *) ALLOCA((querylength+1) * sizeof(char));
3018 #else
3019 gbuffer = (char *) MALLOC((querylength+1) * sizeof(char));
3020 #endif
3021
3022 debug1(printf("Obtaining genomic_diff from left %u (%u) for querylength %d\n",
3023 this->left,this->left - this->chroffset,querylength));
3024 Genome_fill_buffer_simple(genome,this->left,querylength,gbuffer);
3025 genomic_diff = gbuffer;
3026 debug1(printf("%s\n",genomic_diff));
3027
3028 if (maskedp == true) {
3029 Genome_mark_mismatches_ref(genomic_diff,querylength,this->query_compress,
3030 this->left,/*pos5*/this->querystart,/*pos3*/this->queryend,
3031 /*plusp*/true,this->genestrand);
3032 } else {
3033 Genome_mark_mismatches(genomic_diff,querylength,this->query_compress,
3034 this->left,/*pos5*/this->querystart,/*pos3*/this->queryend,
3035 /*plusp*/true,this->genestrand);
3036 }
3037
3038 /* Need to perform embellish to put dashes in */
3039 this->genomic_bothdiff =
3040 embellish_genomic(this->genomic_bothdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
3041 querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3042 extraleft,extraright,/*plusp*/true,this->genestrand);
3043
3044 if (snps_iit == NULL && maskedp == false) {
3045 this->genomic_refdiff = this->genomic_bothdiff;
3046 this->nmismatches_refdiff = this->nmismatches_bothdiff;
3047
3048 } else {
3049 this->nmismatches_refdiff =
3050 Genome_count_mismatches_substring_ref(genomebits,this->query_compress,this->left,
3051 /*pos5*/this->alignstart_trim - this->left,
3052 /*pos3*/this->alignend_trim - this->left,
3053 /*plusp*/true,this->genestrand);
3054
3055 Genome_mark_mismatches_ref(genomic_diff,querylength,this->query_compress,this->left,
3056 /*pos5*/this->querystart,/*pos3*/this->queryend,
3057 /*plusp*/true,this->genestrand);
3058 if (output_type == STD_OUTPUT) {
3059 this->genomic_refdiff =
3060 embellish_genomic(this->genomic_refdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
3061 querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3062 extraleft,extraright,/*plusp*/true,this->genestrand);
3063 }
3064 }
3065
3066 if (output_type == SAM_OUTPUT) {
3067 this->genomic_refdiff =
3068 embellish_genomic_sam(this->genomic_refdiff,genomic_diff,queryuc_ptr,this->querystart,this->queryend,
3069 querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3070 extraleft,extraright,/*plusp*/true,this->genestrand);
3071 }
3072
3073 #if defined(HAVE_ALLOCA)
3074 FREEA(gbuffer);
3075 #else
3076 FREE(gbuffer);
3077 #endif
3078
3079 } else {
3080 /* Used to be this->genomiclength, but doesn't work for large insertions */
3081 #if defined(HAVE_ALLOCA)
3082 gbuffer = (char *) ALLOCA((querylength+1) * sizeof(char));
3083 #else
3084 gbuffer = (char *) MALLOC((querylength+1) * sizeof(char));
3085 #endif
3086
3087 debug1(printf("Obtaining genomic_diff from left %u (%u) for querylength %d, and complemented\n",
3088 this->left,this->left - this->chroffset,querylength));
3089 Genome_fill_buffer_simple(genome,this->left,querylength,gbuffer);
3090 genomic_diff = make_complement_inplace(gbuffer,querylength);
3091 debug1(printf("%s\n",genomic_diff));
3092
3093 if (maskedp == true) {
3094 Genome_mark_mismatches_ref(genomic_diff,querylength,this->query_compress,
3095 this->left,/*pos5*/querylength - this->queryend,
3096 /*pos3*/querylength - this->querystart,
3097 /*plusp*/false,this->genestrand);
3098 } else {
3099 Genome_mark_mismatches(genomic_diff,querylength,this->query_compress,
3100 this->left,/*pos5*/querylength - this->queryend,
3101 /*pos3*/querylength - this->querystart,
3102 /*plusp*/false,this->genestrand);
3103 }
3104
3105 /* Need to perform embellish to put dashes in */
3106 this->genomic_bothdiff =
3107 embellish_genomic(this->genomic_bothdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
3108 querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3109 extraleft,extraright,/*plusp*/false,this->genestrand);
3110
3111 if (snps_iit == NULL && maskedp == false) {
3112 this->genomic_refdiff = this->genomic_bothdiff;
3113 this->nmismatches_refdiff = this->nmismatches_bothdiff;
3114
3115 } else {
3116 this->nmismatches_refdiff =
3117 Genome_count_mismatches_substring_ref(genomebits,this->query_compress,this->left,
3118 /*pos5*/this->alignend_trim - this->left,
3119 /*pos3*/this->alignstart_trim - this->left,/*plusp*/false,
3120 this->genestrand);
3121
3122 Genome_mark_mismatches_ref(genomic_diff,querylength,this->query_compress,this->left,
3123 /*pos5*/querylength - this->queryend,
3124 /*pos3*/querylength - this->querystart,
3125 /*plusp*/false,this->genestrand);
3126
3127 if (output_type == STD_OUTPUT) {
3128 this->genomic_refdiff =
3129 embellish_genomic(this->genomic_refdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
3130 querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3131 extraleft,extraright,/*plusp*/false,this->genestrand);
3132 }
3133 }
3134
3135 if (output_type == SAM_OUTPUT) {
3136 this->genomic_refdiff =
3137 embellish_genomic_sam(this->genomic_refdiff,genomic_diff,/*not queryrc*/queryuc_ptr,this->querystart,this->queryend,
3138 querylength,this->mandatory_trim_querystart,this->mandatory_trim_queryend,
3139 extraleft,extraright,/*plusp*/false,this->genestrand);
3140 }
3141
3142 #if defined(HAVE_ALLOCA)
3143 FREEA(gbuffer);
3144 #else
3145 FREE(gbuffer);
3146 #endif
3147 }
3148
3149 return this->nmismatches_refdiff;
3150 }
3151
3152
3153 char *
Substring_genomic_sequence(int * seqlength,T this,Genome_T genome)3154 Substring_genomic_sequence (int *seqlength, T this, Genome_T genome) {
3155 char *gbuffer;
3156
3157 *seqlength = this->queryend - this->querystart;
3158 gbuffer = (char *) MALLOC((*seqlength+1) * sizeof(char));
3159 if (this->plusp == true) {
3160 Genome_fill_buffer_simple(genome,this->left + this->querystart,*seqlength,gbuffer);
3161 return gbuffer;
3162 } else {
3163 Genome_fill_buffer_simple(genome,this->left + (this->querylength - this->queryend),*seqlength,gbuffer);
3164 return make_complement_inplace(gbuffer,*seqlength);
3165 }
3166 }
3167
3168
3169 Univcoord_T
Substring_left(T this)3170 Substring_left (T this) {
3171 return this->left;
3172 }
3173
3174
3175 Univcoord_T
Substring_splicecoord_D(T this)3176 Substring_splicecoord_D (T this) {
3177 return this->splicecoord_D;
3178 }
3179
3180 Univcoord_T
Substring_splicecoord_A(T this)3181 Substring_splicecoord_A (T this) {
3182 return this->splicecoord_A;
3183 }
3184
3185
3186 /* Called only by samprint, for donor or acceptor substrings */
3187 char
Substring_chimera_strand(T this)3188 Substring_chimera_strand (T this) {
3189 int sensedir;
3190
3191 if ((sensedir = this->sensedir) != SENSE_ANTI) {
3192 if (this->plusp == true) {
3193 return '+';
3194 } else {
3195 return '-';
3196 }
3197 } else {
3198 if (this->plusp == true) {
3199 return '-';
3200 } else {
3201 return '+';
3202 }
3203 }
3204 }
3205
3206
3207 /* Called only by samprint */
3208 Chrpos_T
Substring_chr_splicecoord_D(T this,char donor_strand)3209 Substring_chr_splicecoord_D (T this, char donor_strand) {
3210 if (donor_strand == '+') {
3211 return (Chrpos_T) (this->splicecoord_D - this->chroffset);
3212 } else if (donor_strand == '-') {
3213 return (Chrpos_T) (this->splicecoord_D - this->chroffset + 1);
3214 } else {
3215 abort();
3216 }
3217 }
3218
3219 /* Called only by samprint */
3220 Chrpos_T
Substring_chr_splicecoord_A(T this,char acceptor_strand)3221 Substring_chr_splicecoord_A (T this, char acceptor_strand) {
3222 if (acceptor_strand == '+') {
3223 return (Chrpos_T) (this->splicecoord_A - this->chroffset + 1);
3224 } else if (acceptor_strand == '-') {
3225 return (Chrpos_T) (this->splicecoord_A - this->chroffset);
3226 } else {
3227 abort();
3228 }
3229 }
3230
3231 int
Substring_splicesitesD_knowni(T this)3232 Substring_splicesitesD_knowni (T this) {
3233 return this->splicesitesD_knowni;
3234 }
3235
3236 int
Substring_splicesitesA_knowni(T this)3237 Substring_splicesitesA_knowni (T this) {
3238 return this->splicesitesA_knowni;
3239 }
3240
3241 bool
Substring_plusp(T this)3242 Substring_plusp (T this) {
3243 return this->plusp;
3244 }
3245
3246 int
Substring_sensedir(T this)3247 Substring_sensedir (T this) {
3248 return this->sensedir;
3249 }
3250
3251 int
Substring_genestrand(T this)3252 Substring_genestrand (T this) {
3253 return this->genestrand;
3254 }
3255
3256 char *
Substring_genomic_bothdiff(T this)3257 Substring_genomic_bothdiff (T this) {
3258 return this->genomic_bothdiff;
3259 }
3260
3261 char *
Substring_genomic_refdiff(T this)3262 Substring_genomic_refdiff (T this) {
3263 return this->genomic_refdiff;
3264 }
3265
3266 int
Substring_nmismatches_bothdiff(T this)3267 Substring_nmismatches_bothdiff (T this) {
3268 return this->nmismatches_bothdiff;
3269 }
3270
3271 int
Substring_nmismatches_refdiff(T this)3272 Substring_nmismatches_refdiff (T this) {
3273 return this->nmismatches_refdiff;
3274 }
3275
3276 int
Substring_nmatches_to_trims(T this)3277 Substring_nmatches_to_trims (T this) {
3278 #if 0
3279 if (this->alts_ncoords > 0) {
3280 /* The entire substring is "trimmed" */
3281 return 0;
3282 } else {
3283 return this->queryend - this->querystart - this->nmismatches_bothdiff;
3284 }
3285 #else
3286 return this->nmatches_to_trims;
3287 #endif
3288 }
3289
3290 int
Substring_ref_nmatches_to_trims(T this)3291 Substring_ref_nmatches_to_trims (T this) {
3292 return this->ref_nmatches_to_trims;
3293 }
3294
3295 /* nmatches_to_trims plus amb_length */
3296 int
Substring_nmatches(T this)3297 Substring_nmatches (T this) {
3298 #if 0
3299 int amb_length;
3300
3301 if (this->alts_ncoords > 0) {
3302 return this->start_amb_length + this->end_amb_length;
3303 } else {
3304 amb_length = 0;
3305 if (this->trim_querystart_splicep == true) {
3306 amb_length += this->querystart;
3307 }
3308 if (this->trim_queryend_splicep == true) {
3309 amb_length += this->querylength - this->queryend;
3310 }
3311 return amb_length + /*nmatches_to_trims*/ (this->queryend - this->querystart - this->nmismatches_bothdiff);
3312 }
3313 #else
3314 return this->nmatches_plus_spliced_trims;
3315 #endif
3316 }
3317
3318 int
Substring_ref_nmatches(T this)3319 Substring_ref_nmatches (T this) {
3320 #if 0
3321 int amb_length;
3322
3323 if (this->alts_ncoords > 0) {
3324 return this->start_amb_length + this->end_amb_length;
3325 } else {
3326 amb_length = 0;
3327 if (this->trim_querystart_splicep == true) {
3328 amb_length += this->querystart;
3329 }
3330 if (this->trim_queryend_splicep == true) {
3331 amb_length += this->querylength - this->queryend;
3332 }
3333 return amb_length + /*nmatches_to_trims*/ (this->queryend - this->querystart - this->nmismatches_bothdiff);
3334 }
3335 #else
3336 return this->ref_nmatches_plus_spliced_trims;
3337 #endif
3338 }
3339
3340
3341 Endtype_T
Substring_start_endtype(T this)3342 Substring_start_endtype (T this) {
3343 return this->start_endtype;
3344 }
3345
3346 Endtype_T
Substring_end_endtype(T this)3347 Substring_end_endtype (T this) {
3348 return this->end_endtype;
3349 }
3350
3351 float
Substring_mapq_loglik(T this)3352 Substring_mapq_loglik (T this) {
3353 return this->mapq_loglik;
3354 }
3355
3356 int
Substring_trim_querystart(T this)3357 Substring_trim_querystart (T this) {
3358 return this->querystart;
3359 }
3360
3361 int
Substring_trim_queryend(T this)3362 Substring_trim_queryend (T this) {
3363 return this->querylength - this->queryend;
3364 }
3365
3366 bool
Substring_trim_querystart_splicep(T this)3367 Substring_trim_querystart_splicep (T this) {
3368 return this->trim_querystart_splicep;
3369 }
3370
3371 bool
Substring_trim_queryend_splicep(T this)3372 Substring_trim_queryend_splicep (T this) {
3373 return this->trim_queryend_splicep;
3374 }
3375
3376 int
Substring_mandatory_trim_querystart(T this)3377 Substring_mandatory_trim_querystart (T this) {
3378 return this->mandatory_trim_querystart;
3379 }
3380
3381 int
Substring_mandatory_trim_queryend(T this)3382 Substring_mandatory_trim_queryend (T this) {
3383 return this->mandatory_trim_queryend;
3384 }
3385
3386
3387 int
Substring_querystart(T this)3388 Substring_querystart (T this) {
3389 return this->querystart;
3390 }
3391
3392 int
Substring_querystart_pretrim(T this)3393 Substring_querystart_pretrim (T this) {
3394 return this->querystart_pretrim;
3395 }
3396
3397 int
Substring_queryend(T this)3398 Substring_queryend (T this) {
3399 return this->queryend;
3400 }
3401
3402 int
Substring_querylength(T this)3403 Substring_querylength (T this) {
3404 return this->querylength;
3405 }
3406
3407 int
Substring_querystart_chrbound(T this)3408 Substring_querystart_chrbound (T this) {
3409 return this->querystart_chrbound;
3410 }
3411
3412 int
Substring_queryend_chrbound(T this)3413 Substring_queryend_chrbound (T this) {
3414 return this->queryend_chrbound;
3415 }
3416
3417
3418 int
Substring_match_length(T this)3419 Substring_match_length (T this) {
3420 if (this == NULL) {
3421 return 0;
3422 } else {
3423 return this->queryend - this->querystart;
3424 }
3425 }
3426
3427 int
Substring_match_length_pretrim(T this)3428 Substring_match_length_pretrim (T this) {
3429 if (this == NULL) {
3430 return 0;
3431 } else {
3432 return this->queryend_pretrim - this->querystart_pretrim;
3433 }
3434 }
3435
3436
3437 /* Mapped and unmapped */
3438 int
Substring_amb_length(T this)3439 Substring_amb_length (T this) {
3440 return this->start_amb_length + this->end_amb_length;
3441 }
3442
3443
3444 /* Mapped and unmapped */
3445 int
Substring_start_amb_length(T this)3446 Substring_start_amb_length (T this) {
3447 return this->start_amb_length;
3448 }
3449
3450
3451 /* Mapped and unmapped */
3452 int
Substring_end_amb_length(T this)3453 Substring_end_amb_length (T this) {
3454 return this->end_amb_length;
3455 }
3456
3457
3458 bool
Substring_equal(T substring1,T substring2)3459 Substring_equal (T substring1, T substring2) {
3460 if (substring1->genomicstart != substring2->genomicstart) {
3461 return false;
3462 } else if (substring1->genomicend != substring2->genomicend) {
3463 return false;
3464 } else if (substring1->alignstart_trim != substring2->alignstart_trim) {
3465 return false;
3466 } else if (substring1->alignend_trim != substring2->alignend_trim) {
3467 return false;
3468 } else if (substring1->start_amb_length != substring2->start_amb_length) {
3469 return false;
3470 } else if (substring1->end_amb_length != substring2->end_amb_length) {
3471 return false;
3472 } else {
3473 return true;
3474 }
3475 }
3476
3477
3478 Chrpos_T
Substring_genomic_alignment_length(T this)3479 Substring_genomic_alignment_length (T this) {
3480 /* Don't check for alts, as long as we return 0 */
3481 /* assert(this->alts_ncoords == 0); */
3482
3483 if (this == NULL) {
3484 return (Chrpos_T) 0;
3485 } else if (this->alts_ncoords > 0) {
3486 return (Chrpos_T) 0;
3487 } else if (this->alignend_trim > this->alignstart_trim) {
3488 return this->alignend_trim - this->alignstart_trim;
3489 } else {
3490 return this->alignstart_trim - this->alignend_trim;
3491 }
3492 }
3493
3494
3495 Chrnum_T
Substring_chrnum(T this)3496 Substring_chrnum (T this) {
3497 return this->chrnum;
3498 }
3499
3500 Univcoord_T
Substring_chroffset(T this)3501 Substring_chroffset (T this) {
3502 return this->chroffset;
3503 }
3504
3505 Univcoord_T
Substring_chrhigh(T this)3506 Substring_chrhigh (T this) {
3507 return this->chrhigh;
3508 }
3509
3510 Chrpos_T
Substring_chrlength(T this)3511 Substring_chrlength (T this) {
3512 return this->chrlength;
3513 }
3514
3515 Chrpos_T
Substring_chrpos_low(T this)3516 Substring_chrpos_low (T this) {
3517 /* Need to check for alts, because then alignstart_trim and alignend_trim are set to 0 */
3518 assert(this->alts_ncoords == 0);
3519
3520 if (this->plusp == true) {
3521 return this->alignstart_trim - this->chroffset;
3522 } else {
3523 return this->alignend_trim - this->chroffset;
3524 }
3525 }
3526
3527 Chrpos_T
Substring_chrpos_high(T this)3528 Substring_chrpos_high (T this) {
3529 /* Need to check for alts, because then alignstart_trim and alignend_trim are set to 0 */
3530 assert(this->alts_ncoords == 0);
3531
3532 if (this->plusp == true) {
3533 return this->alignend_trim - this->chroffset;
3534 } else {
3535 return this->alignstart_trim - this->chroffset;
3536 }
3537 }
3538
3539
3540 Chrpos_T
Substring_alignstart_trim_chr(T this)3541 Substring_alignstart_trim_chr (T this) {
3542 /* Need to check for alts, because then alignstart_trim and alignend_trim are set to 0 */
3543 assert(this->alts_ncoords == 0);
3544
3545 /* Previously had provision for alts_coords */
3546 return this->alignstart_trim - this->chroffset;
3547 }
3548
3549 Chrpos_T
Substring_alignend_trim_chr(T this)3550 Substring_alignend_trim_chr (T this) {
3551 /* Need to check for alts, because then alignstart_trim and alignend_trim are set to 0 */
3552 assert(this->alts_ncoords == 0);
3553
3554 /* Previously had provision for alts_coords */
3555 return this->alignend_trim - this->chroffset;
3556 }
3557
3558 Univcoord_T
Substring_alignstart_trim(T this)3559 Substring_alignstart_trim (T this) {
3560 return this->alignstart_trim;
3561 }
3562
3563 Univcoord_T
Substring_alignend_trim(T this)3564 Substring_alignend_trim (T this) {
3565 return this->alignend_trim;
3566 }
3567
3568
3569 Univcoord_T
Substring_left_genomicseg(T this)3570 Substring_left_genomicseg (T this) {
3571 return this->left;
3572 }
3573
3574 Univcoord_T
Substring_genomicstart(T this)3575 Substring_genomicstart (T this) {
3576 return this->genomicstart;
3577 }
3578
3579 Univcoord_T
Substring_genomicend(T this)3580 Substring_genomicend (T this) {
3581 return this->genomicend;
3582 }
3583
3584 double
Substring_amb_prob(T this)3585 Substring_amb_prob (T this) {
3586 if (this->amb_type == DON) {
3587 return this->siteD_prob;
3588 } else if (this->amb_type == ACC) {
3589 return this->siteA_prob;
3590 } else {
3591 return 0.0;
3592 }
3593 }
3594
3595
3596 double
Substring_amb_donor_prob(T this)3597 Substring_amb_donor_prob (T this) {
3598 double max;
3599 int i;
3600
3601 if (this->alts_ncoords == 0) {
3602 return this->siteD_prob;
3603 } else if (this->amb_type == DON) {
3604 max = this->alts_probs[0];
3605 for (i = 1; i < this->alts_ncoords; i++) {
3606 if (this->alts_probs[i] > max) {
3607 max = this->alts_probs[i];
3608 }
3609 }
3610 return max;
3611 } else {
3612 return this->siteD_prob;
3613 }
3614 }
3615
3616 double
Substring_amb_acceptor_prob(T this)3617 Substring_amb_acceptor_prob (T this) {
3618 double max;
3619 int i;
3620
3621 if (this->alts_ncoords == 0) {
3622 return this->siteA_prob;
3623 } else if (this->amb_type == ACC) {
3624 max = this->alts_probs[0];
3625 for (i = 1; i < this->alts_ncoords; i++) {
3626 if (this->alts_probs[i] > max) {
3627 max = this->alts_probs[i];
3628 }
3629 }
3630 return max;
3631 } else {
3632 return this->siteA_prob;
3633 }
3634 }
3635
3636
3637
3638 double
Substring_siteD_prob(T this)3639 Substring_siteD_prob (T this) {
3640 return this->siteD_prob;
3641 }
3642
3643 double
Substring_siteA_prob(T this)3644 Substring_siteA_prob (T this) {
3645 return this->siteA_prob;
3646 }
3647
3648 int
Substring_siteD_pos(T this)3649 Substring_siteD_pos (T this) {
3650 return this->siteD_pos;
3651 }
3652
3653 int
Substring_siteA_pos(T this)3654 Substring_siteA_pos (T this) {
3655 return this->siteA_pos;
3656 }
3657
3658 int
Substring_siteN_pos(T this)3659 Substring_siteN_pos (T this) {
3660 return this->siteN_pos;
3661 }
3662
3663
3664 bool
Substring_ambiguous_p(T this)3665 Substring_ambiguous_p (T this) {
3666 if (this->alts_ncoords > 0) {
3667 return false;
3668 } else if (this->amb_type == END) {
3669 return false;
3670 } else {
3671 return true;
3672 }
3673 }
3674
3675 bool
Substring_has_alts_p(T this)3676 Substring_has_alts_p (T this) {
3677 if (this->alts_ncoords > 0) {
3678 return true;
3679 } else {
3680 return false;
3681 }
3682 }
3683
3684 int
Substring_alts_ncoords(T this)3685 Substring_alts_ncoords (T this) {
3686 return this->alts_ncoords;
3687 }
3688
3689 Univcoord_T *
Substring_alts_coords(T this)3690 Substring_alts_coords (T this) {
3691 return this->alts_coords;
3692 }
3693
3694 double
Substring_alts_common_prob(T this)3695 Substring_alts_common_prob (T this) {
3696 if (this->amb_type == DON) {
3697 return this->siteA_prob;
3698 } else if (this->amb_type == ACC) {
3699 return this->siteD_prob;
3700 } else {
3701 return 0.0;
3702 }
3703 }
3704
3705 /* Debugging procedure */
3706 void
Substring_print_alts_coords(T this)3707 Substring_print_alts_coords (T this) {
3708 int i;
3709
3710 #ifdef LARGE_GENOMES
3711 for (i = 0; i < this->alts_ncoords; i++) {
3712 printf(" %llu",this->alts_coords[i] - this->chroffset);
3713 }
3714 #else
3715 for (i = 0; i < this->alts_ncoords; i++) {
3716 printf(" %u",this->alts_coords[i] - this->chroffset);
3717 }
3718 #endif
3719
3720 return;
3721 }
3722
3723
3724 int *
Substring_alts_nmismatches(T this)3725 Substring_alts_nmismatches (T this) {
3726 return this->alts_nmismatches;
3727 }
3728
3729 int *
Substring_alts_ref_nmismatches(T this)3730 Substring_alts_ref_nmismatches (T this) {
3731 return this->alts_ref_nmismatches;
3732 }
3733
3734 /* circularpos measures query distance from SAM chrlow to origin */
3735 int
Substring_circularpos(T this)3736 Substring_circularpos (T this) {
3737 if (this == NULL) {
3738 return -1;
3739
3740 } else if (this->plusp == true) {
3741 debug12(printf("Substring circularpos plus: %u..%u vs %u+%u\n",
3742 this->alignstart_trim,this->alignend_trim,this->chroffset,this->chrlength));
3743 if (this->alignstart_trim > this->chroffset + this->chrlength) {
3744 debug12(printf("all of substring is in the upper part, so previous substring was in lower part\n"));
3745 debug12(printf("returning querystart %d\n",this->querystart));
3746 return this->querystart;
3747
3748 } else if (this->alignend_trim > this->chroffset + this->chrlength) {
3749 debug12(printf("substring straddles the circular origin\n"));
3750 debug12(printf("returning querystart %d + chroffset %u + chrlength %u - alignstart_trim %d\n",
3751 this->querystart,this->chroffset,this->chrlength,this->alignstart_trim));
3752 /* return (this->querystart - this->trim_querystart) + (this->chroffset + this->chrlength) - this->alignstart; */
3753 return this->querystart + (this->chroffset + this->chrlength) - this->alignstart_trim;
3754
3755 } else {
3756 return -1;
3757 }
3758
3759 } else {
3760 debug12(printf("Substring circularpos minus: %u..%u vs %u+%u\n",
3761 this->alignstart_trim,this->alignend_trim,this->chroffset,this->chrlength));
3762 if (this->alignend_trim > this->chroffset + this->chrlength) {
3763 debug12(printf("all of substring is in the upper part, so previous substring was in lower part\n"));
3764 debug12(printf("returning querylength %d - queryend %d\n",this->querylength,this->queryend));
3765 return this->querylength - this->queryend;
3766
3767 } else if (this->alignstart_trim > this->chroffset + this->chrlength) {
3768 debug12(printf("returning querylength %d - queryend %d + chroffset %u + chrlength %u - alignend_trim %d\n",
3769 this->querylength,this->queryend,this->chroffset,this->chrlength,this->alignend_trim));
3770 /* return ((this->querylength - this->trim_queryend) - this->queryend) + (this->chroffset + this->chrlength) - this->alignend; */
3771 return (this->querylength - this->queryend) + (this->chroffset + this->chrlength) - this->alignend_trim;
3772
3773 } else {
3774 return -1;
3775 }
3776 }
3777 }
3778
3779
3780 T
Substring_copy(T old)3781 Substring_copy (T old) {
3782 T new;
3783
3784 if (old == NULL) {
3785 return (T) NULL;
3786 } else {
3787 new = (T) MALLOC_OUT(sizeof(*new));
3788 debug2(printf("substring %p is a copy of %p\n",new,old));
3789
3790 new->nmismatches_bothdiff = old->nmismatches_bothdiff;
3791 new->nmismatches_refdiff = old->nmismatches_refdiff;
3792
3793 new->nmatches_plus_spliced_trims = old->nmatches_plus_spliced_trims;
3794 new->nmatches_to_trims = old->nmatches_to_trims;
3795
3796 new->ref_nmatches_plus_spliced_trims = old->ref_nmatches_plus_spliced_trims;
3797 new->ref_nmatches_to_trims = old->ref_nmatches_to_trims;
3798
3799 new->mandatory_trim_querystart = old->mandatory_trim_querystart;
3800 new->mandatory_trim_queryend = old->mandatory_trim_queryend;
3801 new->start_amb_length = old->start_amb_length;
3802 new->end_amb_length = old->end_amb_length;
3803
3804 new->trim_querystart_splicep = old->trim_querystart_splicep;
3805 new->trim_queryend_splicep = old->trim_queryend_splicep;
3806
3807 new->chrnum = old->chrnum;
3808 new->chroffset = old->chroffset;
3809 new->chrhigh = old->chrhigh;
3810 new->chrlength = old->chrlength;
3811
3812 new->left = old->left;
3813 new->genomicstart = old->genomicstart;
3814 new->genomicend = old->genomicend;
3815
3816 new->start_endtype = old->start_endtype;
3817 new->end_endtype = old->end_endtype;
3818
3819 new->querystart_pretrim = old->querystart_pretrim;
3820 new->queryend_pretrim = old->queryend_pretrim;
3821 new->querystart = old->querystart;
3822 new->queryend = old->queryend;
3823 new->querystart_chrbound = old->querystart_chrbound;
3824 new->queryend_chrbound = old->queryend_chrbound;
3825
3826 new->amb_splice_pos = old->amb_splice_pos;
3827 new->querylength = old->querylength;
3828
3829 new->alignstart_trim = old->alignstart_trim;
3830 new->alignend_trim = old->alignend_trim;
3831
3832 new->plusp = old->plusp;
3833 new->genestrand = old->genestrand;
3834 new->query_compress = old->query_compress;
3835
3836 if (old->genomic_bothdiff == NULL) {
3837 new->genomic_bothdiff = (char *) NULL;
3838 new->genomic_refdiff = (char *) NULL;
3839
3840 } else if (old->genomic_refdiff == old->genomic_bothdiff) {
3841 assert((int) strlen(old->genomic_bothdiff) == old->querylength);
3842 new->genomic_bothdiff = (char *) CALLOC_OUT(strlen(old->genomic_bothdiff)+1,sizeof(char));
3843 strcpy(new->genomic_bothdiff,old->genomic_bothdiff);
3844 new->genomic_refdiff = new->genomic_bothdiff;
3845
3846 } else {
3847 assert((int) strlen(old->genomic_bothdiff) == old->querylength);
3848 assert((int) strlen(old->genomic_refdiff) == old->querylength);
3849 new->genomic_bothdiff = (char *) CALLOC_OUT(strlen(old->genomic_bothdiff)+1,sizeof(char));
3850 strcpy(new->genomic_bothdiff,old->genomic_bothdiff);
3851 new->genomic_refdiff = (char *) CALLOC_OUT(strlen(old->genomic_refdiff)+1,sizeof(char));
3852 strcpy(new->genomic_refdiff,old->genomic_refdiff);
3853 }
3854
3855 new->mapq_loglik = old->mapq_loglik;
3856
3857 new->sensedir = old->sensedir;
3858
3859 new->splicecoord_D = old->splicecoord_D;
3860 new->splicesitesD_knowni = old->splicesitesD_knowni;
3861 new->siteD_pos = old->siteD_pos;
3862 new->siteD_prob = old->siteD_prob;
3863
3864 new->splicecoord_A = old->splicecoord_A;
3865 new->splicesitesA_knowni = old->splicesitesA_knowni;
3866 new->siteA_pos = old->siteA_pos;
3867 new->siteA_prob = old->siteA_prob;
3868
3869 new->siteN_pos = old->siteN_pos;
3870
3871 if (old->alts_ncoords == 0) {
3872 new->alts_ncoords = 0;
3873 new->alts_coords = (Univcoord_T *) NULL;
3874 new->alts_knowni = (int *) NULL;
3875 new->alts_nmismatches = (int *) NULL;
3876 new->alts_ref_nmismatches = (int *) NULL;
3877 new->alts_probs = (double *) NULL;
3878 } else {
3879 new->alts_ncoords = old->alts_ncoords;
3880 new->alts_coords = (Univcoord_T *) MALLOC_OUT(old->alts_ncoords * sizeof(Univcoord_T));
3881 new->alts_knowni = (int *) MALLOC_OUT(old->alts_ncoords * sizeof(int));
3882 new->alts_nmismatches = (int *) MALLOC_OUT(old->alts_ncoords * sizeof(int));
3883 new->alts_ref_nmismatches = (int *) MALLOC_OUT(old->alts_ncoords * sizeof(int));
3884 new->alts_probs = (double *) MALLOC_OUT(old->alts_ncoords * sizeof(double));
3885
3886 memcpy(new->alts_coords,old->alts_coords,old->alts_ncoords * sizeof(Univcoord_T));
3887 memcpy(new->alts_knowni,old->alts_knowni,old->alts_ncoords * sizeof(int));
3888 memcpy(new->alts_nmismatches,old->alts_nmismatches,old->alts_ncoords * sizeof(int));
3889 memcpy(new->alts_ref_nmismatches,old->alts_ref_nmismatches,old->alts_ncoords * sizeof(int));
3890 memcpy(new->alts_probs,old->alts_probs,old->alts_ncoords * sizeof(double));
3891 }
3892 new->amb_type = old->amb_type;
3893
3894 return new;
3895 }
3896 }
3897
3898
3899 /* nmismatches is between donor_pos and endpos */
3900 T
Substring_new_donor(int nmismatches,int ref_nmismatches,Univcoord_T donor_coord,int donor_knowni,int querystart,int queryend,int sitepos,double donor_prob,bool splice_querystart_p,Splicetype_T splicetype_querystart,double ambig_prob_querystart,bool splice_queryend_p,Splicetype_T splicetype_queryend,double ambig_prob_queryend,Univcoord_T left,Compress_T query_compress,int querylength,bool plusp,int genestrand,int sensedir,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)3901 Substring_new_donor (int nmismatches, int ref_nmismatches, Univcoord_T donor_coord, int donor_knowni,
3902 int querystart, int queryend, int sitepos, double donor_prob,
3903 bool splice_querystart_p, Splicetype_T splicetype_querystart, double ambig_prob_querystart,
3904 bool splice_queryend_p, Splicetype_T splicetype_queryend, double ambig_prob_queryend,
3905
3906 Univcoord_T left, Compress_T query_compress, int querylength, bool plusp, int genestrand, int sensedir,
3907 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
3908 T new;
3909
3910 /* Previously checked if left >= chroffset + chrlength to exclude
3911 the duplicate length, but now excluding all translocations to
3912 circular chromosomes */
3913
3914 if (chroffset + chrlength < chrhigh) {
3915 /* Don't splice to circular chromosomes */
3916 return (T) NULL;
3917
3918 } else if ((new = Substring_new(nmismatches,ref_nmismatches,left,querystart,queryend,querylength,
3919 plusp,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
3920 splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
3921 splice_queryend_p,splicetype_queryend,ambig_prob_queryend,sensedir)) == NULL) {
3922 return (T) NULL;
3923 }
3924
3925 debug2(printf("Making new donor with splicesites_i %d, coord %u and left %u, plusp %d, sensedir %d, query %d..%d, trim %d..%d\n",
3926 donor_knowni,donor_coord,left,plusp,sensedir,new->querystart,new->queryend,
3927 new->querystart,querylength - new->queryend));
3928 debug2(printf("Setting siteD_prob to be %f\n",donor_prob));
3929
3930 new->splicecoord_D = donor_coord;
3931 new->splicesitesD_knowni = donor_knowni;
3932
3933 new->sensedir = sensedir;
3934
3935 new->siteD_pos = sitepos; /* not a qpos */
3936 new->siteD_prob = donor_prob;
3937
3938 return new;
3939 }
3940
3941
3942 /* nmismatches is between acceptor_pos and endpos */
3943 T
Substring_new_acceptor(int nmismatches,int ref_nmismatches,Univcoord_T acceptor_coord,int acceptor_knowni,int querystart,int queryend,int sitepos,double acceptor_prob,bool splice_querystart_p,Splicetype_T splicetype_querystart,double ambig_prob_querystart,bool splice_queryend_p,Splicetype_T splicetype_queryend,double ambig_prob_queryend,Univcoord_T left,Compress_T query_compress,int querylength,bool plusp,int genestrand,int sensedir,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)3944 Substring_new_acceptor (int nmismatches, int ref_nmismatches, Univcoord_T acceptor_coord, int acceptor_knowni,
3945 int querystart, int queryend, int sitepos, double acceptor_prob,
3946 bool splice_querystart_p, Splicetype_T splicetype_querystart, double ambig_prob_querystart,
3947 bool splice_queryend_p, Splicetype_T splicetype_queryend, double ambig_prob_queryend,
3948
3949 Univcoord_T left, Compress_T query_compress, int querylength, bool plusp, int genestrand, int sensedir,
3950 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
3951 T new;
3952
3953 /* Previously checked if left >= chroffset + chrlength to exclude
3954 the duplicate length, but now excluding all translocations to
3955 circular chromosomes */
3956
3957 if (chroffset + chrlength < chrhigh) {
3958 /* Don't splice to circular chromosomes */
3959 return (T) NULL;
3960
3961 } else if ((new = Substring_new(nmismatches,ref_nmismatches,left,querystart,queryend,querylength,
3962 plusp,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
3963 splice_querystart_p,splicetype_querystart,ambig_prob_querystart,
3964 splice_queryend_p,splicetype_queryend,ambig_prob_queryend,sensedir)) == NULL) {
3965 return (T) NULL;
3966 }
3967
3968 debug2(printf("Making new acceptor with splicesites_i %d, coord %u and left %u, plusp %d, sensedir %d, query %d..%d, trim %d..%d\n",
3969 acceptor_knowni,acceptor_coord,left,plusp,sensedir,new->querystart,new->queryend,
3970 new->querystart,querylength - new->queryend));
3971 debug2(printf("Setting siteA_prob to be %f\n",acceptor_prob));
3972
3973 new->splicecoord_A = acceptor_coord;
3974 new->splicesitesA_knowni = acceptor_knowni;
3975
3976 new->sensedir = sensedir;
3977
3978 new->siteA_pos = sitepos; /* not a qpos */
3979 new->siteA_prob = acceptor_prob;
3980
3981 return new;
3982 }
3983
3984
3985 void
Substring_label_donor(T this,int splice_pos,double donor_prob,int sensedir_distant_guess)3986 Substring_label_donor (T this, int splice_pos, double donor_prob, int sensedir_distant_guess) {
3987 if (this->plusp == true) {
3988 this->splicecoord_D = this->left + splice_pos;
3989 } else {
3990 this->splicecoord_D = this->left + this->querylength - splice_pos;
3991 }
3992 /* new->splicesitesD_knowni = donor_knowni; */
3993 /* new->sensedir = sensedir; */
3994 this->sensedir = sensedir_distant_guess;
3995
3996 this->siteD_pos = splice_pos; /* not a qpos */
3997 this->siteD_prob = donor_prob;
3998
3999 /* printf("Labeling donor with splicecoord %llu and prob %f\n",this->splicecoord_D,donor_prob); */
4000
4001 return;
4002 }
4003
4004
4005 void
Substring_label_acceptor(T this,int splice_pos,double acceptor_prob,int sensedir_distant_guess)4006 Substring_label_acceptor (T this, int splice_pos, double acceptor_prob, int sensedir_distant_guess) {
4007 if (this->plusp == true) {
4008 this->splicecoord_A = this->left + splice_pos;
4009 } else {
4010 this->splicecoord_A = this->left + this->querylength - splice_pos;
4011 }
4012 /* new->splicesitesA_knowni = acceptor_knowni; */
4013 /* new->sensedir = sensedir; */
4014 this->sensedir = sensedir_distant_guess;
4015
4016 this->siteA_pos = splice_pos; /* not a qpos */
4017 this->siteA_prob = acceptor_prob;
4018
4019 /* printf("Labeling acceptor with splicecoord %llu and prob %f\n",this->splicecoord_A,acceptor_prob); */
4020
4021 return;
4022 }
4023
4024
4025
4026 /* Used for distant DNA splicing. Uses querystart and queryend, not qstart and qend. middlepos is queryend */
4027 T
Substring_new_startfrag(int nmismatches,int querystart,int queryend,Univcoord_T left,Compress_T query_compress,int querylength,bool plusp,int genestrand,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)4028 Substring_new_startfrag (int nmismatches, int querystart, int queryend,
4029 Univcoord_T left, Compress_T query_compress, int querylength, bool plusp, int genestrand,
4030 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
4031 T new;
4032 int ref_nmismatches = nmismatches;
4033
4034 /* Previously checked if left >= chroffset + chrlength to exclude
4035 the duplicate length, but now excluding all translocations to
4036 circular chromosomes */
4037
4038 if (chroffset + chrlength < chrhigh) {
4039 /* Don't splice to circular chromosomes */
4040 return (T) NULL;
4041
4042 } else if ((new = Substring_new(nmismatches,ref_nmismatches,left,querystart,queryend,querylength,
4043 plusp,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
4044 /*splice_querystart_p*/false,/*splicetype_querystart*/NO_SPLICE,/*ambig_prob_querystart*/0.0,
4045 /*splice_queryend_p*/false,/*splicetype_queryend*/NO_SPLICE,/*ambig_prob_queryend*/0.0,
4046 /*sensedir*/SENSE_NULL)) == NULL) {
4047 return (T) NULL;
4048
4049 } else {
4050 debug2(printf("Making new startfrag with left %u, plusp %d, query %d..%d, genome %u..%u\n",
4051 left,plusp,querystart,queryend,
4052 Substring_alignstart_trim(new),Substring_alignend_trim(new)));
4053 new->siteN_pos = queryend;
4054 return new;
4055 }
4056 }
4057
4058
4059 /* Used for distant DNA splicing. Uses querystart and queryend, not qstart and qend. middlepos is querystart */
4060 T
Substring_new_endfrag(int nmismatches,int querystart,int queryend,Univcoord_T left,Compress_T query_compress,int querylength,bool plusp,int genestrand,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength)4061 Substring_new_endfrag (int nmismatches, int querystart, int queryend,
4062 Univcoord_T left, Compress_T query_compress, int querylength, bool plusp, int genestrand,
4063 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength) {
4064 T new;
4065 int ref_nmismatches = nmismatches;
4066
4067 /* Previously checked if left >= chroffset + chrlength to exclude
4068 the duplicate length, but now excluding all translocations to
4069 circular chromosomes */
4070
4071 if (chroffset + chrlength < chrhigh) {
4072 /* Don't splice to circular chromosomes */
4073 return (T) NULL;
4074
4075 } else if ((new = Substring_new(nmismatches,ref_nmismatches,left,querystart,queryend,querylength,
4076 plusp,genestrand,query_compress,chrnum,chroffset,chrhigh,chrlength,
4077 /*splice_querystart_p*/false,/*splicetype_querystart*/NO_SPLICE,/*ambig_prob_querystart*/0.0,
4078 /*splice_queryend_p*/false,/*splicetype_queryend*/NO_SPLICE,/*ambig_prob_queryend*/0.0,
4079 /*sensedir*/SENSE_NULL)) == NULL) {
4080 return (T) NULL;
4081
4082 } else {
4083 debug2(printf("Making new endfrag with left %u, plusp %d, %d..%d, genome %u..%u\n",
4084 left,plusp,querystart,queryend,
4085 Substring_alignstart_trim(new),Substring_alignend_trim(new)));
4086 new->siteN_pos = querystart;
4087 return new;
4088 }
4089 }
4090
4091
4092 T
Substring_trim_startfrag(int nmismatches,T old,int new_queryend)4093 Substring_trim_startfrag (int nmismatches, T old, int new_queryend) {
4094 T new;
4095 int ref_nmismatches = nmismatches;
4096
4097 debug2(printf("Entered Substring_trim_startfrag with %d..%d -> %d..%d\n",
4098 old->querystart,old->queryend,old->querystart,new_queryend));
4099
4100 if ((new = Substring_new(nmismatches,ref_nmismatches,old->left,old->querystart,new_queryend,old->querylength,
4101 old->plusp,old->genestrand,old->query_compress,
4102 old->chrnum,old->chroffset,old->chrhigh,old->chrlength,
4103 /*splice_querystart_p*/false,/*splicetype_querystart*/NO_SPLICE,/*ambig_prob_querystart*/0.0,
4104 /*splice_queryend_p*/false,/*splicetype_queryend*/NO_SPLICE,/*ambig_prob_queryend*/0.0,
4105 /*sensedir*/SENSE_NULL)) == NULL) {
4106 debug2(printf("Returning NULL\n"));
4107 return (T) NULL;
4108
4109 } else {
4110 debug2(printf("Making new startfrag with left %u, plusp %d, query %d..%d, genome %u..%u\n",
4111 old->left,old->plusp,old->querystart,new_queryend,
4112 Substring_alignstart_trim(new),Substring_alignend_trim(new)));
4113 new->siteN_pos = new_queryend;
4114 return new;
4115 }
4116 }
4117
4118 T
Substring_trim_endfrag(int nmismatches,T old,int new_querystart)4119 Substring_trim_endfrag (int nmismatches, T old, int new_querystart) {
4120 T new;
4121 int ref_nmismatches = nmismatches;
4122
4123 debug2(printf("Entered Substring_trim_endfrag with %d..%d -> %d..%d\n",
4124 old->querystart,old->queryend,new_querystart,old->queryend));
4125
4126 if ((new = Substring_new(nmismatches,ref_nmismatches,old->left,new_querystart,old->queryend,old->querylength,
4127 old->plusp,old->genestrand,old->query_compress,
4128 old->chrnum,old->chroffset,old->chrhigh,old->chrlength,
4129 /*splice_querystart_p*/false,/*splicetype_querystart*/NO_SPLICE,/*ambig_prob_querystart*/0.0,
4130 /*splice_queryend_p*/false,/*splicetype_queryend*/NO_SPLICE,/*ambig_prob_queryend*/0.0,
4131 /*sensedir*/SENSE_NULL)) == NULL) {
4132 debug2(printf("Returning NULL\n"));
4133 return (T) NULL;
4134
4135 } else {
4136 debug2(printf("Making new endfrag with left %u, plusp %d, %d..%d, genome %u..%u\n",
4137 old->left,old->plusp,new_querystart,old->queryend,
4138 Substring_alignstart_trim(new),Substring_alignend_trim(new)));
4139 new->siteN_pos = new_querystart;
4140 return new;
4141 }
4142 }
4143
4144
4145
4146 static int
ascending_siteD_pos_cmp(const void * a,const void * b)4147 ascending_siteD_pos_cmp (const void *a, const void *b) {
4148 T x = * (T *) a;
4149 T y = * (T *) b;
4150
4151 if (x->siteD_pos < y->siteD_pos) {
4152 return -1;
4153 } else if (x->siteD_pos > y->siteD_pos) {
4154 return +1;
4155 } else if (x->genomicstart < y->genomicstart) {
4156 return -1;
4157 } else if (x->genomicstart > y->genomicstart) {
4158 return +1;
4159 } else {
4160 return 0;
4161 }
4162 }
4163
4164 static int
ascending_siteA_pos_cmp(const void * a,const void * b)4165 ascending_siteA_pos_cmp (const void *a, const void *b) {
4166 T x = * (T *) a;
4167 T y = * (T *) b;
4168
4169 if (x->siteA_pos < y->siteA_pos) {
4170 return -1;
4171 } else if (x->siteA_pos > y->siteA_pos) {
4172 return +1;
4173 } else if (x->genomicstart < y->genomicstart) {
4174 return -1;
4175 } else if (x->genomicstart > y->genomicstart) {
4176 return +1;
4177 } else {
4178 return 0;
4179 }
4180 }
4181
4182 static int
ascending_siteN_pos_cmp(const void * a,const void * b)4183 ascending_siteN_pos_cmp (const void *a, const void *b) {
4184 T x = * (T *) a;
4185 T y = * (T *) b;
4186
4187 if (x->siteN_pos < y->siteN_pos) {
4188 return -1;
4189 } else if (x->siteN_pos > y->siteN_pos) {
4190 return +1;
4191 } else if (x->genomicstart < y->genomicstart) {
4192 return -1;
4193 } else if (x->genomicstart > y->genomicstart) {
4194 return +1;
4195 } else {
4196 return 0;
4197 }
4198 }
4199
4200 static int
descending_siteD_pos_cmp(const void * a,const void * b)4201 descending_siteD_pos_cmp (const void *a, const void *b) {
4202 T x = * (T *) a;
4203 T y = * (T *) b;
4204
4205 if (x->siteD_pos < y->siteD_pos) {
4206 return -1;
4207 } else if (x->siteD_pos > y->siteD_pos) {
4208 return +1;
4209 } else if (x->genomicstart > y->genomicstart) {
4210 return -1;
4211 } else if (x->genomicstart < y->genomicstart) {
4212 return +1;
4213 } else {
4214 return 0;
4215 }
4216 }
4217
4218 static int
descending_siteA_pos_cmp(const void * a,const void * b)4219 descending_siteA_pos_cmp (const void *a, const void *b) {
4220 T x = * (T *) a;
4221 T y = * (T *) b;
4222
4223 if (x->siteA_pos < y->siteA_pos) {
4224 return -1;
4225 } else if (x->siteA_pos > y->siteA_pos) {
4226 return +1;
4227 } else if (x->genomicstart > y->genomicstart) {
4228 return -1;
4229 } else if (x->genomicstart < y->genomicstart) {
4230 return +1;
4231 } else {
4232 return 0;
4233 }
4234 }
4235
4236 static int
descending_siteN_pos_cmp(const void * a,const void * b)4237 descending_siteN_pos_cmp (const void *a, const void *b) {
4238 T x = * (T *) a;
4239 T y = * (T *) b;
4240
4241 if (x->siteN_pos < y->siteN_pos) {
4242 return -1;
4243 } else if (x->siteN_pos > y->siteN_pos) {
4244 return +1;
4245 } else if (x->genomicstart > y->genomicstart) {
4246 return -1;
4247 } else if (x->genomicstart < y->genomicstart) {
4248 return +1;
4249 } else {
4250 return 0;
4251 }
4252 }
4253
4254 List_T
Substring_sort_siteD_halves(List_T hitlist,Listpool_T listpool,bool ascendingp)4255 Substring_sort_siteD_halves (List_T hitlist, Listpool_T listpool, bool ascendingp) {
4256 List_T sorted = NULL;
4257 T x, *hits;
4258 int n, i, j;
4259 bool *eliminate;
4260
4261 n = List_length(hitlist);
4262 debug(printf("Checking %d spliceends for duplicates...",n));
4263 if (n == 0) {
4264 debug(printf("\n"));
4265 return NULL;
4266 }
4267
4268 hits = (T *) MALLOCA(n * sizeof(T));
4269 List_fill_array((void **) hits,hitlist);
4270 /* List_free(&hitlist); -- allocated by Listpool_push */
4271
4272 if (ascendingp == true) {
4273 qsort(hits,n,sizeof(T),ascending_siteD_pos_cmp);
4274 } else {
4275 qsort(hits,n,sizeof(T),descending_siteD_pos_cmp);
4276 }
4277
4278 /* Check for duplicates */
4279 eliminate = (bool *) CALLOCA(n,sizeof(bool));
4280 for (i = 0; i < n; i++) {
4281 x = hits[i];
4282 j = i+1;
4283 while (j < n && hits[j]->siteD_pos == x->siteD_pos && hits[j]->genomicstart == x->genomicstart) {
4284 eliminate[j] = true;
4285 j++;
4286 }
4287 }
4288
4289 debug(j = 0);
4290 for (i = n-1; i >= 0; i--) {
4291 x = hits[i];
4292 if (eliminate[i] == false) {
4293 sorted = Listpool_push(sorted,listpool,(void *) x);
4294 } else {
4295 Substring_free(&x);
4296 debug(j++);
4297 }
4298 }
4299 debug(printf("%d eliminated\n",j));
4300
4301 FREEA(hits);
4302 FREEA(eliminate);
4303
4304 return sorted;
4305 }
4306
4307 List_T
Substring_sort_siteA_halves(List_T hitlist,Listpool_T listpool,bool ascendingp)4308 Substring_sort_siteA_halves (List_T hitlist, Listpool_T listpool, bool ascendingp) {
4309 List_T sorted = NULL;
4310 T x, *hits;
4311 int n, i, j;
4312 bool *eliminate;
4313
4314 n = List_length(hitlist);
4315 debug(printf("Checking %d spliceends for duplicates...",n));
4316 if (n == 0) {
4317 debug(printf("\n"));
4318 return NULL;
4319 }
4320
4321 hits = (T *) MALLOCA(n * sizeof(T));
4322 List_fill_array((void **) hits,hitlist);
4323 /* List_free(&hitlist); -- allocated by Listpool_push */
4324
4325 if (ascendingp == true) {
4326 qsort(hits,n,sizeof(T),ascending_siteA_pos_cmp);
4327 } else {
4328 qsort(hits,n,sizeof(T),descending_siteA_pos_cmp);
4329 }
4330
4331 /* Check for duplicates */
4332 eliminate = (bool *) CALLOCA(n,sizeof(bool));
4333 for (i = 0; i < n; i++) {
4334 x = hits[i];
4335 j = i+1;
4336 while (j < n && hits[j]->siteA_pos == x->siteA_pos && hits[j]->genomicstart == x->genomicstart) {
4337 eliminate[j] = true;
4338 j++;
4339 }
4340 }
4341
4342 debug(j = 0);
4343 for (i = n-1; i >= 0; i--) {
4344 x = hits[i];
4345 if (eliminate[i] == false) {
4346 sorted = Listpool_push(sorted,listpool,(void *) x);
4347 } else {
4348 Substring_free(&x);
4349 debug(j++);
4350 }
4351 }
4352 debug(printf("%d eliminated\n",j));
4353
4354 FREEA(hits);
4355 FREEA(eliminate);
4356
4357 return sorted;
4358 }
4359
4360 List_T
Substring_sort_siteN_halves(List_T hitlist,Listpool_T listpool,bool ascendingp)4361 Substring_sort_siteN_halves (List_T hitlist, Listpool_T listpool, bool ascendingp) {
4362 List_T sorted = NULL;
4363 T x, *hits;
4364 int n, i, j;
4365 bool *eliminate;
4366
4367 n = List_length(hitlist);
4368 debug(printf("Checking %d spliceends for duplicates...",n));
4369 if (n == 0) {
4370 debug(printf("\n"));
4371 return NULL;
4372 }
4373
4374 hits = (T *) MALLOCA(n * sizeof(T));
4375 List_fill_array((void **) hits,hitlist);
4376 /* List_free(&hitlist); -- allocated by Listpool_push */
4377
4378 if (ascendingp == true) {
4379 qsort(hits,n,sizeof(T),ascending_siteN_pos_cmp);
4380 } else {
4381 qsort(hits,n,sizeof(T),descending_siteN_pos_cmp);
4382 }
4383
4384 /* Check for duplicates */
4385 eliminate = (bool *) CALLOCA(n,sizeof(bool));
4386 for (i = 0; i < n; i++) {
4387 x = hits[i];
4388 j = i+1;
4389 while (j < n && hits[j]->siteN_pos == x->siteN_pos && hits[j]->genomicstart == x->genomicstart) {
4390 eliminate[j] = true;
4391 j++;
4392 }
4393 }
4394
4395 debug(j = 0);
4396 for (i = n-1; i >= 0; i--) {
4397 x = hits[i];
4398 if (eliminate[i] == false) {
4399 sorted = Listpool_push(sorted,listpool,(void *) x);
4400 } else {
4401 Substring_free(&x);
4402 debug(j++);
4403 }
4404 }
4405 debug(printf("%d eliminated\n",j));
4406
4407 FREEA(hits);
4408 FREEA(eliminate);
4409
4410 return sorted;
4411 }
4412
4413
4414 static void
print_snp_labels(Filestring_T fp,T this,Shortread_T queryseq)4415 print_snp_labels (Filestring_T fp, T this, Shortread_T queryseq) {
4416 int *snps, nsnps, querypos, i;
4417 char *label, *seq1, *seq2;
4418 bool allocp, printp = false;
4419 Interval_T interval;
4420 Chrpos_T position;
4421
4422 if (this->plusp == true) {
4423 snps = IIT_get_with_divno(&nsnps,snps_iit,
4424 snps_divint_crosstable[this->chrnum],
4425 this->alignstart_trim - this->chroffset + 1,this->alignend_trim - this->chroffset,
4426 /*sortp*/false);
4427 } else {
4428 snps = IIT_get_with_divno(&nsnps,snps_iit,
4429 snps_divint_crosstable[this->chrnum],
4430 this->alignend_trim - this->chroffset + 1,this->alignstart_trim - this->chroffset,
4431 /*sortp*/false);
4432 }
4433
4434 FPRINTF(fp,",snps:");
4435
4436 seq1 = Shortread_fullpointer_uc(queryseq);
4437 if (this->genomic_bothdiff == NULL) {
4438 seq2 = Shortread_fullpointer(queryseq);
4439 } else {
4440 seq2 = this->genomic_bothdiff;
4441 }
4442
4443 if (this->plusp) {
4444
4445 #if 0
4446 for (i = 0; i < nsnps; i++) {
4447 interval = IIT_interval(snps_iit,snps[i]);
4448 position = Interval_low(interval);
4449 querypos = position - (this->genomicstart-this->chroffset) - 1;
4450 printf("%d ",querypos);
4451 }
4452 printf("\n");
4453 #endif
4454
4455 for (i = 0; i < nsnps; i++) {
4456 interval = IIT_interval(snps_iit,snps[i]);
4457 position = Interval_low(interval);
4458 querypos = position - (this->genomicstart-this->chroffset) - 1;
4459 /* assert(querypos >= 0 && querypos < this->genomiclength); */
4460
4461 #if 0
4462 alleles = IIT_typestring(snps_iit,Interval_type(interval));
4463 if (c == alleles[0] || c == alleles[1]) ;
4464 #endif
4465
4466 if (isupper(seq2[querypos]) && seq1[querypos] != seq2[querypos]) {
4467 label = IIT_label(snps_iit,snps[i],&allocp);
4468 if (printp) {
4469 FPRINTF(fp,"|");
4470 }
4471 FPRINTF(fp,"%d@",querypos+1);
4472 FPRINTF(fp,"%s",label);
4473 printp = true;
4474 if (allocp) FREE(label);
4475 }
4476 }
4477
4478 } else {
4479
4480 for (i = nsnps-1; i >= 0; i--) {
4481 interval = IIT_interval(snps_iit,snps[i]);
4482 position = Interval_low(interval);
4483 querypos = (this->genomicstart-this->chroffset) - position;
4484 /* assert(querypos >= 0 && querypos < this->genomiclength); */
4485
4486 #if 0
4487 /* printf("\n%d%c\n",querypos,c); */
4488 alleles = IIT_typestring(snps_iit,Interval_type(interval));
4489 if (c == alleles[0] || c == alleles[1]) ;
4490 #endif
4491
4492 if (isupper(seq2[querypos]) && seq1[querypos] != seq2[querypos]) {
4493 label = IIT_label(snps_iit,snps[i],&allocp);
4494 if (printp) {
4495 FPRINTF(fp,"|");
4496 }
4497 FPRINTF(fp,"%d@",querypos+1);
4498 FPRINTF(fp,"%s",label);
4499 printp = true;
4500 if (allocp) FREE(label);
4501 }
4502 }
4503
4504 }
4505
4506 FREE(snps);
4507
4508 return;
4509 }
4510
4511
4512 Chrpos_T
Substring_compute_chrpos(T this,int hardclip_low,bool hide_soft_clips_p)4513 Substring_compute_chrpos (T this, int hardclip_low, bool hide_soft_clips_p) {
4514 Chrpos_T chrpos_low;
4515
4516 if (hide_soft_clips_p == true) {
4517 if (this->plusp == true) {
4518 /* Add 1 to report in 1-based coordinates */
4519 chrpos_low = this->genomicstart - this->chroffset + 1;
4520 chrpos_low += hardclip_low;
4521 /* *chrpos_high = this->genomicend - this->chroffset + 1; */
4522 /* *chrpos_high -= hardclip_high; */
4523
4524 } else {
4525 /* Add 1 to report in 1-based coordinates */
4526 chrpos_low = this->genomicend - this->chroffset + 1;
4527 chrpos_low += hardclip_low;
4528 /* *chrpos_high = this->genomicstart - this->chroffset + 1; */
4529 /* *chrpos_high -= hardclip_high; */
4530 }
4531
4532 } else {
4533 if (this->plusp == true) {
4534 chrpos_low = this->genomicstart - this->chroffset + 1;
4535 if (this->querystart > hardclip_low) {
4536 chrpos_low += this->querystart; /* not querystart_orig */
4537 } else {
4538 chrpos_low += hardclip_low;
4539 }
4540
4541 #if 0
4542 *chrpos_high = this->genomicend - this->chroffset + 1;
4543 if (this->querylength - this->queryend > hardclip_high) {
4544 *chrpos_high -= this->querylength - this->queryend;
4545 } else {
4546 *chrpos_high -= hardclip_high;
4547 }
4548 #endif
4549
4550 } else {
4551 chrpos_low = this->genomicend - this->chroffset + 1;
4552 if (this->querylength - this->queryend > hardclip_low) {
4553 chrpos_low += this->querylength - this->queryend; /* not queryend_orig */
4554 } else {
4555 chrpos_low += hardclip_low;
4556 }
4557
4558 #if 0
4559 *chrpos_high = this->genomicstart - this->chroffset + 1;
4560 if (this->querystart > hardclip_high) {
4561 *chrpos_high -= this->querystart;
4562 } else {
4563 *chrpos_high -= hardclip_high;
4564 }
4565 #endif
4566 }
4567 }
4568
4569 return chrpos_low;
4570 }
4571
4572
4573
4574 /* Taken from NCBI Blast 2.2.29, algo/blast/core/blast_stat.c */
4575 /* Karlin-Altschul formula: m n exp(-lambda * S + log k) = k m n exp(-lambda * S) */
4576 /* Also in pair.c */
4577
4578 static double
blast_evalue(int alignlength,int nmismatches)4579 blast_evalue (int alignlength, int nmismatches) {
4580 double k = 0.1;
4581 double lambda = 1.58; /* For a +1, -1 scoring scheme */
4582 double score;
4583
4584 score = (double) ((alignlength - nmismatches) /* scored as +1 */ - nmismatches /* scored as -1 */);
4585
4586 return k * (double) alignlength * genomesize * exp(-lambda * score);
4587 }
4588
4589 static double
blast_bitscore(int alignlength,int nmismatches)4590 blast_bitscore (int alignlength, int nmismatches) {
4591 double k = 0.1;
4592 double lambda = 1.58; /* For a +1, -1 scoring scheme */
4593 double score;
4594
4595 score = (double) ((alignlength - nmismatches) /* scored as +1 */ - nmismatches /* scored as -1 */);
4596 return (score * lambda - log(k)) / log(2.0);
4597 }
4598
4599
4600 double
Substring_evalue(T substring)4601 Substring_evalue (T substring) {
4602 int alignlength_trim;
4603
4604 #if 0
4605 /* Doesn't work for ambiguous substrings */
4606 if (substring->plusp == true) {
4607 alignlength_trim = (int) (substring->alignend_trim - substring->alignstart_trim);
4608 } else {
4609 alignlength_trim = (int) (substring->alignstart_trim - substring->alignend_trim);
4610 }
4611 assert(alignlength_trim == substring->queryend - substring->querystart);
4612 #else
4613 alignlength_trim = substring->queryend - substring->querystart;
4614 #endif
4615
4616 return blast_evalue(alignlength_trim,substring->nmismatches_bothdiff);
4617 }
4618
4619
4620 void
Substring_print_m8(Filestring_T fp,T substring,Shortread_T headerseq,char * acc_suffix,char * chr,bool invertp)4621 Substring_print_m8 (Filestring_T fp, T substring, Shortread_T headerseq, char *acc_suffix,
4622 char *chr, bool invertp) {
4623 double identity;
4624 int alignlength_trim;
4625
4626 FPRINTF(fp,"%s%s",Shortread_accession(headerseq),acc_suffix); /* field 0: accession */
4627
4628 FPRINTF(fp,"\t%s",chr); /* field 1: chr */
4629
4630 /* field 2: identity */
4631 if (substring->plusp == true) {
4632 alignlength_trim = (int) (substring->alignend_trim - substring->alignstart_trim);
4633 } else {
4634 alignlength_trim = (int) (substring->alignstart_trim - substring->alignend_trim);
4635 }
4636
4637 identity = (double) (alignlength_trim - substring->nmismatches_bothdiff)/(double) alignlength_trim;
4638 FPRINTF(fp,"\t%.1f",100.0*identity);
4639
4640
4641 FPRINTF(fp,"\t%d",alignlength_trim); /* field 3: query length */
4642
4643 FPRINTF(fp,"\t%d",substring->nmismatches_bothdiff); /* field 4: nmismatches */
4644
4645 FPRINTF(fp,"\t0"); /* field 5: gap openings */
4646
4647 FPRINTF(fp,"\t%d",substring->querystart + 1); /* field 6: query start */
4648
4649 FPRINTF(fp,"\t%d",substring->queryend); /* field 7: query end */
4650
4651 /* fields 8 and 9: chr start and end */
4652 if (substring->plusp == true) {
4653 if (invertp == false) {
4654 FPRINTF(fp,"\t%u\t%u",substring->alignstart_trim - substring->chroffset + 1,
4655 substring->alignend_trim - substring->chroffset);
4656 } else {
4657 FPRINTF(fp,"\t%u\t%u",substring->alignend_trim - substring->chroffset,
4658 substring->alignstart_trim - substring->chroffset + 1);
4659 }
4660 } else {
4661 if (invertp == false) {
4662 FPRINTF(fp,"\t%u\t%u",substring->alignstart_trim - substring->chroffset,
4663 substring->alignend_trim - substring->chroffset + 1);
4664 } else {
4665 FPRINTF(fp,"\t%u\t%u",substring->alignend_trim - substring->chroffset + 1,
4666 substring->alignstart_trim - substring->chroffset);
4667 }
4668 }
4669
4670 /* field 10: E value */
4671 FPRINTF(fp,"\t%.2g",blast_evalue(alignlength_trim,substring->nmismatches_bothdiff));
4672
4673 /* field 11: bit score */
4674 FPRINTF(fp,"\t%.1f",blast_bitscore(alignlength_trim,substring->nmismatches_bothdiff));
4675
4676 FPRINTF(fp,"\n");
4677
4678 return;
4679 }
4680
4681
4682
4683 static void
print_forward(Filestring_T fp,char * string,int n)4684 print_forward (Filestring_T fp, char *string, int n) {
4685
4686 FPRINTF(fp,"%.*s",n,string);
4687 return;
4688 }
4689
4690 static void
print_lc(Filestring_T fp,char * string,int n)4691 print_lc (Filestring_T fp, char *string, int n) {
4692 int i;
4693
4694 for (i = 0; i < n; i++) {
4695 FPRINTF(fp,"%c",(char) tolower(string[i]));
4696 }
4697 return;
4698 }
4699
4700 static void
print_revcomp(Filestring_T fp,char * nt,int len)4701 print_revcomp (Filestring_T fp, char *nt, int len) {
4702
4703 FPRINTF(fp,"%.*R",len,nt);
4704 return;
4705 }
4706
4707 static void
print_revcomp_lc(Filestring_T fp,char * nt,int len)4708 print_revcomp_lc (Filestring_T fp, char *nt, int len) {
4709 int i;
4710
4711 for (i = len-1; i >= 0; --i) {
4712 FPRINTF(fp,"%c",(char) tolower(complCode[(int) nt[i]]));
4713 }
4714 return;
4715 }
4716
4717
4718 static void
print_genomic(Filestring_T fp,T substring,char * deletion,int deletionlength,bool invertp,Shortread_T queryseq)4719 print_genomic (Filestring_T fp, T substring, char *deletion, int deletionlength, bool invertp,
4720 Shortread_T queryseq) {
4721 int i;
4722
4723 /* Now handled by calculation of queryleft/queryright by Stage3end_display_prep */
4724 deletion = NULL;
4725 deletionlength = 0;
4726
4727 if (invertp == false) {
4728 if (substring->genomic_bothdiff == NULL) {
4729 /* Exact match */
4730 Shortread_print_subseq_uc(fp,queryseq,substring->querystart,substring->queryend);
4731
4732 } else if (show_refdiff_p == true) {
4733 print_forward(fp,substring->genomic_refdiff,substring->queryend);
4734 if (deletion != NULL) {
4735 print_lc(fp,deletion,deletionlength);
4736 }
4737 print_forward(fp,&(substring->genomic_refdiff[substring->queryend]),substring->querylength - deletionlength - substring->queryend);
4738
4739 } else {
4740 print_forward(fp,substring->genomic_bothdiff,substring->queryend);
4741 if (deletion != NULL) {
4742 print_lc(fp,deletion,deletionlength);
4743 }
4744 print_forward(fp,&(substring->genomic_bothdiff[substring->queryend]),substring->querylength - deletionlength - substring->queryend);
4745 }
4746
4747 for (i = 0; i < Shortread_choplength(queryseq); i++) {
4748 FPRINTF(fp,"*");
4749 }
4750 FPRINTF(fp,"\t");
4751 FPRINTF(fp,"%d..%d",1 + substring->querystart,substring->queryend);
4752
4753 } else {
4754 if (substring->genomic_bothdiff == NULL) {
4755 /* Exact match */
4756 Shortread_print_subseq_revcomp_uc(fp,queryseq,substring->querystart,substring->queryend);
4757
4758 } else if (show_refdiff_p == true) {
4759 print_revcomp(fp,&(substring->genomic_refdiff[substring->querystart]),substring->querylength - substring->querystart);
4760 if (deletion != NULL) {
4761 print_revcomp_lc(fp,deletion,deletionlength);
4762 }
4763 print_revcomp(fp,&(substring->genomic_refdiff[deletionlength]),substring->querystart - deletionlength);
4764
4765 } else {
4766 print_revcomp(fp,&(substring->genomic_bothdiff[substring->querystart]),substring->querylength - substring->querystart);
4767 if (deletion != NULL) {
4768 print_revcomp_lc(fp,deletion,deletionlength);
4769 }
4770 print_revcomp(fp,&(substring->genomic_bothdiff[deletionlength]),substring->querystart - deletionlength);
4771 }
4772 for (i = 0; i < Shortread_choplength(queryseq); i++) {
4773 FPRINTF(fp,"*");
4774 }
4775 FPRINTF(fp,"\t");
4776 FPRINTF(fp,"%d..%d",1 + substring->querylength - substring->queryend,
4777 substring->querylength - substring->querystart);
4778 }
4779 return;
4780 }
4781
4782
4783 static void
print_coordinates(Filestring_T fp,T substring,char * chr,bool invertp)4784 print_coordinates (Filestring_T fp, T substring, char *chr, bool invertp) {
4785
4786 if (substring->plusp == true) {
4787 if (invertp == false) {
4788 FPRINTF(fp,"+%s:%u..%u",chr,substring->alignstart_trim - substring->chroffset + 1,
4789 substring->alignend_trim - substring->chroffset);
4790 } else {
4791 FPRINTF(fp,"-%s:%u..%u",chr,substring->alignend_trim - substring->chroffset,
4792 substring->alignstart_trim - substring->chroffset + 1);
4793 }
4794 } else {
4795 if (invertp == false) {
4796 FPRINTF(fp,"-%s:%u..%u",chr,substring->alignstart_trim - substring->chroffset,
4797 substring->alignend_trim - substring->chroffset + 1);
4798 } else {
4799 FPRINTF(fp,"+%s:%u..%u",chr,substring->alignend_trim - substring->chroffset + 1,
4800 substring->alignstart_trim - substring->chroffset);
4801 }
4802 }
4803
4804 return;
4805 }
4806
4807
4808
4809 void
Substring_print_alignment(Filestring_T fp,Junction_T pre_junction,T substring,Junction_T post_junction,Shortread_T queryseq,Genome_T genome,char * chr,bool invertp)4810 Substring_print_alignment (Filestring_T fp, Junction_T pre_junction, T substring, Junction_T post_junction,
4811 Shortread_T queryseq, Genome_T genome, char *chr, bool invertp) {
4812 char *deletion_string;
4813 int deletion_length;
4814 Junctiontype_T type1, type2;
4815 Chrpos_T splice_distance_1, splice_distance_2;
4816
4817 if (post_junction == NULL) {
4818 deletion_string = (char *) NULL;
4819 deletion_length = 0;
4820 } else if (Junction_type(post_junction) != DEL_JUNCTION) {
4821 deletion_string = (char *) NULL;
4822 deletion_length = 0;
4823 } else {
4824 deletion_string = Junction_deletion_string(post_junction,genome,substring->plusp);
4825 deletion_length = Junction_nindels(post_junction);
4826 }
4827
4828 print_genomic(fp,substring,deletion_string,deletion_length,invertp,queryseq);
4829 FREE(deletion_string);
4830 FPRINTF(fp,"\t");
4831 print_coordinates(fp,substring,chr,invertp);
4832
4833 FPRINTF(fp,"\t");
4834 if (pre_junction == NULL) {
4835 type1 = NO_JUNCTION;
4836 /* Handle result of substring_trim_novel_spliceends */
4837 if (invertp == false) {
4838 if (substring->start_endtype == DON) {
4839 FPRINTF(fp,"donor:%.2f",substring->siteD_prob);
4840 } else if (substring->start_endtype == ACC) {
4841 FPRINTF(fp,"acceptor:%.2f",substring->siteA_prob);
4842 } else {
4843 FPRINTF(fp,"start:%d",substring->querystart);
4844 }
4845 } else {
4846 if (substring->end_endtype == DON) {
4847 FPRINTF(fp,"donor:%.2f",substring->siteD_prob);
4848 } else if (substring->end_endtype == ACC) {
4849 FPRINTF(fp,"acceptor:%.2f",substring->siteA_prob);
4850 } else {
4851 FPRINTF(fp,"start:%d",substring->querylength - substring->queryend);
4852 }
4853 }
4854 } else if ((type1 = Junction_type(pre_junction)) == INS_JUNCTION) {
4855 FPRINTF(fp,"ins:%d",Junction_nindels(pre_junction));
4856 } else if (type1 == DEL_JUNCTION) {
4857 FPRINTF(fp,"del:%d",Junction_nindels(pre_junction));
4858 } else if (type1 == SPLICE_JUNCTION) {
4859 if (invertp == false) {
4860 if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4861 FPRINTF(fp,"donor:%.2f",Junction_donor_prob(pre_junction));
4862 } else {
4863 FPRINTF(fp,"acceptor:%.2f",Junction_acceptor_prob(pre_junction));
4864 }
4865 } else {
4866 if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4867 FPRINTF(fp,"acceptor:%.2f",Junction_acceptor_prob(pre_junction));
4868 } else {
4869 FPRINTF(fp,"donor:%.2f",Junction_donor_prob(pre_junction));
4870 }
4871 }
4872 } else if (type1 == CHIMERA_JUNCTION) {
4873 FPRINTF(fp,"distant:%u",Junction_splice_distance(pre_junction));
4874 } else {
4875 abort();
4876 }
4877
4878 FPRINTF(fp,"..");
4879
4880 if (post_junction == NULL) {
4881 type2 = NO_JUNCTION;
4882 /* Handle result of substring_trim_novel_spliceends */
4883 if (invertp == false) {
4884 if (substring->end_endtype == DON) {
4885 FPRINTF(fp,"donor:%.2f",substring->siteD_prob);
4886 } else if (substring->end_endtype == ACC) {
4887 FPRINTF(fp,"acceptor:%.2f",substring->siteA_prob);
4888 } else {
4889 FPRINTF(fp,"end:%d",substring->querylength - substring->queryend);
4890 }
4891 } else {
4892 if (substring->start_endtype == DON) {
4893 FPRINTF(fp,"donor:%.2f",substring->siteD_prob);
4894 } else if (substring->start_endtype == ACC) {
4895 FPRINTF(fp,"acceptor:%.2f",substring->siteA_prob);
4896 } else {
4897 FPRINTF(fp,"end:%d",substring->querystart);
4898 }
4899 }
4900 } else if ((type2 = Junction_type(post_junction)) == INS_JUNCTION) {
4901 FPRINTF(fp,"ins:%d",Junction_nindels(post_junction));
4902 } else if (type2 == DEL_JUNCTION) {
4903 FPRINTF(fp,"del:%d",Junction_nindels(post_junction));
4904 } else if (type2 == SPLICE_JUNCTION) {
4905 if (invertp == false) {
4906 if (Junction_sensedir(post_junction) == SENSE_ANTI) {
4907 FPRINTF(fp,"acceptor:%.2f",Junction_acceptor_prob(post_junction));
4908 } else {
4909 FPRINTF(fp,"donor:%.2f",Junction_donor_prob(post_junction));
4910 }
4911 } else {
4912 if (Junction_sensedir(post_junction) == SENSE_ANTI) {
4913 FPRINTF(fp,"donor:%.2f",Junction_donor_prob(post_junction));
4914 } else {
4915 FPRINTF(fp,"acceptor:%.2f",Junction_acceptor_prob(post_junction));
4916 }
4917 }
4918 } else if (type2 == CHIMERA_JUNCTION) {
4919 FPRINTF(fp,"distant:%u",Junction_splice_distance(post_junction));
4920 } else {
4921 abort();
4922 }
4923
4924
4925 FPRINTF(fp,",matches:%d,sub:%d",Substring_nmatches_to_trims(substring),substring->nmismatches_bothdiff);
4926 if (print_nsnpdiffs_p) {
4927 FPRINTF(fp,"+%d=%d",substring->nmismatches_refdiff - substring->nmismatches_bothdiff,substring->nmismatches_refdiff);
4928 if (print_snplabels_p && substring->nmismatches_refdiff > substring->nmismatches_bothdiff) {
4929 print_snp_labels(fp,substring,queryseq);
4930 }
4931 }
4932
4933 if (type1 == SPLICE_JUNCTION && type2 == SPLICE_JUNCTION) {
4934 if (invertp == false) {
4935 if (Junction_sensedir(pre_junction) == SENSE_FORWARD) {
4936 FPRINTF(fp,",dir:sense");
4937 } else if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4938 FPRINTF(fp,",dir:antisense");
4939 } else {
4940 FPRINTF(fp,",dir:unknown");
4941 }
4942 } else {
4943 if (Junction_sensedir(pre_junction) == SENSE_FORWARD) {
4944 FPRINTF(fp,",dir:antisense");
4945 } else if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4946 FPRINTF(fp,",dir:sense");
4947 } else {
4948 FPRINTF(fp,",dir:unknown");
4949 }
4950 }
4951 splice_distance_1 = Junction_splice_distance(pre_junction);
4952 splice_distance_2 = Junction_splice_distance(post_junction);
4953 if (splice_distance_1 == 0 && splice_distance_2 == 0) {
4954 /* Skip */
4955 } else if (splice_distance_1 == 0) {
4956 FPRINTF(fp,",splice_type:consistent");
4957 FPRINTF(fp,",splice_dist_2:%u",splice_distance_2);
4958 } else if (splice_distance_2 == 0) {
4959 FPRINTF(fp,",splice_type:consistent");
4960 FPRINTF(fp,",splice_dist_1:%u",splice_distance_1);
4961 } else {
4962 FPRINTF(fp,",splice_type:consistent");
4963 FPRINTF(fp,",splice_dist_1:%u",splice_distance_1);
4964 FPRINTF(fp,",splice_dist_2:%u",splice_distance_2);
4965 }
4966
4967 } else if (type1 == SPLICE_JUNCTION) {
4968 if (invertp == false) {
4969 if (Junction_sensedir(pre_junction) == SENSE_FORWARD) {
4970 FPRINTF(fp,",dir:sense");
4971 } else if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4972 FPRINTF(fp,",dir:antisense");
4973 } else {
4974 FPRINTF(fp,",dir:unknown");
4975 }
4976 } else {
4977 if (Junction_sensedir(pre_junction) == SENSE_FORWARD) {
4978 FPRINTF(fp,",dir:antisense");
4979 } else if (Junction_sensedir(pre_junction) == SENSE_ANTI) {
4980 FPRINTF(fp,",dir:sense");
4981 } else {
4982 FPRINTF(fp,",dir:unknown");
4983 }
4984 }
4985 if ((splice_distance_1 = Junction_splice_distance(pre_junction)) > 0) {
4986 FPRINTF(fp,",splice_type:consistent");
4987 FPRINTF(fp,",splice_dist_1:%u",splice_distance_1);
4988 }
4989
4990 } else if (type2 == SPLICE_JUNCTION) {
4991 if (invertp == false) {
4992 if (Junction_sensedir(post_junction) == SENSE_FORWARD) {
4993 FPRINTF(fp,",dir:sense");
4994 } else if (Junction_sensedir(post_junction) == SENSE_ANTI) {
4995 FPRINTF(fp,",dir:antisense");
4996 } else {
4997 FPRINTF(fp,",dir:unknown");
4998 }
4999 } else {
5000 if (Junction_sensedir(post_junction) == SENSE_FORWARD) {
5001 FPRINTF(fp,",dir:antisense");
5002 } else if (Junction_sensedir(post_junction) == SENSE_ANTI) {
5003 FPRINTF(fp,",dir:sense");
5004 } else {
5005 FPRINTF(fp,",dir:unknown");
5006 }
5007 }
5008 if ((splice_distance_2 = Junction_splice_distance(post_junction)) > 0) {
5009 FPRINTF(fp,",splice_type:consistent");
5010 FPRINTF(fp,",splice_dist_2:%u",splice_distance_2);
5011 }
5012 }
5013
5014 return;
5015 }
5016
5017
5018 /************************************************************************
5019 * Multiclean
5020 ************************************************************************/
5021
5022 static char *
get_total_tally(long int * tally,char * ptr)5023 get_total_tally (long int *tally, char *ptr) {
5024 int n;
5025 char *end;
5026
5027 if ((end = index(ptr,'\n')) == NULL) {
5028 fprintf(stderr,"Premature end of line %s\n",ptr);
5029 return 0;
5030 }
5031 /* fprintf(stderr,"Getting tally for %.*s\n",end-ptr,ptr); */
5032
5033 while (ptr < end) {
5034 while (ptr < end && !isdigit((int) *ptr)) {
5035 ptr++;
5036 }
5037 if (ptr < end) {
5038 sscanf(ptr,"%d",&n);
5039 #if 0
5040 debug(if (n > 0) printf(" %d",n));
5041 #endif
5042 (*tally) += n;
5043 while (ptr < end && !isspace(*ptr)) {
5044 ptr++;
5045 }
5046 while (ptr < end && isspace(*ptr)) {
5047 ptr++;
5048 }
5049 }
5050 }
5051
5052 return ptr;
5053 }
5054
5055 long int
Substring_tally(T this,IIT_T tally_iit,int * tally_divint_crosstable)5056 Substring_tally (T this, IIT_T tally_iit, int *tally_divint_crosstable) {
5057 long int total = 0U;
5058 Interval_T interval;
5059 char *annotation, *restofheader, *ptr;
5060 #if 0
5061 bool alloc_chr_p;
5062 #endif
5063 bool allocp;
5064 Chrpos_T chrpos, intervalend;
5065
5066 Chrpos_T coordstart, coordend, pos5, pos3;
5067 int *matches;
5068 int nmatches, i;
5069
5070 pos5 = this->alignstart_trim - this->chroffset;
5071 pos3 = this->alignend_trim - this->chroffset;
5072
5073 if (pos5 < pos3) {
5074 coordstart = pos5;
5075 coordend = pos3;
5076 } else {
5077 coordstart = pos3;
5078 coordend = pos5;
5079 }
5080 coordstart += 1; /* Because tally IIT is 1-based */
5081 debug(printf("coordstart = %u, coordend = %u\n",coordstart,coordend));
5082
5083 #if 0
5084 chr = Univ_IIT_label(chromosome_iit,this->chrnum,&alloc_chr_p);
5085 #endif
5086 matches = IIT_get_with_divno(&nmatches,tally_iit,tally_divint_crosstable[this->chrnum],
5087 coordstart,coordend,/*sortp*/false);
5088
5089 for (i = 0; i < nmatches; i++) {
5090 annotation = IIT_annotation(&restofheader,tally_iit,matches[i],&allocp);
5091
5092 interval = IIT_interval(tally_iit,matches[i]);
5093 chrpos = Interval_low(interval);
5094 intervalend = Interval_high(interval);
5095
5096 ptr = annotation;
5097
5098 while (chrpos < coordstart) {
5099 if ((ptr = index(ptr,'\n')) == NULL) {
5100 fprintf(stderr,"Premature end of tally from %u to %u\n",
5101 Interval_low(interval),Interval_high(interval));
5102 return total;
5103 } else {
5104 ptr++;
5105 }
5106 chrpos++;
5107 }
5108
5109 while (chrpos <= intervalend && chrpos <= coordend) {
5110 ptr = get_total_tally(&total,ptr);
5111 ptr++;
5112 chrpos++;
5113 }
5114
5115 if (allocp == true) {
5116 FREE(restofheader);
5117 }
5118 }
5119
5120 FREE(matches);
5121
5122 #if 0
5123 if (alloc_chr_p) {
5124 FREE(chr);
5125 }
5126 #endif
5127
5128 debug(printf("Subtotal = %ld\n",total));
5129 return total;
5130 }
5131
5132
5133 bool
Substring_runlength_p(T this,IIT_T runlength_iit,int * runlength_divint_crosstable)5134 Substring_runlength_p (T this, IIT_T runlength_iit, int *runlength_divint_crosstable) {
5135 Chrpos_T coordstart, coordend, pos5, pos3;
5136
5137 pos5 = this->alignstart_trim - this->chroffset;
5138 pos3 = this->alignend_trim - this->chroffset;
5139
5140 if (pos5 < pos3) {
5141 coordstart = pos5;
5142 coordend = pos3;
5143 } else {
5144 coordstart = pos3;
5145 coordend = pos5;
5146 }
5147 coordstart += 1; /* Because runlength IIT is 1-based */
5148 debug(printf("coordstart = %u, coordend = %u\n",coordstart,coordend));
5149
5150 /* chr = Univ_IIT_label(chromosome_iit,this->chrnum,&alloc_chr_p); */
5151 return IIT_exists_with_divno(runlength_iit,runlength_divint_crosstable[this->chrnum],
5152 coordstart,coordend);
5153 }
5154
5155
5156 int
Substring_count_mismatches_region(int * ref_nmismatches,T this,int trim_querystart,int trim_queryend)5157 Substring_count_mismatches_region (int *ref_nmismatches, T this, int trim_querystart, int trim_queryend) {
5158 int left_bound, right_bound;
5159
5160 if (this == NULL) {
5161 return 0;
5162 } else {
5163 left_bound = trim_querystart;
5164 right_bound = this->querylength - trim_queryend;
5165
5166 if (this->queryend_pretrim < left_bound) {
5167 return 0;
5168 } else if (this->querystart_pretrim > right_bound) {
5169 return 0;
5170 } else {
5171 if (this->querystart_pretrim > left_bound) {
5172 left_bound = this->querystart_pretrim;
5173 }
5174 if (this->queryend_pretrim < right_bound) {
5175 right_bound = this->queryend_pretrim;
5176 }
5177
5178 #if 0
5179 /* Not sure why we have this assertion. Values set by eventrim procedure in Stage3end_optimal_score_prefinal. If assertion fails, then just return 0 */
5180 assert(left_bound <= right_bound);
5181 #endif
5182
5183 if (left_bound > right_bound) {
5184 return 0;
5185 } else if (this->plusp) {
5186 return Genome_count_mismatches_substring(&(*ref_nmismatches),genomebits,genomebits_alt,this->query_compress,this->left,
5187 /*pos5*/left_bound,/*pos3*/right_bound,/*plusp*/true,this->genestrand);
5188 } else {
5189 return Genome_count_mismatches_substring(&(*ref_nmismatches),genomebits,genomebits_alt,this->query_compress,this->left,
5190 /*pos5*/this->querylength - right_bound,
5191 /*pos3*/this->querylength - left_bound,
5192 /*plusp*/false,this->genestrand);
5193 }
5194 }
5195 }
5196 }
5197
5198
5199 /************************************************************************
5200 * Conversion to Pair_T format
5201 ************************************************************************/
5202
5203 List_T
Substring_convert_to_pairs_out(List_T pairs,T substring,int querylength,Shortread_T queryseq,int hardclip_low,int hardclip_high,int queryseq_offset)5204 Substring_convert_to_pairs_out (List_T pairs, T substring, int querylength, Shortread_T queryseq,
5205 int hardclip_low, int hardclip_high, int queryseq_offset) {
5206 int querystart, queryend, querypos, i;
5207 Chrpos_T chrpos;
5208 char *seq1;
5209 char genome;
5210
5211 if (substring == NULL) {
5212 return pairs;
5213 } else if (substring->alts_ncoords > 0) {
5214 return pairs;
5215 }
5216
5217 debug6(printf("*** Entered Substring_convert_to_pairs with querylength %d, hardclip_low %d, hardclip_high %d, queryseq_offset %d\n",
5218 querylength,hardclip_low,hardclip_high,queryseq_offset));
5219
5220 seq1 = Shortread_fullpointer_uc(queryseq);
5221 if (substring->plusp == true) {
5222 if (hardclip_low > substring->querystart) {
5223 querystart = hardclip_low;
5224 } else {
5225 querystart = substring->querystart;
5226 }
5227
5228 if (querylength - hardclip_high < substring->queryend) {
5229 queryend = querylength - hardclip_high;
5230 } else {
5231 queryend = substring->queryend;
5232 }
5233 /* Pairs are all zero-based, so do not add 1 */
5234 #if 0
5235 chrpos = substring->genomicstart_adj + querystart - substring->chroffset /*+ 1*/;
5236 #else
5237 chrpos = substring->genomicstart + querystart - substring->chroffset /*+ 1*/;
5238 #endif
5239
5240 debug6(printf("plus conversion\n"));
5241 debug6(printf("querystart %d, queryend %d, plusp %d\n",querystart,queryend,substring->plusp));
5242 debug6(printf("alignstart %u, alignend %u\n",substring->alignstart_trim - substring->chroffset,
5243 substring->alignend_trim - substring->chroffset));
5244 debug6(printf("chrpos %u\n",chrpos));
5245
5246 if (substring->genomic_bothdiff == NULL) {
5247 /* Exact match */
5248 for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5249 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5250 seq1[i],/*comp*/MATCH_COMP,seq1[i]));
5251 }
5252 } else if (show_refdiff_p == true) {
5253 for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5254 if (isupper(genome = substring->genomic_refdiff[i])) {
5255 assert(seq1[i] == genome || seq1[i] == 'N');
5256 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5257 seq1[i],/*comp*/MATCH_COMP,genome));
5258 } else {
5259 assert(seq1[i] != toupper(genome));
5260 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5261 seq1[i],/*comp*/MISMATCH_COMP,toupper(genome)));
5262 }
5263 }
5264 } else {
5265 /* printf("querystart %d, queryend %d\n",querystart,queryend); */
5266 /* printf("seq1 %s\n",seq1); */
5267 /* printf("genome %s\n",substring->genomic_bothdiff); */
5268 for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5269 if (isupper(genome = substring->genomic_bothdiff[i])) {
5270 assert(seq1[i] == genome || seq1[i] == 'N');
5271 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5272 seq1[i],/*comp*/MATCH_COMP,genome));
5273 } else {
5274 assert(seq1[i] != toupper(genome));
5275 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos++,
5276 seq1[i],/*comp*/MISMATCH_COMP,toupper(genome)));
5277 }
5278 }
5279 }
5280
5281 } else {
5282 if (hardclip_high > substring->querystart) {
5283 querystart = hardclip_high;
5284 } else {
5285 querystart = substring->querystart;
5286 }
5287
5288 if (querylength - hardclip_low < substring->queryend) {
5289 queryend = querylength - hardclip_low;
5290 } else {
5291 queryend = substring->queryend;
5292 }
5293 /* For minus, to get 0-based coordinates, subtract 1 */
5294 #if 0
5295 chrpos = substring->genomicstart_adj - querystart - substring->chroffset - 1;
5296 #else
5297 chrpos = substring->genomicstart - querystart - substring->chroffset - 1;
5298 #endif
5299
5300 debug6(printf("minus conversion\n"));
5301 debug6(printf("querystart %d, queryend %d, plusp %d\n",querystart,queryend,substring->plusp));
5302 debug6(printf("alignstart %u, alignend %u\n",substring->alignstart_trim - substring->chroffset,
5303 substring->alignend_trim - substring->chroffset));
5304 debug6(printf("chrpos %u\n",chrpos));
5305
5306 if (substring->genomic_bothdiff == NULL) {
5307 /* Exact match */
5308 for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5309 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5310 seq1[i],/*comp*/MATCH_COMP,seq1[i]));
5311 }
5312 } else if (show_refdiff_p == true) {
5313 for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5314 if (isupper(genome = substring->genomic_refdiff[i])) {
5315 assert(seq1[i] == genome || seq1[i] == 'N');
5316 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5317 seq1[i],/*comp*/MATCH_COMP,genome));
5318 } else {
5319 assert(seq1[i] != toupper(genome));
5320 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5321 seq1[i],/*comp*/MISMATCH_COMP,toupper(genome)));
5322 }
5323 }
5324 } else {
5325 for (i = querystart, querypos = queryseq_offset + querystart; i < queryend; i++, querypos++) {
5326 if (isupper(genome = substring->genomic_bothdiff[i])) {
5327 /* assert(seq1[i] == genome || seq1[i] == 'N'); */
5328 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5329 seq1[i],/*comp*/MATCH_COMP,genome));
5330 } else {
5331 /* assert(seq1[i] != toupper(genome)); */
5332 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrpos--,
5333 seq1[i],/*comp*/MISMATCH_COMP,toupper(genome)));
5334 }
5335 }
5336 }
5337 }
5338
5339 debug6(Simplepair_dump_list(pairs,true));
5340 return pairs;
5341 }
5342
5343
5344 List_T
Substring_add_insertion_out(List_T pairs,T substringA,T substringB,int querylength,int insertionlength,Shortread_T queryseq,int hardclip_low,int hardclip_high,int queryseq_offset)5345 Substring_add_insertion_out (List_T pairs, T substringA, T substringB, int querylength,
5346 int insertionlength, Shortread_T queryseq,
5347 int hardclip_low, int hardclip_high, int queryseq_offset) {
5348 int querystartA, queryendA, querystartB, queryendB, querypos, i;
5349 Chrpos_T chrendA;
5350 char *seq1;
5351
5352
5353 if (substringA->plusp == true) {
5354 if (hardclip_low > substringA->querystart) {
5355 querystartA = hardclip_low;
5356 } else {
5357 querystartA = substringA->querystart;
5358 }
5359
5360 if (querylength - hardclip_high < substringA->queryend) {
5361 queryendA = querylength - hardclip_high;
5362 } else {
5363 queryendA = substringA->queryend;
5364 }
5365
5366 if (hardclip_low > substringB->querystart) {
5367 querystartB = hardclip_low;
5368 } else {
5369 querystartB = substringB->querystart;
5370 }
5371
5372 if (querylength - hardclip_high < substringB->queryend) {
5373 queryendB = querylength - hardclip_high;
5374 } else {
5375 queryendB = substringB->queryend;
5376 }
5377
5378 /* Pairs are all zero-based, so do not add 1 */
5379 #if 0
5380 chrendA = substringA->genomicstart_adj + queryendA - substringA->chroffset /*+ 1*/;
5381 #else
5382 chrendA = substringA->genomicstart + queryendA - substringA->chroffset /*+ 1*/;
5383 #endif
5384
5385 } else {
5386 if (hardclip_high > substringA->querystart) {
5387 querystartA = hardclip_high;
5388 } else {
5389 querystartA = substringA->querystart;
5390 }
5391
5392 if (querylength - hardclip_low < substringA->queryend) {
5393 queryendA = querylength - hardclip_low;
5394 } else {
5395 queryendA = substringA->queryend;
5396 }
5397
5398 if (hardclip_high > substringB->querystart) {
5399 querystartB = hardclip_high;
5400 } else {
5401 querystartB = substringB->querystart;
5402 }
5403
5404 if (querylength - hardclip_low < substringB->queryend) {
5405 queryendB = querylength - hardclip_low;
5406 } else {
5407 queryendB = substringB->queryend;
5408 }
5409
5410 /* Pairs are all zero-based, so subtract 1 */
5411 #if 0
5412 chrendA = substringA->genomicstart_adj - queryendA - substringA->chroffset - 1;
5413 #else
5414 chrendA = substringA->genomicstart - queryendA - substringA->chroffset - 1;
5415 #endif
5416 }
5417
5418 if (querystartA <= queryendA && querystartB <= queryendB) {
5419 seq1 = Shortread_fullpointer_uc(queryseq);
5420 querypos = queryendA + queryseq_offset;
5421 i = queryendA;
5422 while (--insertionlength >= 0) {
5423 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos++,/*genomepos*/chrendA,
5424 seq1[i++],/*comp*/INDEL_COMP,' '));
5425 }
5426 }
5427
5428 return pairs;
5429 }
5430
5431
5432 List_T
Substring_add_deletion_out(List_T pairs,T substringA,T substringB,int querylength,char * deletion,int deletionlength,int hardclip_low,int hardclip_high,int queryseq_offset)5433 Substring_add_deletion_out (List_T pairs, T substringA, T substringB, int querylength,
5434 char *deletion, int deletionlength,
5435 int hardclip_low, int hardclip_high, int queryseq_offset) {
5436 int querystartA, queryendA, querystartB, queryendB, querypos, k;
5437 Chrpos_T chrendA;
5438
5439 if (substringA->plusp == true) {
5440 if (hardclip_low > substringA->querystart) {
5441 querystartA = hardclip_low;
5442 } else {
5443 querystartA = substringA->querystart;
5444 }
5445
5446 if (querylength - hardclip_high < substringA->queryend) {
5447 queryendA = querylength - hardclip_high;
5448 } else {
5449 queryendA = substringA->queryend;
5450 }
5451
5452 if (hardclip_low > substringB->querystart) {
5453 querystartB = hardclip_low;
5454 } else {
5455 querystartB = substringB->querystart;
5456 }
5457
5458 if (querylength - hardclip_high < substringB->queryend) {
5459 queryendB = querylength - hardclip_high;
5460 } else {
5461 queryendB = substringB->queryend;
5462 }
5463
5464 /* Pairs are all zero-based, so do not add 1 */
5465 #if 0
5466 chrendA = substringA->genomicstart_adj + queryendA - substringA->chroffset /*+ 1*/;
5467 #else
5468 chrendA = substringA->genomicstart + queryendA - substringA->chroffset /*+ 1*/;
5469 #endif
5470
5471 if (querystartA < queryendA && querystartB < queryendB) {
5472 querypos = queryendA + queryseq_offset;
5473 for (k = 0; k < deletionlength; k++) {
5474 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrendA++,
5475 ' ',/*comp*/INDEL_COMP,deletion[k]));
5476 }
5477 }
5478
5479 } else {
5480 if (hardclip_high > substringA->querystart) {
5481 querystartA = hardclip_high;
5482 } else {
5483 querystartA = substringA->querystart;
5484 }
5485
5486 if (querylength - hardclip_low < substringA->queryend) {
5487 queryendA = querylength - hardclip_low;
5488 } else {
5489 queryendA = substringA->queryend;
5490 }
5491
5492 if (hardclip_high > substringB->querystart) {
5493 querystartB = hardclip_high;
5494 } else {
5495 querystartB = substringB->querystart;
5496 }
5497
5498 if (querylength - hardclip_low < substringB->queryend) {
5499 queryendB = querylength - hardclip_low;
5500 } else {
5501 queryendB = substringB->queryend;
5502 }
5503
5504 /* Pairs are all zero-based, so subtract 1 */
5505 #if 0
5506 chrendA = substringA->genomicstart_adj - queryendA - substringA->chroffset - 1;
5507 #else
5508 chrendA = substringA->genomicstart - queryendA - substringA->chroffset - 1;
5509 #endif
5510
5511 if (querystartA <= queryendA && querystartB <= queryendB) {
5512 querypos = queryendA + queryseq_offset;
5513 for (k = 0; k < deletionlength; k++) {
5514 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrendA--,
5515 ' ',/*comp*/INDEL_COMP,deletion[k]));
5516 }
5517 }
5518 }
5519
5520 return pairs;
5521 }
5522
5523
5524
5525 List_T
Substring_add_intron_out(List_T pairs,T substringA,T substringB,int querylength,int hardclip_low,int hardclip_high,int queryseq_offset)5526 Substring_add_intron_out (List_T pairs, T substringA, T substringB, int querylength,
5527 int hardclip_low, int hardclip_high, int queryseq_offset) {
5528 int querystartA, queryendA, querystartB, queryendB, querypos;
5529 Chrpos_T chrendA;
5530
5531 if (substringA->plusp == true) {
5532 if (hardclip_low > substringA->querystart) {
5533 querystartA = hardclip_low;
5534 } else {
5535 querystartA = substringA->querystart;
5536 }
5537
5538 if (querylength - hardclip_high < substringA->queryend) {
5539 queryendA = querylength - hardclip_high;
5540 } else {
5541 queryendA = substringA->queryend;
5542 }
5543
5544 if (hardclip_low > substringB->querystart) {
5545 querystartB = hardclip_low;
5546 } else {
5547 querystartB = substringB->querystart;
5548 }
5549
5550 if (querylength - hardclip_high < substringB->queryend) {
5551 queryendB = querylength - hardclip_high;
5552 } else {
5553 queryendB = substringB->queryend;
5554 }
5555
5556 /* Pairs are all zero-based, so do not add 1 */
5557 #if 0
5558 chrendA = substringA->genomicstart_adj + queryendA - substringA->chroffset /*+ 1*/;
5559 #else
5560 chrendA = substringA->genomicstart + queryendA - substringA->chroffset /*+ 1*/;
5561 #endif
5562
5563 } else {
5564 if (hardclip_high > substringA->querystart) {
5565 querystartA = hardclip_high;
5566 } else {
5567 querystartA = substringA->querystart;
5568 }
5569
5570 if (querylength - hardclip_low < substringA->queryend) {
5571 queryendA = querylength - hardclip_low;
5572 } else {
5573 queryendA = substringA->queryend;
5574 }
5575
5576 if (hardclip_high > substringB->querystart) {
5577 querystartB = hardclip_high;
5578 } else {
5579 querystartB = substringB->querystart;
5580 }
5581
5582 if (querylength - hardclip_low < substringB->queryend) {
5583 queryendB = querylength - hardclip_low;
5584 } else {
5585 queryendB = substringB->queryend;
5586 }
5587
5588 /* Pairs are all zero-based, so subtract 1 */
5589 #if 0
5590 chrendA = substringA->genomicstart_adj - queryendA - substringA->chroffset - 1;
5591 #else
5592 chrendA = substringA->genomicstart - queryendA - substringA->chroffset - 1;
5593 #endif
5594 }
5595
5596 if (querystartA <= queryendA && querystartB <= queryendB) {
5597 /* Add gapholder */
5598 /* All we really need for Pair_print_sam is to set gapp to be true */
5599 querypos = queryendA + queryseq_offset;
5600 pairs = List_push_out(pairs,(void *) Simplepair_new_out(querypos,/*genomepos*/chrendA,
5601 ' ',/*comp*/FWD_CANONICAL_INTRON_COMP,' '));
5602 }
5603
5604 return pairs;
5605 }
5606
5607
5608
5609 void
Substring_setup(bool print_nsnpdiffs_p_in,bool print_snplabels_p_in,bool show_refdiff_p_in,IIT_T snps_iit_in,int * snps_divint_crosstable_in,Genome_T genomebits_in,Genome_T genomebits_alt_in,Univ_IIT_T chromosome_iit_in,Univcoord_T genomelength_in,int nchromosomes_in,int circular_typeint_in,bool novelsplicingp_in,bool knownsplicingp_in,Outputtype_T output_type_in,Mode_T mode_in,bool maskedp_in)5610 Substring_setup (bool print_nsnpdiffs_p_in, bool print_snplabels_p_in,
5611 bool show_refdiff_p_in, IIT_T snps_iit_in, int *snps_divint_crosstable_in,
5612 Genome_T genomebits_in, Genome_T genomebits_alt_in,
5613 Univ_IIT_T chromosome_iit_in, Univcoord_T genomelength_in,
5614 int nchromosomes_in, int circular_typeint_in,
5615 bool novelsplicingp_in, bool knownsplicingp_in,
5616 Outputtype_T output_type_in, Mode_T mode_in, bool maskedp_in) {
5617 print_nsnpdiffs_p = print_nsnpdiffs_p_in;
5618 print_snplabels_p = print_snplabels_p_in;
5619 show_refdiff_p = show_refdiff_p_in;
5620
5621 snps_iit = snps_iit_in;
5622 snps_divint_crosstable = snps_divint_crosstable_in;
5623
5624 genomebits = genomebits_in;
5625 genomebits_alt = genomebits_alt_in;
5626
5627 chromosome_iit = chromosome_iit_in;
5628 genomelength = genomelength_in;
5629 genomesize = (double) Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias_p*/false);
5630 nchromosomes = nchromosomes_in;
5631 circular_typeint = circular_typeint_in;
5632
5633 novelsplicingp = novelsplicingp_in;
5634 knownsplicingp = knownsplicingp_in;
5635
5636 output_type = output_type_in;
5637 mode = mode_in;
5638
5639 maskedp = maskedp_in;
5640
5641 return;
5642 }
5643
5644
5645
5646