1 static char rcsid[] = "$Id: pair.c 223009 2020-07-10 15:13:26Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #include "pair.h"
13 #include "pairdef.h"
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h> /* For memcpy */
17 #include <math.h> /* For rint(), abs() */
18 #include <ctype.h> /* For toupper */
19
20 #include "assert.h"
21 #include "except.h"
22 #include "mem.h"
23 #include "comp.h"
24 #include "complement.h"
25 #include "intron.h"
26 #include "intlist.h"
27 #include "separator.h"
28 #include "scores.h"
29 #include "segmentpos.h"
30 #include "maxent.h"
31 #include "maxent_hr.h"
32 #include "sense.h"
33 #include "samflags.h"
34
35
36 #define ONEBASEDP 1 /* 1-based coordinates. Also defined in segmentpos.c */
37
38 #define MIN_INTRONLEN 20 /* For deciding between N and D in cigar string */
39
40
41 /* Check for ANSI mode, which does not include rint */
42 #ifdef __STRICT_ANSI__
43 #define rint(x) floor(0.5+(x))
44 #endif
45
46 #define DEFAULT_MARGIN 14
47
48 /* #define DIAGNOSTICP 1 */
49
50 #ifdef DEBUG
51 #define debug(x) x
52 #else
53 #define debug(x)
54 #endif
55
56 /* Print pointer information in Pair_dump_one */
57 #ifdef DEBUG1
58 #define debug1(x) x
59 #else
60 #define debug1(x)
61 #endif
62
63 /* PSL indels */
64 #ifdef DEBUG2
65 #define debug2(x) x
66 #else
67 #define debug2(x)
68 #endif
69
70 /* Pair_fracidentity_max */
71 #ifdef DEBUG3
72 #define debug3(x) x
73 #else
74 #define debug3(x)
75 #endif
76
77 /* compute_md_string */
78 #ifdef DEBUG4
79 #define debug4(x) x
80 #else
81 #define debug4(x)
82 #endif
83
84 /* Phase information */
85 #ifdef DEBUG5
86 #define debug5(x) x
87 #else
88 #define debug5(x)
89 #endif
90
91 /* Pairarray_convert_to_substrings */
92 #ifdef DEBUG6
93 #define debug6(x) x
94 #else
95 #define debug6(x)
96 #endif
97
98 /* cds_phase in gff3 output */
99 #ifdef DEBUG7
100 #define debug7(x) x
101 #else
102 #define debug7(x)
103 #endif
104
105 /* trimming */
106 #ifdef DEBUG8
107 #define debug8(x) x
108 #else
109 #define debug8(x)
110 #endif
111
112 /* end_bound and start_bound */
113 #ifdef DEBUG9
114 #define debug9(x) x
115 #else
116 #define debug9(x)
117 #endif
118
119 /* binary search */
120 #ifdef DEBUG10
121 #define debug10(x) x
122 #else
123 #define debug10(x)
124 #endif
125
126 /* maxnegscore */
127 #ifdef DEBUG11
128 #define debug11(x) x
129 #else
130 #define debug11(x)
131 #endif
132
133 /* circularpos */
134 #ifdef DEBUG12
135 #define debug12(x) x
136 #else
137 #define debug12(x)
138 #endif
139
140
141 #define TRIM_MATCH_SCORE 1
142 #define TRIM_MISMATCH_SCORE -1
143
144 static bool novelsplicingp;
145 static IIT_T splicesites_iit;
146
147 static int trim_indel_score;
148 static bool gff3_separators_p;
149 static bool sam_insert_0M_p = false;
150 static bool force_xs_direction_p;
151 static bool md_lowercase_variant_p;
152 static bool snps_p;
153
154 static bool gff3_phase_swap_p;
155 static CDStype_T cdstype;
156 static bool cigar_extended_p;
157 static Cigar_action_T cigar_action;
158
159
160 void
Pair_setup(bool novelsplicingp_in,IIT_T splicesites_iit_in,int trim_indel_score_in,bool gff3_separators_p_in,bool sam_insert_0M_p_in,bool force_xs_direction_p_in,bool md_lowercase_variant_p_in,bool snps_p_in,bool gff3_phase_swap_p_in,CDStype_T cdstype_in,bool cigar_extended_p_in,Cigar_action_T cigar_action_in)161 Pair_setup (bool novelsplicingp_in, IIT_T splicesites_iit_in, int trim_indel_score_in,
162 bool gff3_separators_p_in, bool sam_insert_0M_p_in, bool force_xs_direction_p_in,
163 bool md_lowercase_variant_p_in, bool snps_p_in,
164 bool gff3_phase_swap_p_in, CDStype_T cdstype_in,
165 bool cigar_extended_p_in, Cigar_action_T cigar_action_in) {
166
167 novelsplicingp = novelsplicingp_in;
168 splicesites_iit = splicesites_iit_in;
169
170 trim_indel_score = trim_indel_score_in;
171 gff3_separators_p = gff3_separators_p_in;
172 sam_insert_0M_p = sam_insert_0M_p_in;
173 force_xs_direction_p = force_xs_direction_p_in;
174 md_lowercase_variant_p = md_lowercase_variant_p_in;
175 snps_p = snps_p_in;
176 gff3_phase_swap_p = gff3_phase_swap_p_in;
177 cdstype = cdstype_in;
178 cigar_extended_p = cigar_extended_p_in;
179 cigar_action = cigar_action_in;
180
181 return;
182 }
183
184
185
186 #define T Pair_T
187
188 int
Pair_querypos(T this)189 Pair_querypos (T this) {
190 return this->querypos;
191 }
192
193 Chrpos_T
Pair_genomepos(T this)194 Pair_genomepos (T this) {
195 return this->genomepos;
196 }
197
198 char
Pair_cdna(T this)199 Pair_cdna (T this) {
200 return this->cdna;
201 }
202
203 char
Pair_comp(T this)204 Pair_comp (T this) {
205 return this->comp;
206 }
207
208 char
Pair_genome(T this)209 Pair_genome (T this) {
210 return this->genome;
211 }
212
213 char
Pair_genomealt(T this)214 Pair_genomealt (T this) {
215 return this->genomealt;
216 }
217
218 bool
Pair_gapp(T this)219 Pair_gapp (T this) {
220 return this->gapp;
221 }
222
223 bool
Pair_shortexonp(T this)224 Pair_shortexonp (T this) {
225 return this->shortexonp;
226 }
227
228
229 void
Pair_print_ends(List_T pairs)230 Pair_print_ends (List_T pairs) {
231 List_T p;
232 T start, end;
233
234 if (pairs == NULL) {
235 printf("0..0, 0..0\n");
236 } else {
237 start = (T) pairs->first;
238 for (p = pairs; p != NULL; p = p->rest) {
239 end = (T) p->first;
240 }
241 printf("%d..%d %u..%u",start->querypos,end->querypos,start->genomepos,end->genomepos);
242 }
243 return;
244 }
245
246
247 void
Pair_set_genomepos(struct T * pairarray,int npairs,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)248 Pair_set_genomepos (struct T *pairarray, int npairs,
249 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp) {
250 int i;
251 Chrpos_T chraliaslength;
252
253 if (watsonp == true) {
254 /* No need to adjust, since we are using chromosomal coordinates already */
255 } else {
256 chraliaslength = chrhigh - chroffset;
257 for (i = 0; i < npairs; i++) {
258 pairarray[i].genomepos = chraliaslength - pairarray[i].genomepos;
259 }
260 }
261 return;
262 }
263
264
265 void
Pair_subtract_genomepos(struct T * pairs,int npairs,Chrpos_T adjustment)266 Pair_subtract_genomepos (struct T *pairs, int npairs, Chrpos_T adjustment) {
267 int i;
268 struct T *ptr;
269
270 i = 0;
271 ptr = pairs;
272 while (i < npairs) {
273 ptr->genomepos -= adjustment;
274 i++;
275 ptr++;
276 }
277
278 return;
279 }
280
281
282 #if 0
283 /* Don't change list, just pairarray */
284 void
285 Pair_set_genomepos_list (List_T pairs, Univcoord_T chroffset,
286 Univcoord_T chrhigh, bool watsonp) {
287 List_T p;
288 T pair;
289 Chrpos_T chraliaslength;
290
291 if (watsonp == true) {
292 /* No need to adjust, since we are using chromosomal coordinates already */
293 } else {
294 chraliaslength = chrhigh - chroffset;
295 for (p = pairs; p != NULL; p = p->rest) {
296 pair = (T) p->first;
297 pair->genomepos = chraliaslength - pair->genomepos;
298 }
299 }
300
301 return;
302 }
303 #endif
304
305
306 /* For outbuffer usage (e.g., truncation), use Pair_clip_bounded_array instead */
307 /* Note: This code is designed to handle source, which may still have
308 gaps with querypos undefined */
309 List_T
Pair_clip_bounded_list_5(List_T source,int minpos,int maxpos)310 Pair_clip_bounded_list_5 (List_T source, int minpos, int maxpos) {
311 List_T dest, *prev, p;
312 T pair;
313 int starti = -1, endi = -1, i;
314
315 if (source == NULL) {
316 return (List_T) NULL;
317 } else {
318 for (p = source, i = 0; p != NULL; p = p->rest, i++) {
319 pair = (Pair_T) List_head(p);
320 if (pair->querypos == minpos) {
321 starti = i; /* Advances in case of ties */
322 } else if (pair->querypos > minpos && starti < 0) {
323 starti = i; /* Handles case where minpos was skipped */
324 }
325
326 if (pair->querypos == maxpos && endi < 0) {
327 endi = i + 1; /* Does not advance in case of tie */
328 } else if (pair->querypos > maxpos && endi < 0) {
329 endi = i; /* Handles case where maxpos was skipped */
330 }
331 }
332
333 if (starti < 0 && endi < 0) {
334 /* None of the pairs fall within bounds */
335 return (List_T) NULL;
336 } else {
337 if (starti < 0) {
338 starti = 0;
339 }
340 if (endi < 0) {
341 endi = i;
342 }
343 }
344
345 p = source;
346 i = 0;
347 while (i < starti) {
348 p = p->rest;
349 i++;
350 }
351
352 dest = p;
353 prev = &p->rest;
354 while (i < endi) {
355 prev = &p->rest;
356 p = p->rest;
357 i++;
358 }
359
360 *prev = NULL; /* Clip rest of list */
361 return dest;
362 }
363 }
364
365
366 List_T
Pair_clip_bounded_list_3(List_T source,int minpos,int maxpos)367 Pair_clip_bounded_list_3 (List_T source, int minpos, int maxpos) {
368 List_T dest, *prev, p;
369 T pair;
370 int starti = -1, endi = -1, i;
371
372 if (source == NULL) {
373 return (List_T) NULL;
374 } else {
375 for (p = source, i = 0; p != NULL; p = p->rest, i++) {
376 pair = (Pair_T) List_head(p);
377 if (pair->querypos == minpos && starti < 0) {
378 starti = i; /* Does not advance in case of tie */
379 } else if (pair->querypos > minpos && starti < 0) {
380 starti = i; /* Handles case where minpos was skipped */
381 }
382
383 if (pair->querypos == maxpos) {
384 endi = i + 1; /* Advances in case of ties */
385 } else if (pair->querypos > maxpos && endi < 0) {
386 endi = i; /* Handles case where maxpos was skipped */
387 }
388 }
389
390 if (starti < 0 && endi < 0) {
391 /* None of the pairs fall within bounds */
392 return (List_T) NULL;
393 } else {
394 if (starti < 0) {
395 starti = 0;
396 }
397 if (endi < 0) {
398 endi = i;
399 }
400 }
401
402 p = source;
403 i = 0;
404 while (i < starti) {
405 p = p->rest;
406 i++;
407 }
408
409 dest = p;
410 prev = &p->rest;
411 while (i < endi) {
412 prev = &p->rest;
413 p = p->rest;
414 i++;
415 }
416
417 *prev = NULL; /* Clip rest of list */
418 return dest;
419 }
420 }
421
422
423 int
Pair_clip_bounded_array(struct T * source,int npairs,int minpos,int maxpos)424 Pair_clip_bounded_array (struct T *source, int npairs, int minpos, int maxpos) {
425 T pair;
426 int starti = -1, endi = -1, i, k;
427
428 #if 0
429 printf("Pair_clip_bounded_array called with %d pairs, minpos %d, maxpos %d\n",npairs,minpos,maxpos);
430 Pair_dump_array(source,npairs,true);
431 #endif
432
433 for (i = 0; i < npairs; i++) {
434 pair = &(source[i]);
435 if (pair->querypos == minpos) {
436 starti = i; /* Advances in case of ties */
437 } else if (pair->querypos > minpos && starti < 0) {
438 starti = i; /* Handles case where minpos was skipped */
439 }
440
441 if (pair->querypos == maxpos && endi < 0) {
442 endi = i + 1; /* Does not advance in case of tie */
443 } else if (pair->querypos > maxpos && endi < 0) {
444 endi = i; /* Handles case where maxpos was skipped */
445 }
446 }
447
448 if (starti < 0 && endi < 0) {
449 /* None of the pairs fall within bounds. Don't do anything. */
450 return npairs;
451 } else {
452 if (starti < 0) {
453 starti = 0;
454 }
455 if (endi < 0) {
456 endi = i;
457 }
458 }
459
460 k = 0;
461 for (i = starti; i < endi; i++) {
462 memcpy((void *) &(source[k++]),(void *) &(source[i]),sizeof(struct T));
463 }
464
465 return endi - starti;
466 }
467
468
469
470 /* Head of list is the medial part of the read */
471 List_T
Pair_protect_end5(List_T pairs)472 Pair_protect_end5 (List_T pairs) {
473 List_T p;
474 T pair;
475
476 p = pairs;
477
478 /* Go until known splice is seen */
479 while (p != NULL && ((T) p->first)->gapp == false) {
480 pair = (T) p->first;
481 pair->protectedp = true;
482 p = p->rest;
483 }
484
485 /* Handle known splice */
486 if (p != NULL) {
487 pair = (T) p->first;
488 pair->protectedp = true;
489 p = p->rest;
490 }
491
492 /* Continue until distal indel is seen */
493 while (p != NULL && ((T) p->first)->cdna != ' ' && ((T) p->first)->genome != ' ') {
494 pair = (T) p->first;
495 pair->protectedp = true;
496 p = p->rest;
497 }
498
499 /* Do not protect the sequence after the distal indel */
500 while (p != NULL) {
501 pair = (T) p->first;
502 pair->protectedp = false;
503 p = p->rest;
504 }
505
506 return pairs;
507 }
508
509
510 /* Head of list is the 3' distal end of the read */
511 List_T
Pair_protect_end3(List_T pairs)512 Pair_protect_end3 (List_T pairs) {
513 List_T p;
514 T pair;
515
516 p = pairs = List_reverse(pairs); /* Now head is medial end */
517
518 /* Go until known splice is seen */
519 while (p != NULL && ((T) p->first)->gapp == false) {
520 pair = (T) p->first;
521 pair->protectedp = true;
522 /* result = Pairpool_push_existing(result,pairpool,pair); */
523 p = p->rest;
524 }
525
526 /* Handle known splice */
527 if (p != NULL) {
528 pair = (T) p->first;
529 pair->protectedp = true;
530 /* result = Pairpool_push_existing(result,pairpool,pair); */
531 p = p->rest;
532 }
533
534 /* Continue until distal indel is seen */
535 while (p != NULL && ((T) p->first)->cdna != ' ' && ((T) p->first)->genome != ' ') {
536 pair = (T) p->first;
537 pair->protectedp = true;
538 /* result = Pairpool_push_existing(result,pairpool,pair); */
539 p = p->rest;
540 }
541
542 /* Do not protect the sequence after the distal indel */
543 while (p != NULL) {
544 pair = (T) p->first;
545 pair->protectedp = false;
546 /* result = Pairpool_push_existing(result,pairpool,pair); */
547 p = p->rest;
548 }
549
550 return List_reverse(pairs);
551 }
552
553
554 void
Pair_protect_list(List_T pairs)555 Pair_protect_list (List_T pairs) {
556 List_T p;
557 T pair;
558
559 for (p = pairs; p != NULL; p = p->rest) {
560 pair = (T) p->first;
561 pair->protectedp = true;
562 }
563
564 return;
565 }
566
567
568
569
570 /* Print routines */
571
572 static char *RULER = " . : . : . : . : . :";
573 static void
print_top_ruler(Filestring_T fp,int n,int npairs,int margin,int wraplength)574 print_top_ruler (Filestring_T fp, int n, int npairs, int margin, int wraplength) {
575 FPRINTF(fp,"%*d ",margin,n);
576 if (n + wraplength < npairs) {
577 FPRINTF(fp,"%s\n",RULER);
578 } else {
579 FPRINTF(fp,"%.*s\n",npairs-n,RULER);
580 }
581 return;
582 }
583
584 /*
585 static void
586 print_bottom_ruler (int n, int npairs, int margin, int wraplength) {
587 printf("%*s ",margin,"");
588 if (n + wraplength < npairs) {
589 printf("%s\n",RULER);
590 } else {
591 printf("%.*s\n",npairs-n,RULER);
592 }
593 return;
594 }
595 */
596
597
598 static void
print_cdna_sequence(Filestring_T fp,struct T * ptr,int n,int npairs,int margin,int wraplength)599 print_cdna_sequence (Filestring_T fp, struct T *ptr, int n, int npairs, int margin, int wraplength) {
600 struct T *this;
601 int i;
602
603 this = ptr;
604 FPRINTF(fp,"%*u ",margin,this->querypos + ONEBASEDP);
605 for (i = 0; n < npairs && i < wraplength; n++, i++) {
606 this = ptr++;
607 PUTC(this->cdna,fp);
608 }
609 PUTC('\n',fp);
610 return;
611 }
612
613 static int
find_aapos_in_line(struct T * ptr,int n,int npairs,int wraplength,bool genomep)614 find_aapos_in_line (struct T *ptr, int n, int npairs, int wraplength,
615 bool genomep) {
616 struct T *this, *last;
617
618 if (npairs - n < wraplength) {
619 last = &ptr[npairs - n - 1];
620 } else {
621 last = &ptr[wraplength - 1];
622 }
623 this = ptr;
624 while (this <= last && (genomep ? this->aa_g : this->aa_e) == ' ') {
625 this++;
626 }
627
628 if (this > last) {
629 /* No aa found */
630 return -1;
631 } else {
632 return this->aapos;
633 }
634 }
635
636
637 static void
print_peptide(Filestring_T fp,struct T * ptr,int n,int npairs,int margin,int wraplength,bool genomep)638 print_peptide (Filestring_T fp, struct T *ptr, int n, int npairs, int margin,
639 int wraplength, bool genomep) {
640 struct T *this;
641 int aapos, i;
642
643 if ((aapos = find_aapos_in_line(ptr,n,npairs,wraplength,genomep)) < 0) {
644 FPRINTF(fp,"%*s ",margin,"");
645 } else {
646 /* 4 is length of "aa.c" and "aa.g" */
647 if (genomep == true) {
648 FPRINTF(fp,"aa.g%*d ",margin-4,aapos);
649 } else {
650 FPRINTF(fp,"aa.c%*d ",margin-4,aapos);
651 }
652 }
653
654 if (genomep == true) {
655 for (i = 0; n < npairs && i < wraplength; n++, i++) {
656 this = ptr++;
657 PUTC(this->aa_g,fp);
658 }
659 } else {
660 for (i = 0; n < npairs && i < wraplength; n++, i++) {
661 this = ptr++;
662 PUTC(this->aa_e,fp);
663 }
664 }
665
666 PUTC('\n',fp);
667 return;
668 }
669
670 static void
print_alignment(Filestring_T fp,struct T * ptr,int n,int npairs,int margin,int wraplength)671 print_alignment (Filestring_T fp, struct T *ptr, int n, int npairs,
672 int margin, int wraplength) {
673 struct T *this;
674 int i;
675
676 FPRINTF(fp,"%*s ",margin,"");
677 for (i = 0; n < npairs && i < wraplength; n++, i++) {
678 this = ptr++;
679
680 #ifdef DIAGNOSTICP
681 /* Subtract 1 because dynprogindices start at +1 and -1 */
682 if (this->comp == DYNPROG_MATCH_COMP) {
683 if (this->dynprogindex > 0) {
684 FPRINTF(fp,"%c",(this->dynprogindex-1)%26+'a');
685 } else if (this->dynprogindex < 0) {
686 FPRINTF(fp,"%c",(-this->dynprogindex-1)%26+'A');
687 } else {
688 PUTC(DYNPROG_MATCH_COMP,fp);
689 }
690 } else if (this->shortexonp == true) {
691 PUTC(DIAGNOSTIC_SHORTEXON_COMP,fp);
692 } else {
693 PUTC(this->comp,fp);
694 }
695
696 #else
697 if (this->comp == DYNPROG_MATCH_COMP) {
698 PUTC(MATCH_COMP,fp);
699 } else if (this->comp == AMBIGUOUS_COMP) {
700 /* Previously put AMBIGUOUS_COMP only for PMAP, and MISMATCH_COMP for GMAP */
701 PUTC(AMBIGUOUS_COMP,fp);
702 } else if (this->comp == SHORTGAP_COMP) {
703 PUTC(INDEL_COMP,fp);
704 } else if (this->comp == EXTRAEXON_COMP) {
705 PUTC(INTRONGAP_COMP,fp);
706 } else {
707 PUTC(this->comp,fp);
708 }
709 #endif
710
711 }
712
713 PUTC('\n',fp);
714 return;
715 }
716
717
718 static void
print_genomic_sequence(Filestring_T fp,struct T * ptr,int n,int npairs,char * chrstring,Univcoord_T chroffset,int margin,int wraplength)719 print_genomic_sequence (Filestring_T fp, struct T *ptr, int n, int npairs,
720 char *chrstring, Univcoord_T chroffset,
721 int margin, int wraplength) {
722 struct T *this;
723 int i;
724 char Buffer[100];
725
726 this = ptr;
727 if (chrstring == NULL) {
728 sprintf(Buffer,"%llu",(unsigned long long) (chroffset+this->genomepos + ONEBASEDP));
729 } else {
730 sprintf(Buffer,"%s:%llu",chrstring,(unsigned long long) (this->genomepos + ONEBASEDP));
731 }
732 FPRINTF(fp,"%*s ",margin,Buffer);
733 for (i = 0; n < npairs && i < wraplength; n++, i++) {
734 this = ptr++;
735 if (this->comp == EXTRAEXON_COMP) {
736 PUTC(INTRONGAP_CHAR,fp);
737 } else {
738 PUTC(this->genome,fp);
739 }
740 }
741 PUTC('\n',fp);
742 return;
743 }
744
745 static void
print_genomicalt_sequence(Filestring_T fp,struct T * ptr,int n,int npairs,char * chrstring,Univcoord_T chroffset,int margin,int wraplength)746 print_genomicalt_sequence (Filestring_T fp, struct T *ptr, int n, int npairs,
747 char *chrstring, Univcoord_T chroffset,
748 int margin, int wraplength) {
749 struct T *this;
750 int i;
751 char Buffer[100];
752
753 this = ptr;
754 if (chrstring == NULL) {
755 sprintf(Buffer,"%llu",(unsigned long long) (chroffset+this->genomepos + ONEBASEDP));
756 } else {
757 sprintf(Buffer,"%s:%llu",chrstring, (unsigned long long) (this->genomepos + ONEBASEDP));
758 }
759 FPRINTF(fp,"%*s ",margin,Buffer);
760 for (i = 0; n < npairs && i < wraplength; n++, i++) {
761 this = ptr++;
762 if (this->comp == EXTRAEXON_COMP) {
763 PUTC(INTRONGAP_CHAR,fp);
764 } else if (this->genomealt == this->genome) {
765 PUTC(' ',fp);
766 } else {
767 PUTC(this->genomealt,fp);
768 }
769 }
770 PUTC('\n',fp);
771 return;
772 }
773
774
775 static int
compute_margin(struct T * start,struct T * end,char * chrstring,Univcoord_T chroffset)776 compute_margin (struct T *start, struct T *end, char *chrstring,
777 Univcoord_T chroffset) {
778 int margin;
779 char Buffer[100];
780
781 if (chrstring == NULL) {
782 sprintf(Buffer,"%llu",(unsigned long long) (chroffset + start->genomepos + ONEBASEDP));
783 } else {
784 sprintf(Buffer,"%s:%llu",chrstring,(unsigned long long) (start->genomepos + ONEBASEDP));
785 }
786 margin = (int) strlen(Buffer) + 1;
787
788 if (chrstring == NULL) {
789 sprintf(Buffer,"%llu",(unsigned long long) (chroffset + end->genomepos + ONEBASEDP));
790 } else {
791 sprintf(Buffer,"%s:%llu",chrstring,(unsigned long long) (end->genomepos + ONEBASEDP));
792 }
793 if ((int) strlen(Buffer) + 1 > margin) {
794 margin = (int) strlen(Buffer) + 1;
795 }
796
797 if (margin < DEFAULT_MARGIN) {
798 margin = DEFAULT_MARGIN;
799 }
800
801 return margin;
802 }
803
804
805 /*
806 static char
807 intron_symbol_rev (char c) {
808 switch (c) {
809 case '>': return '<';
810 case ')': return '(';
811 case ']': return '[';
812 case '<': return '>';
813 case '(': return ')';
814 case '[': return ']';
815 default: return c;
816 }
817 }
818 */
819
820 static char complCode[128] = COMPLEMENT_LC;
821
822 static struct T *
invert_path(struct T * old,int npairs)823 invert_path (struct T *old, int npairs) {
824 struct T *new;
825 int i, j;
826
827 new = (struct T *) MALLOC(npairs*sizeof(struct T));
828 for (i = 0, j = npairs-1; i < npairs; i++, j--) {
829 memcpy(&(new[j]),&(old[i]),sizeof(struct T));
830 new[j].comp = complCode[(int) old[i].comp];
831 }
832 return new;
833 }
834
835 static struct T *
invert_and_revcomp_path(struct T * old,int npairs)836 invert_and_revcomp_path (struct T *old, int npairs) {
837 struct T *new;
838 int i, j;
839
840 new = (struct T *) MALLOC(npairs*sizeof(struct T));
841 for (i = 0, j = npairs-1; i < npairs; i++, j--) {
842 memcpy(&(new[j]),&(old[i]),sizeof(struct T));
843 new[j].cdna = complCode[(int) old[i].cdna];
844 new[j].genome = complCode[(int) old[i].genome];
845 new[j].genomealt = complCode[(int) old[i].genomealt];
846 new[j].comp = complCode[(int) old[i].comp];
847 }
848 return new;
849 }
850
851
#ifdef GSNAP
/* Like invert_and_revcomp_path, but also rewrites each querypos as
   (querylength - 1) - querypos.  Returns a newly allocated array owned by
   the caller. */
static struct T *
invert_and_revcomp_path_and_coords (struct T *old, int npairs, int querylength) {
  struct T *inverted, *dest;
  int src;

  inverted = (struct T *) MALLOC(npairs*sizeof(struct T));

  dest = &(inverted[npairs-1]);
  for (src = 0; src < npairs; src++) {
    memcpy(dest,&(old[src]),sizeof(struct T));
    dest->querypos = (querylength - 1) - old[src].querypos;
    dest->cdna = complCode[(int) old[src].cdna];
    dest->genome = complCode[(int) old[src].genome];
    dest->genomealt = complCode[(int) old[src].genomealt];
    dest->comp = complCode[(int) old[src].comp];
    dest--;
  }

  return inverted;
}
#endif
870
871
872 static void
add_intronlengths(struct T * pairs,int npairs)873 add_intronlengths (struct T *pairs, int npairs) {
874 struct T *this = NULL, *ptr;
875 int space, margin, i, j, k, gapstart;
876 char intronstring[20], cdnabreak[20], genomicbreak[20], comp;
877 int last_querypos = -1;
878 Chrpos_T last_genomepos = (Chrpos_T) -1;
879
880 i = 0;
881 while (i < npairs) {
882 /* prev = this; */
883 this = &(pairs[i++]);
884
885 if (this->extraexonp == true) {
886 /* Don't add any lengths */
887 } else if (this->gapp) {
888 comp = this->comp;
889 gapstart = i-1;
890 space = 0;
891 while (this->gapp) {
892 this = &(pairs[i++]);
893 space++;
894 }
895
896 if (comp == DUALBREAK_COMP || comp == EXTRAEXON_COMP) {
897 /* abs() gives a large value when flag -m64 is specified */
898 /* sprintf(cdnabreak,"%d",abs(this->querypos - last_querypos)-1); */
899 if (this->querypos > last_querypos) {
900 sprintf(cdnabreak,"%d",(this->querypos - last_querypos) - 1);
901 } else {
902 sprintf(cdnabreak,"%d",(last_querypos - this->querypos) - 1);
903 }
904 if (this->genomepos < last_genomepos) {
905 sprintf(genomicbreak,"%d",last_genomepos - this->genomepos - 1);
906 } else {
907 sprintf(genomicbreak,"%d",this->genomepos - last_genomepos - 1);
908 }
909
910 margin = (space - strlen(cdnabreak))/2;
911 j = gapstart;
912 while (margin > 0) {
913 ptr = &(pairs[j++]);
914 margin--;
915 }
916 for (k = 0; k < (int) strlen(cdnabreak); k++) {
917 ptr = &(pairs[j++]);
918 ptr->cdna = cdnabreak[k];
919 }
920
921 margin = (space - strlen(genomicbreak))/2;
922 j = gapstart;
923 while (margin > 0) {
924 ptr = &(pairs[j++]);
925 margin--;
926 }
927 for (k = 0; k < (int) strlen(genomicbreak); k++) {
928 ptr = &(pairs[j++]);
929 ptr->genome = genomicbreak[k];
930 /* ptr->genomealt = ' '; */
931 }
932
933 } else { /* Intron */
934 if (this->genomepos < last_genomepos) {
935 sprintf(intronstring,"%d",last_genomepos - this->genomepos - 1);
936 } else {
937 sprintf(intronstring,"%d",this->genomepos - last_genomepos - 1);
938 }
939 margin = (space - strlen(intronstring))/2;
940 j = gapstart;
941 while (margin > 0) {
942 ptr = &(pairs[j++]);
943 margin--;
944 }
945 for (k = 0; k < (int) strlen(intronstring); k++) {
946 ptr = &(pairs[j++]);
947 ptr->cdna = intronstring[k];
948 }
949 }
950 }
951
952 if (this->cdna != ' ') {
953 last_querypos = this->querypos;
954 }
955 if (this->genome != ' ') {
956 last_genomepos = this->genomepos;
957 }
958 }
959 return;
960 }
961
962
963 /* Needed to recompute translation_length in parts of chimeras */
964 int
Pair_translation_length(struct T * pairs,int npairs)965 Pair_translation_length (struct T *pairs, int npairs) {
966 int translation_length = 0;
967 int i;
968
969 for (i = 0; i < npairs; i++) {
970 if (pairs[i].aa_e == ' ') {
971 } else if (pairs[i].aa_e == '*') {
972 } else {
973 translation_length++;
974 }
975 }
976 return translation_length;
977 }
978
979
980 void
Pair_print_continuous(Filestring_T fp,struct T * pairs,int npairs,bool watsonp,bool genomefirstp,int invertmode,bool nointronlenp)981 Pair_print_continuous (Filestring_T fp, struct T *pairs, int npairs, bool watsonp,
982 bool genomefirstp, int invertmode, bool nointronlenp) {
983 T this;
984 struct T *save = NULL, *ptr;
985 int n = 0;
986
987 if (watsonp == true) {
988 ptr = pairs;
989 } else if (invertmode == 0) {
990 ptr = pairs;
991 } else if (invertmode == 1) {
992 save = ptr = invert_path(pairs,npairs);
993 } else if (invertmode == 2) {
994 save = ptr = invert_and_revcomp_path(pairs,npairs);
995 } else {
996 fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
997 exit(9);
998 }
999 if (nointronlenp == false) {
1000 add_intronlengths(ptr,npairs);
1001 }
1002
1003 if (genomefirstp == true) {
1004 ptr = pairs;
1005 for (n = 0; n < npairs; n++) {
1006 this = ptr++;
1007 PUTC(this->genome,fp);
1008 }
1009 PUTC('\n',fp);
1010
1011 ptr = pairs;
1012 for (n = 0; n < npairs; n++) {
1013 this = ptr++;
1014 #ifdef DIAGNOSTICP
1015 PUTC(this->comp,fp);
1016 #else
1017 if (this->comp == MATCH_COMP) {
1018 PUTC(MATCH_COMP,fp);
1019 } else if (this->comp == DYNPROG_MATCH_COMP) {
1020 PUTC(MATCH_COMP,fp);
1021 } else if (this->comp == AMBIGUOUS_COMP) {
1022 #ifdef PMAP
1023 PUTC(AMBIGUOUS_COMP,fp);
1024 #else
1025 PUTC(MISMATCH_COMP,fp);
1026 #endif
1027 } else {
1028 PUTC(this->comp,fp);
1029 }
1030 #endif
1031
1032 }
1033 PUTC('\n',fp);
1034
1035 ptr = pairs;
1036 for (n = 0; n < npairs; n++) {
1037 this = ptr++;
1038 PUTC(this->cdna,fp);
1039 }
1040 PUTC('\n',fp);
1041
1042 } else {
1043 ptr = pairs;
1044 for (n = 0; n < npairs; n++) {
1045 this = ptr++;
1046 PUTC(this->cdna,fp);
1047 }
1048 PUTC('\n',fp);
1049
1050 ptr = pairs;
1051 for (n = 0; n < npairs; n++) {
1052 this = ptr++;
1053
1054 #ifdef DIAGNOSTICP
1055 PUTC(this->comp,fp);
1056 #else
1057 if (this->comp == MATCH_COMP) {
1058 PUTC(MATCH_COMP,fp);
1059 } else if (this->comp == DYNPROG_MATCH_COMP) {
1060 PUTC(MATCH_COMP,fp);
1061 } else if (this->comp == AMBIGUOUS_COMP) {
1062 #ifdef PMAP
1063 PUTC(AMBIGUOUS_COMP,fp);
1064 #else
1065 PUTC(MISMATCH_COMP,fp);
1066 #endif
1067 } else {
1068 PUTC(this->comp,fp);
1069 }
1070 #endif
1071
1072 }
1073 PUTC('\n',fp);
1074
1075 ptr = pairs;
1076 for (n = 0; n < npairs; n++) {
1077 this = ptr++;
1078 PUTC(this->genome,fp);
1079 }
1080 PUTC('\n',fp);
1081 }
1082
1083 if (save != NULL) {
1084 FREE(save);
1085 }
1086 return;
1087 }
1088
1089
1090
1091 void
Pair_print_continuous_byexon(Filestring_T fp,struct T * pairs,int npairs,bool watsonp,int invertmode)1092 Pair_print_continuous_byexon (Filestring_T fp, struct T *pairs, int npairs, bool watsonp, int invertmode) {
1093 T this;
1094 struct T *save = NULL, *ptr;
1095 int i = 0, j;
1096
1097 if (watsonp == true) {
1098 ptr = pairs;
1099 } else if (invertmode == 0) {
1100 ptr = pairs;
1101 } else if (invertmode == 1) {
1102 save = ptr = invert_path(pairs,npairs);
1103 } else if (invertmode == 2) {
1104 save = ptr = invert_and_revcomp_path(pairs,npairs);
1105 } else {
1106 fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
1107 exit(9);
1108 }
1109
1110 ptr = pairs;
1111 while (i < npairs) {
1112 j = i;
1113 this = ptr;
1114
1115 while (j < npairs && this->gapp == false) {
1116 PUTC(this->genome,fp);
1117 this++;
1118 j++;
1119 }
1120 PUTC('\n',fp);
1121
1122 j = i;
1123 this = ptr;
1124 while (j < npairs && this->gapp == false) {
1125
1126 #ifdef DIAGNOSTICP
1127 PUTC(this->comp,fp);
1128
1129 #else
1130 if (this->comp == MATCH_COMP) {
1131 PUTC(MATCH_COMP,fp);
1132 } else if (this->comp == DYNPROG_MATCH_COMP) {
1133 PUTC(MATCH_COMP,fp);
1134 } else if (this->comp == AMBIGUOUS_COMP) {
1135 #ifdef PMAP
1136 PUTC(AMBIGUOUS_COMP,fp);
1137 #else
1138 PUTC(MISMATCH_COMP,fp);
1139 #endif
1140 } else {
1141 PUTC(this->comp,fp);
1142 }
1143 #endif
1144
1145 this++;
1146 j++;
1147 }
1148 PUTC('\n',fp);
1149
1150 j = i;
1151 this = ptr;
1152 while (j < npairs && this->gapp == false) {
1153 PUTC(this->cdna,fp);
1154 this++;
1155 j++;
1156 }
1157 FPRINTF(fp,"\n\n");
1158
1159 i = j;
1160 while (i < npairs && this->gapp == true) {
1161 this++;
1162 i++;
1163 }
1164 ptr = this;
1165 }
1166
1167 if (save != NULL) {
1168 FREE(save);
1169 }
1170 return;
1171 }
1172
1173
1174 void
Pair_print_alignment(Filestring_T fp,struct T * pairs,int npairs,Chrnum_T chrnum,Univcoord_T chroffset,Univ_IIT_T chromosome_iit,bool watsonp,int invertmode,bool nointronlenp,int wraplength)1175 Pair_print_alignment (Filestring_T fp, struct T *pairs, int npairs, Chrnum_T chrnum,
1176 Univcoord_T chroffset, Univ_IIT_T chromosome_iit, bool watsonp,
1177 int invertmode, bool nointronlenp, int wraplength) {
1178 struct T *save = NULL, *ptr;
1179 int n = 0, i;
1180 char *chrstring = NULL;
1181 int margin;
1182
1183 if (watsonp == true) {
1184 ptr = pairs;
1185
1186 } else if (invertmode == 0) {
1187 /* Given cDNA sequence, use minus genome strand */
1188 ptr = pairs;
1189
1190 } else if (invertmode == 1) {
1191 /* Invert cDNA sequence, use minus genome strand */
1192 save = ptr = invert_path(pairs,npairs);
1193
1194 } else if (invertmode == 2) {
1195 /* Invert cDNA sequence, use plus genome strand */
1196 save = ptr = invert_and_revcomp_path(pairs,npairs);
1197
1198 } else {
1199 fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
1200 exit(9);
1201 }
1202
1203 if (nointronlenp == false) {
1204 add_intronlengths(ptr,npairs);
1205 }
1206 if (chrnum != 0) {
1207 if (invertmode == 2) {
1208 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
1209 } else {
1210 chrstring = Chrnum_to_string_signed(chrnum,chromosome_iit,watsonp);
1211 }
1212 }
1213
1214 margin = compute_margin(&(pairs[0]),&(pairs[npairs-1]),chrstring,chroffset);
1215
1216 while (n < npairs) {
1217 print_top_ruler(fp,n,npairs,margin,wraplength);
1218 print_peptide(fp,ptr,n,npairs,margin,wraplength,/*genomep*/true);
1219 if (snps_p) {
1220 print_genomicalt_sequence(fp,ptr,n,npairs,chrstring,
1221 chroffset,margin,wraplength);
1222 }
1223 print_genomic_sequence(fp,ptr,n,npairs,chrstring,
1224 chroffset,margin,wraplength);
1225 print_alignment(fp,ptr,n,npairs,margin,wraplength);
1226 print_cdna_sequence(fp,ptr,n,npairs,margin,wraplength);
1227 print_peptide(fp,ptr,n,npairs,margin,wraplength,/*genomep*/false);
1228 PUTC('\n',fp);
1229 for (i = 0; n < npairs && i < wraplength; n++, i++) {
1230 ptr++;
1231 }
1232 }
1233 if (chrstring != NULL) {
1234 FREE(chrstring);
1235 }
1236 if (save != NULL) {
1237 FREE(save);
1238 }
1239 return;
1240 }
1241
1242 void
Pair_print_pathsummary(Filestring_T fp,int pathnum,T start,T end,Chrnum_T chrnum,Univcoord_T chroffset,Univ_IIT_T chromosome_iit,bool referencealignp,IIT_T altstrain_iit,char * strain,Univ_IIT_T contig_iit,char * dbversion,int querylength_given,int skiplength,int trim_start,int trim_end,int nexons,int matches,int unknowns,int mismatches,int qopens,int qindels,int topens,int tindels,bool watsonp,int cdna_direction,int translation_start,int translation_end,int translation_length,int relaastart,int relaaend)1243 Pair_print_pathsummary (Filestring_T fp, int pathnum, T start, T end, Chrnum_T chrnum,
1244 Univcoord_T chroffset, Univ_IIT_T chromosome_iit, bool referencealignp,
1245 IIT_T altstrain_iit, char *strain, Univ_IIT_T contig_iit, char *dbversion,
1246 int querylength_given, int skiplength, int trim_start, int trim_end,
1247 int nexons, int matches, int unknowns, int mismatches,
1248 int qopens, int qindels, int topens, int tindels,
1249 bool watsonp, int cdna_direction,
1250 int translation_start, int translation_end, int translation_length,
1251 int relaastart, int relaaend) {
1252 int querypos1, querypos2, den;
1253 double fracidentity, coverage, trimmed_coverage;
1254 Univcoord_T position1, position2;
1255 Chrpos_T chrpos1, chrpos2;
1256 char *refstrain, *comma1, *comma2, *chr;
1257
1258 querypos1 = start->querypos;
1259 querypos2 = end->querypos;
1260
1261 FPRINTF(fp," Path %d: ",pathnum);
1262 FPRINTF(fp,"query %d%s%d (%d bp) => ",
1263 querypos1 + ONEBASEDP,SEPARATOR,querypos2 + ONEBASEDP,querypos2-querypos1+1);
1264
1265 chrpos1 = start->genomepos;
1266 chrpos2 = end->genomepos;
1267
1268 comma1 = Genomicpos_commafmt(chrpos1 + ONEBASEDP);
1269 comma2 = Genomicpos_commafmt(chrpos2 + ONEBASEDP);
1270 if (chrnum == 0) {
1271 if (watsonp) {
1272 FPRINTF(fp,"genome %s%s%s (%d bp)\n",
1273 comma1,SEPARATOR,comma2,chrpos2-chrpos1+1);
1274 } else {
1275 FPRINTF(fp,"genome %s%s%s (%d bp)\n",
1276 comma1,SEPARATOR,comma2,chrpos2-chrpos1-1);
1277 }
1278 } else {
1279 chr = Chrnum_to_string(chrnum,chromosome_iit);
1280 if (watsonp) {
1281 FPRINTF(fp,"genome %s:%s%s%s (%d bp)\n",chr,comma1,SEPARATOR,comma2,chrpos2-chrpos1+1);
1282 } else {
1283 FPRINTF(fp,"genome %s:%s%s%s (%d bp)\n",chr,comma1,SEPARATOR,comma2,chrpos2-chrpos1-1);
1284 }
1285 FREE(chr);
1286 }
1287 FREE(comma2);
1288 FREE(comma1);
1289
1290 FPRINTF(fp," cDNA direction: ");
1291 if (cdna_direction > 0) {
1292 FPRINTF(fp,"sense\n");
1293 } else if (cdna_direction < 0) {
1294 FPRINTF(fp,"antisense\n");
1295 } else {
1296 FPRINTF(fp,"indeterminate\n");
1297 }
1298
1299 if (altstrain_iit != NULL) {
1300 if (strain == NULL) {
1301 refstrain = IIT_typestring(altstrain_iit,/*straintype*/0);
1302 if (refstrain[0] == '\0') {
1303 /* Backward compatibility with old altstrain_iit */
1304 FPRINTF(fp," Strain: reference\n");
1305 } else {
1306 FPRINTF(fp," Strain: %s (reference)\n",refstrain);
1307 }
1308 } else {
1309 FPRINTF(fp," Strain: %s\n",strain);
1310 }
1311 }
1312
1313 position1 = chroffset + chrpos1;
1314 position2 = chroffset + chrpos2;
1315 comma1 = Genomicpos_commafmt(position1 + ONEBASEDP);
1316 comma2 = Genomicpos_commafmt(position2 + ONEBASEDP);
1317 if (dbversion == NULL) {
1318 FPRINTF(fp," Genomic pos: %s%s%s",comma1,SEPARATOR,comma2);
1319 } else {
1320 FPRINTF(fp," Genomic pos: %s:%s%s%s",dbversion,comma1,SEPARATOR,comma2);
1321 }
1322 if (chrpos1 <= chrpos2) {
1323 FPRINTF(fp," (+ strand)\n");
1324 } else {
1325 FPRINTF(fp," (- strand)\n");
1326 }
1327 FREE(comma2);
1328 FREE(comma1);
1329
1330 if (contig_iit != NULL) {
1331 if (position1 <= position2) {
1332 Segmentpos_print_accessions(fp,contig_iit,position1,position2,referencealignp,strain);
1333 } else {
1334 Segmentpos_print_accessions(fp,contig_iit,position2,position1,referencealignp,strain);
1335 }
1336 }
1337
1338 FPRINTF(fp," Number of exons: %d\n",nexons);
1339
1340 #ifdef PMAP
1341 coverage = (double) (querypos2 - querypos1 + 1)/(double) (3*(querylength_given + skiplength));
1342 /* coverage = (double) (matches + mismatches + qindels)/(double) (3*(querylength_given + skiplength)); */
1343
1344 /* Can have coverage greater than given querylength because of added '*' at end */
1345 if (coverage > 1.0) {
1346 coverage = 1.0;
1347 }
1348 #else
1349 /* coverage = (double) (matches + mismatches + qindels)/(double) (querylength_given + skiplength); */
1350 coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given + skiplength);
1351 #endif
1352 FPRINTF(fp," Coverage: %.1f",((double) rint(1000.0*coverage))/10.0);
1353 #ifdef PMAP
1354 FPRINTF(fp," (query length: %d aa)\n",querylength_given);
1355 #else
1356 FPRINTF(fp," (query length: %d bp)\n",querylength_given);
1357 if (querypos2 + 1 > trim_end) {
1358 trim_end = querypos2 + 1;
1359 }
1360 if (querypos1 < trim_start) {
1361 trim_start = querypos1;
1362 }
1363
1364 trimmed_coverage = (double) (querypos2 - querypos1 + 1)/(double) (trim_end - trim_start + skiplength);
1365 FPRINTF(fp," Trimmed coverage: %.1f",((double) rint(1000.0*trimmed_coverage))/10.0);
1366 FPRINTF(fp," (trimmed length: %d bp, trimmed region: %d..%d)",
1367 trim_end-trim_start,trim_start+ONEBASEDP,trim_end-1+ONEBASEDP);
1368 PUTC('\n',fp);
1369 #endif
1370
1371 if ((den = matches + mismatches + qindels + tindels) == 0) {
1372 fracidentity = 1.0;
1373 } else {
1374 fracidentity = (double) matches/(double) den;
1375 }
1376
1377 /* The definition of indels here should be consistent with Stage3_indels */
1378 FPRINTF(fp," Percent identity: %.1f (%d matches, %d mismatches, %d indels, %d unknowns)\n",
1379 ((double) rint(1000.0*fracidentity))/10.0,matches,mismatches,qindels+tindels,unknowns);
1380 if (qindels + tindels > 0) {
1381 FPRINTF(fp," Non-intron gaps: %d openings, %d bases in cdna; %d openings, %d bases in genome\n",
1382 qopens,qindels,topens,tindels);
1383 }
1384
1385 #ifndef PMAP
1386 if (translation_length > 0) {
1387 if (cdna_direction >= 0) {
1388 FPRINTF(fp," Translation: %d..%d (%d aa)\n",
1389 translation_start+ONEBASEDP,translation_end+ONEBASEDP,translation_length);
1390 } else {
1391 FPRINTF(fp," Translation: %d..%d (%d aa)\n",
1392 translation_end+ONEBASEDP,translation_start+ONEBASEDP,translation_length);
1393 }
1394 } else if (relaastart > 0) {
1395 if (relaastart < relaaend) {
1396 FPRINTF(fp," Protein coords: %d..%d\n",relaastart,relaaend);
1397 } else {
1398 FPRINTF(fp," Protein coords: %d..%d\n",relaaend,relaastart);
1399 }
1400 }
1401 #endif
1402
1403 /* FPRINTF(fp," Defect rate (percent): %.1f\n",defect_rate*100.0); */
1404
1405 /* PUTC('\n',fp); -- Done by caller */
1406
1407 return;
1408 }
1409
1410
1411 void
Pair_print_coordinates(Filestring_T fp,struct T * pairs,int npairs,Chrnum_T chrnum,Univcoord_T chroffset,Univ_IIT_T chromosome_iit,bool watsonp,int invertmode)1412 Pair_print_coordinates (Filestring_T fp, struct T *pairs, int npairs, Chrnum_T chrnum,
1413 Univcoord_T chroffset, Univ_IIT_T chromosome_iit,
1414 bool watsonp, int invertmode) {
1415 T this;
1416 struct T *save = NULL;
1417 int i;
1418 char *chrstring = NULL;
1419
1420 Pair_check_array_pairs(pairs,npairs);
1421
1422 if (watsonp == true) {
1423 /* ptr = pairs; */
1424
1425 } else if (invertmode == 0) {
1426 /* Given cDNA sequence, use minus genome strand */
1427 /* ptr = pairs; */
1428
1429 } else if (invertmode == 1) {
1430 /* Invert cDNA sequence, use minus genome strand */
1431 save = invert_path(pairs,npairs);
1432
1433 } else if (invertmode == 2) {
1434 /* Invert cDNA sequence, use plus genome strand */
1435 save = invert_and_revcomp_path(pairs,npairs);
1436
1437 } else {
1438 fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
1439 exit(9);
1440 }
1441
1442 if (chrnum != 0) {
1443 if (invertmode == 2) {
1444 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
1445 } else {
1446 chrstring = Chrnum_to_string_signed(chrnum,chromosome_iit,watsonp);
1447 }
1448 }
1449
1450 for (i = 0; i < npairs; i++) {
1451 this = pairs++;
1452 if (this->gapp == false) {
1453 #ifdef DEBUG5
1454 FPRINTF(fp,"%d %d %c\t",this->aapos,this->aaphase_e,this->aa_e);
1455 #else
1456 if (this->aaphase_e != 0) {
1457 FPRINTF(fp,"%d\t",this->aapos);
1458 } else {
1459 FPRINTF(fp,"%d %c\t",this->aapos,this->aa_e);
1460 }
1461 #endif
1462 FPRINTF(fp,"%d %c\t",this->querypos + ONEBASEDP,this->cdna);
1463 if (chrstring == NULL) {
1464 FPRINTF(fp,"%u %u %c",this->genomepos + ONEBASEDP,
1465 chroffset + this->genomepos + ONEBASEDP,
1466 this->genome);
1467 } else {
1468 FPRINTF(fp,"%s:%u %u %c",chrstring,
1469 this->genomepos + ONEBASEDP,
1470 chroffset + this->genomepos + ONEBASEDP,
1471 this->genome);
1472 }
1473 if (this->genomealt != this->genome) {
1474 FPRINTF(fp," %c",this->genomealt);
1475 }
1476
1477 #ifdef DEBUG5
1478 FPRINTF(fp,"\t%d %c",this->aaphase_g,this->aa_g);
1479 #else
1480 if (this->aaphase_g != 0) {
1481 FPRINTF(fp,"\t");
1482 } else {
1483 FPRINTF(fp,"\t%c",this->aa_g);
1484 }
1485 #endif
1486 PUTC('\n',fp);
1487 }
1488 }
1489
1490 if (chrstring != NULL) {
1491 FREE(chrstring);
1492 }
1493 if (save != NULL) {
1494 FREE(save);
1495 }
1496 return;
1497 }
1498
1499
1500 int
Pair_cmp(const void * a,const void * b)1501 Pair_cmp (const void *a, const void *b) {
1502 T x = * (T *) a;
1503 T y = * (T *) b;
1504
1505 if (x->querypos < y->querypos) {
1506 return -1;
1507 } else if (y->querypos < x->querypos) {
1508 return +1;
1509 } else if (x->genomepos < y->genomepos) {
1510 return -1;
1511 } else if (y->genomepos < x->genomepos) {
1512 return +1;
1513 } else {
1514 return 0;
1515 }
1516 }
1517
1518
1519 void
Pair_dump_one(T this,bool zerobasedp)1520 Pair_dump_one (T this, bool zerobasedp) {
1521
1522 debug1(printf("%p ",this));
1523
1524 if (this->gapp == true && this->extraexonp == false) {
1525 printf("*** Gap: queryjump = %d, genomejump = %d, type: ",this->queryjump,this->genomejump);
1526 switch (this->comp) {
1527 case FWD_CANONICAL_INTRON_COMP: printf("> GT-AG"); break;
1528 case FWD_GCAG_INTRON_COMP: printf(") GC-AG"); break;
1529 case FWD_ATAC_INTRON_COMP: printf("] AT-AC"); break;
1530 case REV_ATAC_INTRON_COMP: printf("[ AT-AC"); break;
1531 case REV_GCAG_INTRON_COMP: printf("( GC-AG"); break;
1532 case REV_CANONICAL_INTRON_COMP: printf("< GT-AG"); break;
1533 case SHORTGAP_COMP: printf("~ shortgap"); break;
1534 case NONINTRON_COMP: printf("= nonintron"); break;
1535 default: printf("? unknown"); break;
1536 }
1537
1538 if (this->knowngapp == true) {
1539 printf(" known");
1540 }
1541
1542 printf(" donor:%f acceptor:%f",this->donor_prob,this->acceptor_prob);
1543 printf(" ***");
1544
1545 } else {
1546 printf("%d %d %c ",
1547 this->querypos + !zerobasedp,this->genomepos + !zerobasedp,this->cdna);
1548
1549 /* Subtract 1 because dynprogindices start at +1 and -1 */
1550 if (this->dynprogindex > 0) {
1551 printf("%c%c",this->comp,(this->dynprogindex-1)%26+'a');
1552 } else if (this->dynprogindex < 0) {
1553 printf("%c%c",this->comp,(-this->dynprogindex-1)%26+'A');
1554 } else {
1555 putchar(this->comp);
1556 }
1557 printf(" %c",this->genome);
1558 if (this->genomealt != this->genome) {
1559 printf(" alt:%c",this->genomealt);
1560 }
1561 }
1562
1563 if (this->protectedp == true) {
1564 printf(" protected");
1565 }
1566
1567 if (this->disallowedp == true) {
1568 printf(" disallowed");
1569 }
1570
1571 if (this->shortexonp == true) {
1572 printf(" shortexon");
1573 }
1574
1575 if (this->gapp == true) {
1576 printf(" gap");
1577 }
1578
1579 #if 0
1580 if (this->state == BAD) {
1581 printf(" bad");
1582 }
1583 #endif
1584
1585 return;
1586 }
1587
1588
1589 /* Useful for debugging */
1590 void
Pair_dump_list(List_T pairs,bool zerobasedp)1591 Pair_dump_list (List_T pairs, bool zerobasedp) {
1592 T this, prev = NULL, old = NULL;
1593 List_T p;
1594
1595 printf("***Start of list***\n");
1596 for (p = pairs; p != NULL; p = List_next(p)) {
1597 this = List_head(p);
1598 Pair_dump_one(this,zerobasedp);
1599 printf("\n");
1600
1601 if (this->querypos != -1) {
1602 if (old != NULL) {
1603 if (old->querypos > prev->querypos) {
1604 if (prev->querypos < this->querypos) {
1605 fprintf(stderr,"%d %d %d\n",old->querypos,prev->querypos,this->querypos);
1606 abort();
1607 }
1608 } else if (old->querypos < prev->querypos) {
1609 if (prev->querypos > this->querypos) {
1610 fprintf(stderr,"%d %d %d\n",old->querypos,prev->querypos,this->querypos);
1611 abort();
1612 }
1613 }
1614 }
1615
1616 old = prev;
1617 prev = this;
1618 }
1619
1620 }
1621 printf("***End of list***\n");
1622 return;
1623 }
1624
1625 void
Pair_dump_array(struct T * pairs,int npairs,bool zerobasedp)1626 Pair_dump_array (struct T *pairs, int npairs, bool zerobasedp) {
1627 struct T *this;
1628 int i;
1629
1630 for (i = 0; i < npairs; i++) {
1631 this = pairs++;
1632 printf("%d: %d %d %d %c ",
1633 i,this->querypos + !zerobasedp,this->genomepos + !zerobasedp,this->aapos,
1634 this->cdna);
1635
1636 /* Subtract 1 because dynprogindices start at +1 and -1 */
1637 if (this->dynprogindex > 0) {
1638 printf("%c%c",this->comp,(this->dynprogindex-1)%26+'a');
1639 } else if (this->dynprogindex < 0) {
1640 printf("%c%c",this->comp,(-this->dynprogindex-1)%26+'A');
1641 } else {
1642 putchar(this->comp);
1643 }
1644 printf(" %c",this->genome);
1645 if (this->genomealt != this->genome) {
1646 printf(" alt:%c",this->genomealt);
1647 }
1648
1649 debug7(printf(" aaphase_g:%d aaphase_e:%d",this->aaphase_g,this->aaphase_e));
1650
1651 if (this->aaphase_g == 0 || this->aaphase_e == 0) {
1652 printf(" => %c %c",this->aa_g,this->aa_e);
1653 }
1654
1655 if (this->gapp) {
1656 printf(" gap");
1657 }
1658
1659 printf("\n");
1660 }
1661 return;
1662 }
1663
1664
1665 void
Pair_dump_array_stderr(struct T * pairs,int npairs,bool zerobasedp)1666 Pair_dump_array_stderr (struct T *pairs, int npairs, bool zerobasedp) {
1667 struct T *this;
1668 int i;
1669
1670 for (i = 0; i < npairs; i++) {
1671 this = pairs++;
1672 fprintf(stderr,"%d: %d %d %d %c ",
1673 i,this->querypos + !zerobasedp,this->genomepos + !zerobasedp,this->aapos,
1674 this->cdna);
1675
1676 /* Subtract 1 because dynprogindices start at +1 and -1 */
1677 if (this->dynprogindex > 0) {
1678 fprintf(stderr,"%c%c",this->comp,(this->dynprogindex-1)%26+'a');
1679 } else if (this->dynprogindex < 0) {
1680 fprintf(stderr,"%c%c",this->comp,(-this->dynprogindex-1)%26+'A');
1681 } else {
1682 putc(this->comp,stderr);
1683 }
1684 fprintf(stderr," %c",this->genome);
1685 if (this->genomealt != this->genome) {
1686 fprintf(stderr," alt:%c",this->genomealt);
1687 }
1688
1689 if (this->aaphase_g == 0 || this->aaphase_e == 0) {
1690 fprintf(stderr," => %c %c",this->aa_g,this->aa_e);
1691 }
1692 fprintf(stderr,"\n");
1693 }
1694 return;
1695 }
1696
1697
1698 void
Pair_dump_genome_array(struct T * pairs,int npairs)1699 Pair_dump_genome_array (struct T *pairs, int npairs) {
1700 struct T *this;
1701 int i;
1702
1703 for (i = 0; i < npairs; i++) {
1704 this = pairs++;
1705 printf("%c",this->genome);
1706 }
1707 printf("\n");
1708
1709 return;
1710 }
1711
1712 void
Pair_dump_comp_array(struct T * pairs,int npairs)1713 Pair_dump_comp_array (struct T *pairs, int npairs) {
1714 struct T *this;
1715 int i;
1716
1717 for (i = 0; i < npairs; i++) {
1718 this = pairs++;
1719 printf("%c",this->comp);
1720 }
1721 printf("\n");
1722
1723 return;
1724 }
1725
1726
1727 Chrpos_T
Pair_genomicpos(struct T * pairs,int npairs,int querypos,bool headp)1728 Pair_genomicpos (struct T *pairs, int npairs, int querypos, bool headp) {
1729 struct T *this;
1730 int i;
1731
1732 if (headp == true) {
1733 for (i = 0; i < npairs; i++) {
1734 this = pairs++;
1735 if (this->querypos == querypos) {
1736 return this->genomepos;
1737 } else if (this->querypos > querypos) {
1738 return 0;
1739 }
1740 }
1741 } else {
1742 pairs += npairs;
1743 for (i = npairs-1; i >= 0; --i) {
1744 this = --pairs;
1745 if (this->querypos == querypos) {
1746 return this->genomepos;
1747 } else if (this->querypos < querypos) {
1748 return 0;
1749 }
1750 }
1751 }
1752 return 0;
1753 }
1754
1755 int
Pair_codon_changepos(struct T * pairs,int npairs,int aapos,int cdna_direction)1756 Pair_codon_changepos (struct T *pairs, int npairs, int aapos, int cdna_direction) {
1757 struct T *this, *start, *end;
1758 int changepos = 0, i, ngenome = 0, ncdna = 0;
1759
1760 i = 0;
1761 this = pairs;
1762 while (i < npairs && this->aapos != aapos) {
1763 this++;
1764 i++;
1765 }
1766 start = this;
1767
1768 while (i < npairs && (ngenome < 3 || ncdna < 3)) {
1769 if (this->gapp == false) {
1770 if (this->genome != ' ') {
1771 ngenome++;
1772 }
1773 if (this->cdna != ' ') {
1774 ncdna++;
1775 }
1776 }
1777 this++;
1778 i++;
1779 }
1780 end = --this;
1781
1782 if (cdna_direction < 0) {
1783 for (this = end; this >= start; --this) {
1784 if (this->gapp == true) {
1785 } else if (this->genome == ' ') {
1786 } else if (this->cdna == ' ') {
1787 } else if (this->genome != this->cdna) {
1788 return changepos;
1789 } else {
1790 changepos++;
1791 }
1792 }
1793 } else {
1794 for (this = start; this <= end; this++) {
1795 if (this->gapp == true) {
1796 } else if (this->genome == ' ') {
1797 } else if (this->cdna == ' ') {
1798 } else if (this->genome != this->cdna) {
1799 return changepos;
1800 } else {
1801 changepos++;
1802 }
1803 }
1804 }
1805
1806 return changepos;
1807 }
1808
1809
#if 0
/* Disabled.  Walks two pair lists in parallel and returns true only when
   both have the same length and every corresponding pair agrees in gapp,
   querypos, genomepos and comp. */
bool
Pair_identical_p (List_T pairs1, List_T pairs2) {
  List_T p, q;
  T pair1, pair2;

  for (p = pairs1, q = pairs2; p && q; p = List_next(p), q = List_next(q)) {
    pair1 = (T) List_head(p);
    pair2 = (T) List_head(q);
    if (pair1->gapp != pair2->gapp ||
	pair1->querypos != pair2->querypos ||
	pair1->genomepos != pair2->genomepos ||
	pair1->comp != pair2->comp) {
      return false;
    }
  }

  /* Identical only if both lists ended together */
  if (p || q) {
    return false;
  }
  return true;
}
#endif
1841
1842
1843 void
Pair_check_list_pairs(List_T pairs)1844 Pair_check_list_pairs (List_T pairs) {
1845 T this;
1846 List_T p;
1847 int prev_querypos;
1848
1849 if (pairs == NULL) {
1850 return;
1851 } else {
1852 this = List_head(pairs);
1853 prev_querypos = this->querypos;
1854 /* prev_genomepos = this->genomepos; */
1855
1856 for (p = List_next(pairs); p != NULL; p = List_next(p)) {
1857 this = List_head(p);
1858 if (this->gapp == false) {
1859 if (this->querypos < prev_querypos) {
1860 printf("Problem at querypos %d < prev querypos %d\n",this->querypos,prev_querypos);
1861 abort();
1862 }
1863 #if 0
1864 /* No longer a valid check after genomepos converted to chrpos */
1865 if (this->genomepos < prev_genomepos) {
1866 printf("Problem at genomepos %d\n",this->genomepos);
1867 }
1868 #endif
1869 prev_querypos = this->querypos;
1870 /* prev_genomepos = this->genomepos; */
1871 }
1872 }
1873 }
1874 return;
1875 }
1876
1877 void
Pair_check_list_path(List_T path)1878 Pair_check_list_path (List_T path) {
1879 T this;
1880 List_T p;
1881 int prev_querypos;
1882
1883 if (path == NULL) {
1884 return;
1885 } else {
1886 this = List_head(path);
1887 prev_querypos = this->querypos;
1888 /* prev_genomepos = this->genomepos; */
1889
1890 for (p = List_next(path); p != NULL; p = List_next(p)) {
1891 this = List_head(p);
1892 if (this->gapp == false) {
1893 if (this->querypos > prev_querypos) {
1894 printf("Problem at querypos %d > prev querypos %d\n",this->querypos,prev_querypos);
1895 abort();
1896 }
1897 #if 0
1898 /* No longer a valid check after genomepos converted to chrpos */
1899 if (this->genomepos > prev_genomepos) {
1900 printf("Problem at genomepos %d\n",this->genomepos);
1901 }
1902 #endif
1903 prev_querypos = this->querypos;
1904 /* prev_genomepos = this->genomepos; */
1905 }
1906 }
1907 }
1908 return;
1909 }
1910
1911
1912 bool
Pair_check_array_pairs(struct T * pairs,int npairs)1913 Pair_check_array_pairs (struct T *pairs, int npairs) {
1914 bool result = false;
1915 struct T *this;
1916 int prev_querypos;
1917 int i;
1918
1919 if (npairs == 0) {
1920 return false;
1921 } else {
1922 this = pairs++;
1923 prev_querypos = this->querypos;
1924 /* prev_genomepos = this->genomepos; */
1925
1926 for (i = 1; i < npairs; i++) {
1927 this = pairs++;
1928 if (this->querypos < prev_querypos) {
1929 printf("Problem at querypos %d < prev querypos %d\n",this->querypos,prev_querypos);
1930 abort();
1931 result = true;
1932 } else if (this->querypos - prev_querypos > 1) {
1933 /* Could be the result of a dual break */
1934 fprintf(stderr,"Jump at querypos %d\n",this->querypos);
1935 result = false;
1936 }
1937 #if 0
1938 /* No longer a valid check after genomepos converted to chrpos */
1939 if (this->genomepos < prev_genomepos) {
1940 fprintf(stderr,"Problem at genomepos %d\n",this->genomepos);
1941 result = true;
1942 }
1943 #endif
1944 prev_querypos = this->querypos;
1945 /* prev_genomepos = this->genomepos; */
1946 }
1947 }
1948 return result;
1949 }
1950
1951
1952 bool
Pair_check_array_path(struct T * path,int npairs)1953 Pair_check_array_path (struct T *path, int npairs) {
1954 bool result = false;
1955 struct T *this;
1956 int prev_querypos;
1957 int i;
1958
1959 if (npairs == 0) {
1960 return false;
1961 } else {
1962 this = path++;
1963 prev_querypos = this->querypos;
1964 /* prev_genomepos = this->genomepos; */
1965
1966 for (i = 1; i < npairs; i++) {
1967 this = path++;
1968 if (this->querypos > prev_querypos) {
1969 printf("Problem at querypos %d > prev querypos %d\n",this->querypos,prev_querypos);
1970 abort();
1971 result = true;
1972 } else if (this->querypos - prev_querypos > 1) {
1973 /* Could be the result of a dual break */
1974 fprintf(stderr,"Jump at querypos %d\n",this->querypos);
1975 result = false;
1976 }
1977 #if 0
1978 /* No longer a valid check after genomepos converted to chrpos */
1979 if (this->genomepos < prev_genomepos) {
1980 fprintf(stderr,"Problem at genomepos %d\n",this->genomepos);
1981 result = true;
1982 }
1983 #endif
1984 prev_querypos = this->querypos;
1985 /* prev_genomepos = this->genomepos; */
1986 }
1987 }
1988 return result;
1989 }
1990
1991
#if 0
/* Disabled.  Pushes every non-gap element of pairarray onto pairs through
   Pairpool_push and returns the resulting list.  When plusp is not true,
   the genomic position pushed is chrlength minus the stored genomepos. */
List_T
Pair_convert_array_to_pairs (List_T pairs, struct T *pairarray, int npairs, bool plusp,
			     Chrpos_T chrlength, Pairpool_T pairpool) {
  T pair;
  Chrpos_T genomepos;
  int i;

  for (i = 0; i < npairs; i++) {
    pair = &(pairarray[i]);
    if (pair->gapp) {
      /* Skip */
      continue;
    }

    if (plusp == true) {
      genomepos = pair->genomepos;
    } else {
      genomepos = chrlength - pair->genomepos;
    }
    pairs = Pairpool_push(pairs,pairpool,pair->querypos /*+ queryseq_offset*/,genomepos,
			  pair->cdna,pair->comp,pair->genome,pair->genomealt,/*dynprogindex*/0);
  }

  return pairs;
}
#endif
2027
2028
#if 0
/* Disabled.  Called by output thread for --merge-overlap feature.  Modeled
   after Substring_convert_to_pairs.  Pushes onto pairs a new out-pair for
   each array element whose querypos lies inside the unclipped query
   interval; the clip applied at each end depends on plusp. */
List_T
Pair_convert_array_to_pairs_out (List_T pairs, struct T *pairarray, int npairs, bool plusp, int querylength,
				 int hardclip_low, int hardclip_high, int queryseq_offset) {
  T pair;
  int querystart, queryend, i;

  querystart = (plusp == true) ? hardclip_low : hardclip_high;
  queryend = querylength - ((plusp == true) ? hardclip_high : hardclip_low);

  for (i = 0; i < npairs; i++) {
    pair = &(pairarray[i]);
    if (pair->querypos < querystart || pair->querypos >= queryend) {
      /* Outside the unclipped region */
      continue;
    }
    pairs = List_push_out(pairs,(void *) Pair_new_out(pair->querypos + queryseq_offset,/*genomepos*/pair->genomepos,
						      pair->cdna,pair->comp,pair->genome));
  }

  return pairs;
}
#endif
2057
2058
2059
#if 0
/* Disabled.  Writes into complement the characters of sequence in reverse
   order, each mapped through complCode, followed by a terminating '\0'.
   The caller supplies a buffer of at least length+1 characters. */
static void
make_complement_buffered (char *complement, char *sequence, unsigned int length) {
  int src, dst;

  /* complement = (char *) CALLOC(length+1,sizeof(char)); */
  dst = 0;
  for (src = length-1; src >= 0; src--) {
    complement[(int) dst] = complCode[(int) sequence[src]];
    dst++;
  }
  complement[length] = '\0';
  return;
}
#endif
2073
2074 static void
make_complement_inplace(char * sequence,unsigned int length)2075 make_complement_inplace (char *sequence, unsigned int length) {
2076 char temp;
2077 unsigned int i, j;
2078
2079 for (i = 0, j = length-1; i < length/2; i++, j--) {
2080 temp = complCode[(int) sequence[i]];
2081 sequence[i] = complCode[(int) sequence[j]];
2082 sequence[j] = temp;
2083 }
2084 if (i == j) {
2085 sequence[i] = complCode[(int) sequence[i]];
2086 }
2087
2088 return;
2089 }
2090
2091
2092 static double
donor_score(Univcoord_T genomicpos,Univcoord_T chroffset,bool revcomp,Genome_T genome,Univ_IIT_T chromosome_iit)2093 donor_score (Univcoord_T genomicpos, Univcoord_T chroffset, bool revcomp, Genome_T genome,
2094 Univ_IIT_T chromosome_iit) {
2095 Univcoord_T left;
2096 Chrnum_T chrnum;
2097 int nunknowns;
2098 char gbuffer[MAXENT_MAXLENGTH];
2099 Genomecomp_T *genome_blocks;
2100
2101 if (revcomp == false) {
2102 if ((genome_blocks = Genome_blocks(genome)) != NULL) {
2103 /* Add 1 to get from exon end to intron start */
2104 return Maxent_hr_donor_prob(genomicpos + 1,chroffset);
2105 } else {
2106 left = genomicpos + 1 - DONOR_MODEL_LEFT_MARGIN; /* Add 1 to get from exon end to intron start */
2107 Genome_fill_buffer(&chrnum,&nunknowns,genome,left,DONOR_MODEL_LEFT_MARGIN+DONOR_MODEL_RIGHT_MARGIN+1,gbuffer,chromosome_iit);
2108 #if 0
2109 printf("\n");
2110 printf("%s donor truestrand:+ left:%u\n",gbuffer,left);
2111 printf("%*s^^\n",DONOR_MODEL_LEFT_MARGIN,"");
2112 #endif
2113 return Maxent_donor_prob(gbuffer);
2114 }
2115
2116 } else {
2117 if ((genome_blocks = Genome_blocks(genome)) != NULL) {
2118 return Maxent_hr_antidonor_prob(genomicpos,chroffset);
2119 } else {
2120 left = genomicpos - DONOR_MODEL_RIGHT_MARGIN - 1;
2121 Genome_fill_buffer(&chrnum,&nunknowns,genome,left,DONOR_MODEL_LEFT_MARGIN+DONOR_MODEL_RIGHT_MARGIN+1,gbuffer,chromosome_iit);
2122 make_complement_inplace(gbuffer,DONOR_MODEL_LEFT_MARGIN+DONOR_MODEL_RIGHT_MARGIN+1);
2123 #if 0
2124 printf("\n");
2125 printf("%s donor truestrand:- left:%u\n",gbuffer,left);
2126 printf("%*s^^\n",DONOR_MODEL_LEFT_MARGIN,"");
2127 #endif
2128 return Maxent_donor_prob(gbuffer);
2129 }
2130 }
2131 }
2132
2133
2134 static double
acceptor_score(Univcoord_T genomicpos,Univcoord_T chroffset,bool revcomp,Genome_T genome,Univ_IIT_T chromosome_iit)2135 acceptor_score (Univcoord_T genomicpos, Univcoord_T chroffset, bool revcomp, Genome_T genome,
2136 Univ_IIT_T chromosome_iit) {
2137 Univcoord_T left;
2138 Chrnum_T chrnum;
2139 int nunknowns;
2140 char gbuffer[MAXENT_MAXLENGTH];
2141 Genomecomp_T *genome_blocks;
2142
2143 if (revcomp == false) {
2144 /* sense on plus strand, or antisense on minus strand */
2145 if ((genome_blocks = Genome_blocks(genome)) != NULL) {
2146 return Maxent_hr_acceptor_prob(genomicpos,chroffset);
2147 } else {
2148 left = genomicpos - ACCEPTOR_MODEL_LEFT_MARGIN;
2149 Genome_fill_buffer(&chrnum,&nunknowns,genome,left,ACCEPTOR_MODEL_LEFT_MARGIN+ACCEPTOR_MODEL_RIGHT_MARGIN+1,gbuffer,chromosome_iit);
2150 #if 0
2151 printf("\n");
2152 printf("%s acceptor truestrand:+ left:%u\n",gbuffer,left);
2153 printf("%*s^^\n",ACCEPTOR_MODEL_LEFT_MARGIN-2,"");
2154 #endif
2155 return Maxent_acceptor_prob(gbuffer);
2156 }
2157
2158 } else {
2159 if ((genome_blocks = Genome_blocks(genome)) != NULL) {
2160 /* Add 1 to get from exon end to intron start */
2161 return Maxent_hr_antiacceptor_prob(genomicpos + 1,chroffset);
2162 } else {
2163 left = genomicpos - ACCEPTOR_MODEL_RIGHT_MARGIN;
2164 Genome_fill_buffer(&chrnum,&nunknowns,genome,left,ACCEPTOR_MODEL_LEFT_MARGIN+ACCEPTOR_MODEL_RIGHT_MARGIN+1,gbuffer,chromosome_iit);
2165 make_complement_inplace(gbuffer,ACCEPTOR_MODEL_LEFT_MARGIN+ACCEPTOR_MODEL_RIGHT_MARGIN+1);
2166 #if 0
2167 printf("\n");
2168 printf("%s acceptor truestrand:- left:%u\n",gbuffer,left);
2169 printf("%*s^^\n",ACCEPTOR_MODEL_LEFT_MARGIN-2,"");
2170 #endif
2171 return Maxent_acceptor_prob(gbuffer);
2172 }
2173 }
2174 }
2175
2176
2177
/* Returns false when c is one of A, C, G, T, U in either case, and true
   for every other character, including '\0'. */
static bool
unknown_base (char c) {
  static const char known[] = "ACGTUacgtu";
  const char *p;

  /* The scan stops at the terminator, so '\0' is never treated as known */
  for (p = known; *p != '\0'; p++) {
    if (*p == c) {
      return false;
    }
  }
  return true;
}
2186
2187 void
Pair_print_exonsummary(Filestring_T fp,struct T * pairs,int npairs,Chrnum_T chrnum,Univcoord_T chroffset,Genome_T genome,Univ_IIT_T chromosome_iit,bool watsonp,int cdna_direction,bool genomefirstp,int invertmode)2188 Pair_print_exonsummary (Filestring_T fp, struct T *pairs, int npairs, Chrnum_T chrnum,
2189 Univcoord_T chroffset, Genome_T genome, Univ_IIT_T chromosome_iit,
2190 bool watsonp, int cdna_direction, bool genomefirstp, int invertmode) {
2191 bool in_exon = false;
2192 struct T *save = NULL, *ptr, *this = NULL;
2193 int exon_querystart = -1, exon_queryend;
2194 Chrpos_T exon_genomestart = 0, exon_genomeend, intron_start, intron_end;
2195 int num = 0, den = 0, i;
2196 char *chrstring = NULL;
2197 int last_querypos = -1;
2198 Chrpos_T last_genomepos = (Chrpos_T) -1;
2199
2200
2201 if (watsonp == true) {
2202 ptr = pairs;
2203 } else if (invertmode == 0) {
2204 ptr = pairs;
2205 } else if (invertmode == 1) {
2206 save = ptr = invert_path(pairs,npairs);
2207 } else if (invertmode == 2) {
2208 save = ptr = invert_and_revcomp_path(pairs,npairs);
2209 } else {
2210 fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
2211 exit(9);
2212 }
2213
2214 if (chrnum != 0) {
2215 if (invertmode == 2) {
2216 chrstring = Chrnum_to_string_signed(chrnum,chromosome_iit,/*watsonp*/true);
2217 } else {
2218 chrstring = Chrnum_to_string_signed(chrnum,chromosome_iit,watsonp);
2219 }
2220 }
2221
2222 debug(Pair_dump_array(pairs,npairs,/*zerobasedp*/true));
2223
2224 for (i = 0; i < npairs; i++) {
2225 /* prev = this; */
2226 this = ptr++;
2227
2228 if (this->gapp) {
2229 if (in_exon == true) {
2230 exon_queryend = last_querypos + ONEBASEDP;
2231 exon_genomeend = last_genomepos + ONEBASEDP;
2232 if (watsonp) {
2233 intron_start = exon_genomeend + 1;
2234 } else {
2235 intron_start = exon_genomeend - 1;
2236 }
2237 if (genomefirstp == true) {
2238 FPRINTF(fp," ");
2239 if (chrnum == 0) {
2240 FPRINTF(fp,"%u-%u",chroffset+exon_genomestart,chroffset+exon_genomeend);
2241 } else {
2242 FPRINTF(fp,"%s:%d-%d",chrstring,exon_genomestart,exon_genomeend);
2243 }
2244 FPRINTF(fp," (%d-%d)",exon_querystart,exon_queryend);
2245 } else {
2246 FPRINTF(fp," %d-%d",exon_querystart,exon_queryend);
2247 FPRINTF(fp," ");
2248 if (chrnum == 0) {
2249 FPRINTF(fp,"(%u-%u)",chroffset+exon_genomestart,chroffset+exon_genomeend);
2250 } else {
2251 FPRINTF(fp,"(%s:%d-%d)",chrstring,exon_genomestart,exon_genomeend);
2252 }
2253 }
2254 if (den == 0) {
2255 FPRINTF(fp," %d%%",100);
2256 } else {
2257 FPRINTF(fp," %d%%",(int) floor(100.0*(double) num/(double) den));
2258 }
2259 if (this->comp == FWD_CANONICAL_INTRON_COMP) {
2260 FPRINTF(fp," ->");
2261 /* sensep = true; */
2262 } else if (this->comp == REV_CANONICAL_INTRON_COMP) {
2263 FPRINTF(fp," <-");
2264 /* sensep = false; */
2265 } else if (this->comp == FWD_GCAG_INTRON_COMP) {
2266 FPRINTF(fp," -)");
2267 /* sensep = true; */
2268 } else if (this->comp == REV_GCAG_INTRON_COMP) {
2269 FPRINTF(fp," (-");
2270 /* sensep = false; */
2271 } else if (this->comp == FWD_ATAC_INTRON_COMP) {
2272 FPRINTF(fp," -]");
2273 /* sensep = true; */
2274 } else if (this->comp == REV_ATAC_INTRON_COMP) {
2275 FPRINTF(fp," [-");
2276 /* sensep = false; */
2277 } else if (this->comp == NONINTRON_COMP) {
2278 FPRINTF(fp," ==");
2279 /* sensep = true; */
2280 } else {
2281 FPRINTF(fp," ##");
2282 /* sensep = true; */
2283 }
2284 in_exon = false;
2285 }
2286 } else if (this->comp == INTRONGAP_COMP) {
2287 /* Do nothing */
2288 } else {
2289 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
2290 SHORTGAP_COMP, or MISMATCH_COMP */
2291 if (in_exon == false) {
2292 exon_querystart = this->querypos + ONEBASEDP;
2293 exon_genomestart = this->genomepos + ONEBASEDP;
2294 if (watsonp) {
2295 intron_end = exon_genomestart - 1;
2296 } else {
2297 intron_end = exon_genomestart + 1;
2298 }
2299 if (i > 0) {
2300 if (intron_end > intron_start) {
2301 FPRINTF(fp," ...%d...",intron_end - intron_start + 1);
2302 } else {
2303 FPRINTF(fp," ...%d...",intron_start - intron_end + 1);
2304 }
2305
2306 if (exon_querystart > exon_queryend + 1) {
2307 FPRINTF(fp," ***query_skip:%d***",exon_querystart-(exon_queryend+1));
2308 }
2309
2310 if (genome != NULL) {
2311 if (cdna_direction > 0) {
2312 FPRINTF(fp," %.3f, %.3f",
2313 donor_score(chroffset+exon_genomeend-1,chroffset,!watsonp,genome,chromosome_iit),
2314 acceptor_score(chroffset+exon_genomestart-1,chroffset,!watsonp,genome,chromosome_iit));
2315 } else if (cdna_direction < 0) {
2316 FPRINTF(fp," %.3f, %.3f",
2317 acceptor_score(chroffset+exon_genomeend-1,chroffset,watsonp,genome,chromosome_iit),
2318 donor_score(chroffset+exon_genomestart-1,chroffset,watsonp,genome,chromosome_iit));
2319 }
2320 }
2321
2322 PUTC('\n',fp);
2323 }
2324 num = den = 0;
2325 in_exon = true;
2326 }
2327 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
2328 /* Previously not counted in numerator or denominator */
2329 den++;
2330 #ifndef PMAP
2331 } else if (unknown_base(this->cdna) || unknown_base(this->genome)) {
2332 /* Comp must be a space */
2333 /* Don't count in numerator or denominator */
2334 #endif
2335 } else {
2336 den++;
2337 if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
2338 num++;
2339 } else if (this->comp == AMBIGUOUS_COMP) {
2340 #ifdef PMAP
2341 num++;
2342 #else
2343 den--;
2344 #endif
2345 }
2346 }
2347 }
2348
2349 if (this->cdna != ' ') {
2350 last_querypos = this->querypos;
2351 }
2352 if (this->genome != ' ') {
2353 last_genomepos = this->genomepos;
2354 }
2355 }
2356
2357 /* prev = this; */
2358 exon_queryend = last_querypos + ONEBASEDP;
2359 exon_genomeend = last_genomepos + ONEBASEDP;
2360 if (genomefirstp == true) {
2361 FPRINTF(fp," ");
2362 if (chrnum == 0) {
2363 FPRINTF(fp,"%u-%u",chroffset+exon_genomestart,chroffset+exon_genomeend);
2364 } else {
2365 FPRINTF(fp,"%s:%d-%d",chrstring,exon_genomestart,exon_genomeend);
2366 }
2367 FPRINTF(fp," (%d-%d)",exon_querystart,exon_queryend);
2368 } else {
2369 FPRINTF(fp," %d-%d",exon_querystart,exon_queryend);
2370 FPRINTF(fp," ");
2371 if (chrnum == 0) {
2372 FPRINTF(fp,"(%u-%u)",chroffset+exon_genomestart,chroffset+exon_genomeend);
2373 } else {
2374 FPRINTF(fp,"(%s:%d-%d)",chrstring,exon_genomestart,exon_genomeend);
2375 }
2376 }
2377 if (den == 0) {
2378 FPRINTF(fp," %d%%",100);
2379 } else {
2380 FPRINTF(fp," %d%%",(int) floor(100.0*(double) num/(double) den));
2381 }
2382 FPRINTF(fp,"\n\n");
2383
2384 if (chrstring != NULL) {
2385 FREE(chrstring);
2386 }
2387 if (save != NULL) {
2388 FREE(save);
2389 }
2390
2391 return;
2392 }
2393
2394 void
Pair_tokens_free(List_T * tokens)2395 Pair_tokens_free (List_T *tokens) {
2396 List_T p;
2397 char *token;
2398
2399 for (p = *tokens; p != NULL; p = List_next(p)) {
2400 token = (char *) List_head(p);
2401 FREE_OUT(token);
2402 }
2403 List_free_out(&(*tokens));
2404
2405 return;
2406 }
2407
2408
2409 List_T
Pair_tokens_copy(List_T old)2410 Pair_tokens_copy (List_T old) {
2411 List_T new = NULL;
2412 char *new_token, *old_token;
2413
2414 while (old != NULL) {
2415 old_token = (char *) List_head(old);
2416 new_token = (char *) MALLOC_OUT((strlen(old_token)+1) * sizeof(char));
2417 strcpy(new_token,old_token);
2418 new = List_push_out(new,(void *) new_token);
2419 old = List_next(old);
2420 }
2421
2422 return List_reverse(new);
2423 }
2424
2425
2426
#if 0
/* Tokens used by compressed and gff3 formats */
/* Used by Pair_print_compressed_old */
/* Disabled code.  Prints the tokens to fp with run-length compression:
   the first token is preceded by a tab, later distinct tokens by a
   space, and a run of N > 1 identical consecutive tokens is followed by
   "!N".  The token strings are freed before returning. */
static void
print_tokens_compressed (Filestring_T fp, List_T tokens) {
  List_T p;
  int tokencount = 1;
  char *token, *lasttoken = NULL;

  for (p = tokens; p != NULL; p = List_next(p)) {
    token = (char *) List_head(p);
    if (lasttoken == NULL) {
      /* First token */
      FPRINTF(fp,"\t%s",token);
      lasttoken = token;
    } else if (!strcmp(token,lasttoken)) {
      /* Same as previous token, so extend the current run */
      tokencount++;
    } else {
      /* New token, so close out the previous run */
      if (tokencount > 1) {
	FPRINTF(fp,"!%d",tokencount);
      }
      FPRINTF(fp," %s",token);
      lasttoken = token;
      tokencount = 1;
    }
  }
  /* Close out the final run */
  if (tokencount > 1) {
    FPRINTF(fp,"!%d",tokencount);
  }

  /* Free the token strings */
  for (p = tokens; p != NULL; p = List_next(p)) {
    token = (char *) List_head(p);
    FREE_OUT(token);
  }

  return;
}
#endif
2464
2465
2466 static void
print_tokens_gff3(Filestring_T fp,List_T tokens)2467 print_tokens_gff3 (Filestring_T fp, List_T tokens) {
2468 List_T p;
2469 char *token;
2470
2471 if (tokens != NULL) {
2472 p = tokens;
2473 token = (char *) List_head(p);
2474 FPRINTF(fp,"%s",token);
2475
2476 for (p = List_next(p); p != NULL; p = List_next(p)) {
2477 token = (char *) List_head(p);
2478 FPRINTF(fp," %s",token);
2479 }
2480 }
2481
2482 for (p = tokens; p != NULL; p = List_next(p)) {
2483 token = (char *) List_head(p);
2484 FREE_OUT(token);
2485 }
2486
2487 return;
2488 }
2489
2490 static List_T
push_token(List_T tokens,char * token)2491 push_token (List_T tokens, char *token) {
2492 char *copy;
2493
2494 copy = (char *) MALLOC_OUT((strlen(token)+1) * sizeof(char));
2495 strcpy(copy,token);
2496 return List_push_out(tokens,(void *) copy);
2497 }
2498
2499
2500 /* Definition of GFF3 format is at http://song.sourceforge.net/gff3.shtml */
2501
2502 static void
print_gff3_gene(Filestring_T fp,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T start_genomepos,Chrpos_T end_genomepos,bool watsonp,int cdna_direction)2503 print_gff3_gene (Filestring_T fp, int pathnum, char *sourcename, char *accession, char *fasta_annotation,
2504 char *chrstring, Chrpos_T start_genomepos, Chrpos_T end_genomepos,
2505 bool watsonp, int cdna_direction) {
2506
2507 /* 1: seqid */
2508 if (chrstring == NULL) {
2509 FPRINTF(fp,"%s\t","NA");
2510 } else {
2511 FPRINTF(fp,"%s\t",chrstring);
2512 }
2513 FPRINTF(fp,"%s\t",sourcename); /* 2: source */
2514 FPRINTF(fp,"gene\t"); /* 3: type */
2515
2516 if (start_genomepos < end_genomepos) {
2517 FPRINTF(fp,"%u\t%u\t",start_genomepos,end_genomepos); /* 4,5: start, end */
2518 } else {
2519 FPRINTF(fp,"%u\t%u\t",end_genomepos,start_genomepos); /* 4,5: start, end */
2520 }
2521
2522 FPRINTF(fp,".\t"); /* 6: score */
2523
2524 if (watsonp == true) {
2525 if (cdna_direction >= 0) {
2526 FPRINTF(fp,"+\t");
2527 } else {
2528 FPRINTF(fp,"-\t");
2529 }
2530 } else {
2531 if (cdna_direction >= 0) {
2532 FPRINTF(fp,"-\t"); /* 7: strand */
2533 } else {
2534 FPRINTF(fp,"+\t");
2535 }
2536 }
2537
2538 FPRINTF(fp,".\t"); /* 8: phase */
2539
2540 /* 9: features */
2541 if (accession == NULL) {
2542 FPRINTF(fp,"ID=%s.path%d;Name=%s","NA",pathnum,"NA");
2543 } else {
2544 FPRINTF(fp,"ID=%s.path%d;Name=%s",accession,pathnum,accession);
2545 }
2546
2547 if (fasta_annotation != NULL) {
2548 FPRINTF(fp,";%s",fasta_annotation);
2549 }
2550
2551 if (cdna_direction > 0) {
2552 FPRINTF(fp,";Dir=sense");
2553 } else if (cdna_direction < 0) {
2554 FPRINTF(fp,";Dir=antisense");
2555 } else {
2556 FPRINTF(fp,";Dir=indeterminate");
2557 }
2558
2559 PUTC('\n',fp);
2560
2561 return;
2562 }
2563
2564 static void
print_gff3_mrna(Filestring_T fp,int pathnum,T start,T end,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T start_genomepos,Chrpos_T end_genomepos,int querylength_given,int skiplength,int matches,int mismatches,int qindels,int tindels,int unknowns,bool watsonp,int cdna_direction)2565 print_gff3_mrna (Filestring_T fp, int pathnum, T start, T end,
2566 char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2567 Chrpos_T start_genomepos, Chrpos_T end_genomepos,
2568 int querylength_given, int skiplength, int matches, int mismatches,
2569 int qindels, int tindels, int unknowns, bool watsonp, int cdna_direction) {
2570 int den;
2571 int querypos1, querypos2;
2572 double coverage, fracidentity;
2573
2574 /* 1: seqid */
2575 if (chrstring == NULL) {
2576 FPRINTF(fp,"%s\t","NA");
2577 } else {
2578 FPRINTF(fp,"%s\t",chrstring);
2579 }
2580 FPRINTF(fp,"%s\t",sourcename); /* 2: source */
2581 FPRINTF(fp,"mRNA\t"); /* 3: type */
2582 if (start_genomepos < end_genomepos) {
2583 FPRINTF(fp,"%u\t%u\t",start_genomepos,end_genomepos); /* 4,5: start, end */
2584 } else {
2585 FPRINTF(fp,"%u\t%u\t",end_genomepos,start_genomepos); /* 4,5: start, end */
2586 }
2587
2588 FPRINTF(fp,".\t"); /* 6: score */
2589
2590 if (watsonp == true) {
2591 if (cdna_direction >= 0) {
2592 FPRINTF(fp,"+\t");
2593 } else {
2594 FPRINTF(fp,"-\t");
2595 }
2596 } else {
2597 if (cdna_direction >= 0) {
2598 FPRINTF(fp,"-\t"); /* 7: strand */
2599 } else {
2600 FPRINTF(fp,"+\t");
2601 }
2602 }
2603
2604 FPRINTF(fp,".\t"); /* 8: phase */
2605
2606 /* 9: features */
2607 if (accession == NULL) {
2608 FPRINTF(fp,"ID=%s.mrna%d;Name=%s;Parent=%s.path%d",
2609 "NA",pathnum,"NA","NA",pathnum);
2610 } else {
2611 FPRINTF(fp,"ID=%s.mrna%d;Name=%s;Parent=%s.path%d",
2612 accession,pathnum,accession,accession,pathnum);
2613 }
2614
2615 if (fasta_annotation != NULL) {
2616 FPRINTF(fp,";%s",fasta_annotation);
2617 }
2618
2619 if (cdna_direction > 0) {
2620 FPRINTF(fp,";Dir=sense");
2621 } else if (cdna_direction < 0) {
2622 FPRINTF(fp,";Dir=antisense");
2623 } else {
2624 FPRINTF(fp,";Dir=indeterminate");
2625 }
2626
2627 querypos1 = start->querypos;
2628 querypos2 = end->querypos;
2629
2630 #ifdef PMAP
2631 coverage = (double) (querypos2 - querypos1 + 1)/(double) (3*(querylength_given + skiplength));
2632 /* Can have coverage greater than given querylength because of added '*' at end */
2633 if (coverage > 1.0) {
2634 coverage = 1.0;
2635 }
2636 #else
2637 coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given + skiplength);
2638 #endif
2639 FPRINTF(fp,";coverage=%.1f",((double) rint(1000.0*coverage))/10.0);
2640
2641 if ((den = matches + mismatches + qindels + tindels) == 0) {
2642 fracidentity = 1.0;
2643 } else {
2644 fracidentity = (double) matches/(double) den;
2645 }
2646 FPRINTF(fp,";identity=%.1f",((double) rint(1000.0*fracidentity))/10.0);
2647 FPRINTF(fp,";matches=%d;mismatches=%d;indels=%d;unknowns=%d",
2648 matches,mismatches,qindels+tindels,unknowns);
2649
2650 PUTC('\n',fp);
2651
2652 return;
2653 }
2654
2655
2656 static void
print_gff3_exon(Filestring_T fp,int exonno,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T exon_genomestart,Chrpos_T exon_genomeend,int exon_querystart,int exon_queryend,bool watsonp,int cdna_direction,int pctidentity)2657 print_gff3_exon (Filestring_T fp, int exonno, int pathnum, char *sourcename,
2658 char *accession, char *fasta_annotation, char *chrstring,
2659 Chrpos_T exon_genomestart, Chrpos_T exon_genomeend,
2660 int exon_querystart, int exon_queryend, bool watsonp, int cdna_direction,
2661 int pctidentity) {
2662
2663 if (exon_genomestart == exon_genomeend) {
2664 /* Due to a query skip, so don't print */
2665
2666 } else {
2667 /* 1: seqid */
2668 if (chrstring == NULL) {
2669 FPRINTF(fp,"%s\t","NA");
2670 } else {
2671 FPRINTF(fp,"%s\t",chrstring);
2672 }
2673 FPRINTF(fp,"%s\t",sourcename); /* 2: source */
2674 FPRINTF(fp,"exon\t"); /* 3: type */
2675 if (exon_genomestart < exon_genomeend) {
2676 FPRINTF(fp,"%u\t%u\t",exon_genomestart,exon_genomeend); /* 4,5: start, end */
2677 } else {
2678 FPRINTF(fp,"%u\t%u\t",exon_genomeend,exon_genomestart); /* 4,5: start, end */
2679 }
2680 FPRINTF(fp,"%d\t",pctidentity); /* 6: score */
2681
2682 if (watsonp == true) {
2683 if (cdna_direction >= 0) {
2684 FPRINTF(fp,"+\t");
2685 } else {
2686 FPRINTF(fp,"-\t");
2687 }
2688 } else {
2689 if (cdna_direction >= 0) {
2690 FPRINTF(fp,"-\t"); /* 7: strand */
2691 } else {
2692 FPRINTF(fp,"+\t");
2693 }
2694 }
2695
2696 FPRINTF(fp,".\t"); /* 8: phase */
2697
2698 /* 9: features */
2699 if (accession == NULL) {
2700 accession = "NA";
2701 }
2702 FPRINTF(fp,"ID=%s.mrna%d.exon%d;",accession,pathnum,exonno);
2703 FPRINTF(fp,"Name=%s;",accession);
2704 FPRINTF(fp,"Parent=%s.mrna%d",accession,pathnum);
2705
2706 if (fasta_annotation != NULL) {
2707 FPRINTF(fp,";%s",fasta_annotation);
2708 }
2709
2710 if (cdna_direction > 0) {
2711 FPRINTF(fp,";Target=%s %d %d +\n",accession,exon_querystart,exon_queryend);
2712 } else if (cdna_direction < 0) {
2713 FPRINTF(fp,";Target=%s %d %d -\n",accession,exon_queryend,exon_querystart);
2714 } else {
2715 FPRINTF(fp,";Target=%s %d %d .\n",accession,exon_queryend,exon_querystart);
2716 }
2717 }
2718
2719 return;
2720 }
2721
2722 static void
print_gff3_cds(Filestring_T fp,int cdsno,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T cds_genomestart,Chrpos_T cds_genomeend,int cds_querystart,int cds_queryend,bool watsonp,int cdna_direction,int pctidentity,int cds_phase)2723 print_gff3_cds (Filestring_T fp, int cdsno, int pathnum,
2724 char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2725 Chrpos_T cds_genomestart, Chrpos_T cds_genomeend,
2726 int cds_querystart, int cds_queryend, bool watsonp, int cdna_direction,
2727 int pctidentity, int cds_phase) {
2728
2729 assert(cds_phase >= 0);
2730
2731 if (cds_genomestart == cds_genomeend) {
2732 /* Due to a query skip, so don't print */
2733
2734 } else {
2735 /* 1: seqid */
2736 if (chrstring == NULL) {
2737 FPRINTF(fp,"%s\t","NA");
2738 } else {
2739 FPRINTF(fp,"%s\t",chrstring);
2740 }
2741 FPRINTF(fp,"%s\t",sourcename); /* 2: source */
2742 FPRINTF(fp,"CDS\t"); /* 3: type */
2743 if (cds_genomestart < cds_genomeend) {
2744 FPRINTF(fp,"%u\t%u\t",cds_genomestart,cds_genomeend); /* 4,5: start, end */
2745 } else {
2746 FPRINTF(fp,"%u\t%u\t",cds_genomeend,cds_genomestart); /* 4,5: start, end */
2747 }
2748 FPRINTF(fp,"%d\t",pctidentity); /* 6: score */
2749
2750 if (watsonp == true) {
2751 if (cdna_direction >= 0) {
2752 FPRINTF(fp,"+\t");
2753 } else {
2754 FPRINTF(fp,"-\t");
2755 }
2756 } else {
2757 if (cdna_direction >= 0) {
2758 FPRINTF(fp,"-\t"); /* 7: strand */
2759 } else {
2760 FPRINTF(fp,"+\t");
2761 }
2762 }
2763
2764 if (gff3_phase_swap_p == true && cds_phase > 0) {
2765 /* Some analysis programs want phase in gff3 to be different */
2766 FPRINTF(fp,"%d\t",3 - cds_phase); /* 8: phase */
2767 } else {
2768 /* This appears to be the specification: a phase of 0 indicates
2769 that the next codon begins at the first base of the region
2770 described by the current line, a phase of 1 indicates that the
2771 next codon begins at the second base of this region, and a
2772 phase of 2 indicates that the codon begins at the third base of
2773 this region. */
2774 FPRINTF(fp,"%d\t",cds_phase); /* 8: phase */
2775 }
2776
2777 /* 9: features */
2778 if (accession == NULL) {
2779 accession = "NA";
2780 }
2781 FPRINTF(fp,"ID=%s.mrna%d.cds%d;",accession,pathnum,cdsno);
2782 FPRINTF(fp,"Name=%s;",accession);
2783 FPRINTF(fp,"Parent=%s.mrna%d",accession,pathnum);
2784
2785 if (fasta_annotation != NULL) {
2786 FPRINTF(fp,";%s",fasta_annotation);
2787 }
2788
2789 if (cdna_direction > 0) {
2790 FPRINTF(fp,";Target=%s %d %d +\n",accession,cds_querystart,cds_queryend);
2791 } else if (cdna_direction > 0) {
2792 FPRINTF(fp,";Target=%s %d %d -\n",accession,cds_queryend,cds_querystart);
2793 } else {
2794 FPRINTF(fp,";Target=%s %d %d .\n",accession,cds_queryend,cds_querystart);
2795 }
2796 }
2797
2798 return;
2799 }
2800
2801
2802 static void
print_gff3_cdna_match(Filestring_T fp,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T exon_genomestart,Chrpos_T exon_genomeend,int exon_querystart,int exon_queryend,bool watsonp,int cdna_direction,int pctidentity,List_T tokens)2803 print_gff3_cdna_match (Filestring_T fp, int pathnum,
2804 char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2805 Chrpos_T exon_genomestart, Chrpos_T exon_genomeend,
2806 int exon_querystart, int exon_queryend, bool watsonp, int cdna_direction,
2807 int pctidentity, List_T tokens) {
2808
2809 if (exon_genomestart == exon_genomeend) {
2810 /* Due to a query skip, so don't print */
2811
2812 } else {
2813 /* 1: seqid */
2814 if (chrstring == NULL) {
2815 FPRINTF(fp,"%s\t","NA");
2816 } else {
2817 FPRINTF(fp,"%s\t",chrstring);
2818 }
2819 FPRINTF(fp,"%s\t",sourcename); /* 2: source */
2820 FPRINTF(fp,"cDNA_match\t"); /* 3: type */
2821 if (exon_genomestart < exon_genomeend) {
2822 FPRINTF(fp,"%u\t%u\t",exon_genomestart,exon_genomeend); /* 4,5: start, end */
2823 } else {
2824 FPRINTF(fp,"%u\t%u\t",exon_genomeend,exon_genomestart); /* 4,5: start, end */
2825 }
2826 FPRINTF(fp,"%d\t",pctidentity); /* 6: score */
2827
2828 /* 7: strand */
2829 if (watsonp == true) {
2830 FPRINTF(fp,"+\t");
2831 } else {
2832 FPRINTF(fp,"-\t");
2833 }
2834
2835 FPRINTF(fp,".\t"); /* 8: phase */
2836
2837 /* 9: features */
2838 if (accession == NULL) {
2839 accession = "NA";
2840 }
2841 FPRINTF(fp,"ID=%s.path%d;",accession,pathnum);
2842 FPRINTF(fp,"Name=%s",accession);
2843
2844 if (fasta_annotation != NULL) {
2845 FPRINTF(fp,";%s",fasta_annotation);
2846 }
2847
2848 if (cdna_direction > 0) {
2849 FPRINTF(fp,";Dir=sense");
2850 } else if (cdna_direction < 0) {
2851 FPRINTF(fp,";Dir=antisense");
2852 } else {
2853 FPRINTF(fp,";Dir=indeterminate");
2854 }
2855
2856 FPRINTF(fp,";Target=%s %d %d;Gap=",accession,exon_querystart,exon_queryend);
2857 print_tokens_gff3(fp,tokens);
2858 PUTC('\n',fp);
2859 }
2860
2861 return;
2862 }
2863
2864
/* Maps a strand value to its GFF3 character: 1 gives '+', -1 gives
   '-', and every other value (including 0, for unknown) gives '.'. */
static char
strand_char (int strand) {
  if (strand == 1) {
    return '+';
  } else if (strand == -1) {
    return '-';
  } else {
    /* Formerly '?' for 0 -- Now returning '.' for unknown strand */
    return '.';
  }
}
2874
2875
2876 static void
print_gff3_est_match(Filestring_T fp,int pathnum,T start,T end,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T exon_genomestart,Chrpos_T exon_genomeend,int exon_querystart,int exon_queryend,int querylength_given,int skiplength,int matches,int mismatches,int qindels,int tindels,int unknowns,bool watsonp,int cdna_direction,int pctidentity,List_T tokens)2877 print_gff3_est_match (Filestring_T fp, int pathnum, T start, T end,
2878 char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2879 Chrpos_T exon_genomestart, Chrpos_T exon_genomeend,
2880 int exon_querystart, int exon_queryend,
2881 int querylength_given, int skiplength, int matches, int mismatches, int qindels, int tindels,
2882 int unknowns, bool watsonp, int cdna_direction, int pctidentity, List_T tokens) {
2883 int feature_strand, target_strand;
2884 double coverage, fracidentity;
2885 int den;
2886 int querypos1, querypos2;
2887
2888 if (exon_genomestart == exon_genomeend) {
2889 /* Due to a query skip, so don't print */
2890
2891 } else {
2892 /* 1: seqid */
2893 if (chrstring == NULL) {
2894 FPRINTF(fp,"%s\t","NA");
2895 } else {
2896 FPRINTF(fp,"%s\t",chrstring);
2897 }
2898 FPRINTF(fp,"%s\t",sourcename); /* 2: source */
2899 FPRINTF(fp,"EST_match\t"); /* 3: type */
2900 if (exon_genomestart < exon_genomeend) {
2901 FPRINTF(fp,"%u\t%u\t",exon_genomestart,exon_genomeend); /* 4,5: start, end */
2902 } else {
2903 FPRINTF(fp,"%u\t%u\t",exon_genomeend,exon_genomestart); /* 4,5: start, end */
2904 }
2905 FPRINTF(fp,"%d\t",pctidentity); /* 6: score */
2906
2907 /* 7: strand */
2908 feature_strand = watsonp ? cdna_direction : -cdna_direction;
2909 FPRINTF(fp,"%c\t",strand_char(feature_strand));
2910
2911 FPRINTF(fp,".\t"); /* 8: phase */
2912
2913 /* 9: features */
2914 if (accession == NULL) {
2915 accession = "NA";
2916 }
2917 FPRINTF(fp,"ID=%s.path%d;",accession,pathnum);
2918 FPRINTF(fp,"Name=%s",accession);
2919
2920 if (fasta_annotation != NULL) {
2921 FPRINTF(fp,";%s",fasta_annotation);
2922 }
2923
2924 if (cdna_direction > 0) {
2925 FPRINTF(fp,";Dir=sense");
2926 } else if (cdna_direction < 0) {
2927 FPRINTF(fp,";Dir=antisense");
2928 } else {
2929 FPRINTF(fp,";Dir=indeterminate");
2930 }
2931
2932 target_strand = cdna_direction != 0 ? cdna_direction : (watsonp ? 1 : -1);
2933 FPRINTF(fp,";Target=%s %d %d %c;Gap=",accession,exon_querystart,exon_queryend,
2934 strand_char(target_strand));
2935 print_tokens_gff3(fp,tokens);
2936
2937 querypos1 = start->querypos;
2938 querypos2 = end->querypos;
2939
2940 #ifdef PMAP
2941 coverage = (double) (querypos2 - querypos1 + 1)/(double) (3*(querylength_given + skiplength));
2942 /* Can have coverage greater than given querylength because of added '*' at end */
2943 if (coverage > 1.0) {
2944 coverage = 1.0;
2945 }
2946 #else
2947 coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given + skiplength);
2948 #endif
2949 FPRINTF(fp,";coverage=%.1f",((double) rint(1000.0*coverage))/10.0);
2950
2951 if ((den = matches + mismatches + qindels + tindels) == 0) {
2952 fracidentity = 1.0;
2953 } else {
2954 fracidentity = (double) matches/(double) den;
2955 }
2956 FPRINTF(fp,";identity=%.1f",((double) rint(1000.0*fracidentity))/10.0);
2957 FPRINTF(fp,";matches=%d;mismatches=%d;indels=%d;unknowns=%d",
2958 matches,mismatches,qindels+tindels,unknowns);
2959
2960 PUTC('\n',fp);
2961 }
2962
2963 return;
2964 }
2965
2966
2967 static void
print_gff3_exons_forward(Filestring_T fp,struct T * pairs,int npairs,int pathnum,T start,T end,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,int querylength_given,int skiplength,int matches,int mismatches,int qindels,int tindels,int unknowns,bool watsonp,int cdna_direction,bool gff_introns_p,bool gff_gene_format_p,bool gff_estmatch_format_p,bool cds_p)2968 print_gff3_exons_forward (Filestring_T fp, struct T *pairs, int npairs, int pathnum, T start, T end,
2969 char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2970 int querylength_given, int skiplength, int matches, int mismatches,
2971 int qindels, int tindels, int unknowns, bool watsonp, int cdna_direction,
2972 bool gff_introns_p, bool gff_gene_format_p, bool gff_estmatch_format_p,
2973 bool cds_p) {
2974 bool in_exon = false;
2975 struct T *ptr, *this = NULL;
2976 int exon_querystart = -1, exon_queryend, exon_phase = 0;
2977 Chrpos_T exon_genomestart = 0, exon_genomeend, intron_start, intron_end;
2978 int pctidentity, num = 0, den = 0, exonno = 0, cdsno = 0, starti, endi, last_valid_i, i;
2979 int Mlength = 0, Ilength = 0, Dlength = 0;
2980 List_T tokens = NULL;
2981 char token[11];
2982 #if 0
2983 int intronno = 0;
2984 #endif
2985 int estmatch_querystart, estmatch_queryend, estmatch_genomestart, estmatch_genomeend;
2986 int last_querypos = -1;
2987 Chrpos_T last_genomepos = (Chrpos_T) -1;
2988
2989 endi = npairs - 1;
2990 if (cds_p == false) {
2991 starti = 0;
2992
2993 } else if (cdstype == CDS_CDNA) {
2994 i = 0;
2995 starti = -1;
2996 while (i < npairs) {
2997 if (pairs[i].gapp == true) {
2998 i++;
2999 } else if (pairs[i].cdna == ' ') {
3000 i++;
3001 } else if (pairs[i].aaphase_e == -1) {
3002 i++;
3003 } else {
3004 debug7(printf("FORWARD: Setting starti to be %d\n",i));
3005 starti = i;
3006 last_valid_i = i;
3007 while (i < npairs) {
3008 if (pairs[i].gapp == true) {
3009 i++;
3010 } else if (pairs[i].cdna == ' ') {
3011 i++;
3012 } else if (pairs[i].aaphase_e != -1) {
3013 last_valid_i = i;
3014 i++;
3015 } else {
3016 debug7(printf("FORWARD: Saw aaphase_e of -1 at pair %d\n",i));
3017 endi = last_valid_i; /* inclusive */
3018 i = npairs;
3019 }
3020 }
3021 }
3022 }
3023
3024 } else if (cdstype == CDS_GENOMIC) {
3025 i = 0;
3026 starti = -1;
3027 while (i < npairs) {
3028 if (pairs[i].gapp == true) {
3029 i++;
3030 } else if (pairs[i].genome == ' ') {
3031 i++;
3032 } else if (pairs[i].aaphase_g == -1) {
3033 i++;
3034 } else {
3035 debug7(printf("FORWARD: Setting starti to be %d\n",i));
3036 starti = i;
3037 last_valid_i = i;
3038 while (i < npairs) {
3039 if (pairs[i].gapp == true) {
3040 i++;
3041 } else if (pairs[i].genome == ' ') {
3042 i++;
3043 } else if (pairs[i].aaphase_g != -1) {
3044 last_valid_i = i;
3045 i++;
3046 } else {
3047 debug7(printf("FORWARD: Saw aaphase_g of -1 at pair %d\n",i));
3048 endi = last_valid_i; /* inclusive */
3049 i = npairs;
3050 }
3051 }
3052 }
3053 }
3054
3055 } else {
3056 fprintf(stderr,"Do not recognize cdstype %d\n",cdstype);
3057 abort();
3058 }
3059
3060 debug7(Pair_dump_array(pairs,npairs,true));
3061
3062 if (cds_p == true && starti < 0) {
3063 /* Want CDS, and none seen */
3064 return;
3065 }
3066
3067 ptr = &(pairs[starti]);
3068 for (i = starti; i <= endi; i++) {
3069 /* prev = this; */
3070 this = ptr++;
3071
3072 if (this->gapp) {
3073 if (in_exon == true) {
3074 exon_queryend = last_querypos + 1;
3075 exon_genomeend = last_genomepos + 1;
3076
3077 if (watsonp) {
3078 intron_start = exon_genomeend + 1;
3079 } else {
3080 intron_start = exon_genomeend - 1;
3081 }
3082
3083 if (den == 0) {
3084 pctidentity = 100;
3085 } else {
3086 pctidentity = (int) floor(100.0*(double) num/(double) den);
3087 }
3088
3089 if (cds_p == true) {
3090 print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3091 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,exon_phase);
3092
3093 } else if (gff_gene_format_p == true) {
3094 print_gff3_exon(fp,++exonno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3095 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity);
3096 } else {
3097 if (Mlength > 0) {
3098 sprintf(token,"M%d",Mlength);
3099 tokens = push_token(tokens,token);
3100 } else if (Ilength > 0) {
3101 sprintf(token,"I%d",Ilength);
3102 tokens = push_token(tokens,token);
3103 } else if (Dlength > 0) {
3104 sprintf(token,"D%d",Dlength);
3105 tokens = push_token(tokens,token);
3106 }
3107 if (gff_estmatch_format_p == false) {
3108 tokens = List_reverse(tokens);
3109 /* ++exonno; */
3110 print_gff3_cdna_match(fp,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3111 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,tokens);
3112 List_free_out(&tokens);
3113 }
3114 }
3115
3116 Mlength = Ilength = Dlength = 0;
3117 in_exon = false;
3118 }
3119 } else if (this->comp == INTRONGAP_COMP) {
3120 /* Do nothing */
3121 } else {
3122 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
3123 SHORTGAP_COMP, or MISMATCH_COMP */
3124 if (in_exon == false) {
3125 exon_querystart = this->querypos + 1;
3126 exon_genomestart = this->genomepos + 1;
3127 #if 0
3128 if (this->aaphase_e != -1) {
3129 /* Otherwise, if phase is -1 from an indel, use previous exon_phase. Should be fixed now. */
3130 exon_phase = this->aaphase_e;
3131 }
3132 #else
3133 if (cdstype == CDS_CDNA) {
3134 exon_phase = this->aaphase_e;
3135 } else {
3136 exon_phase = this->aaphase_g;
3137 }
3138 #endif
3139 if (watsonp) {
3140 intron_end = exon_genomestart - 1;
3141 } else {
3142 intron_end = exon_genomestart + 1;
3143 }
3144
3145 if (gff_estmatch_format_p == true && i > 0) {
3146 /* abs() gives a large value when flag -m64 is specified */
3147 /* sprintf(token,"N%u",abs(intron_end - intron_start) + 1); */
3148 if (intron_end > intron_start) {
3149 sprintf(token,"N%u",(intron_end - intron_start) + 1);
3150 } else {
3151 sprintf(token,"N%u",(intron_start - intron_end) + 1);
3152 }
3153
3154 tokens = push_token(tokens,token);
3155 } else if (gff_introns_p == true) {
3156 if (i > 0) {
3157 #if 0
3158 printf_gff3_intron(++intronno,pathnum,sourcename,accession,chrstring,?,?,intron_start,intron_end,watsonp);
3159 #endif
3160 }
3161 PUTC('\n',fp);
3162 }
3163
3164 num = den = 0;
3165 in_exon = true;
3166 }
3167 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
3168 /* Gap in upper or lower sequence */
3169 if (gff_gene_format_p == true) {
3170 /* Don't deal with tokens */
3171 } else if (this->genome == ' ') {
3172 if (Mlength > 0) {
3173 sprintf(token,"M%d",Mlength);
3174 tokens = push_token(tokens,token);
3175 Mlength = 0;
3176 } else if (Dlength > 0) {
3177 /* unlikely */
3178 sprintf(token,"D%d",Dlength);
3179 tokens = push_token(tokens,token);
3180 Dlength = 0;
3181 }
3182 Ilength++;
3183 } else if (this->cdna == ' ') {
3184 if (Mlength > 0) {
3185 sprintf(token,"M%d",Mlength);
3186 tokens = push_token(tokens,token);
3187 Mlength = 0;
3188 } else if (Ilength > 0) {
3189 sprintf(token,"I%d",Ilength);
3190 tokens = push_token(tokens,token);
3191 Ilength = 0;
3192 }
3193 Dlength++;
3194 } else {
3195 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
3196 exit(9);
3197 }
3198
3199 /* Previously not counted in numerator or denominator */
3200 den++;
3201
3202 } else {
3203 /* Count in token even if unknown base */
3204
3205 if (gff_gene_format_p == true) {
3206 /* Don't deal with tokens */
3207 } else if (Ilength > 0) {
3208 sprintf(token,"I%d",Ilength);
3209 tokens = push_token(tokens,token);
3210 Ilength = 0;
3211 } else if (Dlength > 0) {
3212 sprintf(token,"D%d",Dlength);
3213 tokens = push_token(tokens,token);
3214 Dlength = 0;
3215 }
3216 Mlength++;
3217
3218 #ifdef PMAP
3219 den++;
3220 if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
3221 num++;
3222 } else if (this->comp == AMBIGUOUS_COMP) {
3223 num++;
3224 }
3225 #else
3226 if (unknown_base(this->cdna) || unknown_base(this->genome)) {
3227 /* Comp must be a space */
3228 /* Don't count in numerator or denominator */
3229 } else {
3230 den++;
3231 if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
3232 num++;
3233 } else if (this->comp == AMBIGUOUS_COMP) {
3234 den--;
3235 }
3236 }
3237 #endif
3238
3239 }
3240 }
3241
3242 if (this->cdna != ' ') {
3243 last_querypos = this->querypos;
3244 }
3245 if (this->genome != ' ') {
3246 last_genomepos = this->genomepos;
3247 }
3248 }
3249
3250 /* prev = this; */
3251 exon_queryend = last_querypos + 1;
3252 exon_genomeend = last_genomepos + 1;
3253
3254 if (den == 0) {
3255 pctidentity = 100;
3256 } else {
3257 pctidentity = (int) floor(100.0*(double) num/(double) den);
3258 }
3259
3260 if (cds_p == true) {
3261 print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3262 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,exon_phase);
3263
3264 } else if (gff_gene_format_p == true) {
3265 print_gff3_exon(fp,++exonno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3266 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity);
3267 } else {
3268 if (Mlength > 0) {
3269 sprintf(token,"M%d",Mlength);
3270 tokens = push_token(tokens,token);
3271 } else if (Ilength > 0) {
3272 sprintf(token,"I%d",Ilength);
3273 tokens = push_token(tokens,token);
3274 } else if (Dlength > 0) {
3275 sprintf(token,"D%d",Dlength);
3276 tokens = push_token(tokens,token);
3277 }
3278 if (gff_estmatch_format_p == true) {
3279 estmatch_querystart = pairs->querypos + 1;
3280 estmatch_queryend = exon_queryend;
3281 estmatch_genomestart = pairs->genomepos + 1;
3282 estmatch_genomeend = exon_genomeend;
3283 if (watsonp) {
3284 tokens = List_reverse(tokens);
3285 }
3286 print_gff3_est_match(fp,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,
3287 estmatch_genomestart,estmatch_genomeend,
3288 estmatch_querystart,estmatch_queryend,
3289 querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3290 watsonp,cdna_direction,pctidentity,tokens);
3291 } else {
3292 tokens = List_reverse(tokens);
3293 /* ++exonno; */
3294 print_gff3_cdna_match(fp,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3295 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,tokens);
3296 }
3297 List_free_out(&tokens);
3298 }
3299
3300 return;
3301 }
3302
3303 static void
print_gff3_exons_backward(Filestring_T fp,struct T * pairs,int npairs,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,bool watsonp,int cdna_direction,bool gff_introns_p,bool cds_p)3304 print_gff3_exons_backward (Filestring_T fp, struct T *pairs, int npairs, int pathnum,
3305 char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
3306 bool watsonp, int cdna_direction, bool gff_introns_p, bool cds_p) {
3307 bool in_exon = false;
3308 struct T *ptr, *this = NULL;
3309 int exon_querystart = -1, exon_queryend, exon_phase = 0;
3310 Chrpos_T exon_genomestart = 0, exon_genomeend;
3311 int pctidentity, num = 0, den = 0, exonno = 0, cdsno = 0, starti, endi, last_valid_i, i;
3312 #if 0
3313 int intronno = 0;
3314 Chrpos_T intron_start, intron_end;
3315 #endif
3316 int last_querypos = -1;
3317 Chrpos_T last_genomepos = (Chrpos_T) -1;
3318
3319 starti = 0;
3320 if (cds_p == false) {
3321 endi = npairs - 1;
3322
3323 } else if (cdstype == CDS_CDNA) {
3324 i = npairs - 1;
3325 endi = npairs;
3326 while (i >= 0) {
3327 if (pairs[i].gapp == true) {
3328 i--;
3329 } else if (pairs[i].cdna == ' ') {
3330 i--;
3331 } else if (pairs[i].aaphase_e == -1) {
3332 i--;
3333 } else {
3334 debug7(printf("BACKWARD: Setting endi to be %d\n",i));
3335 endi = i;
3336 last_valid_i = i;
3337 while (i >= 0) {
3338 if (pairs[i].gapp == true) {
3339 i--;
3340 } else if (pairs[i].cdna == ' ') {
3341 i--;
3342 } else if (pairs[i].aaphase_e != -1) {
3343 last_valid_i = i;
3344 i--;
3345 } else {
3346 debug7(printf("BACKWARD: Saw aaphase_e of -1 at pair %d\n",i));
3347 starti = last_valid_i; /* inclusive */
3348 i = -1;
3349 }
3350 }
3351 }
3352 }
3353
3354 } else if (cdstype == CDS_GENOMIC) {
3355 i = npairs - 1;
3356 endi = npairs;
3357 while (i >= 0) {
3358 if (pairs[i].gapp == true) {
3359 i--;
3360 } else if (pairs[i].genome == ' ') {
3361 i--;
3362 } else if (pairs[i].aaphase_g == -1) {
3363 i--;
3364 } else {
3365 debug7(printf("BACKWARD: Setting endi to be %d\n",i));
3366 endi = i;
3367 last_valid_i = i;
3368 while (i >= 0) {
3369 if (pairs[i].gapp == true) {
3370 i--;
3371 } else if (pairs[i].genome == ' ') {
3372 i--;
3373 } else if (pairs[i].aaphase_g != -1) {
3374 last_valid_i = i;
3375 i--;
3376 } else {
3377 debug7(printf("BACKWARD: Saw aaphase_g of -1 at pair %d\n",i));
3378 starti = last_valid_i; /* inclusive */
3379 i = -1;
3380 }
3381 }
3382 }
3383 }
3384
3385 } else {
3386 fprintf(stderr,"Do not recognize cdstype %d\n",cdstype);
3387 abort();
3388 }
3389
3390 debug7(Pair_dump_array(pairs,npairs,true));
3391
3392 if (cds_p == true && endi >= npairs) {
3393 /* Want CDS, and none seen */
3394 return;
3395 }
3396
3397 ptr = &(pairs[endi]);
3398 for (i = endi; i >= starti; i--) {
3399 /* prev = this; */
3400 this = ptr--;
3401
3402 if (this->gapp) {
3403 if (in_exon == true) {
3404 exon_queryend = last_querypos + 1;
3405 exon_genomeend = last_genomepos + 1;
3406
3407 if (den == 0) {
3408 pctidentity = 100;
3409 } else {
3410 pctidentity = (int) floor(100.0*(double) num/(double) den);
3411 }
3412
3413 if (cds_p == true) {
3414 print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3415 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,exon_phase);
3416
3417 } else {
3418 print_gff3_exon(fp,++exonno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3419 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity);
3420
3421 }
3422
3423 in_exon = false;
3424 }
3425 } else if (this->comp == INTRONGAP_COMP) {
3426 /* Do nothing */
3427 } else {
3428 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
3429 SHORTGAP_COMP, or MISMATCH_COMP */
3430 if (in_exon == false) {
3431 exon_querystart = this->querypos + 1;
3432 exon_genomestart = this->genomepos + 1;
3433 #if 0
3434 if (this->aaphase_e != -1) {
3435 /* Otherwise, if phase is -1 from an indel, use previous exon_phase. Should be fixed now */
3436 exon_phase = this->aaphase_e;
3437 }
3438 #else
3439 if (cdstype == CDS_CDNA) {
3440 exon_phase = this->aaphase_e;
3441 } else {
3442 exon_phase = this->aaphase_g;
3443 }
3444 #endif
3445
3446 if (gff_introns_p == true) {
3447 if (i > 0) {
3448 #if 0
3449 printf_gff3_intron(++intronno,pathnum,sourcename,accession,chrstring,?,?,intron_start,intron_end,watsonp);
3450 #endif
3451 }
3452 PUTC('\n',fp);
3453 }
3454
3455 num = den = 0;
3456 in_exon = true;
3457 }
3458 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
3459 /* Previously not counted in numerator or denominator */
3460 den++;
3461
3462 #ifndef PMAP
3463 } else if (unknown_base(this->cdna) || unknown_base(this->genome)) {
3464 /* Comp must be a space */
3465 /* Don't count in numerator or denominator */
3466 #endif
3467 } else {
3468 den++;
3469 if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
3470 num++;
3471 } else if (this->comp == AMBIGUOUS_COMP) {
3472 #ifdef PMAP
3473 num++;
3474 #else
3475 den--;
3476 #endif
3477 }
3478 }
3479 }
3480 if (this->cdna != ' ') {
3481 last_querypos = this->querypos;
3482 }
3483 if (this->genome != ' ') {
3484 last_genomepos = this->genomepos;
3485 }
3486 }
3487
3488 /* prev = this; */
3489 exon_queryend = last_querypos + 1;
3490 exon_genomeend = last_genomepos + 1;
3491
3492 if (den == 0) {
3493 pctidentity = 100;
3494 } else {
3495 pctidentity = (int) floor(100.0*(double) num/(double) den);
3496 }
3497
3498 if (cds_p == true) {
3499 print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3500 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,exon_phase);
3501 } else {
3502 print_gff3_exon(fp,++exonno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3503 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity);
3504 }
3505
3506 return;
3507 }
3508
3509
#if 0
/* Replaced by print_gff3_exons_forward */
/* Disabled.  Walks the pairs in increasing index order and prints one
   CDS record for every maximal run of pairs whose aaphase_e is not
   -1.  The pair that opens a run is not included in the identity
   counts. */
static void
print_gff3_cdss_forward (Filestring_T fp, struct T *pairs, int npairs, int pathnum,
                         char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
                         bool watsonp, int cdna_direction) {
  struct T *pair;
  bool coding_p = false;
  int cds_querystart = -1, cds_queryend, cds_phase;
  Chrpos_T cds_genomestart = 0, cds_genomeend;
  int pctidentity, nmatch = 0, ncompared = 0, cdsno = 0, k;
  int prev_querypos = -1;
  Chrpos_T prev_genomepos = (Chrpos_T) -1;

  for (k = 0; k < npairs; k++) {
    pair = &(pairs[k]);

    if (coding_p == false) {
      if (pair->aaphase_e != -1) {
        /* Start of cds */
        cds_querystart = pair->querypos + 1;
        cds_phase = pair->aaphase_e; /* ? was aaphase_g */
        cds_genomestart = pair->genomepos + 1;

        nmatch = ncompared = 0;
        coding_p = true;
      }
      /* Otherwise, continuation of non-cds */

    } else if (pair->aaphase_e == -1) { /* was aaphase_g */
      /* End of cds */
      cds_queryend = prev_querypos + 1;
      cds_genomeend = prev_genomepos + 1;
      pctidentity = (ncompared == 0) ? 100 : (int) floor(100.0*(double) nmatch/(double) ncompared);

      print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,cds_genomestart,cds_genomeend,
                     cds_querystart,cds_queryend,watsonp,cdna_direction,pctidentity,cds_phase);

      coding_p = false;

    } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
      /* Continuation of cds: indels count in the denominator only */
      ncompared++;

#ifndef PMAP
    } else if (unknown_base(pair->cdna) || unknown_base(pair->genome)) {
      /* Comp must be a space */
      /* Don't count in numerator or denominator */
#endif

    } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
      ncompared++;
      nmatch++;

    } else if (pair->comp == AMBIGUOUS_COMP) {
#ifdef PMAP
      ncompared++;
      nmatch++;
#endif
      /* Without PMAP, ambiguous positions are left out of both counts */

    } else {
      ncompared++;
    }

    if (pair->cdna != ' ') {
      prev_querypos = pair->querypos;
    }
    if (pair->genome != ' ') {
      prev_genomepos = pair->genomepos;
    }
  }

  if (coding_p == true) {
    /* The cds runs to the end of the alignment */
    cds_queryend = prev_querypos + 1;
    cds_genomeend = prev_genomepos + 1;
    pctidentity = (ncompared == 0) ? 100 : (int) floor(100.0*(double) nmatch/(double) ncompared);

    print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,cds_genomestart,cds_genomeend,
                   cds_querystart,cds_queryend,watsonp,cdna_direction,pctidentity,cds_phase);
  }

  return;
}
#endif
3612
3613
#if 0
/* Replaced by print_gff3_exons_backward */
/* Disabled.  Walks the pairs in decreasing index order and prints one
   CDS record for every maximal run of pairs whose aaphase_e is not
   -1.  The pair that opens a run is not included in the identity
   counts. */
static void
print_gff3_cdss_backward (Filestring_T fp, struct T *pairs, int npairs, int pathnum,
                          char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
                          bool watsonp, int cdna_direction) {
  struct T *pair;
  bool coding_p = false;
  int cds_querystart = -1, cds_queryend, cds_phase;
  Chrpos_T cds_genomestart = 0, cds_genomeend;
  int pctidentity, nmatch = 0, ncompared = 0, cdsno = 0, k;
  int prev_querypos = -1;
  Chrpos_T prev_genomepos = (Chrpos_T) -1;

  for (k = npairs - 1; k >= 0; k--) {
    pair = &(pairs[k]);

    if (coding_p == false) {
      if (pair->aaphase_e != -1) { /* was aaphase_g */
        /* Start of cds */
        cds_querystart = pair->querypos + 1;
        cds_phase = pair->aaphase_e; /* ? was aaphase_g */
        cds_genomestart = pair->genomepos + 1;

        nmatch = ncompared = 0;
        coding_p = true;
      }
      /* Otherwise, continuation of non-cds */

    } else if (pair->aaphase_e == -1) { /* was aaphase_g */
      /* End of cds */
      cds_queryend = prev_querypos + 1;
      cds_genomeend = prev_genomepos + 1;
      pctidentity = (ncompared == 0) ? 100 : (int) floor(100.0*(double) nmatch/(double) ncompared);

      print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,cds_genomestart,cds_genomeend,
                     cds_querystart,cds_queryend,watsonp,cdna_direction,pctidentity,cds_phase);

      coding_p = false;

    } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
      /* Continuation of cds: indels count in the denominator only */
      ncompared++;

#ifndef PMAP
    } else if (unknown_base(pair->cdna) || unknown_base(pair->genome)) {
      /* Comp must be a space */
      /* Don't count in numerator or denominator */
#endif

    } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
      ncompared++;
      nmatch++;

    } else if (pair->comp == AMBIGUOUS_COMP) {
#ifdef PMAP
      ncompared++;
      nmatch++;
#endif
      /* Without PMAP, ambiguous positions are left out of both counts */

    } else {
      ncompared++;
    }

    if (pair->cdna != ' ') {
      prev_querypos = pair->querypos;
    }
    if (pair->genome != ' ') {
      prev_genomepos = pair->genomepos;
    }
  }

  if (coding_p == true) {
    /* The cds runs to the start of the array */
    cds_queryend = prev_querypos + 1;
    cds_genomeend = prev_genomepos + 1;
    pctidentity = (ncompared == 0) ? 100 : (int) floor(100.0*(double) nmatch/(double) ncompared);

    print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,cds_genomestart,cds_genomeend,
                   cds_querystart,cds_queryend,watsonp,cdna_direction,pctidentity,cds_phase);
  }

  return;
}
#endif
3718
3719
3720 void
Pair_print_gff3(Filestring_T fp,struct T * pairs,int npairs,int pathnum,char * accession,char * fasta_annotation,T start,T end,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,int translation_end,int querylength_given,int skiplength,int matches,int mismatches,int qindels,int tindels,int unknowns,bool watsonp,int cdna_direction,bool gff_gene_format_p,bool gff_estmatch_format_p,char * sourcename)3721 Pair_print_gff3 (Filestring_T fp, struct T *pairs, int npairs, int pathnum, char *accession, char *fasta_annotation,
3722 T start, T end, Chrnum_T chrnum, Univ_IIT_T chromosome_iit, Sequence_T usersegment,
3723 int translation_end, int querylength_given, int skiplength, int matches, int mismatches,
3724 int qindels, int tindels, int unknowns, bool watsonp, int cdna_direction,
3725 bool gff_gene_format_p, bool gff_estmatch_format_p, char *sourcename) {
3726 char *chrstring = NULL;
3727 Chrpos_T chrpos1, chrpos2;
3728
3729 if (chrnum == 0) {
3730 chrstring = Sequence_accession(usersegment);
3731 } else {
3732 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
3733 }
3734
3735 if (sourcename == NULL) {
3736 sourcename = "NA";
3737 }
3738
3739 if (gff_gene_format_p == true) {
3740 chrpos1 = start->genomepos;
3741 chrpos2 = end->genomepos;
3742
3743 print_gff3_gene(fp,pathnum,sourcename,accession,fasta_annotation,chrstring,chrpos1+1,chrpos2+1,watsonp,cdna_direction);
3744 print_gff3_mrna(fp,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,chrpos1+1,chrpos2+1,
3745 querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3746 watsonp,cdna_direction);
3747
3748 if (cdna_direction >= 0) {
3749 print_gff3_exons_forward(fp,pairs,npairs,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,
3750 querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3751 watsonp,cdna_direction,/*gff_introns_p*/false,/*gff_gene_format_p*/true,
3752 /*gff_estmatch_format_p*/false,/*cds_p*/false);
3753 if (translation_end > 0) {
3754 #if 0
3755 print_gff3_cdss_forward(fp,pairs,npairs,pathnum,sourcename,accession,fasta_annotation,chrstring,watsonp,
3756 cdna_direction);
3757 #else
3758 print_gff3_exons_forward(fp,pairs,npairs,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,
3759 querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3760 watsonp,cdna_direction,/*gff_introns_p*/false,/*gff_gene_format_p*/false,
3761 /*gff_estmatch_format_p*/false,/*cds_p*/true);
3762 #endif
3763 }
3764 } else {
3765 print_gff3_exons_backward(fp,pairs,npairs,pathnum,sourcename,accession,fasta_annotation,chrstring,watsonp,
3766 cdna_direction,/*gff_introns_p*/false,/*cds_p*/false);
3767 if (translation_end > 0) {
3768 #if 0
3769 print_gff3_cdss_backward(fp,pairs,npairs,pathnum,sourcename,accession,reestofheader,chrstring,watsonp,
3770 cdna_direction);
3771 #else
3772 print_gff3_exons_backward(fp,pairs,npairs,pathnum,sourcename,accession,fasta_annotation,chrstring,watsonp,
3773 cdna_direction,/*gff_introns_p*/false,/*cds_p*/true);
3774 #endif
3775 }
3776 }
3777
3778 } else {
3779 print_gff3_exons_forward(fp,pairs,npairs,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,
3780 querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3781 watsonp,cdna_direction,/*gff_introns_p*/false,/*gff_gene_format_p*/false,
3782 gff_estmatch_format_p,/*cds_p*/false);
3783 }
3784
3785 if (gff3_separators_p == true) {
3786 FPRINTF(fp,"###\n"); /* Terminates alignment */
3787 }
3788
3789 if (chrnum != 0) {
3790 FREE(chrstring);
3791 }
3792
3793 return;
3794 }
3795
3796
3797 /* Don't want to use SOFT_CLIPS_AVOID_CIRCULARIZATION, because the
3798 pairs array already contains the trim information */
3799 int
Pair_circularpos(int * alias,struct T * pairs,int npairs,Chrpos_T chrlength,bool plusp,int querylength)3800 Pair_circularpos (int *alias, struct T *pairs, int npairs, Chrpos_T chrlength, bool plusp, int querylength) {
3801 Chrpos_T low, high;
3802 struct T *ptr;
3803 int i, ninsertions, querypos;
3804 /* Univcoord_T chrhigh; */
3805
3806 debug12(Pair_dump_array(pairs,npairs,true));
3807
3808 /* chrhigh = chrlength + chrlength; */
3809 if (plusp == true) {
3810 low = pairs[0].genomepos; /* includes "trim_left" */
3811 high = pairs[npairs-1].genomepos; /* includes "trim_right" */
3812 debug12(printf("plus: low %u, high %u, chrlength %u\n",low,high,chrlength));
3813
3814 if (low >= chrlength) {
3815 /* All of read after trimming is in circular alias */
3816 #if 0
3817 if (high > chrhigh) { /* Differs from code in stage3hr.c */
3818 *alias = +2; /* Extends beyond end of second copy */
3819 } else {
3820 *alias = +1; /* All of read is in second copy */
3821 }
3822 #else
3823 *alias = +1;
3824 #endif
3825 debug12(printf("Returning -1 with alias %d\n",*alias));
3826 return -1;
3827
3828 } else if (high < chrlength) {
3829 /* All of read after trimming is in circular proper */
3830 #if 0
3831 if (low < (Chrpos_T) trim_left) {
3832 *alias = -2; /* Extends beyond beginning of first copy */
3833 } else {
3834 *alias = -1; /* All of read is in first copy */
3835 }
3836 #else
3837 *alias = -1;
3838 #endif
3839 debug12(printf("Returning -1 with alias %d\n",*alias));
3840 return -1;
3841
3842 } else {
3843 /* Some of read is in circular proper and some is in circular alias */
3844 i = 0;
3845 ptr = pairs;
3846 ninsertions = 0;
3847
3848 while (i++ < npairs && ptr->genomepos <= chrlength) { /* Needs to be <= for plus, < for minus */
3849 querypos = ptr->querypos;
3850 if (ptr->genome == ' ' && ptr->gapp == false) {
3851 ninsertions += 1;
3852 }
3853 ptr++;
3854 }
3855
3856 *alias = 0;
3857 debug12(printf("Returning %d with no alias\n",(querypos - ninsertions)));
3858 return querypos - ninsertions;
3859 }
3860
3861 } else {
3862 low = pairs[npairs-1].genomepos; /* includes "trim_right" */
3863 high = pairs[0].genomepos; /* includes "trim_left" */
3864 debug12(printf("minus: low %u, high %u\n",low,high));
3865
3866 if (low >= chrlength) {
3867 /* All of read after trimming is in circular alias */
3868 #if 0
3869 if (high > chrhigh) { /* Differs from code in stage3hr.c */
3870 *alias = +2; /* Extends beyond end of second copy */
3871 } else {
3872 *alias = +1; /* All of read is in second copy */
3873 }
3874 #else
3875 *alias = +1;
3876 #endif
3877 debug12(printf("Returning -1 with alias %d\n",*alias));
3878 return -1;
3879
3880 } else if (high < chrlength) {
3881 /* All of read after trimming is in circular proper */
3882 #if 0
3883 if (low < (Chrpos_T) trim_right) {
3884 *alias = -2; /* Extends beyond beginning of first copy */
3885 } else {
3886 *alias = -1; /* All of read is in first copy */
3887 }
3888 #else
3889 *alias = -1;
3890 #endif
3891 debug12(printf("Returning -1 with alias %d\n",*alias));
3892 return -1;
3893
3894 } else {
3895 /* Some of read is in circular proper and some is in circular alias */
3896 i = npairs - 1;
3897 ptr = &(pairs[i]);
3898 ninsertions = 0;
3899
3900 while (--i >= 0 && ptr->genomepos < chrlength) { /* Needs to be <= for plus, < for minus */
3901 querypos = ptr->querypos;
3902 if (ptr->genome == ' ' && ptr->gapp == false) {
3903 ninsertions += 1;
3904 }
3905 --ptr;
3906 }
3907
3908 *alias = 0;
3909 debug12(printf("Returning %d with no alias\n",(querylength - querypos - ninsertions)));
3910 return (querylength - querypos - ninsertions);
3911 }
3912 }
3913 }
3914
3915
3916 #ifndef PMAP
3917 void
Pair_print_bedpe(Filestring_T fp,struct T * pairarray,int npairs,Chrnum_T chrnum,bool watsonp,Univ_IIT_T chromosome_iit)3918 Pair_print_bedpe (Filestring_T fp, struct T *pairarray, int npairs,
3919 Chrnum_T chrnum, bool watsonp, Univ_IIT_T chromosome_iit) {
3920 bool in_exon = true;
3921 struct T *ptr, *ptr0, *this = NULL, *start;
3922 Chrpos_T exon_genomestart = 0, exon_genomeend;
3923 int nindels, i;
3924 /* int last_querypos = -1; */
3925 Chrpos_T last_genomepos = (Chrpos_T) -1;
3926 char *chr, strand;
3927 bool allocp;
3928
3929
3930 #if 0
3931 if (invertedp == true) {
3932 pairs = invert_and_revcomp_path_and_coords(pairs_querydir,npairs,querylength);
3933 watsonp = !watsonp;
3934 cdna_direction = -cdna_direction;
3935 } else {
3936 pairs = pairs_querydir;
3937 }
3938 #endif
3939
3940
3941 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
3942 if (watsonp == true) {
3943 strand = '+';
3944 } else {
3945 strand = '-';
3946 }
3947
3948
3949 ptr = pairarray;
3950 /* exon_querystart = ptr->querypos + 1; */
3951 exon_genomestart = ptr->genomepos + 1;
3952
3953
3954 i = 0;
3955 while (i < npairs) {
3956 /* prev = this; */
3957 this = ptr++;
3958 i++;
3959
3960 if (this->gapp) {
3961 if (in_exon == true) {
3962 /* SPLICE START */
3963 ptr0 = ptr;
3964 while (ptr0->gapp) {
3965 ptr0++;
3966 }
3967 /* exon_queryend = last_querypos + 1; */
3968 exon_genomeend = last_genomepos + 1;
3969
3970 in_exon = false;
3971 }
3972 } else if (this->comp == INTRONGAP_COMP) {
3973 /* May want to print dinucleotides */
3974
3975 } else {
3976 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
3977 SHORTGAP_COMP, or MISMATCH_COMP */
3978 if (in_exon == false) {
3979 /* SPLICE CONTINUATION */
3980 /* exon_querystart = this->querypos + 1; */
3981 exon_genomestart = this->genomepos + 1;
3982
3983 in_exon = true;
3984 if (strand == '+') {
3985 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
3986 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
3987 FPRINTF(fp,"DELETION\t0\t");
3988 FPRINTF(fp,"+\t+\t");
3989 FPRINTF(fp,"%d\n",exon_genomestart - exon_genomeend - 1);
3990 } else {
3991 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
3992 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
3993 FPRINTF(fp,"DELETION\t0\t");
3994 FPRINTF(fp,"+\t+\t");
3995 FPRINTF(fp,"%d\n",exon_genomeend - exon_genomestart - 1);
3996 }
3997 }
3998
3999 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
4000 if (this->genome == ' ') {
4001 /* INSERTION */
4002 /* exon_queryend = last_querypos + 1; */
4003 exon_genomeend = last_genomepos + 1;
4004
4005 /* indel_pos = this->querypos; */
4006 start = this;
4007 nindels = 0;
4008 while (i < npairs && this->gapp == false && this->genome == ' ') {
4009 nindels++;
4010 this = ptr++;
4011 i++;
4012 }
4013 if (i < npairs) {
4014 ptr--;
4015 i--;
4016 this = ptr;
4017 }
4018
4019 /* exon_querystart = this->querypos + 1; */
4020 exon_genomestart = this->genomepos + 1;
4021
4022 if (strand == '+') {
4023 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
4024 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
4025 FPRINTF(fp,"INSERTION\t0\t");
4026 FPRINTF(fp,"+\t+\t");
4027 while (start < this) {
4028 FPRINTF(fp,"%c",start->cdna);
4029 start++;
4030 }
4031 } else {
4032 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
4033 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
4034 FPRINTF(fp,"INSERTION\t0\t");
4035 FPRINTF(fp,"+\t+\t");
4036 while (start < this) {
4037 FPRINTF(fp,"%c",complCode[(int) start->cdna]);
4038 start++;
4039 }
4040 }
4041 FPRINTF(fp,"\n");
4042
4043 } else if (this->cdna == ' ') {
4044 /* DELETION */
4045 /* exon_queryend = last_querypos + 1; */
4046 exon_genomeend = last_genomepos + 1;
4047
4048 /* indel_pos = this->querypos; */
4049 nindels = 0;
4050 while (i < npairs && this->gapp == false && this->cdna == ' ') {
4051 nindels++;
4052 this = ptr++;
4053 i++;
4054 }
4055 if (i < npairs) {
4056 ptr--;
4057 i--;
4058 this = ptr;
4059 }
4060
4061 /* exon_querystart = this->querypos + 1; */
4062 exon_genomestart = this->genomepos + 1;
4063
4064 if (strand == '+') {
4065 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
4066 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
4067 } else {
4068 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
4069 FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
4070 }
4071 FPRINTF(fp,"DELETION\t0\t");
4072 FPRINTF(fp,"+\t+\t");
4073 FPRINTF(fp,"%d\n",nindels);
4074
4075 } else {
4076 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
4077 exit(9);
4078 }
4079
4080 } else {
4081 /* Match or mismatch */
4082 }
4083 }
4084
4085 #if 0
4086 if (this->cdna != ' ') {
4087 last_querypos = this->querypos;
4088 }
4089 #endif
4090 if (this->genome != ' ') {
4091 last_genomepos = this->genomepos;
4092 }
4093 }
4094
4095 if (allocp) {
4096 FREE(chr);
4097 }
4098
4099 #if 0
4100 if (invertedp == true) {
4101 FREE(pairs);
4102 }
4103 #endif
4104
4105 return;
4106 }
4107 #endif
4108
4109
4110
4111 #ifdef GSNAP
4112 static double
blast_bitscore(int alignlength,int nmismatches)4113 blast_bitscore (int alignlength, int nmismatches) {
4114 double k = 0.1;
4115 double lambda = 1.58; /* For a +1, -1 scoring scheme */
4116 double score;
4117
4118 score = (double) ((alignlength - nmismatches) /* scored as +1 */ - nmismatches /* scored as -1 */);
4119 return (score * lambda - log(k)) / log(2.0);
4120 }
4121
4122
4123 static void
print_m8_line(Filestring_T fp,int exon_querystart,int exon_queryend,char * chr,Chrpos_T exon_genomestart,Chrpos_T exon_genomeend,int nmismatches_bothdiff,Shortread_T headerseq,char * acc_suffix)4124 print_m8_line (Filestring_T fp, int exon_querystart, int exon_queryend,
4125 char *chr, Chrpos_T exon_genomestart, Chrpos_T exon_genomeend,
4126 int nmismatches_bothdiff, Shortread_T headerseq, char *acc_suffix) {
4127 double identity;
4128 int alignlength_trim;
4129
4130 FPRINTF(fp,"%s%s",Shortread_accession(headerseq),acc_suffix); /* field 0: accession */
4131
4132 FPRINTF(fp,"\t%s",chr); /* field 1: chr */
4133
4134 /* field 2: identity */
4135 alignlength_trim = exon_queryend - exon_querystart;
4136 identity = (double) (alignlength_trim - nmismatches_bothdiff)/(double) alignlength_trim;
4137 FPRINTF(fp,"\t%.1f",100.0*identity);
4138
4139
4140 FPRINTF(fp,"\t%d",alignlength_trim); /* field 3: query length */
4141
4142 FPRINTF(fp,"\t%d",nmismatches_bothdiff); /* field 4: nmismatches */
4143
4144 FPRINTF(fp,"\t0"); /* field 5: gap openings */
4145
4146 /* fields 6 and 7: query start and end */
4147 FPRINTF(fp,"\t%d\t%d",exon_querystart,exon_queryend);
4148
4149 /* fields 8 and 9: chr start and end */
4150 FPRINTF(fp,"\t%u\t%u",exon_genomestart,exon_genomeend);
4151
4152 /* field 10: E value */
4153 FPRINTF(fp,"\t%.2g",blast_evalue(alignlength_trim,nmismatches_bothdiff));
4154
4155 /* field 11: bit score */
4156 FPRINTF(fp,"\t%.1f",blast_bitscore(alignlength_trim,nmismatches_bothdiff));
4157
4158 FPRINTF(fp,"\n");
4159
4160 return;
4161 }
4162
4163
4164 void
Pair_print_m8(Filestring_T fp,struct T * pairs_querydir,int npairs,bool invertedp,Chrnum_T chrnum,Shortread_T queryseq,Shortread_T headerseq,char * acc_suffix,Univ_IIT_T chromosome_iit)4165 Pair_print_m8 (Filestring_T fp, struct T *pairs_querydir, int npairs, bool invertedp,
4166 Chrnum_T chrnum, Shortread_T queryseq, Shortread_T headerseq,
4167 char *acc_suffix, Univ_IIT_T chromosome_iit) {
4168 bool in_exon = true;
4169 struct T *pairs, *ptr, *ptr0, *this = NULL;
4170 int exon_querystart = -1, exon_queryend;
4171 Chrpos_T exon_genomestart = 0, exon_genomeend;
4172 int nmismatches_refdiff, nmismatches_bothdiff, nmatches, i;
4173 int last_querypos = -1;
4174 Chrpos_T last_genomepos = (Chrpos_T) -1;
4175 char *chr;
4176 int querylength;
4177 bool allocp;
4178
4179 querylength = Shortread_fulllength(queryseq);
4180
4181 if (invertedp == true) {
4182 pairs = invert_and_revcomp_path_and_coords(pairs_querydir,npairs,querylength);
4183 } else {
4184 pairs = pairs_querydir;
4185 }
4186
4187
4188 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
4189
4190 ptr = pairs;
4191 exon_querystart = ptr->querypos + 1;
4192 exon_genomestart = ptr->genomepos + 1;
4193 nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
4194
4195 i = 0;
4196 while (i < npairs) {
4197 this = ptr++;
4198 i++;
4199
4200 if (this->gapp) {
4201 if (in_exon == true) {
4202 /* SPLICE START */
4203 ptr0 = ptr;
4204 while (ptr0->gapp) {
4205 ptr0++;
4206 }
4207 exon_queryend = last_querypos + 1;
4208 exon_genomeend = last_genomepos + 1;
4209
4210 print_m8_line(fp,exon_querystart,exon_queryend,chr,exon_genomestart,exon_genomeend,
4211 nmismatches_bothdiff,headerseq,acc_suffix);
4212
4213 nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
4214
4215 in_exon = false;
4216 }
4217 } else if (this->comp == INTRONGAP_COMP) {
4218 /* May want to print dinucleotides */
4219
4220 } else {
4221 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
4222 SHORTGAP_COMP, or MISMATCH_COMP */
4223 if (in_exon == false) {
4224 /* SPLICE CONTINUATION */
4225 exon_querystart = this->querypos + 1;
4226 exon_genomestart = this->genomepos + 1;
4227
4228 in_exon = true;
4229 }
4230 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
4231 if (this->genome == ' ') {
4232 /* INSERTION */
4233 exon_queryend = last_querypos + 1;
4234 exon_genomeend = last_genomepos + 1;
4235
4236 /* indel_pos = this->querypos; */
4237 while (i < npairs && this->gapp == false && this->genome == ' ') {
4238 this = ptr++;
4239 i++;
4240 }
4241 if (i < npairs) {
4242 ptr--;
4243 i--;
4244
4245 this = ptr;
4246 exon_querystart = this->querypos + 1;
4247 exon_genomestart = this->genomepos + 1;
4248 nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
4249 }
4250
4251 } else if (this->cdna == ' ') {
4252 /* DELETION */
4253 exon_queryend = last_querypos + 1;
4254 exon_genomeend = last_genomepos + 1;
4255
4256 /* indel_pos = this->querypos; */
4257 while (i < npairs && this->gapp == false && this->cdna == ' ') {
4258 this = ptr++;
4259 i++;
4260 }
4261 if (i < npairs) {
4262 ptr--;
4263 i--;
4264 }
4265
4266 /* Finish rest of this line */
4267 print_m8_line(fp,exon_querystart,exon_queryend,chr,exon_genomestart,exon_genomeend,
4268 nmismatches_bothdiff,headerseq,acc_suffix);
4269
4270 if (i < npairs) {
4271 this = ptr;
4272 exon_querystart = this->querypos + 1;
4273 exon_genomestart = this->genomepos + 1;
4274 nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
4275 }
4276
4277 } else {
4278 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
4279 exit(9);
4280 }
4281
4282 } else {
4283 /* c = this->genome; */
4284 if (this->genome == this->cdna) {
4285 nmatches++;
4286 } else if (this->genomealt == this->cdna) {
4287 nmismatches_refdiff++;
4288 } else {
4289 nmismatches_bothdiff++;
4290 nmismatches_refdiff++;
4291 }
4292 }
4293 }
4294
4295 if (this->cdna != ' ') {
4296 last_querypos = this->querypos;
4297 }
4298 if (this->genome != ' ') {
4299 last_genomepos = this->genomepos;
4300 }
4301 }
4302
4303 exon_queryend = last_querypos + 1;
4304 exon_genomeend = last_genomepos + 1;
4305
4306 print_m8_line(fp,exon_querystart,exon_queryend,chr,exon_genomestart,exon_genomeend,
4307 nmismatches_bothdiff,headerseq,acc_suffix);
4308
4309 if (allocp) {
4310 FREE(chr);
4311 }
4312
4313 if (invertedp == true) {
4314 FREE(pairs);
4315 }
4316
4317 return;
4318 }
4319 #endif
4320
4321
#if 0
/* Disabled.  Returns the smallest blast_evalue over the ungapped
   segments of an alignment, starting from 1000.0.  A segment is
   evaluated at each gap between exons, at each run of deletions, and
   at the end of the array; only mismatches where the query base
   differs from both genome and genomealt are counted. */
double
Pair_min_evalue (struct T *pairarray, int npairs) {
  double best_evalue = 1000.0, evalue;
  bool in_exon = true;
  struct T *next, *scan, *pair = NULL;
  int alignlength_trim, seg_querystart = -1, seg_queryend;
  int nmismatches_bothdiff = 0, k = 0;
  int prev_querypos = -1;

  next = pairarray;
  seg_querystart = next->querypos + 1;

  while (k < npairs) {
    pair = next++;
    k++;

    if (pair->gapp) {
      if (in_exon == true) {
        /* SPLICE START */
        /* NOTE(review): this look-ahead over the gap pairs is never used afterwards */
        for (scan = next; scan->gapp; scan++) {
        }
        seg_queryend = prev_querypos + 1;

        alignlength_trim = seg_queryend - seg_querystart;
        assert(alignlength_trim >= 0);
        evalue = blast_evalue(alignlength_trim,nmismatches_bothdiff);
        if (evalue < best_evalue) {
          best_evalue = evalue;
        }

        nmismatches_bothdiff = 0;
        in_exon = false;
      }

    } else if (pair->comp == INTRONGAP_COMP) {
      /* May want to print dinucleotides */

    } else {
      /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
         SHORTGAP_COMP, or MISMATCH_COMP */
      if (in_exon == false) {
        /* SPLICE CONTINUATION */
        seg_querystart = pair->querypos + 1;
        in_exon = true;
      }

      if (pair->comp != INDEL_COMP && pair->comp != SHORTGAP_COMP) {
        /* Match or mismatch: count only bases differing from both genome and genomealt */
        if (pair->genome != pair->cdna && pair->genomealt != pair->cdna) {
          nmismatches_bothdiff++;
        }

      } else if (pair->genome == ' ') {
        /* INSERTION */
        seg_queryend = prev_querypos + 1;

        while (k < npairs && pair->gapp == false && pair->genome == ' ') {
          pair = next++;
          k++;
        }
        if (k < npairs) {
          next--;
          k--;
          pair = next;
        }

        seg_querystart = pair->querypos + 1;
        nmismatches_bothdiff = 0;

      } else if (pair->cdna == ' ') {
        /* DELETION */
        seg_queryend = prev_querypos + 1;

        while (k < npairs && pair->gapp == false && pair->cdna == ' ') {
          pair = next++;
          k++;
        }
        if (k < npairs) {
          next--;
          k--;
          pair = next;
        }

        /* Finish rest of this line */
        alignlength_trim = seg_queryend - seg_querystart;
        assert(alignlength_trim >= 0);
        evalue = blast_evalue(alignlength_trim,nmismatches_bothdiff);
        if (evalue < best_evalue) {
          best_evalue = evalue;
        }

        seg_querystart = pair->querypos + 1;
        nmismatches_bothdiff = 0;

      } else {
        fprintf(stderr,"Error at %c%c%c\n",pair->genome,pair->comp,pair->cdna);
        exit(9);
      }
    }

    if (pair->cdna != ' ') {
      prev_querypos = pair->querypos;
    }
  }

  /* Final segment */
  seg_queryend = prev_querypos + 1;

  alignlength_trim = seg_queryend - seg_querystart;
  assert(alignlength_trim >= 0);
  evalue = blast_evalue(alignlength_trim,nmismatches_bothdiff);
  if (evalue < best_evalue) {
    best_evalue = evalue;
  }

  return best_evalue;
}
#endif
4451
4452
4453 /* Modified from print_endtypes */
4454 static void
splice_site_probs(double * sense_prob,double * antisense_prob,bool prev_splicesitep,bool splicesitep,Univcoord_T chroffset,int exon_genomestart,int exon_genomeend,bool watsonp)4455 splice_site_probs (double *sense_prob, double *antisense_prob,
4456 bool prev_splicesitep, bool splicesitep, Univcoord_T chroffset,
4457 int exon_genomestart, int exon_genomeend, bool watsonp) {
4458
4459 if (prev_splicesitep == true) {
4460 if (watsonp == true) {
4461 /* printf("watsonp is true, so looking up acceptor/antidonor at %u+%u-1\n",chroffset,exon_genomestart); */
4462 *sense_prob += Maxent_hr_acceptor_prob(chroffset+exon_genomestart-1,chroffset);
4463 *antisense_prob += Maxent_hr_antidonor_prob(chroffset+exon_genomestart-1,chroffset);
4464 } else {
4465 /* printf("watsonp is false, so looking up antiacceptor/donor at %u+%u\n",chroffset,exon_genomestart); */
4466 *sense_prob += Maxent_hr_antiacceptor_prob(chroffset+exon_genomestart,chroffset);
4467 *antisense_prob += Maxent_hr_donor_prob(chroffset+exon_genomestart,chroffset);
4468 }
4469 }
4470
4471 if (splicesitep == true) {
4472 if (watsonp == true) {
4473 /* printf("watsonp is true, so looking up donor/antiacceptor at %u+%u\n",chroffset,exon_genomeend); */
4474 *sense_prob += Maxent_hr_donor_prob(chroffset+exon_genomeend,chroffset);
4475 *antisense_prob += Maxent_hr_antiacceptor_prob(chroffset+exon_genomeend,chroffset);
4476 } else {
4477 /* printf("watsonp is false, so looking up antiacceptor/donor at %u+%u-1\n",chroffset,exon_genomeend); */
4478 *sense_prob += Maxent_hr_antidonor_prob(chroffset+exon_genomeend-1,chroffset);
4479 *antisense_prob += Maxent_hr_acceptor_prob(chroffset+exon_genomeend-1,chroffset);
4480 }
4481 }
4482 /* printf("sense %g, antisense %g\n",*sense_prob,*antisense_prob); */
4483
4484 return;
4485 }
4486
4487
/* Modified from Pair_print_gsnap */
/* Guesses the cDNA direction of an alignment from its splice sites.
   Walks the pair array, and for each segment (delimited by gaps and by
   runs of indels) calls splice_site_probs() to accumulate a sense and an
   antisense total.  Sets *sensedir and returns:
     +1 with SENSE_FORWARD when the sense total is larger,
     -1 with SENSE_ANTI when the antisense total is larger,
      0 with SENSE_NULL when they are equal or when pairs_querydir is NULL.
   invertedp == true is not supported: the function prints a message and
   calls abort(). */
int
Pair_guess_cdna_direction_array (int *sensedir, struct T *pairs_querydir, int npairs, bool invertedp,
                                 Univcoord_T chroffset, bool watsonp) {
  double sense_prob = 0.0, antisense_prob = 0.0;
  bool in_exon = true;
  struct T *pairs, *ptr, *this = NULL;
  int i;
  Chrpos_T exon_genomestart = 0, exon_genomeend;
  Chrpos_T last_genomepos = (Chrpos_T) -1;
  bool splicesitep, prev_splicesitep;


  if (invertedp == true) {
    fprintf(stderr,"Pair_guess_cdna_direction cannot handle invertedp\n");
    /* pairs = invert_and_revcomp_path_and_coords(pairs_querydir,npairs,querylength); */
    /* watsonp = !watsonp; */
    abort();
  } else {
    pairs = pairs_querydir;
  }

  if (pairs == NULL) {
    *sensedir = SENSE_NULL;
    return 0;
  } else {
    /* The first pair is read here, before npairs is checked */
    ptr = pairs;
    exon_genomestart = ptr->genomepos + 1;
    splicesitep = false;
  }

  i = 0;
  while (i < npairs) {
    this = ptr++;
    i++;

    if (this->gapp) {
      if (in_exon == true) {
        /* SPLICE START */
#if 0
        ptr0 = ptr;
        while (ptr0->gapp) {
          ptr0++;
        }
#endif
        exon_genomeend = last_genomepos + 1;

        /* The segment that just ended is followed by a splice */
        prev_splicesitep = splicesitep;
        splicesitep = true;

        splice_site_probs(&sense_prob,&antisense_prob,
                          prev_splicesitep,splicesitep,chroffset,
                          exon_genomestart,exon_genomeend,watsonp);

        in_exon = false;
      }
    } else if (this->comp == INTRONGAP_COMP) {
      /* May want to print dinucleotides */

    } else {
      /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
         SHORTGAP_COMP, or MISMATCH_COMP */
      if (in_exon == false) {
        /* SPLICE CONTINUATION */
        exon_genomestart = this->genomepos + 1;
        in_exon = true;
      }
      if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
        if (this->genome == ' ') {
          /* INSERTION */
          /* An indel ends the segment, but not at a splice site */
          exon_genomeend = last_genomepos + 1;
          prev_splicesitep = splicesitep;
          splicesitep = false;

          /* Skip over the run of insertion pairs */
          while (i < npairs && this->gapp == false && this->genome == ' ') {
            this = ptr++;
            i++;
          }
          /* Back up one so the main loop revisits the pair that ended the run */
          if (i < npairs) {
            ptr--;
            i--;
            this = ptr;
          }

          splice_site_probs(&sense_prob,&antisense_prob,
                            prev_splicesitep,splicesitep,chroffset,
                            exon_genomestart,exon_genomeend,watsonp);

          exon_genomestart = this->genomepos + 1;

        } else if (this->cdna == ' ') {
          /* DELETION */
          /* An indel ends the segment, but not at a splice site */
          exon_genomeend = last_genomepos + 1;
          prev_splicesitep = splicesitep;
          splicesitep = false;

          /* Skip over the run of deletion pairs */
          while (i < npairs && this->gapp == false && this->cdna == ' ') {
            this = ptr++;
            i++;
          }
          /* Back up one so the main loop revisits the pair that ended the run */
          if (i < npairs) {
            ptr--;
            i--;
            this = ptr;
          }

          splice_site_probs(&sense_prob,&antisense_prob,
                            prev_splicesitep,splicesitep,chroffset,
                            exon_genomestart,exon_genomeend,watsonp);

          exon_genomestart = this->genomepos + 1;

        } else {
          fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
          exit(9);
        }

      }
    }

    /* Remember the last genomic position that carried a genomic character */
    if (this->genome != ' ') {
      last_genomepos = this->genomepos;
    }
  }

  /* Final segment: it has no splice after it */
  exon_genomeend = last_genomepos + 1;
  prev_splicesitep = splicesitep;
  splicesitep = false;

  splice_site_probs(&sense_prob,&antisense_prob,
                    prev_splicesitep,splicesitep,chroffset,
                    exon_genomestart,exon_genomeend,watsonp);

  /* NOTE(review): cannot be reached with invertedp == true, because that
     case calls abort() at the top of the function */
  if (invertedp == true) {
    FREE(pairs);
  }

  if (sense_prob > antisense_prob) {
    *sensedir = SENSE_FORWARD;
    return +1;
  } else if (sense_prob < antisense_prob) {
    *sensedir = SENSE_ANTI;
    return -1;
  } else {
    *sensedir = SENSE_NULL;
    return 0;
  }
}
4636
4637
#if 0
/* Disabled (compiled out by the surrounding #if 0).  Returns the genomic
   character at chroffset + genomicpos, storing a second character in
   *g_alt.  Positions below chroffset or at/above chrhigh yield '*'.  On
   the non-watson path both characters are mapped through complCode. */
static char
get_genomic_nt_array (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
                      bool watsonp) {
  char c2, c2_alt;
  Univcoord_T pos;

  if (watsonp) {
    if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
      *g_alt = '*';
      return '*';

    } else if (pos >= chrhigh) {
      *g_alt = '*';
      return '*';

    } else {
      return Genome_get_char_blocks(&(*g_alt),pos);
    }

  } else {
    /* coordinates already processed by Pair_set_genomepos */
    /* NOTE(review): the two out-of-range returns below leave *g_alt
       unset, unlike the watson branch -- confirm whether that is intended */
    if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
      return '*';

    } else if (pos >= chrhigh) {
      return '*';

    } else {
      c2 = Genome_get_char_blocks(&c2_alt,pos);
    }
    *g_alt = complCode[(int) c2_alt];
    return complCode[(int) c2];
  }
}
#endif
4674
4675
4676 void
Pair_fix_cdna_direction_array(struct T * pairs_querydir,int npairs,int cdna_direction)4677 Pair_fix_cdna_direction_array (struct T *pairs_querydir, int npairs, int cdna_direction) {
4678 struct T *ptr, *this = NULL;
4679 int i;
4680
4681 ptr = pairs_querydir;
4682 i = 0;
4683
4684 while (i < npairs) {
4685 this = ptr++;
4686 i++;
4687
4688 if (this->gapp && this->comp == NONINTRON_COMP) {
4689 if (cdna_direction > 0) {
4690 switch (this->introntype) {
4691 case GTAG_FWD: this->comp = FWD_CANONICAL_INTRON_COMP; break;
4692 case GCAG_FWD: this->comp = FWD_GCAG_INTRON_COMP; break;
4693 case ATAC_FWD: this->comp = FWD_ATAC_INTRON_COMP; break;
4694 default: this->comp = NONINTRON_COMP;
4695 }
4696 #ifndef PMAP
4697 } else if (cdna_direction < 0) {
4698 switch (this->introntype) {
4699 case ATAC_REV: this->comp = REV_ATAC_INTRON_COMP; break;
4700 case GCAG_REV: this->comp = REV_GCAG_INTRON_COMP; break;
4701 case GTAG_REV: this->comp = REV_CANONICAL_INTRON_COMP; break;
4702 default: this->comp = NONINTRON_COMP; break;
4703 }
4704 #endif
4705 }
4706 }
4707 }
4708
4709 return;
4710 }
4711
4712
4713
4714 int
Pair_gsnap_nsegments(int * total_nmismatches,int * total_nindels,int * nintrons,int * nindelbreaks,struct T * pairs,int npairs,int querylength)4715 Pair_gsnap_nsegments (int *total_nmismatches, int *total_nindels, int *nintrons,
4716 int *nindelbreaks, struct T *pairs, int npairs, int querylength) {
4717 int nsegments = 0;
4718 bool in_exon = true;
4719 struct T *ptr, *ptr0, *this = NULL;
4720 int i;
4721
4722 ptr = pairs;
4723 *total_nindels = 0;
4724 *nintrons = 0;
4725 *nindelbreaks = 0;
4726
4727 /* *total_nmismatches = 0; */
4728 *total_nmismatches = pairs[0].querypos + (querylength - pairs[npairs-1].querypos);
4729
4730 i = 0;
4731 while (i < npairs) {
4732 this = ptr++;
4733 i++;
4734
4735 if (this->gapp) {
4736 if (in_exon == true) {
4737 /* SPLICE START */
4738 ptr0 = ptr;
4739 while (ptr0->gapp) {
4740 ptr0++;
4741 }
4742
4743 (*nintrons) += 1;
4744 nsegments++;
4745
4746 in_exon = false;
4747 }
4748 } else if (this->comp == INTRONGAP_COMP) {
4749 /* May want to print dinucleotides */
4750
4751 } else {
4752 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
4753 SHORTGAP_COMP, or MISMATCH_COMP */
4754 if (in_exon == false) {
4755 /* SPLICE CONTINUATION */
4756 in_exon = true;
4757 }
4758 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
4759 if (this->genome == ' ') {
4760 /* INSERTION */
4761 while (i < npairs && this->genome == ' ') {
4762 (*total_nindels) += 1;
4763 this = ptr++;
4764 i++;
4765 }
4766 if (i < npairs) {
4767 ptr--;
4768 i--;
4769 }
4770
4771 (*nindelbreaks) += 1;
4772 nsegments++;
4773
4774 } else if (this->cdna == ' ') {
4775 /* DELETION */
4776 while (i < npairs && this->cdna == ' ') {
4777 (*total_nindels) += 1;
4778 this = ptr++;
4779 i++;
4780 }
4781 if (i < npairs) {
4782 ptr--;
4783 i--;
4784 }
4785
4786 (*nindelbreaks) += 1;
4787 nsegments++;
4788
4789 } else {
4790 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
4791 exit(9);
4792 }
4793
4794 } else if (this->genome != this->cdna) {
4795 (*total_nmismatches) += 1;
4796 }
4797 }
4798 }
4799
4800 nsegments++;
4801
4802 return nsegments;
4803 }
4804
4805
4806
4807 /************************************************************************
4808 * SAM
4809 ************************************************************************/
4810
4811 /* Modeled after Shortread_print_chopped */
4812 static void
print_chopped(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)4813 print_chopped (Filestring_T fp, char *contents, int querylength,
4814 int hardclip_start, int hardclip_end) {
4815 int i;
4816
4817 for (i = hardclip_start; i < querylength - hardclip_end; i++) {
4818 PUTC(contents[i],fp);
4819 }
4820 return;
4821 }
4822
4823 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
4824 static void
print_chopped_revcomp(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)4825 print_chopped_revcomp (Filestring_T fp, char *contents, int querylength,
4826 int hardclip_start, int hardclip_end) {
4827 int i;
4828
4829 for (i = querylength - 1 - hardclip_end; i >= hardclip_start; --i) {
4830 PUTC(complCode[(int) contents[i]],fp);
4831 }
4832 return;
4833 }
4834
4835
4836 static void
print_chopped_end(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)4837 print_chopped_end (Filestring_T fp, char *contents, int querylength,
4838 int hardclip_start, int hardclip_end) {
4839 int i;
4840
4841 for (i = 0; i < hardclip_start; i++) {
4842 PUTC(contents[i],fp);
4843 }
4844
4845 /* No separator */
4846
4847 for (i = querylength - hardclip_end; i < querylength; i++) {
4848 PUTC(contents[i],fp);
4849 }
4850
4851 return;
4852 }
4853
4854 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
4855 static void
print_chopped_end_revcomp(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)4856 print_chopped_end_revcomp (Filestring_T fp, char *contents, int querylength,
4857 int hardclip_start, int hardclip_end) {
4858 int i;
4859
4860 for (i = querylength - 1; i >= querylength - hardclip_end; --i) {
4861 PUTC(complCode[(int) contents[i]],fp);
4862 }
4863
4864 /* No separator */
4865
4866 for (i = hardclip_start - 1; i >= 0; --i) {
4867 PUTC(complCode[(int) contents[i]],fp);
4868 }
4869
4870 return;
4871 }
4872
4873
4874 static void
print_chopped_end_quality(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end)4875 print_chopped_end_quality (Filestring_T fp, char *quality, int querylength,
4876 int hardclip_start, int hardclip_end) {
4877 int i;
4878
4879 if (hardclip_start > 0) {
4880 for (i = 0; i < hardclip_start; i++) {
4881 PUTC(quality[i],fp);
4882 }
4883 return;
4884
4885 } else {
4886 for (i = querylength - hardclip_end; i < querylength; i++) {
4887 PUTC(quality[i],fp);
4888 }
4889 return;
4890 }
4891 }
4892
4893 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
4894 static void
print_chopped_end_quality_reverse(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end)4895 print_chopped_end_quality_reverse (Filestring_T fp, char *quality, int querylength,
4896 int hardclip_start, int hardclip_end) {
4897 int i;
4898
4899 if (hardclip_start > 0) {
4900 for (i = hardclip_start - 1; i >= 0; --i) {
4901 PUTC(quality[i],fp);
4902 }
4903 return;
4904
4905 } else {
4906 for (i = querylength - 1; i >= querylength - hardclip_end; --i) {
4907 PUTC(quality[i],fp);
4908 }
4909 return;
4910 }
4911 }
4912
4913
4914
4915 /* Modeled after Shortread_print_quality */
4916 static void
print_quality(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end,int shift)4917 print_quality (Filestring_T fp, char *quality, int querylength,
4918 int hardclip_start, int hardclip_end, int shift) {
4919 int i;
4920 int c;
4921
4922 if (quality == NULL) {
4923 PUTC('*',fp);
4924 } else {
4925 for (i = hardclip_start; i < querylength - hardclip_end; i++) {
4926 if ((c = quality[i] + shift) <= 32) {
4927 fprintf(stderr,"Warning: With a quality-print-shift of %d, QC score %c becomes non-printable. May need to specify --quality-protocol or --quality-print-shift\n",
4928 shift,quality[i]);
4929 abort();
4930 } else {
4931 PUTC(c,fp);
4932 }
4933 }
4934 }
4935 return;
4936 }
4937
4938
4939 static void
print_quality_revcomp(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end,int shift)4940 print_quality_revcomp (Filestring_T fp, char *quality, int querylength,
4941 int hardclip_start, int hardclip_end, int shift) {
4942 int i;
4943 int c;
4944
4945 if (quality == NULL) {
4946 PUTC('*',fp);
4947 } else {
4948 for (i = querylength - 1 - hardclip_end; i >= hardclip_start; --i) {
4949 if ((c = quality[i] + shift) <= 32) {
4950 fprintf(stderr,"Warning: With a quality-print-shift of %d, QC score %c becomes non-printable. May need to specify --quality-protocol or --quality-print-shift\n",
4951 shift,quality[i]);
4952 abort();
4953 } else {
4954 PUTC(c,fp);
4955 }
4956 }
4957 }
4958
4959 return;
4960 }
4961
4962
4963 /* Only for GMAP program */
4964 static unsigned int
compute_sam_flag_nomate(int npaths,bool first_read_p,bool watsonp,bool sam_paired_p)4965 compute_sam_flag_nomate (int npaths, bool first_read_p, bool watsonp, bool sam_paired_p) {
4966 unsigned int flag = 0U;
4967
4968 if (sam_paired_p == true) {
4969 flag |= PAIRED_READ;
4970 if (first_read_p == true) {
4971 flag |= FIRST_READ_P;
4972 } else {
4973 flag |= SECOND_READ_P;
4974 }
4975 }
4976
4977 if (npaths == 0) {
4978 flag |= QUERY_UNMAPPED;
4979 } else if (watsonp == false) {
4980 flag |= QUERY_MINUSP;
4981 }
4982
4983 #if 0
4984 /* Will let external program decide what is primary */
4985 if (pathnum > 1) {
4986 flag |= NOT_PRIMARY;
4987 }
4988 #endif
4989
4990 return flag;
4991 }
4992
4993
4994
4995 void
Pair_print_sam_nomapping(Filestring_T fp,char * abbrev,char * acc1,char * acc2,char * queryseq_ptr,char * quality_string,int querylength,int quality_shift,bool first_read_p,bool sam_paired_p,char * sam_read_group_id)4996 Pair_print_sam_nomapping (Filestring_T fp, char *abbrev, char *acc1, char *acc2, char *queryseq_ptr,
4997 char *quality_string, int querylength, int quality_shift,
4998 bool first_read_p, bool sam_paired_p, char *sam_read_group_id) {
4999 unsigned int flag;
5000
5001 /* 1. QNAME */
5002 if (acc2 == NULL) {
5003 FPRINTF(fp,"%s",acc1);
5004 } else {
5005 FPRINTF(fp,"%s,%s",acc1,acc2);
5006 }
5007
5008 /* 2. FLAG */
5009 flag = compute_sam_flag_nomate(/*npaths*/0,first_read_p,/*watsonp*/true,sam_paired_p);
5010 FPRINTF(fp,"\t%u",flag);
5011
5012 /* 3. RNAME: chr */
5013 FPRINTF(fp,"\t*");
5014
5015 /* 4. POS: chrpos */
5016 FPRINTF(fp,"\t0");
5017
5018 /* 5. MAPQ: Mapping quality */
5019 /* Picard says MAPQ should be 0 for an unmapped read */
5020 FPRINTF(fp,"\t0");
5021
5022 /* 6. CIGAR */
5023 FPRINTF(fp,"\t*");
5024
5025 /* 7. MRNM: Mate chr */
5026 /* 8. MPOS: Mate chrpos */
5027 /* 9. ISIZE: Insert size */
5028 FPRINTF(fp,"\t*\t0\t0\t");
5029
5030 /* 10. SEQ: queryseq and 11. QUAL: quality scores */
5031 print_chopped(fp,queryseq_ptr,querylength,/*hardclip_start*/0,/*hardclip_end*/0);
5032 FPRINTF(fp,"\t");
5033 print_quality(fp,quality_string,querylength,/*hardclip_start*/0,/*hardclip_end*/0,
5034 quality_shift);
5035
5036 /* 12. TAGS: RG */
5037 if (sam_read_group_id != NULL) {
5038 FPRINTF(fp,"\tRG:Z:%s",sam_read_group_id);
5039 }
5040
5041 /* 12. TAGS: XO */
5042 FPRINTF(fp,"\tXO:Z:%s",abbrev);
5043
5044 FPRINTF(fp,"\n");
5045
5046 return;
5047 }
5048
5049
5050
#if 0
/* Disabled (compiled out by the surrounding #if 0).  Maps the sign of
   cdna_direction to a sense code: positive gives SENSE_FORWARD, negative
   gives SENSE_ANTI, and zero gives SENSE_NULL. */
static int
sensedir_from_cdna_direction (int cdna_direction) {
  if (cdna_direction > 0) {
    return SENSE_FORWARD;
  } else if (cdna_direction < 0) {
    return SENSE_ANTI;
  } else {
    return SENSE_NULL;
  }
}
#endif
5063
5064
5065 void
Pair_alias_circular(struct T * pairs,int npairs,Chrpos_T chrlength)5066 Pair_alias_circular (struct T *pairs, int npairs, Chrpos_T chrlength) {
5067 int i;
5068 struct T *ptr;
5069
5070 i = 0;
5071 ptr = pairs;
5072 while (i < npairs) {
5073 assert(ptr->genomepos < chrlength);
5074 ptr->genomepos += chrlength;
5075 i++;
5076 ptr++;
5077 }
5078
5079 return;
5080 }
5081
5082 void
Pair_unalias_circular(struct T * pairs,int npairs,Chrpos_T chrlength)5083 Pair_unalias_circular (struct T *pairs, int npairs, Chrpos_T chrlength) {
5084 int i;
5085 struct T *ptr;
5086
5087 i = 0;
5088 ptr = pairs;
5089 while (i < npairs) {
5090 assert(ptr->genomepos >= chrlength);
5091 ptr->genomepos -= chrlength;
5092 i++;
5093 ptr++;
5094 }
5095
5096 return;
5097 }
5098
5099
5100 static List_T
clean_cigar(List_T tokens,bool watsonp)5101 clean_cigar (List_T tokens, bool watsonp) {
5102 List_T clean, unique = NULL, p;
5103 char token[11], *curr_token, *last_token;
5104 int length = 0;
5105 char type, last_type = ' ';
5106 bool duplicatep = false;
5107
5108 for (p = tokens; p != NULL; p = List_next(p)) {
5109 curr_token = (char *) List_head(p);
5110 type = curr_token[strlen(curr_token)-1];
5111 if (type == last_type) {
5112 length += atoi(last_token);
5113 FREE_OUT(last_token);
5114 duplicatep = true;
5115 } else {
5116 if (last_type == ' ') {
5117 /* Skip */
5118 } else if (duplicatep == false) {
5119 unique = List_push_out(unique,(void *) last_token);
5120 } else {
5121 length += atoi(last_token);
5122 FREE_OUT(last_token);
5123 sprintf(token,"%d%c",length,last_type);
5124 unique = push_token(unique,token);
5125 }
5126 last_type = type;
5127 duplicatep = false;
5128 length = 0;
5129 }
5130 last_token = curr_token;
5131 }
5132 if (last_type == ' ') {
5133 /* Skip */
5134 } else if (duplicatep == false) {
5135 unique = List_push_out(unique,(void *) last_token);
5136 } else {
5137 length += atoi(last_token);
5138 FREE_OUT(last_token);
5139 sprintf(token,"%d%c",length,last_type);
5140 unique = push_token(unique,token);
5141 }
5142 List_free_out(&tokens);
5143
5144
5145 if (sam_insert_0M_p == false) {
5146 /* Return result */
5147 if (watsonp) {
5148 /* Put tokens in forward order */
5149 return unique;
5150 } else {
5151 /* Keep tokens in reverse order */
5152 return List_reverse(unique);
5153 }
5154
5155 } else {
5156 /* Insert "0M" between adjacent I and D operations */
5157 last_type = ' ';
5158 clean = (List_T) NULL;
5159 for (p = unique; p != NULL; p = List_next(p)) {
5160 curr_token = (char *) List_head(p);
5161 type = curr_token[strlen(curr_token)-1];
5162 if (last_type == 'I' && type == 'D') {
5163 clean = push_token(clean,"0M");
5164 } else if (last_type == 'D' && type == 'I') {
5165 clean = push_token(clean,"0M");
5166 }
5167 clean = List_push_out(clean,(void *) curr_token);
5168 last_type = type;
5169 }
5170 List_free_out(&unique);
5171
5172 /* Return result */
5173 if (watsonp) {
5174 /* Put tokens in forward order */
5175 return List_reverse(clean);
5176 } else {
5177 /* Keep tokens in reverse order */
5178 return clean;
5179 }
5180 }
5181 }
5182
5183
5184 /* Derived from print_tokens_gff3 */
5185 int
Pair_cigar_length(List_T tokens)5186 Pair_cigar_length (List_T tokens) {
5187 int length = 0, tokenlength;
5188 List_T p;
5189 char *token;
5190 char type;
5191
5192 for (p = tokens; p != NULL; p = List_next(p)) {
5193 token = (char *) List_head(p);
5194 type = token[strlen(token)-1];
5195 /* Should include 'H', but that gets added according to hardclip_low and hardclip_high */
5196 if (type == 'S' || type == 'I' || type == 'M' || type == 'X' || type == '=') {
5197 sscanf(token,"%d",&tokenlength);
5198 length += tokenlength;
5199 }
5200 }
5201
5202 return length;
5203 }
5204
5205 /* Derived from print_tokens_gff3 */
5206 void
Pair_print_tokens(Filestring_T fp,List_T tokens)5207 Pair_print_tokens (Filestring_T fp, List_T tokens) {
5208 List_T p;
5209 char *token;
5210
5211 for (p = tokens; p != NULL; p = List_next(p)) {
5212 token = (char *) List_head(p);
5213 FPRINTF(fp,"%s",token);
5214 /* FREE_OUT(token); -- Now freed within Stage3end_free or Stage3_free */
5215 }
5216
5217 return;
5218 }
5219
5220
5221
5222 static List_T
compute_cigar_standard(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int sensedir,int chimera_part)5223 compute_cigar_standard (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
5224 bool watsonp,
5225 #ifdef CONVERT_INTRONS_TO_DELETIONS
5226 int sensedir,
5227 #endif
5228 int chimera_part) {
5229 List_T tokens = NULL;
5230 char token[11];
5231 int Mlength = 0, Ilength = 0, Dlength = 0;
5232 bool in_exon = false, deletionp;
5233 struct T *ptr, *prev, *this = NULL;
5234 int exon_queryend = -1;
5235 Chrpos_T exon_genomestart = 0;
5236 Chrpos_T exon_genomeend, genome_gap;
5237 int query_gap;
5238 int last_querypos = -1;
5239 Chrpos_T last_genomepos = (Chrpos_T) -1;
5240 int i;
5241
5242 /* *chimera_hardclip_start = *chimera_hardclip_high = 0; */
5243 *intronp = false;
5244
5245 ptr = pairs;
5246
5247 if (chimera_part == +1) {
5248 if (ptr->querypos > *hardclip_start) {
5249 if (ptr->querypos > 0) {
5250 /* Clip to beginning */
5251 *hardclip_start = ptr->querypos;
5252 sprintf(token,"%dH",*hardclip_start);
5253 tokens = push_token(tokens,token);
5254 }
5255 } else {
5256 if (*hardclip_start > 0) {
5257 /* Clip to hard clip boundary */
5258 sprintf(token,"%dH",*hardclip_start);
5259 tokens = push_token(tokens,token);
5260 }
5261 }
5262 } else {
5263 if (*hardclip_start > 0) {
5264 sprintf(token,"%dH",*hardclip_start);
5265 tokens = push_token(tokens,token);
5266 }
5267 if (ptr->querypos > (*hardclip_start)) {
5268 sprintf(token,"%dS",ptr->querypos - (*hardclip_start));
5269 tokens = push_token(tokens,token);
5270 }
5271 }
5272
5273 this = (T) NULL;
5274 for (i = 0; i < npairs; i++) {
5275 prev = this;
5276 this = ptr++;
5277
5278 #if 0
5279 /* Cigar_print_tokens(stdout,tokens); */
5280 Pair_dump_one(this,true);
5281 printf("\n");
5282 #endif
5283
5284 if (this->gapp) {
5285 if (in_exon == true) {
5286 exon_queryend = last_querypos + 1;
5287 exon_genomeend = last_genomepos + 1;
5288 #if 0
5289 if (watsonp) {
5290 intron_start = exon_genomeend + 1;
5291 } else {
5292 intron_start = exon_genomeend - 1;
5293 }
5294 #endif
5295
5296 if (Mlength > 0) {
5297 sprintf(token,"%dM",Mlength);
5298 tokens = push_token(tokens,token);
5299 } else if (Ilength > 0) {
5300 sprintf(token,"%dI",Ilength);
5301 tokens = push_token(tokens,token);
5302 } else if (Dlength > 0) {
5303 sprintf(token,"%dD",Dlength);
5304 tokens = push_token(tokens,token);
5305 }
5306
5307 Mlength = Ilength = Dlength = 0;
5308
5309 in_exon = false;
5310 }
5311
5312 } else if (this->comp == INTRONGAP_COMP) {
5313 /* Do nothing */
5314
5315 } else {
5316 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
5317 SHORTGAP_COMP, or MISMATCH_COMP */
5318 if (in_exon == false) {
5319 /* exon_querystart = this->querypos + 1; */
5320 exon_genomestart = this->genomepos + 1;
5321
5322 if (prev != NULL) {
5323 /* Gap */
5324 /* abs() gives a large value when flag -m64 is specified */
5325 /* genome_gap = abs(intron_end - intron_start) + 1; */
5326 if (watsonp) {
5327 /* intron_end = exon_genomestart - 1; */
5328 /* genome_gap = (intron_end - intron_start) + 1; */
5329 genome_gap = exon_genomestart - exon_genomeend - 1;
5330 } else {
5331 /* intron_end = exon_genomestart + 1; */
5332 /* genome_gap = (intron_start - intron_end) + 1; */
5333 genome_gap = exon_genomeend - exon_genomestart - 1;
5334 }
5335
5336 deletionp = false;
5337 #ifdef CONVERT_INTRONS_TO_DELETIONS
5338 if (sensedir == SENSE_FORWARD) {
5339 if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
5340 prev->comp == FWD_GCAG_INTRON_COMP ||
5341 prev->comp == FWD_ATAC_INTRON_COMP) {
5342 sprintf(token,"%uN",genome_gap);
5343 *intronp = true;
5344 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
5345 sprintf(token,"%uN",genome_gap);
5346 *intronp = true;
5347 } else {
5348 sprintf(token,"%uD",genome_gap);
5349 deletionp = true;
5350 }
5351 } else if (sensedir == SENSE_ANTI) {
5352 if (prev->comp == REV_CANONICAL_INTRON_COMP ||
5353 prev->comp == REV_GCAG_INTRON_COMP ||
5354 prev->comp == REV_ATAC_INTRON_COMP) {
5355 sprintf(token,"%uN",genome_gap);
5356 *intronp = true;
5357 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
5358 sprintf(token,"%uN",genome_gap);
5359 *intronp = true;
5360 } else {
5361 sprintf(token,"%uD",genome_gap);
5362 deletionp = true;
5363 }
5364 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
5365 sprintf(token,"%uN",genome_gap);
5366 *intronp = true;
5367 } else {
5368 sprintf(token,"%uD",genome_gap);
5369 deletionp = true;
5370 }
5371 #else
5372 sprintf(token,"%uN",genome_gap);
5373 *intronp = true;
5374 #endif
5375 tokens = push_token(tokens,token);
5376
5377 /* Check for dual gap. Doesn't work for hard clipping. */
5378 /* assert(exon_queryend >= 0); */
5379
5380 query_gap = this->querypos - exon_queryend;
5381 assert(query_gap >= 0);
5382 if (query_gap > 0) {
5383 if (deletionp == true && sam_insert_0M_p == true) {
5384 /* Put zero matches between deletion and insertion, since some programs will complain */
5385 sprintf(token,"0M");
5386 tokens = push_token(tokens,token);
5387 }
5388
5389 sprintf(token,"%uI",query_gap);
5390 tokens = push_token(tokens,token);
5391 }
5392 }
5393
5394 in_exon = true;
5395 }
5396
5397 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
5398 /* Gap in upper or lower sequence */
5399 if (this->genome == ' ') {
5400 /* Insertion relative to genome */
5401 if (Mlength > 0) {
5402 sprintf(token,"%dM",Mlength);
5403 tokens = push_token(tokens,token);
5404 Mlength = 0;
5405 } else if (Dlength > 0) {
5406 /* unlikely */
5407 sprintf(token,"%dD",Dlength);
5408 tokens = push_token(tokens,token);
5409 Dlength = 0;
5410 }
5411 Ilength++;
5412 } else if (this->cdna == ' ') {
5413 /* Deletion relative to genome */
5414 if (Mlength > 0) {
5415 sprintf(token,"%dM",Mlength);
5416 tokens = push_token(tokens,token);
5417 Mlength = 0;
5418 } else if (Ilength > 0) {
5419 sprintf(token,"%dI",Ilength);
5420 tokens = push_token(tokens,token);
5421 Ilength = 0;
5422 }
5423 Dlength++;
5424 } else {
5425 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
5426 exit(9);
5427 }
5428
5429 } else {
5430 /* Count even if unknown base */
5431
5432 if (Ilength > 0) {
5433 sprintf(token,"%dI",Ilength);
5434 tokens = push_token(tokens,token);
5435 Ilength = 0;
5436 } else if (Dlength > 0) {
5437 sprintf(token,"%dD",Dlength);
5438 tokens = push_token(tokens,token);
5439 Dlength = 0;
5440 }
5441 Mlength++;
5442
5443 }
5444 }
5445
5446 if (this != NULL) {
5447 if (this->cdna != ' ') {
5448 last_querypos = this->querypos;
5449 }
5450 if (this->genome != ' ') {
5451 last_genomepos = this->genomepos;
5452 }
5453 }
5454 }
5455
5456 /* prev = this; */
5457 /* exon_queryend = last_querypos + 1; */
5458 /* exon_genomeend = last_genomepos + 1; */
5459
5460 if (Mlength > 0) {
5461 sprintf(token,"%dM",Mlength);
5462 tokens = push_token(tokens,token);
5463 } else if (Ilength > 0) {
5464 sprintf(token,"%dI",Ilength);
5465 tokens = push_token(tokens,token);
5466 } else if (Dlength > 0) {
5467 sprintf(token,"%dD",Dlength);
5468 tokens = push_token(tokens,token);
5469 }
5470
5471
5472 /* Terminal clipping */
5473 if (chimera_part == -1) {
5474 if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
5475 if (last_querypos < querylength_given - 1) {
5476 /* Clip to end */
5477 *hardclip_end = querylength_given - 1 - last_querypos;
5478 sprintf(token,"%dH",*hardclip_end);
5479 tokens = push_token(tokens,token);
5480 }
5481 } else {
5482 if (*hardclip_end > 0) {
5483 /* Clip to hard clip boundary */
5484 sprintf(token,"%dH",*hardclip_end);
5485 tokens = push_token(tokens,token);
5486 }
5487 }
5488 } else {
5489 if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
5490 sprintf(token,"%dS",querylength_given - 1 - (*hardclip_end) - last_querypos);
5491 tokens = push_token(tokens,token);
5492 }
5493 if (*hardclip_end > 0) {
5494 sprintf(token,"%dH",*hardclip_end);
5495 tokens = push_token(tokens,token);
5496 }
5497 }
5498
5499 return clean_cigar(tokens,watsonp);
5500 }
5501
5502
5503 static List_T
compute_cigar_extended(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int sensedir,int chimera_part)5504 compute_cigar_extended (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
5505 bool watsonp,
5506 #ifdef CONVERT_INTRONS_TO_DELETIONS
5507 int sensedir,
5508 #endif
5509 int chimera_part) {
5510 List_T tokens = NULL;
5511 char token[11];
5512 int Elength = 0, Xlength = 0, Ilength = 0, Dlength = 0;
5513 bool in_exon = false, deletionp;
5514 struct T *ptr, *prev, *this = NULL;
5515 int exon_queryend = -1;
5516 Chrpos_T exon_genomestart = 0;
5517 Chrpos_T exon_genomeend, genome_gap;
5518 int query_gap;
5519 int last_querypos = -1;
5520 Chrpos_T last_genomepos = (Chrpos_T) -1;
5521 int i;
5522
5523 /* *chimera_hardclip_start = *chimera_hardclip_high = 0; */
5524 *intronp = false;
5525
5526 ptr = pairs;
5527
5528 if (chimera_part == +1) {
5529 if (ptr->querypos > *hardclip_start) {
5530 if (ptr->querypos > 0) {
5531 /* Clip to beginning */
5532 *hardclip_start = ptr->querypos;
5533 sprintf(token,"%dH",*hardclip_start);
5534 tokens = push_token(tokens,token);
5535 }
5536 } else {
5537 if (*hardclip_start > 0) {
5538 /* Clip to hard clip boundary */
5539 sprintf(token,"%dH",*hardclip_start);
5540 tokens = push_token(tokens,token);
5541 }
5542 }
5543 } else {
5544 if (*hardclip_start > 0) {
5545 sprintf(token,"%dH",*hardclip_start);
5546 tokens = push_token(tokens,token);
5547 }
5548 if (ptr->querypos > (*hardclip_start)) {
5549 sprintf(token,"%dS",ptr->querypos - (*hardclip_start));
5550 tokens = push_token(tokens,token);
5551 }
5552 }
5553
5554 this = (T) NULL;
5555 for (i = 0; i < npairs; i++) {
5556 prev = this;
5557 this = ptr++;
5558
5559 #if 0
5560 /* Cigar_print_tokens(stdout,tokens); */
5561 Pair_dump_one(this,true);
5562 printf("\n");
5563 #endif
5564
5565 if (this->gapp) {
5566 if (in_exon == true) {
5567 exon_queryend = last_querypos + 1;
5568 exon_genomeend = last_genomepos + 1;
5569 #if 0
5570 if (watsonp) {
5571 intron_start = exon_genomeend + 1;
5572 } else {
5573 intron_start = exon_genomeend - 1;
5574 }
5575 #endif
5576
5577 if (Elength > 0) {
5578 sprintf(token,"%d=",Elength);
5579 tokens = push_token(tokens,token);
5580 } else if (Xlength > 0) {
5581 sprintf(token,"%dX",Xlength);
5582 tokens = push_token(tokens,token);
5583 } else if (Ilength > 0) {
5584 sprintf(token,"%dI",Ilength);
5585 tokens = push_token(tokens,token);
5586 } else if (Dlength > 0) {
5587 sprintf(token,"%dD",Dlength);
5588 tokens = push_token(tokens,token);
5589 }
5590
5591 Elength = Xlength = Ilength = Dlength = 0;
5592
5593 in_exon = false;
5594 }
5595
5596 } else if (this->comp == INTRONGAP_COMP) {
5597 /* Do nothing */
5598
5599 } else {
5600 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
5601 SHORTGAP_COMP, or MISMATCH_COMP */
5602 if (in_exon == false) {
5603 /* exon_querystart = this->querypos + 1; */
5604 exon_genomestart = this->genomepos + 1;
5605
5606 if (prev != NULL) {
5607 /* Gap */
5608 /* abs() gives a large value when flag -m64 is specified */
5609 /* genome_gap = abs(intron_end - intron_start) + 1; */
5610 if (watsonp) {
5611 /* intron_end = exon_genomestart - 1; */
5612 /* genome_gap = (intron_end - intron_start) + 1; */
5613 genome_gap = exon_genomestart - exon_genomeend - 1;
5614 } else {
5615 /* intron_end = exon_genomestart + 1; */
5616 /* genome_gap = (intron_start - intron_end) + 1; */
5617 genome_gap = exon_genomeend - exon_genomestart - 1;
5618 }
5619
5620 deletionp = false;
5621 #ifdef CONVERT_INTRONS_TO_DELETIONS
5622 if (sensedir == SENSE_FORWARD) {
5623 if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
5624 prev->comp == FWD_GCAG_INTRON_COMP ||
5625 prev->comp == FWD_ATAC_INTRON_COMP) {
5626 sprintf(token,"%uN",genome_gap);
5627 *intronp = true;
5628 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
5629 sprintf(token,"%uN",genome_gap);
5630 *intronp = true;
5631 } else {
5632 sprintf(token,"%uD",genome_gap);
5633 deletionp = true;
5634 }
5635 } else if (sensedir == SENSE_ANTI) {
5636 if (prev->comp == REV_CANONICAL_INTRON_COMP ||
5637 prev->comp == REV_GCAG_INTRON_COMP ||
5638 prev->comp == REV_ATAC_INTRON_COMP) {
5639 sprintf(token,"%uN",genome_gap);
5640 *intronp = true;
5641 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
5642 sprintf(token,"%uN",genome_gap);
5643 *intronp = true;
5644 } else {
5645 sprintf(token,"%uD",genome_gap);
5646 deletionp = true;
5647 }
5648 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
5649 sprintf(token,"%uN",genome_gap);
5650 *intronp = true;
5651 } else {
5652 sprintf(token,"%uD",genome_gap);
5653 deletionp = true;
5654 }
5655 #else
5656 sprintf(token,"%uN",genome_gap);
5657 *intronp = true;
5658 #endif
5659 tokens = push_token(tokens,token);
5660
5661 /* Check for dual gap. Doesn't work for hard clipping. */
5662 /* assert(exon_queryend >= 0); */
5663
5664 query_gap = this->querypos - exon_queryend;
5665 assert(query_gap >= 0);
5666 if (query_gap > 0) {
5667 if (deletionp == true && sam_insert_0M_p == true) {
5668 /* Put zero matches between deletion and insertion, since some programs will complain */
5669 sprintf(token,"0M");
5670 tokens = push_token(tokens,token);
5671 }
5672
5673 sprintf(token,"%uI",query_gap);
5674 tokens = push_token(tokens,token);
5675 }
5676 }
5677
5678 in_exon = true;
5679 }
5680
5681 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
5682 /* Gap in upper or lower sequence */
5683 if (this->genome == ' ') {
5684 /* Insertion relative to genome */
5685 if (Elength > 0) {
5686 sprintf(token,"%d=",Elength);
5687 tokens = push_token(tokens,token);
5688 Elength = 0;
5689 } else if (Xlength > 0) {
5690 sprintf(token,"%dX",Xlength);
5691 tokens = push_token(tokens,token);
5692 Xlength = 0;
5693 } else if (Dlength > 0) {
5694 /* unlikely */
5695 sprintf(token,"%dD",Dlength);
5696 tokens = push_token(tokens,token);
5697 Dlength = 0;
5698 }
5699 Ilength++;
5700 } else if (this->cdna == ' ') {
5701 /* Deletion relative to genome */
5702 if (Elength > 0) {
5703 sprintf(token,"%d=",Elength);
5704 tokens = push_token(tokens,token);
5705 Elength = 0;
5706 } else if (Xlength > 0) {
5707 sprintf(token,"%dX",Xlength);
5708 tokens = push_token(tokens,token);
5709 Xlength = 0;
5710 } else if (Ilength > 0) {
5711 sprintf(token,"%dI",Ilength);
5712 tokens = push_token(tokens,token);
5713 Ilength = 0;
5714 }
5715 Dlength++;
5716 } else {
5717 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
5718 exit(9);
5719 }
5720
5721 } else {
5722 /* Count even if unknown base */
5723
5724 if (Ilength > 0) {
5725 sprintf(token,"%dI",Ilength);
5726 tokens = push_token(tokens,token);
5727 Ilength = 0;
5728 } else if (Dlength > 0) {
5729 sprintf(token,"%dD",Dlength);
5730 tokens = push_token(tokens,token);
5731 Dlength = 0;
5732 }
5733
5734 if (prev == NULL || prev->gapp || prev->comp == INDEL_COMP || prev->comp == SHORTGAP_COMP) {
5735 if (this->cdna == this->genome) {
5736 Elength++;
5737 } else {
5738 Xlength++;
5739 }
5740
5741 } else if (prev->cdna == prev->genome) {
5742 if (this->cdna == this->genome) {
5743 Elength++;
5744 } else {
5745 if (Elength > 0) {
5746 sprintf(token,"%d=",Elength);
5747 tokens = push_token(tokens,token);
5748 Elength = 0;
5749 }
5750 Xlength++;
5751 }
5752
5753 } else {
5754 if (this->cdna != this->genome) {
5755 Xlength++;
5756 } else {
5757 if (Xlength > 0) {
5758 sprintf(token,"%dX",Xlength);
5759 tokens = push_token(tokens,token);
5760 Xlength = 0;
5761 }
5762 Elength++;
5763 }
5764 }
5765 }
5766 }
5767
5768 if (this != NULL) {
5769 if (this->cdna != ' ') {
5770 last_querypos = this->querypos;
5771 }
5772 if (this->genome != ' ') {
5773 last_genomepos = this->genomepos;
5774 }
5775 }
5776 }
5777
5778 /* prev = this; */
5779 /* exon_queryend = last_querypos + 1; */
5780 /* exon_genomeend = last_genomepos + 1; */
5781
5782 if (Elength > 0) {
5783 sprintf(token,"%d=",Elength);
5784 tokens = push_token(tokens,token);
5785 } else if (Xlength > 0) {
5786 sprintf(token,"%dX",Xlength);
5787 tokens = push_token(tokens,token);
5788 } else if (Ilength > 0) {
5789 sprintf(token,"%dI",Ilength);
5790 tokens = push_token(tokens,token);
5791 } else if (Dlength > 0) {
5792 sprintf(token,"%dD",Dlength);
5793 tokens = push_token(tokens,token);
5794 }
5795
5796
5797 /* Terminal clipping */
5798 if (chimera_part == -1) {
5799 if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
5800 if (last_querypos < querylength_given - 1) {
5801 /* Clip to end */
5802 *hardclip_end = querylength_given - 1 - last_querypos;
5803 sprintf(token,"%dH",*hardclip_end);
5804 tokens = push_token(tokens,token);
5805 }
5806 } else {
5807 if (*hardclip_end > 0) {
5808 /* Clip to hard clip boundary */
5809 sprintf(token,"%dH",*hardclip_end);
5810 tokens = push_token(tokens,token);
5811 }
5812 }
5813 } else {
5814 if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
5815 sprintf(token,"%dS",querylength_given - 1 - (*hardclip_end) - last_querypos);
5816 tokens = push_token(tokens,token);
5817 }
5818 if (*hardclip_end > 0) {
5819 sprintf(token,"%dH",*hardclip_end);
5820 tokens = push_token(tokens,token);
5821 }
5822 }
5823
5824 return clean_cigar(tokens,watsonp);
5825 }
5826
5827
5828 List_T
Pair_compute_cigar(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int chimera_part)5829 Pair_compute_cigar (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
5830 bool watsonp, int chimera_part) {
5831 if (cigar_extended_p == true) {
5832 return compute_cigar_extended(&(*intronp),&(*hardclip_start),&(*hardclip_end),pairs,npairs,querylength_given,
5833 watsonp,chimera_part);
5834 } else {
5835 return compute_cigar_standard(&(*intronp),&(*hardclip_start),&(*hardclip_end),pairs,npairs,querylength_given,
5836 watsonp,chimera_part);
5837 }
5838 }
5839
5840
5841 /* Derived from print_gff3_cdna_match */
5842 /* Assumes pairarray has been hard clipped already */
5843 static void
print_sam_line(Filestring_T fp,char * abbrev,char * acc1,char * acc2,char * chrstring,bool watsonp,int sensedir,List_T cigar_tokens,List_T md_tokens,int nmismatches_refdiff,int nmismatches_bothdiff,int nindels,bool intronp,char * queryseq_ptr,char * quality_string,int hardclip_start,int hardclip_end,int querylength,Chimera_T chimera,int quality_shift,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,unsigned int flag,Univ_IIT_T chromosome_iit,Chrpos_T chrpos,Chrpos_T chrlength,int mapq_score,char * sam_read_group_id)5844 print_sam_line (Filestring_T fp, char *abbrev, char *acc1, char *acc2, char *chrstring,
5845 bool watsonp, int sensedir, List_T cigar_tokens, List_T md_tokens,
5846 int nmismatches_refdiff, int nmismatches_bothdiff, int nindels,
5847 bool intronp, char *queryseq_ptr, char *quality_string,
5848 int hardclip_start, int hardclip_end,
5849 int querylength, Chimera_T chimera, int quality_shift,
5850 int pathnum, int npaths_primary, int npaths_altloc, int absmq_score, int second_absmq, unsigned int flag,
5851 Univ_IIT_T chromosome_iit, Chrpos_T chrpos, Chrpos_T chrlength,
5852 int mapq_score, char *sam_read_group_id) {
5853
5854 /* Should already be checked when Stage3_T or Stage3end_T object was created */
5855 if (cigar_action == CIGAR_ACTION_IGNORE) {
5856 /* Don't check */
5857 } else if (Pair_cigar_length(cigar_tokens) + hardclip_start + hardclip_end == querylength) {
5858 /* Okay */
5859 } else if (cigar_action == CIGAR_ACTION_WARNING) {
5860 fprintf(stderr,"Warning: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
5861 acc1,Pair_cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
5862 } else if (cigar_action == CIGAR_ACTION_NOPRINT) {
5863 fprintf(stderr,"Warning: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
5864 acc1,Pair_cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
5865 return;
5866 } else {
5867 /* CIGAR_ACTION_ABORT */
5868 fprintf(stderr,"Error: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
5869 acc1,Pair_cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
5870 abort();
5871 }
5872
5873 /* 1. QNAME or Accession */
5874 if (acc2 == NULL) {
5875 FPRINTF(fp,"%s\t",acc1);
5876 } else {
5877 FPRINTF(fp,"%s,%s\t",acc1,acc2);
5878 }
5879
5880 /* 2. Flags */
5881 FPRINTF(fp,"%u\t",flag);
5882
5883 /* 3. RNAME or Chrstring */
5884 /* 4. POS or Chrlow */
5885 /* Taken from GMAP part of SAM_chromosomal_pos */
5886 if (chrpos > chrlength) {
5887 FPRINTF(fp,"%s\t%u\t",chrstring,chrpos - chrlength /*+ 1*/);
5888 } else {
5889 FPRINTF(fp,"%s\t%u\t",chrstring,chrpos /*+ 1*/);
5890 }
5891
5892 /* 5. MAPQ or Mapping quality */
5893 FPRINTF(fp,"%d\t",mapq_score);
5894
5895 /* 6. CIGAR */
5896 Pair_print_tokens(fp,cigar_tokens);
5897
5898 /* 7. MRNM: Mate chr */
5899 /* 8. MPOS: Mate chrpos */
5900 FPRINTF(fp,"\t*\t0");
5901
5902 /* 9. ISIZE: Insert size */
5903 FPRINTF(fp,"\t0");
5904
5905 /* 10. SEQ: queryseq and 11. QUAL: quality_scores */
5906 FPRINTF(fp,"\t");
5907 if (watsonp == true) {
5908 print_chopped(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
5909 FPRINTF(fp,"\t");
5910 print_quality(fp,quality_string,querylength,hardclip_start,hardclip_end,
5911 quality_shift);
5912 } else {
5913 print_chopped_revcomp(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
5914 FPRINTF(fp,"\t");
5915 print_quality_revcomp(fp,quality_string,querylength,hardclip_start,hardclip_end,
5916 quality_shift);
5917 }
5918
5919 /* 12. TAGS: RG */
5920 if (sam_read_group_id != NULL) {
5921 FPRINTF(fp,"\tRG:Z:%s",sam_read_group_id);
5922 }
5923
5924 /* 12. TAGS: XH and XI */
5925 if (hardclip_start > 0 || hardclip_end > 0) {
5926 FPRINTF(fp,"\tXH:Z:");
5927 if (watsonp == true) {
5928 print_chopped_end(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
5929 } else {
5930 print_chopped_end_revcomp(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
5931 }
5932
5933 if (quality_string != NULL) {
5934 FPRINTF(fp,"\tXI:Z:");
5935 if (watsonp == true) {
5936 print_chopped_end_quality(fp,quality_string,querylength,hardclip_start,hardclip_end);
5937 } else {
5938 print_chopped_end_quality_reverse(fp,quality_string,querylength,hardclip_start,hardclip_end);
5939 }
5940 }
5941 }
5942
5943 /* 12. TAGS: MD string */
5944 FPRINTF(fp,"\tMD:Z:");
5945 Pair_print_tokens(fp,md_tokens);
5946
5947 /* 12. TAGS: NH */
5948 FPRINTF(fp,"\tNH:i:%d",npaths_primary + npaths_altloc);
5949
5950 /* 12. TAGS: HI */
5951 FPRINTF(fp,"\tHI:i:%d",pathnum);
5952
5953 /* 12. TAGS: NM */
5954 FPRINTF(fp,"\tNM:i:%d",nmismatches_refdiff + nindels);
5955
5956 if (snps_p) {
5957 /* 12. TAGS: XW and XV */
5958 FPRINTF(fp,"\tXW:i:%d",nmismatches_bothdiff);
5959 FPRINTF(fp,"\tXV:i:%d",nmismatches_refdiff - nmismatches_bothdiff);
5960 }
5961
5962
5963 /* 12. TAGS: SM */
5964 FPRINTF(fp,"\tSM:i:%d",40);
5965
5966 /* 12. TAGS: XQ */
5967 FPRINTF(fp,"\tXQ:i:%d",absmq_score);
5968
5969 /* 12. TAGS: X2 */
5970 FPRINTF(fp,"\tX2:i:%d",second_absmq);
5971
5972 /* 12. TAGS: XO */
5973 FPRINTF(fp,"\tXO:Z:%s",abbrev);
5974
5975 /* 12. TAGS: XS */
5976 if (novelsplicingp == false && splicesites_iit == NULL) {
5977 /* Do not print XS field */
5978
5979 } else if (sensedir == SENSE_FORWARD) {
5980 if (watsonp == true) {
5981 FPRINTF(fp,"\tXS:A:+");
5982 } else {
5983 FPRINTF(fp,"\tXS:A:-");
5984 }
5985
5986 } else if (sensedir == SENSE_ANTI) {
5987 if (watsonp == true) {
5988 FPRINTF(fp,"\tXS:A:-");
5989 } else {
5990 FPRINTF(fp,"\tXS:A:+");
5991 }
5992
5993 } else if (intronp == false) {
5994 /* Skip. No intron in this end and mate is not revealing. */
5995
5996 #if 0
5997 } else if (force_xs_direction_p == true) {
5998 /* Don't print XS field for SENSE_NULL */
5999 /* Could not determine sense, so just report arbitrarily as + */
6000 /* This option provided for users of Cufflinks, which cannot handle XS:A:? */
6001 FPRINTF(fp,"\tXS:A:+");
6002
6003 } else {
6004 /* Non-canonical. Don't report. */
6005 FPRINTF(fp,"\tXS:A:?");
6006 #endif
6007 }
6008
6009 /* 12. TAGS: XT */
6010 if (chimera != NULL) {
6011 FPRINTF(fp,"\tXT:Z:");
6012 Chimera_print_sam_tag(fp,chimera,chromosome_iit);
6013 }
6014
6015 FPRINTF(fp,"\n");
6016
6017 return;
6018 }
6019
6020
6021 typedef enum {IN_MATCHES, IN_MISMATCHES, IN_DELETION} MD_state_T;
6022
6023 static List_T
compute_md_string(int * nmismatches_refdiff,int * nmismatches_bothdiff,int * nindels,struct T * pairs,int npairs,bool watsonp,List_T cigar_tokens)6024 compute_md_string (int *nmismatches_refdiff, int *nmismatches_bothdiff, int *nindels,
6025 struct T *pairs, int npairs, bool watsonp, List_T cigar_tokens) {
6026 List_T md_tokens = NULL, p;
6027 char *cigar_token, token[11], *first_token, type;
6028 T this;
6029 int nmatches = 0, length;
6030 MD_state_T state = IN_MISMATCHES;
6031 int i, k = 0;
6032
6033 *nmismatches_refdiff = *nmismatches_bothdiff = *nindels = 0;
6034
6035 debug4(Pair_dump_array(pairs,npairs,true));
6036 debug4(printf("watsonp %d\n",watsonp));
6037
6038 if (watsonp == true) {
6039 for (p = cigar_tokens; p != NULL; p = List_next(p)) {
6040 cigar_token = (char *) List_head(p);
6041 debug4(printf("token is %s\n",cigar_token));
6042 type = cigar_token[strlen(cigar_token)-1];
6043 length = atoi(cigar_token);
6044
6045 if (type == 'H') {
6046 /* k += length; */
6047
6048 } else if (type == 'S') {
6049 /* k += length; */
6050
6051 } else if (type == 'M' || type == 'X' || type == '=') {
6052 for (i = 0; i < length; i++, k++) {
6053 this = &(pairs[k]);
6054 debug4(printf("M %d/%d comp %c\n",i,length,this->comp));
6055 if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
6056 nmatches++;
6057 state = IN_MATCHES;
6058
6059 } else if (this->comp == MISMATCH_COMP) {
6060 if (state == IN_MATCHES) {
6061 sprintf(token,"%d",nmatches);
6062 md_tokens = push_token(md_tokens,token);
6063 nmatches = 0;
6064 } else if (state == IN_DELETION) {
6065 md_tokens = push_token(md_tokens,"0");
6066 }
6067 state = IN_MISMATCHES;
6068
6069 *nmismatches_refdiff += 1;
6070 if (md_lowercase_variant_p && this->cdna == this->genomealt) {
6071 /* A mismatch against the reference only => alternate variant */
6072 sprintf(token,"%c",tolower(this->genome));
6073 } else {
6074 /* A true mismatch against both variants */
6075 *nmismatches_bothdiff += 1;
6076 sprintf(token,"%c",this->genome);
6077 }
6078 md_tokens = push_token(md_tokens,token);
6079
6080 } else {
6081 fprintf(stderr,"Unexpected comp '%c'\n",this->comp);
6082 abort();
6083 }
6084 }
6085
6086 } else if (type == 'I') {
6087 while (k < npairs && pairs[k].comp == INDEL_COMP && pairs[k].genome == ' ') {
6088 *nindels += 1;
6089 k++;
6090 }
6091 state = IN_MATCHES;
6092
6093 } else if (type == 'N') {
6094 while (k < npairs && pairs[k].gapp == true) {
6095 k++;
6096 }
6097
6098 } else if (type == 'D') {
6099 if (state == IN_MATCHES) {
6100 if (nmatches > 0) {
6101 sprintf(token,"%d",nmatches);
6102 md_tokens = push_token(md_tokens,token);
6103 nmatches = 0;
6104 }
6105 }
6106
6107 if (state != IN_DELETION) {
6108 md_tokens = push_token(md_tokens,"^");
6109 }
6110 for (i = 0; i < length; i++, k++) {
6111 this = &(pairs[k]);
6112 sprintf(token,"%c",this->genome);
6113 md_tokens = push_token(md_tokens,token);
6114 *nindels += 1;
6115 }
6116
6117 state = IN_DELETION;
6118
6119 } else {
6120 fprintf(stderr,"Don't recognize type %c\n",type);
6121 abort();
6122 }
6123 }
6124
6125 if (nmatches > 0) {
6126 sprintf(token,"%d",nmatches);
6127 md_tokens = push_token(md_tokens,token);
6128 }
6129
6130 md_tokens = List_reverse(md_tokens);
6131
6132 } else {
6133 cigar_tokens = List_reverse(cigar_tokens);
6134 for (p = cigar_tokens; p != NULL; p = List_next(p)) {
6135 cigar_token = (char *) List_head(p);
6136 debug4(printf("token is %s\n",cigar_token));
6137 type = cigar_token[strlen(cigar_token)-1];
6138 length = atoi(cigar_token);
6139
6140 if (type == 'H') {
6141 /* k += length; */
6142
6143 } else if (type == 'S') {
6144 /* k += length; */
6145
6146 } else if (type == 'M' || type == 'X' || type == '=') {
6147 if (state == IN_DELETION) {
6148 md_tokens = push_token(md_tokens,"^");
6149 }
6150
6151 for (i = 0; i < length; i++, k++) {
6152 this = &(pairs[k]);
6153 debug4(printf("M %d/%d comp %c\n",i,length,this->comp));
6154 if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
6155 nmatches++;
6156 state = IN_MATCHES;
6157
6158 } else if (this->comp == MISMATCH_COMP) {
6159 if (state == IN_MATCHES) {
6160 sprintf(token,"%d",nmatches);
6161 md_tokens = push_token(md_tokens,token);
6162 nmatches = 0;
6163 }
6164 state = IN_MISMATCHES;
6165
6166 *nmismatches_refdiff += 1;
6167
6168 if (md_lowercase_variant_p && this->cdna == this->genomealt) {
6169 /* A mismatch against the reference only => alternate variant */
6170 sprintf(token,"%c",tolower(complCode[(int) this->genome]));
6171 } else {
6172 *nmismatches_bothdiff += 1;
6173 sprintf(token,"%c",complCode[(int) this->genome]);
6174 }
6175 md_tokens = push_token(md_tokens,token);
6176
6177
6178 } else {
6179 fprintf(stderr,"Unexpected comp '%c'\n",this->comp);
6180 abort();
6181 }
6182 }
6183
6184 } else if (type == 'I') {
6185 if (state == IN_DELETION) {
6186 md_tokens = push_token(md_tokens,"^");
6187 }
6188
6189 while (k < npairs && pairs[k].comp == INDEL_COMP && pairs[k].genome == ' ') {
6190 *nindels += 1;
6191 k++;
6192 }
6193 state = IN_MATCHES;
6194
6195 } else if (type == 'N') {
6196 #if 0
6197 /* Ignore deletion adjacent to intron, to avoid double ^^ */
6198 if (state == IN_DELETION) {
6199 md_tokens = push_token(md_tokens,"^");
6200 }
6201 #endif
6202
6203 while (k < npairs && pairs[k].gapp == true) {
6204 k++;
6205 }
6206
6207 } else if (type == 'D') {
6208 if (state == IN_MATCHES) {
6209 if (nmatches > 0) {
6210 sprintf(token,"%d",nmatches);
6211 md_tokens = push_token(md_tokens,token);
6212 nmatches = 0;
6213 }
6214 } else if (state == IN_MISMATCHES) {
6215 md_tokens = push_token(md_tokens,"0");
6216 }
6217
6218 for (i = 0; i < length; i++, k++) {
6219 this = &(pairs[k]);
6220 sprintf(token,"%c",complCode[(int) this->genome]);
6221 md_tokens = push_token(md_tokens,token);
6222 *nindels += 1;
6223 }
6224 state = IN_DELETION;
6225
6226 } else {
6227 fprintf(stderr,"Don't recognize type %c\n",type);
6228 abort();
6229 }
6230 }
6231
6232 if (nmatches > 0) {
6233 sprintf(token,"%d",nmatches);
6234 md_tokens = push_token(md_tokens,token);
6235 }
6236
6237 /* Restore cigar_tokens */
6238 cigar_tokens = List_reverse(cigar_tokens);
6239 }
6240
6241 assert(k == npairs);
6242
6243 /* Insert initial 0 token if necessary */
6244 if (md_tokens != NULL) {
6245 first_token = (char *) List_head(md_tokens);
6246 if (!isdigit(first_token[0])) {
6247 md_tokens = push_token(md_tokens,"0");
6248 }
6249 }
6250
6251 return md_tokens;
6252 }
6253
6254
6255 static struct T *
hardclip_pairarray(int * clipped_npairs,int hardclip_start,int hardclip_end,struct T * pairs,int npairs,int querylength)6256 hardclip_pairarray (int *clipped_npairs, int hardclip_start, int hardclip_end,
6257 struct T *pairs, int npairs, int querylength) {
6258 struct T *clipped_pairs, *ptr;
6259 int i, starti;
6260
6261 debug10(printf("Entered hardclip_pairarray with hardclip_start %d, hardclip_end %d, querylength %d\n",
6262 hardclip_start,hardclip_end,querylength));
6263 debug10(Simplepair_dump_array(pairs,npairs,true));
6264 debug10(printf("Starting with %d pairs\n",npairs));
6265
6266 i = 0;
6267 ptr = pairs;
6268 while (i < npairs && ptr->querypos < hardclip_start) {
6269 i++;
6270 ptr++;
6271 }
6272 while (i < npairs && (ptr->gapp == true || ptr->cdna == ' ' || ptr->genome == ' ')) {
6273 i++;
6274 ptr++;
6275 }
6276
6277 if (i >= npairs) {
6278 /* hardclip_start passes right end of read, so invalid */
6279 debug10(printf("i = %d, so passed end of read\n",i));
6280 hardclip_start = 0;
6281 } else if (hardclip_start > 0) {
6282 hardclip_start = ptr->querypos;
6283 }
6284
6285 starti = i;
6286 debug10(printf("starti is %d\n",starti));
6287
6288 clipped_pairs = ptr;
6289
6290 while (i < npairs && ptr->querypos < querylength - hardclip_end) {
6291 i++;
6292 ptr++;
6293 }
6294
6295 i--;
6296 ptr--;
6297 while (i >= starti && (ptr->gapp == true || ptr->cdna == ' ' || ptr->genome == ' ')) {
6298 i--;
6299 ptr--;
6300 }
6301
6302 if (i < 0) {
6303 /* hardclip_end passes left end of read, so invalid */
6304 debug10(printf("i = %d, so passed left end of read\n",i));
6305 hardclip_end = 0;
6306 } else if (hardclip_end > 0) {
6307 hardclip_end = querylength - 1 - ptr->querypos;
6308 }
6309
6310 if (hardclip_start == 0 && hardclip_end == 0) {
6311 debug10(printf("Unable to hard clip\n"));
6312 *clipped_npairs = npairs;
6313 clipped_pairs = pairs;
6314 } else {
6315 *clipped_npairs = i - starti + 1;
6316 }
6317
6318 debug10(printf("Ending with %d pairs\n",*clipped_npairs));
6319 debug10(printf("Exiting hardclip_pairarray with hardclip_start %d, hardclip_end %d\n",
6320 hardclip_start,hardclip_end));
6321
6322 return clipped_pairs;
6323 }
6324
6325
6326 /* Called only for GMAP */
6327 void
Pair_print_sam(Filestring_T fp,char * abbrev,struct T * pairarray,int npairs,char * acc1,char * acc2,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,char * queryseq_ptr,char * quality_string,int hardclip_low,int hardclip_high,int querylength_given,bool watsonp,int sensedir,int chimera_part,Chimera_T chimera,int quality_shift,bool first_read_p,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,Chrpos_T chrpos,Chrpos_T chrlength,int mapq_score,bool sam_paired_p,char * sam_read_group_id)6328 Pair_print_sam (Filestring_T fp, char *abbrev, struct T *pairarray, int npairs,
6329 char *acc1, char *acc2, Chrnum_T chrnum, Univ_IIT_T chromosome_iit, Sequence_T usersegment,
6330 char *queryseq_ptr, char *quality_string,
6331 int hardclip_low, int hardclip_high, int querylength_given,
6332 bool watsonp, int sensedir, int chimera_part, Chimera_T chimera,
6333 int quality_shift, bool first_read_p, int pathnum, int npaths_primary, int npaths_altloc,
6334 int absmq_score, int second_absmq, Chrpos_T chrpos, Chrpos_T chrlength,
6335 int mapq_score, bool sam_paired_p, char *sam_read_group_id) {
6336 char *chrstring = NULL;
6337 unsigned int flag;
6338
6339 List_T cigar_tokens, md_tokens = NULL;
6340 int nmismatches_refdiff, nmismatches_bothdiff, nindels;
6341 bool intronp;
6342 int hardclip_start, hardclip_end;
6343 /* int hardclip_start_zero = 0, hardclip_end_zero = 0; */
6344 struct T *clipped_pairarray;
6345 int clipped_npairs;
6346 bool cigar_tokens_alloc;
6347
6348
6349 if (chrnum == 0) {
6350 chrstring = Sequence_accession(usersegment);
6351 } else {
6352 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
6353 }
6354
6355 flag = compute_sam_flag_nomate(npaths_primary + npaths_altloc,first_read_p,watsonp,sam_paired_p);
6356
6357 debug4(printf("Entered SAM_print_pairs with watsonp %d, first_read_p %d, hardclip_low %d, and hardclip_high %d\n",
6358 watsonp,first_read_p,hardclip_low,hardclip_high));
6359
6360 if (watsonp == true) {
6361 hardclip_start = hardclip_low;
6362 hardclip_end = hardclip_high;
6363 } else {
6364 hardclip_start = hardclip_high;
6365 hardclip_end = hardclip_low;
6366 }
6367 debug4(printf("hardclip_start %d, hardclip_end %d\n",hardclip_start,hardclip_end));
6368
6369
6370 clipped_pairarray = hardclip_pairarray(&clipped_npairs,hardclip_start,hardclip_end,
6371 pairarray,npairs,querylength_given);
6372 cigar_tokens = Pair_compute_cigar(&intronp,&hardclip_start,&hardclip_end,clipped_pairarray,clipped_npairs,querylength_given,
6373 watsonp,chimera_part);
6374 cigar_tokens_alloc = true;
6375
6376
6377 /* Cigar updates hardclip5 and hardclip3 for chimeras */
6378 md_tokens = compute_md_string(&nmismatches_refdiff,&nmismatches_bothdiff,&nindels,
6379 clipped_pairarray,clipped_npairs,watsonp,cigar_tokens);
6380
6381 #if 0
6382 min_evalue = Pair_min_evalue(clipped_pairarray,clipped_npairs);
6383 #endif
6384
6385 print_sam_line(fp,abbrev,acc1,acc2,chrstring,
6386 watsonp,sensedir,cigar_tokens,md_tokens,
6387 nmismatches_refdiff,nmismatches_bothdiff,nindels,
6388 intronp,queryseq_ptr,quality_string,hardclip_start,hardclip_end,
6389 querylength_given,chimera,quality_shift,pathnum,npaths_primary,npaths_altloc,
6390 absmq_score,second_absmq,flag,chromosome_iit,chrpos,chrlength,
6391 mapq_score,sam_read_group_id);
6392
6393 /* Print procedures free the character strings */
6394 Pair_tokens_free(&md_tokens);
6395 if (cigar_tokens_alloc == true) {
6396 Pair_tokens_free(&cigar_tokens);
6397 }
6398
6399 if (chrnum != 0) {
6400 FREE(chrstring);
6401 }
6402
6403 return;
6404 }
6405
6406
6407
#if 0
/* Copied from samprint.c */
/* Disabled.  Reports whether the list of CIGAR operation types contains
   at least one 'M'.  An earlier idea, also disabled, was to return false
   when an 'S' and an 'H' were adjacent; that needed a last_type tracker
   which was never enabled. */
static bool
check_cigar_types (Intlist_T cigar_types) {
  Intlist_T rest = cigar_types;
  bool found_M_p = false;

  while (rest != NULL) {
    if (Intlist_head(rest) == 'M') {
      found_M_p = true;
    }
    rest = Intlist_next(rest);
  }

  return found_M_p;
}
#endif
6435
6436
6437 #if 0
6438 bool
6439 Pair_check_cigar (struct T *pairs, int npairs, int querylength_given,
6440 int clipdir, int hardclip5, int hardclip3,
6441 bool watsonp, bool first_read_p, bool circularp) {
6442 bool result;
6443 Intlist_T cigar_types = NULL;
6444 int hardclip_low, hardclip_high;
6445 int Mlength = 0, Ilength = 0, Dlength = 0;
6446 bool in_exon = false, deletionp;
6447 struct T *ptr, *prev, *this = NULL;
6448 int exon_queryend;
6449 int query_gap;
6450 int last_querypos = -1;
6451 int i;
6452
6453 if (circularp == true) {
6454 if (watsonp == true) {
6455 hardclip_low = hardclip5;
6456 hardclip_high = hardclip3;
6457 } else {
6458 hardclip_low = hardclip3;
6459 hardclip_high = hardclip5;
6460 }
6461 } else {
6462 /* Incoming hardclip5 and hardclip3 are due to overlaps, not chimera */
6463 if (clipdir >= 0) {
6464 if (watsonp == true) {
6465 if (first_read_p == true) {
6466 hardclip_high = hardclip5;
6467 hardclip_low = 0;
6468 } else {
6469 hardclip_high = 0;
6470 hardclip_low = hardclip3;
6471 }
6472 } else {
6473 if (first_read_p == true) {
6474 hardclip_low = hardclip5;
6475 hardclip_high = 0;
6476 } else {
6477 hardclip_low = 0;
6478 hardclip_high = hardclip3;
6479 }
6480 }
6481 } else {
6482 if (watsonp == true) {
6483 if (first_read_p == true) {
6484 hardclip_low = hardclip5;
6485 hardclip_high = 0;
6486 } else {
6487 hardclip_low = 0;
6488 hardclip_high = hardclip3;
6489 }
6490 } else {
6491 if (first_read_p == true) {
6492 hardclip_high = hardclip5;
6493 hardclip_low = 0;
6494 } else {
6495 hardclip_high = 0;
6496 hardclip_low = hardclip3;
6497 }
6498 }
6499 }
6500 }
6501
6502
6503 ptr = pairs;
6504
6505 #if 0
6506 /* This procedure is used to check circular alignments */
6507 if (chimera_part == +1) {
6508 if (ptr->querypos > hardclip_low) {
6509 if (ptr->querypos > 0) {
6510 /* Clip to beginning */
6511 hardclip_low = ptr->querypos;
6512 cigar_types = Intlist_push(cigar_types,'H');
6513 }
6514 } else {
6515 if (hardclip_low > 0) {
6516 /* Clip to hard clip boundary */
6517 cigar_types = Intlist_push(cigar_types,'H');
6518 }
6519 }
6520 } else {
6521 #endif
6522 if (hardclip_low > 0) {
6523 cigar_types = Intlist_push(cigar_types,'H');
6524 }
6525 if (ptr->querypos > hardclip_low) {
6526 cigar_types = Intlist_push(cigar_types,'S');
6527 }
6528 #if 0
6529 }
6530 #endif
6531
6532 this = (T) NULL;
6533 for (i = 0; i < npairs; i++) {
6534 prev = this;
6535 this = ptr++;
6536
6537 if (this->gapp) {
6538 if (in_exon == true) {
6539 exon_queryend = last_querypos + 1;
6540 #if 0
6541 exon_genomeend = last_genomepos + 1;
6542 if (watsonp) {
6543 intron_start = exon_genomeend + 1;
6544 } else {
6545 intron_start = exon_genomeend - 1;
6546 }
6547 #endif
6548
6549 if (Mlength > 0) {
6550 cigar_types = Intlist_push(cigar_types,'M');
6551 } else if (Ilength > 0) {
6552 cigar_types = Intlist_push(cigar_types,'I');
6553 } else if (Dlength > 0) {
6554 cigar_types = Intlist_push(cigar_types,'D');
6555 }
6556
6557 Mlength = Ilength = Dlength = 0;
6558
6559 in_exon = false;
6560 }
6561
6562 } else if (this->comp == INTRONGAP_COMP) {
6563 /* Do nothing */
6564
6565 } else {
6566 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
6567 SHORTGAP_COMP, or MISMATCH_COMP */
6568 if (in_exon == false) {
6569 #if 0
6570 /* Needed only for full token */
6571 /* exon_querystart = this->querypos + 1; */
6572 exon_genomestart = this->genomepos + 1;
6573 if (watsonp) {
6574 intron_end = exon_genomestart - 1;
6575 } else {
6576 intron_end = exon_genomestart + 1;
6577 }
6578 #endif
6579
6580 if (prev != NULL) {
6581 /* Gap */
6582 /* genome_gap = intron_end - intron_start + 1; */
6583
6584 deletionp = false;
6585 #ifdef CONVERT_INTRONS_TO_DELETIONS
6586 if (cdna_direction > 0) {
6587 if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
6588 prev->comp == FWD_GCAG_INTRON_COMP ||
6589 prev->comp == FWD_ATAC_INTRON_COMP) {
6590 cigar_types = Intlist_push(cigar_types,'N');
6591 /* *intronp = true; */
6592 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
6593 cigar_types = Intlist_push(cigar_types,'N');
6594 /* *intronp = true; */
6595 } else {
6596 cigar_types = Intlist_push(cigar_types,'D');
6597 deletionp = true;
6598 }
6599 } else if (cdna_direction < 0) {
6600 if (prev->comp == REV_CANONICAL_INTRON_COMP ||
6601 prev->comp == REV_GCAG_INTRON_COMP ||
6602 prev->comp == REV_ATAC_INTRON_COMP) {
6603 cigar_types = Intlist_push(cigar_types,'N');
6604 /* *intronp = true; */
6605 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
6606 cigar_types = Intlist_push(cigar_types,'N');
6607 /* *intronp = true; */
6608 } else {
6609 cigar_types = Intlist_push(cigar_types,'D');
6610 deletionp = true;
6611 }
6612 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
6613 cigar_types = Intlist_push(cigar_types,'N');
6614 /* *intronp = true; */
6615 } else {
6616 cigar_types = Intlist_push(cigar_types,'D');
6617 deletionp = true;
6618 }
6619 #else
6620 cigar_types = Intlist_push(cigar_types,'N');
6621 /* *intronp = true; */
6622 #endif
6623
6624 /* Check for dual gap. Doesn't work for hard clipping. */
6625 assert(exon_queryend >= 0);
6626
6627 query_gap = this->querypos - exon_queryend;
6628 assert(query_gap >= 0);
6629 if (query_gap > 0) {
6630 if (deletionp == true && sam_insert_0M_p == true) {
6631 /* Put zero matches between deletion and insertion, since some programs will complain */
6632 cigar_types = Intlist_push(cigar_types,'M');
6633 }
6634
6635 cigar_types = Intlist_push(cigar_types,'I');
6636 }
6637 }
6638
6639 in_exon = true;
6640 }
6641
6642 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
6643 /* Gap in upper or lower sequence */
6644 if (this->genome == ' ') {
6645 /* Insertion relative to genome */
6646 if (Mlength > 0) {
6647 cigar_types = Intlist_push(cigar_types,'M');
6648 Mlength = 0;
6649 } else if (Dlength > 0) {
6650 /* unlikely */
6651 cigar_types = Intlist_push(cigar_types,'D');
6652 Dlength = 0;
6653 }
6654 Ilength++;
6655 } else if (this->cdna == ' ') {
6656 /* Deletion relative to genome */
6657 if (Mlength > 0) {
6658 cigar_types = Intlist_push(cigar_types,'M');
6659 Mlength = 0;
6660 } else if (Ilength > 0) {
6661 cigar_types = Intlist_push(cigar_types,'I');
6662 Ilength = 0;
6663 }
6664 Dlength++;
6665 } else {
6666 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
6667 exit(9);
6668 }
6669
6670 } else {
6671 /* Count even if unknown base */
6672
6673 if (Ilength > 0) {
6674 cigar_types = Intlist_push(cigar_types,'I');
6675 Ilength = 0;
6676 } else if (Dlength > 0) {
6677 cigar_types = Intlist_push(cigar_types,'D');
6678 Dlength = 0;
6679 }
6680 Mlength++;
6681 }
6682 }
6683
6684 if (this != NULL) {
6685 if (this->cdna != ' ') {
6686 last_querypos = this->querypos;
6687 }
6688 #if 0
6689 if (this->genome != ' ') {
6690 last_genomepos = this->genomepos;
6691 }
6692 #endif
6693 }
6694 }
6695
6696 /* prev = this; */
6697 exon_queryend = last_querypos + 1;
6698 /* exon_genomeend = last_genomepos + 1; */
6699
6700 if (Mlength > 0) {
6701 cigar_types = Intlist_push(cigar_types,'M');
6702 } else if (Ilength > 0) {
6703 cigar_types = Intlist_push(cigar_types,'I');
6704 } else if (Dlength > 0) {
6705 cigar_types = Intlist_push(cigar_types,'D');
6706 }
6707
6708
6709 /* Terminal clipping */
6710 #if 0
6711 /* This procedure is used to check circular alignments */
6712 if (chimera_part == -1) {
6713 if (last_querypos < querylength_given - 1 - hardclip_high) {
6714 if (last_querypos < querylength_given - 1) {
6715 /* Clip to end */
6716 hardclip_high = querylength_given - 1 - last_querypos;
6717 cigar_types = Intlist_push(cigar_types,'H');
6718 }
6719 } else {
6720 if (hardclip_high > 0) {
6721 /* Clip to hard clip boundary */
6722 cigar_types = Intlist_push(cigar_types,'H');
6723 }
6724 }
6725 } else {
6726 #endif
6727 if (last_querypos < querylength_given - 1 - hardclip_high) {
6728 cigar_types = Intlist_push(cigar_types,'S');
6729 }
6730 if (hardclip_high > 0) {
6731 cigar_types = Intlist_push(cigar_types,'H');
6732 }
6733 #if 0
6734 }
6735 #endif
6736
6737 result = check_cigar_types(cigar_types);
6738
6739 Intlist_free(&cigar_types);
6740 return result;
6741 }
6742 #endif
6743
6744
6745 #if 0
6746 static void
6747 state_print (MD_state_T state) {
6748 switch (state) {
6749 case IN_MATCHES: printf("IN_MATCHES"); break;
6750 case IN_MISMATCHES: printf("IN_MISMATCHES"); break;
6751 case IN_DELETION: printf("IN_DELETION"); break;
6752 default: abort();
6753 }
6754 return;
6755 }
6756 #endif
6757
6758
6759 #if 0
/* Legacy MD-string tokenizer (this block is compiled out by the
   enclosing #if 0).

   Walks the npairs alignment pairs and builds a list of string tokens:
     - a decimal count for each run of matches (MATCH_COMP,
       DYNPROG_MATCH_COMP, AMBIGUOUS_COMP),
     - one genome base for each MISMATCH_COMP,
     - "^" plus one genome base per pair for deletions relative to the
       genome (cdna == ' '),
     - "0" separators where a mismatch and a deletion are adjacent.
   Insertions relative to the genome (genome == ' ') add nothing.

   *nmismatches is reset to 0 and incremented once per MISMATCH_COMP.

   For watsonp == true the list is reversed into forward order before
   returning.  Otherwise the tokens are deliberately left in the order
   they were pushed (reverse of traversal), which is why that branch
   emits "^" after the deleted bases and "0" before them.

   Finally, a leading "0" token is pushed if the first token of the
   result does not start with a digit. */
static List_T
compute_md_string_old (int *nmismatches, struct T *pairs, int npairs, bool watsonp) {
  List_T tokens = NULL;
  char token[11], *first_token;	/* token: scratch buffer for sprintf of one count or one base */
  int nmatches = 0;		/* Length of the current, not yet emitted, run of matches */
  struct T *ptr, *prev, *this = NULL; /* prev is assigned in the loops but never read */
  MD_state_T state = IN_MISMATCHES;
  int i;

  ptr = pairs;
  *nmismatches = 0;

  /* Ignore initial soft clipping */

  if (watsonp == true) {
    for (i = 0; i < npairs; i++) {
      prev = this;
      this = ptr++;

      if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
        /* Extend the current match run; the count is emitted lazily */
        nmatches++;
        state = IN_MATCHES;

      } else if (this->comp == MISMATCH_COMP) {
        *nmismatches += 1;
        if (state == IN_MATCHES) {
          /* Flush the pending match count before the mismatched base */
          if (nmatches > 0) {
            sprintf(token,"%d",nmatches);
            tokens = push_token(tokens,token);
            nmatches = 0;
          }

        } else if (state == IN_DELETION) {
          /* Separate the deleted bases from the mismatched base with a zero count */
          tokens = push_token(tokens,"0");
        }
        state = IN_MISMATCHES;

        /* watsonp is true in this branch, so the genome base is used as is */
        sprintf(token,"%c",watsonp ? this->genome : complCode[(int) this->genome]);
        tokens = push_token(tokens,token);

      } else if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
        if (this->genome == ' ') {
#if 0
          /* Insertion relative to genome.  Ignored in MD string (but not in cigar). */
          nmatches++;
          state = IN_MATCHES;
#endif

        } else if (this->cdna == ' ') {
          /* Deletion relative to genome */
          if (state == IN_MATCHES) {
            if (nmatches > 0) {
              sprintf(token,"%d",nmatches);
              tokens = push_token(tokens,token);
              nmatches = 0;
            }
            tokens = push_token(tokens,"^");

          } else if (state == IN_MISMATCHES) {
            tokens = push_token(tokens,"^");

          }
          /* When already IN_DELETION, no new "^" is pushed: the base joins the open deletion */
          state = IN_DELETION;

          sprintf(token,"%c",watsonp ? this->genome : complCode[(int) this->genome]);
          tokens = push_token(tokens,token);
        }

      } else {
        /* Ignore */
      }
    }

    /* Ignore terminal soft clipping */

    /* Flush the final run of matches */
    if (nmatches > 0) {
      sprintf(token,"%d",nmatches);
      tokens = push_token(tokens,token);
    }

    /* Put tokens in forward order */
    tokens = List_reverse(tokens);

  } else {

    /* Tokens are left in pushed order here, so markers that must precede
       a deletion in the final list are pushed after its bases */
    for (i = 0; i < npairs; i++) {
      prev = this;
      this = ptr++;

      if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
        if (state == IN_DELETION) {
          /* Close the deletion just traversed */
          tokens = push_token(tokens,"^");
        }
        nmatches++;
        state = IN_MATCHES;

      } else if (this->comp == MISMATCH_COMP) {
        *nmismatches += 1;
        if (state == IN_MATCHES) {
          if (nmatches > 0) {
            sprintf(token,"%d",nmatches);
            tokens = push_token(tokens,token);
            nmatches = 0;
          }

        } else if (state == IN_DELETION) {
          tokens = push_token(tokens,"^");
        }
        state = IN_MISMATCHES;

        /* watsonp is false in this branch, so the complemented genome base is used */
        sprintf(token,"%c",watsonp ? this->genome : complCode[(int) this->genome]);
        tokens = push_token(tokens,token);

      } else if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
        if (this->genome == ' ') {
#if 0
          /* Insertion relative to genome.  Ignored in MD string, but not in cigar string. */
          if (state == IN_DELETION) {
            tokens = push_token(tokens,"^");
          }
          nmatches++;
          state = IN_MATCHES;
#endif

        } else if (this->cdna == ' ') {
          /* Deletion relative to genome */
          if (state == IN_MATCHES) {
            if (nmatches > 0) {
              sprintf(token,"%d",nmatches);
              tokens = push_token(tokens,token);
              nmatches = 0;
            }

          } else if (state == IN_MISMATCHES) {
            tokens = push_token(tokens,"0");

          }
          state = IN_DELETION;

          sprintf(token,"%c",watsonp ? this->genome : complCode[(int) this->genome]);
          tokens = push_token(tokens,token);
        }

      } else {
        /* Ignore */
      }
    }

    /* Ignore terminal soft clipping */

    /* Flush the final run of matches */
    if (nmatches > 0) {
      sprintf(token,"%d",nmatches);
      tokens = push_token(tokens,token);
    }

    /* Keep tokens in reverse order */
  }


  /* Insert initial 0 token if necessary */
  if (tokens != NULL) {
    first_token = (char *) List_head(tokens);
    if (!isdigit(first_token[0])) {
      tokens = push_token(tokens,"0");
    }
  }

  return tokens;
}
6929 #endif
6930
6931
6932 Uintlist_T
Pair_exonbounds(struct T * pairs,int npairs)6933 Pair_exonbounds (struct T *pairs, int npairs) {
6934 Uintlist_T exonbounds = NULL;
6935 struct T *ptr, *this = NULL;
6936 bool in_exon = false;
6937 int i;
6938 Chrpos_T last_genomepos = (Chrpos_T) -1;
6939
6940 ptr = pairs;
6941 for (i = 0; i < npairs; i++) {
6942 /* prev = this; */
6943 this = ptr++;
6944
6945 if (this->gapp) {
6946 if (in_exon == true) {
6947 /* exon genomeend */
6948 exonbounds = Uintlist_push(exonbounds,/*chroffset +*/last_genomepos);
6949 in_exon = false;
6950 }
6951 } else if (this->comp == INTRONGAP_COMP) {
6952 /* Do nothing */
6953 } else {
6954 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
6955 SHORTGAP_COMP, or MISMATCH_COMP */
6956 if (in_exon == false) {
6957 /* exon genomestart */
6958 exonbounds = Uintlist_push(exonbounds,/*chroffset +*/this->genomepos);
6959 in_exon = true;
6960 }
6961 }
6962 if (this->genome != ' ') {
6963 last_genomepos = this->genomepos;
6964 }
6965 }
6966
6967 /* prev = this; */
6968 exonbounds = Uintlist_push(exonbounds,/*chroffset +*/last_genomepos);
6969
6970 return Uintlist_reverse(exonbounds);
6971 }
6972
6973
6974 static int
count_psl_blocks_nt(Intlist_T * blockSizes,Intlist_T * qStarts,Uintlist_T * tStarts,struct T * pairs_directional,int npairs,int querylength,bool watsonp)6975 count_psl_blocks_nt (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tStarts, struct T *pairs_directional,
6976 int npairs, int querylength, bool watsonp) {
6977 int nblocks = 0, i;
6978 int block_querystart, block_queryend;
6979 struct T *ptr = pairs_directional, *this = NULL;
6980 bool in_block = false;
6981 int last_querypos = -1;
6982 /* Chrpos_T last_genomepos = (Chrpos_T) -1; */
6983
6984 for (i = 0; i < npairs; i++) {
6985 /* prev = this; */
6986 this = ptr++;
6987
6988 if (this->gapp) {
6989 if (in_block == true) {
6990 nblocks++;
6991 block_queryend = last_querypos;
6992 debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
6993 /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
6994 if (block_queryend > block_querystart) {
6995 *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
6996 } else {
6997 *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
6998 }
6999 in_block = false;
7000 }
7001 } else if (this->comp == INTRONGAP_COMP) {
7002 /* Do nothing */
7003
7004 } else if (this->cdna == ' ' || this->genome == ' ') {
7005 if (in_block == true) {
7006 nblocks++;
7007 block_queryend = last_querypos;
7008 debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
7009 /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
7010 if (block_queryend > block_querystart) {
7011 *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
7012 } else {
7013 *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
7014 }
7015 in_block = false;
7016 }
7017
7018 } else {
7019 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
7020 or SHORTGAP_COMP */
7021 if (in_block == false) {
7022 block_querystart = this->querypos;
7023 if (watsonp == true) {
7024 debug2(FPRINTF(fp,"Pushing qstart: %d\n",block_querystart));
7025 *qStarts = Intlist_push(*qStarts,block_querystart);
7026 } else {
7027 debug2(FPRINTF(fp,"Pushing qstart: %d\n",querylength-block_querystart-1));
7028 *qStarts = Intlist_push(*qStarts,querylength-block_querystart-1);
7029 }
7030 *tStarts = Uintlist_push(*tStarts,this->genomepos);
7031 in_block = true;
7032 }
7033 }
7034
7035 if (this->cdna != ' ') {
7036 last_querypos = this->querypos;
7037 }
7038 #if 0
7039 if (this->genome != ' ') {
7040 last_genomepos = this->genomepos;
7041 }
7042 #endif
7043 }
7044
7045 if (in_block == true) {
7046 /* prev = this; */
7047 nblocks++;
7048 block_queryend = last_querypos;
7049 debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
7050 /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
7051 if (block_queryend > block_querystart) {
7052 *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
7053 } else {
7054 *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
7055 }
7056 }
7057
7058 *blockSizes = Intlist_reverse(*blockSizes);
7059 *qStarts = Intlist_reverse(*qStarts);
7060 *tStarts = Uintlist_reverse(*tStarts);
7061
7062 return nblocks;
7063 }
7064
7065
/* Protein counterpart of count_psl_blocks_nt: splits the alignment into
   PSL blocks delimited by gap pairs and records, per block,
     *blockSizes  number of pairs in the block whose aa_e is not ' '
     *qStarts     (first querypos + 2) / 3
     *tStarts     first genomepos (watsonp), or
                  chrlength - first genomepos - 1 (otherwise)
   Values are pushed onto whatever the three lists already contain, and
   the lists are then reversed in place.  Callers should therefore pass
   empty lists; calling this twice on the same lists duplicates the
   entries.  Returns the number of blocks found in this call.

   NOTE(review): the NOGAPSINBLOCK branch below refers to block_queryend
   and last_querypos, neither of which is declared in this function, so
   it does not look compilable as written if NOGAPSINBLOCK is defined. */
static int
count_psl_blocks_pro (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tStarts, struct T *pairs_directional,
                      int npairs, bool watsonp, Chrpos_T chrlength) {
  int nblocks = 0, i;
  int naminoacids = 0;		/* Amino acids counted in the block in progress */
  int block_querystart;
  struct T *ptr = pairs_directional, *this = NULL;
  bool in_block = false;
#ifdef NOGAPSINBLOCK
  struct T *prev;
#endif

  for (i = 0; i < npairs; i++) {
#ifdef NOGAPSINBLOCK
    prev = this;
#endif
    this = ptr++;

    if (this->gapp) {
      /* Gap closes the block in progress and resets the amino acid count */
      if (in_block == true) {
        nblocks++;
        *blockSizes = Intlist_push(*blockSizes,naminoacids);
        in_block = false;
        naminoacids = 0;
      }
    } else if (this->comp == INTRONGAP_COMP) {
      /* Do nothing */

#ifdef NOGAPSINBLOCK
    } else if (this->cdna == ' ' || this->genome == ' ') {
      /* Optional mode: indels also close a block */
      if (in_block == true) {
        nblocks++;
        block_queryend = last_querypos;
        *blockSizes = Intlist_push(*blockSizes,block_queryend/3-(block_querystart+2)/3+1);
        in_block = false;
      }
#endif

    } else {
      /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
         or SHORTGAP_COMP */
      if (this->aa_e != ' ') {
        naminoacids++;
      }
      if (in_block == false) {
        /* First pair of a new block: record its query and target starts */
        block_querystart = this->querypos;
        *qStarts = Intlist_push(*qStarts,(block_querystart+2)/3);
        if (watsonp == true) {
          *tStarts = Uintlist_push(*tStarts,this->genomepos);
        } else {
#if 0
          /* Should be this */
          *tStarts = Uintlist_push(*tStarts,this->genomepos);
#else
          /* But is actually this */
          *tStarts = Uintlist_push(*tStarts,chrlength - this->genomepos - 1);
#endif
        }
        in_block = true;
      }
    }
  }

  /* Close a block that runs to the end of the alignment */
  if (in_block == true) {
#ifdef NOGAPSINBLOCK
    prev = this;
#endif
    nblocks++;
    *blockSizes = Intlist_push(*blockSizes,naminoacids);
  }

  *blockSizes = Intlist_reverse(*blockSizes);
  *qStarts = Intlist_reverse(*qStarts);
  *tStarts = Uintlist_reverse(*tStarts);

  return nblocks;
}
7143
7144
7145 static void
compute_gap_lengths_int(int * nbreaks,int * length,Intlist_T blockSizes,Intlist_T Starts,int nblocks)7146 compute_gap_lengths_int (int *nbreaks, int *length, Intlist_T blockSizes, Intlist_T Starts, int nblocks) {
7147 int i;
7148 int start, end;
7149 /* Intlist_T p = blockSizes, q = Starts; */
7150
7151 debug2(FPRINTF(fp,"Entered compute_gap_lengths_int with nblocks = %d, and Starts having length %d\n",
7152 nblocks,Intlist_length(Starts)));
7153 *nbreaks = *length = 0;
7154 for (i = 0; i < nblocks - 1; i++) {
7155 if (i > 0) {
7156 start = Intlist_head(Starts);
7157 if (start - end > 0) {
7158 *nbreaks += 1;
7159 *length += (start - end);
7160 }
7161 debug2(FPRINTF(fp,"%d - %d = %d, gap = %d\n",start,end,start-end,*length));
7162 }
7163 end = Intlist_head(Starts) + Intlist_head(blockSizes);
7164 blockSizes = Intlist_next(blockSizes);
7165 Starts = Intlist_next(Starts);
7166 }
7167
7168 if (i > 0) {
7169 start = Intlist_head(Starts);
7170 if (start - end > 0) {
7171 *nbreaks += 1;
7172 *length += (start - end);
7173 }
7174 debug2(FPRINTF(fp,"%d - %d = %d, gap = %d\n",start,end,start-end,*length));
7175 }
7176
7177 return;
7178 }
7179
7180 static void
compute_gap_lengths_uint(int * nbreaks,int * length,Intlist_T blockSizes,Uintlist_T Starts,int nblocks)7181 compute_gap_lengths_uint (int *nbreaks, int *length, Intlist_T blockSizes, Uintlist_T Starts, int nblocks) {
7182 int i;
7183 int start, end;
7184 /*
7185 Intlist_T p = blockSizes;
7186 Uintlist_T q = Starts;
7187 */
7188
7189 *nbreaks = *length = 0;
7190 for (i = 0; i < nblocks - 1; i++) {
7191 if (i > 0) {
7192 start = Uintlist_head(Starts);
7193 if (start - end > 0) {
7194 *nbreaks += 1;
7195 *length += (start - end);
7196 }
7197 debug2(FPRINTF(fp,"%d - %d = %d, gap = %d\n",start,end,start-end,*length));
7198 }
7199 end = Uintlist_head(Starts) + Intlist_head(blockSizes);
7200 blockSizes = Intlist_next(blockSizes);
7201 Starts = Uintlist_next(Starts);
7202 }
7203
7204 if (i > 0) {
7205 start = Uintlist_head(Starts);
7206 if (start - end > 0) {
7207 *nbreaks += 1;
7208 *length += (start - end);
7209 }
7210 debug2(FPRINTF(fp,"%d - %d = %d, gap = %d\n",start,end,start-end,*length));
7211 }
7212
7213 return;
7214 }
7215
7216
7217
7218 static void
count_matches_pro(int * matches,int * mismatches,int * unknowns,struct T * pairs,int npairs)7219 count_matches_pro (int *matches, int *mismatches, int *unknowns,
7220 struct T *pairs, int npairs) {
7221 struct T *this = NULL;
7222 int i;
7223
7224 i = 0;
7225 while (i < npairs) {
7226 /* prev = this; */
7227 this = &(pairs[i++]);
7228
7229 if (this->gapp == false) {
7230 if (this->aa_g != ' ' && this->aa_e != ' ') {
7231 if (this->aa_g == this->aa_e) {
7232 *matches += 1;
7233 } else if (this->aa_e == 'X') {
7234 *unknowns += 1;
7235 } else {
7236 *mismatches += 1;
7237 }
7238 }
7239 }
7240 }
7241 return;
7242 }
7243
7244
7245
7246 void
Pair_print_pslformat_nt(Filestring_T fp,struct T * pairs,int npairs,T start,T end,Sequence_T queryseq,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,int matches,int unknowns,int mismatches,bool watsonp)7247 Pair_print_pslformat_nt (Filestring_T fp, struct T *pairs, int npairs, T start, T end,
7248 Sequence_T queryseq, Chrnum_T chrnum,
7249 Univ_IIT_T chromosome_iit, Sequence_T usersegment,
7250 int matches, int unknowns, int mismatches,
7251 bool watsonp) {
7252 Chrpos_T chrpos1, chrpos2;
7253 struct T *pairs_directional = NULL;
7254 Intlist_T blockSizes = NULL, qStarts = NULL, p;
7255 Uintlist_T tStarts = NULL, q;
7256 int nblocks;
7257 int qnbreaks, qlength, tnbreaks, tlength, querylength;
7258 char *chr;
7259
7260 #ifdef PMAP
7261 querylength = 3*Sequence_fulllength(queryseq);
7262 #else
7263 querylength = Sequence_fulllength(queryseq);
7264 #endif
7265
7266 if (watsonp == true) {
7267 pairs_directional = pairs;
7268 } else {
7269 pairs_directional = invert_and_revcomp_path(pairs,npairs);
7270 }
7271
7272 nblocks = count_psl_blocks_nt(&blockSizes,&qStarts,&tStarts,pairs_directional,npairs,
7273 querylength,watsonp);
7274 compute_gap_lengths_int(&qnbreaks,&qlength,blockSizes,qStarts,nblocks);
7275 compute_gap_lengths_uint(&tnbreaks,&tlength,blockSizes,tStarts,nblocks);
7276
7277 FPRINTF(fp,"%d\t%d\t%d\t%d\t",matches,mismatches,/*repeatmatches*/0,unknowns);
7278 FPRINTF(fp,"%d\t%d\t%d\t%d\t",qnbreaks,qlength,tnbreaks,tlength);
7279
7280 if (watsonp == true) {
7281 FPRINTF(fp,"+");
7282 } else {
7283 FPRINTF(fp,"-");
7284 }
7285 FPRINTF(fp,"\t%s\t%d",Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
7286
7287 FPRINTF(fp,"\t%d\t%d",start->querypos,end->querypos+1);
7288
7289 /* T name and T size */
7290 if (chrnum == 0) {
7291 FPRINTF(fp,"\t%s\t%u",Sequence_accession(usersegment),Sequence_fulllength(usersegment));
7292 } else {
7293 chr = Chrnum_to_string(chrnum,chromosome_iit);
7294 FPRINTF(fp,"\t%s\t%u",chr,Chrnum_length(chrnum,chromosome_iit));
7295 FREE(chr);
7296 }
7297
7298 /* T start and T end */
7299 chrpos1 = start->genomepos;
7300 chrpos2 = end->genomepos;
7301 if (watsonp) {
7302 FPRINTF(fp,"\t%u\t%u",chrpos1,chrpos2+1);
7303 } else {
7304 FPRINTF(fp,"\t%u\t%u",chrpos2,chrpos1+1);
7305 }
7306
7307 FPRINTF(fp,"\t%d",nblocks);
7308
7309 FPRINTF(fp,"\t");
7310 for (p = blockSizes; p != NULL; p = Intlist_next(p)) {
7311 FPRINTF(fp,"%d,",Intlist_head(p));
7312 }
7313
7314 FPRINTF(fp,"\t");
7315 for (p = qStarts; p != NULL; p = Intlist_next(p)) {
7316 FPRINTF(fp,"%d,",Intlist_head(p));
7317 }
7318
7319 FPRINTF(fp,"\t");
7320 for (q = tStarts; q != NULL; q = Uintlist_next(q)) {
7321 FPRINTF(fp,"%u,",Uintlist_head(q));
7322 }
7323
7324 Intlist_free(&blockSizes);
7325 Intlist_free(&qStarts);
7326 Uintlist_free(&tStarts);
7327
7328 if (watsonp == false) {
7329 FREE(pairs_directional);
7330 }
7331
7332 PUTC('\n',fp);
7333 return;
7334 }
7335
7336 void
Pair_print_pslformat_pro(Filestring_T fp,struct T * pairs,int npairs,T start,T end,Sequence_T queryseq,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,bool watsonp,int cdna_direction)7337 Pair_print_pslformat_pro (Filestring_T fp, struct T *pairs, int npairs, T start, T end,
7338 Sequence_T queryseq, Chrnum_T chrnum,
7339 Univ_IIT_T chromosome_iit, Sequence_T usersegment,
7340 bool watsonp, int cdna_direction) {
7341 Chrpos_T chrpos1, chrpos2;
7342 Chrpos_T chrlength;
7343 Intlist_T blockSizes = NULL, qStarts = NULL, p;
7344 Uintlist_T tStarts = NULL, q;
7345 int nblocks, matches = 0, mismatches = 0, unknowns = 0;
7346 int qnbreaks, qlength, tnbreaks, tlength;
7347 char *chr;
7348
7349 chrlength = Chrnum_length(chrnum,chromosome_iit);
7350 nblocks = count_psl_blocks_pro(&blockSizes,&qStarts,&tStarts,pairs,npairs,
7351 watsonp,chrlength);
7352 compute_gap_lengths_int(&qnbreaks,&qlength,blockSizes,qStarts,nblocks);
7353 compute_gap_lengths_uint(&tnbreaks,&tlength,blockSizes,tStarts,nblocks);
7354
7355 count_matches_pro(&matches,&mismatches,&unknowns,pairs,npairs);
7356
7357 FPRINTF(fp,"%d\t%d\t%d\t%d\t",matches,mismatches,/*repeatmatches*/0,unknowns);
7358 FPRINTF(fp,"%d\t%d\t%d\t%d\t",qnbreaks,qlength,tnbreaks,tlength);
7359
7360 if (cdna_direction >= 0) {
7361 FPRINTF(fp,"+");
7362 } else {
7363 FPRINTF(fp,"-");
7364 }
7365
7366 if (watsonp == true) {
7367 FPRINTF(fp,"+");
7368 } else {
7369 FPRINTF(fp,"-");
7370 }
7371 FPRINTF(fp,"\t%s\t%d",Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
7372
7373 FPRINTF(fp,"\t%d\t%d",(start->querypos+2)/3,end->querypos/3+1);
7374
7375 /* T name and T size */
7376 if (chrnum == 0) {
7377 FPRINTF(fp,"\t%s\t%u",Sequence_accession(usersegment),Sequence_fulllength(usersegment));
7378 } else {
7379 chr = Chrnum_to_string(chrnum,chromosome_iit);
7380 FPRINTF(fp,"\tchr%s\t%u",chr,Chrnum_length(chrnum,chromosome_iit));
7381 FREE(chr);
7382 }
7383
7384 /* T start and T end */
7385 chrpos1 = start->genomepos;
7386 chrpos2 = end->genomepos;
7387 if (watsonp) {
7388 FPRINTF(fp,"\t%u\t%u",chrpos1,chrpos2+1);
7389 } else {
7390 FPRINTF(fp,"\t%u\t%u",chrpos2,chrpos1+1);
7391 }
7392
7393 nblocks = count_psl_blocks_pro(&blockSizes,&qStarts,&tStarts,pairs,npairs,
7394 watsonp,chrlength);
7395 FPRINTF(fp,"\t%d",nblocks);
7396 FPRINTF(fp,"\t");
7397
7398 for (p = blockSizes; p != NULL; p = Intlist_next(p)) {
7399 FPRINTF(fp,"%d,",Intlist_head(p));
7400 }
7401
7402 FPRINTF(fp,"\t");
7403 for (p = qStarts; p != NULL; p = Intlist_next(p)) {
7404 FPRINTF(fp,"%d,",Intlist_head(p));
7405 }
7406
7407 FPRINTF(fp,"\t");
7408
7409 for (q = tStarts; q != NULL; q = Uintlist_next(q)) {
7410 FPRINTF(fp,"%u,",Uintlist_head(q));
7411 }
7412
7413 Intlist_free(&blockSizes);
7414 Intlist_free(&qStarts);
7415 Uintlist_free(&tStarts);
7416
7417 PUTC('\n',fp);
7418 return;
7419 }
7420
7421 void
Pair_print_exons(Filestring_T fp,struct T * pairs,int npairs,int wraplength,int ngap,bool cdnap)7422 Pair_print_exons (Filestring_T fp, struct T *pairs, int npairs, int wraplength, int ngap, bool cdnap) {
7423 bool in_exon = false;
7424 struct T *ptr, *this = NULL;
7425 int i, exonno = 0, column = 0;
7426
7427 ptr = pairs;
7428 for (i = 0; i < npairs; i++) {
7429 this = ptr++;
7430
7431 if (this->gapp) {
7432 if (in_exon == true) {
7433 if (column != 0) {
7434 PUTC('\n',fp);
7435 column = 0;
7436 }
7437 FPRINTF(fp,"</exon>\n");
7438 in_exon = false;
7439 if (ngap > 0) {
7440 FPRINTF(fp,"<intron %d>\n",exonno);
7441 PUTC(this->genome,fp);
7442 column = 1;
7443 }
7444 } else {
7445 if (ngap > 0) {
7446 PUTC(this->genome,fp);
7447 if (++column % wraplength == 0) {
7448 PUTC('\n',fp);
7449 column = 0;
7450 }
7451 }
7452 }
7453 } else if (this->comp == INTRONGAP_COMP) {
7454 /* Do nothing */
7455 } else {
7456 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
7457 SHORTGAP_COMP, or MISMATCH_COMP */
7458 if (in_exon == false) {
7459 if (ngap > 0) {
7460 if (exonno > 0) {
7461 if (column != 0) {
7462 PUTC('\n',fp);
7463 column = 0;
7464 }
7465 FPRINTF(fp,"</intron>\n");
7466 }
7467 }
7468 FPRINTF(fp,"<exon %d",++exonno);
7469 if (cdnap == true) {
7470 if (this->aaphase_e >= 0) {
7471 FPRINTF(fp,", phase %d",this->aaphase_e);
7472 }
7473 } else {
7474 if (this->aaphase_g >= 0) {
7475 FPRINTF(fp,", phase %d",this->aaphase_g);
7476 }
7477 }
7478 FPRINTF(fp,">\n");
7479 in_exon = true;
7480 }
7481 if (cdnap == true) {
7482 if (this->cdna != ' ') {
7483 PUTC(this->cdna,fp);
7484 if (++column % wraplength == 0) {
7485 PUTC('\n',fp);
7486 column = 0;
7487 }
7488 }
7489 } else {
7490 if (this->genome != ' ') {
7491 PUTC(this->genome,fp);
7492 if (++column % wraplength == 0) {
7493 PUTC('\n',fp);
7494 column = 0;
7495 }
7496 }
7497 }
7498 }
7499 }
7500 if (column != 0) {
7501 PUTC('\n',fp);
7502 }
7503 FPRINTF(fp,"</exon>\n");
7504
7505 return;
7506 }
7507
7508
7509 int
Pair_nmatches_posttrim(int * max_match_length,List_T pairs,int pos5,int pos3)7510 Pair_nmatches_posttrim (int *max_match_length, List_T pairs, int pos5, int pos3) {
7511 int nmatches = 0, match_length;
7512 bool in_intron = false;
7513 /* bool indelp = false; */
7514 List_T p;
7515 T this;
7516
7517 *max_match_length = match_length = 0;
7518 for (p = pairs; p != NULL; p = p->rest) {
7519 this = p->first;
7520 if (this->gapp) {
7521 if (!in_intron) {
7522 in_intron = true;
7523 }
7524 } else {
7525 if (in_intron) {
7526 in_intron = false;
7527 }
7528 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7529 /* indelp = true; */
7530 #ifndef PMAP
7531 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7532 /* (*unknowns)++; */
7533 #endif
7534 } else if (this->querypos < pos5) {
7535 /* Don't count match or mismatch */
7536 } else if (this->querypos >= pos3) {
7537 /* Don't count match or mismatch */
7538 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7539 nmatches++;
7540 match_length++;
7541 } else if (this->comp == MISMATCH_COMP) {
7542 /* (*mismatches)++; */
7543 if (match_length > *max_match_length) {
7544 *max_match_length = match_length;
7545 }
7546 match_length = 0;
7547 } else {
7548 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7549 abort();
7550 }
7551 }
7552 }
7553
7554 if (match_length > *max_match_length) {
7555 *max_match_length = match_length;
7556 }
7557
7558 return nmatches;
7559 }
7560
7561
7562 int
Pair_array_nmatches_posttrim(struct T * pairarray,int npairs,int pos5,int pos3)7563 Pair_array_nmatches_posttrim (struct T *pairarray, int npairs, int pos5, int pos3) {
7564 int nmatches = 0;
7565 bool in_intron = false;
7566 /* bool indelp = false; */
7567 int i;
7568 T this;
7569
7570 for (i = 0; i < npairs; i++) {
7571 this = &(pairarray[i]);
7572 if (this->gapp) {
7573 if (!in_intron) {
7574 in_intron = true;
7575 }
7576 } else {
7577 if (in_intron) {
7578 in_intron = false;
7579 }
7580 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7581 /* indelp = true; */
7582 #ifndef PMAP
7583 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7584 /* (*unknowns)++; */
7585 #endif
7586 } else if (this->querypos < pos5) {
7587 /* Don't count match or mismatch */
7588 } else if (this->querypos >= pos3) {
7589 /* Don't count match or mismatch */
7590 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7591 nmatches++;
7592 } else if (this->comp == MISMATCH_COMP) {
7593 /* (*mismatches)++; */
7594 } else {
7595 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7596 abort();
7597 }
7598 }
7599 }
7600
7601 return nmatches;
7602 }
7603
7604
7605 int
Pair_nmismatches_region(int * nindelbreaks,int * nbadintrons,struct T * pairs,int npairs,int trim_left,int trim_right,int start_amb_nmatches,int end_amb_nmatches,int querylength)7606 Pair_nmismatches_region (int *nindelbreaks, int *nbadintrons, struct T *pairs, int npairs,
7607 int trim_left, int trim_right, int start_amb_nmatches, int end_amb_nmatches,
7608 int querylength) {
7609 int nmismatches = 0;
7610 /* bool in_intron = false; */
7611 /* bool indelp = false; */
7612 bool in_exon = false;
7613 int i = 0;
7614 T this;
7615
7616 *nindelbreaks = *nbadintrons = 0;
7617
7618 /* Handle GMAP alignments that are not extended to the end */
7619 this = &(pairs[0]);
7620 if (this->querypos - start_amb_nmatches < trim_left) {
7621 /* Skip */
7622 } else {
7623 nmismatches += (this->querypos - start_amb_nmatches) - trim_left;
7624 }
7625
7626 while (i < npairs) {
7627 this = &(pairs[i]);
7628
7629 if (this->gapp) {
7630 if (in_exon == true) {
7631 /* SPLICE START */
7632 if (this->comp == FWD_CANONICAL_INTRON_COMP || this->comp == REV_CANONICAL_INTRON_COMP) {
7633 /* Okay */
7634 } else {
7635 /* Count bad introns, even if outside of trimmed region */
7636 (*nbadintrons) += 1;
7637 }
7638 in_exon = false;
7639 }
7640
7641 } else if (this->comp == INTRONGAP_COMP) {
7642 /* May want to print dinucleotides */
7643
7644 } else {
7645 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
7646 SHORTGAP_COMP, or MISMATCH_COMP */
7647 if (in_exon == false) {
7648 /* SPLICE CONTINUATION */
7649 in_exon = true;
7650 }
7651 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7652 /* Count indelbreaks, even if outside of trimmed region */
7653 if (this->genome == ' ') {
7654 /* INSERTION */
7655 while (i < npairs && this->genome == ' ') {
7656 /* (*total_nindels) += 1; */
7657 this = &(pairs[i++]);
7658 }
7659 i--;
7660 (*nindelbreaks) += 1;
7661
7662 } else if (this->cdna == ' ') {
7663 /* DELETION */
7664 while (i < npairs && this->cdna == ' ') {
7665 /* (*total_nindels) -= 1; */
7666 this = &(pairs[i++]);
7667 }
7668 i--;
7669 (*nindelbreaks) += 1;
7670 }
7671
7672 } else if (this->querypos < trim_left) {
7673 /* Skip for counting mismatches */
7674 } else if (this->querypos >= querylength - trim_right) {
7675 /* Skip for counting mismatches */
7676 } else if (this->comp == MISMATCH_COMP) {
7677 nmismatches++;
7678 }
7679 }
7680
7681 i++;
7682 }
7683
7684 /* Handle GMAP alignments that are not extended to the end */
7685 this = &(pairs[npairs-1]);
7686 if (this->querypos + end_amb_nmatches >= (querylength - 1) - trim_right) {
7687 /* Skip */
7688 } else {
7689 nmismatches += (querylength - 1 - trim_right) - (this->querypos + end_amb_nmatches);
7690 }
7691
7692 return nmismatches;
7693 }
7694
7695
7696
7697 int
Pair_goodness_simple(List_T pairs)7698 Pair_goodness_simple (List_T pairs) {
7699 int matches = 0, mismatches = 0;
7700 bool in_intron = false;
7701 List_T p;
7702 T this;
7703
7704 for (p = pairs; p != NULL; p = p->rest) {
7705 this = p->first;
7706 if (this->gapp) {
7707 if (!in_intron) {
7708 in_intron = true;
7709 }
7710 } else {
7711 if (in_intron) {
7712 in_intron = false;
7713 }
7714 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7715
7716 #ifndef PMAP
7717 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7718 /* (unknowns)++; */
7719 #endif
7720 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7721 matches++;
7722 } else if (this->comp == MISMATCH_COMP) {
7723 mismatches++;
7724 } else {
7725 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7726 abort();
7727 }
7728 }
7729 }
7730
7731 return matches + MISMATCH*mismatches;
7732 }
7733
7734
7735 void
Pair_fracidentity_simple(int * matches,int * unknowns,int * mismatches,List_T pairs)7736 Pair_fracidentity_simple (int *matches, int *unknowns, int *mismatches, List_T pairs) {
7737 bool in_intron = false;
7738 List_T p;
7739 T this;
7740
7741 *matches = *unknowns = *mismatches = 0;
7742 for (p = pairs; p != NULL; p = p->rest) {
7743 this = p->first;
7744 if (this->gapp) {
7745 if (!in_intron) {
7746 in_intron = true;
7747 }
7748 } else {
7749 if (in_intron) {
7750 in_intron = false;
7751 }
7752 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7753 #ifndef PMAP
7754 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7755 (*unknowns)++;
7756 #endif
7757 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7758 (*matches)++;
7759 } else if (this->comp == MISMATCH_COMP) {
7760 (*mismatches)++;
7761 } else {
7762 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7763 abort();
7764 }
7765 }
7766 }
7767
7768 return;
7769 }
7770
7771
7772 void
Pair_fracidentity(int * matches,int * unknowns,int * mismatches,int * qopens,int * qindels,int * topens,int * tindels,int * ncanonical,int * nsemicanonical,int * nnoncanonical,double * min_splice_prob,List_T pairs,int cdna_direction)7773 Pair_fracidentity (int *matches, int *unknowns, int *mismatches, int *qopens, int *qindels,
7774 int *topens, int *tindels, int *ncanonical, int *nsemicanonical, int *nnoncanonical,
7775 double *min_splice_prob, List_T pairs, int cdna_direction) {
7776 bool in_intron = false;
7777 List_T p;
7778 T this, prev = NULL;
7779
7780 *matches = *unknowns = *mismatches = *qopens = *qindels = *topens = *tindels =
7781 *ncanonical = *nsemicanonical = *nnoncanonical = 0;
7782 *min_splice_prob = 1.0;
7783
7784 for (p = pairs; p != NULL; p = p->rest) {
7785 this = p->first;
7786 if (this->gapp) {
7787 if (this->donor_prob < *min_splice_prob) {
7788 *min_splice_prob = this->donor_prob;
7789 }
7790 if (this->acceptor_prob < *min_splice_prob) {
7791 *min_splice_prob = this->acceptor_prob;
7792 }
7793 if (!in_intron) {
7794 if (cdna_direction > 0) {
7795 if (this->comp == FWD_CANONICAL_INTRON_COMP) {
7796 (*ncanonical)++;
7797 in_intron = true;
7798 } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
7799 (*nsemicanonical)++;
7800 in_intron = true;
7801 } else if (this->genomejump - this->queryjump < 50) {
7802 (*topens)++;
7803 (*tindels) += this->genomejump - this->queryjump;
7804 /* in_intron = false */
7805 } else if (this->comp == NONINTRON_COMP) {
7806 (*nnoncanonical)++;
7807 in_intron = true;
7808 }
7809
7810 } else if (cdna_direction < 0) {
7811 if (this->comp == REV_CANONICAL_INTRON_COMP) {
7812 (*ncanonical)++;
7813 in_intron = true;
7814 } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
7815 (*nsemicanonical)++;
7816 in_intron = true;
7817 } else if (this->genomejump - this->queryjump < 50) {
7818 (*topens)++;
7819 (*tindels) += this->genomejump - this->queryjump;
7820 /* in_intron = false */
7821 } else if (this->comp == NONINTRON_COMP) {
7822 (*nnoncanonical)++;
7823 in_intron = true;
7824 }
7825 }
7826 }
7827 } else {
7828 if (in_intron) {
7829 in_intron = false;
7830 }
7831 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7832 if (this->cdna == ' ') {
7833 (*tindels)++; /* If genome has extra char, count it as a genome skip */
7834 if (prev && prev->cdna != ' ') {
7835 (*topens)++;
7836 }
7837 } else if (this->genome == ' ') {
7838 (*qindels)++;
7839 if (prev && prev->genome != ' ') {
7840 (*qopens)++;
7841 }
7842 } else {
7843 fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
7844 this->comp,this->cdna,this->genome);
7845 abort();
7846 }
7847 #ifndef PMAP
7848 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7849 (*unknowns)++;
7850 #endif
7851 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7852 (*matches)++;
7853 } else if (this->comp == MISMATCH_COMP) {
7854 (*mismatches)++;
7855 } else {
7856 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7857 abort();
7858 }
7859 }
7860 prev = this;
7861 }
7862
7863 return;
7864 }
7865
7866
7867 int
Pair_fracidentity_array(int * matches,int * unknowns,int * mismatches,int * qopens,int * qindels,int * topens,int * tindels,int * ncanonical,int * nsemicanonical,int * nnoncanonical,double * min_splice_prob,struct T * ptr,int npairs,int cdna_direction)7868 Pair_fracidentity_array (int *matches, int *unknowns, int *mismatches, int *qopens, int *qindels,
7869 int *topens, int *tindels, int *ncanonical, int *nsemicanonical, int *nnoncanonical,
7870 double *min_splice_prob, struct T *ptr, int npairs, int cdna_direction) {
7871 bool in_intron = false;
7872 int i;
7873 T this, prev = NULL;
7874
7875 *matches = *unknowns = *mismatches = *qopens = *qindels = *topens = *tindels =
7876 *ncanonical = *nsemicanonical = *nnoncanonical = 0;
7877 *min_splice_prob = 1.0;
7878
7879 for (i = 0; i < npairs; i++) {
7880 this = ptr++;
7881 if (this->gapp) {
7882 if (this->donor_prob < *min_splice_prob) {
7883 *min_splice_prob = this->donor_prob;
7884 }
7885 if (this->acceptor_prob < *min_splice_prob) {
7886 *min_splice_prob = this->acceptor_prob;
7887 }
7888 if (!in_intron) {
7889 if (cdna_direction > 0) {
7890 if (this->comp == FWD_CANONICAL_INTRON_COMP) {
7891 (*ncanonical)++;
7892 in_intron = true;
7893 } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
7894 (*nsemicanonical)++;
7895 in_intron = true;
7896 } else if (this->genomejump - this->queryjump < 50) {
7897 (*topens)++;
7898 (*tindels) += this->genomejump - this->queryjump;
7899 /* in_intron = false */
7900 } else if (this->comp == NONINTRON_COMP) {
7901 (*nnoncanonical)++;
7902 in_intron = true;
7903 }
7904
7905 } else if (cdna_direction < 0) {
7906 if (this->comp == REV_CANONICAL_INTRON_COMP) {
7907 (*ncanonical)++;
7908 in_intron = true;
7909 } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
7910 (*nsemicanonical)++;
7911 in_intron = true;
7912 } else if (this->genomejump - this->queryjump < 50) {
7913 (*topens)++;
7914 (*tindels) += this->genomejump - this->queryjump;
7915 /* in_intron = false */
7916 } else if (this->comp == NONINTRON_COMP) {
7917 (*nnoncanonical)++;
7918 in_intron = true;
7919 }
7920 }
7921 }
7922 } else {
7923 if (in_intron) {
7924 in_intron = false;
7925 }
7926 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7927 if (this->cdna == ' ') {
7928 (*tindels)++; /* If genome has extra char, count it as a genome skip */
7929 if (prev && prev->cdna != ' ') {
7930 (*topens)++;
7931 }
7932 } else if (this->genome == ' ') {
7933 (*qindels)++;
7934 if (prev && prev->genome != ' ') {
7935 (*qopens)++;
7936 }
7937 } else {
7938 fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
7939 this->comp,this->cdna,this->genome);
7940 abort();
7941 }
7942 #ifndef PMAP
7943 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7944 (*unknowns)++;
7945 #endif
7946 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7947 (*matches)++;
7948 } else if (this->comp == MISMATCH_COMP) {
7949 (*mismatches)++;
7950 } else {
7951 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7952 abort();
7953 }
7954 }
7955 prev = this;
7956 }
7957
7958 return (*matches) + MISMATCH*(*mismatches)
7959 + QOPEN*(*qopens) + QINDEL*(*qindels) + TOPEN*(*topens) + TINDEL*(*tindels)
7960 - CANONICAL_POINTS*(*nnoncanonical);
7961 }
7962
7963
#if 0
/* Called on first and last exons during distal/medial calculation */
/* Procedure seems to give random results */
/* Disabled.  Walks the pair list keeping a running score and returns
   the 1-based index of the pair at which that score reached its
   maximum (0 if it never rose above zero).  Only a match can set a new
   maximum. */
int
Pair_fracidentity_changepoint (List_T pairs, int cdna_direction) {
  int best_index = 0, best_score = 0, running = 0;
  int i = 0;
  List_T cell;
  T pair, prev = NULL;

  for (cell = pairs; cell != NULL; cell = cell->rest) {
    i++;
    pair = cell->first;
    debug3(FPRINTF(fp,"%d: ",i));
    debug3(Pair_dump_one(pair,/*zerobasedp*/false));

    if (pair->gapp) {
      /* Don't expect an intron, so gaps are not classified or scored */

    } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
      if (pair->cdna == ' ') {
        running += TINDEL;
        if (prev != NULL && prev->cdna != ' ') {
          running += TOPEN;
        }
      } else if (pair->genome == ' ') {
        running += QINDEL;
        if (prev != NULL && prev->genome != ' ') {
          running += QOPEN;
        }
      } else {
        fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
                pair->comp,pair->cdna,pair->genome);
        abort();
      }
#ifndef PMAP
    } else if (unknown_base(pair->cdna) || unknown_base(pair->genome) || pair->comp == AMBIGUOUS_COMP) {
      /* Unknown bases do not change the score */
#endif
    } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP || pair->comp == AMBIGUOUS_COMP) {
      running += MATCH;
      if (running > best_score) {
        best_score = running;
        best_index = i;
        debug3(FPRINTF(fp," => maxscore %d",best_score));
      }
    } else if (pair->comp == MISMATCH_COMP) {
      running += MISMATCH;
    } else {
      fprintf(stderr,"Can't parse comp %c, gapp %d\n",pair->comp,pair->gapp);
      abort();
    }

    debug3(FPRINTF(fp,"\n"));
    prev = pair;
  }

  return best_index;
}
#endif
8055
8056
8057 int
Pair_fracidentity_score(List_T pairs)8058 Pair_fracidentity_score (List_T pairs) {
8059 int score = 0;
8060 int i = 0;
8061
8062 bool in_intron = false;
8063 List_T p;
8064 T this, prev = NULL;
8065
8066 for (p = pairs; p != NULL; p = p->rest) {
8067 i++;
8068 this = p->first;
8069 debug3(FPRINTF(fp,"%d: ",i));
8070 debug3(Pair_dump_one(this,/*zerobasedp*/false));
8071 if (this->gapp) {
8072 if (!in_intron) {
8073 in_intron = true;
8074 }
8075 } else {
8076 if (in_intron) {
8077 in_intron = false;
8078 }
8079 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
8080 if (this->cdna == ' ') {
8081 score += TINDEL;
8082 if (prev && prev->cdna != ' ') {
8083 score += TOPEN;
8084 }
8085 } else if (this->genome == ' ') {
8086 score += QINDEL;
8087 if (prev && prev->genome != ' ') {
8088 score += QOPEN;
8089 }
8090 } else {
8091 fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
8092 this->comp,this->cdna,this->genome);
8093 abort();
8094 }
8095 #ifndef PMAP
8096 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
8097 /* (*unknowns)++; */
8098 #endif
8099 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
8100 score += MATCH;
8101 } else if (this->comp == MISMATCH_COMP) {
8102 score += MISMATCH;
8103 } else {
8104 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
8105 abort();
8106 }
8107 }
8108 debug3(FPRINTF(fp,"\n"));
8109 prev = this;
8110 }
8111
8112 return score;
8113 }
8114
8115
8116 double
Pair_frac_error(List_T pairs,int cdna_direction)8117 Pair_frac_error (List_T pairs, int cdna_direction) {
8118 int matches, unknowns, mismatches, qopens, qindels,
8119 topens, tindels, ncanonical, nsemicanonical, nnoncanonical;
8120 int den;
8121 double min_splice_prob;
8122
8123 Pair_fracidentity(&matches,&unknowns,&mismatches,&qopens,&qindels,
8124 &topens,&tindels,&ncanonical,&nsemicanonical,&nnoncanonical,
8125 &min_splice_prob,pairs,cdna_direction);
8126
8127 if ((den = matches + mismatches + qindels + tindels) == 0) {
8128 return 1.0;
8129 } else {
8130 return (double) (mismatches + qindels + tindels)/(double) den;
8131 }
8132 }
8133
8134 void
Pair_fracidentity_bounded(int * matches,int * unknowns,int * mismatches,int * qopens,int * qindels,int * topens,int * tindels,int * ncanonical,int * nsemicanonical,int * nnoncanonical,struct T * ptr,int npairs,int cdna_direction,int minpos,int maxpos)8135 Pair_fracidentity_bounded (int *matches, int *unknowns, int *mismatches,
8136 int *qopens, int *qindels, int *topens, int *tindels,
8137 int *ncanonical, int *nsemicanonical, int *nnoncanonical,
8138 struct T *ptr, int npairs,
8139 int cdna_direction, int minpos, int maxpos) {
8140 bool in_intron = false;
8141 T this, prev = NULL;
8142 int i;
8143
8144 *matches = *unknowns = *mismatches = *qopens = *qindels = *topens = *tindels =
8145 *ncanonical = *nsemicanonical = *nnoncanonical = 0;
8146
8147 for (i = 0; i < npairs; i++) {
8148 this = ptr++;
8149 if (this->gapp) {
8150 if (!in_intron) {
8151 if (this->querypos >= minpos && this->querypos <= maxpos) {
8152 if (this->comp == FWD_CANONICAL_INTRON_COMP) {
8153 (*ncanonical)++;
8154 } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
8155 (*nsemicanonical)++;
8156 } else if (this->comp == NONINTRON_COMP) {
8157 (*nnoncanonical)++;
8158 }
8159 } else if (cdna_direction < 0) {
8160 if (this->comp == REV_CANONICAL_INTRON_COMP) {
8161 (*ncanonical)++;
8162 } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
8163 (*nsemicanonical)++;
8164 } else if (this->comp == NONINTRON_COMP) {
8165 (*nnoncanonical)++;
8166 }
8167 }
8168 in_intron = true;
8169 }
8170 } else {
8171 if (in_intron) {
8172 in_intron = false;
8173 }
8174 if (this->querypos >= minpos && this->querypos <= maxpos) {
8175 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
8176 if (this->cdna == ' ') {
8177 (*tindels)++; /* If genome has extra char, count it as a genome skip */
8178 if (prev && prev->cdna != ' ') {
8179 (*topens)++;
8180 }
8181 } else if (this->genome == ' ') {
8182 (*qindels)++;
8183 if (prev && prev->genome != ' ') {
8184 (*qopens)++;
8185 }
8186 } else {
8187 fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
8188 this->comp,this->cdna,this->genome);
8189 abort();
8190 }
8191 #ifndef PMAP
8192 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
8193 (*unknowns)++;
8194 #endif
8195 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
8196 (*matches)++;
8197 } else if (this->comp == MISMATCH_COMP) {
8198 (*mismatches)++;
8199 } else {
8200 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
8201 abort();
8202 }
8203 }
8204 }
8205 prev = this;
8206 }
8207 return;
8208 }
8209
/* Exception value carrying the message "Exceeded array bounds";
   referenced by the RAISE in Pair_pathscores when a pair's querypos is
   not below querylength. */
static const Except_T Array_bounds_error = { "Exceeded array bounds" };
8211
8212
8213 void
Pair_matchscores(int * matchscores,struct T * ptr,int npairs)8214 Pair_matchscores (int *matchscores, struct T *ptr, int npairs) {
8215 T this;
8216 int querypos;
8217 int i;
8218
8219 for (i = 0; i < npairs; i++) {
8220 this = ptr++;
8221 querypos = this->querypos;
8222
8223 if (this->gapp) {
8224 matchscores[querypos] = 0; /* Count as mismatch; make evidence support the gap */
8225 } else if (this->comp == MISMATCH_COMP) {
8226 matchscores[querypos] = 0; /* For mismatch */
8227 } else if (this->comp == INDEL_COMP) {
8228 matchscores[querypos] = -1; /* Ignore indels */
8229 } else {
8230 matchscores[querypos] = 1; /* For match */
8231 }
8232 }
8233
8234 return;
8235 }
8236
8237
8238 int
Pair_maxnegscore(List_T pairs)8239 Pair_maxnegscore (List_T pairs) {
8240 int maxnegscore = 0, prevhigh = 0, score = 0;
8241 T this;
8242 List_T p = pairs;
8243
8244 while (p != NULL) {
8245 this = p->first;
8246 debug11(Pair_dump_one(this,/*zerobasedp*/true));
8247
8248 if (this->gapp) {
8249 /* Skip */
8250 p = p->rest;
8251
8252 } else if (this->comp == MISMATCH_COMP) {
8253 score += MISMATCH;
8254 if (score - prevhigh < maxnegscore) {
8255 maxnegscore = score - prevhigh;
8256 }
8257 p = p->rest;
8258
8259 } else if (this->comp == INDEL_COMP) {
8260 score += QOPEN + QINDEL;
8261 p = p->rest;
8262 while (p != NULL && ((T) p->first)->comp == INDEL_COMP) {
8263 score += QINDEL;
8264 p = p->rest;
8265 }
8266 if (score - prevhigh < maxnegscore) {
8267 maxnegscore = score - prevhigh;
8268 }
8269
8270 } else {
8271 score += MATCH;
8272 if (score > prevhigh) {
8273 prevhigh = score;
8274 }
8275 p = p->rest;
8276 }
8277
8278 debug11(printf(" score %d, prevhigh %d, maxnegscore %d\n",score,prevhigh,maxnegscore));
8279 }
8280
8281 return maxnegscore;
8282 }
8283
8284
8285 void
Pair_pathscores(bool * gapp,int * pathscores,struct T * ptr,int npairs,int cdna_direction,int querylength,cDNAEnd_T cdnaend,int pre_extension_slop)8286 Pair_pathscores (bool *gapp, int *pathscores, struct T *ptr, int npairs,
8287 int cdna_direction, int querylength, cDNAEnd_T cdnaend, int pre_extension_slop) {
8288 int querypos, querystart, queryend;
8289 int basescore;
8290 bool in_intron = false;
8291 T this, prev = NULL;
8292 int i;
8293
8294 /* Determine these before ptr changes */
8295 this = &(ptr[0]);
8296 querystart = this->querypos;
8297 this = &(ptr[npairs-1]);
8298 queryend = this->querypos;
8299 /* printf("Entered Pair_pathscores with querystart %d and queryend %d\n",querystart,queryend); */
8300
8301 /* Allow transitions slightly outside of the ends
8302 (pre_extension_slop) when finding non-extended paths to pair, but
8303 not when finding the breakpoint for the final pair, which has
8304 been extended */
8305 if (cdnaend == FIVE) {
8306 /* left part of chimera */
8307 for (querypos = 0; querypos < querystart; querypos++) {
8308 gapp[querypos] = true;
8309 }
8310 for (querypos = queryend + 1 + pre_extension_slop; querypos < querylength; querypos++) {
8311 gapp[querypos] = true;
8312 }
8313 } else {
8314 /* right part of chimera */
8315 for (querypos = 0; querypos < querystart - pre_extension_slop; querypos++) {
8316 gapp[querypos] = true;
8317 }
8318 for (querypos = queryend + 1; querypos < querylength; querypos++) {
8319 gapp[querypos] = true;
8320 }
8321 }
8322
8323 /* Initialize to cover the ends that aren't aligned */
8324 for (querypos = 0; querypos < querylength; querypos++) {
8325 pathscores[querypos] = QINDEL;
8326 }
8327
8328 for (i = 0; i < npairs; i++) {
8329 this = ptr++;
8330
8331 querypos = this->querypos;
8332 if (querypos >= querylength) {
8333 fprintf(stderr,"Pair_pathscores: querypos %d >= querylength %d\n",querypos,querylength);
8334 Pair_dump_array(ptr,npairs,/*zerobasedp*/true);
8335 fflush(stdout);
8336 abort();
8337 RAISE(Array_bounds_error);
8338 }
8339
8340 if (this->gapp) {
8341 gapp[querypos] = true;
8342 if (in_intron == false) {
8343 /* Adds only a single reward/penalty per intron */
8344 if (cdna_direction > 0) {
8345 if (this->comp == FWD_CANONICAL_INTRON_COMP) {
8346 pathscores[querypos] = CANONICAL_POINTS;
8347 } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
8348 pathscores[querypos] = SEMICANONICAL_POINTS;
8349 } else {
8350 pathscores[querypos] = NONCANONICAL_POINTS; /* noncanonical */
8351 }
8352 } else if (cdna_direction < 0) {
8353 if (this->comp == REV_CANONICAL_INTRON_COMP) {
8354 pathscores[querypos] = CANONICAL_POINTS;
8355 } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
8356 pathscores[querypos] = SEMICANONICAL_POINTS;
8357 } else {
8358 pathscores[querypos] = NONCANONICAL_POINTS; /* noncanonical */
8359 }
8360 } else {
8361 pathscores[querypos] = NONCANONICAL_POINTS; /* indeterminate */
8362 }
8363 in_intron = true;
8364 }
8365
8366 } else {
8367 if (in_intron) {
8368 in_intron = false;
8369 }
8370 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
8371 if (this->cdna == ' ') {
8372 pathscores[querypos] = TINDEL;
8373 if (prev && prev->cdna != ' ') {
8374 pathscores[querypos] = TOPEN;
8375 }
8376 } else if (this->genome == ' ') {
8377 pathscores[querypos] = QINDEL;
8378 if (prev && prev->genome != ' ') {
8379 pathscores[querypos] = QOPEN;
8380 }
8381 } else {
8382 fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
8383 this->comp,this->cdna,this->genome);
8384 abort();
8385 }
8386 #ifndef PMAP
8387 } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
8388 /* (*unknowns)++; */
8389 #endif
8390 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
8391 pathscores[querypos] = +1; /* For match */
8392 } else if (this->comp == MISMATCH_COMP) {
8393 pathscores[querypos] = MISMATCH;
8394 } else {
8395 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
8396 abort();
8397 }
8398 }
8399 prev = this;
8400 }
8401
8402 #if 0
8403 /* Gets querystart to queryend inclusive */
8404 if (0 && querystart == 0) {
8405 for (i = 1; i <= queryend; i++) {
8406 pathscores[i] += pathscores[i-1];
8407 }
8408 } else {
8409 for (i = querystart; i <= queryend; i++) {
8410 pathscores[i] += pathscores[i-1];
8411 }
8412 }
8413 #endif
8414
8415 #if 0
8416 if (cdnaend == FIVE) {
8417 for (i = queryend + 1; i < querylength; i++) {
8418 pathscores[i] = pathscores[i-1] + QINDEL;
8419 }
8420 } else if (cdnaend == THREE) {
8421 for (i = querystart - 1; i >= 0; --i) {
8422 pathscores[i] = pathscores[i+1] - QINDEL;
8423 }
8424 for (i = queryend + 1; i < querylength; i++) {
8425 pathscores[i] = pathscores[i-1];
8426 }
8427 }
8428 #endif
8429
8430 if (cdnaend == FIVE) {
8431 for (i = 1; i < querylength; i++) {
8432 pathscores[i] += pathscores[i-1];
8433 }
8434 basescore = pathscores[querystart];
8435 } else if (cdnaend == THREE) {
8436 for (i = querylength-2; i >= 0; --i) {
8437 pathscores[i] += pathscores[i+1];
8438 }
8439 basescore = pathscores[queryend];
8440 }
8441
8442 for (i = 0; i < querylength; i++) {
8443 pathscores[i] -= basescore;
8444 }
8445
8446 return;
8447 }
8448
8449
8450 int
Pair_nexons_approx(List_T pairs)8451 Pair_nexons_approx (List_T pairs) {
8452 int nexons = 0;
8453 bool in_exon = false;
8454 T this;
8455 List_T p;
8456
8457 for (p = pairs; p != NULL; p = List_next(p)) {
8458 this = List_head(p);
8459 if (this->gapp) {
8460 if (in_exon) {
8461 in_exon = false;
8462 }
8463 } else {
8464 if (!in_exon) {
8465 nexons++;
8466 in_exon = true;
8467 }
8468 }
8469 }
8470
8471 return nexons;
8472 }
8473
8474
8475 int
Pair_nexons(struct T * pairs,int npairs)8476 Pair_nexons (struct T *pairs, int npairs) {
8477 int nexons = 0;
8478 struct T *ptr, *this = NULL;
8479 bool in_exon = false;
8480 int i;
8481
8482 ptr = pairs;
8483 for (i = 0; i < npairs; i++) {
8484 this = ptr++;
8485 if (this->gapp) {
8486 if (in_exon) {
8487 in_exon = false;
8488 }
8489 } else if (this->comp == INTRONGAP_COMP) {
8490 /* Do nothing */
8491 } else {
8492 if (!in_exon) {
8493 nexons++;
8494 in_exon = true;
8495 }
8496 }
8497 }
8498
8499 return nexons;
8500 }
8501
8502
8503 bool
Pair_consistentp(int * ncanonical,struct T * pairs,int npairs,int cdna_direction)8504 Pair_consistentp (int *ncanonical, struct T *pairs, int npairs, int cdna_direction) {
8505 bool in_intron = false;
8506 struct T *this;
8507 int i;
8508
8509 *ncanonical = 0;
8510 for (i = 0; i < npairs; i++) {
8511 this = pairs++;
8512 if (this->gapp) {
8513 if (!in_intron) {
8514 if (cdna_direction > 0) {
8515 if (this->comp == REV_CANONICAL_INTRON_COMP ||
8516 this->comp == REV_GCAG_INTRON_COMP ||
8517 this->comp == REV_ATAC_INTRON_COMP) {
8518 return false;
8519 } else if (this->comp == FWD_CANONICAL_INTRON_COMP) {
8520 (*ncanonical)++;
8521 }
8522 } else if (cdna_direction < 0) {
8523 if (this->comp == FWD_CANONICAL_INTRON_COMP ||
8524 this->comp == FWD_GCAG_INTRON_COMP ||
8525 this->comp == FWD_ATAC_INTRON_COMP) {
8526 return false;
8527 } else if (this->comp == REV_CANONICAL_INTRON_COMP) {
8528 (*ncanonical)++;
8529 }
8530 } else if (cdna_direction == 0) {
8531 /* Set cdna_direction for next time */
8532 if (this->comp == FWD_CANONICAL_INTRON_COMP ||
8533 this->comp == FWD_GCAG_INTRON_COMP ||
8534 this->comp == FWD_ATAC_INTRON_COMP) {
8535 cdna_direction = +1;
8536 } else if (this->comp == REV_CANONICAL_INTRON_COMP ||
8537 this->comp == REV_GCAG_INTRON_COMP ||
8538 this->comp == REV_ATAC_INTRON_COMP) {
8539 cdna_direction = -1;
8540 }
8541 }
8542 in_intron = true;
8543 }
8544 } else {
8545 if (in_intron) {
8546 in_intron = false;
8547 }
8548 }
8549 }
8550
8551 return true;
8552 }
8553
8554
#if 0
/* Disabled.  Replaces a donor/acceptor dinucleotide pair, in place, by
   the pair obtained by swapping the two and passing each character
   through complCode with the character order reversed. */
static void
invert_intron (char *donor, char *acceptor) {
  char first_donor, second_donor;

  /* donor[0] <-> acceptor[1] */
  first_donor = donor[0];
  donor[0] = complCode[(int) acceptor[1]];
  acceptor[1] = complCode[(int) first_donor];

  /* donor[1] <-> acceptor[0] */
  second_donor = donor[1];
  donor[1] = complCode[(int) acceptor[0]];
  acceptor[0] = complCode[(int) second_donor];

  return;
}
#endif
8571
8572
8573 void
Pair_print_protein_genomic(Filestring_T fp,struct T * ptr,int npairs,int wraplength,bool forwardp)8574 Pair_print_protein_genomic (Filestring_T fp, struct T *ptr, int npairs, int wraplength, bool forwardp) {
8575 struct T *this;
8576 int xpos = 0, i;
8577
8578 if (forwardp == true) {
8579 for (i = 0; i < npairs; i++) {
8580 this = ptr++;
8581 if (this->aa_g != ' ') {
8582 if (xpos == wraplength) {
8583 PUTC('\n',fp);
8584 xpos = 0;
8585 }
8586 #ifdef PMAP
8587 PUTC(this->aa_g,fp);
8588 xpos++;
8589 #else
8590 if (this->aa_g != '*') {
8591 PUTC(this->aa_g,fp);
8592 xpos++;
8593 }
8594 #endif
8595 }
8596 }
8597 PUTC('\n',fp);
8598
8599 } else {
8600 for (i = npairs-1; i >= 0; i--) {
8601 this = ptr--;
8602 if (this->aa_g != ' ') {
8603 if (xpos == wraplength) {
8604 PUTC('\n',fp);
8605 xpos = 0;
8606 }
8607 #ifdef PMAP
8608 abort();
8609 PUTC(this->aa_g,fp);
8610 xpos++;
8611 #else
8612 if (this->aa_g != '*') {
8613 PUTC(this->aa_g,fp);
8614 xpos++;
8615 }
8616 #endif
8617 }
8618 }
8619 PUTC('\n',fp);
8620
8621 }
8622
8623 return;
8624 }
8625
8626 #ifdef PMAP
8627 void
Pair_print_nucleotide_cdna(Filestring_T fp,struct T * ptr,int npairs,int wraplength)8628 Pair_print_nucleotide_cdna (Filestring_T fp, struct T *ptr, int npairs, int wraplength) {
8629 struct T *this;
8630 int xpos = 0, i;
8631
8632 for (i = 0; i < npairs; i++) {
8633 this = ptr++;
8634 if (this->cdna != ' ') {
8635 if (xpos == wraplength) {
8636 PUTC('\n',fp);
8637 xpos = 0;
8638 }
8639 PUTC(this->cdna,fp);
8640 xpos++;
8641 }
8642 }
8643 PUTC('\n',fp);
8644 return;
8645 }
8646 #else
8647 void
Pair_print_protein_cdna(Filestring_T fp,struct T * ptr,int npairs,int wraplength,bool forwardp)8648 Pair_print_protein_cdna (Filestring_T fp, struct T *ptr, int npairs, int wraplength, bool forwardp) {
8649 struct T *this;
8650 int xpos = 0, i;
8651
8652 if (forwardp == true) {
8653 for (i = 0; i < npairs; i++) {
8654 this = ptr++;
8655 if (this->aa_e != ' ') {
8656 if (xpos == wraplength) {
8657 PUTC('\n',fp);
8658 xpos = 0;
8659 }
8660 if (this->aa_e != '*') {
8661 PUTC(this->aa_e,fp);
8662 xpos++;
8663 }
8664 }
8665 }
8666 PUTC('\n',fp);
8667
8668 } else {
8669 for (i = npairs-1; i >= 0; i--) {
8670 this = ptr--;
8671 if (this->aa_e != ' ') {
8672 if (xpos == wraplength) {
8673 PUTC('\n',fp);
8674 xpos = 0;
8675 }
8676 if (this->aa_e != '*') {
8677 PUTC(this->aa_e,fp);
8678 xpos++;
8679 }
8680 }
8681 }
8682 PUTC('\n',fp);
8683 }
8684
8685 return;
8686 }
8687 #endif
8688
8689
8690 #if 0
/* Disabled (#if 0) older form of the compressed output.

   Prints a '>' header line (accession, database or user segment name,
   path number, query length, number of exons, coverage, identity, query
   range, genomic ranges, strand, cDNA direction, and optional md5, chimera
   and strain fields), followed by one record per exon.  Each exon record
   holds the exon's 1-based genomic and query bounds, a percent identity
   computed as floor(100*num/den), a token string built with push_token()
   and emitted by print_tokens_compressed(), and the exon's query length.
   When another exon follows, the record continues with the intron length
   and, unless the intron is GT-AG, the donor-acceptor dinucleotides.

   NOTE(review): in the PMAP branch the coverage value is printed without
   the "/10.0" applied in the non-PMAP branch.
   NOTE(review): token[] holds 11 bytes; a "%d^%c" token needs more than
   that once runlength reaches 9 digits -- confirm runlength is bounded.
   NOTE(review): the final exon record prints exon_genomestart and
   exon_genomeend (Chrpos_T) with %d, whereas the in-loop record uses %u. */
void
Pair_print_compressed_old (Filestring_T fp, int pathnum, int npaths, T start, T end, Sequence_T queryseq, char *dbversion,
                           Sequence_T usersegment, int nexons, double fracidentity,
                           struct T *pairs, int npairs, Chrnum_T chrnum,
                           Univcoord_T chroffset, Univ_IIT_T chromosome_iit, int querylength_given,
                           int skiplength, int trim_start, int trim_end, bool checksump,
                           int chimerapos, int chimeraequivpos, double donor_prob, double acceptor_prob,
                           int chimera_cdna_direction, char *strain, bool watsonp, int cdna_direction) {
  Chrpos_T chrpos1, chrpos2;
  Univcoord_T position1, position2;

  bool in_exon = false;
  List_T tokens = NULL;
  struct T *ptr = pairs, *this = NULL;
  int querypos1, querypos2;
  int exon_querystart = -1, exon_queryend;
  Chrpos_T exon_genomestart = 0, exon_genomeend, intron_start, intron_end;
  int num = 0, den = 0, runlength = 0, i;
  int print_dinucleotide_p;
  char token[11], donor[3], acceptor[3], *chr;
  double coverage;
  /* double trimmed_coverage; */
  int last_querypos = -1;
  Chrpos_T last_genomepos = (Chrpos_T) -1;

  donor[0] = donor[1] = donor[2] = '\0';
  acceptor[0] = acceptor[1] = acceptor[2] = '\0';

  /* Coverage is computed from the start and end pairs passed in, before
     start and end are re-pointed at the array bounds below */
  querypos1 = start->querypos;
  querypos2 = end->querypos;

  FPRINTF(fp,">%s ",Sequence_accession(queryseq));
  if (dbversion != NULL) {
    FPRINTF(fp,"%s ",dbversion);
  } else if (usersegment != NULL && Sequence_accession(usersegment) != NULL) {
    FPRINTF(fp,"%s ",Sequence_accession(usersegment));
  } else {
    FPRINTF(fp,"user-provided ");
  }
#ifdef PMAP
  FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,(querylength_given+skiplength)*3,nexons);
  coverage = (double) (querypos2 - querypos1 + 1)/(double) ((querylength_given+skiplength)*3);
  FPRINTF(fp," %.1f",((double) rint(1000.0*coverage)));
#else
  coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given+skiplength);
  /* trim_start and trim_end are widened here but only the commented-out
     code below reads them */
  if (end->querypos + 1 > trim_end) {
    trim_end = end->querypos + 1;
  }
  if (start->querypos < trim_start) {
    trim_start = start->querypos;
  }
  /*
    trimmed_coverage = (double) (end->querypos - start->querypos + 1)/(double) (trim_end - trim_start + skiplength);
    FPRINTF(fp,">%s %s %d/%d %d(%d) %d",
    Sequence_accession(queryseq),dbversion,pathnum,npaths,
    querylength_given+skiplength,trim_end-trim_start,nexons);
    FPRINTF(fp," %.1f(%.1f)",((double) rint(1000.0*coverage))/10.0,((double) rint(1000.0*trimmed_coverage))/10.0);
  */
  FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,querylength_given+skiplength,nexons);
  FPRINTF(fp," %.1f",((double) rint(1000.0*coverage))/10.0);
#endif
  FPRINTF(fp," %.1f",((double) rint(1000.0*fracidentity))/10.0);

  /* From here on, start and end refer to the first and last array entries */
  start = &(pairs[0]);
  end = &(pairs[npairs-1]);
  FPRINTF(fp," %d%s%d",start->querypos + ONEBASEDP,"..",end->querypos + ONEBASEDP);

  chrpos1 = start->genomepos;
  chrpos2 = end->genomepos;
  position1 = chroffset + chrpos1;
  position2 = chroffset + chrpos2;
  FPRINTF(fp," %u%s%u",position1 + ONEBASEDP,"..",position2 + ONEBASEDP);

  if (chrnum == 0) {
    FPRINTF(fp," %u%s%u",chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
  } else {
    chr = Chrnum_to_string(chrnum,chromosome_iit);
    FPRINTF(fp," %s:%u%s%u",chr,chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
    FREE(chr);
  }

  /* Strand is inferred from the order of the two genomic endpoints */
  if (chrpos1 <= chrpos2) {
    FPRINTF(fp," +");
  } else {
    FPRINTF(fp," -");
  }

  if (cdna_direction > 0) {
    FPRINTF(fp," dir:sense");
  } else if (cdna_direction < 0) {
    FPRINTF(fp," dir:antisense");
  } else {
    FPRINTF(fp," dir:indet");
  }

  if (checksump == true) {
    FPRINTF(fp," md5:");
    Sequence_print_digest(fp,queryseq);
  }

  if (chimerapos >= 0) {
    if (chimeraequivpos == chimerapos) {
      if (donor_prob > 0.0 && acceptor_prob > 0.0) {
        if (chimera_cdna_direction >= 0) {
          FPRINTF(fp," chimera:%d(>)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
        } else {
          FPRINTF(fp," chimera:%d(<)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
        }
      } else {
        FPRINTF(fp," chimera:%d",chimerapos + ONEBASEDP);
      }
    } else {
      FPRINTF(fp," chimera:%d..%d",chimerapos + ONEBASEDP,chimeraequivpos + ONEBASEDP);
    }
  }

  if (strain != NULL) {
    FPRINTF(fp," strain:%s",strain);
  }

  PUTC('\n',fp);

  for (i = 0; i < npairs; i++) {
    /* prev = this; */
    this = ptr++;

    if (this->gapp) {
      if (in_exon == true) {
        /* Beginning of gap */
        exon_queryend = last_querypos + ONEBASEDP;
        exon_genomeend = last_genomepos + ONEBASEDP;
        if (watsonp) {
          intron_start = exon_genomeend + 1;
        } else {
          intron_start = exon_genomeend - 1;
        }

        FPRINTF(fp,"\t%u %u",exon_genomestart,exon_genomeend);
        FPRINTF(fp," %d %d",exon_querystart,exon_queryend);
        if (den == 0) {
          FPRINTF(fp," 100");
        } else {
          FPRINTF(fp," %d",(int) floor(100.0*(double) num/(double) den));
        }
        /* print_dinucleotide_p: 1 = print as is, -1 = invert first,
           0 = do not print dinucleotides */
        print_dinucleotide_p = 1;
        if (this->comp == FWD_CANONICAL_INTRON_COMP) {
          sprintf(token,"%d>",runlength);
        } else if (this->comp == REV_CANONICAL_INTRON_COMP) {
          sprintf(token,"%d<",runlength);
          print_dinucleotide_p = -1;
        } else if (this->comp == NONINTRON_COMP) {
          sprintf(token,"%d=",runlength);
        } else if (this->comp == FWD_GCAG_INTRON_COMP) {
          sprintf(token,"%d)",runlength);
        } else if (this->comp == REV_GCAG_INTRON_COMP) {
          sprintf(token,"%d(",runlength);
          print_dinucleotide_p = -1;
        } else if (this->comp == FWD_ATAC_INTRON_COMP) {
          sprintf(token,"%d]",runlength);
        } else if (this->comp == REV_ATAC_INTRON_COMP) {
          sprintf(token,"%d[",runlength);
          print_dinucleotide_p = -1;
        } else if (this->comp == DUALBREAK_COMP) {
          sprintf(token,"%d#",runlength);
          print_dinucleotide_p = 0;
        } else if (this->comp == EXTRAEXON_COMP) {
          sprintf(token,"%d#",runlength);
          print_dinucleotide_p = 0;
        } else {
          fprintf(stderr,"Can't parse comp '%c' in compression for %s\n",
                  this->comp,Sequence_accession(queryseq));
          abort();
        }
        tokens = push_token(tokens,token);
        tokens = List_reverse(tokens);
        print_tokens_compressed(fp,tokens);
        List_free_out(&tokens);
        FPRINTF(fp,"\t%d",exon_queryend - exon_querystart + 1);

        runlength = 0;
        /* First gap pair supplies donor[0]; clearing donor[1] marks that
           the second donor character is still wanted */
        donor[0] = this->genome;
        donor[1] = '\0';
        in_exon = false;
      } else if (donor[1] == '\0') {
        donor[1] = this->genome;
      } else {
        /* Slide a two-character window so that acceptor ends up holding
           the last two gap characters seen */
        acceptor[0] = acceptor[1];
        acceptor[1] = this->genome;
      }
    } else if (this->comp == INTRONGAP_COMP) {
      /* Do nothing */
    } else {
      /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
         SHORTGAP_COMP, or MISMATCH_COMP */
      if (in_exon == false) {
        exon_querystart = this->querypos + ONEBASEDP;
        exon_genomestart = this->genomepos + ONEBASEDP;
        if (watsonp) {
          intron_end = exon_genomestart - 1;
        } else {
          intron_end = exon_genomestart + 1;
        }
        if (i > 0) {
          /* Finish the previous exon's record with the intron length */
          if (intron_end > intron_start) {
            FPRINTF(fp,"\t%d",intron_end - intron_start + 1);
          } else {
            FPRINTF(fp,"\t%d",intron_start - intron_end + 1);
          }
          if (print_dinucleotide_p == -1) {
            invert_intron(donor,acceptor);
          }
          if (print_dinucleotide_p != 0) {
            if ((donor[0] == 'G' || donor[0] == 'g') &&
                (donor[1] == 'T' || donor[1] == 't') &&
                (acceptor[0] == 'A' || acceptor[0] == 'a') &&
                (acceptor[1] == 'G' || acceptor[1] == 'g')) {
              /* Do nothing */
            } else {
              FPRINTF(fp,"\t%c%c-%c%c",toupper(donor[0]),toupper(donor[1]),toupper(acceptor[0]),toupper(acceptor[1]));
            }
          }
#if 0
          if (exon_querystart > exon_queryend + 1) {
            FPRINTF(fp,"***");
          }
#endif
          PUTC('\n',fp);
        }

        num = den = 0;
        in_exon = true;
      }
      if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
        /* Gap in upper or lower sequence */
        if (this->genome == ' ') {
          sprintf(token,"%d^%c",runlength,this->cdna);
        } else if (this->cdna == ' ') {
          sprintf(token,"%dv",runlength);
        } else {
          fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
          exit(9);
        }
        tokens = push_token(tokens,token);
        runlength = 0;
        /* Don't increment den */

      } else if (this->comp == MISMATCH_COMP) {
        sprintf(token,"%dx%c",runlength,this->cdna);
        tokens = push_token(tokens,token);
        runlength = 0;
        den++;

#ifndef PMAP
      } else if (this->comp == AMBIGUOUS_COMP) {
        sprintf(token,"%d:%c",runlength,this->cdna);
        tokens = push_token(tokens,token);
        runlength = 0;
        den++;
        num++;
#endif

      } else {
        runlength++;
        den++;
        if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
          /* AMBIGUOUS_COMP handled above */
          num++;
        }
      }
    }

    /* Remember the most recent positions that carry a real character */
    if (this->cdna != ' ') {
      last_querypos = this->querypos;
    }
    if (this->genome != ' ') {
      last_genomepos = this->genomepos;
    }
  }

  /* Record for the final exon, which has no following gap */
  /* prev = this; */
  exon_queryend = last_querypos + ONEBASEDP;
  exon_genomeend = last_genomepos + ONEBASEDP;

  FPRINTF(fp,"\t%d %d",exon_genomestart,exon_genomeend);
  FPRINTF(fp," %d %d",exon_querystart,exon_queryend);
  if (den == 0) {
    FPRINTF(fp," 100");
  } else {
    FPRINTF(fp," %d",(int) floor(100.0*(double) num/(double) den));
  }

  sprintf(token,"%d*",runlength);
  tokens = push_token(tokens,token);
  tokens = List_reverse(tokens);
  print_tokens_compressed(fp,tokens);
  List_free_out(&tokens);

  FPRINTF(fp,"\t%d",exon_queryend - exon_querystart + 1);
  PUTC('\n',fp);

  return;
}
8993 #endif
8994
8995 #if 0
/* Disabled (#if 0) variant of the compressed output that prints one line
   per exon.

   Prints the same '>' header line as Pair_print_compressed, then for each
   exon a tab, the exon's 1-based genomic start and end, its 1-based query
   start and end, a tab, and the non-blank cdna characters of the pairs in
   that exon.

   NOTE(review): in the PMAP branch the coverage value is printed without
   the "/10.0" applied in the non-PMAP branch.
   NOTE(review): the final exon record prints exon_genomestart and
   exon_genomeend (Chrpos_T) with %d, whereas the in-loop record uses %u.
   NOTE(review): watsonp is not referenced in the body. */
void
Pair_print_compressed_byexons (Filestring_T fp, int pathnum, int npaths, T start, T end, Sequence_T queryseq, char *dbversion,
                               Sequence_T usersegment, int nexons, double fracidentity,
                               struct T *pairs, int npairs, Chrnum_T chrnum,
                               Univcoord_T chroffset, Univ_IIT_T chromosome_iit, int querylength_given,
                               int skiplength, int trim_start, int trim_end, bool checksump,
                               int chimerapos, int chimeraequivpos, double donor_prob, double acceptor_prob,
                               int chimera_cdna_direction, char *strain, bool watsonp, int cdna_direction) {
  Chrpos_T chrpos1, chrpos2;
  Univcoord_T position1, position2;

  bool in_exon = false;
  struct T *ptr = pairs, *this = NULL;
  int querypos1, querypos2;
  int exon_querystart = -1, exon_queryend;
  int exon_pairi_start, exon_pairi_end;
  Chrpos_T exon_genomestart = 0, exon_genomeend;
  int i, k;
  char *chr, c;
  double coverage;
  /* double trimmed_coverage; */
  int last_querypos = -1;
  Chrpos_T last_genomepos = (Chrpos_T) -1;

  /* Coverage is computed from the start and end pairs passed in, before
     start and end are re-pointed at the array bounds below */
  querypos1 = start->querypos;
  querypos2 = end->querypos;

  FPRINTF(fp,">%s ",Sequence_accession(queryseq));
  if (dbversion != NULL) {
    FPRINTF(fp,"%s ",dbversion);
  } else if (usersegment != NULL && Sequence_accession(usersegment) != NULL) {
    FPRINTF(fp,"%s ",Sequence_accession(usersegment));
  } else {
    FPRINTF(fp,"user-provided ");
  }
#ifdef PMAP
  FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,(querylength_given+skiplength)*3,nexons);
  coverage = (double) (querypos2 - querypos1 + 1)/(double) ((querylength_given+skiplength)*3);
  FPRINTF(fp," %.1f",((double) rint(1000.0*coverage)));
#else
  coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given+skiplength);
  /* trim_start and trim_end are widened here but only the commented-out
     code below reads them */
  if (end->querypos + 1 > trim_end) {
    trim_end = end->querypos + 1;
  }
  if (start->querypos < trim_start) {
    trim_start = start->querypos;
  }
  /*
    trimmed_coverage = (double) (end->querypos - start->querypos + 1)/(double) (trim_end - trim_start + skiplength);
    FPRINTF(fp,">%s %s %d/%d %d(%d) %d",
    Sequence_accession(queryseq),dbversion,pathnum,npaths,
    querylength_given+skiplength,trim_end-trim_start,nexons);
    FPRINTF(fp," %.1f(%.1f)",((double) rint(1000.0*coverage))/10.0,((double) rint(1000.0*trimmed_coverage))/10.0);
  */
  FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,querylength_given+skiplength,nexons);
  FPRINTF(fp," %.1f",((double) rint(1000.0*coverage))/10.0);
#endif
  FPRINTF(fp," %.1f",((double) rint(1000.0*fracidentity))/10.0);

  /* From here on, start and end refer to the first and last array entries */
  start = &(pairs[0]);
  end = &(pairs[npairs-1]);
  FPRINTF(fp," %d%s%d",start->querypos + ONEBASEDP,"..",end->querypos + ONEBASEDP);

  chrpos1 = start->genomepos;
  chrpos2 = end->genomepos;
  position1 = chroffset + chrpos1;
  position2 = chroffset + chrpos2;
  FPRINTF(fp," %u%s%u",position1 + ONEBASEDP,"..",position2 + ONEBASEDP);

  if (chrnum == 0) {
    FPRINTF(fp," %u%s%u",chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
  } else {
    chr = Chrnum_to_string(chrnum,chromosome_iit);
    FPRINTF(fp," %s:%u%s%u",chr,chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
    FREE(chr);
  }

  /* Strand is inferred from the order of the two genomic endpoints */
  if (chrpos1 <= chrpos2) {
    FPRINTF(fp," +");
  } else {
    FPRINTF(fp," -");
  }

  if (cdna_direction > 0) {
    FPRINTF(fp," dir:sense");
  } else if (cdna_direction < 0) {
    FPRINTF(fp," dir:antisense");
  } else {
    FPRINTF(fp," dir:indet");
  }

  if (checksump == true) {
    FPRINTF(fp," md5:");
    Sequence_print_digest(fp,queryseq);
  }

  if (chimerapos >= 0) {
    if (chimeraequivpos == chimerapos) {
      if (donor_prob > 0.0 && acceptor_prob > 0.0) {
        if (chimera_cdna_direction >= 0) {
          FPRINTF(fp," chimera:%d(>)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
        } else {
          FPRINTF(fp," chimera:%d(<)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
        }
      } else {
        FPRINTF(fp," chimera:%d",chimerapos + ONEBASEDP);
      }
    } else {
      FPRINTF(fp," chimera:%d..%d",chimerapos + ONEBASEDP,chimeraequivpos + ONEBASEDP);
    }
  }

  if (strain != NULL) {
    FPRINTF(fp," strain:%s",strain);
  }

  PUTC('\n',fp);

  exon_pairi_start = 0;
  for (i = 0; i < npairs; i++) {
    /* prev = this; */
    this = ptr++;

    if (this->gapp) {
      if (in_exon == true) {
        /* Beginning of gap */
        exon_queryend = last_querypos + ONEBASEDP;
        exon_genomeend = last_genomepos + ONEBASEDP;
        exon_pairi_end = i;

        FPRINTF(fp,"\t%u %u",exon_genomestart,exon_genomeend);
        FPRINTF(fp," %d %d",exon_querystart,exon_queryend);
        PUTC('\t',fp);
        /* Emit the query characters of pairs [exon_pairi_start, exon_pairi_end) */
        for (k = exon_pairi_start; k < exon_pairi_end; k++) {
          if ((c = pairs[k].cdna) != ' ') {
            PUTC(c,fp);
          }
        }

        in_exon = false;
      }

    } else if (this->comp == INTRONGAP_COMP) {
      /* Do nothing */

    } else {
      /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
         SHORTGAP_COMP, or MISMATCH_COMP */
      if (in_exon == false) {
        exon_querystart = this->querypos + ONEBASEDP;
        exon_genomestart = this->genomepos + ONEBASEDP;
        exon_pairi_start = i;
        /* Terminates the previous exon's line, if there was one */
        if (i > 0) {
          PUTC('\n',fp);
        }

        in_exon = true;
      }
    }

    /* Remember the most recent positions that carry a real character */
    if (this->cdna != ' ') {
      last_querypos = this->querypos;
    }
    if (this->genome != ' ') {
      last_genomepos = this->genomepos;
    }
  }

  /* Record for the final exon, which has no following gap */
  /* prev = this; */
  exon_queryend = last_querypos + ONEBASEDP;
  exon_genomeend = last_genomepos + ONEBASEDP;
  exon_pairi_end = i;

  FPRINTF(fp,"\t%d %d",exon_genomestart,exon_genomeend);
  FPRINTF(fp," %d %d",exon_querystart,exon_queryend);
  PUTC('\t',fp);
  for (k = exon_pairi_start; k < exon_pairi_end; k++) {
    if ((c = pairs[k].cdna) != ' ') {
      PUTC(c,fp);
    }
  }
  PUTC('\n',fp);

  return;
}
9181 #endif
9182
9183
9184 void
Pair_print_compressed(Filestring_T fp,int pathnum,int npaths,T start,T end,Sequence_T queryseq,char * dbversion,Sequence_T usersegment,int nexons,double fracidentity,struct T * pairs,int npairs,Chrnum_T chrnum,Univcoord_T chroffset,Univ_IIT_T chromosome_iit,int querylength_given,int skiplength,int trim_start,int trim_end,bool checksump,int chimerapos,int chimeraequivpos,double donor_prob,double acceptor_prob,int chimera_cdna_direction,char * strain,int cdna_direction)9185 Pair_print_compressed (Filestring_T fp, int pathnum, int npaths, T start, T end, Sequence_T queryseq, char *dbversion,
9186 Sequence_T usersegment, int nexons, double fracidentity,
9187 struct T *pairs, int npairs, Chrnum_T chrnum,
9188 Univcoord_T chroffset, Univ_IIT_T chromosome_iit, int querylength_given,
9189 int skiplength, int trim_start, int trim_end, bool checksump,
9190 int chimerapos, int chimeraequivpos, double donor_prob, double acceptor_prob,
9191 int chimera_cdna_direction, char *strain, int cdna_direction) {
9192 Chrpos_T chrpos1, chrpos2;
9193 Univcoord_T position1, position2;
9194
9195 struct T *ptr = pairs, *this = NULL;
9196 int querypos1, querypos2;
9197 int i;
9198 char *chr;
9199 double coverage;
9200 /* double trimmed_coverage; */
9201 /* int last_querypos = -1; */
9202 /* Chrpos_T last_genomepos = (Chrpos_T) -1; */
9203
9204 querypos1 = start->querypos;
9205 querypos2 = end->querypos;
9206
9207 FPRINTF(fp,">%s ",Sequence_accession(queryseq));
9208 if (dbversion != NULL) {
9209 FPRINTF(fp,"%s ",dbversion);
9210 } else if (usersegment != NULL && Sequence_accession(usersegment) != NULL) {
9211 FPRINTF(fp,"%s ",Sequence_accession(usersegment));
9212 } else {
9213 FPRINTF(fp,"user-provided ");
9214 }
9215 #ifdef PMAP
9216 FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,(querylength_given+skiplength)*3,nexons);
9217 coverage = (double) (querypos2 - querypos1 + 1)/(double) ((querylength_given+skiplength)*3);
9218 FPRINTF(fp," %.1f",((double) rint(1000.0*coverage)));
9219 #else
9220 coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given+skiplength);
9221 if (end->querypos + 1 > trim_end) {
9222 trim_end = end->querypos + 1;
9223 }
9224 if (start->querypos < trim_start) {
9225 trim_start = start->querypos;
9226 }
9227 /*
9228 trimmed_coverage = (double) (end->querypos - start->querypos + 1)/(double) (trim_end - trim_start + skiplength);
9229 FPRINTF(fp,">%s %s %d/%d %d(%d) %d",
9230 Sequence_accession(queryseq),dbversion,pathnum,npaths,
9231 querylength_given+skiplength,trim_end-trim_start,nexons);
9232 FPRINTF(fp," %.1f(%.1f)",((double) rint(1000.0*coverage))/10.0,((double) rint(1000.0*trimmed_coverage))/10.0);
9233 */
9234 FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,querylength_given+skiplength,nexons);
9235 FPRINTF(fp," %.1f",((double) rint(1000.0*coverage))/10.0);
9236 #endif
9237 FPRINTF(fp," %.1f",((double) rint(1000.0*fracidentity))/10.0);
9238
9239 start = &(pairs[0]);
9240 end = &(pairs[npairs-1]);
9241 FPRINTF(fp," %d%s%d",start->querypos + ONEBASEDP,"..",end->querypos + ONEBASEDP);
9242
9243 chrpos1 = start->genomepos;
9244 chrpos2 = end->genomepos;
9245 position1 = chroffset + chrpos1;
9246 position2 = chroffset + chrpos2;
9247 FPRINTF(fp," %u%s%u",position1 + ONEBASEDP,"..",position2 + ONEBASEDP);
9248
9249 if (chrnum == 0) {
9250 FPRINTF(fp," %u%s%u",chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
9251 } else {
9252 chr = Chrnum_to_string(chrnum,chromosome_iit);
9253 FPRINTF(fp," %s:%u%s%u",chr,chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
9254 FREE(chr);
9255 }
9256
9257 if (chrpos1 <= chrpos2) {
9258 FPRINTF(fp," +");
9259 } else {
9260 FPRINTF(fp," -");
9261 }
9262
9263 if (cdna_direction > 0) {
9264 FPRINTF(fp," dir:sense");
9265 } else if (cdna_direction < 0) {
9266 FPRINTF(fp," dir:antisense");
9267 } else {
9268 FPRINTF(fp," dir:indet");
9269 }
9270
9271 if (checksump == true) {
9272 FPRINTF(fp," md5:");
9273 Sequence_print_digest(fp,queryseq);
9274 }
9275
9276 if (chimerapos >= 0) {
9277 if (chimeraequivpos == chimerapos) {
9278 if (donor_prob > 0.0 && acceptor_prob > 0.0) {
9279 if (chimera_cdna_direction >= 0) {
9280 FPRINTF(fp," chimera:%d(>)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
9281 } else {
9282 FPRINTF(fp," chimera:%d(<)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
9283 }
9284 } else {
9285 FPRINTF(fp," chimera:%d",chimerapos + ONEBASEDP);
9286 }
9287 } else {
9288 FPRINTF(fp," chimera:%d..%d",chimerapos + ONEBASEDP,chimeraequivpos + ONEBASEDP);
9289 }
9290 }
9291
9292 if (strain != NULL) {
9293 FPRINTF(fp," strain:%s",strain);
9294 }
9295
9296 PUTC('\n',fp);
9297
9298 for (i = 0; i < npairs; i++) {
9299 /* prev = this; */
9300 this = ptr++;
9301 if (this->cdna != ' ') {
9302 PUTC(this->cdna,fp);
9303 }
9304 }
9305
9306 PUTC('\n',fp);
9307
9308 return;
9309 }
9310
9311
9312 void
Pair_print_iit_map(Filestring_T fp,Sequence_T queryseq,char * accession,T start,T end,Chrnum_T chrnum,Univ_IIT_T chromosome_iit)9313 Pair_print_iit_map (Filestring_T fp, Sequence_T queryseq, char *accession,
9314 T start, T end, Chrnum_T chrnum, Univ_IIT_T chromosome_iit) {
9315 char *chrstring = NULL;
9316 Chrpos_T chrpos1, chrpos2;
9317
9318 if (chrnum == 0) {
9319 chrstring = "";
9320 } else {
9321 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
9322 }
9323
9324 /* Made identical to code for Pair_print_iit_exon_map */
9325 chrpos1 = start->genomepos + ONEBASEDP;
9326 chrpos2 = end->genomepos + ONEBASEDP;
9327 FPRINTF(fp,">%s %s:%u..%u\n",accession,chrstring,chrpos1,chrpos2);
9328 Sequence_print_header(fp,queryseq,/*checksump*/false);
9329
9330 if (chrnum != 0) {
9331 FREE(chrstring);
9332 }
9333
9334 return;
9335 }
9336
9337
9338 void
Pair_print_iit_exon_map(Filestring_T fp,struct T * pairs,int npairs,Sequence_T queryseq,char * accession,T start,T end,Chrnum_T chrnum,Univ_IIT_T chromosome_iit)9339 Pair_print_iit_exon_map (Filestring_T fp, struct T *pairs, int npairs, Sequence_T queryseq, char *accession,
9340 T start, T end, Chrnum_T chrnum, Univ_IIT_T chromosome_iit) {
9341 int i;
9342 bool in_exon = false;
9343 struct T *ptr = pairs, *this = NULL;
9344 Chrpos_T exon_genomestart = 0, exon_genomeend;
9345 char *chrstring = NULL;
9346 Chrpos_T chrpos1, chrpos2;
9347 Chrpos_T last_genomepos = (Chrpos_T) -1;
9348
9349 if (chrnum == 0) {
9350 chrstring = "";
9351 } else {
9352 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
9353 }
9354
9355 chrpos1 = start->genomepos + ONEBASEDP;
9356 chrpos2 = end->genomepos + ONEBASEDP;
9357 FPRINTF(fp,">%s %s:%u..%u\n",accession,chrstring,chrpos1,chrpos2);
9358 Sequence_print_header(fp,queryseq,/*checksump*/false);
9359
9360 for (i = 0; i < npairs; i++) {
9361 /* prev = this; */
9362 this = ptr++;
9363
9364 if (this->gapp) {
9365 if (in_exon == true) {
9366 /* Beginning of gap */
9367 exon_genomeend = last_genomepos + ONEBASEDP;
9368 FPRINTF(fp,"%u %u\n",exon_genomestart,exon_genomeend);
9369 in_exon = false;
9370 }
9371 } else if (this->comp == INTRONGAP_COMP) {
9372 /* Do nothing */
9373 } else {
9374 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
9375 SHORTGAP_COMP, or MISMATCH_COMP */
9376 if (in_exon == false) {
9377 exon_genomestart = this->genomepos + ONEBASEDP;
9378 in_exon = true;
9379 }
9380 }
9381 if (this->genome != ' ') {
9382 last_genomepos = this->genomepos;
9383 }
9384 }
9385
9386 /* prev = this; */
9387 exon_genomeend = last_genomepos + ONEBASEDP;
9388
9389 FPRINTF(fp,"%u %u\n",exon_genomestart,exon_genomeend);
9390
9391 if (chrnum != 0) {
9392 FREE(chrstring);
9393 }
9394
9395 return;
9396 }
9397
9398
9399 void
Pair_print_splicesites(Filestring_T fp,struct T * pairs,int npairs,char * accession,int nexons,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,bool watsonp)9400 Pair_print_splicesites (Filestring_T fp, struct T *pairs, int npairs, char *accession,
9401 int nexons, Chrnum_T chrnum, Univ_IIT_T chromosome_iit, bool watsonp) {
9402 int exoni = 0, i;
9403 bool in_exon = false;
9404 struct T *ptr = pairs, *this = NULL;
9405 Chrpos_T exon_genomestart = 0, exon_genomeend;
9406 char *chrstring = NULL;
9407 Chrpos_T last_genomepos = (Chrpos_T) -1, intron_length;
9408
9409 if (chrnum == 0) {
9410 chrstring = "";
9411 } else {
9412 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
9413 }
9414
9415 for (i = 0; i < npairs; i++) {
9416 /* prev = this; */
9417 this = ptr++;
9418
9419 if (this->gapp) {
9420 if (in_exon == true) {
9421 /* Beginning of gap */
9422 exon_genomeend = last_genomepos + ONEBASEDP;
9423 if (watsonp) {
9424 FPRINTF(fp,">%s.exon%d/%d %s:%u..%u donor",accession,exoni,nexons,chrstring,exon_genomeend,exon_genomeend+1);
9425 } else {
9426 FPRINTF(fp,">%s.exon%d/%d %s:%u..%u donor",accession,exoni,nexons,chrstring,exon_genomeend,exon_genomeend-1);
9427 }
9428 in_exon = false;
9429 }
9430 } else if (this->comp == INTRONGAP_COMP) {
9431 /* Do nothing */
9432 } else {
9433 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
9434 SHORTGAP_COMP, or MISMATCH_COMP */
9435 if (in_exon == false) {
9436 exoni++;
9437 if (exoni > 1) {
9438 exon_genomestart = this->genomepos + ONEBASEDP;
9439 if (watsonp) {
9440 intron_length = exon_genomestart - exon_genomeend - 1;
9441 FPRINTF(fp," %u\n",intron_length); /* For previous donor */
9442 FPRINTF(fp,">%s.exon%d/%d %s:%u..%u acceptor",accession,exoni,nexons,chrstring,exon_genomestart-1,exon_genomestart);
9443 FPRINTF(fp," %u\n",intron_length);
9444 } else {
9445 intron_length = exon_genomeend - exon_genomestart - 1;
9446 FPRINTF(fp," %u\n",intron_length); /* For previous donor */
9447 FPRINTF(fp,">%s.exon%d/%d %s:%u..%u acceptor",accession,exoni,nexons,chrstring,exon_genomestart+1,exon_genomestart);
9448 FPRINTF(fp," %u\n",intron_length);
9449 }
9450 }
9451
9452 in_exon = true;
9453 }
9454 }
9455 if (this->genome != ' ') {
9456 last_genomepos = this->genomepos;
9457 }
9458 }
9459
9460 if (chrnum != 0) {
9461 FREE(chrstring);
9462 }
9463
9464 return;
9465 }
9466
9467
9468 void
Pair_print_introns(Filestring_T fp,struct T * pairs,int npairs,char * accession,int nexons,Chrnum_T chrnum,Univ_IIT_T chromosome_iit)9469 Pair_print_introns (Filestring_T fp, struct T *pairs, int npairs, char *accession,
9470 int nexons, Chrnum_T chrnum, Univ_IIT_T chromosome_iit) {
9471 int exoni = 0, i;
9472 bool in_exon = false;
9473 struct T *ptr = pairs, *this = NULL;
9474 Chrpos_T exon_genomestart = 0, exon_genomeend;
9475 char *chrstring = NULL;
9476 Chrpos_T last_genomepos = (Chrpos_T) -1;
9477
9478 if (chrnum == 0) {
9479 chrstring = "";
9480 } else {
9481 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
9482 }
9483
9484 for (i = 0; i < npairs; i++) {
9485 /* prev = this; */
9486 this = ptr++;
9487
9488 if (this->gapp) {
9489 if (in_exon == true) {
9490 /* Beginning of gap */
9491 exon_genomeend = last_genomepos + ONEBASEDP;
9492 in_exon = false;
9493 }
9494 } else if (this->comp == INTRONGAP_COMP) {
9495 /* Do nothing */
9496 } else {
9497 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
9498 SHORTGAP_COMP, or MISMATCH_COMP */
9499 if (in_exon == false) {
9500 exoni++;
9501 if (exoni > 1) {
9502 exon_genomestart = this->genomepos + ONEBASEDP;
9503 FPRINTF(fp,">%s.intron%d/%d %s:%u..%u\n",accession,exoni-1,nexons-1,chrstring,exon_genomeend,exon_genomestart);
9504 }
9505
9506 in_exon = true;
9507 }
9508 }
9509 if (this->genome != ' ') {
9510 last_genomepos = this->genomepos;
9511 }
9512 }
9513
9514 if (chrnum != 0) {
9515 FREE(chrstring);
9516 }
9517
9518 return;
9519 }
9520
9521
9522 static int
print_Ns(Filestring_T fp,int column,int n,int wraplength)9523 print_Ns (Filestring_T fp, int column, int n, int wraplength) {
9524 int i;
9525
9526 for (i = 0; i < n; i++) {
9527 PUTC('N',fp);
9528 if (++column % wraplength == 0) {
9529 PUTC('\n',fp);
9530 column = 0;
9531 }
9532 }
9533
9534 return column;
9535 }
9536
9537
9538 void
Pair_print_mask_introns(Filestring_T fp,struct T * pairs,int npairs,Chrpos_T chrlength,int wraplength,bool include_utr_p)9539 Pair_print_mask_introns (Filestring_T fp, struct T *pairs, int npairs,
9540 Chrpos_T chrlength, int wraplength, bool include_utr_p) {
9541 int exoni = 0, column = 0, i;
9542 bool in_exon = false;
9543 struct T *ptr = pairs, *this = NULL;
9544 Chrpos_T exon_genomestart = 0, exon_genomeend;
9545 Chrpos_T last_genomepos = (Chrpos_T) -1;
9546
9547 assert(pairs != NULL);
9548 if (include_utr_p == true) {
9549 column = print_Ns(fp,column,pairs->genomepos,wraplength);
9550 }
9551
9552 for (i = 0; i < npairs; i++) {
9553 /* prev = this; */
9554 this = ptr++;
9555
9556 if (this->gapp) {
9557 if (in_exon == true) {
9558 /* Beginning of gap */
9559 exon_genomeend = last_genomepos + ONEBASEDP;
9560 in_exon = false;
9561 }
9562 } else if (this->comp == INTRONGAP_COMP) {
9563 /* Do nothing */
9564 } else {
9565 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
9566 SHORTGAP_COMP, or MISMATCH_COMP */
9567 if (in_exon == false) {
9568 exoni++;
9569 if (exoni > 1) {
9570 exon_genomestart = this->genomepos + ONEBASEDP;
9571 column = print_Ns(fp,column,exon_genomestart - exon_genomeend - 1,wraplength);
9572 }
9573
9574 in_exon = true;
9575 }
9576 if (this->genome != ' ') {
9577 PUTC(this->genome,fp);
9578 if (++column % wraplength == 0) {
9579 PUTC('\n',fp);
9580 column = 0;
9581 }
9582 }
9583 }
9584
9585 if (this->genome != ' ') {
9586 last_genomepos = this->genomepos;
9587 }
9588 }
9589
9590 if (include_utr_p == true) {
9591 column = print_Ns(fp,column,chrlength - last_genomepos - 1,wraplength);
9592 }
9593
9594 if (column != 0) {
9595 PUTC('\n',fp);
9596 }
9597
9598 return;
9599 }
9600
9601
9602 #if 0
/* goal_start < goal_end */
/* Disabled (#if 0).  Binary search over pairarray between indices lowi and
   highi for a pair whose genomepos satisfies
   goal_start <= genomepos <= goal_end, with genomepos increasing along the
   array.  On success, stores that pair's querypos in *querypos and returns
   its genomepos.  Returns 0U, leaving *querypos untouched, when the range
   is exhausted or when every candidate probe has cdna == ' '. */
Chrpos_T
Pair_binary_search_ascending (int *querypos, int lowi, int highi, struct T *pairarray,
                              Chrpos_T goal_start, Chrpos_T goal_end) {
  int middlei;

  debug10(printf("entered binary search_ascending with lowi=%d, highi=%d, goal=%u..%u\n",
                 lowi,highi,goal_start,goal_end));

  while (lowi < highi) {
    /* Probe the midpoint, sliding first forward and then backward to find
       a pair that has a query character */
    middlei = lowi + ((highi - lowi) / 2);
    while (middlei < highi && pairarray[middlei].cdna == ' ') {
      /* Go forward past pairs corresponding to gaps */
      middlei++;
    }
    if (middlei >= highi) {
      middlei = lowi + ((highi - lowi) / 2);
      while (middlei >= lowi && pairarray[middlei].cdna == ' ') {
        /* Go backward past pairs corresponding to gaps */
        middlei--;
      }
      if (middlei < lowi) {
        debug10(printf("all intermediate pairs are gaps\n"));
#if 0
        *querypos = pairarray[lowi].querypos;
        return pairarray[lowi].genomepos;
#else
        return 0U;
#endif
      }
    }

    debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u..%u\n",
                   lowi,pairarray[lowi].genomepos,middlei,pairarray[middlei].genomepos,
                   highi,pairarray[highi].genomepos,goal_start,goal_end));
    if (goal_end < pairarray[middlei].genomepos) {
      /* Goal interval lies entirely below the probe */
      highi = middlei;
    } else if (goal_start > pairarray[middlei].genomepos) {
      /* Goal interval lies entirely above the probe */
      lowi = middlei + 1;
    } else {
      debug10(printf("binary search returns %d\n",middlei));
      *querypos = pairarray[middlei].querypos;
      return pairarray[middlei].genomepos;
    }
  }

  debug10(printf("binary search returns %d\n",highi));
  return 0U;
}
9652 #endif
9653
9654 #if 0
/* goal_start > goal_end */
/* Disabled (#if 0).  Counterpart of Pair_binary_search_ascending for
   genomepos decreasing along the array: searches pairarray between indices
   lowi and highi for a pair whose genomepos satisfies
   goal_end <= genomepos <= goal_start.  On success, stores that pair's
   querypos in *querypos and returns its genomepos.  Returns 0U, leaving
   *querypos untouched, when the range is exhausted or when every candidate
   probe has cdna == ' '. */
Chrpos_T
Pair_binary_search_descending (int *querypos, int lowi, int highi, struct T *pairarray,
                               Chrpos_T goal_start, Chrpos_T goal_end) {
  int middlei;

  debug10(printf("entered binary search_descending with lowi=%d, highi=%d, goal=%u..%u\n",
                 lowi,highi,goal_start,goal_end));

  while (lowi < highi) {
    /* Probe the midpoint, sliding first forward and then backward to find
       a pair that has a query character */
    middlei = lowi + ((highi - lowi) / 2);
    while (middlei < highi && pairarray[middlei].cdna == ' ') {
      /* Go forward past pairs corresponding to gaps */
      middlei++;
    }
    if (middlei >= highi) {
      middlei = lowi + ((highi - lowi) / 2);
      while (middlei >= lowi && pairarray[middlei].cdna == ' ') {
        /* Go backward past pairs corresponding to gaps */
        middlei--;
      }
      if (middlei < lowi) {
        debug10(printf("all intermediate pairs are gaps\n"));
#if 0
        *querypos = pairarray[lowi].querypos;
        return pairarray[lowi].genomepos;
#else
        return 0U;
#endif
      }
    }

    debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u..%u\n",
                   lowi,pairarray[lowi].genomepos,middlei,pairarray[middlei].genomepos,
                   highi,pairarray[highi].genomepos,goal_start,goal_end));
    if (goal_end > pairarray[middlei].genomepos) {
      /* Probe's genomepos is below the goal interval, so move toward
         lower indices */
      highi = middlei;
    } else if (goal_start < pairarray[middlei].genomepos) {
      /* Probe's genomepos is above the goal interval, so move toward
         higher indices */
      lowi = middlei + 1;
    } else {
      debug10(printf("binary search returns %d\n",middlei));
      *querypos = pairarray[middlei].querypos;
      return pairarray[middlei].genomepos;
    }
  }

  debug10(printf("binary search returns %d\n",highi));
  return 0U;
}
9704 #endif
9705
9706
#if 0
/* Assumes querypos is in ascending order.  Returns true only if at
   least one entry has the given querypos and every entry with that
   querypos is a plain aligned pair (gapp not set, cdna and genome both
   different from ' ').  Checking every entry for the querypos covers
   the worst case, since a repeated querypos must be due to an indel. */
bool
Pairarray_contains_p (struct T *pairarray, int npairs, int querypos) {
  int i = 0;

  /* Skip entries that precede the requested querypos */
  while (i < npairs && pairarray[i].querypos < querypos) {
    i++;
  }

  if (i >= npairs || pairarray[i].querypos > querypos) {
    /* querypos does not occur in pairarray */
    return false;
  }

  /* Any gap or indel entry at this querypos disqualifies it */
  for ( ; i < npairs && pairarray[i].querypos == querypos; i++) {
    if (pairarray[i].gapp == true ||
        pairarray[i].cdna == ' ' ||
        pairarray[i].genome == ' ') {
      return false;
    }
  }

  return true;
}
#endif
9741
9742
#if 0
/* Linear scan for the first entry with the given querypos that is a
   plain aligned pair (gapp not set, cdna and genome both different
   from ' ').  Returns its genomepos, or 0 if there is no such entry. */
Chrpos_T
Pairarray_lookup (struct T *pairarray, int npairs, int querypos) {
  T pair;
  int i;

  for (i = 0; i < npairs; i++) {
    pair = &(pairarray[i]);
    if (pair->querypos == querypos &&
        pair->gapp != true &&
        pair->cdna != ' ' &&
        pair->genome != ' ') {
      return pair->genomepos;
    }
  }

  return 0;
}
#endif
9769
9770
9771 void
Pairarray_chrpos_bounds(Chrpos_T * chrpos_start,Chrpos_T * chrpos_end,struct T * pairarray,int npairs)9772 Pairarray_chrpos_bounds (Chrpos_T *chrpos_start, Chrpos_T *chrpos_end,
9773 struct T *pairarray, int npairs) {
9774 T start, end;
9775
9776 start = &(pairarray[0]);
9777 end = &(pairarray[npairs-1]);
9778 *chrpos_start = start->genomepos;
9779 *chrpos_end = end->genomepos;
9780
9781 return;
9782 }
9783
9784
9785
9786
9787 Chrpos_T
Pairarray_genomicbound_from_start(struct T * pairarray,int npairs,int overlap)9788 Pairarray_genomicbound_from_start (struct T *pairarray, int npairs, int overlap) {
9789 int i;
9790 struct T pair;
9791
9792 i = 0;
9793 pair = pairarray[i];
9794 while (i < npairs && overlap > 0) {
9795 pair = pairarray[i];
9796 if (pair.cdna != ' ') {
9797 overlap--;
9798 }
9799 i++;
9800 }
9801
9802 return pair.genomepos;
9803 }
9804
9805 Chrpos_T
Pairarray_genomicbound_from_end(struct T * pairarray,int npairs,int overlap)9806 Pairarray_genomicbound_from_end (struct T *pairarray, int npairs, int overlap) {
9807 int i;
9808 struct T pair;
9809
9810 i = npairs-1;
9811 pair = pairarray[i];
9812 while (i >= 0 && overlap > 0) {
9813 pair = pairarray[i];
9814 if (pair.cdna != ' ') {
9815 overlap--;
9816 }
9817 i--;
9818 }
9819
9820 return pair.genomepos;
9821 }
9822
9823
9824 char *
Pairarray_genomic_sequence(int * seqlength,struct T * pairarray,int npairs)9825 Pairarray_genomic_sequence (int *seqlength, struct T *pairarray, int npairs) {
9826 char *genomic, g;
9827 int i, k;
9828
9829 for (i = 0, k = 0; i < npairs; i++) {
9830 if (pairarray[i].gapp == true) {
9831 /* Skip */
9832 } else if (pairarray[i].genome == ' ') {
9833 /* Skip */
9834 } else {
9835 k++;
9836 }
9837 }
9838
9839 genomic = (char *) MALLOC((k+1) * sizeof(char));
9840 for (i = 0, k = 0; i < npairs; i++) {
9841 if (pairarray[i].gapp == true) {
9842 /* Skip. Apparently, pairarray can have gap characters at introns */
9843 } else if ((g = pairarray[i].genome) == ' ') {
9844 /* Skip */
9845 } else {
9846 genomic[k++] = g;
9847 }
9848 }
9849 genomic[k] = '\0';
9850
9851 *seqlength = k;
9852 return genomic;
9853 }
9854
9855
9856
9857 int
Pair_cdna_direction(List_T pairs)9858 Pair_cdna_direction (List_T pairs) {
9859 int cdna_direction = 0;
9860 bool in_intron = false;
9861 T this;
9862 List_T p;
9863
9864 for (p = pairs; p != NULL; p = List_next(p)) {
9865 this = (T) List_head(p);
9866 if (this->gapp) {
9867 if (!in_intron) {
9868 if (this->comp == FWD_CANONICAL_INTRON_COMP) {
9869 cdna_direction += 1;
9870 } else if (this->comp == REV_CANONICAL_INTRON_COMP) {
9871 cdna_direction -= 1;
9872 }
9873 in_intron = true;
9874 }
9875 } else {
9876 if (in_intron) {
9877 in_intron = false;
9878 }
9879 }
9880 }
9881
9882 return cdna_direction;
9883 }
9884
9885
9886 /* Returns first pair that exceeds breakpoint */
9887 T
Pair_start_bound(int * cdna_direction,List_T pairs,int breakpoint)9888 Pair_start_bound (int *cdna_direction, List_T pairs, int breakpoint) {
9889 T start = NULL, this;
9890 bool in_intron = false;
9891 List_T p;
9892
9893 debug9(printf("Entering Pair_start_bound with breakpoint %d\n",breakpoint));
9894
9895 *cdna_direction = 0;
9896
9897 if ((p = pairs) != NULL) {
9898 start = this = (T) p->first;
9899 }
9900
9901 while (p != NULL) {
9902 this = (T) p->first;
9903 debug9(Pair_dump_one(this,true));
9904 debug9(printf("\n"));
9905
9906
9907 if (this->gapp == true) {
9908 /* Skip */
9909 } else if (this->querypos > breakpoint) {
9910 while (p != NULL) {
9911 this = (T) List_head(p);
9912
9913 if (this->gapp) {
9914 debug9(printf("For start bound, saw gap with comp %c\n",this->comp));
9915 if (!in_intron) {
9916 if (this->comp == FWD_CANONICAL_INTRON_COMP) {
9917 *cdna_direction += 1;
9918 } else if (this->comp == REV_CANONICAL_INTRON_COMP) {
9919 *cdna_direction -= 1;
9920 }
9921 in_intron = true;
9922 }
9923 } else {
9924 if (in_intron) {
9925 in_intron = false;
9926 }
9927 }
9928
9929 p = p->rest;
9930 }
9931
9932 if (*cdna_direction > 0) {
9933 *cdna_direction = +1;
9934 } else if (*cdna_direction < 0) {
9935 *cdna_direction = -1;
9936 }
9937 return start;
9938
9939 } else {
9940 start = this;
9941 }
9942
9943 p = p->rest;
9944 }
9945
9946 #if 0
9947 /* Found no gap beyond start */
9948 if (*cdna_direction > 0) {
9949 *cdna_direction = +1;
9950 } else if (*cdna_direction < 0) {
9951 *cdna_direction = -1;
9952 }
9953 #endif
9954
9955 return start;
9956 }
9957
9958
9959 /* Returns last pair that exceeds breakpoint */
9960 T
Pair_end_bound(int * cdna_direction,List_T pairs,int breakpoint)9961 Pair_end_bound (int *cdna_direction, List_T pairs, int breakpoint) {
9962 T end = NULL, this;
9963 bool in_intron = false;
9964 List_T p;
9965
9966 debug9(printf("Entering Pair_end_bound with breakpoint %d\n",breakpoint));
9967
9968 *cdna_direction = 0;
9969
9970 if ((p = pairs) != NULL) {
9971 end = this = (T) p->first;
9972 }
9973
9974 while (p != NULL) {
9975 this = (T) p->first;
9976 debug9(Pair_dump_one(this,true));
9977 debug9(printf("\n"));
9978 if (this->gapp) {
9979 debug9(printf("For end bound, saw gap with comp %c\n",this->comp));
9980 if (!in_intron) {
9981 if (this->comp == FWD_CANONICAL_INTRON_COMP) {
9982 *cdna_direction += 1;
9983 } else if (this->comp == REV_CANONICAL_INTRON_COMP) {
9984 *cdna_direction -= 1;
9985 }
9986 in_intron = true;
9987 }
9988
9989 } else {
9990 if (in_intron) {
9991 in_intron = false;
9992 }
9993
9994 if (this->querypos > breakpoint) {
9995
9996 if (*cdna_direction > 0) {
9997 *cdna_direction = +1;
9998 } else if (*cdna_direction < 0) {
9999 *cdna_direction = -1;
10000 }
10001 return end;
10002
10003 } else {
10004 end = this;
10005 }
10006 }
10007
10008 p = p->rest;
10009 }
10010
10011 if (*cdna_direction > 0) {
10012 *cdna_direction = +1;
10013 } else if (*cdna_direction < 0) {
10014 *cdna_direction = -1;
10015 }
10016 return end;
10017 }
10018
10019
#if 0
/* Previously used for Stage3end_new_gmap */
/* Counts the leading run of pairs with genomepos >= chrbound.  If the
   run ends before the array does, the count is reduced by one for each
   pair at the tail of the run that is a gap or has cdna or genome
   equal to ' ' (trimming bad pairs). */
int
Pair_count_ge_fromstart (struct T *pairs, int npairs, Chrpos_T chrbound) {
  int count = 0, i;

  for (i = 0; i < npairs; i++) {
    if (pairs[i].genomepos < chrbound) {
      /* Trim bad pairs */
      for (i--; i >= 0 && (pairs[i].gapp || pairs[i].cdna == ' ' || pairs[i].genome == ' '); i--) {
        count--;
      }
      return count;
    }
    count++;
  }

  return count;
}
#endif
10042
#if 0
/* Previously used for Stage3end_new_gmap */
/* Counts the trailing run of pairs with genomepos >= chrbound, working
   backward from the last pair.  If the run ends before the start of
   the array, the count is reduced by one for each pair at the inner
   edge of the run that is a gap or has cdna or genome equal to ' '
   (trimming bad pairs). */
int
Pair_count_ge_fromend (struct T *pairs, int npairs, Chrpos_T chrbound) {
  int count = 0, i;

  for (i = npairs - 1; i >= 0; --i) {
    if (pairs[i].genomepos < chrbound) {
      /* Trim bad pairs */
      for (i++; i < npairs && (pairs[i].gapp || pairs[i].cdna == ' ' || pairs[i].genome == ' '); i++) {
        count--;
      }
      return count;
    }
    count++;
  }

  return count;
}
#endif
10065
#if 0
/* Previously used for Stage3end_new_gmap */
/* Counts the leading run of pairs with genomepos < chrbound.  If the
   run ends before the array does, the count is reduced by one for each
   pair at the tail of the run that is a gap or has cdna or genome
   equal to ' '. */
int
Pair_count_lt_fromstart (struct T *pairs, int npairs, Chrpos_T chrbound) {
  int count = 0, i;

  for (i = 0; i < npairs; i++) {
    if (pairs[i].genomepos >= chrbound) {
      for (i--; i >= 0 && (pairs[i].gapp || pairs[i].cdna == ' ' || pairs[i].genome == ' '); i--) {
        count--;
      }
      return count;
    }
    count++;
  }

  return count;
}
#endif
10087
#if 0
/* Previously used for Stage3end_new_gmap */
/* Counts the trailing run of pairs with genomepos < chrbound, working
   backward from the last pair.  If the run ends before the start of
   the array, the count is reduced by one for each pair at the inner
   edge of the run that is a gap or has cdna or genome equal to ' '. */
int
Pair_count_lt_fromend (struct T *pairs, int npairs, Chrpos_T chrbound) {
  int count = 0, i;

  for (i = npairs - 1; i >= 0; --i) {
    if (pairs[i].genomepos >= chrbound) {
      for (i++; i < npairs && (pairs[i].gapp || pairs[i].cdna == ' ' || pairs[i].genome == ' '); i++) {
        count--;
      }
      return count;
    }
    count++;
  }

  return count;
}
#endif
10109
10110
10111
10112 void
Pair_trim_distances(int * trim5,int * trim3,List_T pairs)10113 Pair_trim_distances (int *trim5, int *trim3, List_T pairs) {
10114 int trim_right = 0, trim_left = -1; /* Needs to be -1 to avoid trimming when pairs is NULL */
10115 int bestscore, score, nmismatches = 0;
10116 int pairi;
10117 List_T p;
10118 T this;
10119 bool in_indelp;
10120
10121 debug8(printf("Entered Pair_trim_distances\n"));
10122 if (pairs == NULL) {
10123 *trim5 = *trim3 = 0;
10124 return;
10125 }
10126
10127
10128 /* Find trim_right */
10129 bestscore = 0;
10130 score = 0;
10131 in_indelp = false;
10132 this = (T) NULL;
10133 for (p = pairs, pairi = 0; p != NULL; p = p->rest, pairi++) {
10134 this = p->first;
10135
10136 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
10137 if (in_indelp == false) {
10138 score += trim_indel_score;
10139 if (score < 0) {
10140 score = 0;
10141 }
10142 in_indelp = true;
10143 }
10144
10145 } else {
10146 in_indelp = false;
10147 if (this->gapp) {
10148 /* Don't count */
10149
10150 } else if (this->comp == INTRONGAP_COMP) {
10151 /* Do nothing */
10152
10153 } else if (
10154 /* cdna of N is used commonly in PMAP */
10155 #ifndef PMAP
10156 this->cdna == 'N' ||
10157 #endif
10158 this->comp == MISMATCH_COMP) {
10159 if (nmismatches++ == 0) {
10160 score += TRIM_MISMATCH_SCORE;
10161 } else {
10162 score += TRIM_MISMATCH_SCORE - 1; /* Penalize multiple mismatches */
10163 }
10164 if (score < 0) {
10165 score = 0;
10166 } else if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10167 bestscore = score;
10168 trim_right = pairi;
10169 }
10170
10171 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
10172 score += TRIM_MATCH_SCORE;
10173 if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10174 bestscore = score;
10175 trim_right = pairi;
10176 }
10177
10178 } else {
10179 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
10180 abort();
10181 }
10182 }
10183
10184 debug8(printf("pairi %d, querypos %d, genomepos %u, comp %c: Trim right score %d, trim_right %d, protectedp %d\n",
10185 pairi,this->querypos,this->genomepos,this->comp,score,trim_right,this->protectedp));
10186 }
10187
10188 *trim3 = pairi - 1 - trim_right;
10189 debug8(printf("Final: Trim right pairi %d, score %d, trim3 %d\n",pairi,score,*trim3));
10190
10191
10192 /* Find trim_left */
10193 pairs = List_reverse(pairs);
10194 bestscore = 0;
10195 score = 0;
10196 in_indelp = false;
10197 this = (T) NULL;
10198 for (p = pairs, pairi = 0; p != NULL; p = p->rest, pairi++) {
10199 this = p->first;
10200
10201 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
10202 if (in_indelp == false) {
10203 score += trim_indel_score;
10204 if (score < 0) {
10205 score = 0;
10206 }
10207 in_indelp = true;
10208 }
10209
10210 } else {
10211 in_indelp = false;
10212
10213 if (this->gapp) {
10214 /* Don't count */
10215
10216 } else if (this->comp == INTRONGAP_COMP) {
10217 /* Do nothing */
10218
10219 } else if (
10220 /* cdna of N is used commonly in PMAP */
10221 #ifndef PMAP
10222 this->cdna == 'N' ||
10223 #endif
10224 this->comp == MISMATCH_COMP) {
10225 if (nmismatches++ == 0) {
10226 score += TRIM_MISMATCH_SCORE;
10227 } else {
10228 score += TRIM_MISMATCH_SCORE - 1; /* Penalize multiple mismatches */
10229 }
10230 if (score < 0) {
10231 score = 0;
10232 } else if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10233 bestscore = score;
10234 trim_left = pairi;
10235 }
10236
10237 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
10238 score += TRIM_MATCH_SCORE;
10239 if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10240 bestscore = score;
10241 trim_left = pairi;
10242 }
10243
10244 } else {
10245 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
10246 abort();
10247 }
10248 }
10249
10250 debug8(printf("pairi %d, querypos %d, genomepos %u, comp %c: Trim left score %d, trim_left %d, protectedp %d\n",
10251 pairi,this->querypos,this->genomepos,this->comp,score,trim_left,this->protectedp));
10252 }
10253
10254 *trim5 = pairi - 1 - trim_left;
10255 debug8(printf("Final: Trim left pairi %d, score %d, trim5 %d\n",pairi,score,*trim5));
10256
10257 /* Restore original order */
10258 pairs = List_reverse(pairs);
10259 return;
10260 }
10261
10262
10263 List_T
Pair_trim_ends(bool * trim5p,bool * trim3p,List_T pairs,int ambig_end_length_5,int ambig_end_length_3)10264 Pair_trim_ends (bool *trim5p, bool *trim3p, List_T pairs, int ambig_end_length_5, int ambig_end_length_3) {
10265 List_T trimmed = NULL;
10266 int trim_right = 0, trim_left = -1; /* Needs to be -1 to avoid trimming when pairs is NULL */
10267 int bestscore, score, nmismatches = 0;
10268 int pairi;
10269 List_T p, pairptr;
10270 T this;
10271 int i;
10272 bool in_indelp;
10273
10274 debug8(printf("Entered trim_ends\n"));
10275 if (pairs == NULL) {
10276 *trim5p = *trim3p = false;
10277 return (List_T) NULL;
10278 }
10279
10280
10281 /* Find trim_right */
10282 bestscore = 0;
10283 score = 0;
10284 in_indelp = false;
10285 this = (T) NULL;
10286 for (p = pairs, pairi = 0; p != NULL; p = p->rest, pairi++) {
10287 this = p->first;
10288
10289 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
10290 if (in_indelp == false) {
10291 score += trim_indel_score;
10292 if (score < 0) {
10293 score = 0;
10294 }
10295 in_indelp = true;
10296 }
10297
10298 } else {
10299 in_indelp = false;
10300 if (this->gapp) {
10301 /* Don't count */
10302
10303 } else if (this->comp == INTRONGAP_COMP) {
10304 /* Do nothing */
10305
10306 } else if (
10307 /* cdna of N is used commonly in PMAP */
10308 #ifndef PMAP
10309 this->cdna == 'N' ||
10310 #endif
10311 this->comp == MISMATCH_COMP) {
10312 if (nmismatches++ == 0) {
10313 score += TRIM_MISMATCH_SCORE;
10314 } else {
10315 score += TRIM_MISMATCH_SCORE - 1; /* Penalize multiple mismatches */
10316 }
10317 if (score < 0) {
10318 score = 0;
10319 } else if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10320 bestscore = score;
10321 trim_right = pairi;
10322 }
10323
10324 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
10325 score += TRIM_MATCH_SCORE;
10326 if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10327 bestscore = score;
10328 trim_right = pairi;
10329 }
10330
10331 } else {
10332 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
10333 abort();
10334 }
10335 }
10336
10337 debug8(printf("pairi %d, querypos %d, genomepos %u, comp %c: Trim right score %d, trim_right %d, protectedp %d\n",
10338 pairi,this->querypos,this->genomepos,this->comp,score,trim_right,this->protectedp));
10339 }
10340
10341 if (this == NULL) {
10342 fprintf(stderr,"check for trim_right yields this == NULL\n");
10343 abort();
10344 } else if (ambig_end_length_3 > 0) {
10345 debug8(printf("Not disturbing ambiguous end on right\n"));
10346 trim_right = 0;
10347 } else if (this->protectedp == true) {
10348 debug8(printf("Protected against trim_right\n"));
10349 trim_right = 0;
10350 } else {
10351 trim_right = pairi - 1 - trim_right;
10352 debug8(printf("Final: Trim right pairi %d, score %d, trim_right %d\n",pairi,score,trim_right));
10353 }
10354 debug8(printf("\n"));
10355
10356
10357 /* Find trim_left */
10358 pairs = List_reverse(pairs);
10359 bestscore = 0;
10360 score = 0;
10361 in_indelp = false;
10362 this = (T) NULL;
10363 for (p = pairs, pairi = 0; p != NULL; p = p->rest, pairi++) {
10364 this = p->first;
10365
10366 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
10367 if (in_indelp == false) {
10368 score += trim_indel_score;
10369 if (score < 0) {
10370 score = 0;
10371 }
10372 in_indelp = true;
10373 }
10374
10375 } else {
10376 in_indelp = false;
10377
10378 if (this->gapp) {
10379 /* Don't count */
10380
10381 } else if (this->comp == INTRONGAP_COMP) {
10382 /* Do nothing */
10383
10384 } else if (
10385 /* cdna of N is used commonly in PMAP */
10386 #ifndef PMAP
10387 this->cdna == 'N' ||
10388 #endif
10389 this->comp == MISMATCH_COMP) {
10390 if (nmismatches++ == 0) {
10391 score += TRIM_MISMATCH_SCORE;
10392 } else {
10393 score += TRIM_MISMATCH_SCORE - 1; /* Penalize multiple mismatches */
10394 }
10395 if (score < 0) {
10396 score = 0;
10397 } else if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10398 bestscore = score;
10399 trim_left = pairi;
10400 }
10401
10402 } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
10403 score += TRIM_MATCH_SCORE;
10404 if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10405 bestscore = score;
10406 trim_left = pairi;
10407 }
10408
10409 } else {
10410 fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
10411 abort();
10412 }
10413 }
10414
10415 debug8(printf("pairi %d, querypos %d, genomepos %u, comp %c: Trim left score %d, trim_left %d, protectedp %d\n",
10416 pairi,this->querypos,this->genomepos,this->comp,score,trim_left,this->protectedp));
10417 }
10418
10419 if (this == NULL) {
10420 fprintf(stderr,"check for trim_left yields this == NULL\n");
10421 abort();
10422 } else if (ambig_end_length_5 > 0) {
10423 debug8(printf("Not disturbing ambiguous end on left\n"));
10424 trim_left = pairi - 1;
10425 } else if (this->protectedp == true) {
10426 debug8(printf("Protected against trim_left\n"));
10427 trim_left = pairi - 1;
10428 } else {
10429 debug8(printf("Final: Trim left pairi %d, score %d, trim_left %d\n",pairi,score,trim_left));
10430 }
10431 debug8(printf("\n"));
10432
10433
10434 /* trim */
10435 if (trim_right == 0) {
10436 *trim3p = false;
10437 } else {
10438 *trim3p = true;
10439 }
10440
10441 if (trim_left == 0) {
10442 *trim5p = false;
10443 } else {
10444 *trim5p = true;
10445 }
10446
10447 i = 0;
10448 while (i < trim_right) {
10449 pairs = Pairpool_pop(pairs,&this);
10450 i++;
10451 }
10452
10453 while (i <= trim_left) {
10454 pairptr = pairs;
10455 pairs = Pairpool_pop(pairs,&this);
10456 #ifdef WASTE
10457 path = Pairpool_push_existing(path,pairpool,pair);
10458 #else
10459 trimmed = List_push_existing(trimmed,pairptr);
10460 #endif
10461 i++;
10462 }
10463
10464 debug8(Pair_dump_list(trimmed,/*zerobasedp*/true));
10465
10466 return trimmed;
10467 }
10468
10469
#if 0
/* Subtracts chrlength from the genomepos of every entry whose
   genomepos is greater than chrlength. */
void
Pairarray_unalias (struct T *pairarray, int npairs, Chrpos_T chrlength) {
  struct T *pair;
  int i;

  for (i = 0; i < npairs; i++) {
    pair = &(pairarray[i]);
    if (pair->genomepos > chrlength) {
      pair->genomepos -= chrlength;
    }
  }
  return;
}
#endif
10483
10484
10485 void
Pair_split_circular(List_T * pairs_below,List_T * pairs_above,List_T pairs,Chrpos_T chrlength,Pairpool_T pairpool,bool plusp)10486 Pair_split_circular (List_T *pairs_below, List_T *pairs_above, List_T pairs,
10487 Chrpos_T chrlength, Pairpool_T pairpool, bool plusp) {
10488 List_T below = NULL, above = NULL, *dest, p = pairs;
10489 T pair;
10490
10491 if (plusp == true) {
10492 dest = &below;
10493 while (p != NULL) {
10494 pair = (T) List_head(p);
10495 if (pair->gapp == true) {
10496 /* Skip */
10497 } else if (pair->genomepos >= chrlength) {
10498 dest = &above;
10499 }
10500 *dest = Pairpool_push_existing(*dest,pairpool,pair);
10501 p = List_next(p);
10502 }
10503
10504 /* Unalias pairs above */
10505 for (p = above; p != NULL; p = List_next(p)) {
10506 pair = (T) List_head(p);
10507 pair->genomepos -= chrlength;
10508 }
10509
10510 } else {
10511 dest = &above;
10512 while (p != NULL) {
10513 pair = (T) List_head(p);
10514 if (pair->gapp == true) {
10515 /* Skip */
10516 } else if (pair->genomepos > chrlength) {
10517 dest = &below;
10518 }
10519 *dest = Pairpool_push_existing(*dest,pairpool,pair);
10520 p = List_next(p);
10521 }
10522
10523 /* Unalias pairs above */
10524 for (p = below; p != NULL; p = List_next(p)) {
10525 pair = (T) List_head(p);
10526 pair->genomepos -= chrlength;
10527 }
10528 }
10529
10530 *pairs_below = List_reverse(below);
10531 *pairs_above = List_reverse(above);
10532
10533 return;
10534 }
10535
10536
10537
10538