1 static char rcsid[] = "$Id: pair.c 223009 2020-07-10 15:13:26Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #include "pair.h"
13 #include "pairdef.h"
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>		/* For memcpy */
17 #include <math.h>		/* For rint(), abs() */
18 #include <ctype.h>		/* For toupper */
19 
20 #include "assert.h"
21 #include "except.h"
22 #include "mem.h"
23 #include "comp.h"
24 #include "complement.h"
25 #include "intron.h"
26 #include "intlist.h"
27 #include "separator.h"
28 #include "scores.h"
29 #include "segmentpos.h"
30 #include "maxent.h"
31 #include "maxent_hr.h"
32 #include "sense.h"
33 #include "samflags.h"
34 
35 
36 #define ONEBASEDP 1		/* 1-based coordinates.  Also defined in segmentpos.c */
37 
38 #define MIN_INTRONLEN 20	/* For deciding between N and D in cigar string */
39 
40 
41 /* Check for ANSI mode, which does not include rint */
42 #ifdef __STRICT_ANSI__
43 #define rint(x) floor(0.5+(x))
44 #endif
45 
46 #define DEFAULT_MARGIN 14
47 
48 /* #define DIAGNOSTICP 1 */
49 
50 #ifdef DEBUG
51 #define debug(x) x
52 #else
53 #define debug(x)
54 #endif
55 
56 /* Print pointer information in Pair_dump_one */
57 #ifdef DEBUG1
58 #define debug1(x) x
59 #else
60 #define debug1(x)
61 #endif
62 
63 /* PSL indels */
64 #ifdef DEBUG2
65 #define debug2(x) x
66 #else
67 #define debug2(x)
68 #endif
69 
70 /* Pair_fracidentity_max */
71 #ifdef DEBUG3
72 #define debug3(x) x
73 #else
74 #define debug3(x)
75 #endif
76 
77 /* compute_md_string */
78 #ifdef DEBUG4
79 #define debug4(x) x
80 #else
81 #define debug4(x)
82 #endif
83 
84 /* Phase information */
85 #ifdef DEBUG5
86 #define debug5(x) x
87 #else
88 #define debug5(x)
89 #endif
90 
91 /* Pairarray_convert_to_substrings */
92 #ifdef DEBUG6
93 #define debug6(x) x
94 #else
95 #define debug6(x)
96 #endif
97 
98 /* cds_phase in gff3 output */
99 #ifdef DEBUG7
100 #define debug7(x) x
101 #else
102 #define debug7(x)
103 #endif
104 
105 /* trimming */
106 #ifdef DEBUG8
107 #define debug8(x) x
108 #else
109 #define debug8(x)
110 #endif
111 
112 /* end_bound and start_bound */
113 #ifdef DEBUG9
114 #define debug9(x) x
115 #else
116 #define debug9(x)
117 #endif
118 
119 /* binary search */
120 #ifdef DEBUG10
121 #define debug10(x) x
122 #else
123 #define debug10(x)
124 #endif
125 
126 /* maxnegscore */
127 #ifdef DEBUG11
128 #define debug11(x) x
129 #else
130 #define debug11(x)
131 #endif
132 
133 /* circularpos */
134 #ifdef DEBUG12
135 #define debug12(x) x
136 #else
137 #define debug12(x)
138 #endif
139 
140 
141 #define TRIM_MATCH_SCORE 1
142 #define TRIM_MISMATCH_SCORE -1
143 
144 static bool novelsplicingp;
145 static IIT_T splicesites_iit;
146 
147 static int trim_indel_score;
148 static bool gff3_separators_p;
149 static bool sam_insert_0M_p = false;
150 static bool force_xs_direction_p;
151 static bool md_lowercase_variant_p;
152 static bool snps_p;
153 
154 static bool gff3_phase_swap_p;
155 static CDStype_T cdstype;
156 static bool cigar_extended_p;
157 static Cigar_action_T cigar_action;
158 
159 
160 void
Pair_setup(bool novelsplicingp_in,IIT_T splicesites_iit_in,int trim_indel_score_in,bool gff3_separators_p_in,bool sam_insert_0M_p_in,bool force_xs_direction_p_in,bool md_lowercase_variant_p_in,bool snps_p_in,bool gff3_phase_swap_p_in,CDStype_T cdstype_in,bool cigar_extended_p_in,Cigar_action_T cigar_action_in)161 Pair_setup (bool novelsplicingp_in, IIT_T splicesites_iit_in, int trim_indel_score_in,
162 	    bool gff3_separators_p_in, bool sam_insert_0M_p_in, bool force_xs_direction_p_in,
163 	    bool md_lowercase_variant_p_in, bool snps_p_in,
164 	    bool gff3_phase_swap_p_in, CDStype_T cdstype_in,
165 	    bool cigar_extended_p_in, Cigar_action_T cigar_action_in) {
166 
167   novelsplicingp = novelsplicingp_in;
168   splicesites_iit = splicesites_iit_in;
169 
170   trim_indel_score = trim_indel_score_in;
171   gff3_separators_p = gff3_separators_p_in;
172   sam_insert_0M_p = sam_insert_0M_p_in;
173   force_xs_direction_p = force_xs_direction_p_in;
174   md_lowercase_variant_p = md_lowercase_variant_p_in;
175   snps_p = snps_p_in;
176   gff3_phase_swap_p = gff3_phase_swap_p_in;
177   cdstype = cdstype_in;
178   cigar_extended_p = cigar_extended_p_in;
179   cigar_action = cigar_action_in;
180 
181   return;
182 }
183 
184 
185 
186 #define T Pair_T
187 
188 int
Pair_querypos(T this)189 Pair_querypos (T this) {
190   return this->querypos;
191 }
192 
193 Chrpos_T
Pair_genomepos(T this)194 Pair_genomepos (T this) {
195   return this->genomepos;
196 }
197 
198 char
Pair_cdna(T this)199 Pair_cdna (T this) {
200   return this->cdna;
201 }
202 
203 char
Pair_comp(T this)204 Pair_comp (T this) {
205   return this->comp;
206 }
207 
208 char
Pair_genome(T this)209 Pair_genome (T this) {
210   return this->genome;
211 }
212 
213 char
Pair_genomealt(T this)214 Pair_genomealt (T this) {
215   return this->genomealt;
216 }
217 
218 bool
Pair_gapp(T this)219 Pair_gapp (T this) {
220   return this->gapp;
221 }
222 
223 bool
Pair_shortexonp(T this)224 Pair_shortexonp (T this) {
225   return this->shortexonp;
226 }
227 
228 
229 void
Pair_print_ends(List_T pairs)230 Pair_print_ends (List_T pairs) {
231   List_T p;
232   T start, end;
233 
234   if (pairs == NULL) {
235     printf("0..0, 0..0\n");
236   } else {
237     start = (T) pairs->first;
238     for (p = pairs; p != NULL; p = p->rest) {
239       end = (T) p->first;
240     }
241     printf("%d..%d %u..%u",start->querypos,end->querypos,start->genomepos,end->genomepos);
242   }
243   return;
244 }
245 
246 
247 void
Pair_set_genomepos(struct T * pairarray,int npairs,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)248 Pair_set_genomepos (struct T *pairarray, int npairs,
249 		    Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp) {
250   int i;
251   Chrpos_T chraliaslength;
252 
253   if (watsonp == true) {
254     /* No need to adjust, since we are using chromosomal coordinates already */
255   } else {
256     chraliaslength = chrhigh - chroffset;
257     for (i = 0; i < npairs; i++) {
258       pairarray[i].genomepos = chraliaslength - pairarray[i].genomepos;
259     }
260   }
261   return;
262 }
263 
264 
265 void
Pair_subtract_genomepos(struct T * pairs,int npairs,Chrpos_T adjustment)266 Pair_subtract_genomepos (struct T *pairs, int npairs, Chrpos_T adjustment) {
267   int i;
268   struct T *ptr;
269 
270   i = 0;
271   ptr = pairs;
272   while (i < npairs) {
273     ptr->genomepos -= adjustment;
274     i++;
275     ptr++;
276   }
277 
278   return;
279 }
280 
281 
282 #if 0
283 /* Don't change list, just pairarray */
284 void
285 Pair_set_genomepos_list (List_T pairs, Univcoord_T chroffset,
286 			 Univcoord_T chrhigh, bool watsonp) {
287   List_T p;
288   T pair;
289   Chrpos_T chraliaslength;
290 
291   if (watsonp == true) {
292     /* No need to adjust, since we are using chromosomal coordinates already */
293   } else {
294     chraliaslength = chrhigh - chroffset;
295     for (p = pairs; p != NULL; p = p->rest) {
296       pair = (T) p->first;
297       pair->genomepos = chraliaslength - pair->genomepos;
298     }
299   }
300 
301   return;
302 }
303 #endif
304 
305 
306 /* For outbuffer usage (e.g., truncation), use Pair_clip_bounded_array instead */
307 /* Note: This code is designed to handle source, which may still have
308    gaps with querypos undefined */
309 List_T
Pair_clip_bounded_list_5(List_T source,int minpos,int maxpos)310 Pair_clip_bounded_list_5 (List_T source, int minpos, int maxpos) {
311   List_T dest, *prev, p;
312   T pair;
313   int starti = -1, endi = -1, i;
314 
315   if (source == NULL) {
316     return (List_T) NULL;
317   } else {
318     for (p = source, i = 0; p != NULL; p = p->rest, i++) {
319       pair = (Pair_T) List_head(p);
320       if (pair->querypos == minpos) {
321 	starti = i;		/* Advances in case of ties */
322       } else if (pair->querypos > minpos && starti < 0) {
323 	starti = i;		/* Handles case where minpos was skipped */
324       }
325 
326       if (pair->querypos == maxpos && endi < 0) {
327 	endi = i + 1;		/* Does not advance in case of tie */
328       } else if (pair->querypos > maxpos && endi < 0) {
329 	endi = i;	   /* Handles case where maxpos was skipped */
330       }
331     }
332 
333     if (starti < 0 && endi < 0) {
334       /* None of the pairs fall within bounds */
335       return (List_T) NULL;
336     } else {
337       if (starti < 0) {
338 	starti = 0;
339       }
340       if (endi < 0) {
341 	endi = i;
342       }
343     }
344 
345     p = source;
346     i = 0;
347     while (i < starti) {
348       p = p->rest;
349       i++;
350     }
351 
352     dest = p;
353     prev = &p->rest;
354     while (i < endi) {
355       prev = &p->rest;
356       p = p->rest;
357       i++;
358     }
359 
360     *prev = NULL;		/* Clip rest of list */
361     return dest;
362   }
363 }
364 
365 
366 List_T
Pair_clip_bounded_list_3(List_T source,int minpos,int maxpos)367 Pair_clip_bounded_list_3 (List_T source, int minpos, int maxpos) {
368   List_T dest, *prev, p;
369   T pair;
370   int starti = -1, endi = -1, i;
371 
372   if (source == NULL) {
373     return (List_T) NULL;
374   } else {
375     for (p = source, i = 0; p != NULL; p = p->rest, i++) {
376       pair = (Pair_T) List_head(p);
377       if (pair->querypos == minpos && starti < 0) {
378 	starti = i;		/* Does not advance in case of tie */
379       } else if (pair->querypos > minpos && starti < 0) {
380 	starti = i;		/* Handles case where minpos was skipped */
381       }
382 
383       if (pair->querypos == maxpos) {
384 	endi = i + 1;		/* Advances in case of ties */
385       } else if (pair->querypos > maxpos && endi < 0) {
386 	endi = i;	   /* Handles case where maxpos was skipped */
387       }
388     }
389 
390     if (starti < 0 && endi < 0) {
391       /* None of the pairs fall within bounds */
392       return (List_T) NULL;
393     } else {
394       if (starti < 0) {
395 	starti = 0;
396       }
397       if (endi < 0) {
398 	endi = i;
399       }
400     }
401 
402     p = source;
403     i = 0;
404     while (i < starti) {
405       p = p->rest;
406       i++;
407     }
408 
409     dest = p;
410     prev = &p->rest;
411     while (i < endi) {
412       prev = &p->rest;
413       p = p->rest;
414       i++;
415     }
416 
417     *prev = NULL;		/* Clip rest of list */
418     return dest;
419   }
420 }
421 
422 
423 int
Pair_clip_bounded_array(struct T * source,int npairs,int minpos,int maxpos)424 Pair_clip_bounded_array (struct T *source, int npairs, int minpos, int maxpos) {
425   T pair;
426   int starti = -1, endi = -1, i, k;
427 
428 #if 0
429   printf("Pair_clip_bounded_array called with %d pairs, minpos %d, maxpos %d\n",npairs,minpos,maxpos);
430   Pair_dump_array(source,npairs,true);
431 #endif
432 
433   for (i = 0; i < npairs; i++) {
434     pair = &(source[i]);
435     if (pair->querypos == minpos) {
436       starti = i;		/* Advances in case of ties */
437     } else if (pair->querypos > minpos && starti < 0) {
438       starti = i;		/* Handles case where minpos was skipped */
439     }
440 
441     if (pair->querypos == maxpos && endi < 0) {
442       endi = i + 1;		/* Does not advance in case of tie */
443     } else if (pair->querypos > maxpos && endi < 0) {
444       endi = i;	   /* Handles case where maxpos was skipped */
445     }
446   }
447 
448   if (starti < 0 && endi < 0) {
449     /* None of the pairs fall within bounds.  Don't do anything. */
450     return npairs;
451   } else {
452     if (starti < 0) {
453       starti = 0;
454     }
455     if (endi < 0) {
456       endi = i;
457     }
458   }
459 
460   k = 0;
461   for (i = starti; i < endi; i++) {
462     memcpy((void *) &(source[k++]),(void *) &(source[i]),sizeof(struct T));
463   }
464 
465   return endi - starti;
466 }
467 
468 
469 
470 /* Head of list is the medial part of the read */
471 List_T
Pair_protect_end5(List_T pairs)472 Pair_protect_end5 (List_T pairs) {
473   List_T p;
474   T pair;
475 
476   p = pairs;
477 
478   /* Go until known splice is seen */
479   while (p != NULL && ((T) p->first)->gapp == false) {
480     pair = (T) p->first;
481     pair->protectedp = true;
482     p = p->rest;
483   }
484 
485   /* Handle known splice */
486   if (p != NULL) {
487     pair = (T) p->first;
488     pair->protectedp = true;
489     p = p->rest;
490   }
491 
492   /* Continue until distal indel is seen */
493   while (p != NULL && ((T) p->first)->cdna != ' ' && ((T) p->first)->genome != ' ') {
494     pair = (T) p->first;
495     pair->protectedp = true;
496     p = p->rest;
497   }
498 
499   /* Do not protect the sequence after the distal indel */
500   while (p != NULL) {
501     pair = (T) p->first;
502     pair->protectedp = false;
503     p = p->rest;
504   }
505 
506   return pairs;
507 }
508 
509 
510 /* Head of list is the 3' distal end of the read */
511 List_T
Pair_protect_end3(List_T pairs)512 Pair_protect_end3 (List_T pairs) {
513   List_T p;
514   T pair;
515 
516   p = pairs = List_reverse(pairs); /* Now head is medial end */
517 
518   /* Go until known splice is seen */
519   while (p != NULL && ((T) p->first)->gapp == false) {
520     pair = (T) p->first;
521     pair->protectedp = true;
522     /* result = Pairpool_push_existing(result,pairpool,pair); */
523     p = p->rest;
524   }
525 
526   /* Handle known splice */
527   if (p != NULL) {
528     pair = (T) p->first;
529     pair->protectedp = true;
530     /* result = Pairpool_push_existing(result,pairpool,pair); */
531     p = p->rest;
532   }
533 
534   /* Continue until distal indel is seen */
535   while (p != NULL && ((T) p->first)->cdna != ' ' && ((T) p->first)->genome != ' ') {
536     pair = (T) p->first;
537     pair->protectedp = true;
538     /* result = Pairpool_push_existing(result,pairpool,pair); */
539     p = p->rest;
540   }
541 
542   /* Do not protect the sequence after the distal indel */
543   while (p != NULL) {
544     pair = (T) p->first;
545     pair->protectedp = false;
546     /* result = Pairpool_push_existing(result,pairpool,pair); */
547     p = p->rest;
548   }
549 
550   return List_reverse(pairs);
551 }
552 
553 
554 void
Pair_protect_list(List_T pairs)555 Pair_protect_list (List_T pairs) {
556   List_T p;
557   T pair;
558 
559   for (p = pairs; p != NULL; p = p->rest) {
560     pair = (T) p->first;
561     pair->protectedp = true;
562   }
563 
564   return;
565 }
566 
567 
568 
569 
570 /* Print routines */
571 
572 static char *RULER = "    .    :    .    :    .    :    .    :    .    :";
573 static void
print_top_ruler(Filestring_T fp,int n,int npairs,int margin,int wraplength)574 print_top_ruler (Filestring_T fp, int n, int npairs, int margin, int wraplength) {
575   FPRINTF(fp,"%*d ",margin,n);
576   if (n + wraplength < npairs) {
577     FPRINTF(fp,"%s\n",RULER);
578   } else {
579     FPRINTF(fp,"%.*s\n",npairs-n,RULER);
580   }
581   return;
582 }
583 
584 /*
585 static void
586 print_bottom_ruler (int n, int npairs, int margin, int wraplength) {
587   printf("%*s ",margin,"");
588   if (n + wraplength < npairs) {
589     printf("%s\n",RULER);
590   } else {
591     printf("%.*s\n",npairs-n,RULER);
592   }
593   return;
594 }
595 */
596 
597 
598 static void
print_cdna_sequence(Filestring_T fp,struct T * ptr,int n,int npairs,int margin,int wraplength)599 print_cdna_sequence (Filestring_T fp, struct T *ptr, int n, int npairs, int margin, int wraplength) {
600   struct T *this;
601   int i;
602 
603   this = ptr;
604   FPRINTF(fp,"%*u ",margin,this->querypos + ONEBASEDP);
605   for (i = 0; n < npairs && i < wraplength; n++, i++) {
606     this = ptr++;
607     PUTC(this->cdna,fp);
608   }
609   PUTC('\n',fp);
610   return;
611 }
612 
613 static int
find_aapos_in_line(struct T * ptr,int n,int npairs,int wraplength,bool genomep)614 find_aapos_in_line (struct T *ptr, int n, int npairs, int wraplength,
615 		    bool genomep) {
616   struct T *this, *last;
617 
618   if (npairs - n < wraplength) {
619     last = &ptr[npairs - n - 1];
620   } else {
621     last = &ptr[wraplength - 1];
622   }
623   this = ptr;
624   while (this <= last && (genomep ? this->aa_g : this->aa_e) == ' ') {
625     this++;
626   }
627 
628   if (this > last) {
629     /* No aa found */
630     return -1;
631   } else {
632     return this->aapos;
633   }
634 }
635 
636 
637 static void
print_peptide(Filestring_T fp,struct T * ptr,int n,int npairs,int margin,int wraplength,bool genomep)638 print_peptide (Filestring_T fp, struct T *ptr, int n, int npairs, int margin,
639 	       int wraplength, bool genomep) {
640   struct T *this;
641   int aapos, i;
642 
643   if ((aapos = find_aapos_in_line(ptr,n,npairs,wraplength,genomep)) < 0) {
644     FPRINTF(fp,"%*s ",margin,"");
645   } else {
646     /* 4 is length of "aa.c" and "aa.g" */
647     if (genomep == true) {
648       FPRINTF(fp,"aa.g%*d ",margin-4,aapos);
649     } else {
650       FPRINTF(fp,"aa.c%*d ",margin-4,aapos);
651     }
652   }
653 
654   if (genomep == true) {
655     for (i = 0; n < npairs && i < wraplength; n++, i++) {
656       this = ptr++;
657       PUTC(this->aa_g,fp);
658     }
659   } else {
660     for (i = 0; n < npairs && i < wraplength; n++, i++) {
661       this = ptr++;
662       PUTC(this->aa_e,fp);
663     }
664   }
665 
666   PUTC('\n',fp);
667   return;
668 }
669 
670 static void
print_alignment(Filestring_T fp,struct T * ptr,int n,int npairs,int margin,int wraplength)671 print_alignment (Filestring_T fp, struct T *ptr, int n, int npairs,
672 		 int margin, int wraplength) {
673   struct T *this;
674   int i;
675 
676   FPRINTF(fp,"%*s ",margin,"");
677   for (i = 0; n < npairs && i < wraplength; n++, i++) {
678     this = ptr++;
679 
680 #ifdef DIAGNOSTICP
681     /* Subtract 1 because dynprogindices start at +1 and -1 */
682     if (this->comp == DYNPROG_MATCH_COMP) {
683       if (this->dynprogindex > 0) {
684 	FPRINTF(fp,"%c",(this->dynprogindex-1)%26+'a');
685       } else if (this->dynprogindex < 0) {
686 	FPRINTF(fp,"%c",(-this->dynprogindex-1)%26+'A');
687       } else {
688 	PUTC(DYNPROG_MATCH_COMP,fp);
689       }
690     } else if (this->shortexonp == true) {
691       PUTC(DIAGNOSTIC_SHORTEXON_COMP,fp);
692     } else {
693       PUTC(this->comp,fp);
694     }
695 
696 #else
697     if (this->comp == DYNPROG_MATCH_COMP) {
698       PUTC(MATCH_COMP,fp);
699     } else if (this->comp == AMBIGUOUS_COMP) {
700       /* Previously put AMBIGUOUS_COMP only for PMAP, and MISMATCH_COMP for GMAP */
701       PUTC(AMBIGUOUS_COMP,fp);
702     } else if (this->comp == SHORTGAP_COMP) {
703       PUTC(INDEL_COMP,fp);
704     } else if (this->comp == EXTRAEXON_COMP) {
705       PUTC(INTRONGAP_COMP,fp);
706     } else {
707       PUTC(this->comp,fp);
708     }
709 #endif
710 
711   }
712 
713   PUTC('\n',fp);
714   return;
715 }
716 
717 
718 static void
print_genomic_sequence(Filestring_T fp,struct T * ptr,int n,int npairs,char * chrstring,Univcoord_T chroffset,int margin,int wraplength)719 print_genomic_sequence (Filestring_T fp, struct T *ptr, int n, int npairs,
720 			char *chrstring, Univcoord_T chroffset,
721 			int margin, int wraplength) {
722   struct T *this;
723   int i;
724   char Buffer[100];
725 
726   this = ptr;
727   if (chrstring == NULL) {
728     sprintf(Buffer,"%llu",(unsigned long long) (chroffset+this->genomepos + ONEBASEDP));
729   } else {
730     sprintf(Buffer,"%s:%llu",chrstring,(unsigned long long) (this->genomepos + ONEBASEDP));
731   }
732   FPRINTF(fp,"%*s ",margin,Buffer);
733   for (i = 0; n < npairs && i < wraplength; n++, i++) {
734     this = ptr++;
735     if (this->comp == EXTRAEXON_COMP) {
736       PUTC(INTRONGAP_CHAR,fp);
737     } else {
738       PUTC(this->genome,fp);
739     }
740   }
741   PUTC('\n',fp);
742   return;
743 }
744 
745 static void
print_genomicalt_sequence(Filestring_T fp,struct T * ptr,int n,int npairs,char * chrstring,Univcoord_T chroffset,int margin,int wraplength)746 print_genomicalt_sequence (Filestring_T fp, struct T *ptr, int n, int npairs,
747 			   char *chrstring, Univcoord_T chroffset,
748 			   int margin, int wraplength) {
749   struct T *this;
750   int i;
751   char Buffer[100];
752 
753   this = ptr;
754   if (chrstring == NULL) {
755     sprintf(Buffer,"%llu",(unsigned long long) (chroffset+this->genomepos + ONEBASEDP));
756   } else {
757     sprintf(Buffer,"%s:%llu",chrstring, (unsigned long long) (this->genomepos + ONEBASEDP));
758   }
759   FPRINTF(fp,"%*s ",margin,Buffer);
760   for (i = 0; n < npairs && i < wraplength; n++, i++) {
761     this = ptr++;
762     if (this->comp == EXTRAEXON_COMP) {
763       PUTC(INTRONGAP_CHAR,fp);
764     } else if (this->genomealt == this->genome) {
765       PUTC(' ',fp);
766     } else {
767       PUTC(this->genomealt,fp);
768     }
769   }
770   PUTC('\n',fp);
771   return;
772 }
773 
774 
775 static int
compute_margin(struct T * start,struct T * end,char * chrstring,Univcoord_T chroffset)776 compute_margin (struct T *start, struct T *end, char *chrstring,
777 		Univcoord_T chroffset) {
778   int margin;
779   char Buffer[100];
780 
781   if (chrstring == NULL) {
782     sprintf(Buffer,"%llu",(unsigned long long) (chroffset + start->genomepos + ONEBASEDP));
783   } else {
784     sprintf(Buffer,"%s:%llu",chrstring,(unsigned long long) (start->genomepos + ONEBASEDP));
785   }
786   margin = (int) strlen(Buffer) + 1;
787 
788   if (chrstring == NULL) {
789     sprintf(Buffer,"%llu",(unsigned long long) (chroffset + end->genomepos + ONEBASEDP));
790   } else {
791     sprintf(Buffer,"%s:%llu",chrstring,(unsigned long long) (end->genomepos + ONEBASEDP));
792   }
793   if ((int) strlen(Buffer) + 1 > margin) {
794     margin = (int) strlen(Buffer) + 1;
795   }
796 
797   if (margin < DEFAULT_MARGIN) {
798     margin = DEFAULT_MARGIN;
799   }
800 
801   return margin;
802 }
803 
804 
805 /*
806 static char
807 intron_symbol_rev (char c) {
808   switch (c) {
809   case '>': return '<';
810   case ')': return '(';
811   case ']': return '[';
812   case '<': return '>';
813   case '(': return ')';
814   case '[': return ']';
815   default: return c;
816   }
817 }
818 */
819 
820 static char complCode[128] = COMPLEMENT_LC;
821 
822 static struct T *
invert_path(struct T * old,int npairs)823 invert_path (struct T *old, int npairs) {
824   struct T *new;
825   int i, j;
826 
827   new = (struct T *) MALLOC(npairs*sizeof(struct T));
828   for (i = 0, j = npairs-1; i < npairs; i++, j--) {
829     memcpy(&(new[j]),&(old[i]),sizeof(struct T));
830     new[j].comp = complCode[(int) old[i].comp];
831   }
832   return new;
833 }
834 
835 static struct T *
invert_and_revcomp_path(struct T * old,int npairs)836 invert_and_revcomp_path (struct T *old, int npairs) {
837   struct T *new;
838   int i, j;
839 
840   new = (struct T *) MALLOC(npairs*sizeof(struct T));
841   for (i = 0, j = npairs-1; i < npairs; i++, j--) {
842     memcpy(&(new[j]),&(old[i]),sizeof(struct T));
843     new[j].cdna = complCode[(int) old[i].cdna];
844     new[j].genome = complCode[(int) old[i].genome];
845     new[j].genomealt = complCode[(int) old[i].genomealt];
846     new[j].comp = complCode[(int) old[i].comp];
847   }
848   return new;
849 }
850 
851 
#ifdef GSNAP
/* Like invert_and_revcomp_path, but also mirrors each querypos to
   (querylength - 1) - querypos.  The caller owns the returned array. */
static struct T *
invert_and_revcomp_path_and_coords (struct T *old, int npairs, int querylength) {
  struct T *inverted, *src, *dst;
  int k;

  inverted = (struct T *) MALLOC(npairs*sizeof(struct T));

  src = old;
  for (k = npairs - 1; k >= 0; k--) {
    dst = &(inverted[k]);
    *dst = *src;
    dst->querypos = (querylength - 1) - src->querypos;
    dst->cdna = complCode[(int) src->cdna];
    dst->genome = complCode[(int) src->genome];
    dst->genomealt = complCode[(int) src->genomealt];
    dst->comp = complCode[(int) src->comp];
    src++;
  }
  return inverted;
}
#endif
870 
871 
872 static void
add_intronlengths(struct T * pairs,int npairs)873 add_intronlengths (struct T *pairs, int npairs) {
874   struct T *this = NULL, *ptr;
875   int space, margin, i, j, k, gapstart;
876   char intronstring[20], cdnabreak[20], genomicbreak[20], comp;
877   int last_querypos = -1;
878   Chrpos_T last_genomepos = (Chrpos_T) -1;
879 
880   i = 0;
881   while (i < npairs) {
882     /* prev = this; */
883     this = &(pairs[i++]);
884 
885     if (this->extraexonp == true) {
886       /* Don't add any lengths */
887     } else if (this->gapp) {
888       comp = this->comp;
889       gapstart = i-1;
890       space = 0;
891       while (this->gapp) {
892 	this = &(pairs[i++]);
893 	space++;
894       }
895 
896       if (comp == DUALBREAK_COMP || comp == EXTRAEXON_COMP) {
897 	/* abs() gives a large value when flag -m64 is specified */
898 	/* sprintf(cdnabreak,"%d",abs(this->querypos - last_querypos)-1); */
899 	if (this->querypos > last_querypos) {
900 	  sprintf(cdnabreak,"%d",(this->querypos - last_querypos) - 1);
901 	} else {
902 	  sprintf(cdnabreak,"%d",(last_querypos - this->querypos) - 1);
903 	}
904 	if (this->genomepos < last_genomepos) {
905 	  sprintf(genomicbreak,"%d",last_genomepos - this->genomepos - 1);
906 	} else {
907 	  sprintf(genomicbreak,"%d",this->genomepos - last_genomepos - 1);
908 	}
909 
910 	margin = (space - strlen(cdnabreak))/2;
911 	j = gapstart;
912 	while (margin > 0) {
913 	  ptr = &(pairs[j++]);
914 	  margin--;
915 	}
916 	for (k = 0; k < (int) strlen(cdnabreak); k++) {
917 	  ptr = &(pairs[j++]);
918 	  ptr->cdna = cdnabreak[k];
919 	}
920 
921 	margin = (space - strlen(genomicbreak))/2;
922 	j = gapstart;
923 	while (margin > 0) {
924 	  ptr = &(pairs[j++]);
925 	  margin--;
926 	}
927 	for (k = 0; k < (int) strlen(genomicbreak); k++) {
928 	  ptr = &(pairs[j++]);
929 	  ptr->genome = genomicbreak[k];
930 	  /* ptr->genomealt = ' '; */
931 	}
932 
933       } else {			/* Intron */
934 	if (this->genomepos < last_genomepos) {
935 	  sprintf(intronstring,"%d",last_genomepos - this->genomepos - 1);
936 	} else {
937 	  sprintf(intronstring,"%d",this->genomepos - last_genomepos - 1);
938 	}
939 	margin = (space - strlen(intronstring))/2;
940 	j = gapstart;
941 	while (margin > 0) {
942 	  ptr = &(pairs[j++]);
943 	  margin--;
944 	}
945 	for (k = 0; k < (int) strlen(intronstring); k++) {
946 	  ptr = &(pairs[j++]);
947 	  ptr->cdna = intronstring[k];
948 	}
949       }
950     }
951 
952     if (this->cdna != ' ') {
953       last_querypos = this->querypos;
954     }
955     if (this->genome != ' ') {
956       last_genomepos = this->genomepos;
957     }
958   }
959   return;
960 }
961 
962 
963 /* Needed to recompute translation_length in parts of chimeras */
964 int
Pair_translation_length(struct T * pairs,int npairs)965 Pair_translation_length (struct T *pairs, int npairs) {
966   int translation_length = 0;
967   int i;
968 
969   for (i = 0; i < npairs; i++) {
970     if (pairs[i].aa_e == ' ') {
971     } else if (pairs[i].aa_e == '*') {
972     } else {
973       translation_length++;
974     }
975   }
976   return translation_length;
977 }
978 
979 
980 void
Pair_print_continuous(Filestring_T fp,struct T * pairs,int npairs,bool watsonp,bool genomefirstp,int invertmode,bool nointronlenp)981 Pair_print_continuous (Filestring_T fp, struct T *pairs, int npairs, bool watsonp,
982 		       bool genomefirstp, int invertmode, bool nointronlenp) {
983   T this;
984   struct T *save = NULL, *ptr;
985   int n = 0;
986 
987   if (watsonp == true) {
988     ptr = pairs;
989   } else if (invertmode == 0) {
990     ptr = pairs;
991   } else if (invertmode == 1) {
992     save = ptr = invert_path(pairs,npairs);
993   } else if (invertmode == 2) {
994     save = ptr = invert_and_revcomp_path(pairs,npairs);
995   } else {
996     fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
997     exit(9);
998   }
999   if (nointronlenp == false) {
1000     add_intronlengths(ptr,npairs);
1001   }
1002 
1003   if (genomefirstp == true) {
1004     ptr = pairs;
1005     for (n = 0; n < npairs; n++) {
1006       this = ptr++;
1007       PUTC(this->genome,fp);
1008     }
1009     PUTC('\n',fp);
1010 
1011     ptr = pairs;
1012     for (n = 0; n < npairs; n++) {
1013       this = ptr++;
1014 #ifdef DIAGNOSTICP
1015       PUTC(this->comp,fp);
1016 #else
1017       if (this->comp == MATCH_COMP) {
1018 	PUTC(MATCH_COMP,fp);
1019       } else if (this->comp == DYNPROG_MATCH_COMP) {
1020 	PUTC(MATCH_COMP,fp);
1021       } else if (this->comp == AMBIGUOUS_COMP) {
1022 #ifdef PMAP
1023 	PUTC(AMBIGUOUS_COMP,fp);
1024 #else
1025 	PUTC(MISMATCH_COMP,fp);
1026 #endif
1027       } else {
1028 	PUTC(this->comp,fp);
1029       }
1030 #endif
1031 
1032     }
1033     PUTC('\n',fp);
1034 
1035     ptr = pairs;
1036     for (n = 0; n < npairs; n++) {
1037       this = ptr++;
1038       PUTC(this->cdna,fp);
1039     }
1040     PUTC('\n',fp);
1041 
1042   } else {
1043     ptr = pairs;
1044     for (n = 0; n < npairs; n++) {
1045       this = ptr++;
1046       PUTC(this->cdna,fp);
1047     }
1048     PUTC('\n',fp);
1049 
1050     ptr = pairs;
1051     for (n = 0; n < npairs; n++) {
1052       this = ptr++;
1053 
1054 #ifdef DIAGNOSTICP
1055       PUTC(this->comp,fp);
1056 #else
1057       if (this->comp == MATCH_COMP) {
1058 	PUTC(MATCH_COMP,fp);
1059       } else if (this->comp == DYNPROG_MATCH_COMP) {
1060 	PUTC(MATCH_COMP,fp);
1061       } else if (this->comp == AMBIGUOUS_COMP) {
1062 #ifdef PMAP
1063 	PUTC(AMBIGUOUS_COMP,fp);
1064 #else
1065 	PUTC(MISMATCH_COMP,fp);
1066 #endif
1067       } else {
1068 	PUTC(this->comp,fp);
1069       }
1070 #endif
1071 
1072     }
1073     PUTC('\n',fp);
1074 
1075     ptr = pairs;
1076     for (n = 0; n < npairs; n++) {
1077       this = ptr++;
1078       PUTC(this->genome,fp);
1079     }
1080     PUTC('\n',fp);
1081   }
1082 
1083   if (save != NULL) {
1084     FREE(save);
1085   }
1086   return;
1087 }
1088 
1089 
1090 
1091 void
Pair_print_continuous_byexon(Filestring_T fp,struct T * pairs,int npairs,bool watsonp,int invertmode)1092 Pair_print_continuous_byexon (Filestring_T fp, struct T *pairs, int npairs, bool watsonp, int invertmode) {
1093   T this;
1094   struct T *save = NULL, *ptr;
1095   int i = 0, j;
1096 
1097   if (watsonp == true) {
1098     ptr = pairs;
1099   } else if (invertmode == 0) {
1100     ptr = pairs;
1101   } else if (invertmode == 1) {
1102     save = ptr = invert_path(pairs,npairs);
1103   } else if (invertmode == 2) {
1104     save = ptr = invert_and_revcomp_path(pairs,npairs);
1105   } else {
1106     fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
1107     exit(9);
1108   }
1109 
1110   ptr = pairs;
1111   while (i < npairs) {
1112     j = i;
1113     this = ptr;
1114 
1115     while (j < npairs && this->gapp == false) {
1116       PUTC(this->genome,fp);
1117       this++;
1118       j++;
1119     }
1120     PUTC('\n',fp);
1121 
1122     j = i;
1123     this = ptr;
1124     while (j < npairs && this->gapp == false) {
1125 
1126 #ifdef DIAGNOSTICP
1127       PUTC(this->comp,fp);
1128 
1129 #else
1130       if (this->comp == MATCH_COMP) {
1131 	PUTC(MATCH_COMP,fp);
1132       } else if (this->comp == DYNPROG_MATCH_COMP) {
1133 	PUTC(MATCH_COMP,fp);
1134       } else if (this->comp == AMBIGUOUS_COMP) {
1135 #ifdef PMAP
1136 	PUTC(AMBIGUOUS_COMP,fp);
1137 #else
1138 	PUTC(MISMATCH_COMP,fp);
1139 #endif
1140       } else {
1141 	PUTC(this->comp,fp);
1142       }
1143 #endif
1144 
1145       this++;
1146       j++;
1147     }
1148     PUTC('\n',fp);
1149 
1150     j = i;
1151     this = ptr;
1152     while (j < npairs && this->gapp == false) {
1153       PUTC(this->cdna,fp);
1154       this++;
1155       j++;
1156     }
1157     FPRINTF(fp,"\n\n");
1158 
1159     i = j;
1160     while (i < npairs && this->gapp == true) {
1161       this++;
1162       i++;
1163     }
1164     ptr = this;
1165   }
1166 
1167   if (save != NULL) {
1168     FREE(save);
1169   }
1170   return;
1171 }
1172 
1173 
1174 void
Pair_print_alignment(Filestring_T fp,struct T * pairs,int npairs,Chrnum_T chrnum,Univcoord_T chroffset,Univ_IIT_T chromosome_iit,bool watsonp,int invertmode,bool nointronlenp,int wraplength)1175 Pair_print_alignment (Filestring_T fp, struct T *pairs, int npairs, Chrnum_T chrnum,
1176 		      Univcoord_T chroffset, Univ_IIT_T chromosome_iit, bool watsonp,
1177 		      int invertmode, bool nointronlenp, int wraplength) {
1178   struct T *save = NULL, *ptr;
1179   int n = 0, i;
1180   char *chrstring = NULL;
1181   int margin;
1182 
1183   if (watsonp == true) {
1184     ptr = pairs;
1185 
1186   } else if (invertmode == 0) {
1187     /* Given cDNA sequence, use minus genome strand */
1188     ptr = pairs;
1189 
1190   } else if (invertmode == 1) {
1191     /* Invert cDNA sequence, use minus genome strand */
1192     save = ptr = invert_path(pairs,npairs);
1193 
1194   } else if (invertmode == 2) {
1195     /* Invert cDNA sequence, use plus genome strand */
1196     save = ptr = invert_and_revcomp_path(pairs,npairs);
1197 
1198   } else {
1199     fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
1200     exit(9);
1201   }
1202 
1203   if (nointronlenp == false) {
1204     add_intronlengths(ptr,npairs);
1205   }
1206   if (chrnum != 0) {
1207     if (invertmode == 2) {
1208       chrstring = Chrnum_to_string(chrnum,chromosome_iit);
1209     } else {
1210       chrstring = Chrnum_to_string_signed(chrnum,chromosome_iit,watsonp);
1211     }
1212   }
1213 
1214   margin = compute_margin(&(pairs[0]),&(pairs[npairs-1]),chrstring,chroffset);
1215 
1216   while (n < npairs) {
1217     print_top_ruler(fp,n,npairs,margin,wraplength);
1218     print_peptide(fp,ptr,n,npairs,margin,wraplength,/*genomep*/true);
1219     if (snps_p) {
1220       print_genomicalt_sequence(fp,ptr,n,npairs,chrstring,
1221 				chroffset,margin,wraplength);
1222     }
1223     print_genomic_sequence(fp,ptr,n,npairs,chrstring,
1224 			   chroffset,margin,wraplength);
1225     print_alignment(fp,ptr,n,npairs,margin,wraplength);
1226     print_cdna_sequence(fp,ptr,n,npairs,margin,wraplength);
1227     print_peptide(fp,ptr,n,npairs,margin,wraplength,/*genomep*/false);
1228     PUTC('\n',fp);
1229     for (i = 0; n < npairs && i < wraplength; n++, i++) {
1230       ptr++;
1231     }
1232   }
1233   if (chrstring != NULL) {
1234     FREE(chrstring);
1235   }
1236   if (save != NULL) {
1237     FREE(save);
1238   }
1239   return;
1240 }
1241 
1242 void
Pair_print_pathsummary(Filestring_T fp,int pathnum,T start,T end,Chrnum_T chrnum,Univcoord_T chroffset,Univ_IIT_T chromosome_iit,bool referencealignp,IIT_T altstrain_iit,char * strain,Univ_IIT_T contig_iit,char * dbversion,int querylength_given,int skiplength,int trim_start,int trim_end,int nexons,int matches,int unknowns,int mismatches,int qopens,int qindels,int topens,int tindels,bool watsonp,int cdna_direction,int translation_start,int translation_end,int translation_length,int relaastart,int relaaend)1243 Pair_print_pathsummary (Filestring_T fp, int pathnum, T start, T end, Chrnum_T chrnum,
1244 			Univcoord_T chroffset, Univ_IIT_T chromosome_iit, bool referencealignp,
1245 			IIT_T altstrain_iit, char *strain, Univ_IIT_T contig_iit, char *dbversion,
1246 			int querylength_given, int skiplength, int trim_start, int trim_end,
1247 			int nexons, int matches, int unknowns, int mismatches,
1248 			int qopens, int qindels, int topens, int tindels,
1249 			bool watsonp, int cdna_direction,
1250 			int translation_start, int translation_end, int translation_length,
1251 			int relaastart, int relaaend) {
1252   int querypos1, querypos2, den;
1253   double fracidentity, coverage, trimmed_coverage;
1254   Univcoord_T position1, position2;
1255   Chrpos_T chrpos1, chrpos2;
1256   char *refstrain, *comma1, *comma2, *chr;
1257 
1258   querypos1 = start->querypos;
1259   querypos2 = end->querypos;
1260 
1261   FPRINTF(fp,"  Path %d: ",pathnum);
1262   FPRINTF(fp,"query %d%s%d (%d bp) => ",
1263 	 querypos1 + ONEBASEDP,SEPARATOR,querypos2 + ONEBASEDP,querypos2-querypos1+1);
1264 
1265   chrpos1 = start->genomepos;
1266   chrpos2 = end->genomepos;
1267 
1268   comma1 = Genomicpos_commafmt(chrpos1 + ONEBASEDP);
1269   comma2 = Genomicpos_commafmt(chrpos2 + ONEBASEDP);
1270   if (chrnum == 0) {
1271     if (watsonp) {
1272       FPRINTF(fp,"genome %s%s%s (%d bp)\n",
1273 	     comma1,SEPARATOR,comma2,chrpos2-chrpos1+1);
1274     } else {
1275       FPRINTF(fp,"genome %s%s%s (%d bp)\n",
1276 	     comma1,SEPARATOR,comma2,chrpos2-chrpos1-1);
1277     }
1278   } else {
1279     chr = Chrnum_to_string(chrnum,chromosome_iit);
1280     if (watsonp) {
1281       FPRINTF(fp,"genome %s:%s%s%s (%d bp)\n",chr,comma1,SEPARATOR,comma2,chrpos2-chrpos1+1);
1282     } else {
1283       FPRINTF(fp,"genome %s:%s%s%s (%d bp)\n",chr,comma1,SEPARATOR,comma2,chrpos2-chrpos1-1);
1284     }
1285     FREE(chr);
1286   }
1287   FREE(comma2);
1288   FREE(comma1);
1289 
1290   FPRINTF(fp,"    cDNA direction: ");
1291   if (cdna_direction > 0) {
1292     FPRINTF(fp,"sense\n");
1293   } else if (cdna_direction < 0) {
1294     FPRINTF(fp,"antisense\n");
1295   } else {
1296     FPRINTF(fp,"indeterminate\n");
1297   }
1298 
1299   if (altstrain_iit != NULL) {
1300     if (strain == NULL) {
1301       refstrain = IIT_typestring(altstrain_iit,/*straintype*/0);
1302       if (refstrain[0] == '\0') {
1303 	/* Backward compatibility with old altstrain_iit */
1304 	FPRINTF(fp,"    Strain: reference\n");
1305       } else {
1306 	FPRINTF(fp,"    Strain: %s (reference)\n",refstrain);
1307       }
1308     } else {
1309       FPRINTF(fp,"    Strain: %s\n",strain);
1310     }
1311   }
1312 
1313   position1 = chroffset + chrpos1;
1314   position2 = chroffset + chrpos2;
1315   comma1 = Genomicpos_commafmt(position1 + ONEBASEDP);
1316   comma2 = Genomicpos_commafmt(position2 + ONEBASEDP);
1317   if (dbversion == NULL) {
1318     FPRINTF(fp,"    Genomic pos: %s%s%s",comma1,SEPARATOR,comma2);
1319   } else {
1320     FPRINTF(fp,"    Genomic pos: %s:%s%s%s",dbversion,comma1,SEPARATOR,comma2);
1321   }
1322   if (chrpos1 <= chrpos2) {
1323     FPRINTF(fp," (+ strand)\n");
1324   } else {
1325     FPRINTF(fp," (- strand)\n");
1326   }
1327   FREE(comma2);
1328   FREE(comma1);
1329 
1330   if (contig_iit != NULL) {
1331     if (position1 <= position2) {
1332       Segmentpos_print_accessions(fp,contig_iit,position1,position2,referencealignp,strain);
1333     } else {
1334       Segmentpos_print_accessions(fp,contig_iit,position2,position1,referencealignp,strain);
1335     }
1336   }
1337 
1338   FPRINTF(fp,"    Number of exons: %d\n",nexons);
1339 
1340 #ifdef PMAP
1341   coverage = (double) (querypos2 - querypos1 + 1)/(double) (3*(querylength_given + skiplength));
1342   /* coverage = (double) (matches + mismatches + qindels)/(double) (3*(querylength_given + skiplength)); */
1343 
1344   /* Can have coverage greater than given querylength because of added '*' at end */
1345   if (coverage > 1.0) {
1346     coverage = 1.0;
1347   }
1348 #else
1349   /* coverage = (double) (matches + mismatches + qindels)/(double) (querylength_given + skiplength); */
1350   coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given + skiplength);
1351 #endif
1352   FPRINTF(fp,"    Coverage: %.1f",((double) rint(1000.0*coverage))/10.0);
1353 #ifdef PMAP
1354   FPRINTF(fp," (query length: %d aa)\n",querylength_given);
1355 #else
1356   FPRINTF(fp," (query length: %d bp)\n",querylength_given);
1357   if (querypos2 + 1 > trim_end) {
1358     trim_end = querypos2 + 1;
1359   }
1360   if (querypos1 < trim_start) {
1361     trim_start = querypos1;
1362   }
1363 
1364   trimmed_coverage = (double) (querypos2 - querypos1 + 1)/(double) (trim_end - trim_start + skiplength);
1365   FPRINTF(fp,"    Trimmed coverage: %.1f",((double) rint(1000.0*trimmed_coverage))/10.0);
1366   FPRINTF(fp," (trimmed length: %d bp, trimmed region: %d..%d)",
1367 	  trim_end-trim_start,trim_start+ONEBASEDP,trim_end-1+ONEBASEDP);
1368   PUTC('\n',fp);
1369 #endif
1370 
1371   if ((den = matches + mismatches + qindels + tindels) == 0) {
1372     fracidentity = 1.0;
1373   } else {
1374     fracidentity = (double) matches/(double) den;
1375   }
1376 
1377   /* The definition of indels here should be consistent with Stage3_indels */
1378   FPRINTF(fp,"    Percent identity: %.1f (%d matches, %d mismatches, %d indels, %d unknowns)\n",
1379 	  ((double) rint(1000.0*fracidentity))/10.0,matches,mismatches,qindels+tindels,unknowns);
1380   if (qindels + tindels > 0) {
1381     FPRINTF(fp,"    Non-intron gaps: %d openings, %d bases in cdna; %d openings, %d bases in genome\n",
1382 	    qopens,qindels,topens,tindels);
1383   }
1384 
1385 #ifndef PMAP
1386   if (translation_length > 0) {
1387     if (cdna_direction >= 0) {
1388       FPRINTF(fp,"    Translation: %d..%d (%d aa)\n",
1389 	      translation_start+ONEBASEDP,translation_end+ONEBASEDP,translation_length);
1390     } else {
1391       FPRINTF(fp,"    Translation: %d..%d (%d aa)\n",
1392 	      translation_end+ONEBASEDP,translation_start+ONEBASEDP,translation_length);
1393     }
1394   } else if (relaastart > 0) {
1395     if (relaastart < relaaend) {
1396       FPRINTF(fp,"    Protein coords: %d..%d\n",relaastart,relaaend);
1397     } else {
1398       FPRINTF(fp,"    Protein coords: %d..%d\n",relaaend,relaastart);
1399     }
1400   }
1401 #endif
1402 
1403   /* FPRINTF(fp,"    Defect rate (percent): %.1f\n",defect_rate*100.0); */
1404 
1405   /* PUTC('\n',fp); -- Done by caller */
1406 
1407   return;
1408 }
1409 
1410 
1411 void
Pair_print_coordinates(Filestring_T fp,struct T * pairs,int npairs,Chrnum_T chrnum,Univcoord_T chroffset,Univ_IIT_T chromosome_iit,bool watsonp,int invertmode)1412 Pair_print_coordinates (Filestring_T fp, struct T *pairs, int npairs, Chrnum_T chrnum,
1413 			Univcoord_T chroffset, Univ_IIT_T chromosome_iit,
1414 			bool watsonp, int invertmode) {
1415   T this;
1416   struct T *save = NULL;
1417   int i;
1418   char *chrstring = NULL;
1419 
1420   Pair_check_array_pairs(pairs,npairs);
1421 
1422   if (watsonp == true) {
1423     /* ptr = pairs; */
1424 
1425   } else if (invertmode == 0) {
1426     /* Given cDNA sequence, use minus genome strand */
1427     /* ptr = pairs; */
1428 
1429   } else if (invertmode == 1) {
1430     /* Invert cDNA sequence, use minus genome strand */
1431     save = invert_path(pairs,npairs);
1432 
1433   } else if (invertmode == 2) {
1434     /* Invert cDNA sequence, use plus genome strand */
1435     save = invert_and_revcomp_path(pairs,npairs);
1436 
1437   } else {
1438     fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
1439     exit(9);
1440   }
1441 
1442   if (chrnum != 0) {
1443     if (invertmode == 2) {
1444       chrstring = Chrnum_to_string(chrnum,chromosome_iit);
1445     } else {
1446       chrstring = Chrnum_to_string_signed(chrnum,chromosome_iit,watsonp);
1447     }
1448   }
1449 
1450   for (i = 0; i < npairs; i++) {
1451     this = pairs++;
1452     if (this->gapp == false) {
1453 #ifdef DEBUG5
1454       FPRINTF(fp,"%d %d %c\t",this->aapos,this->aaphase_e,this->aa_e);
1455 #else
1456       if (this->aaphase_e != 0) {
1457 	FPRINTF(fp,"%d\t",this->aapos);
1458       } else {
1459 	FPRINTF(fp,"%d %c\t",this->aapos,this->aa_e);
1460       }
1461 #endif
1462       FPRINTF(fp,"%d %c\t",this->querypos + ONEBASEDP,this->cdna);
1463       if (chrstring == NULL) {
1464 	FPRINTF(fp,"%u %u %c",this->genomepos + ONEBASEDP,
1465 		chroffset + this->genomepos + ONEBASEDP,
1466 		this->genome);
1467       } else {
1468 	FPRINTF(fp,"%s:%u %u %c",chrstring,
1469 		this->genomepos + ONEBASEDP,
1470 		chroffset + this->genomepos + ONEBASEDP,
1471 		this->genome);
1472       }
1473       if (this->genomealt != this->genome) {
1474 	FPRINTF(fp," %c",this->genomealt);
1475       }
1476 
1477 #ifdef DEBUG5
1478       FPRINTF(fp,"\t%d %c",this->aaphase_g,this->aa_g);
1479 #else
1480       if (this->aaphase_g != 0) {
1481 	FPRINTF(fp,"\t");
1482       } else {
1483 	FPRINTF(fp,"\t%c",this->aa_g);
1484       }
1485 #endif
1486       PUTC('\n',fp);
1487     }
1488   }
1489 
1490   if (chrstring != NULL) {
1491     FREE(chrstring);
1492   }
1493   if (save != NULL) {
1494     FREE(save);
1495   }
1496   return;
1497 }
1498 
1499 
1500 int
Pair_cmp(const void * a,const void * b)1501 Pair_cmp (const void *a, const void *b) {
1502   T x = * (T *) a;
1503   T y = * (T *) b;
1504 
1505   if (x->querypos < y->querypos) {
1506     return -1;
1507   } else if (y->querypos < x->querypos) {
1508     return +1;
1509   } else if (x->genomepos < y->genomepos) {
1510     return -1;
1511   } else if (y->genomepos < x->genomepos) {
1512     return +1;
1513   } else {
1514     return 0;
1515   }
1516 }
1517 
1518 
1519 void
Pair_dump_one(T this,bool zerobasedp)1520 Pair_dump_one (T this, bool zerobasedp) {
1521 
1522   debug1(printf("%p ",this));
1523 
1524   if (this->gapp == true && this->extraexonp == false) {
1525     printf("*** Gap: queryjump = %d, genomejump = %d, type: ",this->queryjump,this->genomejump);
1526     switch (this->comp) {
1527     case FWD_CANONICAL_INTRON_COMP: printf("> GT-AG"); break;
1528     case FWD_GCAG_INTRON_COMP: printf(") GC-AG"); break;
1529     case FWD_ATAC_INTRON_COMP: printf("] AT-AC"); break;
1530     case REV_ATAC_INTRON_COMP: printf("[ AT-AC"); break;
1531     case REV_GCAG_INTRON_COMP: printf("( GC-AG"); break;
1532     case REV_CANONICAL_INTRON_COMP: printf("< GT-AG"); break;
1533     case SHORTGAP_COMP: printf("~ shortgap"); break;
1534     case NONINTRON_COMP: printf("= nonintron"); break;
1535     default: printf("? unknown"); break;
1536     }
1537 
1538     if (this->knowngapp == true) {
1539       printf(" known");
1540     }
1541 
1542     printf(" donor:%f acceptor:%f",this->donor_prob,this->acceptor_prob);
1543     printf(" ***");
1544 
1545   } else {
1546     printf("%d %d %c ",
1547 	   this->querypos + !zerobasedp,this->genomepos + !zerobasedp,this->cdna);
1548 
1549     /* Subtract 1 because dynprogindices start at +1 and -1 */
1550     if (this->dynprogindex > 0) {
1551       printf("%c%c",this->comp,(this->dynprogindex-1)%26+'a');
1552     } else if (this->dynprogindex < 0) {
1553       printf("%c%c",this->comp,(-this->dynprogindex-1)%26+'A');
1554     } else {
1555       putchar(this->comp);
1556     }
1557     printf(" %c",this->genome);
1558     if (this->genomealt != this->genome) {
1559       printf(" alt:%c",this->genomealt);
1560     }
1561   }
1562 
1563   if (this->protectedp == true) {
1564     printf(" protected");
1565   }
1566 
1567   if (this->disallowedp == true) {
1568     printf(" disallowed");
1569   }
1570 
1571   if (this->shortexonp == true) {
1572     printf(" shortexon");
1573   }
1574 
1575   if (this->gapp == true) {
1576     printf(" gap");
1577   }
1578 
1579 #if 0
1580   if (this->state == BAD) {
1581     printf(" bad");
1582   }
1583 #endif
1584 
1585   return;
1586 }
1587 
1588 
1589 /* Useful for debugging */
1590 void
Pair_dump_list(List_T pairs,bool zerobasedp)1591 Pair_dump_list (List_T pairs, bool zerobasedp) {
1592   T this, prev = NULL, old = NULL;
1593   List_T p;
1594 
1595   printf("***Start of list***\n");
1596   for (p = pairs; p != NULL; p = List_next(p)) {
1597     this = List_head(p);
1598     Pair_dump_one(this,zerobasedp);
1599     printf("\n");
1600 
1601     if (this->querypos != -1) {
1602       if (old != NULL) {
1603 	if (old->querypos > prev->querypos) {
1604 	  if (prev->querypos < this->querypos) {
1605 	    fprintf(stderr,"%d %d %d\n",old->querypos,prev->querypos,this->querypos);
1606 	    abort();
1607 	  }
1608 	} else if (old->querypos < prev->querypos) {
1609 	  if (prev->querypos > this->querypos) {
1610 	    fprintf(stderr,"%d %d %d\n",old->querypos,prev->querypos,this->querypos);
1611 	    abort();
1612 	  }
1613 	}
1614       }
1615 
1616       old = prev;
1617       prev = this;
1618     }
1619 
1620   }
1621   printf("***End of list***\n");
1622   return;
1623 }
1624 
1625 void
Pair_dump_array(struct T * pairs,int npairs,bool zerobasedp)1626 Pair_dump_array (struct T *pairs, int npairs, bool zerobasedp) {
1627   struct T *this;
1628   int i;
1629 
1630   for (i = 0; i < npairs; i++) {
1631     this = pairs++;
1632     printf("%d: %d %d %d %c ",
1633 	   i,this->querypos + !zerobasedp,this->genomepos + !zerobasedp,this->aapos,
1634 	   this->cdna);
1635 
1636     /* Subtract 1 because dynprogindices start at +1 and -1 */
1637     if (this->dynprogindex > 0) {
1638       printf("%c%c",this->comp,(this->dynprogindex-1)%26+'a');
1639     } else if (this->dynprogindex < 0) {
1640       printf("%c%c",this->comp,(-this->dynprogindex-1)%26+'A');
1641     } else {
1642       putchar(this->comp);
1643     }
1644     printf(" %c",this->genome);
1645     if (this->genomealt != this->genome) {
1646       printf(" alt:%c",this->genomealt);
1647     }
1648 
1649     debug7(printf(" aaphase_g:%d aaphase_e:%d",this->aaphase_g,this->aaphase_e));
1650 
1651     if (this->aaphase_g == 0 || this->aaphase_e == 0) {
1652       printf(" => %c %c",this->aa_g,this->aa_e);
1653     }
1654 
1655     if (this->gapp) {
1656       printf(" gap");
1657     }
1658 
1659     printf("\n");
1660   }
1661   return;
1662 }
1663 
1664 
1665 void
Pair_dump_array_stderr(struct T * pairs,int npairs,bool zerobasedp)1666 Pair_dump_array_stderr (struct T *pairs, int npairs, bool zerobasedp) {
1667   struct T *this;
1668   int i;
1669 
1670   for (i = 0; i < npairs; i++) {
1671     this = pairs++;
1672     fprintf(stderr,"%d: %d %d %d %c ",
1673 	    i,this->querypos + !zerobasedp,this->genomepos + !zerobasedp,this->aapos,
1674 	    this->cdna);
1675 
1676     /* Subtract 1 because dynprogindices start at +1 and -1 */
1677     if (this->dynprogindex > 0) {
1678       fprintf(stderr,"%c%c",this->comp,(this->dynprogindex-1)%26+'a');
1679     } else if (this->dynprogindex < 0) {
1680       fprintf(stderr,"%c%c",this->comp,(-this->dynprogindex-1)%26+'A');
1681     } else {
1682       putc(this->comp,stderr);
1683     }
1684     fprintf(stderr," %c",this->genome);
1685     if (this->genomealt != this->genome) {
1686       fprintf(stderr," alt:%c",this->genomealt);
1687     }
1688 
1689     if (this->aaphase_g == 0 || this->aaphase_e == 0) {
1690       fprintf(stderr," => %c %c",this->aa_g,this->aa_e);
1691     }
1692     fprintf(stderr,"\n");
1693   }
1694   return;
1695 }
1696 
1697 
1698 void
Pair_dump_genome_array(struct T * pairs,int npairs)1699 Pair_dump_genome_array (struct T *pairs, int npairs) {
1700   struct T *this;
1701   int i;
1702 
1703   for (i = 0; i < npairs; i++) {
1704     this = pairs++;
1705     printf("%c",this->genome);
1706   }
1707   printf("\n");
1708 
1709   return;
1710 }
1711 
1712 void
Pair_dump_comp_array(struct T * pairs,int npairs)1713 Pair_dump_comp_array (struct T *pairs, int npairs) {
1714   struct T *this;
1715   int i;
1716 
1717   for (i = 0; i < npairs; i++) {
1718     this = pairs++;
1719     printf("%c",this->comp);
1720   }
1721   printf("\n");
1722 
1723   return;
1724 }
1725 
1726 
1727 Chrpos_T
Pair_genomicpos(struct T * pairs,int npairs,int querypos,bool headp)1728 Pair_genomicpos (struct T *pairs, int npairs, int querypos, bool headp) {
1729   struct T *this;
1730   int i;
1731 
1732   if (headp == true) {
1733     for (i = 0; i < npairs; i++) {
1734       this = pairs++;
1735       if (this->querypos == querypos) {
1736 	return this->genomepos;
1737       } else if (this->querypos > querypos) {
1738 	return 0;
1739       }
1740     }
1741   } else {
1742     pairs += npairs;
1743     for (i = npairs-1; i >= 0; --i) {
1744       this = --pairs;
1745       if (this->querypos == querypos) {
1746 	return this->genomepos;
1747       } else if (this->querypos < querypos) {
1748 	return 0;
1749       }
1750     }
1751   }
1752   return 0;
1753 }
1754 
1755 int
Pair_codon_changepos(struct T * pairs,int npairs,int aapos,int cdna_direction)1756 Pair_codon_changepos (struct T *pairs, int npairs, int aapos, int cdna_direction) {
1757   struct T *this, *start, *end;
1758   int changepos = 0, i, ngenome = 0, ncdna = 0;
1759 
1760   i = 0;
1761   this = pairs;
1762   while (i < npairs && this->aapos != aapos) {
1763     this++;
1764     i++;
1765   }
1766   start = this;
1767 
1768   while (i < npairs && (ngenome < 3 || ncdna < 3)) {
1769     if (this->gapp == false) {
1770       if (this->genome != ' ') {
1771 	ngenome++;
1772       }
1773       if (this->cdna != ' ') {
1774 	ncdna++;
1775       }
1776     }
1777     this++;
1778     i++;
1779   }
1780   end = --this;
1781 
1782   if (cdna_direction < 0) {
1783     for (this = end; this >= start; --this) {
1784       if (this->gapp == true) {
1785       } else if (this->genome == ' ') {
1786       } else if (this->cdna == ' ') {
1787       } else if (this->genome != this->cdna) {
1788 	return changepos;
1789       } else {
1790 	changepos++;
1791       }
1792     }
1793   } else {
1794     for (this = start; this <= end; this++) {
1795       if (this->gapp == true) {
1796       } else if (this->genome == ' ') {
1797       } else if (this->cdna == ' ') {
1798       } else if (this->genome != this->cdna) {
1799 	return changepos;
1800       } else {
1801 	changepos++;
1802       }
1803     }
1804   }
1805 
1806   return changepos;
1807 }
1808 
1809 
#if 0
/* Disabled code (excluded from compilation by the enclosing #if 0). */
/* Returns true if the two lists have the same length and their
   corresponding pairs agree in gapp, querypos, genomepos, and comp;
   returns false otherwise. */
bool
Pair_identical_p (List_T pairs1, List_T pairs2) {
  List_T p, q;
  T pair1, pair2;

  p = pairs1;
  q = pairs2;
  /* Walk both lists in step until either one ends */
  while (p && q) {
    pair1 = (T) List_head(p);
    pair2 = (T) List_head(q);
    if (pair1->gapp != pair2->gapp) {
      return false;
    } else if (pair1->querypos != pair2->querypos) {
      return false;
    } else if (pair1->genomepos != pair2->genomepos) {
      return false;
    } else if (pair1->comp != pair2->comp) {
      return false;
    }
    p = List_next(p);
    q = List_next(q);
  }

  /* A leftover element in either list means that the lengths differ */
  if (p || q) {
    return false;
  } else {
    return true;
  }
}
#endif
1841 
1842 
1843 void
Pair_check_list_pairs(List_T pairs)1844 Pair_check_list_pairs (List_T pairs) {
1845   T this;
1846   List_T p;
1847   int prev_querypos;
1848 
1849   if (pairs == NULL) {
1850     return;
1851   } else {
1852     this = List_head(pairs);
1853     prev_querypos = this->querypos;
1854     /* prev_genomepos = this->genomepos; */
1855 
1856     for (p = List_next(pairs); p != NULL; p = List_next(p)) {
1857       this = List_head(p);
1858       if (this->gapp == false) {
1859 	if (this->querypos < prev_querypos) {
1860 	  printf("Problem at querypos %d < prev querypos %d\n",this->querypos,prev_querypos);
1861 	  abort();
1862 	}
1863 #if 0
1864 	/* No longer a valid check after genomepos converted to chrpos */
1865 	if (this->genomepos < prev_genomepos) {
1866 	  printf("Problem at genomepos %d\n",this->genomepos);
1867 	}
1868 #endif
1869 	prev_querypos = this->querypos;
1870 	/* prev_genomepos = this->genomepos; */
1871       }
1872     }
1873   }
1874   return;
1875 }
1876 
1877 void
Pair_check_list_path(List_T path)1878 Pair_check_list_path (List_T path) {
1879   T this;
1880   List_T p;
1881   int prev_querypos;
1882 
1883   if (path == NULL) {
1884     return;
1885   } else {
1886     this = List_head(path);
1887     prev_querypos = this->querypos;
1888     /* prev_genomepos = this->genomepos; */
1889 
1890     for (p = List_next(path); p != NULL; p = List_next(p)) {
1891       this = List_head(p);
1892       if (this->gapp == false) {
1893 	if (this->querypos > prev_querypos) {
1894 	  printf("Problem at querypos %d > prev querypos %d\n",this->querypos,prev_querypos);
1895 	  abort();
1896 	}
1897 #if 0
1898 	/* No longer a valid check after genomepos converted to chrpos */
1899 	if (this->genomepos > prev_genomepos) {
1900 	  printf("Problem at genomepos %d\n",this->genomepos);
1901 	}
1902 #endif
1903 	prev_querypos = this->querypos;
1904 	/* prev_genomepos = this->genomepos; */
1905       }
1906     }
1907   }
1908   return;
1909 }
1910 
1911 
1912 bool
Pair_check_array_pairs(struct T * pairs,int npairs)1913 Pair_check_array_pairs (struct T *pairs, int npairs) {
1914   bool result = false;
1915   struct T *this;
1916   int prev_querypos;
1917   int i;
1918 
1919   if (npairs == 0) {
1920     return false;
1921   } else {
1922     this = pairs++;
1923     prev_querypos = this->querypos;
1924     /* prev_genomepos = this->genomepos; */
1925 
1926     for (i = 1; i < npairs; i++) {
1927       this = pairs++;
1928       if (this->querypos < prev_querypos) {
1929 	printf("Problem at querypos %d < prev querypos %d\n",this->querypos,prev_querypos);
1930 	abort();
1931 	result = true;
1932       } else if (this->querypos - prev_querypos > 1) {
1933 	/* Could be the result of a dual break */
1934 	fprintf(stderr,"Jump at querypos %d\n",this->querypos);
1935 	result = false;
1936       }
1937 #if 0
1938       /* No longer a valid check after genomepos converted to chrpos */
1939       if (this->genomepos < prev_genomepos) {
1940 	fprintf(stderr,"Problem at genomepos %d\n",this->genomepos);
1941 	result = true;
1942       }
1943 #endif
1944       prev_querypos = this->querypos;
1945       /* prev_genomepos = this->genomepos; */
1946     }
1947   }
1948   return result;
1949 }
1950 
1951 
1952 bool
Pair_check_array_path(struct T * path,int npairs)1953 Pair_check_array_path (struct T *path, int npairs) {
1954   bool result = false;
1955   struct T *this;
1956   int prev_querypos;
1957   int i;
1958 
1959   if (npairs == 0) {
1960     return false;
1961   } else {
1962     this = path++;
1963     prev_querypos = this->querypos;
1964     /* prev_genomepos = this->genomepos; */
1965 
1966     for (i = 1; i < npairs; i++) {
1967       this = path++;
1968       if (this->querypos > prev_querypos) {
1969 	printf("Problem at querypos %d > prev querypos %d\n",this->querypos,prev_querypos);
1970 	abort();
1971 	result = true;
1972       } else if (this->querypos - prev_querypos > 1) {
1973 	/* Could be the result of a dual break */
1974 	fprintf(stderr,"Jump at querypos %d\n",this->querypos);
1975 	result = false;
1976       }
1977 #if 0
1978       /* No longer a valid check after genomepos converted to chrpos */
1979       if (this->genomepos < prev_genomepos) {
1980 	fprintf(stderr,"Problem at genomepos %d\n",this->genomepos);
1981 	result = true;
1982       }
1983 #endif
1984       prev_querypos = this->querypos;
1985       /* prev_genomepos = this->genomepos; */
1986     }
1987   }
1988   return result;
1989 }
1990 
1991 
#if 0
/* Disabled code (excluded from compilation by the enclosing #if 0). */
/* Modeled after Pair_convert_array_to_pairs */
/* Pushes each non-gap element of pairarray onto pairs with Pairpool_push
   and returns the resulting list.  If plusp is true, genomepos is passed
   unchanged; otherwise chrlength - genomepos is passed.  The dynprogindex
   argument is always 0. */
List_T
Pair_convert_array_to_pairs (List_T pairs, struct T *pairarray, int npairs, bool plusp,
			     Chrpos_T chrlength, Pairpool_T pairpool) {
  T pair;
  int i;

  if (plusp == true) {
    for (i = 0; i < npairs; i++) {
      pair = &(pairarray[i]);
      if (pair->gapp) {
	/* Skip */
      } else {
	pairs = Pairpool_push(pairs,pairpool,pair->querypos /*+ queryseq_offset*/,pair->genomepos,
			      pair->cdna,pair->comp,pair->genome,pair->genomealt,/*dynprogindex*/0);
      }
    }

  } else {
    for (i = 0; i < npairs; i++) {
      pair = &(pairarray[i]);
      if (pair->gapp) {
	/* Skip */
      } else {
	/* Same as the plusp case, except for the genomic coordinate */
	pairs = Pairpool_push(pairs,pairpool,pair->querypos /*+ queryseq_offset*/,chrlength - pair->genomepos,
			      pair->cdna,pair->comp,pair->genome,pair->genomealt,/*dynprogindex*/0);
      }
    }
  }


  return pairs;
}
#endif
2027 
2028 
#if 0
/* Disabled code (excluded from compilation by the enclosing #if 0). */
/* Called by output thread for --merge-overlap feature.  Modeled after Substring_convert_to_pairs. */
/* Pushes a new pair, created by Pair_new_out, onto pairs for each element
   of pairarray whose querypos lies in [querystart, queryend), and returns
   the resulting list.  queryseq_offset is added to each querypos. */
List_T
Pair_convert_array_to_pairs_out (List_T pairs, struct T *pairarray, int npairs, bool plusp, int querylength,
				 int hardclip_low, int hardclip_high, int queryseq_offset) {
  T pair;
  int querystart, queryend, i;

  /* The roles of the two hardclip values are swapped when plusp is false */
  if (plusp == true) {
    querystart = hardclip_low;
    queryend = querylength - hardclip_high;

  } else {
    querystart = hardclip_high;
    queryend = querylength - hardclip_low;
  }

  for (i = 0; i < npairs; i++) {
    pair = &(pairarray[i]);
    if (pair->querypos >= querystart && pair->querypos < queryend) {
      pairs = List_push_out(pairs,(void *) Pair_new_out(pair->querypos + queryseq_offset,/*genomepos*/pair->genomepos,
							pair->cdna,pair->comp,pair->genome));
    }
  }

  return pairs;
}
#endif
2057 
2058 
2059 
#if 0
/* Disabled code (excluded from compilation by the enclosing #if 0). */
/* Writes into complement the characters of sequence in reverse order,
   each mapped through complCode, and terminates the result with '\0'.
   The caller must supply a buffer of at least length+1 characters. */
static void
make_complement_buffered (char *complement, char *sequence, unsigned int length) {
  int i, j;

  /* complement = (char *) CALLOC(length+1,sizeof(char)); */
  /* i runs backward over sequence while j runs forward over complement */
  for (i = length-1, j = 0; i >= 0; i--, j++) {
    complement[(int) j] = complCode[(int) sequence[i]];
  }
  complement[length] = '\0';
  return;
}
#endif
2073 
2074 static void
make_complement_inplace(char * sequence,unsigned int length)2075 make_complement_inplace (char *sequence, unsigned int length) {
2076   char temp;
2077   unsigned int i, j;
2078 
2079   for (i = 0, j = length-1; i < length/2; i++, j--) {
2080     temp = complCode[(int) sequence[i]];
2081     sequence[i] = complCode[(int) sequence[j]];
2082     sequence[j] = temp;
2083   }
2084   if (i == j) {
2085     sequence[i] = complCode[(int) sequence[i]];
2086   }
2087 
2088   return;
2089 }
2090 
2091 
2092 static double
donor_score(Univcoord_T genomicpos,Univcoord_T chroffset,bool revcomp,Genome_T genome,Univ_IIT_T chromosome_iit)2093 donor_score (Univcoord_T genomicpos, Univcoord_T chroffset, bool revcomp, Genome_T genome,
2094 	     Univ_IIT_T chromosome_iit) {
2095   Univcoord_T left;
2096   Chrnum_T chrnum;
2097   int nunknowns;
2098   char gbuffer[MAXENT_MAXLENGTH];
2099   Genomecomp_T *genome_blocks;
2100 
2101   if (revcomp == false) {
2102     if ((genome_blocks = Genome_blocks(genome)) != NULL) {
2103       /* Add 1 to get from exon end to intron start */
2104       return Maxent_hr_donor_prob(genomicpos + 1,chroffset);
2105     } else {
2106       left = genomicpos + 1 - DONOR_MODEL_LEFT_MARGIN; /* Add 1 to get from exon end to intron start */
2107       Genome_fill_buffer(&chrnum,&nunknowns,genome,left,DONOR_MODEL_LEFT_MARGIN+DONOR_MODEL_RIGHT_MARGIN+1,gbuffer,chromosome_iit);
2108 #if 0
2109       printf("\n");
2110       printf("%s donor truestrand:+ left:%u\n",gbuffer,left);
2111       printf("%*s^^\n",DONOR_MODEL_LEFT_MARGIN,"");
2112 #endif
2113       return Maxent_donor_prob(gbuffer);
2114     }
2115 
2116   } else {
2117     if ((genome_blocks = Genome_blocks(genome)) != NULL) {
2118       return Maxent_hr_antidonor_prob(genomicpos,chroffset);
2119     } else {
2120       left = genomicpos - DONOR_MODEL_RIGHT_MARGIN - 1;
2121       Genome_fill_buffer(&chrnum,&nunknowns,genome,left,DONOR_MODEL_LEFT_MARGIN+DONOR_MODEL_RIGHT_MARGIN+1,gbuffer,chromosome_iit);
2122       make_complement_inplace(gbuffer,DONOR_MODEL_LEFT_MARGIN+DONOR_MODEL_RIGHT_MARGIN+1);
2123 #if 0
2124       printf("\n");
2125       printf("%s donor truestrand:- left:%u\n",gbuffer,left);
2126       printf("%*s^^\n",DONOR_MODEL_LEFT_MARGIN,"");
2127 #endif
2128       return Maxent_donor_prob(gbuffer);
2129     }
2130   }
2131 }
2132 
2133 
2134 static double
acceptor_score(Univcoord_T genomicpos,Univcoord_T chroffset,bool revcomp,Genome_T genome,Univ_IIT_T chromosome_iit)2135 acceptor_score (Univcoord_T genomicpos, Univcoord_T chroffset, bool revcomp, Genome_T genome,
2136 		Univ_IIT_T chromosome_iit) {
2137   Univcoord_T left;
2138   Chrnum_T chrnum;
2139   int nunknowns;
2140   char gbuffer[MAXENT_MAXLENGTH];
2141   Genomecomp_T *genome_blocks;
2142 
2143   if (revcomp == false) {
2144     /* sense on plus strand, or antisense on minus strand */
2145     if ((genome_blocks = Genome_blocks(genome)) != NULL) {
2146       return Maxent_hr_acceptor_prob(genomicpos,chroffset);
2147     } else {
2148       left = genomicpos - ACCEPTOR_MODEL_LEFT_MARGIN;
2149       Genome_fill_buffer(&chrnum,&nunknowns,genome,left,ACCEPTOR_MODEL_LEFT_MARGIN+ACCEPTOR_MODEL_RIGHT_MARGIN+1,gbuffer,chromosome_iit);
2150 #if 0
2151       printf("\n");
2152       printf("%s acceptor truestrand:+ left:%u\n",gbuffer,left);
2153       printf("%*s^^\n",ACCEPTOR_MODEL_LEFT_MARGIN-2,"");
2154 #endif
2155       return Maxent_acceptor_prob(gbuffer);
2156     }
2157 
2158   } else {
2159     if ((genome_blocks = Genome_blocks(genome)) != NULL) {
2160       /* Add 1 to get from exon end to intron start */
2161       return Maxent_hr_antiacceptor_prob(genomicpos + 1,chroffset);
2162     } else {
2163       left = genomicpos - ACCEPTOR_MODEL_RIGHT_MARGIN;
2164       Genome_fill_buffer(&chrnum,&nunknowns,genome,left,ACCEPTOR_MODEL_LEFT_MARGIN+ACCEPTOR_MODEL_RIGHT_MARGIN+1,gbuffer,chromosome_iit);
2165       make_complement_inplace(gbuffer,ACCEPTOR_MODEL_LEFT_MARGIN+ACCEPTOR_MODEL_RIGHT_MARGIN+1);
2166 #if 0
2167       printf("\n");
2168       printf("%s acceptor truestrand:- left:%u\n",gbuffer,left);
2169       printf("%*s^^\n",ACCEPTOR_MODEL_LEFT_MARGIN-2,"");
2170 #endif
2171       return Maxent_acceptor_prob(gbuffer);
2172     }
2173   }
2174 }
2175 
2176 
2177 
/* Returns false when c is one of A, C, G, T, or U in either case,
   and true for every other character (including '\0'). */
static bool
unknown_base (char c) {
  static const char known_bases[] = "ACGTUacgtu";
  const char *p;

  /* Explicit scan rather than strchr(), which would match the
     terminating '\0' */
  for (p = known_bases; *p != '\0'; p++) {
    if (*p == c) {
      return false;
    }
  }
  return true;
}
2186 
2187 void
Pair_print_exonsummary(Filestring_T fp,struct T * pairs,int npairs,Chrnum_T chrnum,Univcoord_T chroffset,Genome_T genome,Univ_IIT_T chromosome_iit,bool watsonp,int cdna_direction,bool genomefirstp,int invertmode)2188 Pair_print_exonsummary (Filestring_T fp, struct T *pairs, int npairs, Chrnum_T chrnum,
2189 			Univcoord_T chroffset, Genome_T genome, Univ_IIT_T chromosome_iit,
2190 			bool watsonp, int cdna_direction, bool genomefirstp, int invertmode) {
2191   bool in_exon = false;
2192   struct T *save = NULL, *ptr, *this = NULL;
2193   int exon_querystart = -1, exon_queryend;
2194   Chrpos_T exon_genomestart = 0, exon_genomeend, intron_start, intron_end;
2195   int num = 0, den = 0, i;
2196   char *chrstring = NULL;
2197   int last_querypos = -1;
2198   Chrpos_T last_genomepos = (Chrpos_T) -1;
2199 
2200 
2201   if (watsonp == true) {
2202     ptr = pairs;
2203   } else if (invertmode == 0) {
2204     ptr = pairs;
2205   } else if (invertmode == 1) {
2206     save = ptr = invert_path(pairs,npairs);
2207   } else if (invertmode == 2) {
2208     save = ptr = invert_and_revcomp_path(pairs,npairs);
2209   } else {
2210     fprintf(stderr,"Don't recognize invert mode %d\n",invertmode);
2211     exit(9);
2212   }
2213 
2214   if (chrnum != 0) {
2215     if (invertmode == 2) {
2216       chrstring = Chrnum_to_string_signed(chrnum,chromosome_iit,/*watsonp*/true);
2217     } else {
2218       chrstring = Chrnum_to_string_signed(chrnum,chromosome_iit,watsonp);
2219     }
2220   }
2221 
2222   debug(Pair_dump_array(pairs,npairs,/*zerobasedp*/true));
2223 
2224   for (i = 0; i < npairs; i++) {
2225     /* prev = this; */
2226     this = ptr++;
2227 
2228     if (this->gapp) {
2229       if (in_exon == true) {
2230 	exon_queryend = last_querypos + ONEBASEDP;
2231 	exon_genomeend = last_genomepos + ONEBASEDP;
2232 	if (watsonp) {
2233 	  intron_start = exon_genomeend + 1;
2234 	} else {
2235 	  intron_start = exon_genomeend - 1;
2236 	}
2237 	if (genomefirstp == true) {
2238 	  FPRINTF(fp,"    ");
2239 	  if (chrnum == 0) {
2240 	    FPRINTF(fp,"%u-%u",chroffset+exon_genomestart,chroffset+exon_genomeend);
2241 	  } else {
2242 	    FPRINTF(fp,"%s:%d-%d",chrstring,exon_genomestart,exon_genomeend);
2243 	  }
2244 	  FPRINTF(fp,"  (%d-%d)",exon_querystart,exon_queryend);
2245 	} else {
2246 	  FPRINTF(fp,"    %d-%d",exon_querystart,exon_queryend);
2247 	  FPRINTF(fp,"  ");
2248 	  if (chrnum == 0) {
2249 	    FPRINTF(fp,"(%u-%u)",chroffset+exon_genomestart,chroffset+exon_genomeend);
2250 	  } else {
2251 	    FPRINTF(fp,"(%s:%d-%d)",chrstring,exon_genomestart,exon_genomeend);
2252 	  }
2253 	}
2254 	if (den == 0) {
2255 	  FPRINTF(fp,"   %d%%",100);
2256 	} else {
2257 	  FPRINTF(fp,"   %d%%",(int) floor(100.0*(double) num/(double) den));
2258 	}
2259 	if (this->comp == FWD_CANONICAL_INTRON_COMP) {
2260 	  FPRINTF(fp," ->");
2261 	  /* sensep = true; */
2262 	} else if (this->comp == REV_CANONICAL_INTRON_COMP) {
2263 	  FPRINTF(fp," <-");
2264 	  /* sensep = false; */
2265 	} else if (this->comp == FWD_GCAG_INTRON_COMP) {
2266 	  FPRINTF(fp," -)");
2267 	  /* sensep = true; */
2268 	} else if (this->comp == REV_GCAG_INTRON_COMP) {
2269 	  FPRINTF(fp," (-");
2270 	  /* sensep = false; */
2271 	} else if (this->comp == FWD_ATAC_INTRON_COMP) {
2272 	  FPRINTF(fp," -]");
2273 	  /* sensep = true; */
2274 	} else if (this->comp == REV_ATAC_INTRON_COMP) {
2275 	  FPRINTF(fp," [-");
2276 	  /* sensep = false; */
2277 	} else if (this->comp == NONINTRON_COMP) {
2278 	  FPRINTF(fp," ==");
2279 	  /* sensep = true; */
2280 	} else {
2281 	  FPRINTF(fp," ##");
2282 	  /* sensep = true; */
2283 	}
2284 	in_exon = false;
2285       }
2286     } else if (this->comp == INTRONGAP_COMP) {
2287       /* Do nothing */
2288     } else {
2289       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
2290 	 SHORTGAP_COMP, or MISMATCH_COMP */
2291       if (in_exon == false) {
2292 	exon_querystart = this->querypos + ONEBASEDP;
2293 	exon_genomestart = this->genomepos + ONEBASEDP;
2294 	if (watsonp) {
2295 	  intron_end = exon_genomestart - 1;
2296 	} else {
2297 	  intron_end = exon_genomestart + 1;
2298 	}
2299 	if (i > 0) {
2300 	  if (intron_end > intron_start) {
2301 	    FPRINTF(fp,"   ...%d...",intron_end - intron_start + 1);
2302 	  } else {
2303 	    FPRINTF(fp,"   ...%d...",intron_start - intron_end + 1);
2304 	  }
2305 
2306 	  if (exon_querystart > exon_queryend + 1) {
2307 	    FPRINTF(fp,"   ***query_skip:%d***",exon_querystart-(exon_queryend+1));
2308 	  }
2309 
2310 	  if (genome != NULL) {
2311 	    if (cdna_direction > 0) {
2312 	      FPRINTF(fp,"  %.3f, %.3f",
2313 		      donor_score(chroffset+exon_genomeend-1,chroffset,!watsonp,genome,chromosome_iit),
2314 		      acceptor_score(chroffset+exon_genomestart-1,chroffset,!watsonp,genome,chromosome_iit));
2315 	    } else if (cdna_direction < 0) {
2316 	      FPRINTF(fp,"  %.3f, %.3f",
2317 		      acceptor_score(chroffset+exon_genomeend-1,chroffset,watsonp,genome,chromosome_iit),
2318 		      donor_score(chroffset+exon_genomestart-1,chroffset,watsonp,genome,chromosome_iit));
2319 	    }
2320 	  }
2321 
2322 	  PUTC('\n',fp);
2323 	}
2324 	num = den = 0;
2325 	in_exon = true;
2326       }
2327       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
2328 	/* Previously not counted in numerator or denominator */
2329 	den++;
2330 #ifndef PMAP
2331       } else if (unknown_base(this->cdna) || unknown_base(this->genome)) {
2332 	/* Comp must be a space */
2333 	/* Don't count in numerator or denominator */
2334 #endif
2335       } else {
2336 	den++;
2337 	if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
2338 	  num++;
2339 	} else if (this->comp == AMBIGUOUS_COMP) {
2340 #ifdef PMAP
2341 	  num++;
2342 #else
2343 	  den--;
2344 #endif
2345 	}
2346       }
2347     }
2348 
2349     if (this->cdna != ' ') {
2350       last_querypos = this->querypos;
2351     }
2352     if (this->genome != ' ') {
2353       last_genomepos = this->genomepos;
2354     }
2355   }
2356 
2357   /* prev = this; */
2358   exon_queryend = last_querypos + ONEBASEDP;
2359   exon_genomeend = last_genomepos + ONEBASEDP;
2360   if (genomefirstp == true) {
2361     FPRINTF(fp,"    ");
2362     if (chrnum == 0) {
2363       FPRINTF(fp,"%u-%u",chroffset+exon_genomestart,chroffset+exon_genomeend);
2364     } else {
2365       FPRINTF(fp,"%s:%d-%d",chrstring,exon_genomestart,exon_genomeend);
2366     }
2367     FPRINTF(fp,"  (%d-%d)",exon_querystart,exon_queryend);
2368   } else {
2369     FPRINTF(fp,"    %d-%d",exon_querystart,exon_queryend);
2370     FPRINTF(fp,"  ");
2371     if (chrnum == 0) {
2372       FPRINTF(fp,"(%u-%u)",chroffset+exon_genomestart,chroffset+exon_genomeend);
2373     } else {
2374       FPRINTF(fp,"(%s:%d-%d)",chrstring,exon_genomestart,exon_genomeend);
2375     }
2376   }
2377   if (den == 0) {
2378     FPRINTF(fp,"   %d%%",100);
2379   } else {
2380     FPRINTF(fp,"   %d%%",(int) floor(100.0*(double) num/(double) den));
2381   }
2382   FPRINTF(fp,"\n\n");
2383 
2384   if (chrstring != NULL) {
2385     FREE(chrstring);
2386   }
2387   if (save != NULL) {
2388     FREE(save);
2389   }
2390 
2391   return;
2392 }
2393 
2394 void
Pair_tokens_free(List_T * tokens)2395 Pair_tokens_free (List_T *tokens) {
2396   List_T p;
2397   char *token;
2398 
2399   for (p = *tokens; p != NULL; p = List_next(p)) {
2400     token = (char *) List_head(p);
2401     FREE_OUT(token);
2402   }
2403   List_free_out(&(*tokens));
2404 
2405   return;
2406 }
2407 
2408 
2409 List_T
Pair_tokens_copy(List_T old)2410 Pair_tokens_copy (List_T old) {
2411   List_T new = NULL;
2412   char *new_token, *old_token;
2413 
2414   while (old != NULL) {
2415     old_token = (char *) List_head(old);
2416     new_token = (char *) MALLOC_OUT((strlen(old_token)+1) * sizeof(char));
2417     strcpy(new_token,old_token);
2418     new = List_push_out(new,(void *) new_token);
2419     old = List_next(old);
2420   }
2421 
2422   return List_reverse(new);
2423 }
2424 
2425 
2426 
#if 0
/* Tokens used by compressed and gff3 formats */
/* Used by Pair_print_compressed_old */
/* Prints the tokens with run-length compression: the first token is
   preceded by a tab, each later distinct token by a space, and a run
   of identical consecutive tokens is followed by "!" and the run
   length.  Frees each token string afterward. */
static void
print_tokens_compressed (Filestring_T fp, List_T tokens) {
  List_T cell;
  int runlength = 1;
  char *string, *previous = NULL;

  for (cell = tokens; cell != NULL; cell = List_next(cell)) {
    string = (char *) List_head(cell);

    if (previous != NULL && strcmp(string,previous) == 0) {
      /* Extends the current run */
      runlength++;

    } else if (previous == NULL) {
      FPRINTF(fp,"\t%s",string);
      previous = string;

    } else {
      /* Close the previous run before starting a new one */
      if (runlength > 1) {
        FPRINTF(fp,"!%d",runlength);
      }
      FPRINTF(fp," %s",string);
      previous = string;
      runlength = 1;
    }
  }
  if (runlength > 1) {
    FPRINTF(fp,"!%d",runlength);
  }

  cell = tokens;
  while (cell != NULL) {
    string = (char *) List_head(cell);
    FREE_OUT(string);
    cell = List_next(cell);
  }

  return;
}
#endif
2464 
2465 
2466 static void
print_tokens_gff3(Filestring_T fp,List_T tokens)2467 print_tokens_gff3 (Filestring_T fp, List_T tokens) {
2468   List_T p;
2469   char *token;
2470 
2471   if (tokens != NULL) {
2472     p = tokens;
2473     token = (char *) List_head(p);
2474     FPRINTF(fp,"%s",token);
2475 
2476     for (p = List_next(p); p != NULL; p = List_next(p)) {
2477       token = (char *) List_head(p);
2478       FPRINTF(fp," %s",token);
2479     }
2480   }
2481 
2482   for (p = tokens; p != NULL; p = List_next(p)) {
2483     token = (char *) List_head(p);
2484     FREE_OUT(token);
2485   }
2486 
2487   return;
2488 }
2489 
2490 static List_T
push_token(List_T tokens,char * token)2491 push_token (List_T tokens, char *token) {
2492   char *copy;
2493 
2494   copy = (char *) MALLOC_OUT((strlen(token)+1) * sizeof(char));
2495   strcpy(copy,token);
2496   return List_push_out(tokens,(void *) copy);
2497 }
2498 
2499 
2500 /* Definition of GFF3 format is at http://song.sourceforge.net/gff3.shtml */
2501 
2502 static void
print_gff3_gene(Filestring_T fp,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T start_genomepos,Chrpos_T end_genomepos,bool watsonp,int cdna_direction)2503 print_gff3_gene (Filestring_T fp, int pathnum, char *sourcename, char *accession, char *fasta_annotation,
2504 		 char *chrstring, Chrpos_T start_genomepos, Chrpos_T end_genomepos,
2505 		 bool watsonp, int cdna_direction) {
2506 
2507   /* 1: seqid */
2508   if (chrstring == NULL) {
2509     FPRINTF(fp,"%s\t","NA");
2510   } else {
2511     FPRINTF(fp,"%s\t",chrstring);
2512   }
2513   FPRINTF(fp,"%s\t",sourcename);	/* 2: source */
2514   FPRINTF(fp,"gene\t");		/* 3: type */
2515 
2516   if (start_genomepos < end_genomepos) {
2517     FPRINTF(fp,"%u\t%u\t",start_genomepos,end_genomepos); /* 4,5: start, end */
2518   } else {
2519     FPRINTF(fp,"%u\t%u\t",end_genomepos,start_genomepos); /* 4,5: start, end */
2520   }
2521 
2522   FPRINTF(fp,".\t");		/* 6: score */
2523 
2524   if (watsonp == true) {
2525     if (cdna_direction >= 0) {
2526       FPRINTF(fp,"+\t");
2527     } else {
2528       FPRINTF(fp,"-\t");
2529     }
2530   } else {
2531     if (cdna_direction >= 0) {
2532       FPRINTF(fp,"-\t");		/* 7: strand */
2533     } else {
2534       FPRINTF(fp,"+\t");
2535     }
2536   }
2537 
2538   FPRINTF(fp,".\t");		/* 8: phase */
2539 
2540   /* 9: features */
2541   if (accession == NULL) {
2542     FPRINTF(fp,"ID=%s.path%d;Name=%s","NA",pathnum,"NA");
2543   } else {
2544     FPRINTF(fp,"ID=%s.path%d;Name=%s",accession,pathnum,accession);
2545   }
2546 
2547   if (fasta_annotation != NULL) {
2548     FPRINTF(fp,";%s",fasta_annotation);
2549   }
2550 
2551   if (cdna_direction > 0) {
2552     FPRINTF(fp,";Dir=sense");
2553   } else if (cdna_direction < 0) {
2554     FPRINTF(fp,";Dir=antisense");
2555   } else {
2556     FPRINTF(fp,";Dir=indeterminate");
2557   }
2558 
2559   PUTC('\n',fp);
2560 
2561   return;
2562 }
2563 
2564 static void
print_gff3_mrna(Filestring_T fp,int pathnum,T start,T end,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T start_genomepos,Chrpos_T end_genomepos,int querylength_given,int skiplength,int matches,int mismatches,int qindels,int tindels,int unknowns,bool watsonp,int cdna_direction)2565 print_gff3_mrna (Filestring_T fp, int pathnum, T start, T end,
2566 		 char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2567 		 Chrpos_T start_genomepos, Chrpos_T end_genomepos,
2568 		 int querylength_given, int skiplength, int matches, int mismatches,
2569 		 int qindels, int tindels, int unknowns, bool watsonp, int cdna_direction) {
2570   int den;
2571   int querypos1, querypos2;
2572   double coverage, fracidentity;
2573 
2574   /* 1: seqid */
2575   if (chrstring == NULL) {
2576     FPRINTF(fp,"%s\t","NA");
2577   } else {
2578     FPRINTF(fp,"%s\t",chrstring);
2579   }
2580   FPRINTF(fp,"%s\t",sourcename);	/* 2: source */
2581   FPRINTF(fp,"mRNA\t");		/* 3: type */
2582   if (start_genomepos < end_genomepos) {
2583     FPRINTF(fp,"%u\t%u\t",start_genomepos,end_genomepos); /* 4,5: start, end */
2584   } else {
2585     FPRINTF(fp,"%u\t%u\t",end_genomepos,start_genomepos); /* 4,5: start, end */
2586   }
2587 
2588   FPRINTF(fp,".\t");		/* 6: score */
2589 
2590   if (watsonp == true) {
2591     if (cdna_direction >= 0) {
2592       FPRINTF(fp,"+\t");
2593     } else {
2594       FPRINTF(fp,"-\t");
2595     }
2596   } else {
2597     if (cdna_direction >= 0) {
2598       FPRINTF(fp,"-\t");		/* 7: strand */
2599     } else {
2600       FPRINTF(fp,"+\t");
2601     }
2602   }
2603 
2604   FPRINTF(fp,".\t");		/* 8: phase */
2605 
2606   /* 9: features */
2607   if (accession == NULL) {
2608     FPRINTF(fp,"ID=%s.mrna%d;Name=%s;Parent=%s.path%d",
2609 	    "NA",pathnum,"NA","NA",pathnum);
2610   } else {
2611     FPRINTF(fp,"ID=%s.mrna%d;Name=%s;Parent=%s.path%d",
2612 	    accession,pathnum,accession,accession,pathnum);
2613   }
2614 
2615   if (fasta_annotation != NULL) {
2616     FPRINTF(fp,";%s",fasta_annotation);
2617   }
2618 
2619   if (cdna_direction > 0) {
2620     FPRINTF(fp,";Dir=sense");
2621   } else if (cdna_direction < 0) {
2622     FPRINTF(fp,";Dir=antisense");
2623   } else {
2624     FPRINTF(fp,";Dir=indeterminate");
2625   }
2626 
2627   querypos1 = start->querypos;
2628   querypos2 = end->querypos;
2629 
2630 #ifdef PMAP
2631   coverage = (double) (querypos2 - querypos1 + 1)/(double) (3*(querylength_given + skiplength));
2632   /* Can have coverage greater than given querylength because of added '*' at end */
2633   if (coverage > 1.0) {
2634     coverage = 1.0;
2635   }
2636 #else
2637   coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given + skiplength);
2638 #endif
2639   FPRINTF(fp,";coverage=%.1f",((double) rint(1000.0*coverage))/10.0);
2640 
2641   if ((den = matches + mismatches + qindels + tindels) == 0) {
2642     fracidentity = 1.0;
2643   } else {
2644     fracidentity = (double) matches/(double) den;
2645   }
2646   FPRINTF(fp,";identity=%.1f",((double) rint(1000.0*fracidentity))/10.0);
2647   FPRINTF(fp,";matches=%d;mismatches=%d;indels=%d;unknowns=%d",
2648 	  matches,mismatches,qindels+tindels,unknowns);
2649 
2650   PUTC('\n',fp);
2651 
2652   return;
2653 }
2654 
2655 
2656 static void
print_gff3_exon(Filestring_T fp,int exonno,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T exon_genomestart,Chrpos_T exon_genomeend,int exon_querystart,int exon_queryend,bool watsonp,int cdna_direction,int pctidentity)2657 print_gff3_exon (Filestring_T fp, int exonno, int pathnum, char *sourcename,
2658 		 char *accession, char *fasta_annotation, char *chrstring,
2659 		 Chrpos_T exon_genomestart, Chrpos_T exon_genomeend,
2660 		 int exon_querystart, int exon_queryend, bool watsonp, int cdna_direction,
2661 		 int pctidentity) {
2662 
2663   if (exon_genomestart == exon_genomeend) {
2664     /* Due to a query skip, so don't print */
2665 
2666   } else {
2667     /* 1: seqid */
2668     if (chrstring == NULL) {
2669       FPRINTF(fp,"%s\t","NA");
2670     } else {
2671       FPRINTF(fp,"%s\t",chrstring);
2672     }
2673     FPRINTF(fp,"%s\t",sourcename);	/* 2: source */
2674     FPRINTF(fp,"exon\t");		/* 3: type */
2675     if (exon_genomestart < exon_genomeend) {
2676       FPRINTF(fp,"%u\t%u\t",exon_genomestart,exon_genomeend); /* 4,5: start, end */
2677     } else {
2678       FPRINTF(fp,"%u\t%u\t",exon_genomeend,exon_genomestart); /* 4,5: start, end */
2679     }
2680     FPRINTF(fp,"%d\t",pctidentity);	/* 6: score */
2681 
2682     if (watsonp == true) {
2683       if (cdna_direction >= 0) {
2684 	FPRINTF(fp,"+\t");
2685       } else {
2686 	FPRINTF(fp,"-\t");
2687       }
2688     } else {
2689       if (cdna_direction >= 0) {
2690 	FPRINTF(fp,"-\t");		/* 7: strand */
2691       } else {
2692 	FPRINTF(fp,"+\t");
2693       }
2694     }
2695 
2696     FPRINTF(fp,".\t");		/* 8: phase */
2697 
2698     /* 9: features */
2699     if (accession == NULL) {
2700       accession = "NA";
2701     }
2702     FPRINTF(fp,"ID=%s.mrna%d.exon%d;",accession,pathnum,exonno);
2703     FPRINTF(fp,"Name=%s;",accession);
2704     FPRINTF(fp,"Parent=%s.mrna%d",accession,pathnum);
2705 
2706     if (fasta_annotation != NULL) {
2707       FPRINTF(fp,";%s",fasta_annotation);
2708     }
2709 
2710     if (cdna_direction > 0) {
2711       FPRINTF(fp,";Target=%s %d %d +\n",accession,exon_querystart,exon_queryend);
2712     } else if (cdna_direction < 0) {
2713       FPRINTF(fp,";Target=%s %d %d -\n",accession,exon_queryend,exon_querystart);
2714     } else {
2715       FPRINTF(fp,";Target=%s %d %d .\n",accession,exon_queryend,exon_querystart);
2716     }
2717   }
2718 
2719   return;
2720 }
2721 
2722 static void
print_gff3_cds(Filestring_T fp,int cdsno,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T cds_genomestart,Chrpos_T cds_genomeend,int cds_querystart,int cds_queryend,bool watsonp,int cdna_direction,int pctidentity,int cds_phase)2723 print_gff3_cds (Filestring_T fp, int cdsno, int pathnum,
2724 		char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2725 		Chrpos_T cds_genomestart, Chrpos_T cds_genomeend,
2726 		int cds_querystart, int cds_queryend, bool watsonp, int cdna_direction,
2727 		int pctidentity, int cds_phase) {
2728 
2729   assert(cds_phase >= 0);
2730 
2731   if (cds_genomestart == cds_genomeend) {
2732     /* Due to a query skip, so don't print */
2733 
2734   } else {
2735     /* 1: seqid */
2736     if (chrstring == NULL) {
2737       FPRINTF(fp,"%s\t","NA");
2738     } else {
2739       FPRINTF(fp,"%s\t",chrstring);
2740     }
2741     FPRINTF(fp,"%s\t",sourcename);	/* 2: source */
2742     FPRINTF(fp,"CDS\t");		/* 3: type */
2743     if (cds_genomestart < cds_genomeend) {
2744       FPRINTF(fp,"%u\t%u\t",cds_genomestart,cds_genomeend); /* 4,5: start, end */
2745     } else {
2746       FPRINTF(fp,"%u\t%u\t",cds_genomeend,cds_genomestart); /* 4,5: start, end */
2747     }
2748     FPRINTF(fp,"%d\t",pctidentity);	/* 6: score */
2749 
2750     if (watsonp == true) {
2751       if (cdna_direction >= 0) {
2752 	FPRINTF(fp,"+\t");
2753       } else {
2754 	FPRINTF(fp,"-\t");
2755       }
2756     } else {
2757       if (cdna_direction >= 0) {
2758 	FPRINTF(fp,"-\t");		/* 7: strand */
2759       } else {
2760 	FPRINTF(fp,"+\t");
2761       }
2762     }
2763 
2764     if (gff3_phase_swap_p == true && cds_phase > 0) {
2765       /* Some analysis programs want phase in gff3 to be different */
2766       FPRINTF(fp,"%d\t",3 - cds_phase);	/* 8: phase */
2767     } else {
2768       /* This appears to be the specification: a phase of 0 indicates
2769 	 that the next codon begins at the first base of the region
2770 	 described by the current line, a phase of 1 indicates that the
2771 	 next codon begins at the second base of this region, and a
2772 	 phase of 2 indicates that the codon begins at the third base of
2773 	 this region. */
2774       FPRINTF(fp,"%d\t",cds_phase);	/* 8: phase */
2775     }
2776 
2777     /* 9: features */
2778     if (accession == NULL) {
2779       accession = "NA";
2780     }
2781     FPRINTF(fp,"ID=%s.mrna%d.cds%d;",accession,pathnum,cdsno);
2782     FPRINTF(fp,"Name=%s;",accession);
2783     FPRINTF(fp,"Parent=%s.mrna%d",accession,pathnum);
2784 
2785     if (fasta_annotation != NULL) {
2786       FPRINTF(fp,";%s",fasta_annotation);
2787     }
2788 
2789     if (cdna_direction > 0) {
2790       FPRINTF(fp,";Target=%s %d %d +\n",accession,cds_querystart,cds_queryend);
2791     } else if (cdna_direction > 0) {
2792       FPRINTF(fp,";Target=%s %d %d -\n",accession,cds_queryend,cds_querystart);
2793     } else {
2794       FPRINTF(fp,";Target=%s %d %d .\n",accession,cds_queryend,cds_querystart);
2795     }
2796   }
2797 
2798   return;
2799 }
2800 
2801 
2802 static void
print_gff3_cdna_match(Filestring_T fp,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T exon_genomestart,Chrpos_T exon_genomeend,int exon_querystart,int exon_queryend,bool watsonp,int cdna_direction,int pctidentity,List_T tokens)2803 print_gff3_cdna_match (Filestring_T fp, int pathnum,
2804 		       char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2805 		       Chrpos_T exon_genomestart, Chrpos_T exon_genomeend,
2806 		       int exon_querystart, int exon_queryend, bool watsonp, int cdna_direction,
2807 		       int pctidentity, List_T tokens) {
2808 
2809   if (exon_genomestart == exon_genomeend) {
2810     /* Due to a query skip, so don't print */
2811 
2812   } else {
2813     /* 1: seqid */
2814     if (chrstring == NULL) {
2815       FPRINTF(fp,"%s\t","NA");
2816     } else {
2817       FPRINTF(fp,"%s\t",chrstring);
2818     }
2819     FPRINTF(fp,"%s\t",sourcename);	/* 2: source */
2820     FPRINTF(fp,"cDNA_match\t");		/* 3: type */
2821     if (exon_genomestart < exon_genomeend) {
2822       FPRINTF(fp,"%u\t%u\t",exon_genomestart,exon_genomeend); /* 4,5: start, end */
2823     } else {
2824       FPRINTF(fp,"%u\t%u\t",exon_genomeend,exon_genomestart); /* 4,5: start, end */
2825     }
2826     FPRINTF(fp,"%d\t",pctidentity);	/* 6: score */
2827 
2828     /* 7: strand */
2829     if (watsonp == true) {
2830       FPRINTF(fp,"+\t");
2831     } else {
2832       FPRINTF(fp,"-\t");
2833     }
2834 
2835     FPRINTF(fp,".\t");		/* 8: phase */
2836 
2837     /* 9: features */
2838     if (accession == NULL) {
2839       accession = "NA";
2840     }
2841     FPRINTF(fp,"ID=%s.path%d;",accession,pathnum);
2842     FPRINTF(fp,"Name=%s",accession);
2843 
2844     if (fasta_annotation != NULL) {
2845       FPRINTF(fp,";%s",fasta_annotation);
2846     }
2847 
2848     if (cdna_direction > 0) {
2849       FPRINTF(fp,";Dir=sense");
2850     } else if (cdna_direction < 0) {
2851       FPRINTF(fp,";Dir=antisense");
2852     } else {
2853       FPRINTF(fp,";Dir=indeterminate");
2854     }
2855 
2856     FPRINTF(fp,";Target=%s %d %d;Gap=",accession,exon_querystart,exon_queryend);
2857     print_tokens_gff3(fp,tokens);
2858     PUTC('\n',fp);
2859   }
2860 
2861   return;
2862 }
2863 
2864 
/* Maps a strand value to its GFF3 character: 1 gives '+', -1 gives
   '-', and any other value gives '.' (unknown strand). */
static char
strand_char (int strand) {
  if (strand == 1) {
    return '+';
  } else if (strand == -1) {
    return '-';
  } else {
    /* Previously returned '?' for 0 -- Now returning '.' for unknown strand */
    return '.';
  }
}
2874 
2875 
2876 static void
print_gff3_est_match(Filestring_T fp,int pathnum,T start,T end,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,Chrpos_T exon_genomestart,Chrpos_T exon_genomeend,int exon_querystart,int exon_queryend,int querylength_given,int skiplength,int matches,int mismatches,int qindels,int tindels,int unknowns,bool watsonp,int cdna_direction,int pctidentity,List_T tokens)2877 print_gff3_est_match (Filestring_T fp, int pathnum, T start, T end,
2878 		      char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2879 		      Chrpos_T exon_genomestart, Chrpos_T exon_genomeend,
2880 		      int exon_querystart, int exon_queryend,
2881 		      int querylength_given, int skiplength, int matches, int mismatches, int qindels, int tindels,
2882 		      int unknowns, bool watsonp, int cdna_direction, int pctidentity, List_T tokens) {
2883   int feature_strand, target_strand;
2884   double coverage, fracidentity;
2885   int den;
2886   int querypos1, querypos2;
2887 
2888   if (exon_genomestart == exon_genomeend) {
2889     /* Due to a query skip, so don't print */
2890 
2891   } else {
2892     /* 1: seqid */
2893     if (chrstring == NULL) {
2894       FPRINTF(fp,"%s\t","NA");
2895     } else {
2896       FPRINTF(fp,"%s\t",chrstring);
2897     }
2898     FPRINTF(fp,"%s\t",sourcename);	/* 2: source */
2899     FPRINTF(fp,"EST_match\t");	/* 3: type */
2900     if (exon_genomestart < exon_genomeend) {
2901       FPRINTF(fp,"%u\t%u\t",exon_genomestart,exon_genomeend); /* 4,5: start, end */
2902     } else {
2903       FPRINTF(fp,"%u\t%u\t",exon_genomeend,exon_genomestart); /* 4,5: start, end */
2904     }
2905     FPRINTF(fp,"%d\t",pctidentity);	/* 6: score */
2906 
2907     /* 7: strand */
2908     feature_strand = watsonp ? cdna_direction : -cdna_direction;
2909     FPRINTF(fp,"%c\t",strand_char(feature_strand));
2910 
2911     FPRINTF(fp,".\t");		/* 8: phase */
2912 
2913     /* 9: features */
2914     if (accession == NULL) {
2915       accession = "NA";
2916     }
2917     FPRINTF(fp,"ID=%s.path%d;",accession,pathnum);
2918     FPRINTF(fp,"Name=%s",accession);
2919 
2920     if (fasta_annotation != NULL) {
2921       FPRINTF(fp,";%s",fasta_annotation);
2922     }
2923 
2924     if (cdna_direction > 0) {
2925       FPRINTF(fp,";Dir=sense");
2926     } else if (cdna_direction < 0) {
2927       FPRINTF(fp,";Dir=antisense");
2928     } else {
2929       FPRINTF(fp,";Dir=indeterminate");
2930     }
2931 
2932     target_strand = cdna_direction != 0 ? cdna_direction : (watsonp ? 1 : -1);
2933     FPRINTF(fp,";Target=%s %d %d %c;Gap=",accession,exon_querystart,exon_queryend,
2934 	    strand_char(target_strand));
2935     print_tokens_gff3(fp,tokens);
2936 
2937     querypos1 = start->querypos;
2938     querypos2 = end->querypos;
2939 
2940 #ifdef PMAP
2941     coverage = (double) (querypos2 - querypos1 + 1)/(double) (3*(querylength_given + skiplength));
2942     /* Can have coverage greater than given querylength because of added '*' at end */
2943     if (coverage > 1.0) {
2944       coverage = 1.0;
2945     }
2946 #else
2947     coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given + skiplength);
2948 #endif
2949     FPRINTF(fp,";coverage=%.1f",((double) rint(1000.0*coverage))/10.0);
2950 
2951     if ((den = matches + mismatches + qindels + tindels) == 0) {
2952       fracidentity = 1.0;
2953     } else {
2954       fracidentity = (double) matches/(double) den;
2955     }
2956     FPRINTF(fp,";identity=%.1f",((double) rint(1000.0*fracidentity))/10.0);
2957     FPRINTF(fp,";matches=%d;mismatches=%d;indels=%d;unknowns=%d",
2958 	    matches,mismatches,qindels+tindels,unknowns);
2959 
2960     PUTC('\n',fp);
2961   }
2962 
2963   return;
2964 }
2965 
2966 
2967 static void
print_gff3_exons_forward(Filestring_T fp,struct T * pairs,int npairs,int pathnum,T start,T end,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,int querylength_given,int skiplength,int matches,int mismatches,int qindels,int tindels,int unknowns,bool watsonp,int cdna_direction,bool gff_introns_p,bool gff_gene_format_p,bool gff_estmatch_format_p,bool cds_p)2968 print_gff3_exons_forward (Filestring_T fp, struct T *pairs, int npairs, int pathnum, T start, T end,
2969 			  char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
2970 			  int querylength_given, int skiplength, int matches, int mismatches,
2971 			  int qindels, int tindels, int unknowns, bool watsonp, int cdna_direction,
2972 			  bool gff_introns_p, bool gff_gene_format_p, bool gff_estmatch_format_p,
2973 			  bool cds_p) {
2974   bool in_exon = false;
2975   struct T *ptr, *this = NULL;
2976   int exon_querystart = -1, exon_queryend, exon_phase = 0;
2977   Chrpos_T exon_genomestart = 0, exon_genomeend, intron_start, intron_end;
2978   int pctidentity, num = 0, den = 0, exonno = 0, cdsno = 0, starti, endi, last_valid_i, i;
2979   int Mlength = 0, Ilength = 0, Dlength = 0;
2980   List_T tokens = NULL;
2981   char token[11];
2982 #if 0
2983   int intronno = 0;
2984 #endif
2985   int estmatch_querystart, estmatch_queryend, estmatch_genomestart, estmatch_genomeend;
2986   int last_querypos = -1;
2987   Chrpos_T last_genomepos = (Chrpos_T) -1;
2988 
2989   endi = npairs - 1;
2990   if (cds_p == false) {
2991     starti = 0;
2992 
2993   } else if (cdstype == CDS_CDNA) {
2994     i = 0;
2995     starti = -1;
2996     while (i < npairs) {
2997       if (pairs[i].gapp == true) {
2998 	i++;
2999       } else if (pairs[i].cdna == ' ') {
3000 	i++;
3001       } else if (pairs[i].aaphase_e == -1) {
3002 	i++;
3003       } else {
3004 	debug7(printf("FORWARD: Setting starti to be %d\n",i));
3005 	starti = i;
3006 	last_valid_i = i;
3007 	while (i < npairs) {
3008 	  if (pairs[i].gapp == true) {
3009 	    i++;
3010 	  } else if (pairs[i].cdna == ' ') {
3011 	    i++;
3012 	  } else if (pairs[i].aaphase_e != -1) {
3013 	    last_valid_i = i;
3014 	    i++;
3015 	  } else {
3016 	    debug7(printf("FORWARD: Saw aaphase_e of -1 at pair %d\n",i));
3017 	    endi = last_valid_i; /* inclusive */
3018 	    i = npairs;
3019 	  }
3020 	}
3021       }
3022     }
3023 
3024   } else if (cdstype == CDS_GENOMIC) {
3025     i = 0;
3026     starti = -1;
3027     while (i < npairs) {
3028       if (pairs[i].gapp == true) {
3029 	i++;
3030       } else if (pairs[i].genome == ' ') {
3031 	i++;
3032       } else if (pairs[i].aaphase_g == -1) {
3033 	i++;
3034       } else {
3035 	debug7(printf("FORWARD: Setting starti to be %d\n",i));
3036 	starti = i;
3037 	last_valid_i = i;
3038 	while (i < npairs) {
3039 	  if (pairs[i].gapp == true) {
3040 	    i++;
3041 	  } else if (pairs[i].genome == ' ') {
3042 	    i++;
3043 	  } else if (pairs[i].aaphase_g != -1) {
3044 	    last_valid_i = i;
3045 	    i++;
3046 	  } else {
3047 	    debug7(printf("FORWARD: Saw aaphase_g of -1 at pair %d\n",i));
3048 	    endi = last_valid_i; /* inclusive */
3049 	    i = npairs;
3050 	  }
3051 	}
3052       }
3053     }
3054 
3055   } else {
3056     fprintf(stderr,"Do not recognize cdstype %d\n",cdstype);
3057     abort();
3058   }
3059 
3060   debug7(Pair_dump_array(pairs,npairs,true));
3061 
3062   if (cds_p == true && starti < 0) {
3063     /* Want CDS, and none seen */
3064     return;
3065   }
3066 
3067   ptr = &(pairs[starti]);
3068   for (i = starti; i <= endi; i++) {
3069     /* prev = this; */
3070     this = ptr++;
3071 
3072     if (this->gapp) {
3073       if (in_exon == true) {
3074 	exon_queryend = last_querypos + 1;
3075 	exon_genomeend = last_genomepos + 1;
3076 
3077 	if (watsonp) {
3078 	  intron_start = exon_genomeend + 1;
3079 	} else {
3080 	  intron_start = exon_genomeend - 1;
3081 	}
3082 
3083 	if (den == 0) {
3084 	  pctidentity = 100;
3085 	} else {
3086 	  pctidentity = (int) floor(100.0*(double) num/(double) den);
3087 	}
3088 
3089 	if (cds_p == true) {
3090 	  print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3091 			 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,exon_phase);
3092 
3093 	} else if (gff_gene_format_p == true) {
3094 	  print_gff3_exon(fp,++exonno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3095 			  exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity);
3096 	} else {
3097 	  if (Mlength > 0) {
3098 	    sprintf(token,"M%d",Mlength);
3099 	    tokens = push_token(tokens,token);
3100 	  } else if (Ilength > 0) {
3101 	    sprintf(token,"I%d",Ilength);
3102 	    tokens = push_token(tokens,token);
3103 	  } else if (Dlength > 0) {
3104 	    sprintf(token,"D%d",Dlength);
3105 	    tokens = push_token(tokens,token);
3106 	  }
3107 	  if (gff_estmatch_format_p == false) {
3108 	    tokens = List_reverse(tokens);
3109 	    /* ++exonno; */
3110 	    print_gff3_cdna_match(fp,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3111 				  exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,tokens);
3112 	    List_free_out(&tokens);
3113 	  }
3114 	}
3115 
3116 	Mlength = Ilength = Dlength = 0;
3117 	in_exon = false;
3118       }
3119     } else if (this->comp == INTRONGAP_COMP) {
3120       /* Do nothing */
3121     } else {
3122       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
3123 	 SHORTGAP_COMP, or MISMATCH_COMP */
3124       if (in_exon == false) {
3125 	exon_querystart = this->querypos + 1;
3126 	exon_genomestart = this->genomepos + 1;
3127 #if 0
3128 	if (this->aaphase_e != -1) {
3129 	  /* Otherwise, if phase is -1 from an indel, use previous exon_phase.  Should be fixed now. */
3130 	  exon_phase = this->aaphase_e;
3131 	}
3132 #else
3133 	if (cdstype == CDS_CDNA) {
3134 	  exon_phase = this->aaphase_e;
3135 	} else {
3136 	  exon_phase = this->aaphase_g;
3137 	}
3138 #endif
3139 	if (watsonp) {
3140 	  intron_end = exon_genomestart - 1;
3141 	} else {
3142 	  intron_end = exon_genomestart + 1;
3143 	}
3144 
3145 	if (gff_estmatch_format_p == true && i > 0) {
3146 	  /* abs() gives a large value when flag -m64 is specified */
3147 	  /* sprintf(token,"N%u",abs(intron_end - intron_start) + 1); */
3148 	  if (intron_end > intron_start) {
3149 	    sprintf(token,"N%u",(intron_end - intron_start) + 1);
3150 	  } else {
3151 	    sprintf(token,"N%u",(intron_start - intron_end) + 1);
3152 	  }
3153 
3154 	  tokens = push_token(tokens,token);
3155 	} else if (gff_introns_p == true) {
3156 	  if (i > 0) {
3157 #if 0
3158 	    printf_gff3_intron(++intronno,pathnum,sourcename,accession,chrstring,?,?,intron_start,intron_end,watsonp);
3159 #endif
3160 	  }
3161 	  PUTC('\n',fp);
3162 	}
3163 
3164 	num = den = 0;
3165 	in_exon = true;
3166       }
3167       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
3168 	/* Gap in upper or lower sequence */
3169 	if (gff_gene_format_p == true) {
3170 	  /* Don't deal with tokens */
3171 	} else if (this->genome == ' ') {
3172 	  if (Mlength > 0) {
3173 	    sprintf(token,"M%d",Mlength);
3174 	    tokens = push_token(tokens,token);
3175 	    Mlength = 0;
3176 	  } else if (Dlength > 0) {
3177 	    /* unlikely */
3178 	    sprintf(token,"D%d",Dlength);
3179 	    tokens = push_token(tokens,token);
3180 	    Dlength = 0;
3181 	  }
3182 	  Ilength++;
3183 	} else if (this->cdna == ' ') {
3184 	  if (Mlength > 0) {
3185 	    sprintf(token,"M%d",Mlength);
3186 	    tokens = push_token(tokens,token);
3187 	    Mlength = 0;
3188 	  } else if (Ilength > 0) {
3189 	    sprintf(token,"I%d",Ilength);
3190 	    tokens = push_token(tokens,token);
3191 	    Ilength = 0;
3192 	  }
3193 	  Dlength++;
3194 	} else {
3195 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
3196 	  exit(9);
3197 	}
3198 
3199 	/* Previously not counted in numerator or denominator */
3200 	den++;
3201 
3202       } else {
3203 	/* Count in token even if unknown base */
3204 
3205 	if (gff_gene_format_p == true) {
3206 	  /* Don't deal with tokens */
3207 	} else if (Ilength > 0) {
3208 	  sprintf(token,"I%d",Ilength);
3209 	  tokens = push_token(tokens,token);
3210 	  Ilength = 0;
3211 	} else if (Dlength > 0) {
3212 	  sprintf(token,"D%d",Dlength);
3213 	  tokens = push_token(tokens,token);
3214 	  Dlength = 0;
3215 	}
3216 	Mlength++;
3217 
3218 #ifdef PMAP
3219 	den++;
3220 	if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
3221 	  num++;
3222 	} else if (this->comp == AMBIGUOUS_COMP) {
3223 	  num++;
3224 	}
3225 #else
3226 	if (unknown_base(this->cdna) || unknown_base(this->genome)) {
3227 	  /* Comp must be a space */
3228 	  /* Don't count in numerator or denominator */
3229 	} else {
3230 	  den++;
3231 	  if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
3232 	    num++;
3233 	  } else if (this->comp == AMBIGUOUS_COMP) {
3234 	    den--;
3235 	  }
3236 	}
3237 #endif
3238 
3239       }
3240     }
3241 
3242     if (this->cdna != ' ') {
3243       last_querypos = this->querypos;
3244     }
3245     if (this->genome != ' ') {
3246       last_genomepos = this->genomepos;
3247     }
3248   }
3249 
3250   /* prev = this; */
3251   exon_queryend = last_querypos + 1;
3252   exon_genomeend = last_genomepos + 1;
3253 
3254   if (den == 0) {
3255     pctidentity = 100;
3256   } else {
3257     pctidentity = (int) floor(100.0*(double) num/(double) den);
3258   }
3259 
3260   if (cds_p == true) {
3261     print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3262 		   exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,exon_phase);
3263 
3264   } else if (gff_gene_format_p == true) {
3265     print_gff3_exon(fp,++exonno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3266 		    exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity);
3267   } else {
3268     if (Mlength > 0) {
3269       sprintf(token,"M%d",Mlength);
3270       tokens = push_token(tokens,token);
3271     } else if (Ilength > 0) {
3272       sprintf(token,"I%d",Ilength);
3273       tokens = push_token(tokens,token);
3274     } else if (Dlength > 0) {
3275       sprintf(token,"D%d",Dlength);
3276       tokens = push_token(tokens,token);
3277     }
3278     if (gff_estmatch_format_p == true) {
3279       estmatch_querystart = pairs->querypos + 1;
3280       estmatch_queryend = exon_queryend;
3281       estmatch_genomestart = pairs->genomepos + 1;
3282       estmatch_genomeend = exon_genomeend;
3283       if (watsonp) {
3284 	tokens = List_reverse(tokens);
3285       }
3286       print_gff3_est_match(fp,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,
3287 			   estmatch_genomestart,estmatch_genomeend,
3288 			   estmatch_querystart,estmatch_queryend,
3289 			   querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3290 			   watsonp,cdna_direction,pctidentity,tokens);
3291     } else {
3292       tokens = List_reverse(tokens);
3293       /* ++exonno; */
3294       print_gff3_cdna_match(fp,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3295 			    exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,tokens);
3296     }
3297     List_free_out(&tokens);
3298   }
3299 
3300   return;
3301 }
3302 
3303 static void
print_gff3_exons_backward(Filestring_T fp,struct T * pairs,int npairs,int pathnum,char * sourcename,char * accession,char * fasta_annotation,char * chrstring,bool watsonp,int cdna_direction,bool gff_introns_p,bool cds_p)3304 print_gff3_exons_backward (Filestring_T fp, struct T *pairs, int npairs, int pathnum,
3305 			   char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
3306 			   bool watsonp, int cdna_direction, bool gff_introns_p, bool cds_p) {
3307   bool in_exon = false;
3308   struct T *ptr, *this = NULL;
3309   int exon_querystart = -1, exon_queryend, exon_phase = 0;
3310   Chrpos_T exon_genomestart = 0, exon_genomeend;
3311   int pctidentity, num = 0, den = 0, exonno = 0, cdsno = 0, starti, endi, last_valid_i, i;
3312 #if 0
3313   int intronno = 0;
3314   Chrpos_T intron_start, intron_end;
3315 #endif
3316   int last_querypos = -1;
3317   Chrpos_T last_genomepos = (Chrpos_T) -1;
3318 
3319   starti = 0;
3320   if (cds_p == false) {
3321     endi = npairs - 1;
3322 
3323   } else if (cdstype == CDS_CDNA) {
3324     i = npairs - 1;
3325     endi = npairs;
3326     while (i >= 0) {
3327       if (pairs[i].gapp == true) {
3328 	i--;
3329       } else if (pairs[i].cdna == ' ') {
3330 	i--;
3331       } else if (pairs[i].aaphase_e == -1) {
3332 	i--;
3333       } else {
3334 	debug7(printf("BACKWARD: Setting endi to be %d\n",i));
3335 	endi = i;
3336 	last_valid_i = i;
3337 	while (i >= 0) {
3338 	  if (pairs[i].gapp == true) {
3339 	    i--;
3340 	  } else if (pairs[i].cdna == ' ') {
3341 	    i--;
3342 	  } else if (pairs[i].aaphase_e != -1) {
3343 	    last_valid_i = i;
3344 	    i--;
3345 	  } else {
3346 	    debug7(printf("BACKWARD: Saw aaphase_e of -1 at pair %d\n",i));
3347 	    starti = last_valid_i; /* inclusive */
3348 	    i = -1;
3349 	  }
3350 	}
3351       }
3352     }
3353 
3354   } else if (cdstype == CDS_GENOMIC) {
3355     i = npairs - 1;
3356     endi = npairs;
3357     while (i >= 0) {
3358       if (pairs[i].gapp == true) {
3359 	i--;
3360       } else if (pairs[i].genome == ' ') {
3361 	i--;
3362       } else if (pairs[i].aaphase_g == -1) {
3363 	i--;
3364       } else {
3365 	debug7(printf("BACKWARD: Setting endi to be %d\n",i));
3366 	endi = i;
3367 	last_valid_i = i;
3368 	while (i >= 0) {
3369 	  if (pairs[i].gapp == true) {
3370 	    i--;
3371 	  } else if (pairs[i].genome == ' ') {
3372 	    i--;
3373 	  } else if (pairs[i].aaphase_g != -1) {
3374 	    last_valid_i = i;
3375 	    i--;
3376 	  } else {
3377 	    debug7(printf("BACKWARD: Saw aaphase_g of -1 at pair %d\n",i));
3378 	    starti = last_valid_i; /* inclusive */
3379 	    i = -1;
3380 	  }
3381 	}
3382       }
3383     }
3384 
3385   } else {
3386     fprintf(stderr,"Do not recognize cdstype %d\n",cdstype);
3387     abort();
3388   }
3389 
3390   debug7(Pair_dump_array(pairs,npairs,true));
3391 
3392   if (cds_p == true && endi >= npairs) {
3393     /* Want CDS, and none seen */
3394     return;
3395   }
3396 
3397   ptr = &(pairs[endi]);
3398   for (i = endi; i >= starti; i--) {
3399     /* prev = this; */
3400     this = ptr--;
3401 
3402     if (this->gapp) {
3403       if (in_exon == true) {
3404 	exon_queryend = last_querypos + 1;
3405 	exon_genomeend = last_genomepos + 1;
3406 
3407 	if (den == 0) {
3408 	  pctidentity = 100;
3409 	} else {
3410 	  pctidentity = (int) floor(100.0*(double) num/(double) den);
3411 	}
3412 
3413 	if (cds_p == true) {
3414 	  print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3415 			 exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,exon_phase);
3416 
3417 	} else {
3418 	  print_gff3_exon(fp,++exonno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3419 			  exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity);
3420 
3421 	}
3422 
3423 	in_exon = false;
3424       }
3425     } else if (this->comp == INTRONGAP_COMP) {
3426       /* Do nothing */
3427     } else {
3428       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
3429 	 SHORTGAP_COMP, or MISMATCH_COMP */
3430       if (in_exon == false) {
3431 	exon_querystart = this->querypos + 1;
3432 	exon_genomestart = this->genomepos + 1;
3433 #if 0
3434 	if (this->aaphase_e != -1) {
3435 	  /* Otherwise, if phase is -1 from an indel, use previous exon_phase.  Should be fixed now */
3436 	  exon_phase = this->aaphase_e;
3437 	}
3438 #else
3439 	if (cdstype == CDS_CDNA) {
3440 	  exon_phase = this->aaphase_e;
3441 	} else {
3442 	  exon_phase = this->aaphase_g;
3443 	}
3444 #endif
3445 
3446 	if (gff_introns_p == true) {
3447 	  if (i > 0) {
3448 #if 0
3449 	    printf_gff3_intron(++intronno,pathnum,sourcename,accession,chrstring,?,?,intron_start,intron_end,watsonp);
3450 #endif
3451 	  }
3452 	  PUTC('\n',fp);
3453 	}
3454 
3455 	num = den = 0;
3456 	in_exon = true;
3457       }
3458       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
3459 	/* Previously not counted in numerator or denominator */
3460 	den++;
3461 
3462 #ifndef PMAP
3463       } else if (unknown_base(this->cdna) || unknown_base(this->genome)) {
3464 	/* Comp must be a space */
3465 	/* Don't count in numerator or denominator */
3466 #endif
3467       } else {
3468 	den++;
3469 	if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
3470 	  num++;
3471 	} else if (this->comp == AMBIGUOUS_COMP) {
3472 #ifdef PMAP
3473 	  num++;
3474 #else
3475 	  den--;
3476 #endif
3477 	}
3478       }
3479     }
3480     if (this->cdna != ' ') {
3481       last_querypos = this->querypos;
3482     }
3483     if (this->genome != ' ') {
3484       last_genomepos = this->genomepos;
3485     }
3486   }
3487 
3488   /* prev = this; */
3489   exon_queryend = last_querypos + 1;
3490   exon_genomeend = last_genomepos + 1;
3491 
3492   if (den == 0) {
3493     pctidentity = 100;
3494   } else {
3495     pctidentity = (int) floor(100.0*(double) num/(double) den);
3496   }
3497 
3498   if (cds_p == true) {
3499     print_gff3_cds(fp,++cdsno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3500 		   exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity,exon_phase);
3501   } else {
3502     print_gff3_exon(fp,++exonno,pathnum,sourcename,accession,fasta_annotation,chrstring,exon_genomestart,exon_genomeend,
3503 		    exon_querystart,exon_queryend,watsonp,cdna_direction,pctidentity);
3504   }
3505 
3506   return;
3507 }
3508 
3509 
#if 0
/* Replaced by print_gff3_exons_forward */
/* Disabled.  Scans the pairs array from first to last element and
   prints one CDS line per maximal run of pairs whose aaphase_e is not
   -1.  The pair that opens a run sets the start coordinates and phase
   but is not itself counted toward percent identity. */
static void
print_gff3_cdss_forward (Filestring_T fp, struct T *pairs, int npairs, int pathnum,
			 char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
			 bool watsonp, int cdna_direction) {
  struct T *pair;
  bool cds_open_p = false;
  int query_first = -1, query_last, phase;
  Chrpos_T genome_first = 0, genome_last;
  int percent, nmatches = 0, ncounted = 0, ncds = 0;
  int prev_querypos = -1;
  Chrpos_T prev_genomepos = (Chrpos_T) -1;
  int k;

  for (k = 0; k < npairs; k++) {
    pair = &(pairs[k]);

    if (cds_open_p == false) {
      if (pair->aaphase_e != -1) {
        /* Start of cds */
        query_first = pair->querypos + 1;
        phase = pair->aaphase_e; /* ? was aaphase_g */
        genome_first = pair->genomepos + 1;

        nmatches = ncounted = 0;
        cds_open_p = true;
      }
      /* Otherwise, continuation of non-cds */

    } else if (pair->aaphase_e == -1) { /* was aaphase_g */
      /* End of cds */
      query_last = prev_querypos + 1;
      genome_last = prev_genomepos + 1;
      percent = (ncounted == 0) ? 100 : (int) floor(100.0*(double) nmatches/(double) ncounted);

      print_gff3_cds(fp,++ncds,pathnum,sourcename,accession,fasta_annotation,chrstring,
                     genome_first,genome_last,query_first,query_last,
                     watsonp,cdna_direction,percent,phase);

      cds_open_p = false;

    } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
      /* Continuation of cds */
      /* Previously not counted in numerator or denominator */
      ncounted++;

#ifdef PMAP
    } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP ||
               pair->comp == AMBIGUOUS_COMP) {
      ncounted++;
      nmatches++;
#else
    } else if (unknown_base(pair->cdna) || unknown_base(pair->genome)) {
      /* Comp must be a space */
      /* Don't count in numerator or denominator */

    } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
      ncounted++;
      nmatches++;

    } else if (pair->comp == AMBIGUOUS_COMP) {
      /* Excluded from both counts */
#endif
    } else {
      ncounted++;
    }

    if (pair->cdna != ' ') {
      prev_querypos = pair->querypos;
    }
    if (pair->genome != ' ') {
      prev_genomepos = pair->genomepos;
    }
  }

  if (cds_open_p == true) {
    /* The run was still open when the array ended */
    query_last = prev_querypos + 1;
    genome_last = prev_genomepos + 1;
    percent = (ncounted == 0) ? 100 : (int) floor(100.0*(double) nmatches/(double) ncounted);

    print_gff3_cds(fp,++ncds,pathnum,sourcename,accession,fasta_annotation,chrstring,
                   genome_first,genome_last,query_first,query_last,
                   watsonp,cdna_direction,percent,phase);
  }

  return;
}
#endif
3612 
3613 
#if 0
/* Replaced by print_gff3_exons_backward */
/* Disabled.  Scans the pairs array from last to first element and
   prints one CDS line per maximal run of pairs whose aaphase_e is not
   -1.  The pair that opens a run sets the start coordinates and phase
   but is not itself counted toward percent identity. */
static void
print_gff3_cdss_backward (Filestring_T fp, struct T *pairs, int npairs, int pathnum,
			  char *sourcename, char *accession, char *fasta_annotation, char *chrstring,
			  bool watsonp, int cdna_direction) {
  struct T *pair;
  bool cds_open_p = false;
  int query_first = -1, query_last, phase;
  Chrpos_T genome_first = 0, genome_last;
  int percent, nmatches = 0, ncounted = 0, ncds = 0;
  int prev_querypos = -1;
  Chrpos_T prev_genomepos = (Chrpos_T) -1;
  int k;

  for (k = npairs - 1; k >= 0; k--) {
    pair = &(pairs[k]);

    if (cds_open_p == false) {
      if (pair->aaphase_e != -1) { /* was aaphase_g */
        /* Start of cds */
        query_first = pair->querypos + 1;
        phase = pair->aaphase_e; /* ? was aaphase_g */
        genome_first = pair->genomepos + 1;

        nmatches = ncounted = 0;
        cds_open_p = true;
      }
      /* Otherwise, continuation of non-cds */

    } else if (pair->aaphase_e == -1) { /* was aaphase_g */
      /* End of cds */
      query_last = prev_querypos + 1;
      genome_last = prev_genomepos + 1;
      percent = (ncounted == 0) ? 100 : (int) floor(100.0*(double) nmatches/(double) ncounted);

      print_gff3_cds(fp,++ncds,pathnum,sourcename,accession,fasta_annotation,chrstring,
                     genome_first,genome_last,query_first,query_last,
                     watsonp,cdna_direction,percent,phase);

      cds_open_p = false;

    } else if (pair->comp == INDEL_COMP || pair->comp == SHORTGAP_COMP) {
      /* Continuation of cds */
      /* Previously not counted in numerator or denominator */
      ncounted++;

#ifdef PMAP
    } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP ||
               pair->comp == AMBIGUOUS_COMP) {
      ncounted++;
      nmatches++;
#else
    } else if (unknown_base(pair->cdna) || unknown_base(pair->genome)) {
      /* Comp must be a space */
      /* Don't count in numerator or denominator */

    } else if (pair->comp == MATCH_COMP || pair->comp == DYNPROG_MATCH_COMP) {
      ncounted++;
      nmatches++;

    } else if (pair->comp == AMBIGUOUS_COMP) {
      /* Excluded from both counts */
#endif
    } else {
      ncounted++;
    }

    if (pair->cdna != ' ') {
      prev_querypos = pair->querypos;
    }
    if (pair->genome != ' ') {
      prev_genomepos = pair->genomepos;
    }
  }

  if (cds_open_p == true) {
    /* The run was still open when the array was exhausted */
    query_last = prev_querypos + 1;
    genome_last = prev_genomepos + 1;
    percent = (ncounted == 0) ? 100 : (int) floor(100.0*(double) nmatches/(double) ncounted);

    print_gff3_cds(fp,++ncds,pathnum,sourcename,accession,fasta_annotation,chrstring,
                   genome_first,genome_last,query_first,query_last,
                   watsonp,cdna_direction,percent,phase);
  }

  return;
}
#endif
3718 
3719 
3720 void
Pair_print_gff3(Filestring_T fp,struct T * pairs,int npairs,int pathnum,char * accession,char * fasta_annotation,T start,T end,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,int translation_end,int querylength_given,int skiplength,int matches,int mismatches,int qindels,int tindels,int unknowns,bool watsonp,int cdna_direction,bool gff_gene_format_p,bool gff_estmatch_format_p,char * sourcename)3721 Pair_print_gff3 (Filestring_T fp, struct T *pairs, int npairs, int pathnum, char *accession, char *fasta_annotation,
3722 		 T start, T end, Chrnum_T chrnum, Univ_IIT_T chromosome_iit, Sequence_T usersegment,
3723 		 int translation_end, int querylength_given, int skiplength, int matches, int mismatches,
3724 		 int qindels, int tindels, int unknowns, bool watsonp, int cdna_direction,
3725 		 bool gff_gene_format_p, bool gff_estmatch_format_p, char *sourcename) {
3726   char *chrstring = NULL;
3727   Chrpos_T chrpos1, chrpos2;
3728 
3729   if (chrnum == 0) {
3730     chrstring = Sequence_accession(usersegment);
3731   } else {
3732     chrstring = Chrnum_to_string(chrnum,chromosome_iit);
3733   }
3734 
3735   if (sourcename == NULL) {
3736     sourcename = "NA";
3737   }
3738 
3739   if (gff_gene_format_p == true) {
3740     chrpos1 = start->genomepos;
3741     chrpos2 = end->genomepos;
3742 
3743     print_gff3_gene(fp,pathnum,sourcename,accession,fasta_annotation,chrstring,chrpos1+1,chrpos2+1,watsonp,cdna_direction);
3744     print_gff3_mrna(fp,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,chrpos1+1,chrpos2+1,
3745 		    querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3746 		    watsonp,cdna_direction);
3747 
3748     if (cdna_direction >= 0) {
3749       print_gff3_exons_forward(fp,pairs,npairs,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,
3750 			       querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3751 			       watsonp,cdna_direction,/*gff_introns_p*/false,/*gff_gene_format_p*/true,
3752 			       /*gff_estmatch_format_p*/false,/*cds_p*/false);
3753       if (translation_end > 0) {
3754 #if 0
3755 	print_gff3_cdss_forward(fp,pairs,npairs,pathnum,sourcename,accession,fasta_annotation,chrstring,watsonp,
3756 				cdna_direction);
3757 #else
3758 	print_gff3_exons_forward(fp,pairs,npairs,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,
3759 				 querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3760 				 watsonp,cdna_direction,/*gff_introns_p*/false,/*gff_gene_format_p*/false,
3761 				 /*gff_estmatch_format_p*/false,/*cds_p*/true);
3762 #endif
3763       }
3764     } else {
3765       print_gff3_exons_backward(fp,pairs,npairs,pathnum,sourcename,accession,fasta_annotation,chrstring,watsonp,
3766 				cdna_direction,/*gff_introns_p*/false,/*cds_p*/false);
3767       if (translation_end > 0) {
3768 #if 0
3769 	print_gff3_cdss_backward(fp,pairs,npairs,pathnum,sourcename,accession,reestofheader,chrstring,watsonp,
3770 				 cdna_direction);
3771 #else
3772 	print_gff3_exons_backward(fp,pairs,npairs,pathnum,sourcename,accession,fasta_annotation,chrstring,watsonp,
3773 				  cdna_direction,/*gff_introns_p*/false,/*cds_p*/true);
3774 #endif
3775       }
3776     }
3777 
3778   } else {
3779     print_gff3_exons_forward(fp,pairs,npairs,pathnum,start,end,sourcename,accession,fasta_annotation,chrstring,
3780 			     querylength_given,skiplength,matches,mismatches,qindels,tindels,unknowns,
3781 			     watsonp,cdna_direction,/*gff_introns_p*/false,/*gff_gene_format_p*/false,
3782 			     gff_estmatch_format_p,/*cds_p*/false);
3783   }
3784 
3785   if (gff3_separators_p == true) {
3786     FPRINTF(fp,"###\n");		/* Terminates alignment */
3787   }
3788 
3789   if (chrnum != 0) {
3790     FREE(chrstring);
3791   }
3792 
3793   return;
3794 }
3795 
3796 
3797 /* Don't want to use SOFT_CLIPS_AVOID_CIRCULARIZATION, because the
3798    pairs array already contains the trim information */
3799 int
Pair_circularpos(int * alias,struct T * pairs,int npairs,Chrpos_T chrlength,bool plusp,int querylength)3800 Pair_circularpos (int *alias, struct T *pairs, int npairs, Chrpos_T chrlength, bool plusp, int querylength) {
3801   Chrpos_T low, high;
3802   struct T *ptr;
3803   int i, ninsertions, querypos;
3804   /* Univcoord_T chrhigh; */
3805 
3806   debug12(Pair_dump_array(pairs,npairs,true));
3807 
3808   /* chrhigh = chrlength + chrlength; */
3809   if (plusp == true) {
3810     low = pairs[0].genomepos;	/* includes "trim_left" */
3811     high = pairs[npairs-1].genomepos; /* includes "trim_right" */
3812     debug12(printf("plus: low %u, high %u, chrlength %u\n",low,high,chrlength));
3813 
3814     if (low >= chrlength) {
3815       /* All of read after trimming is in circular alias */
3816 #if 0
3817       if (high > chrhigh) {    /* Differs from code in stage3hr.c */
3818 	*alias = +2;		/* Extends beyond end of second copy */
3819       } else {
3820 	*alias = +1;		/* All of read is in second copy */
3821       }
3822 #else
3823       *alias = +1;
3824 #endif
3825       debug12(printf("Returning -1 with alias %d\n",*alias));
3826       return -1;
3827 
3828     } else if (high < chrlength) {
3829       /* All of read after trimming is in circular proper */
3830 #if 0
3831       if (low < (Chrpos_T) trim_left) {
3832 	*alias = -2;		/* Extends beyond beginning of first copy */
3833       } else {
3834 	*alias = -1;		/* All of read is in first copy */
3835       }
3836 #else
3837       *alias = -1;
3838 #endif
3839       debug12(printf("Returning -1 with alias %d\n",*alias));
3840       return -1;
3841 
3842     } else {
3843       /* Some of read is in circular proper and some is in circular alias */
3844       i = 0;
3845       ptr = pairs;
3846       ninsertions = 0;
3847 
3848       while (i++ < npairs && ptr->genomepos <= chrlength) { /* Needs to be <= for plus, < for minus */
3849 	querypos = ptr->querypos;
3850 	if (ptr->genome == ' ' && ptr->gapp == false) {
3851 	  ninsertions += 1;
3852 	}
3853 	ptr++;
3854       }
3855 
3856       *alias = 0;
3857       debug12(printf("Returning %d with no alias\n",(querypos - ninsertions)));
3858       return querypos - ninsertions;
3859     }
3860 
3861   } else {
3862     low = pairs[npairs-1].genomepos; /* includes "trim_right" */
3863     high = pairs[0].genomepos; /* includes "trim_left" */
3864     debug12(printf("minus: low %u, high %u\n",low,high));
3865 
3866     if (low >= chrlength) {
3867       /* All of read after trimming is in circular alias */
3868 #if 0
3869       if (high > chrhigh) {    /* Differs from code in stage3hr.c */
3870 	*alias = +2;		/* Extends beyond end of second copy */
3871       } else {
3872 	*alias = +1;		/* All of read is in second copy */
3873       }
3874 #else
3875       *alias = +1;
3876 #endif
3877       debug12(printf("Returning -1 with alias %d\n",*alias));
3878       return -1;
3879 
3880     } else if (high < chrlength) {
3881       /* All of read after trimming is in circular proper */
3882 #if 0
3883       if (low < (Chrpos_T) trim_right) {
3884 	*alias = -2;		/* Extends beyond beginning of first copy */
3885       } else {
3886 	*alias = -1;		/* All of read is in first copy */
3887       }
3888 #else
3889       *alias = -1;
3890 #endif
3891       debug12(printf("Returning -1 with alias %d\n",*alias));
3892       return -1;
3893 
3894     } else {
3895       /* Some of read is in circular proper and some is in circular alias */
3896       i = npairs - 1;
3897       ptr = &(pairs[i]);
3898       ninsertions = 0;
3899 
3900       while (--i >= 0 && ptr->genomepos < chrlength) { /* Needs to be <= for plus, < for minus */
3901 	querypos = ptr->querypos;
3902 	if (ptr->genome == ' ' && ptr->gapp == false) {
3903 	  ninsertions += 1;
3904 	}
3905 	--ptr;
3906       }
3907 
3908       *alias = 0;
3909       debug12(printf("Returning %d with no alias\n",(querylength - querypos - ninsertions)));
3910       return (querylength - querypos - ninsertions);
3911     }
3912   }
3913 }
3914 
3915 
3916 #ifndef PMAP
3917 void
Pair_print_bedpe(Filestring_T fp,struct T * pairarray,int npairs,Chrnum_T chrnum,bool watsonp,Univ_IIT_T chromosome_iit)3918 Pair_print_bedpe (Filestring_T fp, struct T *pairarray, int npairs,
3919 		  Chrnum_T chrnum, bool watsonp, Univ_IIT_T chromosome_iit) {
3920   bool in_exon = true;
3921   struct T *ptr, *ptr0, *this = NULL, *start;
3922   Chrpos_T exon_genomestart = 0, exon_genomeend;
3923   int nindels, i;
3924   /* int last_querypos = -1; */
3925   Chrpos_T last_genomepos = (Chrpos_T) -1;
3926   char *chr, strand;
3927   bool allocp;
3928 
3929 
3930 #if 0
3931   if (invertedp == true) {
3932     pairs = invert_and_revcomp_path_and_coords(pairs_querydir,npairs,querylength);
3933     watsonp = !watsonp;
3934     cdna_direction = -cdna_direction;
3935   } else {
3936     pairs = pairs_querydir;
3937   }
3938 #endif
3939 
3940 
3941   chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
3942   if (watsonp == true) {
3943     strand = '+';
3944   } else {
3945     strand = '-';
3946   }
3947 
3948 
3949   ptr = pairarray;
3950   /* exon_querystart = ptr->querypos + 1; */
3951   exon_genomestart = ptr->genomepos + 1;
3952 
3953 
3954   i = 0;
3955   while (i < npairs) {
3956     /* prev = this; */
3957     this = ptr++;
3958     i++;
3959 
3960     if (this->gapp) {
3961       if (in_exon == true) {
3962 	/* SPLICE START */
3963 	ptr0 = ptr;
3964 	while (ptr0->gapp) {
3965 	  ptr0++;
3966 	}
3967 	/* exon_queryend = last_querypos + 1; */
3968 	exon_genomeend = last_genomepos + 1;
3969 
3970 	in_exon = false;
3971       }
3972     } else if (this->comp == INTRONGAP_COMP) {
3973       /* May want to print dinucleotides */
3974 
3975     } else {
3976       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
3977 	 SHORTGAP_COMP, or MISMATCH_COMP */
3978       if (in_exon == false) {
3979 	/* SPLICE CONTINUATION */
3980 	/* exon_querystart = this->querypos + 1; */
3981 	exon_genomestart = this->genomepos + 1;
3982 
3983 	in_exon = true;
3984 	if (strand == '+') {
3985 	  FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
3986 	  FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
3987 	  FPRINTF(fp,"DELETION\t0\t");
3988 	  FPRINTF(fp,"+\t+\t");
3989 	  FPRINTF(fp,"%d\n",exon_genomestart - exon_genomeend - 1);
3990 	} else {
3991 	  FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
3992 	  FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
3993 	  FPRINTF(fp,"DELETION\t0\t");
3994 	  FPRINTF(fp,"+\t+\t");
3995 	  FPRINTF(fp,"%d\n",exon_genomeend - exon_genomestart - 1);
3996 	}
3997       }
3998 
3999       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
4000 	if (this->genome == ' ') {
4001 	  /* INSERTION */
4002 	  /* exon_queryend = last_querypos + 1; */
4003 	  exon_genomeend = last_genomepos + 1;
4004 
4005 	  /* indel_pos = this->querypos; */
4006 	  start = this;
4007 	  nindels = 0;
4008 	  while (i < npairs && this->gapp == false && this->genome == ' ') {
4009 	    nindels++;
4010 	    this = ptr++;
4011 	    i++;
4012 	  }
4013 	  if (i < npairs) {
4014 	    ptr--;
4015 	    i--;
4016 	    this = ptr;
4017 	  }
4018 
4019 	  /* exon_querystart = this->querypos + 1; */
4020 	  exon_genomestart = this->genomepos + 1;
4021 
4022 	  if (strand == '+') {
4023 	    FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
4024 	    FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
4025 	    FPRINTF(fp,"INSERTION\t0\t");
4026 	    FPRINTF(fp,"+\t+\t");
4027 	    while (start < this) {
4028 	      FPRINTF(fp,"%c",start->cdna);
4029 	      start++;
4030 	    }
4031 	  } else {
4032 	    FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
4033 	    FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
4034 	    FPRINTF(fp,"INSERTION\t0\t");
4035 	    FPRINTF(fp,"+\t+\t");
4036 	    while (start < this) {
4037 	      FPRINTF(fp,"%c",complCode[(int) start->cdna]);
4038 	      start++;
4039 	    }
4040 	  }
4041 	  FPRINTF(fp,"\n");
4042 
4043 	} else if (this->cdna == ' ') {
4044 	  /* DELETION */
4045 	  /* exon_queryend = last_querypos + 1; */
4046 	  exon_genomeend = last_genomepos + 1;
4047 
4048 	  /* indel_pos = this->querypos; */
4049 	  nindels = 0;
4050 	  while (i < npairs && this->gapp == false && this->cdna == ' ') {
4051 	    nindels++;
4052 	    this = ptr++;
4053 	    i++;
4054 	  }
4055 	  if (i < npairs) {
4056 	    ptr--;
4057 	    i--;
4058 	    this = ptr;
4059 	  }
4060 
4061 	  /* exon_querystart = this->querypos + 1; */
4062 	  exon_genomestart = this->genomepos + 1;
4063 
4064 	  if (strand == '+') {
4065 	    FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
4066 	    FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
4067 	  } else {
4068 	    FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomestart-1,exon_genomestart);
4069 	    FPRINTF(fp,"%s\t%u\t%u\t",chr,exon_genomeend-1,exon_genomeend);
4070 	  }
4071 	  FPRINTF(fp,"DELETION\t0\t");
4072 	  FPRINTF(fp,"+\t+\t");
4073 	  FPRINTF(fp,"%d\n",nindels);
4074 
4075 	} else {
4076 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
4077 	  exit(9);
4078 	}
4079 
4080       } else {
4081 	/* Match or mismatch */
4082       }
4083     }
4084 
4085 #if 0
4086     if (this->cdna != ' ') {
4087       last_querypos = this->querypos;
4088     }
4089 #endif
4090     if (this->genome != ' ') {
4091       last_genomepos = this->genomepos;
4092     }
4093   }
4094 
4095   if (allocp) {
4096     FREE(chr);
4097   }
4098 
4099 #if 0
4100   if (invertedp == true) {
4101     FREE(pairs);
4102   }
4103 #endif
4104 
4105   return;
4106 }
4107 #endif
4108 
4109 
4110 
4111 #ifdef GSNAP
4112 static double
blast_bitscore(int alignlength,int nmismatches)4113 blast_bitscore (int alignlength, int nmismatches) {
4114   double k = 0.1;
4115   double lambda = 1.58;		/* For a +1, -1 scoring scheme */
4116   double score;
4117 
4118   score = (double) ((alignlength - nmismatches) /* scored as +1 */ - nmismatches /* scored as -1 */);
4119   return (score * lambda - log(k)) / log(2.0);
4120 }
4121 
4122 
4123 static void
print_m8_line(Filestring_T fp,int exon_querystart,int exon_queryend,char * chr,Chrpos_T exon_genomestart,Chrpos_T exon_genomeend,int nmismatches_bothdiff,Shortread_T headerseq,char * acc_suffix)4124 print_m8_line (Filestring_T fp, int exon_querystart, int exon_queryend,
4125 	       char *chr, Chrpos_T exon_genomestart, Chrpos_T exon_genomeend,
4126 	       int nmismatches_bothdiff, Shortread_T headerseq, char *acc_suffix) {
4127   double identity;
4128   int alignlength_trim;
4129 
4130   FPRINTF(fp,"%s%s",Shortread_accession(headerseq),acc_suffix); /* field 0: accession */
4131 
4132   FPRINTF(fp,"\t%s",chr);	/* field 1: chr */
4133 
4134   /* field 2: identity */
4135   alignlength_trim = exon_queryend - exon_querystart;
4136   identity = (double) (alignlength_trim - nmismatches_bothdiff)/(double) alignlength_trim;
4137   FPRINTF(fp,"\t%.1f",100.0*identity);
4138 
4139 
4140   FPRINTF(fp,"\t%d",alignlength_trim); /* field 3: query length */
4141 
4142   FPRINTF(fp,"\t%d",nmismatches_bothdiff); /* field 4: nmismatches */
4143 
4144   FPRINTF(fp,"\t0");		/* field 5: gap openings */
4145 
4146   /* fields 6 and 7: query start and end */
4147   FPRINTF(fp,"\t%d\t%d",exon_querystart,exon_queryend);
4148 
4149   /* fields 8 and 9: chr start and end */
4150   FPRINTF(fp,"\t%u\t%u",exon_genomestart,exon_genomeend);
4151 
4152   /* field 10: E value */
4153   FPRINTF(fp,"\t%.2g",blast_evalue(alignlength_trim,nmismatches_bothdiff));
4154 
4155  /* field 11: bit score */
4156   FPRINTF(fp,"\t%.1f",blast_bitscore(alignlength_trim,nmismatches_bothdiff));
4157 
4158   FPRINTF(fp,"\n");
4159 
4160   return;
4161 }
4162 
4163 
4164 void
Pair_print_m8(Filestring_T fp,struct T * pairs_querydir,int npairs,bool invertedp,Chrnum_T chrnum,Shortread_T queryseq,Shortread_T headerseq,char * acc_suffix,Univ_IIT_T chromosome_iit)4165 Pair_print_m8 (Filestring_T fp, struct T *pairs_querydir, int npairs, bool invertedp,
4166 	       Chrnum_T chrnum, Shortread_T queryseq, Shortread_T headerseq,
4167 	       char *acc_suffix, Univ_IIT_T chromosome_iit) {
4168   bool in_exon = true;
4169   struct T *pairs, *ptr, *ptr0, *this = NULL;
4170   int exon_querystart = -1, exon_queryend;
4171   Chrpos_T exon_genomestart = 0, exon_genomeend;
4172   int nmismatches_refdiff, nmismatches_bothdiff, nmatches, i;
4173   int last_querypos = -1;
4174   Chrpos_T last_genomepos = (Chrpos_T) -1;
4175   char *chr;
4176   int querylength;
4177   bool allocp;
4178 
4179   querylength = Shortread_fulllength(queryseq);
4180 
4181   if (invertedp == true) {
4182     pairs = invert_and_revcomp_path_and_coords(pairs_querydir,npairs,querylength);
4183   } else {
4184     pairs = pairs_querydir;
4185   }
4186 
4187 
4188   chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
4189 
4190   ptr = pairs;
4191   exon_querystart = ptr->querypos + 1;
4192   exon_genomestart = ptr->genomepos + 1;
4193   nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
4194 
4195   i = 0;
4196   while (i < npairs) {
4197     this = ptr++;
4198     i++;
4199 
4200     if (this->gapp) {
4201       if (in_exon == true) {
4202 	/* SPLICE START */
4203 	ptr0 = ptr;
4204 	while (ptr0->gapp) {
4205 	  ptr0++;
4206 	}
4207 	exon_queryend = last_querypos + 1;
4208 	exon_genomeend = last_genomepos + 1;
4209 
4210 	print_m8_line(fp,exon_querystart,exon_queryend,chr,exon_genomestart,exon_genomeend,
4211 		      nmismatches_bothdiff,headerseq,acc_suffix);
4212 
4213 	nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
4214 
4215 	in_exon = false;
4216       }
4217     } else if (this->comp == INTRONGAP_COMP) {
4218       /* May want to print dinucleotides */
4219 
4220     } else {
4221       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
4222 	 SHORTGAP_COMP, or MISMATCH_COMP */
4223       if (in_exon == false) {
4224 	/* SPLICE CONTINUATION */
4225 	exon_querystart = this->querypos + 1;
4226 	exon_genomestart = this->genomepos + 1;
4227 
4228 	in_exon = true;
4229       }
4230       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
4231 	if (this->genome == ' ') {
4232 	  /* INSERTION */
4233 	  exon_queryend = last_querypos + 1;
4234 	  exon_genomeend = last_genomepos + 1;
4235 
4236 	  /* indel_pos = this->querypos; */
4237 	  while (i < npairs && this->gapp == false && this->genome == ' ') {
4238 	    this = ptr++;
4239 	    i++;
4240 	  }
4241 	  if (i < npairs) {
4242 	    ptr--;
4243 	    i--;
4244 
4245 	    this = ptr;
4246 	    exon_querystart = this->querypos + 1;
4247 	    exon_genomestart = this->genomepos + 1;
4248 	    nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
4249 	  }
4250 
4251 	} else if (this->cdna == ' ') {
4252 	  /* DELETION */
4253 	  exon_queryend = last_querypos + 1;
4254 	  exon_genomeend = last_genomepos + 1;
4255 
4256 	  /* indel_pos = this->querypos; */
4257 	  while (i < npairs && this->gapp == false && this->cdna == ' ') {
4258 	    this = ptr++;
4259 	    i++;
4260 	  }
4261 	  if (i < npairs) {
4262 	    ptr--;
4263 	    i--;
4264 	  }
4265 
4266 	  /* Finish rest of this line */
4267 	  print_m8_line(fp,exon_querystart,exon_queryend,chr,exon_genomestart,exon_genomeend,
4268 			nmismatches_bothdiff,headerseq,acc_suffix);
4269 
4270 	  if (i < npairs) {
4271 	    this = ptr;
4272 	    exon_querystart = this->querypos + 1;
4273 	    exon_genomestart = this->genomepos + 1;
4274 	    nmismatches_refdiff = nmismatches_bothdiff = nmatches = 0;
4275 	  }
4276 
4277 	} else {
4278 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
4279 	  exit(9);
4280 	}
4281 
4282       } else {
4283 	/* c = this->genome; */
4284 	if (this->genome == this->cdna) {
4285 	  nmatches++;
4286 	} else if (this->genomealt == this->cdna) {
4287 	  nmismatches_refdiff++;
4288 	} else {
4289 	  nmismatches_bothdiff++;
4290 	  nmismatches_refdiff++;
4291 	}
4292       }
4293     }
4294 
4295     if (this->cdna != ' ') {
4296       last_querypos = this->querypos;
4297     }
4298     if (this->genome != ' ') {
4299       last_genomepos = this->genomepos;
4300     }
4301   }
4302 
4303   exon_queryend = last_querypos + 1;
4304   exon_genomeend = last_genomepos + 1;
4305 
4306   print_m8_line(fp,exon_querystart,exon_queryend,chr,exon_genomestart,exon_genomeend,
4307 		nmismatches_bothdiff,headerseq,acc_suffix);
4308 
4309   if (allocp) {
4310     FREE(chr);
4311   }
4312 
4313   if (invertedp == true) {
4314     FREE(pairs);
4315   }
4316 
4317   return;
4318 }
4319 #endif
4320 
4321 
4322 #if 0
4323 double
4324 Pair_min_evalue (struct T *pairarray, int npairs) {
4325   double min_evalue = 1000.0, evalue;
4326   bool in_exon = true;
4327   struct T *ptr, *ptr0, *this = NULL;
4328   int alignlength_trim, exon_querystart = -1, exon_queryend;
4329   int nmismatches_bothdiff, i;
4330   int last_querypos = -1;
4331 
4332 
4333   ptr = pairarray;
4334   exon_querystart = ptr->querypos + 1;
4335   nmismatches_bothdiff = 0;
4336 
4337   i = 0;
4338   while (i < npairs) {
4339     this = ptr++;
4340     i++;
4341 
4342     if (this->gapp) {
4343       if (in_exon == true) {
4344 	/* SPLICE START */
4345 	ptr0 = ptr;
4346 	while (ptr0->gapp) {
4347 	  ptr0++;
4348 	}
4349 	exon_queryend = last_querypos + 1;
4350 
4351 	alignlength_trim = exon_queryend - exon_querystart;
4352 	assert(alignlength_trim >= 0);
4353 	if ((evalue = blast_evalue(alignlength_trim,nmismatches_bothdiff)) < min_evalue) {
4354 	  min_evalue = evalue;
4355 	}
4356 
4357 	nmismatches_bothdiff = 0;
4358 
4359 	in_exon = false;
4360       }
4361     } else if (this->comp == INTRONGAP_COMP) {
4362       /* May want to print dinucleotides */
4363 
4364     } else {
4365       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
4366 	 SHORTGAP_COMP, or MISMATCH_COMP */
4367       if (in_exon == false) {
4368 	/* SPLICE CONTINUATION */
4369 	exon_querystart = this->querypos + 1;
4370 
4371 	in_exon = true;
4372       }
4373       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
4374 	if (this->genome == ' ') {
4375 	  /* INSERTION */
4376 	  exon_queryend = last_querypos + 1;
4377 
4378 	  /* indel_pos = this->querypos; */
4379 	  while (i < npairs && this->gapp == false && this->genome == ' ') {
4380 	    this = ptr++;
4381 	    i++;
4382 	  }
4383 	  if (i < npairs) {
4384 	    ptr--;
4385 	    i--;
4386 	    this = ptr;
4387 	  }
4388 
4389 	  exon_querystart = this->querypos + 1;
4390 	  nmismatches_bothdiff = 0;
4391 
4392 	} else if (this->cdna == ' ') {
4393 	  /* DELETION */
4394 	  exon_queryend = last_querypos + 1;
4395 
4396 	  /* indel_pos = this->querypos; */
4397 	  while (i < npairs && this->gapp == false && this->cdna == ' ') {
4398 	    this = ptr++;
4399 	    i++;
4400 	  }
4401 	  if (i < npairs) {
4402 	    ptr--;
4403 	    i--;
4404 	    this = ptr;
4405 	  }
4406 
4407 	  /* Finish rest of this line */
4408 	  alignlength_trim = exon_queryend - exon_querystart;
4409 	  assert(alignlength_trim >= 0);
4410 	  if ((evalue = blast_evalue(alignlength_trim,nmismatches_bothdiff)) < min_evalue) {
4411 	    min_evalue = evalue;
4412 	  }
4413 
4414 	  exon_querystart = this->querypos + 1;
4415 	  nmismatches_bothdiff = 0;
4416 
4417 	} else {
4418 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
4419 	  exit(9);
4420 	}
4421 
4422       } else {
4423 	/* c = this->genome; */
4424 	if (this->genome == this->cdna) {
4425 	  /* nmatches++; */
4426 	} else if (this->genomealt == this->cdna) {
4427 	  /* nmismatches_refdiff++; */
4428 	} else {
4429 	  nmismatches_bothdiff++;
4430 	  /* nmismatches_refdiff++; */
4431 	}
4432       }
4433     }
4434 
4435     if (this->cdna != ' ') {
4436       last_querypos = this->querypos;
4437     }
4438   }
4439 
4440   exon_queryend = last_querypos + 1;
4441 
4442   alignlength_trim = exon_queryend - exon_querystart;
4443   assert(alignlength_trim >= 0);
4444   if ((evalue = blast_evalue(alignlength_trim,nmismatches_bothdiff)) < min_evalue) {
4445     min_evalue = evalue;
4446   }
4447 
4448   return min_evalue;
4449 }
4450 #endif
4451 
4452 
4453 /* Modified from print_endtypes */
4454 static void
splice_site_probs(double * sense_prob,double * antisense_prob,bool prev_splicesitep,bool splicesitep,Univcoord_T chroffset,int exon_genomestart,int exon_genomeend,bool watsonp)4455 splice_site_probs (double *sense_prob, double *antisense_prob,
4456 		   bool prev_splicesitep, bool splicesitep, Univcoord_T chroffset,
4457 		   int exon_genomestart, int exon_genomeend, bool watsonp) {
4458 
4459   if (prev_splicesitep == true) {
4460     if (watsonp == true) {
4461       /* printf("watsonp is true, so looking up acceptor/antidonor at %u+%u-1\n",chroffset,exon_genomestart); */
4462       *sense_prob += Maxent_hr_acceptor_prob(chroffset+exon_genomestart-1,chroffset);
4463       *antisense_prob += Maxent_hr_antidonor_prob(chroffset+exon_genomestart-1,chroffset);
4464     } else {
4465       /* printf("watsonp is false, so looking up antiacceptor/donor at %u+%u\n",chroffset,exon_genomestart); */
4466       *sense_prob += Maxent_hr_antiacceptor_prob(chroffset+exon_genomestart,chroffset);
4467       *antisense_prob += Maxent_hr_donor_prob(chroffset+exon_genomestart,chroffset);
4468     }
4469   }
4470 
4471   if (splicesitep == true) {
4472     if (watsonp == true) {
4473       /* printf("watsonp is true, so looking up donor/antiacceptor at %u+%u\n",chroffset,exon_genomeend); */
4474       *sense_prob += Maxent_hr_donor_prob(chroffset+exon_genomeend,chroffset);
4475       *antisense_prob += Maxent_hr_antiacceptor_prob(chroffset+exon_genomeend,chroffset);
4476     } else {
4477       /* printf("watsonp is false, so looking up antiacceptor/donor at %u+%u-1\n",chroffset,exon_genomeend); */
4478       *sense_prob += Maxent_hr_antidonor_prob(chroffset+exon_genomeend-1,chroffset);
4479       *antisense_prob += Maxent_hr_acceptor_prob(chroffset+exon_genomeend-1,chroffset);
4480     }
4481   }
4482   /* printf("sense %g, antisense %g\n",*sense_prob,*antisense_prob); */
4483 
4484   return;
4485 }
4486 
4487 
4488 /* Modified from Pair_print_gsnap */
4489 int
Pair_guess_cdna_direction_array(int * sensedir,struct T * pairs_querydir,int npairs,bool invertedp,Univcoord_T chroffset,bool watsonp)4490 Pair_guess_cdna_direction_array (int *sensedir, struct T *pairs_querydir, int npairs, bool invertedp,
4491 				 Univcoord_T chroffset, bool watsonp) {
4492   double sense_prob = 0.0, antisense_prob = 0.0;
4493   bool in_exon = true;
4494   struct T *pairs, *ptr, *this = NULL;
4495   int i;
4496   Chrpos_T exon_genomestart = 0, exon_genomeend;
4497   Chrpos_T last_genomepos = (Chrpos_T) -1;
4498   bool splicesitep, prev_splicesitep;
4499 
4500 
4501   if (invertedp == true) {
4502     fprintf(stderr,"Pair_guess_cdna_direction cannot handle invertedp\n");
4503     /* pairs = invert_and_revcomp_path_and_coords(pairs_querydir,npairs,querylength); */
4504     /* watsonp = !watsonp; */
4505     abort();
4506   } else {
4507     pairs = pairs_querydir;
4508   }
4509 
4510   if (pairs == NULL) {
4511     *sensedir = SENSE_NULL;
4512     return 0;
4513   } else {
4514     ptr = pairs;
4515     exon_genomestart = ptr->genomepos + 1;
4516     splicesitep = false;
4517   }
4518 
4519   i = 0;
4520   while (i < npairs) {
4521     this = ptr++;
4522     i++;
4523 
4524     if (this->gapp) {
4525       if (in_exon == true) {
4526 	/* SPLICE START */
4527 #if 0
4528 	ptr0 = ptr;
4529 	while (ptr0->gapp) {
4530 	  ptr0++;
4531 	}
4532 #endif
4533 	exon_genomeend = last_genomepos + 1;
4534 
4535 	prev_splicesitep = splicesitep;
4536 	splicesitep = true;
4537 
4538 	splice_site_probs(&sense_prob,&antisense_prob,
4539 			  prev_splicesitep,splicesitep,chroffset,
4540 			  exon_genomestart,exon_genomeend,watsonp);
4541 
4542 	in_exon = false;
4543       }
4544     } else if (this->comp == INTRONGAP_COMP) {
4545       /* May want to print dinucleotides */
4546 
4547     } else {
4548       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
4549 	 SHORTGAP_COMP, or MISMATCH_COMP */
4550       if (in_exon == false) {
4551 	/* SPLICE CONTINUATION */
4552 	exon_genomestart = this->genomepos + 1;
4553 	in_exon = true;
4554       }
4555       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
4556 	if (this->genome == ' ') {
4557 	  /* INSERTION */
4558 	  exon_genomeend = last_genomepos + 1;
4559 	  prev_splicesitep = splicesitep;
4560 	  splicesitep = false;
4561 
4562 	  while (i < npairs && this->gapp == false && this->genome == ' ') {
4563 	    this = ptr++;
4564 	    i++;
4565 	  }
4566 	  if (i < npairs) {
4567 	    ptr--;
4568 	    i--;
4569 	    this = ptr;
4570 	  }
4571 
4572 	  splice_site_probs(&sense_prob,&antisense_prob,
4573 			    prev_splicesitep,splicesitep,chroffset,
4574 			    exon_genomestart,exon_genomeend,watsonp);
4575 
4576 	  exon_genomestart = this->genomepos + 1;
4577 
4578 	} else if (this->cdna == ' ') {
4579 	  /* DELETION */
4580 	  exon_genomeend = last_genomepos + 1;
4581 	  prev_splicesitep = splicesitep;
4582 	  splicesitep = false;
4583 
4584 	  while (i < npairs && this->gapp == false && this->cdna == ' ') {
4585 	    this = ptr++;
4586 	    i++;
4587 	  }
4588 	  if (i < npairs) {
4589 	    ptr--;
4590 	    i--;
4591 	    this = ptr;
4592 	  }
4593 
4594 	  splice_site_probs(&sense_prob,&antisense_prob,
4595 			    prev_splicesitep,splicesitep,chroffset,
4596 			    exon_genomestart,exon_genomeend,watsonp);
4597 
4598 	  exon_genomestart = this->genomepos + 1;
4599 
4600 	} else {
4601 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
4602 	  exit(9);
4603 	}
4604 
4605       }
4606     }
4607 
4608     if (this->genome != ' ') {
4609       last_genomepos = this->genomepos;
4610     }
4611   }
4612 
4613   exon_genomeend = last_genomepos + 1;
4614   prev_splicesitep = splicesitep;
4615   splicesitep = false;
4616 
4617   splice_site_probs(&sense_prob,&antisense_prob,
4618 		    prev_splicesitep,splicesitep,chroffset,
4619 		    exon_genomestart,exon_genomeend,watsonp);
4620 
4621   if (invertedp == true) {
4622     FREE(pairs);
4623   }
4624 
4625   if (sense_prob > antisense_prob) {
4626     *sensedir = SENSE_FORWARD;
4627     return +1;
4628   } else if (sense_prob < antisense_prob) {
4629     *sensedir = SENSE_ANTI;
4630     return -1;
4631   } else {
4632     *sensedir = SENSE_NULL;
4633     return 0;
4634   }
4635 }
4636 
4637 
4638 #if 0
4639 static char
4640 get_genomic_nt_array (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
4641 		      bool watsonp) {
4642   char c2, c2_alt;
4643   Univcoord_T pos;
4644 
4645   if (watsonp) {
4646     if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
4647       *g_alt = '*';
4648       return '*';
4649 
4650     } else if (pos >= chrhigh) {
4651       *g_alt = '*';
4652       return '*';
4653 
4654     } else {
4655       return Genome_get_char_blocks(&(*g_alt),pos);
4656     }
4657 
4658   } else {
4659     /* coordinates already processed by Pair_set_genomepos */
4660     if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
4661       return '*';
4662 
4663     } else if (pos >= chrhigh) {
4664       return '*';
4665 
4666     } else {
4667       c2 = Genome_get_char_blocks(&c2_alt,pos);
4668     }
4669     *g_alt = complCode[(int) c2_alt];
4670     return complCode[(int) c2];
4671   }
4672 }
4673 #endif
4674 
4675 
4676 void
Pair_fix_cdna_direction_array(struct T * pairs_querydir,int npairs,int cdna_direction)4677 Pair_fix_cdna_direction_array (struct T *pairs_querydir, int npairs, int cdna_direction) {
4678   struct T *ptr, *this = NULL;
4679   int i;
4680 
4681   ptr = pairs_querydir;
4682   i = 0;
4683 
4684   while (i < npairs) {
4685     this = ptr++;
4686     i++;
4687 
4688     if (this->gapp && this->comp == NONINTRON_COMP) {
4689       if (cdna_direction > 0) {
4690 	switch (this->introntype) {
4691 	case GTAG_FWD: this->comp = FWD_CANONICAL_INTRON_COMP; break;
4692 	case GCAG_FWD: this->comp = FWD_GCAG_INTRON_COMP; break;
4693 	case ATAC_FWD: this->comp = FWD_ATAC_INTRON_COMP; break;
4694 	default: this->comp = NONINTRON_COMP;
4695 	}
4696 #ifndef PMAP
4697       } else if (cdna_direction < 0) {
4698 	switch (this->introntype) {
4699 	case ATAC_REV: this->comp = REV_ATAC_INTRON_COMP; break;
4700 	case GCAG_REV: this->comp = REV_GCAG_INTRON_COMP; break;
4701 	case GTAG_REV: this->comp = REV_CANONICAL_INTRON_COMP; break;
4702 	default: this->comp = NONINTRON_COMP; break;
4703 	}
4704 #endif
4705       }
4706     }
4707   }
4708 
4709   return;
4710 }
4711 
4712 
4713 
4714 int
Pair_gsnap_nsegments(int * total_nmismatches,int * total_nindels,int * nintrons,int * nindelbreaks,struct T * pairs,int npairs,int querylength)4715 Pair_gsnap_nsegments (int *total_nmismatches, int *total_nindels, int *nintrons,
4716 		      int *nindelbreaks, struct T *pairs, int npairs, int querylength) {
4717   int nsegments = 0;
4718   bool in_exon = true;
4719   struct T *ptr, *ptr0, *this = NULL;
4720   int i;
4721 
4722   ptr = pairs;
4723   *total_nindels = 0;
4724   *nintrons = 0;
4725   *nindelbreaks = 0;
4726 
4727   /* *total_nmismatches = 0; */
4728   *total_nmismatches = pairs[0].querypos + (querylength - pairs[npairs-1].querypos);
4729 
4730   i = 0;
4731   while (i < npairs) {
4732     this = ptr++;
4733     i++;
4734 
4735     if (this->gapp) {
4736       if (in_exon == true) {
4737 	/* SPLICE START */
4738 	ptr0 = ptr;
4739 	while (ptr0->gapp) {
4740 	  ptr0++;
4741 	}
4742 
4743 	(*nintrons) += 1;
4744 	nsegments++;
4745 
4746 	in_exon = false;
4747       }
4748     } else if (this->comp == INTRONGAP_COMP) {
4749       /* May want to print dinucleotides */
4750 
4751     } else {
4752       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
4753 	 SHORTGAP_COMP, or MISMATCH_COMP */
4754       if (in_exon == false) {
4755 	/* SPLICE CONTINUATION */
4756 	in_exon = true;
4757       }
4758       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
4759 	if (this->genome == ' ') {
4760 	  /* INSERTION */
4761 	  while (i < npairs && this->genome == ' ') {
4762 	    (*total_nindels) += 1;
4763 	    this = ptr++;
4764 	    i++;
4765 	  }
4766 	  if (i < npairs) {
4767 	    ptr--;
4768 	    i--;
4769 	  }
4770 
4771 	  (*nindelbreaks) += 1;
4772 	  nsegments++;
4773 
4774 	} else if (this->cdna == ' ') {
4775 	  /* DELETION */
4776 	  while (i < npairs && this->cdna == ' ') {
4777 	    (*total_nindels) += 1;
4778 	    this = ptr++;
4779 	    i++;
4780 	  }
4781 	  if (i < npairs) {
4782 	    ptr--;
4783 	    i--;
4784 	  }
4785 
4786 	  (*nindelbreaks) += 1;
4787 	  nsegments++;
4788 
4789 	} else {
4790 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
4791 	  exit(9);
4792 	}
4793 
4794       } else if (this->genome != this->cdna) {
4795 	(*total_nmismatches) += 1;
4796       }
4797     }
4798   }
4799 
4800   nsegments++;
4801 
4802   return nsegments;
4803 }
4804 
4805 
4806 
4807 /************************************************************************
4808  *   SAM
4809  ************************************************************************/
4810 
4811 /* Modeled after Shortread_print_chopped */
4812 static void
print_chopped(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)4813 print_chopped (Filestring_T fp, char *contents, int querylength,
4814 	       int hardclip_start, int hardclip_end) {
4815   int i;
4816 
4817   for (i = hardclip_start; i < querylength - hardclip_end; i++) {
4818     PUTC(contents[i],fp);
4819   }
4820   return;
4821 }
4822 
4823 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
4824 static void
print_chopped_revcomp(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)4825 print_chopped_revcomp (Filestring_T fp, char *contents, int querylength,
4826 		       int hardclip_start, int hardclip_end) {
4827   int i;
4828 
4829   for (i = querylength - 1 - hardclip_end; i >= hardclip_start; --i) {
4830     PUTC(complCode[(int) contents[i]],fp);
4831   }
4832   return;
4833 }
4834 
4835 
4836 static void
print_chopped_end(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)4837 print_chopped_end (Filestring_T fp, char *contents, int querylength,
4838 		   int hardclip_start, int hardclip_end) {
4839   int i;
4840 
4841   for (i = 0; i < hardclip_start; i++) {
4842     PUTC(contents[i],fp);
4843   }
4844 
4845   /* No separator */
4846 
4847   for (i = querylength - hardclip_end; i < querylength; i++) {
4848     PUTC(contents[i],fp);
4849   }
4850 
4851   return;
4852 }
4853 
4854 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
4855 static void
print_chopped_end_revcomp(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)4856 print_chopped_end_revcomp (Filestring_T fp, char *contents, int querylength,
4857 			   int hardclip_start, int hardclip_end) {
4858   int i;
4859 
4860   for (i = querylength - 1; i >= querylength - hardclip_end; --i) {
4861     PUTC(complCode[(int) contents[i]],fp);
4862   }
4863 
4864   /* No separator */
4865 
4866   for (i = hardclip_start - 1; i >= 0; --i) {
4867     PUTC(complCode[(int) contents[i]],fp);
4868   }
4869 
4870   return;
4871 }
4872 
4873 
4874 static void
print_chopped_end_quality(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end)4875 print_chopped_end_quality (Filestring_T fp, char *quality, int querylength,
4876 			   int hardclip_start, int hardclip_end) {
4877   int i;
4878 
4879   if (hardclip_start > 0) {
4880     for (i = 0; i < hardclip_start; i++) {
4881       PUTC(quality[i],fp);
4882     }
4883     return;
4884 
4885   } else {
4886     for (i = querylength - hardclip_end; i < querylength; i++) {
4887       PUTC(quality[i],fp);
4888     }
4889     return;
4890   }
4891 }
4892 
4893 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
4894 static void
print_chopped_end_quality_reverse(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end)4895 print_chopped_end_quality_reverse (Filestring_T fp, char *quality, int querylength,
4896 				   int hardclip_start, int hardclip_end) {
4897   int i;
4898 
4899   if (hardclip_start > 0) {
4900     for (i = hardclip_start - 1; i >= 0; --i) {
4901       PUTC(quality[i],fp);
4902     }
4903     return;
4904 
4905   } else {
4906     for (i = querylength - 1; i >= querylength - hardclip_end; --i) {
4907       PUTC(quality[i],fp);
4908     }
4909     return;
4910   }
4911 }
4912 
4913 
4914 
4915 /* Modeled after Shortread_print_quality */
4916 static void
print_quality(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end,int shift)4917 print_quality (Filestring_T fp, char *quality, int querylength,
4918 	       int hardclip_start, int hardclip_end, int shift) {
4919   int i;
4920   int c;
4921 
4922   if (quality == NULL) {
4923     PUTC('*',fp);
4924   } else {
4925     for (i = hardclip_start; i < querylength - hardclip_end; i++) {
4926       if ((c = quality[i] + shift) <= 32) {
4927 	fprintf(stderr,"Warning: With a quality-print-shift of %d, QC score %c becomes non-printable.  May need to specify --quality-protocol or --quality-print-shift\n",
4928 		shift,quality[i]);
4929 	abort();
4930       } else {
4931 	PUTC(c,fp);
4932       }
4933     }
4934   }
4935   return;
4936 }
4937 
4938 
4939 static void
print_quality_revcomp(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end,int shift)4940 print_quality_revcomp (Filestring_T fp, char *quality, int querylength,
4941 		       int hardclip_start, int hardclip_end, int shift) {
4942   int i;
4943   int c;
4944 
4945   if (quality == NULL) {
4946     PUTC('*',fp);
4947   } else {
4948     for (i = querylength - 1 - hardclip_end; i >= hardclip_start; --i) {
4949       if ((c = quality[i] + shift) <= 32) {
4950 	fprintf(stderr,"Warning: With a quality-print-shift of %d, QC score %c becomes non-printable.  May need to specify --quality-protocol or --quality-print-shift\n",
4951 		shift,quality[i]);
4952 	abort();
4953       } else {
4954 	PUTC(c,fp);
4955       }
4956     }
4957   }
4958 
4959   return;
4960 }
4961 
4962 
4963 /* Only for GMAP program */
4964 static unsigned int
compute_sam_flag_nomate(int npaths,bool first_read_p,bool watsonp,bool sam_paired_p)4965 compute_sam_flag_nomate (int npaths, bool first_read_p, bool watsonp, bool sam_paired_p) {
4966   unsigned int flag = 0U;
4967 
4968   if (sam_paired_p == true) {
4969     flag |= PAIRED_READ;
4970     if (first_read_p == true) {
4971       flag |= FIRST_READ_P;
4972     } else {
4973       flag |= SECOND_READ_P;
4974     }
4975   }
4976 
4977   if (npaths == 0) {
4978     flag |= QUERY_UNMAPPED;
4979   } else if (watsonp == false) {
4980     flag |= QUERY_MINUSP;
4981   }
4982 
4983 #if 0
4984   /* Will let external program decide what is primary */
4985   if (pathnum > 1) {
4986     flag |= NOT_PRIMARY;
4987   }
4988 #endif
4989 
4990   return flag;
4991 }
4992 
4993 
4994 
4995 void
Pair_print_sam_nomapping(Filestring_T fp,char * abbrev,char * acc1,char * acc2,char * queryseq_ptr,char * quality_string,int querylength,int quality_shift,bool first_read_p,bool sam_paired_p,char * sam_read_group_id)4996 Pair_print_sam_nomapping (Filestring_T fp, char *abbrev, char *acc1, char *acc2, char *queryseq_ptr,
4997 			  char *quality_string, int querylength, int quality_shift,
4998 			  bool first_read_p, bool sam_paired_p, char *sam_read_group_id) {
4999   unsigned int flag;
5000 
5001   /* 1. QNAME */
5002   if (acc2 == NULL) {
5003     FPRINTF(fp,"%s",acc1);
5004   } else {
5005     FPRINTF(fp,"%s,%s",acc1,acc2);
5006   }
5007 
5008   /* 2. FLAG */
5009   flag = compute_sam_flag_nomate(/*npaths*/0,first_read_p,/*watsonp*/true,sam_paired_p);
5010   FPRINTF(fp,"\t%u",flag);
5011 
5012   /* 3. RNAME: chr */
5013   FPRINTF(fp,"\t*");
5014 
5015   /* 4. POS: chrpos */
5016   FPRINTF(fp,"\t0");
5017 
5018   /* 5. MAPQ: Mapping quality */
5019   /* Picard says MAPQ should be 0 for an unmapped read */
5020   FPRINTF(fp,"\t0");
5021 
5022   /* 6. CIGAR */
5023   FPRINTF(fp,"\t*");
5024 
5025   /* 7. MRNM: Mate chr */
5026   /* 8. MPOS: Mate chrpos */
5027   /* 9. ISIZE: Insert size */
5028   FPRINTF(fp,"\t*\t0\t0\t");
5029 
5030   /* 10. SEQ: queryseq and 11. QUAL: quality scores */
5031   print_chopped(fp,queryseq_ptr,querylength,/*hardclip_start*/0,/*hardclip_end*/0);
5032   FPRINTF(fp,"\t");
5033   print_quality(fp,quality_string,querylength,/*hardclip_start*/0,/*hardclip_end*/0,
5034 		quality_shift);
5035 
5036   /* 12. TAGS: RG */
5037   if (sam_read_group_id != NULL) {
5038     FPRINTF(fp,"\tRG:Z:%s",sam_read_group_id);
5039   }
5040 
5041   /* 12. TAGS: XO */
5042   FPRINTF(fp,"\tXO:Z:%s",abbrev);
5043 
5044   FPRINTF(fp,"\n");
5045 
5046   return;
5047 }
5048 
5049 
5050 
#if 0
/* Disabled helper: maps the sign of cdna_direction onto a sense value
   (positive -> SENSE_FORWARD, negative -> SENSE_ANTI, zero -> SENSE_NULL). */
static int
sensedir_from_cdna_direction (int cdna_direction) {
  if (cdna_direction == 0) {
    return SENSE_NULL;
  }
  return (cdna_direction > 0) ? SENSE_FORWARD : SENSE_ANTI;
}
#endif
5063 
5064 
5065 void
Pair_alias_circular(struct T * pairs,int npairs,Chrpos_T chrlength)5066 Pair_alias_circular (struct T *pairs, int npairs, Chrpos_T chrlength) {
5067   int i;
5068   struct T *ptr;
5069 
5070   i = 0;
5071   ptr = pairs;
5072   while (i < npairs) {
5073     assert(ptr->genomepos < chrlength);
5074     ptr->genomepos += chrlength;
5075     i++;
5076     ptr++;
5077   }
5078 
5079   return;
5080 }
5081 
5082 void
Pair_unalias_circular(struct T * pairs,int npairs,Chrpos_T chrlength)5083 Pair_unalias_circular (struct T *pairs, int npairs, Chrpos_T chrlength) {
5084   int i;
5085   struct T *ptr;
5086 
5087   i = 0;
5088   ptr = pairs;
5089   while (i < npairs) {
5090     assert(ptr->genomepos >= chrlength);
5091     ptr->genomepos -= chrlength;
5092     i++;
5093     ptr++;
5094   }
5095 
5096   return;
5097 }
5098 
5099 
5100 static List_T
clean_cigar(List_T tokens,bool watsonp)5101 clean_cigar (List_T tokens, bool watsonp) {
5102   List_T clean, unique = NULL, p;
5103   char token[11], *curr_token, *last_token;
5104   int length = 0;
5105   char type, last_type = ' ';
5106   bool duplicatep = false;
5107 
5108   for (p = tokens; p != NULL; p = List_next(p)) {
5109     curr_token = (char *) List_head(p);
5110     type = curr_token[strlen(curr_token)-1];
5111     if (type == last_type) {
5112       length += atoi(last_token);
5113       FREE_OUT(last_token);
5114       duplicatep = true;
5115     } else {
5116       if (last_type == ' ') {
5117 	/* Skip */
5118       } else if (duplicatep == false) {
5119 	unique = List_push_out(unique,(void *) last_token);
5120       } else {
5121 	length += atoi(last_token);
5122 	FREE_OUT(last_token);
5123 	sprintf(token,"%d%c",length,last_type);
5124 	unique = push_token(unique,token);
5125       }
5126       last_type = type;
5127       duplicatep = false;
5128       length = 0;
5129     }
5130     last_token = curr_token;
5131   }
5132   if (last_type == ' ') {
5133     /* Skip */
5134   } else if (duplicatep == false) {
5135     unique = List_push_out(unique,(void *) last_token);
5136   } else {
5137     length += atoi(last_token);
5138     FREE_OUT(last_token);
5139     sprintf(token,"%d%c",length,last_type);
5140     unique = push_token(unique,token);
5141   }
5142   List_free_out(&tokens);
5143 
5144 
5145   if (sam_insert_0M_p == false) {
5146     /* Return result */
5147     if (watsonp) {
5148       /* Put tokens in forward order */
5149       return unique;
5150     } else {
5151       /* Keep tokens in reverse order */
5152       return List_reverse(unique);
5153     }
5154 
5155   } else {
5156     /* Insert "0M" between adjacent I and D operations */
5157     last_type = ' ';
5158     clean = (List_T) NULL;
5159     for (p = unique; p != NULL; p = List_next(p)) {
5160       curr_token = (char *) List_head(p);
5161       type = curr_token[strlen(curr_token)-1];
5162       if (last_type == 'I' && type == 'D') {
5163 	clean = push_token(clean,"0M");
5164       } else if (last_type == 'D' && type == 'I') {
5165 	clean = push_token(clean,"0M");
5166       }
5167       clean = List_push_out(clean,(void *) curr_token);
5168       last_type = type;
5169     }
5170     List_free_out(&unique);
5171 
5172     /* Return result */
5173     if (watsonp) {
5174       /* Put tokens in forward order */
5175       return List_reverse(clean);
5176     } else {
5177       /* Keep tokens in reverse order */
5178       return clean;
5179     }
5180   }
5181 }
5182 
5183 
5184 /* Derived from print_tokens_gff3 */
5185 int
Pair_cigar_length(List_T tokens)5186 Pair_cigar_length (List_T tokens) {
5187   int length = 0, tokenlength;
5188   List_T p;
5189   char *token;
5190   char type;
5191 
5192   for (p = tokens; p != NULL; p = List_next(p)) {
5193     token = (char *) List_head(p);
5194     type = token[strlen(token)-1];
5195     /* Should include 'H', but that gets added according to hardclip_low and hardclip_high */
5196     if (type == 'S' || type == 'I' || type == 'M' || type == 'X' || type == '=') {
5197       sscanf(token,"%d",&tokenlength);
5198       length += tokenlength;
5199     }
5200   }
5201 
5202   return length;
5203 }
5204 
5205 /* Derived from print_tokens_gff3 */
5206 void
Pair_print_tokens(Filestring_T fp,List_T tokens)5207 Pair_print_tokens (Filestring_T fp, List_T tokens) {
5208   List_T p;
5209   char *token;
5210 
5211   for (p = tokens; p != NULL; p = List_next(p)) {
5212     token = (char *) List_head(p);
5213     FPRINTF(fp,"%s",token);
5214     /* FREE_OUT(token); -- Now freed within Stage3end_free or Stage3_free */
5215   }
5216 
5217   return;
5218 }
5219 
5220 
5221 
5222 static List_T
compute_cigar_standard(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int sensedir,int chimera_part)5223 compute_cigar_standard (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
5224 			bool watsonp,
5225 #ifdef CONVERT_INTRONS_TO_DELETIONS
5226 			int sensedir,
5227 #endif
5228 			int chimera_part) {
5229   List_T tokens = NULL;
5230   char token[11];
5231   int Mlength = 0, Ilength = 0, Dlength = 0;
5232   bool in_exon = false, deletionp;
5233   struct T *ptr, *prev, *this = NULL;
5234   int exon_queryend = -1;
5235   Chrpos_T exon_genomestart = 0;
5236   Chrpos_T exon_genomeend, genome_gap;
5237   int query_gap;
5238   int last_querypos = -1;
5239   Chrpos_T last_genomepos = (Chrpos_T) -1;
5240   int i;
5241 
5242   /* *chimera_hardclip_start = *chimera_hardclip_high = 0; */
5243   *intronp = false;
5244 
5245   ptr = pairs;
5246 
5247   if (chimera_part == +1) {
5248     if (ptr->querypos > *hardclip_start) {
5249       if (ptr->querypos > 0) {
5250 	/* Clip to beginning */
5251 	*hardclip_start = ptr->querypos;
5252 	sprintf(token,"%dH",*hardclip_start);
5253 	tokens = push_token(tokens,token);
5254       }
5255     } else {
5256       if (*hardclip_start > 0) {
5257 	/* Clip to hard clip boundary */
5258 	sprintf(token,"%dH",*hardclip_start);
5259 	tokens = push_token(tokens,token);
5260       }
5261     }
5262   } else {
5263     if (*hardclip_start > 0) {
5264       sprintf(token,"%dH",*hardclip_start);
5265       tokens = push_token(tokens,token);
5266     }
5267     if (ptr->querypos > (*hardclip_start)) {
5268       sprintf(token,"%dS",ptr->querypos - (*hardclip_start));
5269       tokens = push_token(tokens,token);
5270     }
5271   }
5272 
5273   this = (T) NULL;
5274   for (i = 0; i < npairs; i++) {
5275     prev = this;
5276     this = ptr++;
5277 
5278 #if 0
5279     /* Cigar_print_tokens(stdout,tokens); */
5280     Pair_dump_one(this,true);
5281     printf("\n");
5282 #endif
5283 
5284     if (this->gapp) {
5285       if (in_exon == true) {
5286 	exon_queryend = last_querypos + 1;
5287 	exon_genomeend = last_genomepos + 1;
5288 #if 0
5289 	if (watsonp) {
5290 	  intron_start = exon_genomeend + 1;
5291 	} else {
5292 	  intron_start = exon_genomeend - 1;
5293 	}
5294 #endif
5295 
5296 	if (Mlength > 0) {
5297 	  sprintf(token,"%dM",Mlength);
5298 	  tokens = push_token(tokens,token);
5299 	} else if (Ilength > 0) {
5300 	  sprintf(token,"%dI",Ilength);
5301 	  tokens = push_token(tokens,token);
5302 	} else if (Dlength > 0) {
5303 	  sprintf(token,"%dD",Dlength);
5304 	  tokens = push_token(tokens,token);
5305 	}
5306 
5307 	Mlength = Ilength = Dlength = 0;
5308 
5309 	in_exon = false;
5310       }
5311 
5312     } else if (this->comp == INTRONGAP_COMP) {
5313       /* Do nothing */
5314 
5315     } else {
5316       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
5317 	 SHORTGAP_COMP, or MISMATCH_COMP */
5318       if (in_exon == false) {
5319 	/* exon_querystart = this->querypos + 1; */
5320 	exon_genomestart = this->genomepos + 1;
5321 
5322 	if (prev != NULL) {
5323 	  /* Gap */
5324 	  /* abs() gives a large value when flag -m64 is specified */
5325 	  /* genome_gap = abs(intron_end - intron_start) + 1; */
5326 	  if (watsonp) {
5327 	    /* intron_end = exon_genomestart - 1; */
5328 	    /* genome_gap = (intron_end - intron_start) + 1; */
5329 	    genome_gap = exon_genomestart - exon_genomeend - 1;
5330 	  } else {
5331 	    /* intron_end = exon_genomestart + 1; */
5332 	    /* genome_gap = (intron_start - intron_end) + 1; */
5333 	    genome_gap = exon_genomeend - exon_genomestart - 1;
5334 	  }
5335 
5336 	  deletionp = false;
5337 #ifdef CONVERT_INTRONS_TO_DELETIONS
5338 	  if (sensedir == SENSE_FORWARD) {
5339 	    if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
5340 		prev->comp == FWD_GCAG_INTRON_COMP ||
5341 		prev->comp == FWD_ATAC_INTRON_COMP) {
5342 	      sprintf(token,"%uN",genome_gap);
5343 	      *intronp = true;
5344 	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
5345 	      sprintf(token,"%uN",genome_gap);
5346 	      *intronp = true;
5347 	    } else {
5348 	      sprintf(token,"%uD",genome_gap);
5349 	      deletionp = true;
5350 	    }
5351 	  } else if (sensedir == SENSE_ANTI) {
5352 	    if (prev->comp == REV_CANONICAL_INTRON_COMP ||
5353 		prev->comp == REV_GCAG_INTRON_COMP ||
5354 		prev->comp == REV_ATAC_INTRON_COMP) {
5355 	      sprintf(token,"%uN",genome_gap);
5356 	      *intronp = true;
5357 	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
5358 	      sprintf(token,"%uN",genome_gap);
5359 	      *intronp = true;
5360 	    } else {
5361 	      sprintf(token,"%uD",genome_gap);
5362 	      deletionp = true;
5363 	    }
5364 	  } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
5365 	    sprintf(token,"%uN",genome_gap);
5366 	    *intronp = true;
5367 	  } else {
5368 	    sprintf(token,"%uD",genome_gap);
5369 	    deletionp = true;
5370 	  }
5371 #else
5372 	  sprintf(token,"%uN",genome_gap);
5373 	  *intronp = true;
5374 #endif
5375 	  tokens = push_token(tokens,token);
5376 
5377 	  /* Check for dual gap.  Doesn't work for hard clipping. */
5378 	  /* assert(exon_queryend >= 0); */
5379 
5380 	  query_gap = this->querypos - exon_queryend;
5381 	  assert(query_gap >= 0);
5382 	  if (query_gap > 0) {
5383 	    if (deletionp == true && sam_insert_0M_p == true) {
5384 	      /* Put zero matches between deletion and insertion, since some programs will complain */
5385 	      sprintf(token,"0M");
5386 	      tokens = push_token(tokens,token);
5387 	    }
5388 
5389 	    sprintf(token,"%uI",query_gap);
5390 	    tokens = push_token(tokens,token);
5391 	  }
5392 	}
5393 
5394 	in_exon = true;
5395       }
5396 
5397       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
5398 	/* Gap in upper or lower sequence */
5399 	if (this->genome == ' ') {
5400 	  /* Insertion relative to genome */
5401 	  if (Mlength > 0) {
5402 	    sprintf(token,"%dM",Mlength);
5403 	    tokens = push_token(tokens,token);
5404 	    Mlength = 0;
5405 	  } else if (Dlength > 0) {
5406 	    /* unlikely */
5407 	    sprintf(token,"%dD",Dlength);
5408 	    tokens = push_token(tokens,token);
5409 	    Dlength = 0;
5410 	  }
5411 	  Ilength++;
5412 	} else if (this->cdna == ' ') {
5413 	  /* Deletion relative to genome */
5414 	  if (Mlength > 0) {
5415 	    sprintf(token,"%dM",Mlength);
5416 	    tokens = push_token(tokens,token);
5417 	    Mlength = 0;
5418 	  } else if (Ilength > 0) {
5419 	    sprintf(token,"%dI",Ilength);
5420 	    tokens = push_token(tokens,token);
5421 	    Ilength = 0;
5422 	  }
5423 	  Dlength++;
5424 	} else {
5425 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
5426 	  exit(9);
5427 	}
5428 
5429       } else {
5430 	/* Count even if unknown base */
5431 
5432 	if (Ilength > 0) {
5433 	  sprintf(token,"%dI",Ilength);
5434 	  tokens = push_token(tokens,token);
5435 	  Ilength = 0;
5436 	} else if (Dlength > 0) {
5437 	  sprintf(token,"%dD",Dlength);
5438 	  tokens = push_token(tokens,token);
5439 	  Dlength = 0;
5440 	}
5441 	Mlength++;
5442 
5443       }
5444     }
5445 
5446     if (this != NULL) {
5447       if (this->cdna != ' ') {
5448 	last_querypos = this->querypos;
5449       }
5450       if (this->genome != ' ') {
5451 	last_genomepos = this->genomepos;
5452       }
5453     }
5454   }
5455 
5456   /* prev = this; */
5457   /* exon_queryend = last_querypos + 1; */
5458   /* exon_genomeend = last_genomepos + 1; */
5459 
5460   if (Mlength > 0) {
5461     sprintf(token,"%dM",Mlength);
5462     tokens = push_token(tokens,token);
5463   } else if (Ilength > 0) {
5464     sprintf(token,"%dI",Ilength);
5465     tokens = push_token(tokens,token);
5466   } else if (Dlength > 0) {
5467     sprintf(token,"%dD",Dlength);
5468     tokens = push_token(tokens,token);
5469   }
5470 
5471 
5472   /* Terminal clipping */
5473   if (chimera_part == -1) {
5474     if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
5475       if (last_querypos < querylength_given - 1) {
5476 	/* Clip to end */
5477 	*hardclip_end = querylength_given - 1 - last_querypos;
5478 	sprintf(token,"%dH",*hardclip_end);
5479 	tokens = push_token(tokens,token);
5480       }
5481     } else {
5482       if (*hardclip_end > 0) {
5483 	/* Clip to hard clip boundary */
5484 	sprintf(token,"%dH",*hardclip_end);
5485 	tokens = push_token(tokens,token);
5486       }
5487     }
5488   } else {
5489     if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
5490       sprintf(token,"%dS",querylength_given - 1 - (*hardclip_end) - last_querypos);
5491       tokens = push_token(tokens,token);
5492     }
5493     if (*hardclip_end > 0) {
5494       sprintf(token,"%dH",*hardclip_end);
5495       tokens = push_token(tokens,token);
5496     }
5497   }
5498 
5499   return clean_cigar(tokens,watsonp);
5500 }
5501 
5502 
5503 static List_T
compute_cigar_extended(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int sensedir,int chimera_part)5504 compute_cigar_extended (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
5505 			bool watsonp,
5506 #ifdef CONVERT_INTRONS_TO_DELETIONS
5507 			int sensedir,
5508 #endif
5509 			int chimera_part) {
5510   List_T tokens = NULL;
5511   char token[11];
5512   int Elength = 0, Xlength = 0, Ilength = 0, Dlength = 0;
5513   bool in_exon = false, deletionp;
5514   struct T *ptr, *prev, *this = NULL;
5515   int exon_queryend = -1;
5516   Chrpos_T exon_genomestart = 0;
5517   Chrpos_T exon_genomeend, genome_gap;
5518   int query_gap;
5519   int last_querypos = -1;
5520   Chrpos_T last_genomepos = (Chrpos_T) -1;
5521   int i;
5522 
5523   /* *chimera_hardclip_start = *chimera_hardclip_high = 0; */
5524   *intronp = false;
5525 
5526   ptr = pairs;
5527 
5528   if (chimera_part == +1) {
5529     if (ptr->querypos > *hardclip_start) {
5530       if (ptr->querypos > 0) {
5531 	/* Clip to beginning */
5532 	*hardclip_start = ptr->querypos;
5533 	sprintf(token,"%dH",*hardclip_start);
5534 	tokens = push_token(tokens,token);
5535       }
5536     } else {
5537       if (*hardclip_start > 0) {
5538 	/* Clip to hard clip boundary */
5539 	sprintf(token,"%dH",*hardclip_start);
5540 	tokens = push_token(tokens,token);
5541       }
5542     }
5543   } else {
5544     if (*hardclip_start > 0) {
5545       sprintf(token,"%dH",*hardclip_start);
5546       tokens = push_token(tokens,token);
5547     }
5548     if (ptr->querypos > (*hardclip_start)) {
5549       sprintf(token,"%dS",ptr->querypos - (*hardclip_start));
5550       tokens = push_token(tokens,token);
5551     }
5552   }
5553 
5554   this = (T) NULL;
5555   for (i = 0; i < npairs; i++) {
5556     prev = this;
5557     this = ptr++;
5558 
5559 #if 0
5560     /* Cigar_print_tokens(stdout,tokens); */
5561     Pair_dump_one(this,true);
5562     printf("\n");
5563 #endif
5564 
5565     if (this->gapp) {
5566       if (in_exon == true) {
5567 	exon_queryend = last_querypos + 1;
5568 	exon_genomeend = last_genomepos + 1;
5569 #if 0
5570 	if (watsonp) {
5571 	  intron_start = exon_genomeend + 1;
5572 	} else {
5573 	  intron_start = exon_genomeend - 1;
5574 	}
5575 #endif
5576 
5577 	if (Elength > 0) {
5578 	  sprintf(token,"%d=",Elength);
5579 	  tokens = push_token(tokens,token);
5580 	} else if (Xlength > 0) {
5581 	  sprintf(token,"%dX",Xlength);
5582 	  tokens = push_token(tokens,token);
5583 	} else if (Ilength > 0) {
5584 	  sprintf(token,"%dI",Ilength);
5585 	  tokens = push_token(tokens,token);
5586 	} else if (Dlength > 0) {
5587 	  sprintf(token,"%dD",Dlength);
5588 	  tokens = push_token(tokens,token);
5589 	}
5590 
5591 	Elength = Xlength = Ilength = Dlength = 0;
5592 
5593 	in_exon = false;
5594       }
5595 
5596     } else if (this->comp == INTRONGAP_COMP) {
5597       /* Do nothing */
5598 
5599     } else {
5600       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
5601 	 SHORTGAP_COMP, or MISMATCH_COMP */
5602       if (in_exon == false) {
5603 	/* exon_querystart = this->querypos + 1; */
5604 	exon_genomestart = this->genomepos + 1;
5605 
5606 	if (prev != NULL) {
5607 	  /* Gap */
5608 	  /* abs() gives a large value when flag -m64 is specified */
5609 	  /* genome_gap = abs(intron_end - intron_start) + 1; */
5610 	  if (watsonp) {
5611 	    /* intron_end = exon_genomestart - 1; */
5612 	    /* genome_gap = (intron_end - intron_start) + 1; */
5613 	    genome_gap = exon_genomestart - exon_genomeend - 1;
5614 	  } else {
5615 	    /* intron_end = exon_genomestart + 1; */
5616 	    /* genome_gap = (intron_start - intron_end) + 1; */
5617 	    genome_gap = exon_genomeend - exon_genomestart - 1;
5618 	  }
5619 
5620 	  deletionp = false;
5621 #ifdef CONVERT_INTRONS_TO_DELETIONS
5622 	  if (sensedir == SENSE_FORWARD) {
5623 	    if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
5624 		prev->comp == FWD_GCAG_INTRON_COMP ||
5625 		prev->comp == FWD_ATAC_INTRON_COMP) {
5626 	      sprintf(token,"%uN",genome_gap);
5627 	      *intronp = true;
5628 	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
5629 	      sprintf(token,"%uN",genome_gap);
5630 	      *intronp = true;
5631 	    } else {
5632 	      sprintf(token,"%uD",genome_gap);
5633 	      deletionp = true;
5634 	    }
5635 	  } else if (sensedir == SENSE_ANTI) {
5636 	    if (prev->comp == REV_CANONICAL_INTRON_COMP ||
5637 		prev->comp == REV_GCAG_INTRON_COMP ||
5638 		prev->comp == REV_ATAC_INTRON_COMP) {
5639 	      sprintf(token,"%uN",genome_gap);
5640 	      *intronp = true;
5641 	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
5642 	      sprintf(token,"%uN",genome_gap);
5643 	      *intronp = true;
5644 	    } else {
5645 	      sprintf(token,"%uD",genome_gap);
5646 	      deletionp = true;
5647 	    }
5648 	  } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
5649 	    sprintf(token,"%uN",genome_gap);
5650 	    *intronp = true;
5651 	  } else {
5652 	    sprintf(token,"%uD",genome_gap);
5653 	    deletionp = true;
5654 	  }
5655 #else
5656 	  sprintf(token,"%uN",genome_gap);
5657 	  *intronp = true;
5658 #endif
5659 	  tokens = push_token(tokens,token);
5660 
5661 	  /* Check for dual gap.  Doesn't work for hard clipping. */
5662 	  /* assert(exon_queryend >= 0); */
5663 
5664 	  query_gap = this->querypos - exon_queryend;
5665 	  assert(query_gap >= 0);
5666 	  if (query_gap > 0) {
5667 	    if (deletionp == true && sam_insert_0M_p == true) {
5668 	      /* Put zero matches between deletion and insertion, since some programs will complain */
5669 	      sprintf(token,"0M");
5670 	      tokens = push_token(tokens,token);
5671 	    }
5672 
5673 	    sprintf(token,"%uI",query_gap);
5674 	    tokens = push_token(tokens,token);
5675 	  }
5676 	}
5677 
5678 	in_exon = true;
5679       }
5680 
5681       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
5682 	/* Gap in upper or lower sequence */
5683 	if (this->genome == ' ') {
5684 	  /* Insertion relative to genome */
5685 	  if (Elength > 0) {
5686 	    sprintf(token,"%d=",Elength);
5687 	    tokens = push_token(tokens,token);
5688 	    Elength = 0;
5689 	  } else if (Xlength > 0) {
5690 	    sprintf(token,"%dX",Xlength);
5691 	    tokens = push_token(tokens,token);
5692 	    Xlength = 0;
5693 	  } else if (Dlength > 0) {
5694 	    /* unlikely */
5695 	    sprintf(token,"%dD",Dlength);
5696 	    tokens = push_token(tokens,token);
5697 	    Dlength = 0;
5698 	  }
5699 	  Ilength++;
5700 	} else if (this->cdna == ' ') {
5701 	  /* Deletion relative to genome */
5702 	  if (Elength > 0) {
5703 	    sprintf(token,"%d=",Elength);
5704 	    tokens = push_token(tokens,token);
5705 	    Elength = 0;
5706 	  } else if (Xlength > 0) {
5707 	    sprintf(token,"%dX",Xlength);
5708 	    tokens = push_token(tokens,token);
5709 	    Xlength = 0;
5710 	  } else if (Ilength > 0) {
5711 	    sprintf(token,"%dI",Ilength);
5712 	    tokens = push_token(tokens,token);
5713 	    Ilength = 0;
5714 	  }
5715 	  Dlength++;
5716 	} else {
5717 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
5718 	  exit(9);
5719 	}
5720 
5721       } else {
5722 	/* Count even if unknown base */
5723 
5724 	if (Ilength > 0) {
5725 	  sprintf(token,"%dI",Ilength);
5726 	  tokens = push_token(tokens,token);
5727 	  Ilength = 0;
5728 	} else if (Dlength > 0) {
5729 	  sprintf(token,"%dD",Dlength);
5730 	  tokens = push_token(tokens,token);
5731 	  Dlength = 0;
5732 	}
5733 
5734 	if (prev == NULL || prev->gapp || prev->comp == INDEL_COMP || prev->comp == SHORTGAP_COMP) {
5735 	  if (this->cdna == this->genome) {
5736 	    Elength++;
5737 	  } else {
5738 	    Xlength++;
5739 	  }
5740 
5741 	} else if (prev->cdna == prev->genome) {
5742 	  if (this->cdna == this->genome) {
5743 	    Elength++;
5744 	  } else {
5745 	    if (Elength > 0) {
5746 	      sprintf(token,"%d=",Elength);
5747 	      tokens = push_token(tokens,token);
5748 	      Elength = 0;
5749 	    }
5750 	    Xlength++;
5751 	  }
5752 
5753 	} else {
5754 	  if (this->cdna != this->genome) {
5755 	    Xlength++;
5756 	  } else {
5757 	    if (Xlength > 0) {
5758 	      sprintf(token,"%dX",Xlength);
5759 	      tokens = push_token(tokens,token);
5760 	      Xlength = 0;
5761 	    }
5762 	    Elength++;
5763 	  }
5764 	}
5765       }
5766     }
5767 
5768     if (this != NULL) {
5769       if (this->cdna != ' ') {
5770 	last_querypos = this->querypos;
5771       }
5772       if (this->genome != ' ') {
5773 	last_genomepos = this->genomepos;
5774       }
5775     }
5776   }
5777 
5778   /* prev = this; */
5779   /* exon_queryend = last_querypos + 1; */
5780   /* exon_genomeend = last_genomepos + 1; */
5781 
5782   if (Elength > 0) {
5783     sprintf(token,"%d=",Elength);
5784     tokens = push_token(tokens,token);
5785   } else if (Xlength > 0) {
5786     sprintf(token,"%dX",Xlength);
5787     tokens = push_token(tokens,token);
5788   } else if (Ilength > 0) {
5789     sprintf(token,"%dI",Ilength);
5790     tokens = push_token(tokens,token);
5791   } else if (Dlength > 0) {
5792     sprintf(token,"%dD",Dlength);
5793     tokens = push_token(tokens,token);
5794   }
5795 
5796 
5797   /* Terminal clipping */
5798   if (chimera_part == -1) {
5799     if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
5800       if (last_querypos < querylength_given - 1) {
5801 	/* Clip to end */
5802 	*hardclip_end = querylength_given - 1 - last_querypos;
5803 	sprintf(token,"%dH",*hardclip_end);
5804 	tokens = push_token(tokens,token);
5805       }
5806     } else {
5807       if (*hardclip_end > 0) {
5808 	/* Clip to hard clip boundary */
5809 	sprintf(token,"%dH",*hardclip_end);
5810 	tokens = push_token(tokens,token);
5811       }
5812     }
5813   } else {
5814     if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
5815       sprintf(token,"%dS",querylength_given - 1 - (*hardclip_end) - last_querypos);
5816       tokens = push_token(tokens,token);
5817     }
5818     if (*hardclip_end > 0) {
5819       sprintf(token,"%dH",*hardclip_end);
5820       tokens = push_token(tokens,token);
5821     }
5822   }
5823 
5824   return clean_cigar(tokens,watsonp);
5825 }
5826 
5827 
5828 List_T
Pair_compute_cigar(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int chimera_part)5829 Pair_compute_cigar (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
5830 		    bool watsonp, int chimera_part) {
5831   if (cigar_extended_p == true) {
5832     return compute_cigar_extended(&(*intronp),&(*hardclip_start),&(*hardclip_end),pairs,npairs,querylength_given,
5833 				  watsonp,chimera_part);
5834   } else {
5835     return compute_cigar_standard(&(*intronp),&(*hardclip_start),&(*hardclip_end),pairs,npairs,querylength_given,
5836 				  watsonp,chimera_part);
5837   }
5838 }
5839 
5840 
5841 /* Derived from print_gff3_cdna_match */
5842 /* Assumes pairarray has been hard clipped already */
5843 static void
print_sam_line(Filestring_T fp,char * abbrev,char * acc1,char * acc2,char * chrstring,bool watsonp,int sensedir,List_T cigar_tokens,List_T md_tokens,int nmismatches_refdiff,int nmismatches_bothdiff,int nindels,bool intronp,char * queryseq_ptr,char * quality_string,int hardclip_start,int hardclip_end,int querylength,Chimera_T chimera,int quality_shift,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,unsigned int flag,Univ_IIT_T chromosome_iit,Chrpos_T chrpos,Chrpos_T chrlength,int mapq_score,char * sam_read_group_id)5844 print_sam_line (Filestring_T fp, char *abbrev, char *acc1, char *acc2, char *chrstring,
5845 		bool watsonp, int sensedir, List_T cigar_tokens, List_T md_tokens,
5846 		int nmismatches_refdiff, int nmismatches_bothdiff, int nindels,
5847 		bool intronp, char *queryseq_ptr, char *quality_string,
5848 		int hardclip_start, int hardclip_end,
5849 		int querylength, Chimera_T chimera, int quality_shift,
5850 		int pathnum, int npaths_primary, int npaths_altloc, int absmq_score, int second_absmq, unsigned int flag,
5851 		Univ_IIT_T chromosome_iit, Chrpos_T chrpos, Chrpos_T chrlength,
5852 		int mapq_score,	char *sam_read_group_id) {
5853 
5854   /* Should already be checked when Stage3_T or Stage3end_T object was created */
5855   if (cigar_action == CIGAR_ACTION_IGNORE) {
5856     /* Don't check */
5857   } else if (Pair_cigar_length(cigar_tokens) + hardclip_start + hardclip_end == querylength) {
5858     /* Okay */
5859   } else if (cigar_action == CIGAR_ACTION_WARNING) {
5860     fprintf(stderr,"Warning: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
5861 	    acc1,Pair_cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
5862   } else if (cigar_action == CIGAR_ACTION_NOPRINT) {
5863     fprintf(stderr,"Warning: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
5864 	    acc1,Pair_cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
5865     return;
5866   } else {
5867     /* CIGAR_ACTION_ABORT */
5868     fprintf(stderr,"Error: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
5869 	    acc1,Pair_cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
5870     abort();
5871   }
5872 
5873   /* 1. QNAME or Accession */
5874   if (acc2 == NULL) {
5875     FPRINTF(fp,"%s\t",acc1);
5876   } else {
5877     FPRINTF(fp,"%s,%s\t",acc1,acc2);
5878   }
5879 
5880   /* 2. Flags */
5881   FPRINTF(fp,"%u\t",flag);
5882 
5883   /* 3. RNAME or Chrstring */
5884   /* 4. POS or Chrlow */
5885   /* Taken from GMAP part of SAM_chromosomal_pos */
5886   if (chrpos > chrlength) {
5887     FPRINTF(fp,"%s\t%u\t",chrstring,chrpos - chrlength /*+ 1*/);
5888   } else {
5889     FPRINTF(fp,"%s\t%u\t",chrstring,chrpos /*+ 1*/);
5890   }
5891 
5892   /* 5. MAPQ or Mapping quality */
5893   FPRINTF(fp,"%d\t",mapq_score);
5894 
5895   /* 6. CIGAR */
5896   Pair_print_tokens(fp,cigar_tokens);
5897 
5898   /* 7. MRNM: Mate chr */
5899   /* 8. MPOS: Mate chrpos */
5900   FPRINTF(fp,"\t*\t0");
5901 
5902   /* 9. ISIZE: Insert size */
5903   FPRINTF(fp,"\t0");
5904 
5905   /* 10. SEQ: queryseq and 11. QUAL: quality_scores */
5906   FPRINTF(fp,"\t");
5907   if (watsonp == true) {
5908     print_chopped(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
5909     FPRINTF(fp,"\t");
5910     print_quality(fp,quality_string,querylength,hardclip_start,hardclip_end,
5911 		  quality_shift);
5912   } else {
5913     print_chopped_revcomp(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
5914     FPRINTF(fp,"\t");
5915     print_quality_revcomp(fp,quality_string,querylength,hardclip_start,hardclip_end,
5916 			  quality_shift);
5917   }
5918 
5919   /* 12. TAGS: RG */
5920   if (sam_read_group_id != NULL) {
5921     FPRINTF(fp,"\tRG:Z:%s",sam_read_group_id);
5922   }
5923 
5924   /* 12. TAGS: XH and XI */
5925   if (hardclip_start > 0 || hardclip_end > 0) {
5926     FPRINTF(fp,"\tXH:Z:");
5927     if (watsonp == true) {
5928       print_chopped_end(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
5929     } else {
5930       print_chopped_end_revcomp(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
5931     }
5932 
5933     if (quality_string != NULL) {
5934       FPRINTF(fp,"\tXI:Z:");
5935       if (watsonp == true) {
5936 	print_chopped_end_quality(fp,quality_string,querylength,hardclip_start,hardclip_end);
5937       } else {
5938 	print_chopped_end_quality_reverse(fp,quality_string,querylength,hardclip_start,hardclip_end);
5939       }
5940     }
5941   }
5942 
5943   /* 12. TAGS: MD string */
5944   FPRINTF(fp,"\tMD:Z:");
5945   Pair_print_tokens(fp,md_tokens);
5946 
5947   /* 12. TAGS: NH */
5948   FPRINTF(fp,"\tNH:i:%d",npaths_primary + npaths_altloc);
5949 
5950   /* 12. TAGS: HI */
5951   FPRINTF(fp,"\tHI:i:%d",pathnum);
5952 
5953   /* 12. TAGS: NM */
5954   FPRINTF(fp,"\tNM:i:%d",nmismatches_refdiff + nindels);
5955 
5956   if (snps_p) {
5957     /* 12. TAGS: XW and XV */
5958     FPRINTF(fp,"\tXW:i:%d",nmismatches_bothdiff);
5959     FPRINTF(fp,"\tXV:i:%d",nmismatches_refdiff - nmismatches_bothdiff);
5960   }
5961 
5962 
5963   /* 12. TAGS: SM */
5964   FPRINTF(fp,"\tSM:i:%d",40);
5965 
5966   /* 12. TAGS: XQ */
5967   FPRINTF(fp,"\tXQ:i:%d",absmq_score);
5968 
5969   /* 12. TAGS: X2 */
5970   FPRINTF(fp,"\tX2:i:%d",second_absmq);
5971 
5972   /* 12. TAGS: XO */
5973   FPRINTF(fp,"\tXO:Z:%s",abbrev);
5974 
5975   /* 12. TAGS: XS */
5976   if (novelsplicingp == false && splicesites_iit == NULL) {
5977     /* Do not print XS field */
5978 
5979   } else if (sensedir == SENSE_FORWARD) {
5980     if (watsonp == true) {
5981       FPRINTF(fp,"\tXS:A:+");
5982     } else {
5983       FPRINTF(fp,"\tXS:A:-");
5984     }
5985 
5986   } else if (sensedir == SENSE_ANTI) {
5987     if (watsonp == true) {
5988       FPRINTF(fp,"\tXS:A:-");
5989     } else {
5990       FPRINTF(fp,"\tXS:A:+");
5991     }
5992 
5993   } else if (intronp == false) {
5994     /* Skip.  No intron in this end and mate is not revealing. */
5995 
5996 #if 0
5997   } else if (force_xs_direction_p == true) {
5998     /* Don't print XS field for SENSE_NULL */
5999     /* Could not determine sense, so just report arbitrarily as + */
6000     /* This option provided for users of Cufflinks, which cannot handle XS:A:? */
6001     FPRINTF(fp,"\tXS:A:+");
6002 
6003   } else {
6004     /* Non-canonical.  Don't report. */
6005     FPRINTF(fp,"\tXS:A:?");
6006 #endif
6007   }
6008 
6009   /* 12. TAGS: XT */
6010   if (chimera != NULL) {
6011     FPRINTF(fp,"\tXT:Z:");
6012     Chimera_print_sam_tag(fp,chimera,chromosome_iit);
6013   }
6014 
6015   FPRINTF(fp,"\n");
6016 
6017   return;
6018 }
6019 
6020 
/* Tracks which kind of run (matches, mismatches, or a deletion) the
   MD-string scanner is currently in, so it knows when a match count, a
   "0" separator, or a "^" deletion marker has to be emitted. */
typedef enum {IN_MATCHES, IN_MISMATCHES, IN_DELETION} MD_state_T;
6022 
6023 static List_T
compute_md_string(int * nmismatches_refdiff,int * nmismatches_bothdiff,int * nindels,struct T * pairs,int npairs,bool watsonp,List_T cigar_tokens)6024 compute_md_string (int *nmismatches_refdiff, int *nmismatches_bothdiff, int *nindels,
6025 		   struct T *pairs, int npairs, bool watsonp, List_T cigar_tokens) {
6026   List_T md_tokens = NULL, p;
6027   char *cigar_token, token[11], *first_token, type;
6028   T this;
6029   int nmatches = 0, length;
6030   MD_state_T state = IN_MISMATCHES;
6031   int i, k = 0;
6032 
6033   *nmismatches_refdiff = *nmismatches_bothdiff = *nindels = 0;
6034 
6035   debug4(Pair_dump_array(pairs,npairs,true));
6036   debug4(printf("watsonp %d\n",watsonp));
6037 
6038   if (watsonp == true) {
6039     for (p = cigar_tokens; p != NULL; p = List_next(p)) {
6040       cigar_token = (char *) List_head(p);
6041       debug4(printf("token is %s\n",cigar_token));
6042       type = cigar_token[strlen(cigar_token)-1];
6043       length = atoi(cigar_token);
6044 
6045       if (type == 'H') {
6046 	/* k += length; */
6047 
6048       } else if (type == 'S') {
6049 	/* k += length; */
6050 
6051       } else if (type == 'M' || type == 'X' || type == '=') {
6052 	for (i = 0; i < length; i++, k++) {
6053 	  this = &(pairs[k]);
6054 	  debug4(printf("M %d/%d comp %c\n",i,length,this->comp));
6055 	  if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
6056 	    nmatches++;
6057 	    state = IN_MATCHES;
6058 
6059 	  } else if (this->comp == MISMATCH_COMP) {
6060 	    if (state == IN_MATCHES) {
6061 	      sprintf(token,"%d",nmatches);
6062 	      md_tokens = push_token(md_tokens,token);
6063 	      nmatches = 0;
6064 	    } else if (state == IN_DELETION) {
6065 	      md_tokens = push_token(md_tokens,"0");
6066 	    }
6067 	    state = IN_MISMATCHES;
6068 
6069 	    *nmismatches_refdiff += 1;
6070 	    if (md_lowercase_variant_p && this->cdna == this->genomealt) {
6071 	      /* A mismatch against the reference only => alternate variant */
6072 	      sprintf(token,"%c",tolower(this->genome));
6073 	    } else {
6074 	      /* A true mismatch against both variants */
6075 	      *nmismatches_bothdiff += 1;
6076 	      sprintf(token,"%c",this->genome);
6077 	    }
6078 	    md_tokens = push_token(md_tokens,token);
6079 
6080 	  } else {
6081 	    fprintf(stderr,"Unexpected comp '%c'\n",this->comp);
6082 	    abort();
6083 	  }
6084 	}
6085 
6086       } else if (type == 'I') {
6087 	while (k < npairs && pairs[k].comp == INDEL_COMP && pairs[k].genome == ' ') {
6088 	  *nindels += 1;
6089 	  k++;
6090 	}
6091 	state = IN_MATCHES;
6092 
6093       } else if (type == 'N') {
6094 	while (k < npairs && pairs[k].gapp == true) {
6095 	  k++;
6096 	}
6097 
6098       } else if (type == 'D') {
6099 	if (state == IN_MATCHES) {
6100 	  if (nmatches > 0) {
6101 	    sprintf(token,"%d",nmatches);
6102 	    md_tokens = push_token(md_tokens,token);
6103 	    nmatches = 0;
6104 	  }
6105 	}
6106 
6107 	if (state != IN_DELETION) {
6108 	  md_tokens = push_token(md_tokens,"^");
6109 	}
6110 	for (i = 0; i < length; i++, k++) {
6111 	  this = &(pairs[k]);
6112 	  sprintf(token,"%c",this->genome);
6113 	  md_tokens = push_token(md_tokens,token);
6114 	  *nindels += 1;
6115 	}
6116 
6117 	state = IN_DELETION;
6118 
6119       } else {
6120 	fprintf(stderr,"Don't recognize type %c\n",type);
6121 	abort();
6122       }
6123     }
6124 
6125     if (nmatches > 0) {
6126       sprintf(token,"%d",nmatches);
6127       md_tokens = push_token(md_tokens,token);
6128     }
6129 
6130     md_tokens = List_reverse(md_tokens);
6131 
6132   } else {
6133     cigar_tokens = List_reverse(cigar_tokens);
6134     for (p = cigar_tokens; p != NULL; p = List_next(p)) {
6135       cigar_token = (char *) List_head(p);
6136       debug4(printf("token is %s\n",cigar_token));
6137       type = cigar_token[strlen(cigar_token)-1];
6138       length = atoi(cigar_token);
6139 
6140       if (type == 'H') {
6141 	/* k += length; */
6142 
6143       } else if (type == 'S') {
6144 	/* k += length; */
6145 
6146       } else if (type == 'M' || type == 'X' || type == '=') {
6147 	if (state == IN_DELETION) {
6148 	  md_tokens = push_token(md_tokens,"^");
6149 	}
6150 
6151 	for (i = 0; i < length; i++, k++) {
6152 	  this = &(pairs[k]);
6153 	  debug4(printf("M %d/%d comp %c\n",i,length,this->comp));
6154 	  if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
6155 	    nmatches++;
6156 	    state = IN_MATCHES;
6157 
6158 	  } else if (this->comp == MISMATCH_COMP) {
6159 	    if (state == IN_MATCHES) {
6160 	      sprintf(token,"%d",nmatches);
6161 	      md_tokens = push_token(md_tokens,token);
6162 	      nmatches = 0;
6163 	    }
6164 	    state = IN_MISMATCHES;
6165 
6166 	    *nmismatches_refdiff += 1;
6167 
6168 	    if (md_lowercase_variant_p && this->cdna == this->genomealt) {
6169 	      /* A mismatch against the reference only => alternate variant */
6170 	      sprintf(token,"%c",tolower(complCode[(int) this->genome]));
6171 	    } else {
6172 	      *nmismatches_bothdiff += 1;
6173 	      sprintf(token,"%c",complCode[(int) this->genome]);
6174 	    }
6175 	    md_tokens = push_token(md_tokens,token);
6176 
6177 
6178 	  } else {
6179 	    fprintf(stderr,"Unexpected comp '%c'\n",this->comp);
6180 	    abort();
6181 	  }
6182 	}
6183 
6184       } else if (type == 'I') {
6185 	if (state == IN_DELETION) {
6186 	  md_tokens = push_token(md_tokens,"^");
6187 	}
6188 
6189 	while (k < npairs && pairs[k].comp == INDEL_COMP && pairs[k].genome == ' ') {
6190 	  *nindels += 1;
6191 	  k++;
6192 	}
6193 	state = IN_MATCHES;
6194 
6195       } else if (type == 'N') {
6196 #if 0
6197 	/* Ignore deletion adjacent to intron, to avoid double ^^ */
6198 	if (state == IN_DELETION) {
6199 	  md_tokens = push_token(md_tokens,"^");
6200 	}
6201 #endif
6202 
6203 	while (k < npairs && pairs[k].gapp == true) {
6204 	  k++;
6205 	}
6206 
6207       } else if (type == 'D') {
6208 	if (state == IN_MATCHES) {
6209 	  if (nmatches > 0) {
6210 	    sprintf(token,"%d",nmatches);
6211 	    md_tokens = push_token(md_tokens,token);
6212 	    nmatches = 0;
6213 	  }
6214 	} else if (state == IN_MISMATCHES) {
6215 	  md_tokens = push_token(md_tokens,"0");
6216 	}
6217 
6218 	for (i = 0; i < length; i++, k++) {
6219 	  this = &(pairs[k]);
6220 	  sprintf(token,"%c",complCode[(int) this->genome]);
6221 	  md_tokens = push_token(md_tokens,token);
6222 	  *nindels += 1;
6223 	}
6224 	state = IN_DELETION;
6225 
6226       } else {
6227 	fprintf(stderr,"Don't recognize type %c\n",type);
6228 	abort();
6229       }
6230     }
6231 
6232     if (nmatches > 0) {
6233       sprintf(token,"%d",nmatches);
6234       md_tokens = push_token(md_tokens,token);
6235     }
6236 
6237     /* Restore cigar_tokens */
6238     cigar_tokens = List_reverse(cigar_tokens);
6239   }
6240 
6241   assert(k == npairs);
6242 
6243   /* Insert initial 0 token if necessary */
6244   if (md_tokens != NULL) {
6245     first_token = (char *) List_head(md_tokens);
6246     if (!isdigit(first_token[0])) {
6247       md_tokens = push_token(md_tokens,"0");
6248     }
6249   }
6250 
6251   return md_tokens;
6252 }
6253 
6254 
6255 static struct T *
hardclip_pairarray(int * clipped_npairs,int hardclip_start,int hardclip_end,struct T * pairs,int npairs,int querylength)6256 hardclip_pairarray (int *clipped_npairs, int hardclip_start, int hardclip_end,
6257 		    struct T *pairs, int npairs, int querylength) {
6258   struct T *clipped_pairs, *ptr;
6259   int i, starti;
6260 
6261   debug10(printf("Entered hardclip_pairarray with hardclip_start %d, hardclip_end %d, querylength %d\n",
6262 		 hardclip_start,hardclip_end,querylength));
6263   debug10(Simplepair_dump_array(pairs,npairs,true));
6264   debug10(printf("Starting with %d pairs\n",npairs));
6265 
6266   i = 0;
6267   ptr = pairs;
6268   while (i < npairs && ptr->querypos < hardclip_start) {
6269     i++;
6270     ptr++;
6271   }
6272   while (i < npairs && (ptr->gapp == true || ptr->cdna == ' ' || ptr->genome == ' ')) {
6273     i++;
6274     ptr++;
6275   }
6276 
6277   if (i >= npairs) {
6278     /* hardclip_start passes right end of read, so invalid */
6279     debug10(printf("i = %d, so passed end of read\n",i));
6280     hardclip_start = 0;
6281   } else if (hardclip_start > 0) {
6282     hardclip_start = ptr->querypos;
6283   }
6284 
6285   starti = i;
6286   debug10(printf("starti is %d\n",starti));
6287 
6288   clipped_pairs = ptr;
6289 
6290   while (i < npairs && ptr->querypos < querylength - hardclip_end) {
6291     i++;
6292     ptr++;
6293   }
6294 
6295   i--;
6296   ptr--;
6297   while (i >= starti && (ptr->gapp == true || ptr->cdna == ' ' || ptr->genome == ' ')) {
6298     i--;
6299     ptr--;
6300   }
6301 
6302   if (i < 0) {
6303     /* hardclip_end passes left end of read, so invalid */
6304     debug10(printf("i = %d, so passed left end of read\n",i));
6305     hardclip_end = 0;
6306   } else if (hardclip_end > 0) {
6307     hardclip_end = querylength - 1 - ptr->querypos;
6308   }
6309 
6310   if (hardclip_start == 0 && hardclip_end == 0) {
6311     debug10(printf("Unable to hard clip\n"));
6312     *clipped_npairs = npairs;
6313     clipped_pairs = pairs;
6314   } else {
6315     *clipped_npairs = i - starti + 1;
6316   }
6317 
6318   debug10(printf("Ending with %d pairs\n",*clipped_npairs));
6319   debug10(printf("Exiting hardclip_pairarray with hardclip_start %d, hardclip_end %d\n",
6320 		 hardclip_start,hardclip_end));
6321 
6322   return clipped_pairs;
6323 }
6324 
6325 
6326 /* Called only for GMAP */
6327 void
Pair_print_sam(Filestring_T fp,char * abbrev,struct T * pairarray,int npairs,char * acc1,char * acc2,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,char * queryseq_ptr,char * quality_string,int hardclip_low,int hardclip_high,int querylength_given,bool watsonp,int sensedir,int chimera_part,Chimera_T chimera,int quality_shift,bool first_read_p,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,Chrpos_T chrpos,Chrpos_T chrlength,int mapq_score,bool sam_paired_p,char * sam_read_group_id)6328 Pair_print_sam (Filestring_T fp, char *abbrev, struct T *pairarray, int npairs,
6329 		char *acc1, char *acc2, Chrnum_T chrnum, Univ_IIT_T chromosome_iit, Sequence_T usersegment,
6330 		char *queryseq_ptr, char *quality_string,
6331 		int hardclip_low, int hardclip_high, int querylength_given,
6332 		bool watsonp, int sensedir, int chimera_part, Chimera_T chimera,
6333 		int quality_shift, bool first_read_p, int pathnum, int npaths_primary, int npaths_altloc,
6334 		int absmq_score, int second_absmq, Chrpos_T chrpos, Chrpos_T chrlength,
6335 		int mapq_score, bool sam_paired_p, char *sam_read_group_id) {
6336   char *chrstring = NULL;
6337   unsigned int flag;
6338 
6339   List_T cigar_tokens, md_tokens = NULL;
6340   int nmismatches_refdiff, nmismatches_bothdiff, nindels;
6341   bool intronp;
6342   int hardclip_start, hardclip_end;
6343   /* int hardclip_start_zero = 0, hardclip_end_zero = 0; */
6344   struct T *clipped_pairarray;
6345   int clipped_npairs;
6346   bool cigar_tokens_alloc;
6347 
6348 
6349   if (chrnum == 0) {
6350     chrstring = Sequence_accession(usersegment);
6351   } else {
6352     chrstring = Chrnum_to_string(chrnum,chromosome_iit);
6353   }
6354 
6355   flag = compute_sam_flag_nomate(npaths_primary + npaths_altloc,first_read_p,watsonp,sam_paired_p);
6356 
6357   debug4(printf("Entered SAM_print_pairs with watsonp %d, first_read_p %d, hardclip_low %d, and hardclip_high %d\n",
6358 		watsonp,first_read_p,hardclip_low,hardclip_high));
6359 
6360   if (watsonp == true) {
6361     hardclip_start = hardclip_low;
6362     hardclip_end = hardclip_high;
6363   } else {
6364     hardclip_start = hardclip_high;
6365     hardclip_end = hardclip_low;
6366   }
6367   debug4(printf("hardclip_start %d, hardclip_end %d\n",hardclip_start,hardclip_end));
6368 
6369 
6370   clipped_pairarray = hardclip_pairarray(&clipped_npairs,hardclip_start,hardclip_end,
6371 					 pairarray,npairs,querylength_given);
6372   cigar_tokens = Pair_compute_cigar(&intronp,&hardclip_start,&hardclip_end,clipped_pairarray,clipped_npairs,querylength_given,
6373 				    watsonp,chimera_part);
6374   cigar_tokens_alloc = true;
6375 
6376 
6377   /* Cigar updates hardclip5 and hardclip3 for chimeras */
6378   md_tokens = compute_md_string(&nmismatches_refdiff,&nmismatches_bothdiff,&nindels,
6379 				clipped_pairarray,clipped_npairs,watsonp,cigar_tokens);
6380 
6381 #if 0
6382   min_evalue = Pair_min_evalue(clipped_pairarray,clipped_npairs);
6383 #endif
6384 
6385   print_sam_line(fp,abbrev,acc1,acc2,chrstring,
6386 		 watsonp,sensedir,cigar_tokens,md_tokens,
6387 		 nmismatches_refdiff,nmismatches_bothdiff,nindels,
6388 		 intronp,queryseq_ptr,quality_string,hardclip_start,hardclip_end,
6389 		 querylength_given,chimera,quality_shift,pathnum,npaths_primary,npaths_altloc,
6390 		 absmq_score,second_absmq,flag,chromosome_iit,chrpos,chrlength,
6391 		 mapq_score,sam_read_group_id);
6392 
6393   /* Print procedures free the character strings */
6394   Pair_tokens_free(&md_tokens);
6395   if (cigar_tokens_alloc == true) {
6396     Pair_tokens_free(&cigar_tokens);
6397   }
6398 
6399   if (chrnum != 0) {
6400     FREE(chrstring);
6401   }
6402 
6403   return;
6404 }
6405 
6406 
6407 
#if 0
/* Copied from samprint.c */
/* Disabled (inside #if 0).  Scans a list of CIGAR operation letters and
   returns true when at least one of them is 'M'.  An inner, separately
   disabled check would have rejected adjacent 'S' and 'H' operations. */
static bool
check_cigar_types (Intlist_T cigar_types) {
  Intlist_T p;
  int type;
  /* int last_type = 'M'; */
  bool M_present_p = false;

  for (p = cigar_types; p != NULL; p = Intlist_next(p)) {
    type = Intlist_head(p);
    if (type == 'M') {
      M_present_p = true;
#if 0
    } else if (type == 'H' && last_type == 'S') {
      debug1(printf("check_cigar_types detects adjacent S and H, so returning false\n"));
      return false;
    } else if (type == 'S' && last_type == 'H') {
      debug1(printf("check_cigar_types detects adjacent S and H, so returning false\n"));
      return false;
#endif
    }
  }

  return M_present_p;
}
#endif
6435 
6436 
#if 0
/* Disabled (inside #if 0).  Walks the pairs as a CIGAR computation would,
   but collects only the operation letters (H, S, M, I, D, N) into an
   Intlist, and returns the result of check_cigar_types on that list.

   hardclip_low and hardclip_high are derived from hardclip5 and hardclip3:
   by strand alone when circularp is true, and otherwise by clipdir, watsonp,
   and first_read_p, with the other side set to 0. */
bool
Pair_check_cigar (struct T *pairs, int npairs, int querylength_given,
		  int clipdir, int hardclip5, int hardclip3,
		  bool watsonp, bool first_read_p, bool circularp) {
  bool result;
  Intlist_T cigar_types = NULL;
  int hardclip_low, hardclip_high;
  int Mlength = 0, Ilength = 0, Dlength = 0;
  bool in_exon = false, deletionp;
  struct T *ptr, *prev, *this = NULL;
  int exon_queryend;
  int query_gap;
  int last_querypos = -1;
  int i;

  if (circularp == true) {
    if (watsonp == true) {
      hardclip_low = hardclip5;
      hardclip_high = hardclip3;
    } else {
      hardclip_low = hardclip3;
      hardclip_high = hardclip5;
    }
  } else {
    /* Incoming hardclip5 and hardclip3 are due to overlaps, not chimera */
    if (clipdir >= 0) {
      if (watsonp == true) {
	if (first_read_p == true) {
	  hardclip_high = hardclip5;
	  hardclip_low = 0;
	} else {
	  hardclip_high = 0;
	  hardclip_low = hardclip3;
	}
      } else {
	if (first_read_p == true) {
	  hardclip_low = hardclip5;
	  hardclip_high = 0;
	} else {
	  hardclip_low = 0;
	  hardclip_high = hardclip3;
	}
      }
    } else {
      if (watsonp == true) {
	if (first_read_p == true) {
	  hardclip_low = hardclip5;
	  hardclip_high = 0;
	} else {
	  hardclip_low = 0;
	  hardclip_high = hardclip3;
	}
      } else {
	if (first_read_p == true) {
	  hardclip_high = hardclip5;
	  hardclip_low = 0;
	} else {
	  hardclip_high = 0;
	  hardclip_low = hardclip3;
	}
      }
    }
  }


  ptr = pairs;

  /* Initial clipping: 'H' for a hard clip, 'S' when the first pair starts
     beyond the hard clip */
#if 0
  /* This procedure is used to check circular alignments */
  if (chimera_part == +1) {
    if (ptr->querypos > hardclip_low) {
      if (ptr->querypos > 0) {
	/* Clip to beginning */
	hardclip_low = ptr->querypos;
	cigar_types = Intlist_push(cigar_types,'H');
      }
    } else {
      if (hardclip_low > 0) {
	/* Clip to hard clip boundary */
	cigar_types = Intlist_push(cigar_types,'H');
      }
    }
  } else {
#endif
    if (hardclip_low > 0) {
      cigar_types = Intlist_push(cigar_types,'H');
    }
    if (ptr->querypos > hardclip_low) {
      cigar_types = Intlist_push(cigar_types,'S');
    }
#if 0
  }
#endif

  this = (T) NULL;
  for (i = 0; i < npairs; i++) {
    prev = this;
    this = ptr++;

    if (this->gapp) {
      /* A gap pair closes the current exon and flushes the pending run */
      if (in_exon == true) {
	exon_queryend = last_querypos + 1;
#if 0
	exon_genomeend = last_genomepos + 1;
	if (watsonp) {
	  intron_start = exon_genomeend + 1;
	} else {
	  intron_start = exon_genomeend - 1;
	}
#endif

	if (Mlength > 0) {
	  cigar_types = Intlist_push(cigar_types,'M');
	} else if (Ilength > 0) {
	  cigar_types = Intlist_push(cigar_types,'I');
	} else if (Dlength > 0) {
	  cigar_types = Intlist_push(cigar_types,'D');
	}

	Mlength = Ilength = Dlength = 0;

	in_exon = false;
      }

    } else if (this->comp == INTRONGAP_COMP) {
      /* Do nothing */

    } else {
      /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
	 SHORTGAP_COMP, or MISMATCH_COMP */
      if (in_exon == false) {
#if 0
	/* Needed only for full token */
	/* exon_querystart = this->querypos + 1; */
	exon_genomestart = this->genomepos + 1;
	if (watsonp) {
	  intron_end = exon_genomestart - 1;
	} else {
	  intron_end = exon_genomestart + 1;
	}
#endif

	if (prev != NULL) {
	  /* Gap */
	  /* genome_gap = intron_end - intron_start + 1; */

	  deletionp = false;
#ifdef CONVERT_INTRONS_TO_DELETIONS
	  if (cdna_direction > 0) {
	    if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
		prev->comp == FWD_GCAG_INTRON_COMP ||
		prev->comp == FWD_ATAC_INTRON_COMP) {
	      cigar_types = Intlist_push(cigar_types,'N');
	      /* *intronp = true; */
	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
	      cigar_types = Intlist_push(cigar_types,'N');
	      /* *intronp = true; */
	    } else {
	      cigar_types = Intlist_push(cigar_types,'D');
	      deletionp = true;
	    }
	  } else if (cdna_direction < 0) {
	    if (prev->comp == REV_CANONICAL_INTRON_COMP ||
		prev->comp == REV_GCAG_INTRON_COMP ||
		prev->comp == REV_ATAC_INTRON_COMP) {
	      cigar_types = Intlist_push(cigar_types,'N');
	      /* *intronp = true; */
	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
	      cigar_types = Intlist_push(cigar_types,'N');
	      /* *intronp = true; */
	    } else {
	      cigar_types = Intlist_push(cigar_types,'D');
	      deletionp = true;
	    }
	  } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
	    cigar_types = Intlist_push(cigar_types,'N');
	    /* *intronp = true; */
	  } else {
	    cigar_types = Intlist_push(cigar_types,'D');
	    deletionp = true;
	  }
#else
	  cigar_types = Intlist_push(cigar_types,'N');
	  /* *intronp = true; */
#endif

	  /* Check for dual gap.  Doesn't work for hard clipping. */
	  assert(exon_queryend >= 0);

	  query_gap = this->querypos - exon_queryend;
	  assert(query_gap >= 0);
	  if (query_gap > 0) {
	    if (deletionp == true && sam_insert_0M_p == true) {
	      /* Put zero matches between deletion and insertion, since some programs will complain */
	      cigar_types = Intlist_push(cigar_types,'M');
	    }

	    cigar_types = Intlist_push(cigar_types,'I');
	  }
	}

	in_exon = true;
      }

      if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
	/* Gap in upper or lower sequence */
	if (this->genome == ' ') {
	  /* Insertion relative to genome */
	  if (Mlength > 0) {
	    cigar_types = Intlist_push(cigar_types,'M');
	    Mlength = 0;
	  } else if (Dlength > 0) {
	    /* unlikely */
	    cigar_types = Intlist_push(cigar_types,'D');
	    Dlength = 0;
	  }
	  Ilength++;
	} else if (this->cdna == ' ') {
	  /* Deletion relative to genome */
	  if (Mlength > 0) {
	    cigar_types = Intlist_push(cigar_types,'M');
	    Mlength = 0;
	  } else if (Ilength > 0) {
	    cigar_types = Intlist_push(cigar_types,'I');
	    Ilength = 0;
	  }
	  Dlength++;
	} else {
	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
	  exit(9);
	}

      } else {
	/* Count even if unknown base */

	if (Ilength > 0) {
	  cigar_types = Intlist_push(cigar_types,'I');
	  Ilength = 0;
	} else if (Dlength > 0) {
	  cigar_types = Intlist_push(cigar_types,'D');
	  Dlength = 0;
	}
	Mlength++;
      }
    }

    /* Remember the last query position that carried a query base */
    if (this != NULL) {
      if (this->cdna != ' ') {
	last_querypos = this->querypos;
      }
#if 0
      if (this->genome != ' ') {
	last_genomepos = this->genomepos;
      }
#endif
    }
  }

  /* prev = this; */
  exon_queryend = last_querypos + 1;
  /* exon_genomeend = last_genomepos + 1; */

  /* Flush the run that was open when the pairs ran out */
  if (Mlength > 0) {
    cigar_types = Intlist_push(cigar_types,'M');
  } else if (Ilength > 0) {
    cigar_types = Intlist_push(cigar_types,'I');
  } else if (Dlength > 0) {
    cigar_types = Intlist_push(cigar_types,'D');
  }


  /* Terminal clipping */
#if 0
  /* This procedure is used to check circular alignments */
  if (chimera_part == -1) {
    if (last_querypos < querylength_given - 1 - hardclip_high) {
      if (last_querypos < querylength_given - 1) {
	/* Clip to end */
	hardclip_high = querylength_given - 1 - last_querypos;
	cigar_types = Intlist_push(cigar_types,'H');
      }
    } else {
      if (hardclip_high > 0) {
	/* Clip to hard clip boundary */
	cigar_types = Intlist_push(cigar_types,'H');
      }
    }
  } else {
#endif
    if (last_querypos < querylength_given - 1 - hardclip_high) {
      cigar_types = Intlist_push(cigar_types,'S');
    }
    if (hardclip_high > 0) {
      cigar_types = Intlist_push(cigar_types,'H');
    }
#if 0
  }
#endif

  result = check_cigar_types(cigar_types);

  Intlist_free(&cigar_types);
  return result;
}
#endif
6743 
6744 
#if 0
/* Disabled (inside #if 0).  Debugging aid that prints the name of an
   MD_state_T value to stdout and aborts on an unknown value. */
static void
state_print (MD_state_T state) {
  switch (state) {
  case IN_MATCHES: printf("IN_MATCHES"); break;
  case IN_MISMATCHES: printf("IN_MISMATCHES"); break;
  case IN_DELETION: printf("IN_DELETION"); break;
  default: abort();
  }
  return;
}
#endif
6757 
6758 
#if 0
/* Disabled (inside #if 0).  Earlier MD-string builder that scans the pairs
   directly instead of following the CIGAR tokens.

   Sets *nmismatches to the number of MISMATCH_COMP pairs and returns the
   list of MD tokens.  Insertions relative to the genome contribute nothing.
   For the plus strand, the tokens are reversed into forward order at the
   end.  For the minus strand, they are left in push order, so the "^" of a
   deletion is pushed when the next match or mismatch is seen. */
static List_T
compute_md_string_old (int *nmismatches, struct T *pairs, int npairs, bool watsonp) {
  List_T tokens = NULL;
  char token[11], *first_token;
  int nmatches = 0;
  struct T *ptr, *prev, *this = NULL;
  MD_state_T state = IN_MISMATCHES;
  int i;

  ptr = pairs;
  *nmismatches = 0;

  /* Ignore initial soft clipping */

  if (watsonp == true) {
    for (i = 0; i < npairs; i++) {
      prev = this;
      this = ptr++;

      if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
	nmatches++;
	state = IN_MATCHES;

      } else if (this->comp == MISMATCH_COMP) {
	*nmismatches += 1;
	if (state == IN_MATCHES) {
	  /* Close the current run of matches */
	  if (nmatches > 0) {
	    sprintf(token,"%d",nmatches);
	    tokens = push_token(tokens,token);
	    nmatches = 0;
	  }

	} else if (state == IN_DELETION) {
	  /* Separate the mismatch from the preceding deleted bases */
	  tokens = push_token(tokens,"0");
	}
	state = IN_MISMATCHES;

	sprintf(token,"%c",watsonp ? this->genome : complCode[(int) this->genome]);
	tokens = push_token(tokens,token);

      } else if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
	if (this->genome == ' ') {
#if 0
	  /* Insertion relative to genome.  Ignored in MD string (but not in cigar). */
	  nmatches++;
	  state = IN_MATCHES;
#endif

	} else if (this->cdna == ' ') {
	  /* Deletion relative to genome */
	  if (state == IN_MATCHES) {
	    if (nmatches > 0) {
	      sprintf(token,"%d",nmatches);
	      tokens = push_token(tokens,token);
	      nmatches = 0;
	    }
	    tokens = push_token(tokens,"^");

	  } else if (state == IN_MISMATCHES) {
	    tokens = push_token(tokens,"^");

	  }
	  state = IN_DELETION;

	  sprintf(token,"%c",watsonp ? this->genome : complCode[(int) this->genome]);
	  tokens = push_token(tokens,token);
	}

      } else {
	/* Ignore */
      }
    }

    /* Ignore terminal soft clipping */

    if (nmatches > 0) {
      sprintf(token,"%d",nmatches);
      tokens = push_token(tokens,token);
    }

    /* Put tokens in forward order */
    tokens = List_reverse(tokens);

  } else {

    for (i = 0; i < npairs; i++) {
      prev = this;
      this = ptr++;

      if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
	/* Tokens stay in push order, so the "^" follows the deleted bases */
	if (state == IN_DELETION) {
	  tokens = push_token(tokens,"^");
	}
	nmatches++;
	state = IN_MATCHES;

      } else if (this->comp == MISMATCH_COMP) {
	*nmismatches += 1;
	if (state == IN_MATCHES) {
	  if (nmatches > 0) {
	    sprintf(token,"%d",nmatches);
	    tokens = push_token(tokens,token);
	    nmatches = 0;
	  }

	} else if (state == IN_DELETION) {
	  tokens = push_token(tokens,"^");
	}
	state = IN_MISMATCHES;

	sprintf(token,"%c",watsonp ? this->genome : complCode[(int) this->genome]);
	tokens = push_token(tokens,token);

      } else if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
	if (this->genome == ' ') {
#if 0
	  /* Insertion relative to genome.  Ignored in MD string, but not in cigar string. */
	  if (state == IN_DELETION) {
	    tokens = push_token(tokens,"^");
	  }
	  nmatches++;
	  state = IN_MATCHES;
#endif

	} else if (this->cdna == ' ') {
	  /* Deletion relative to genome */
	  if (state == IN_MATCHES) {
	    if (nmatches > 0) {
	      sprintf(token,"%d",nmatches);
	      tokens = push_token(tokens,token);
	      nmatches = 0;
	    }

	  } else if (state == IN_MISMATCHES) {
	    tokens = push_token(tokens,"0");

	  }
	  state = IN_DELETION;

	  sprintf(token,"%c",watsonp ? this->genome : complCode[(int) this->genome]);
	  tokens = push_token(tokens,token);
	}

      } else {
	/* Ignore */
      }
    }

    /* Ignore terminal soft clipping */

    if (nmatches > 0) {
      sprintf(token,"%d",nmatches);
      tokens = push_token(tokens,token);
    }

    /* Keep tokens in reverse order */
  }


  /* Insert initial 0 token if necessary */
  if (tokens != NULL) {
    first_token = (char *) List_head(tokens);
    if (!isdigit(first_token[0])) {
      tokens = push_token(tokens,"0");
    }
  }

  return tokens;
}
#endif
6930 
6931 
6932 Uintlist_T
Pair_exonbounds(struct T * pairs,int npairs)6933 Pair_exonbounds (struct T *pairs, int npairs) {
6934   Uintlist_T exonbounds = NULL;
6935   struct T *ptr, *this = NULL;
6936   bool in_exon = false;
6937   int i;
6938   Chrpos_T last_genomepos = (Chrpos_T) -1;
6939 
6940   ptr = pairs;
6941   for (i = 0; i < npairs; i++) {
6942     /* prev = this; */
6943     this = ptr++;
6944 
6945     if (this->gapp) {
6946       if (in_exon == true) {
6947 	/* exon genomeend */
6948 	exonbounds = Uintlist_push(exonbounds,/*chroffset +*/last_genomepos);
6949 	in_exon = false;
6950       }
6951     } else if (this->comp == INTRONGAP_COMP) {
6952       /* Do nothing */
6953     } else {
6954       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
6955 	 SHORTGAP_COMP, or MISMATCH_COMP */
6956       if (in_exon == false) {
6957 	/* exon genomestart */
6958 	exonbounds = Uintlist_push(exonbounds,/*chroffset +*/this->genomepos);
6959 	in_exon = true;
6960       }
6961     }
6962     if (this->genome != ' ') {
6963       last_genomepos = this->genomepos;
6964     }
6965   }
6966 
6967   /* prev = this; */
6968   exonbounds = Uintlist_push(exonbounds,/*chroffset +*/last_genomepos);
6969 
6970   return Uintlist_reverse(exonbounds);
6971 }
6972 
6973 
6974 static int
count_psl_blocks_nt(Intlist_T * blockSizes,Intlist_T * qStarts,Uintlist_T * tStarts,struct T * pairs_directional,int npairs,int querylength,bool watsonp)6975 count_psl_blocks_nt (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tStarts, struct T *pairs_directional,
6976 		     int npairs, int querylength, bool watsonp) {
6977   int nblocks = 0, i;
6978   int block_querystart, block_queryend;
6979   struct T *ptr = pairs_directional, *this = NULL;
6980   bool in_block = false;
6981   int last_querypos = -1;
6982   /* Chrpos_T last_genomepos = (Chrpos_T) -1; */
6983 
6984   for (i = 0; i < npairs; i++) {
6985     /* prev = this; */
6986     this = ptr++;
6987 
6988     if (this->gapp) {
6989       if (in_block == true) {
6990 	nblocks++;
6991 	block_queryend = last_querypos;
6992 	debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
6993 	/* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
6994 	if (block_queryend > block_querystart) {
6995 	  *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
6996 	} else {
6997 	  *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
6998 	}
6999 	in_block = false;
7000       }
7001     } else if (this->comp == INTRONGAP_COMP) {
7002       /* Do nothing */
7003 
7004     } else if (this->cdna == ' ' || this->genome == ' ') {
7005       if (in_block == true) {
7006 	nblocks++;
7007 	block_queryend = last_querypos;
7008 	debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
7009 	/* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
7010 	if (block_queryend > block_querystart) {
7011 	  *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
7012 	} else {
7013 	  *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
7014 	}
7015 	in_block = false;
7016       }
7017 
7018     } else {
7019       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
7020          or SHORTGAP_COMP */
7021       if (in_block == false) {
7022 	block_querystart = this->querypos;
7023 	if (watsonp == true) {
7024 	  debug2(FPRINTF(fp,"Pushing qstart: %d\n",block_querystart));
7025 	  *qStarts = Intlist_push(*qStarts,block_querystart);
7026 	} else {
7027 	  debug2(FPRINTF(fp,"Pushing qstart: %d\n",querylength-block_querystart-1));
7028 	  *qStarts = Intlist_push(*qStarts,querylength-block_querystart-1);
7029 	}
7030 	*tStarts = Uintlist_push(*tStarts,this->genomepos);
7031 	in_block = true;
7032       }
7033     }
7034 
7035     if (this->cdna != ' ') {
7036       last_querypos = this->querypos;
7037     }
7038 #if 0
7039     if (this->genome != ' ') {
7040       last_genomepos = this->genomepos;
7041     }
7042 #endif
7043   }
7044 
7045   if (in_block == true) {
7046     /* prev = this; */
7047     nblocks++;
7048     block_queryend = last_querypos;
7049     debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1));
7050     /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */
7051     if (block_queryend > block_querystart) {
7052       *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1);
7053     } else {
7054       *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1);
7055     }
7056   }
7057 
7058   *blockSizes = Intlist_reverse(*blockSizes);
7059   *qStarts = Intlist_reverse(*qStarts);
7060   *tStarts = Uintlist_reverse(*tStarts);
7061 
7062   return nblocks;
7063 }
7064 
7065 
7066 static int
count_psl_blocks_pro(Intlist_T * blockSizes,Intlist_T * qStarts,Uintlist_T * tStarts,struct T * pairs_directional,int npairs,bool watsonp,Chrpos_T chrlength)7067 count_psl_blocks_pro (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tStarts, struct T *pairs_directional,
7068 		      int npairs, bool watsonp, Chrpos_T chrlength) {
7069   int nblocks = 0, i;
7070   int naminoacids = 0;
7071   int block_querystart;
7072   struct T *ptr = pairs_directional, *this = NULL;
7073   bool in_block = false;
7074 #ifdef NOGAPSINBLOCK
7075   struct T *prev;
7076 #endif
7077 
7078   for (i = 0; i < npairs; i++) {
7079 #ifdef NOGAPSINBLOCK
7080     prev = this;
7081 #endif
7082     this = ptr++;
7083 
7084     if (this->gapp) {
7085       if (in_block == true) {
7086 	nblocks++;
7087 	*blockSizes = Intlist_push(*blockSizes,naminoacids);
7088 	in_block = false;
7089 	naminoacids = 0;
7090       }
7091     } else if (this->comp == INTRONGAP_COMP) {
7092       /* Do nothing */
7093 
7094 #ifdef NOGAPSINBLOCK
7095     } else if (this->cdna == ' ' || this->genome == ' ') {
7096       if (in_block == true) {
7097 	nblocks++;
7098 	block_queryend = last_querypos;
7099 	*blockSizes = Intlist_push(*blockSizes,block_queryend/3-(block_querystart+2)/3+1);
7100 	in_block = false;
7101       }
7102 #endif
7103 
7104     } else {
7105       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
7106          or SHORTGAP_COMP */
7107       if (this->aa_e != ' ') {
7108 	naminoacids++;
7109       }
7110       if (in_block == false) {
7111 	block_querystart = this->querypos;
7112 	*qStarts = Intlist_push(*qStarts,(block_querystart+2)/3);
7113 	if (watsonp == true) {
7114 	  *tStarts = Uintlist_push(*tStarts,this->genomepos);
7115 	} else {
7116 #if 0
7117 	  /* Should be this */
7118 	  *tStarts = Uintlist_push(*tStarts,this->genomepos);
7119 #else
7120 	  /* But is actually this */
7121 	  *tStarts = Uintlist_push(*tStarts,chrlength - this->genomepos - 1);
7122 #endif
7123 	}
7124 	in_block = true;
7125       }
7126     }
7127   }
7128 
7129   if (in_block == true) {
7130 #ifdef NOGAPSINBLOCK
7131     prev = this;
7132 #endif
7133     nblocks++;
7134     *blockSizes = Intlist_push(*blockSizes,naminoacids);
7135   }
7136 
7137   *blockSizes = Intlist_reverse(*blockSizes);
7138   *qStarts = Intlist_reverse(*qStarts);
7139   *tStarts = Uintlist_reverse(*tStarts);
7140 
7141   return nblocks;
7142 }
7143 
7144 
7145 static void
compute_gap_lengths_int(int * nbreaks,int * length,Intlist_T blockSizes,Intlist_T Starts,int nblocks)7146 compute_gap_lengths_int (int *nbreaks, int *length, Intlist_T blockSizes, Intlist_T Starts, int nblocks) {
7147   int i;
7148   int start, end;
7149   /* Intlist_T p = blockSizes, q = Starts; */
7150 
7151   debug2(FPRINTF(fp,"Entered compute_gap_lengths_int with nblocks = %d, and Starts having length %d\n",
7152 		nblocks,Intlist_length(Starts)));
7153   *nbreaks = *length = 0;
7154   for (i = 0; i < nblocks - 1; i++) {
7155     if (i > 0) {
7156       start = Intlist_head(Starts);
7157       if (start - end > 0) {
7158 	*nbreaks += 1;
7159 	*length += (start - end);
7160       }
7161       debug2(FPRINTF(fp,"%d - %d = %d, gap = %d\n",start,end,start-end,*length));
7162     }
7163     end = Intlist_head(Starts) + Intlist_head(blockSizes);
7164     blockSizes = Intlist_next(blockSizes);
7165     Starts = Intlist_next(Starts);
7166   }
7167 
7168   if (i > 0) {
7169     start = Intlist_head(Starts);
7170     if (start - end > 0) {
7171       *nbreaks += 1;
7172       *length += (start - end);
7173     }
7174     debug2(FPRINTF(fp,"%d - %d = %d, gap = %d\n",start,end,start-end,*length));
7175   }
7176 
7177   return;
7178 }
7179 
7180 static void
compute_gap_lengths_uint(int * nbreaks,int * length,Intlist_T blockSizes,Uintlist_T Starts,int nblocks)7181 compute_gap_lengths_uint (int *nbreaks, int *length, Intlist_T blockSizes, Uintlist_T Starts, int nblocks) {
7182   int i;
7183   int start, end;
7184   /*
7185   Intlist_T p = blockSizes;
7186   Uintlist_T q = Starts;
7187   */
7188 
7189   *nbreaks = *length = 0;
7190   for (i = 0; i < nblocks - 1; i++) {
7191     if (i > 0) {
7192       start = Uintlist_head(Starts);
7193       if (start - end > 0) {
7194 	*nbreaks += 1;
7195 	*length += (start - end);
7196       }
7197       debug2(FPRINTF(fp,"%d - %d = %d, gap = %d\n",start,end,start-end,*length));
7198     }
7199     end = Uintlist_head(Starts) + Intlist_head(blockSizes);
7200     blockSizes = Intlist_next(blockSizes);
7201     Starts = Uintlist_next(Starts);
7202   }
7203 
7204   if (i > 0) {
7205     start = Uintlist_head(Starts);
7206     if (start - end > 0) {
7207       *nbreaks += 1;
7208       *length += (start - end);
7209     }
7210     debug2(FPRINTF(fp,"%d - %d = %d, gap = %d\n",start,end,start-end,*length));
7211   }
7212 
7213   return;
7214 }
7215 
7216 
7217 
7218 static void
count_matches_pro(int * matches,int * mismatches,int * unknowns,struct T * pairs,int npairs)7219 count_matches_pro (int *matches, int *mismatches, int *unknowns,
7220 		   struct T *pairs, int npairs) {
7221   struct T *this = NULL;
7222   int i;
7223 
7224   i = 0;
7225   while (i < npairs) {
7226     /* prev = this; */
7227     this = &(pairs[i++]);
7228 
7229     if (this->gapp == false) {
7230       if (this->aa_g != ' ' && this->aa_e != ' ') {
7231 	if (this->aa_g == this->aa_e) {
7232 	  *matches += 1;
7233 	} else if (this->aa_e == 'X') {
7234 	  *unknowns += 1;
7235 	} else {
7236 	  *mismatches += 1;
7237 	}
7238       }
7239     }
7240   }
7241   return;
7242 }
7243 
7244 
7245 
7246 void
Pair_print_pslformat_nt(Filestring_T fp,struct T * pairs,int npairs,T start,T end,Sequence_T queryseq,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,int matches,int unknowns,int mismatches,bool watsonp)7247 Pair_print_pslformat_nt (Filestring_T fp, struct T *pairs, int npairs, T start, T end,
7248 			 Sequence_T queryseq, Chrnum_T chrnum,
7249 			 Univ_IIT_T chromosome_iit, Sequence_T usersegment,
7250 			 int matches, int unknowns, int mismatches,
7251 			 bool watsonp) {
7252   Chrpos_T chrpos1, chrpos2;
7253   struct T *pairs_directional = NULL;
7254   Intlist_T blockSizes = NULL, qStarts = NULL, p;
7255   Uintlist_T tStarts = NULL, q;
7256   int nblocks;
7257   int qnbreaks, qlength, tnbreaks, tlength, querylength;
7258   char *chr;
7259 
7260 #ifdef PMAP
7261     querylength = 3*Sequence_fulllength(queryseq);
7262 #else
7263     querylength = Sequence_fulllength(queryseq);
7264 #endif
7265 
7266   if (watsonp == true) {
7267     pairs_directional = pairs;
7268   } else {
7269     pairs_directional = invert_and_revcomp_path(pairs,npairs);
7270   }
7271 
7272   nblocks = count_psl_blocks_nt(&blockSizes,&qStarts,&tStarts,pairs_directional,npairs,
7273 				querylength,watsonp);
7274   compute_gap_lengths_int(&qnbreaks,&qlength,blockSizes,qStarts,nblocks);
7275   compute_gap_lengths_uint(&tnbreaks,&tlength,blockSizes,tStarts,nblocks);
7276 
7277   FPRINTF(fp,"%d\t%d\t%d\t%d\t",matches,mismatches,/*repeatmatches*/0,unknowns);
7278   FPRINTF(fp,"%d\t%d\t%d\t%d\t",qnbreaks,qlength,tnbreaks,tlength);
7279 
7280   if (watsonp == true) {
7281     FPRINTF(fp,"+");
7282   } else {
7283     FPRINTF(fp,"-");
7284   }
7285   FPRINTF(fp,"\t%s\t%d",Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
7286 
7287   FPRINTF(fp,"\t%d\t%d",start->querypos,end->querypos+1);
7288 
7289   /* T name and T size */
7290   if (chrnum == 0) {
7291     FPRINTF(fp,"\t%s\t%u",Sequence_accession(usersegment),Sequence_fulllength(usersegment));
7292   } else {
7293     chr = Chrnum_to_string(chrnum,chromosome_iit);
7294     FPRINTF(fp,"\t%s\t%u",chr,Chrnum_length(chrnum,chromosome_iit));
7295     FREE(chr);
7296   }
7297 
7298   /* T start and T end */
7299   chrpos1 = start->genomepos;
7300   chrpos2 = end->genomepos;
7301   if (watsonp) {
7302     FPRINTF(fp,"\t%u\t%u",chrpos1,chrpos2+1);
7303   } else {
7304     FPRINTF(fp,"\t%u\t%u",chrpos2,chrpos1+1);
7305   }
7306 
7307   FPRINTF(fp,"\t%d",nblocks);
7308 
7309   FPRINTF(fp,"\t");
7310   for (p = blockSizes; p != NULL; p = Intlist_next(p)) {
7311     FPRINTF(fp,"%d,",Intlist_head(p));
7312   }
7313 
7314   FPRINTF(fp,"\t");
7315   for (p = qStarts; p != NULL; p = Intlist_next(p)) {
7316     FPRINTF(fp,"%d,",Intlist_head(p));
7317   }
7318 
7319   FPRINTF(fp,"\t");
7320   for (q = tStarts; q != NULL; q = Uintlist_next(q)) {
7321     FPRINTF(fp,"%u,",Uintlist_head(q));
7322   }
7323 
7324   Intlist_free(&blockSizes);
7325   Intlist_free(&qStarts);
7326   Uintlist_free(&tStarts);
7327 
7328   if (watsonp == false) {
7329     FREE(pairs_directional);
7330   }
7331 
7332   PUTC('\n',fp);
7333   return;
7334 }
7335 
7336 void
Pair_print_pslformat_pro(Filestring_T fp,struct T * pairs,int npairs,T start,T end,Sequence_T queryseq,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,Sequence_T usersegment,bool watsonp,int cdna_direction)7337 Pair_print_pslformat_pro (Filestring_T fp, struct T *pairs, int npairs, T start, T end,
7338 			  Sequence_T queryseq, Chrnum_T chrnum,
7339 			  Univ_IIT_T chromosome_iit, Sequence_T usersegment,
7340 			  bool watsonp, int cdna_direction) {
7341   Chrpos_T chrpos1, chrpos2;
7342   Chrpos_T chrlength;
7343   Intlist_T blockSizes = NULL, qStarts = NULL, p;
7344   Uintlist_T tStarts = NULL, q;
7345   int nblocks, matches = 0, mismatches = 0, unknowns = 0;
7346   int qnbreaks, qlength, tnbreaks, tlength;
7347   char *chr;
7348 
7349   chrlength = Chrnum_length(chrnum,chromosome_iit);
7350   nblocks = count_psl_blocks_pro(&blockSizes,&qStarts,&tStarts,pairs,npairs,
7351 				 watsonp,chrlength);
7352   compute_gap_lengths_int(&qnbreaks,&qlength,blockSizes,qStarts,nblocks);
7353   compute_gap_lengths_uint(&tnbreaks,&tlength,blockSizes,tStarts,nblocks);
7354 
7355   count_matches_pro(&matches,&mismatches,&unknowns,pairs,npairs);
7356 
7357   FPRINTF(fp,"%d\t%d\t%d\t%d\t",matches,mismatches,/*repeatmatches*/0,unknowns);
7358   FPRINTF(fp,"%d\t%d\t%d\t%d\t",qnbreaks,qlength,tnbreaks,tlength);
7359 
7360   if (cdna_direction >= 0) {
7361     FPRINTF(fp,"+");
7362   } else {
7363     FPRINTF(fp,"-");
7364   }
7365 
7366   if (watsonp == true) {
7367     FPRINTF(fp,"+");
7368   } else {
7369     FPRINTF(fp,"-");
7370   }
7371   FPRINTF(fp,"\t%s\t%d",Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
7372 
7373   FPRINTF(fp,"\t%d\t%d",(start->querypos+2)/3,end->querypos/3+1);
7374 
7375   /* T name and T size */
7376   if (chrnum == 0) {
7377     FPRINTF(fp,"\t%s\t%u",Sequence_accession(usersegment),Sequence_fulllength(usersegment));
7378   } else {
7379     chr = Chrnum_to_string(chrnum,chromosome_iit);
7380     FPRINTF(fp,"\tchr%s\t%u",chr,Chrnum_length(chrnum,chromosome_iit));
7381     FREE(chr);
7382   }
7383 
7384   /* T start and T end */
7385   chrpos1 = start->genomepos;
7386   chrpos2 = end->genomepos;
7387   if (watsonp) {
7388     FPRINTF(fp,"\t%u\t%u",chrpos1,chrpos2+1);
7389   } else {
7390     FPRINTF(fp,"\t%u\t%u",chrpos2,chrpos1+1);
7391   }
7392 
7393   nblocks = count_psl_blocks_pro(&blockSizes,&qStarts,&tStarts,pairs,npairs,
7394 				 watsonp,chrlength);
7395   FPRINTF(fp,"\t%d",nblocks);
7396   FPRINTF(fp,"\t");
7397 
7398   for (p = blockSizes; p != NULL; p = Intlist_next(p)) {
7399     FPRINTF(fp,"%d,",Intlist_head(p));
7400   }
7401 
7402   FPRINTF(fp,"\t");
7403   for (p = qStarts; p != NULL; p = Intlist_next(p)) {
7404     FPRINTF(fp,"%d,",Intlist_head(p));
7405   }
7406 
7407   FPRINTF(fp,"\t");
7408 
7409   for (q = tStarts; q != NULL; q = Uintlist_next(q)) {
7410     FPRINTF(fp,"%u,",Uintlist_head(q));
7411   }
7412 
7413   Intlist_free(&blockSizes);
7414   Intlist_free(&qStarts);
7415   Uintlist_free(&tStarts);
7416 
7417   PUTC('\n',fp);
7418   return;
7419 }
7420 
7421 void
Pair_print_exons(Filestring_T fp,struct T * pairs,int npairs,int wraplength,int ngap,bool cdnap)7422 Pair_print_exons (Filestring_T fp, struct T *pairs, int npairs, int wraplength, int ngap, bool cdnap) {
7423   bool in_exon = false;
7424   struct T *ptr, *this = NULL;
7425   int i, exonno = 0, column = 0;
7426 
7427   ptr = pairs;
7428   for (i = 0; i < npairs; i++) {
7429     this = ptr++;
7430 
7431     if (this->gapp) {
7432       if (in_exon == true) {
7433 	if (column != 0) {
7434 	  PUTC('\n',fp);
7435 	  column = 0;
7436 	}
7437 	FPRINTF(fp,"</exon>\n");
7438 	in_exon = false;
7439 	if (ngap > 0) {
7440 	  FPRINTF(fp,"<intron %d>\n",exonno);
7441 	  PUTC(this->genome,fp);
7442 	  column = 1;
7443 	}
7444       } else {
7445 	if (ngap > 0) {
7446 	  PUTC(this->genome,fp);
7447 	  if (++column % wraplength == 0) {
7448 	    PUTC('\n',fp);
7449 	    column = 0;
7450 	  }
7451 	}
7452       }
7453     } else if (this->comp == INTRONGAP_COMP) {
7454       /* Do nothing */
7455     } else {
7456       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
7457 	 SHORTGAP_COMP, or MISMATCH_COMP */
7458       if (in_exon == false) {
7459 	if (ngap > 0) {
7460 	  if (exonno > 0) {
7461 	    if (column != 0) {
7462 	      PUTC('\n',fp);
7463 	      column = 0;
7464 	    }
7465 	    FPRINTF(fp,"</intron>\n");
7466 	  }
7467 	}
7468 	FPRINTF(fp,"<exon %d",++exonno);
7469 	if (cdnap == true) {
7470 	  if (this->aaphase_e >= 0) {
7471 	    FPRINTF(fp,", phase %d",this->aaphase_e);
7472 	  }
7473 	} else {
7474 	  if (this->aaphase_g >= 0) {
7475 	    FPRINTF(fp,", phase %d",this->aaphase_g);
7476 	  }
7477 	}
7478 	FPRINTF(fp,">\n");
7479 	in_exon = true;
7480       }
7481       if (cdnap == true) {
7482 	if (this->cdna != ' ') {
7483 	  PUTC(this->cdna,fp);
7484 	  if (++column % wraplength == 0) {
7485 	    PUTC('\n',fp);
7486 	    column = 0;
7487 	  }
7488 	}
7489       } else {
7490 	if (this->genome != ' ') {
7491 	  PUTC(this->genome,fp);
7492 	  if (++column % wraplength == 0) {
7493 	    PUTC('\n',fp);
7494 	    column = 0;
7495 	  }
7496 	}
7497       }
7498     }
7499   }
7500   if (column != 0) {
7501     PUTC('\n',fp);
7502   }
7503   FPRINTF(fp,"</exon>\n");
7504 
7505   return;
7506 }
7507 
7508 
7509 int
Pair_nmatches_posttrim(int * max_match_length,List_T pairs,int pos5,int pos3)7510 Pair_nmatches_posttrim (int *max_match_length, List_T pairs, int pos5, int pos3) {
7511   int nmatches = 0, match_length;
7512   bool in_intron = false;
7513   /* bool indelp = false; */
7514   List_T p;
7515   T this;
7516 
7517   *max_match_length = match_length = 0;
7518   for (p = pairs; p != NULL; p = p->rest) {
7519     this = p->first;
7520     if (this->gapp) {
7521       if (!in_intron) {
7522 	in_intron = true;
7523       }
7524     } else {
7525       if (in_intron) {
7526 	in_intron = false;
7527       }
7528       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7529 	/* indelp = true; */
7530 #ifndef PMAP
7531       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7532 	/* (*unknowns)++; */
7533 #endif
7534       } else if (this->querypos < pos5) {
7535 	/* Don't count match or mismatch */
7536       } else if (this->querypos >= pos3) {
7537 	/* Don't count match or mismatch */
7538       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7539 	nmatches++;
7540 	match_length++;
7541       } else if (this->comp == MISMATCH_COMP) {
7542 	/* (*mismatches)++; */
7543 	if (match_length > *max_match_length) {
7544 	  *max_match_length = match_length;
7545 	}
7546 	match_length = 0;
7547       } else {
7548 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7549 	abort();
7550       }
7551     }
7552   }
7553 
7554   if (match_length > *max_match_length) {
7555     *max_match_length = match_length;
7556   }
7557 
7558   return nmatches;
7559 }
7560 
7561 
7562 int
Pair_array_nmatches_posttrim(struct T * pairarray,int npairs,int pos5,int pos3)7563 Pair_array_nmatches_posttrim (struct T *pairarray, int npairs, int pos5, int pos3) {
7564   int nmatches = 0;
7565   bool in_intron = false;
7566   /* bool indelp = false; */
7567   int i;
7568   T this;
7569 
7570   for (i = 0; i < npairs; i++) {
7571     this = &(pairarray[i]);
7572     if (this->gapp) {
7573       if (!in_intron) {
7574 	in_intron = true;
7575       }
7576     } else {
7577       if (in_intron) {
7578 	in_intron = false;
7579       }
7580       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7581 	/* indelp = true; */
7582 #ifndef PMAP
7583       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7584 	/* (*unknowns)++; */
7585 #endif
7586       } else if (this->querypos < pos5) {
7587 	/* Don't count match or mismatch */
7588       } else if (this->querypos >= pos3) {
7589 	/* Don't count match or mismatch */
7590       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7591 	nmatches++;
7592       } else if (this->comp == MISMATCH_COMP) {
7593 	/* (*mismatches)++; */
7594       } else {
7595 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7596 	abort();
7597       }
7598     }
7599   }
7600 
7601   return nmatches;
7602 }
7603 
7604 
7605 int
Pair_nmismatches_region(int * nindelbreaks,int * nbadintrons,struct T * pairs,int npairs,int trim_left,int trim_right,int start_amb_nmatches,int end_amb_nmatches,int querylength)7606 Pair_nmismatches_region (int *nindelbreaks, int *nbadintrons, struct T *pairs, int npairs,
7607 			 int trim_left, int trim_right, int start_amb_nmatches, int end_amb_nmatches,
7608 			 int querylength) {
7609   int nmismatches = 0;
7610   /* bool in_intron = false; */
7611   /* bool indelp = false; */
7612   bool in_exon = false;
7613   int i = 0;
7614   T this;
7615 
7616   *nindelbreaks = *nbadintrons = 0;
7617 
7618   /* Handle GMAP alignments that are not extended to the end */
7619   this = &(pairs[0]);
7620   if (this->querypos - start_amb_nmatches < trim_left) {
7621     /* Skip */
7622   } else {
7623     nmismatches += (this->querypos - start_amb_nmatches) - trim_left;
7624   }
7625 
7626   while (i < npairs) {
7627     this = &(pairs[i]);
7628 
7629     if (this->gapp) {
7630       if (in_exon == true) {
7631 	/* SPLICE START */
7632 	if (this->comp == FWD_CANONICAL_INTRON_COMP || this->comp == REV_CANONICAL_INTRON_COMP) {
7633 	  /* Okay */
7634 	} else {
7635 	  /* Count bad introns, even if outside of trimmed region */
7636 	  (*nbadintrons) += 1;
7637 	}
7638 	in_exon = false;
7639       }
7640 
7641     } else if (this->comp == INTRONGAP_COMP) {
7642       /* May want to print dinucleotides */
7643 
7644     } else {
7645       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
7646 	 SHORTGAP_COMP, or MISMATCH_COMP */
7647       if (in_exon == false) {
7648 	/* SPLICE CONTINUATION */
7649 	in_exon = true;
7650       }
7651       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7652 	/* Count indelbreaks, even if outside of trimmed region */
7653 	if (this->genome == ' ') {
7654 	  /* INSERTION */
7655 	  while (i < npairs && this->genome == ' ') {
7656 	    /* (*total_nindels) += 1; */
7657 	    this = &(pairs[i++]);
7658 	  }
7659 	  i--;
7660 	  (*nindelbreaks) += 1;
7661 
7662 	} else if (this->cdna == ' ') {
7663 	  /* DELETION */
7664 	  while (i < npairs && this->cdna == ' ') {
7665 	    /* (*total_nindels) -= 1; */
7666 	    this = &(pairs[i++]);
7667 	  }
7668 	  i--;
7669 	  (*nindelbreaks) += 1;
7670 	}
7671 
7672       } else if (this->querypos < trim_left) {
7673 	/* Skip for counting mismatches */
7674       } else if (this->querypos >= querylength - trim_right) {
7675 	/* Skip for counting mismatches */
7676       } else if (this->comp == MISMATCH_COMP) {
7677 	nmismatches++;
7678       }
7679     }
7680 
7681     i++;
7682   }
7683 
7684   /* Handle GMAP alignments that are not extended to the end */
7685   this = &(pairs[npairs-1]);
7686   if (this->querypos + end_amb_nmatches >= (querylength - 1) - trim_right) {
7687     /* Skip */
7688   } else {
7689     nmismatches += (querylength - 1 - trim_right) - (this->querypos + end_amb_nmatches);
7690   }
7691 
7692   return nmismatches;
7693 }
7694 
7695 
7696 
7697 int
Pair_goodness_simple(List_T pairs)7698 Pair_goodness_simple (List_T pairs) {
7699   int matches = 0, mismatches = 0;
7700   bool in_intron = false;
7701   List_T p;
7702   T this;
7703 
7704   for (p = pairs; p != NULL; p = p->rest) {
7705     this = p->first;
7706     if (this->gapp) {
7707       if (!in_intron) {
7708 	in_intron = true;
7709       }
7710     } else {
7711       if (in_intron) {
7712 	in_intron = false;
7713       }
7714       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7715 
7716 #ifndef PMAP
7717       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7718 	/* (unknowns)++; */
7719 #endif
7720       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7721 	matches++;
7722       } else if (this->comp == MISMATCH_COMP) {
7723 	mismatches++;
7724       } else {
7725 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7726 	abort();
7727       }
7728     }
7729   }
7730 
7731   return matches + MISMATCH*mismatches;
7732 }
7733 
7734 
7735 void
Pair_fracidentity_simple(int * matches,int * unknowns,int * mismatches,List_T pairs)7736 Pair_fracidentity_simple (int *matches, int *unknowns, int *mismatches, List_T pairs) {
7737   bool in_intron = false;
7738   List_T p;
7739   T this;
7740 
7741   *matches = *unknowns = *mismatches = 0;
7742   for (p = pairs; p != NULL; p = p->rest) {
7743     this = p->first;
7744     if (this->gapp) {
7745       if (!in_intron) {
7746 	in_intron = true;
7747       }
7748     } else {
7749       if (in_intron) {
7750 	in_intron = false;
7751       }
7752       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7753 #ifndef PMAP
7754       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7755 	(*unknowns)++;
7756 #endif
7757       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7758 	(*matches)++;
7759       } else if (this->comp == MISMATCH_COMP) {
7760 	(*mismatches)++;
7761       } else {
7762 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7763 	abort();
7764       }
7765     }
7766   }
7767 
7768   return;
7769 }
7770 
7771 
7772 void
Pair_fracidentity(int * matches,int * unknowns,int * mismatches,int * qopens,int * qindels,int * topens,int * tindels,int * ncanonical,int * nsemicanonical,int * nnoncanonical,double * min_splice_prob,List_T pairs,int cdna_direction)7773 Pair_fracidentity (int *matches, int *unknowns, int *mismatches, int *qopens, int *qindels,
7774 		   int *topens, int *tindels, int *ncanonical, int *nsemicanonical, int *nnoncanonical,
7775 		   double *min_splice_prob, List_T pairs, int cdna_direction) {
7776   bool in_intron = false;
7777   List_T p;
7778   T this, prev = NULL;
7779 
7780   *matches = *unknowns = *mismatches = *qopens = *qindels = *topens = *tindels =
7781     *ncanonical = *nsemicanonical = *nnoncanonical = 0;
7782   *min_splice_prob = 1.0;
7783 
7784   for (p = pairs; p != NULL; p = p->rest) {
7785     this = p->first;
7786     if (this->gapp) {
7787       if (this->donor_prob < *min_splice_prob) {
7788 	*min_splice_prob = this->donor_prob;
7789       }
7790       if (this->acceptor_prob < *min_splice_prob) {
7791 	*min_splice_prob = this->acceptor_prob;
7792       }
7793       if (!in_intron) {
7794 	if (cdna_direction > 0) {
7795 	  if (this->comp == FWD_CANONICAL_INTRON_COMP) {
7796 	    (*ncanonical)++;
7797 	    in_intron = true;
7798 	  } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
7799 	    (*nsemicanonical)++;
7800 	    in_intron = true;
7801 	  } else if (this->genomejump - this->queryjump < 50) {
7802 	    (*topens)++;
7803 	    (*tindels) += this->genomejump - this->queryjump;
7804 	    /* in_intron = false */
7805 	  } else if (this->comp == NONINTRON_COMP) {
7806 	    (*nnoncanonical)++;
7807 	    in_intron = true;
7808 	  }
7809 
7810 	} else if (cdna_direction < 0) {
7811 	  if (this->comp == REV_CANONICAL_INTRON_COMP) {
7812 	    (*ncanonical)++;
7813 	    in_intron = true;
7814 	  } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
7815 	    (*nsemicanonical)++;
7816 	    in_intron = true;
7817 	  } else if (this->genomejump - this->queryjump < 50) {
7818 	    (*topens)++;
7819 	    (*tindels) += this->genomejump - this->queryjump;
7820 	    /* in_intron = false */
7821 	  } else if (this->comp == NONINTRON_COMP) {
7822 	    (*nnoncanonical)++;
7823 	    in_intron = true;
7824 	  }
7825 	}
7826       }
7827     } else {
7828       if (in_intron) {
7829 	in_intron = false;
7830       }
7831       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7832 	if (this->cdna == ' ') {
7833 	  (*tindels)++;		/* If genome has extra char, count it as a genome skip */
7834 	  if (prev && prev->cdna != ' ') {
7835 	    (*topens)++;
7836 	  }
7837 	} else if (this->genome == ' ') {
7838 	  (*qindels)++;
7839 	  if (prev && prev->genome != ' ') {
7840 	    (*qopens)++;
7841 	  }
7842 	} else {
7843 	  fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
7844 		  this->comp,this->cdna,this->genome);
7845 	  abort();
7846 	}
7847 #ifndef PMAP
7848       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7849 	(*unknowns)++;
7850 #endif
7851       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7852 	(*matches)++;
7853       } else if (this->comp == MISMATCH_COMP) {
7854 	(*mismatches)++;
7855       } else {
7856 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7857 	abort();
7858       }
7859     }
7860     prev = this;
7861   }
7862 
7863   return;
7864 }
7865 
7866 
7867 int
Pair_fracidentity_array(int * matches,int * unknowns,int * mismatches,int * qopens,int * qindels,int * topens,int * tindels,int * ncanonical,int * nsemicanonical,int * nnoncanonical,double * min_splice_prob,struct T * ptr,int npairs,int cdna_direction)7868 Pair_fracidentity_array (int *matches, int *unknowns, int *mismatches, int *qopens, int *qindels,
7869 			 int *topens, int *tindels, int *ncanonical, int *nsemicanonical, int *nnoncanonical,
7870 			 double *min_splice_prob, struct T *ptr, int npairs, int cdna_direction) {
7871   bool in_intron = false;
7872   int i;
7873   T this, prev = NULL;
7874 
7875   *matches = *unknowns = *mismatches = *qopens = *qindels = *topens = *tindels =
7876     *ncanonical = *nsemicanonical = *nnoncanonical = 0;
7877   *min_splice_prob = 1.0;
7878 
7879   for (i = 0; i < npairs; i++) {
7880     this = ptr++;
7881     if (this->gapp) {
7882       if (this->donor_prob < *min_splice_prob) {
7883 	*min_splice_prob = this->donor_prob;
7884       }
7885       if (this->acceptor_prob < *min_splice_prob) {
7886 	*min_splice_prob = this->acceptor_prob;
7887       }
7888       if (!in_intron) {
7889 	if (cdna_direction > 0) {
7890 	  if (this->comp == FWD_CANONICAL_INTRON_COMP) {
7891 	    (*ncanonical)++;
7892 	    in_intron = true;
7893 	  } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
7894 	    (*nsemicanonical)++;
7895 	    in_intron = true;
7896 	  } else if (this->genomejump - this->queryjump < 50) {
7897 	    (*topens)++;
7898 	    (*tindels) += this->genomejump - this->queryjump;
7899 	    /* in_intron = false */
7900 	  } else if (this->comp == NONINTRON_COMP) {
7901 	    (*nnoncanonical)++;
7902 	    in_intron = true;
7903 	  }
7904 
7905 	} else if (cdna_direction < 0) {
7906 	  if (this->comp == REV_CANONICAL_INTRON_COMP) {
7907 	    (*ncanonical)++;
7908 	    in_intron = true;
7909 	  } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
7910 	    (*nsemicanonical)++;
7911 	    in_intron = true;
7912 	  } else if (this->genomejump - this->queryjump < 50) {
7913 	    (*topens)++;
7914 	    (*tindels) += this->genomejump - this->queryjump;
7915 	    /* in_intron = false */
7916 	  } else if (this->comp == NONINTRON_COMP) {
7917 	    (*nnoncanonical)++;
7918 	    in_intron = true;
7919 	  }
7920 	}
7921       }
7922     } else {
7923       if (in_intron) {
7924 	in_intron = false;
7925       }
7926       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
7927 	if (this->cdna == ' ') {
7928 	  (*tindels)++;		/* If genome has extra char, count it as a genome skip */
7929 	  if (prev && prev->cdna != ' ') {
7930 	    (*topens)++;
7931 	  }
7932 	} else if (this->genome == ' ') {
7933 	  (*qindels)++;
7934 	  if (prev && prev->genome != ' ') {
7935 	    (*qopens)++;
7936 	  }
7937 	} else {
7938 	  fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
7939 		  this->comp,this->cdna,this->genome);
7940 	  abort();
7941 	}
7942 #ifndef PMAP
7943       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
7944 	(*unknowns)++;
7945 #endif
7946       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
7947 	(*matches)++;
7948       } else if (this->comp == MISMATCH_COMP) {
7949 	(*mismatches)++;
7950       } else {
7951 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
7952 	abort();
7953       }
7954     }
7955     prev = this;
7956   }
7957 
7958   return (*matches) + MISMATCH*(*mismatches)
7959     + QOPEN*(*qopens) + QINDEL*(*qindels) + TOPEN*(*topens) + TINDEL*(*tindels)
7960     - CANONICAL_POINTS*(*nnoncanonical);
7961 }
7962 
7963 
/* Everything through the matching #endif is disabled with #if 0 and is
   therefore never compiled. */
7964 #if 0
7965 /* Called on first and last exons during distal/medial calculation */
7966 /* Procedure seems to give random results */
/* Walks the list while keeping a running score (MATCH, MISMATCH,
   QINDEL/QOPEN, TINDEL/TOPEN) and returns the 1-based index of the
   match at which the running score first reaches its overall maximum.
   Returns 0 if the running score never rises above 0.  Gap pairs are
   not scored.  cdna_direction is referenced only inside the nested
   #if 0 section below. */
7967 int
7968 Pair_fracidentity_changepoint (List_T pairs, int cdna_direction) {
7969   int changepoint = 0, maxscore = 0, score = 0;
7970   int i = 0;
7971 
7972   bool in_intron = false;
7973   List_T p;
7974   T this, prev = NULL;
7975 
7976   for (p = pairs; p != NULL; p = p->rest) {
7977     i++;
7978     this = p->first;
7979     debug3(FPRINTF(fp,"%d: ",i));
7980     debug3(Pair_dump_one(this,/*zerobasedp*/false));
7981     if (this->gapp) {
7982       if (!in_intron) {
	/* NOTE(review): the nested section refers to ncanonical,
	   nsemicanonical and nnoncanonical, which are not declared in
	   this function; it could not be enabled as written. */
7983 #if 0
7984 	/* Don't expect an intron */
7985 	if (cdna_direction > 0) {
7986 	  if (this->comp == FWD_CANONICAL_INTRON_COMP) {
7987 	    (*ncanonical)++;
7988 	  } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
7989 	    (*nsemicanonical)++;
7990 	  } else if (this->comp == NONINTRON_COMP) {
7991 	    (*nnoncanonical)++;
7992 	  }
7993 
7994 	} else if (cdna_direction < 0) {
7995 	  if (this->comp == REV_CANONICAL_INTRON_COMP) {
7996 	    (*ncanonical)++;
7997 	  } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
7998 	    (*nsemicanonical)++;
7999 	  } else if (this->comp == NONINTRON_COMP) {
8000 	    (*nnoncanonical)++;
8001 	  }
8002 	}
8003 #endif
8004 	in_intron = true;
8005       }
8006     } else {
8007       if (in_intron) {
8008 	in_intron = false;
8009       }
8010       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
8011 	if (this->cdna == ' ') {
	  /* Genomic character with no cDNA partner */
8012 	  score += TINDEL;
	  /* Opening penalty only when the preceding pair had a cDNA character */
8013 	  if (prev && prev->cdna != ' ') {
8014 	    score += TOPEN;
8015 	  }
8016 	} else if (this->genome == ' ') {
	  /* cDNA character with no genomic partner */
8017 	  score += QINDEL;
8018 	  if (prev && prev->genome != ' ') {
8019 	    score += QOPEN;
8020 	  }
8021 	} else {
8022 	  fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
8023 		  this->comp,this->cdna,this->genome);
8024 	  abort();
8025 	}
8026 #ifndef PMAP
      /* Non-PMAP builds: unknown or ambiguous positions leave the score unchanged */
8027       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
8028 	/* (*unknowns)++; */
8029 #endif
8030       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
8031 #if 0
8032 	score += (MATCH + MATCH); /* Give more weight to matches to allow for poor quality at ends */
8033 #else
8034 	score += MATCH;
8035 #endif
	/* The changepoint is only ever moved on a match, and only by a strictly higher score */
8036 	if (score > maxscore) {
8037 	  maxscore = score;
8038 	  changepoint = i;
8039 	  debug3(FPRINTF(fp," => maxscore %d",maxscore));
8040 	}
8041       } else if (this->comp == MISMATCH_COMP) {
8042 	score += MISMATCH;
8043       } else {
8044 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
8045 	abort();
8046       }
8047     }
8048     debug3(FPRINTF(fp,"\n"));
8049     prev = this;
8050   }
8051 
8052   return changepoint;
8053 }
8054 #endif
8055 
8056 
8057 int
Pair_fracidentity_score(List_T pairs)8058 Pair_fracidentity_score (List_T pairs) {
8059   int score = 0;
8060   int i = 0;
8061 
8062   bool in_intron = false;
8063   List_T p;
8064   T this, prev = NULL;
8065 
8066   for (p = pairs; p != NULL; p = p->rest) {
8067     i++;
8068     this = p->first;
8069     debug3(FPRINTF(fp,"%d: ",i));
8070     debug3(Pair_dump_one(this,/*zerobasedp*/false));
8071     if (this->gapp) {
8072       if (!in_intron) {
8073 	in_intron = true;
8074       }
8075     } else {
8076       if (in_intron) {
8077 	in_intron = false;
8078       }
8079       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
8080 	if (this->cdna == ' ') {
8081 	  score += TINDEL;
8082 	  if (prev && prev->cdna != ' ') {
8083 	    score += TOPEN;
8084 	  }
8085 	} else if (this->genome == ' ') {
8086 	  score += QINDEL;
8087 	  if (prev && prev->genome != ' ') {
8088 	    score += QOPEN;
8089 	  }
8090 	} else {
8091 	  fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
8092 		  this->comp,this->cdna,this->genome);
8093 	  abort();
8094 	}
8095 #ifndef PMAP
8096       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
8097 	/* (*unknowns)++; */
8098 #endif
8099       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
8100 	score += MATCH;
8101       } else if (this->comp == MISMATCH_COMP) {
8102 	score += MISMATCH;
8103       } else {
8104 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
8105 	abort();
8106       }
8107     }
8108     debug3(FPRINTF(fp,"\n"));
8109     prev = this;
8110   }
8111 
8112   return score;
8113 }
8114 
8115 
8116 double
Pair_frac_error(List_T pairs,int cdna_direction)8117 Pair_frac_error (List_T pairs, int cdna_direction) {
8118   int matches, unknowns, mismatches, qopens, qindels,
8119     topens, tindels, ncanonical, nsemicanonical, nnoncanonical;
8120   int den;
8121   double min_splice_prob;
8122 
8123   Pair_fracidentity(&matches,&unknowns,&mismatches,&qopens,&qindels,
8124 		    &topens,&tindels,&ncanonical,&nsemicanonical,&nnoncanonical,
8125 		    &min_splice_prob,pairs,cdna_direction);
8126 
8127   if ((den = matches + mismatches + qindels + tindels) == 0) {
8128     return 1.0;
8129   } else {
8130     return (double) (mismatches + qindels + tindels)/(double) den;
8131   }
8132 }
8133 
8134 void
Pair_fracidentity_bounded(int * matches,int * unknowns,int * mismatches,int * qopens,int * qindels,int * topens,int * tindels,int * ncanonical,int * nsemicanonical,int * nnoncanonical,struct T * ptr,int npairs,int cdna_direction,int minpos,int maxpos)8135 Pair_fracidentity_bounded (int *matches, int *unknowns, int *mismatches,
8136 			   int *qopens, int *qindels, int *topens, int *tindels,
8137 			   int *ncanonical, int *nsemicanonical, int *nnoncanonical,
8138 			   struct T *ptr, int npairs,
8139 			   int cdna_direction, int minpos, int maxpos) {
8140   bool in_intron = false;
8141   T this, prev = NULL;
8142   int i;
8143 
8144   *matches = *unknowns = *mismatches = *qopens = *qindels = *topens = *tindels =
8145     *ncanonical = *nsemicanonical = *nnoncanonical = 0;
8146 
8147   for (i = 0; i < npairs; i++) {
8148     this = ptr++;
8149     if (this->gapp) {
8150       if (!in_intron) {
8151 	if (this->querypos >= minpos && this->querypos <= maxpos) {
8152 	  if (this->comp == FWD_CANONICAL_INTRON_COMP) {
8153 	    (*ncanonical)++;
8154 	  } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
8155 	    (*nsemicanonical)++;
8156 	  } else if (this->comp == NONINTRON_COMP) {
8157 	    (*nnoncanonical)++;
8158 	  }
8159 	} else if (cdna_direction < 0) {
8160 	  if (this->comp == REV_CANONICAL_INTRON_COMP) {
8161 	    (*ncanonical)++;
8162 	  } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
8163 	    (*nsemicanonical)++;
8164 	  } else if (this->comp == NONINTRON_COMP) {
8165 	    (*nnoncanonical)++;
8166 	  }
8167 	}
8168 	in_intron = true;
8169       }
8170     } else {
8171       if (in_intron) {
8172 	in_intron = false;
8173       }
8174       if (this->querypos >= minpos && this->querypos <= maxpos) {
8175 	if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
8176 	  if (this->cdna == ' ') {
8177 	    (*tindels)++;		/* If genome has extra char, count it as a genome skip */
8178 	    if (prev && prev->cdna != ' ') {
8179 	      (*topens)++;
8180 	    }
8181 	  } else if (this->genome == ' ') {
8182 	    (*qindels)++;
8183 	    if (prev && prev->genome != ' ') {
8184 	      (*qopens)++;
8185 	    }
8186 	  } else {
8187 	    fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
8188 		    this->comp,this->cdna,this->genome);
8189 	    abort();
8190 	  }
8191 #ifndef PMAP
8192 	} else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
8193 	  (*unknowns)++;
8194 #endif
8195 	} else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
8196 	  (*matches)++;
8197 	} else if (this->comp == MISMATCH_COMP) {
8198 	  (*mismatches)++;
8199 	} else {
8200 	  fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
8201 	  abort();
8202 	}
8203       }
8204     }
8205     prev = this;
8206   }
8207   return;
8208 }
8209 
/* Exception object for an out-of-range array index.  Within the code
   visible here it is referenced only by the RAISE in Pair_pathscores. */
8210 static const Except_T Array_bounds_error = { "Exceeded array bounds" };
8211 
8212 
8213 void
Pair_matchscores(int * matchscores,struct T * ptr,int npairs)8214 Pair_matchscores (int *matchscores, struct T *ptr, int npairs) {
8215   T this;
8216   int querypos;
8217   int i;
8218 
8219   for (i = 0; i < npairs; i++) {
8220     this = ptr++;
8221     querypos = this->querypos;
8222 
8223     if (this->gapp) {
8224       matchscores[querypos] = 0;	/* Count as mismatch; make evidence support the gap */
8225     } else if (this->comp == MISMATCH_COMP) {
8226       matchscores[querypos] = 0; /* For mismatch */
8227     } else if (this->comp == INDEL_COMP) {
8228       matchscores[querypos] = -1;	/* Ignore indels */
8229     } else {
8230       matchscores[querypos] = 1; /* For match */
8231     }
8232   }
8233 
8234   return;
8235 }
8236 
8237 
8238 int
Pair_maxnegscore(List_T pairs)8239 Pair_maxnegscore (List_T pairs) {
8240   int maxnegscore = 0, prevhigh = 0, score = 0;
8241   T this;
8242   List_T p = pairs;
8243 
8244   while (p != NULL) {
8245     this = p->first;
8246     debug11(Pair_dump_one(this,/*zerobasedp*/true));
8247 
8248     if (this->gapp) {
8249       /* Skip */
8250       p = p->rest;
8251 
8252     } else if (this->comp == MISMATCH_COMP) {
8253       score += MISMATCH;
8254       if (score - prevhigh < maxnegscore) {
8255 	maxnegscore = score - prevhigh;
8256       }
8257       p = p->rest;
8258 
8259     } else if (this->comp == INDEL_COMP) {
8260       score += QOPEN + QINDEL;
8261       p = p->rest;
8262       while (p != NULL && ((T) p->first)->comp == INDEL_COMP) {
8263 	score += QINDEL;
8264 	p = p->rest;
8265       }
8266       if (score - prevhigh < maxnegscore) {
8267 	maxnegscore = score - prevhigh;
8268       }
8269 
8270     } else {
8271       score += MATCH;
8272       if (score > prevhigh) {
8273 	prevhigh = score;
8274       }
8275       p = p->rest;
8276     }
8277 
8278     debug11(printf("  score %d, prevhigh %d, maxnegscore %d\n",score,prevhigh,maxnegscore));
8279   }
8280 
8281   return maxnegscore;
8282 }
8283 
8284 
8285 void
Pair_pathscores(bool * gapp,int * pathscores,struct T * ptr,int npairs,int cdna_direction,int querylength,cDNAEnd_T cdnaend,int pre_extension_slop)8286 Pair_pathscores (bool *gapp, int *pathscores, struct T *ptr, int npairs,
8287 		 int cdna_direction, int querylength, cDNAEnd_T cdnaend, int pre_extension_slop) {
8288   int querypos, querystart, queryend;
8289   int basescore;
8290   bool in_intron = false;
8291   T this, prev = NULL;
8292   int i;
8293 
8294   /* Determine these before ptr changes */
8295   this = &(ptr[0]);
8296   querystart = this->querypos;
8297   this = &(ptr[npairs-1]);
8298   queryend = this->querypos;
8299   /* printf("Entered Pair_pathscores with querystart %d and queryend %d\n",querystart,queryend); */
8300 
8301   /* Allow transitions slightly outside of the ends
8302      (pre_extension_slop) when finding non-extended paths to pair, but
8303      not when finding the breakpoint for the final pair, which has
8304      been extended */
8305   if (cdnaend == FIVE) {
8306     /* left part of chimera */
8307     for (querypos = 0; querypos < querystart; querypos++) {
8308       gapp[querypos] = true;
8309     }
8310     for (querypos = queryend + 1 + pre_extension_slop; querypos < querylength; querypos++) {
8311       gapp[querypos] = true;
8312     }
8313   } else {
8314     /* right part of chimera */
8315     for (querypos = 0; querypos < querystart - pre_extension_slop; querypos++) {
8316       gapp[querypos] = true;
8317     }
8318     for (querypos = queryend + 1; querypos < querylength; querypos++) {
8319       gapp[querypos] = true;
8320     }
8321   }
8322 
8323   /* Initialize to cover the ends that aren't aligned */
8324   for (querypos = 0; querypos < querylength; querypos++) {
8325     pathscores[querypos] = QINDEL;
8326   }
8327 
8328   for (i = 0; i < npairs; i++) {
8329     this = ptr++;
8330 
8331     querypos = this->querypos;
8332     if (querypos >= querylength) {
8333       fprintf(stderr,"Pair_pathscores: querypos %d >= querylength %d\n",querypos,querylength);
8334       Pair_dump_array(ptr,npairs,/*zerobasedp*/true);
8335       fflush(stdout);
8336       abort();
8337       RAISE(Array_bounds_error);
8338     }
8339 
8340     if (this->gapp) {
8341       gapp[querypos] = true;
8342       if (in_intron == false) {
8343 	/* Adds only a single reward/penalty per intron */
8344 	if (cdna_direction > 0) {
8345 	  if (this->comp == FWD_CANONICAL_INTRON_COMP) {
8346 	    pathscores[querypos] = CANONICAL_POINTS;
8347 	  } else if (this->comp == FWD_GCAG_INTRON_COMP || this->comp == FWD_ATAC_INTRON_COMP) {
8348 	    pathscores[querypos] = SEMICANONICAL_POINTS;
8349 	  } else {
8350 	    pathscores[querypos] = NONCANONICAL_POINTS; /* noncanonical */
8351 	  }
8352 	} else if (cdna_direction < 0) {
8353 	  if (this->comp == REV_CANONICAL_INTRON_COMP) {
8354 	    pathscores[querypos] = CANONICAL_POINTS;
8355 	  } else if (this->comp == REV_GCAG_INTRON_COMP || this->comp == REV_ATAC_INTRON_COMP) {
8356 	    pathscores[querypos] = SEMICANONICAL_POINTS;
8357 	  } else {
8358 	    pathscores[querypos] = NONCANONICAL_POINTS; /* noncanonical */
8359 	  }
8360 	} else {
8361 	  pathscores[querypos] = NONCANONICAL_POINTS; /* indeterminate */
8362 	}
8363 	in_intron = true;
8364       }
8365 
8366     } else {
8367       if (in_intron) {
8368 	in_intron = false;
8369       }
8370       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
8371 	if (this->cdna == ' ') {
8372 	  pathscores[querypos] = TINDEL;
8373 	  if (prev && prev->cdna != ' ') {
8374 	    pathscores[querypos] = TOPEN;
8375 	  }
8376 	} else if (this->genome == ' ') {
8377 	  pathscores[querypos] = QINDEL;
8378 	  if (prev && prev->genome != ' ') {
8379 	    pathscores[querypos] = QOPEN;
8380 	  }
8381 	} else {
8382 	  fprintf(stderr,"Can't parse comp %c, cdna %c, genome %c\n",
8383 		  this->comp,this->cdna,this->genome);
8384 	  abort();
8385 	}
8386 #ifndef PMAP
8387       } else if (unknown_base(this->cdna) || unknown_base(this->genome) || this->comp == AMBIGUOUS_COMP) {
8388 	/* (*unknowns)++; */
8389 #endif
8390       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
8391 	pathscores[querypos] = +1; /* For match */
8392       } else if (this->comp == MISMATCH_COMP) {
8393 	pathscores[querypos] = MISMATCH;
8394       } else {
8395 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
8396 	abort();
8397       }
8398     }
8399     prev = this;
8400   }
8401 
8402 #if 0
8403   /* Gets querystart to queryend inclusive */
8404   if (0 && querystart == 0) {
8405     for (i = 1; i <= queryend; i++) {
8406       pathscores[i] += pathscores[i-1];
8407     }
8408   } else {
8409     for (i = querystart; i <= queryend; i++) {
8410       pathscores[i] += pathscores[i-1];
8411     }
8412   }
8413 #endif
8414 
8415 #if 0
8416   if (cdnaend == FIVE) {
8417     for (i = queryend + 1; i < querylength; i++) {
8418       pathscores[i] = pathscores[i-1] + QINDEL;
8419     }
8420   } else if (cdnaend == THREE) {
8421     for (i = querystart - 1; i >= 0; --i) {
8422       pathscores[i] = pathscores[i+1] - QINDEL;
8423     }
8424     for (i = queryend + 1; i < querylength; i++) {
8425       pathscores[i] = pathscores[i-1];
8426     }
8427   }
8428 #endif
8429 
8430   if (cdnaend == FIVE) {
8431     for (i = 1; i < querylength; i++) {
8432       pathscores[i] += pathscores[i-1];
8433     }
8434     basescore = pathscores[querystart];
8435   } else if (cdnaend == THREE) {
8436     for (i = querylength-2; i >= 0; --i) {
8437       pathscores[i] += pathscores[i+1];
8438     }
8439     basescore = pathscores[queryend];
8440   }
8441 
8442   for (i = 0; i < querylength; i++) {
8443     pathscores[i] -= basescore;
8444   }
8445 
8446   return;
8447 }
8448 
8449 
8450 int
Pair_nexons_approx(List_T pairs)8451 Pair_nexons_approx (List_T pairs) {
8452   int nexons = 0;
8453   bool in_exon = false;
8454   T this;
8455   List_T p;
8456 
8457   for (p = pairs; p != NULL; p = List_next(p)) {
8458     this = List_head(p);
8459     if (this->gapp) {
8460       if (in_exon) {
8461 	in_exon = false;
8462       }
8463     } else {
8464       if (!in_exon) {
8465 	nexons++;
8466 	in_exon = true;
8467       }
8468     }
8469   }
8470 
8471   return nexons;
8472 }
8473 
8474 
8475 int
Pair_nexons(struct T * pairs,int npairs)8476 Pair_nexons (struct T *pairs, int npairs) {
8477   int nexons = 0;
8478   struct T *ptr, *this = NULL;
8479   bool in_exon = false;
8480   int i;
8481 
8482   ptr = pairs;
8483   for (i = 0; i < npairs; i++) {
8484     this = ptr++;
8485     if (this->gapp) {
8486       if (in_exon) {
8487 	in_exon = false;
8488       }
8489     } else if (this->comp == INTRONGAP_COMP) {
8490       /* Do nothing */
8491     } else {
8492       if (!in_exon) {
8493 	nexons++;
8494 	in_exon = true;
8495       }
8496     }
8497   }
8498 
8499   return nexons;
8500 }
8501 
8502 
8503 bool
Pair_consistentp(int * ncanonical,struct T * pairs,int npairs,int cdna_direction)8504 Pair_consistentp (int *ncanonical, struct T *pairs, int npairs, int cdna_direction) {
8505   bool in_intron = false;
8506   struct T *this;
8507   int i;
8508 
8509   *ncanonical = 0;
8510   for (i = 0; i < npairs; i++) {
8511     this = pairs++;
8512     if (this->gapp) {
8513       if (!in_intron) {
8514 	if (cdna_direction > 0) {
8515 	  if (this->comp == REV_CANONICAL_INTRON_COMP ||
8516 	      this->comp == REV_GCAG_INTRON_COMP ||
8517 	      this->comp == REV_ATAC_INTRON_COMP) {
8518 	    return false;
8519 	  } else if (this->comp == FWD_CANONICAL_INTRON_COMP) {
8520 	    (*ncanonical)++;
8521 	  }
8522 	} else if (cdna_direction < 0) {
8523 	  if (this->comp == FWD_CANONICAL_INTRON_COMP ||
8524 	      this->comp == FWD_GCAG_INTRON_COMP ||
8525 	      this->comp == FWD_ATAC_INTRON_COMP) {
8526 	    return false;
8527 	  } else if (this->comp == REV_CANONICAL_INTRON_COMP) {
8528 	    (*ncanonical)++;
8529 	  }
8530 	} else if (cdna_direction == 0) {
8531 	  /* Set cdna_direction for next time */
8532 	  if (this->comp == FWD_CANONICAL_INTRON_COMP ||
8533 	      this->comp == FWD_GCAG_INTRON_COMP ||
8534 	      this->comp == FWD_ATAC_INTRON_COMP) {
8535 	    cdna_direction = +1;
8536 	  } else if (this->comp == REV_CANONICAL_INTRON_COMP ||
8537 		     this->comp == REV_GCAG_INTRON_COMP ||
8538 		     this->comp == REV_ATAC_INTRON_COMP) {
8539 	    cdna_direction = -1;
8540 	  }
8541 	}
8542 	in_intron = true;
8543       }
8544     } else {
8545       if (in_intron) {
8546 	in_intron = false;
8547       }
8548     }
8549   }
8550 
8551   return true;
8552 }
8553 
8554 
#if 0
/* Disabled.  Rewrites a donor/acceptor dinucleotide pair in place:
   donor[k] and acceptor[1-k] trade places for k = 0, 1, each character
   being mapped through complCode[] as it moves. */
static void
invert_intron (char *donor, char *acceptor) {
  char held;
  int k;

  for (k = 0; k < 2; k++) {
    held = donor[k];
    donor[k] = complCode[(int) acceptor[1 - k]];
    acceptor[1 - k] = complCode[(int) held];
  }

  return;
}
#endif
8571 
8572 
8573 void
Pair_print_protein_genomic(Filestring_T fp,struct T * ptr,int npairs,int wraplength,bool forwardp)8574 Pair_print_protein_genomic (Filestring_T fp, struct T *ptr, int npairs, int wraplength, bool forwardp) {
8575   struct T *this;
8576   int xpos = 0, i;
8577 
8578   if (forwardp == true) {
8579     for (i = 0; i < npairs; i++) {
8580       this = ptr++;
8581       if (this->aa_g != ' ') {
8582 	if (xpos == wraplength) {
8583 	  PUTC('\n',fp);
8584 	  xpos = 0;
8585 	}
8586 #ifdef PMAP
8587 	PUTC(this->aa_g,fp);
8588 	xpos++;
8589 #else
8590 	if (this->aa_g != '*') {
8591 	  PUTC(this->aa_g,fp);
8592 	  xpos++;
8593 	}
8594 #endif
8595       }
8596     }
8597     PUTC('\n',fp);
8598 
8599   } else {
8600     for (i = npairs-1; i >= 0; i--) {
8601       this = ptr--;
8602       if (this->aa_g != ' ') {
8603 	if (xpos == wraplength) {
8604 	  PUTC('\n',fp);
8605 	  xpos = 0;
8606 	}
8607 #ifdef PMAP
8608 	abort();
8609 	PUTC(this->aa_g,fp);
8610 	xpos++;
8611 #else
8612 	if (this->aa_g != '*') {
8613 	  PUTC(this->aa_g,fp);
8614 	  xpos++;
8615 	}
8616 #endif
8617       }
8618     }
8619     PUTC('\n',fp);
8620 
8621   }
8622 
8623   return;
8624 }
8625 
8626 #ifdef PMAP
8627 void
Pair_print_nucleotide_cdna(Filestring_T fp,struct T * ptr,int npairs,int wraplength)8628 Pair_print_nucleotide_cdna (Filestring_T fp, struct T *ptr, int npairs, int wraplength) {
8629   struct T *this;
8630   int xpos = 0, i;
8631 
8632   for (i = 0; i < npairs; i++) {
8633     this = ptr++;
8634     if (this->cdna != ' ') {
8635       if (xpos == wraplength) {
8636 	PUTC('\n',fp);
8637 	xpos = 0;
8638       }
8639       PUTC(this->cdna,fp);
8640       xpos++;
8641     }
8642   }
8643   PUTC('\n',fp);
8644   return;
8645 }
8646 #else
8647 void
Pair_print_protein_cdna(Filestring_T fp,struct T * ptr,int npairs,int wraplength,bool forwardp)8648 Pair_print_protein_cdna (Filestring_T fp, struct T *ptr, int npairs, int wraplength, bool forwardp) {
8649   struct T *this;
8650   int xpos = 0, i;
8651 
8652   if (forwardp == true) {
8653     for (i = 0; i < npairs; i++) {
8654       this = ptr++;
8655       if (this->aa_e != ' ') {
8656 	if (xpos == wraplength) {
8657 	  PUTC('\n',fp);
8658 	  xpos = 0;
8659 	}
8660 	if (this->aa_e != '*') {
8661 	  PUTC(this->aa_e,fp);
8662 	  xpos++;
8663 	}
8664       }
8665     }
8666     PUTC('\n',fp);
8667 
8668   } else {
8669     for (i = npairs-1; i >= 0; i--) {
8670       this = ptr--;
8671       if (this->aa_e != ' ') {
8672 	if (xpos == wraplength) {
8673 	  PUTC('\n',fp);
8674 	  xpos = 0;
8675 	}
8676 	if (this->aa_e != '*') {
8677 	  PUTC(this->aa_e,fp);
8678 	  xpos++;
8679 	}
8680       }
8681     }
8682     PUTC('\n',fp);
8683   }
8684 
8685   return;
8686 }
8687 #endif
8688 
8689 
/* Everything through the matching #endif is disabled with #if 0 and is
   therefore never compiled. */
8690 #if 0
/* Prints one alignment in a compressed text form.

   First a header line: '>' and the query accession; the database
   version (or the user segment's accession, or "user-provided");
   pathnum/npaths; query length; nexons; coverage and identity as
   percentages with one decimal; the query range; the genomic range as
   both universal and chromosomal coordinates; strand ('+' when the
   first genomic position is <= the last); "dir:sense", "dir:antisense"
   or "dir:indet"; then optional md5, chimera and strain fields.

   Then one line per exon: genomic start/end, query start/end, percent
   identity, a run-length token string, the exon length in query
   positions, and (for all but the last exon) the length of the
   following intron plus its dinucleotides when they are not GT-AG.

   Relies on push_token, print_tokens_compressed and invert_intron,
   which are defined outside this block. */
8691 void
8692 Pair_print_compressed_old (Filestring_T fp, int pathnum, int npaths, T start, T end, Sequence_T queryseq, char *dbversion,
8693 			   Sequence_T usersegment, int nexons, double fracidentity,
8694 			   struct T *pairs, int npairs, Chrnum_T chrnum,
8695 			   Univcoord_T chroffset, Univ_IIT_T chromosome_iit, int querylength_given,
8696 			   int skiplength, int trim_start, int trim_end, bool checksump,
8697 			   int chimerapos, int chimeraequivpos, double donor_prob, double acceptor_prob,
8698 			   int chimera_cdna_direction, char *strain, bool watsonp, int cdna_direction) {
8699   Chrpos_T chrpos1, chrpos2;
8700   Univcoord_T position1, position2;
8701 
8702   bool in_exon = false;
8703   List_T tokens = NULL;
8704   struct T *ptr = pairs, *this = NULL;
8705   int querypos1, querypos2;
8706   int exon_querystart = -1, exon_queryend;
8707   Chrpos_T exon_genomestart = 0, exon_genomeend, intron_start, intron_end;
8708   int num = 0, den = 0, runlength = 0, i;
8709   int print_dinucleotide_p;
  /* NOTE(review): token is 11 bytes and is filled with unbounded
     sprintf calls such as "%d^%c"; confirm runlength can never be
     large enough to overflow it. */
8710   char token[11], donor[3], acceptor[3], *chr;
8711   double coverage;
8712   /* double trimmed_coverage; */
8713   int last_querypos = -1;
8714   Chrpos_T last_genomepos = (Chrpos_T) -1;
8715 
8716   donor[0] = donor[1] = donor[2] = '\0';
8717   acceptor[0] = acceptor[1] = acceptor[2] = '\0';
8718 
  /* Coverage is computed from the start/end pairs passed in; start and
     end are re-pointed at the array ends further below */
8719   querypos1 = start->querypos;
8720   querypos2 = end->querypos;
8721 
8722   FPRINTF(fp,">%s ",Sequence_accession(queryseq));
8723   if (dbversion != NULL) {
8724     FPRINTF(fp,"%s ",dbversion);
8725   } else if (usersegment != NULL && Sequence_accession(usersegment) != NULL) {
8726     FPRINTF(fp,"%s ",Sequence_accession(usersegment));
8727   } else {
8728     FPRINTF(fp,"user-provided ");
8729   }
8730 #ifdef PMAP
  /* PMAP builds multiply the query length by 3 here */
8731   FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,(querylength_given+skiplength)*3,nexons);
8732   coverage = (double) (querypos2 - querypos1 + 1)/(double) ((querylength_given+skiplength)*3);
  /* NOTE(review): unlike the non-PMAP branch, this value is not divided by 10.0 */
8733   FPRINTF(fp," %.1f",((double) rint(1000.0*coverage)));
8734 #else
8735   coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given+skiplength);
  /* trim_start/trim_end are widened to include the aligned range; they
     are only consumed by the commented-out trimmed_coverage code */
8736   if (end->querypos + 1 > trim_end) {
8737     trim_end = end->querypos + 1;
8738   }
8739   if (start->querypos < trim_start) {
8740     trim_start = start->querypos;
8741   }
8742   /*
8743   trimmed_coverage = (double) (end->querypos - start->querypos + 1)/(double) (trim_end - trim_start + skiplength);
8744   FPRINTF(fp,">%s %s %d/%d %d(%d) %d",
8745 	 Sequence_accession(queryseq),dbversion,pathnum,npaths,
8746 	 querylength_given+skiplength,trim_end-trim_start,nexons);
8747   FPRINTF(fp," %.1f(%.1f)",((double) rint(1000.0*coverage))/10.0,((double) rint(1000.0*trimmed_coverage))/10.0);
8748   */
8749   FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,querylength_given+skiplength,nexons);
8750   FPRINTF(fp," %.1f",((double) rint(1000.0*coverage))/10.0);
8751 #endif
8752   FPRINTF(fp," %.1f",((double) rint(1000.0*fracidentity))/10.0);
8753 
  /* From here on, start and end refer to the first and last pairs of the array */
8754   start = &(pairs[0]);
8755   end = &(pairs[npairs-1]);
8756   FPRINTF(fp," %d%s%d",start->querypos + ONEBASEDP,"..",end->querypos + ONEBASEDP);
8757 
8758   chrpos1 = start->genomepos;
8759   chrpos2 = end->genomepos;
8760   position1 = chroffset + chrpos1;
8761   position2 = chroffset + chrpos2;
8762   FPRINTF(fp," %u%s%u",position1 + ONEBASEDP,"..",position2 + ONEBASEDP);
8763 
8764   if (chrnum == 0) {
8765     FPRINTF(fp," %u%s%u",chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
8766   } else {
8767     chr = Chrnum_to_string(chrnum,chromosome_iit);
8768     FPRINTF(fp," %s:%u%s%u",chr,chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
8769     FREE(chr);
8770   }
8771 
8772   if (chrpos1 <= chrpos2) {
8773     FPRINTF(fp," +");
8774   } else {
8775     FPRINTF(fp," -");
8776   }
8777 
8778   if (cdna_direction > 0) {
8779     FPRINTF(fp," dir:sense");
8780   } else if (cdna_direction < 0) {
8781     FPRINTF(fp," dir:antisense");
8782   } else {
8783     FPRINTF(fp," dir:indet");
8784   }
8785 
8786   if (checksump == true) {
8787     FPRINTF(fp," md5:");
8788     Sequence_print_digest(fp,queryseq);
8789   }
8790 
  /* Chimera field: a single position (with splice probabilities and a
     direction marker when both probabilities are positive) or a range
     when chimeraequivpos differs from chimerapos */
8791   if (chimerapos >= 0) {
8792     if (chimeraequivpos == chimerapos) {
8793       if (donor_prob > 0.0 && acceptor_prob > 0.0) {
8794 	if (chimera_cdna_direction >= 0) {
8795 	  FPRINTF(fp," chimera:%d(>)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
8796 	} else {
8797 	  FPRINTF(fp," chimera:%d(<)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
8798 	}
8799       } else {
8800 	FPRINTF(fp," chimera:%d",chimerapos + ONEBASEDP);
8801       }
8802     } else {
8803       FPRINTF(fp," chimera:%d..%d",chimerapos + ONEBASEDP,chimeraequivpos + ONEBASEDP);
8804     }
8805   }
8806 
8807   if (strain != NULL) {
8808     FPRINTF(fp," strain:%s",strain);
8809   }
8810 
8811   PUTC('\n',fp);
8812 
  /* Main pass: one output line per exon, emitted piecewise as gaps and
     exon starts are encountered */
8813   for (i = 0; i < npairs; i++) {
8814     /* prev = this; */
8815     this = ptr++;
8816 
8817     if (this->gapp) {
8818       if (in_exon == true) {
8819 	/* Beginning of gap */
8820 	exon_queryend = last_querypos + ONEBASEDP;
8821 	exon_genomeend = last_genomepos + ONEBASEDP;
	/* Intron begins one genomic position beyond the exon end, in the direction of the strand */
8822 	if (watsonp) {
8823 	  intron_start = exon_genomeend + 1;
8824 	} else {
8825 	  intron_start = exon_genomeend - 1;
8826 	}
8827 
8828 	FPRINTF(fp,"\t%u %u",exon_genomestart,exon_genomeend);
8829 	FPRINTF(fp," %d %d",exon_querystart,exon_queryend);
8830 	if (den == 0) {
8831 	  FPRINTF(fp," 100");
8832 	} else {
8833 	  FPRINTF(fp," %d",(int) floor(100.0*(double) num/(double) den));
8834 	}
	/* print_dinucleotide_p: 1 = print as read, -1 = invert first
	   (reverse intron types), 0 = do not print */
8835 	print_dinucleotide_p = 1;
8836 	if (this->comp == FWD_CANONICAL_INTRON_COMP) {
8837 	  sprintf(token,"%d>",runlength);
8838 	} else if (this->comp == REV_CANONICAL_INTRON_COMP) {
8839 	  sprintf(token,"%d<",runlength);
8840 	  print_dinucleotide_p = -1;
8841 	} else if (this->comp == NONINTRON_COMP) {
8842 	  sprintf(token,"%d=",runlength);
8843 	} else if (this->comp == FWD_GCAG_INTRON_COMP) {
8844 	  sprintf(token,"%d)",runlength);
8845 	} else if (this->comp == REV_GCAG_INTRON_COMP) {
8846 	  sprintf(token,"%d(",runlength);
8847 	  print_dinucleotide_p = -1;
8848 	} else if (this->comp == FWD_ATAC_INTRON_COMP) {
8849 	  sprintf(token,"%d]",runlength);
8850 	} else if (this->comp == REV_ATAC_INTRON_COMP) {
8851 	  sprintf(token,"%d[",runlength);
8852 	  print_dinucleotide_p = -1;
8853 	} else if (this->comp == DUALBREAK_COMP) {
8854 	  sprintf(token,"%d#",runlength);
8855 	  print_dinucleotide_p = 0;
8856 	} else if (this->comp == EXTRAEXON_COMP) {
8857 	  sprintf(token,"%d#",runlength);
8858 	  print_dinucleotide_p = 0;
8859 	} else {
8860 	  fprintf(stderr,"Can't parse comp '%c' in compression for %s\n",
8861 		  this->comp,Sequence_accession(queryseq));
8862 	  abort();
8863 	}
	/* Tokens were pushed in reverse order; restore order, print, and release */
8864 	tokens = push_token(tokens,token);
8865 	tokens = List_reverse(tokens);
8866 	print_tokens_compressed(fp,tokens);
8867 	List_free_out(&tokens);
8868 	FPRINTF(fp,"\t%d",exon_queryend - exon_querystart + 1);
8869 
8870 	runlength = 0;
	/* First gap pair supplies the first donor character */
8871 	donor[0] = this->genome;
8872 	donor[1] = '\0';
8873 	in_exon = false;
8874       } else if (donor[1] == '\0') {
	/* Second gap pair supplies the second donor character */
8875 	donor[1] = this->genome;
8876       } else {
	/* Later gap pairs slide through acceptor[], so it ends up holding
	   the last two genomic characters of the gap */
8877 	acceptor[0] = acceptor[1];
8878 	acceptor[1] = this->genome;
8879       }
8880     } else if (this->comp == INTRONGAP_COMP) {
8881       /* Do nothing */
8882     } else {
8883       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
8884 	 SHORTGAP_COMP, or MISMATCH_COMP */
8885       if (in_exon == false) {
8886 	exon_querystart = this->querypos + ONEBASEDP;
8887 	exon_genomestart = this->genomepos + ONEBASEDP;
8888 	if (watsonp) {
8889 	  intron_end = exon_genomestart - 1;
8890 	} else {
8891 	  intron_end = exon_genomestart + 1;
8892 	}
	/* NOTE(review): for i > 0 this reads intron_start and
	   print_dinucleotide_p, which are only assigned once a gap has
	   followed an exon; confirm the array can never begin with gap or
	   INTRONGAP_COMP pairs. */
8893 	if (i > 0) {
8894 	  if (intron_end > intron_start) {
8895 	    FPRINTF(fp,"\t%d",intron_end - intron_start + 1);
8896 	  } else {
8897 	    FPRINTF(fp,"\t%d",intron_start - intron_end + 1);
8898 	  }
8899 	  if (print_dinucleotide_p == -1) {
8900 	    invert_intron(donor,acceptor);
8901 	  }
8902 	  if (print_dinucleotide_p != 0) {
	    /* GT-AG dinucleotides are not printed; anything else is shown in upper case */
8903 	    if ((donor[0] == 'G' || donor[0] == 'g') &&
8904 		(donor[1] == 'T' || donor[1] == 't') &&
8905 		(acceptor[0] == 'A' || acceptor[0] == 'a') &&
8906 		(acceptor[1] == 'G' || acceptor[1] == 'g')) {
8907 	      /* Do nothing */
8908 	    } else {
8909 	      FPRINTF(fp,"\t%c%c-%c%c",toupper(donor[0]),toupper(donor[1]),toupper(acceptor[0]),toupper(acceptor[1]));
8910 	    }
8911 	  }
8912 #if 0
8913 	  if (exon_querystart > exon_queryend + 1) {
8914 	    FPRINTF(fp,"***");
8915 	  }
8916 #endif
8917 	  PUTC('\n',fp);
8918 	}
8919 
	/* Identity counters restart for each exon */
8920 	num = den = 0;
8921 	in_exon = true;
8922       }
8923       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
8924 	/* Gap in upper or lower sequence */
8925 	if (this->genome == ' ') {
8926 	  sprintf(token,"%d^%c",runlength,this->cdna);
8927 	} else if (this->cdna == ' ') {
8928 	  sprintf(token,"%dv",runlength);
8929 	} else {
8930 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
8931 	  exit(9);
8932 	}
8933 	tokens = push_token(tokens,token);
8934 	runlength = 0;
8935 	/* Don't increment den */
8936 
8937       } else if (this->comp == MISMATCH_COMP) {
8938 	sprintf(token,"%dx%c",runlength,this->cdna);
8939 	tokens = push_token(tokens,token);
8940 	runlength = 0;
8941 	den++;
8942 
8943 #ifndef PMAP
8944       } else if (this->comp == AMBIGUOUS_COMP) {
	/* Ambiguous positions get their own token but count as identical */
8945 	sprintf(token,"%d:%c",runlength,this->cdna);
8946 	tokens = push_token(tokens,token);
8947 	runlength = 0;
8948 	den++;
8949 	num++;
8950 #endif
8951 
8952       } else {
	/* Extends the current run; no token is emitted until the run is broken */
8953 	runlength++;
8954 	den++;
8955 	if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP) {
8956 	  /* AMBIGUOUS_COMP handled above */
8957 	  num++;
8958 	}
8959       }
8960     }
8961 
    /* Track the most recent positions that actually carry a character */
8962     if (this->cdna != ' ') {
8963       last_querypos = this->querypos;
8964     }
8965     if (this->genome != ' ') {
8966       last_genomepos = this->genomepos;
8967     }
8968   }
8969 
  /* Flush the final exon, which has no gap after it */
8970   /* prev = this; */
8971   exon_queryend = last_querypos + ONEBASEDP;
8972   exon_genomeend = last_genomepos + ONEBASEDP;
8973 
  /* NOTE(review): the per-exon print inside the loop uses %u for these
     two Chrpos_T values, while this one uses %d. */
8974   FPRINTF(fp,"\t%d %d",exon_genomestart,exon_genomeend);
8975   FPRINTF(fp," %d %d",exon_querystart,exon_queryend);
8976   if (den == 0) {
8977     FPRINTF(fp," 100");
8978   } else {
8979     FPRINTF(fp," %d",(int) floor(100.0*(double) num/(double) den));
8980   }
8981 
  /* The last token of the alignment ends in '*' */
8982   sprintf(token,"%d*",runlength);
8983   tokens = push_token(tokens,token);
8984   tokens = List_reverse(tokens);
8985   print_tokens_compressed(fp,tokens);
8986   List_free_out(&tokens);
8987 
8988   FPRINTF(fp,"\t%d",exon_queryend - exon_querystart + 1);
8989   PUTC('\n',fp);
8990 
8991   return;
8992 }
8993 #endif
8994 
8995 #if 0
/* Prints an alignment in a compressed, exon-by-exon form: one header
   line beginning with '>' and then one line per exon giving the exon's
   genomic start and end, its query start and end (all 1-based), and
   the cDNA characters of the pairs in that exon (pairs whose cdna is
   ' ' are omitted).

   The header reports, in order: the query accession; dbversion, else
   the usersegment accession, else "user-provided"; pathnum/npaths; the
   query length; nexons; a coverage value and an identity value; the
   query range of pairs[0]..pairs[npairs-1]; the same range in genomic
   coordinates offset by chroffset; the chromosomal range (prefixed by
   the chromosome name when chrnum != 0); '+' or '-'; the cDNA
   direction; and optional md5, chimera, and strain fields.

   NOTE(review): this function is compiled out by the surrounding
   #if 0.  The watsonp parameter is not referenced in the body. */
void
Pair_print_compressed_byexons (Filestring_T fp, int pathnum, int npaths, T start, T end, Sequence_T queryseq, char *dbversion,
			       Sequence_T usersegment, int nexons, double fracidentity,
			       struct T *pairs, int npairs, Chrnum_T chrnum,
			       Univcoord_T chroffset, Univ_IIT_T chromosome_iit, int querylength_given,
			       int skiplength, int trim_start, int trim_end, bool checksump,
			       int chimerapos, int chimeraequivpos, double donor_prob, double acceptor_prob,
			       int chimera_cdna_direction, char *strain, bool watsonp, int cdna_direction) {
  Chrpos_T chrpos1, chrpos2;
  Univcoord_T position1, position2;

  bool in_exon = false;
  struct T *ptr = pairs, *this = NULL;
  int querypos1, querypos2;
  int exon_querystart = -1, exon_queryend;
  int exon_pairi_start, exon_pairi_end;
  Chrpos_T exon_genomestart = 0, exon_genomeend;
  int i, k;
  char *chr, c;
  double coverage;
  /* double trimmed_coverage; */
  int last_querypos = -1;
  Chrpos_T last_genomepos = (Chrpos_T) -1;

  /* Coverage is computed from the start and end pairs passed in by the caller */
  querypos1 = start->querypos;
  querypos2 = end->querypos;

  FPRINTF(fp,">%s ",Sequence_accession(queryseq));
  if (dbversion != NULL) {
    FPRINTF(fp,"%s ",dbversion);
  } else if (usersegment != NULL && Sequence_accession(usersegment) != NULL) {
    FPRINTF(fp,"%s ",Sequence_accession(usersegment));
  } else {
    FPRINTF(fp,"user-provided ");
  }
#ifdef PMAP
  FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,(querylength_given+skiplength)*3,nexons);
  coverage = (double) (querypos2 - querypos1 + 1)/(double) ((querylength_given+skiplength)*3);
  /* NOTE(review): unlike the non-PMAP branch below, this value is not
     divided by 10.0 -- confirm whether that is intended */
  FPRINTF(fp," %.1f",((double) rint(1000.0*coverage)));
#else
  coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given+skiplength);
  /* trim_end and trim_start are widened here, but the only code that
     reads them afterwards is the commented-out report below */
  if (end->querypos + 1 > trim_end) {
    trim_end = end->querypos + 1;
  }
  if (start->querypos < trim_start) {
    trim_start = start->querypos;
  }
  /*
  trimmed_coverage = (double) (end->querypos - start->querypos + 1)/(double) (trim_end - trim_start + skiplength);
  FPRINTF(fp,">%s %s %d/%d %d(%d) %d",
	 Sequence_accession(queryseq),dbversion,pathnum,npaths,
	 querylength_given+skiplength,trim_end-trim_start,nexons);
  FPRINTF(fp," %.1f(%.1f)",((double) rint(1000.0*coverage))/10.0,((double) rint(1000.0*trimmed_coverage))/10.0);
  */
  FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,querylength_given+skiplength,nexons);
  FPRINTF(fp," %.1f",((double) rint(1000.0*coverage))/10.0);
#endif
  FPRINTF(fp," %.1f",((double) rint(1000.0*fracidentity))/10.0);

  /* From here on, start and end refer to the first and last elements
     of pairs rather than to the pairs passed in by the caller */
  start = &(pairs[0]);
  end = &(pairs[npairs-1]);
  FPRINTF(fp," %d%s%d",start->querypos + ONEBASEDP,"..",end->querypos + ONEBASEDP);

  chrpos1 = start->genomepos;
  chrpos2 = end->genomepos;
  position1 = chroffset + chrpos1;
  position2 = chroffset + chrpos2;
  FPRINTF(fp," %u%s%u",position1 + ONEBASEDP,"..",position2 + ONEBASEDP);

  if (chrnum == 0) {
    FPRINTF(fp," %u%s%u",chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
  } else {
    chr = Chrnum_to_string(chrnum,chromosome_iit);
    FPRINTF(fp," %s:%u%s%u",chr,chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
    FREE(chr);
  }

  /* Strand sign is derived from the order of the two genomic positions */
  if (chrpos1 <= chrpos2) {
    FPRINTF(fp," +");
  } else {
    FPRINTF(fp," -");
  }

  if (cdna_direction > 0) {
    FPRINTF(fp," dir:sense");
  } else if (cdna_direction < 0) {
    FPRINTF(fp," dir:antisense");
  } else {
    FPRINTF(fp," dir:indet");
  }

  if (checksump == true) {
    FPRINTF(fp," md5:");
    Sequence_print_digest(fp,queryseq);
  }

  if (chimerapos >= 0) {
    if (chimeraequivpos == chimerapos) {
      if (donor_prob > 0.0 && acceptor_prob > 0.0) {
	if (chimera_cdna_direction >= 0) {
	  FPRINTF(fp," chimera:%d(>)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
	} else {
	  FPRINTF(fp," chimera:%d(<)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
	}
      } else {
	FPRINTF(fp," chimera:%d",chimerapos + ONEBASEDP);
      }
    } else {
      FPRINTF(fp," chimera:%d..%d",chimerapos + ONEBASEDP,chimeraequivpos + ONEBASEDP);
    }
  }

  if (strain != NULL) {
    FPRINTF(fp," strain:%s",strain);
  }

  PUTC('\n',fp);

  /* Walk the pairs, emitting one line per exon.  An exon is closed by
     the first gap pair that follows it. */
  exon_pairi_start = 0;
  for (i = 0; i < npairs; i++) {
    /* prev = this; */
    this = ptr++;

    if (this->gapp) {
      if (in_exon == true) {
	/* Beginning of gap */
	exon_queryend = last_querypos + ONEBASEDP;
	exon_genomeend = last_genomepos + ONEBASEDP;
	exon_pairi_end = i;

	FPRINTF(fp,"\t%u %u",exon_genomestart,exon_genomeend);
	FPRINTF(fp," %d %d",exon_querystart,exon_queryend);
	PUTC('\t',fp);
	for (k = exon_pairi_start; k < exon_pairi_end; k++) {
	  if ((c = pairs[k].cdna) != ' ') {
	    PUTC(c,fp);
	  }
	}

	in_exon = false;
      }

    } else if (this->comp == INTRONGAP_COMP) {
      /* Do nothing */

    } else {
      /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
	 SHORTGAP_COMP, or MISMATCH_COMP */
      if (in_exon == false) {
	exon_querystart = this->querypos + ONEBASEDP;
	exon_genomestart = this->genomepos + ONEBASEDP;
	exon_pairi_start = i;
	/* Terminates the line of the previous exon, which was left open above */
	if (i > 0) {
	  PUTC('\n',fp);
	}

	in_exon = true;
      }
    }

    /* Track the most recent positions that carry an actual character */
    if (this->cdna != ' ') {
      last_querypos = this->querypos;
    }
    if (this->genome != ' ') {
      last_genomepos = this->genomepos;
    }
  }

  /* Emit the final exon */
  /* prev = this; */
  exon_queryend = last_querypos + ONEBASEDP;
  exon_genomeend = last_genomepos + ONEBASEDP;
  exon_pairi_end = i;

  /* NOTE(review): this uses %d for the Chrpos_T values, whereas the
     per-exon print inside the loop uses %u */
  FPRINTF(fp,"\t%d %d",exon_genomestart,exon_genomeend);
  FPRINTF(fp," %d %d",exon_querystart,exon_queryend);
  PUTC('\t',fp);
  for (k = exon_pairi_start; k < exon_pairi_end; k++) {
    if ((c = pairs[k].cdna) != ' ') {
      PUTC(c,fp);
    }
  }
  PUTC('\n',fp);

  return;
}
9181 #endif
9182 
9183 
9184 void
Pair_print_compressed(Filestring_T fp,int pathnum,int npaths,T start,T end,Sequence_T queryseq,char * dbversion,Sequence_T usersegment,int nexons,double fracidentity,struct T * pairs,int npairs,Chrnum_T chrnum,Univcoord_T chroffset,Univ_IIT_T chromosome_iit,int querylength_given,int skiplength,int trim_start,int trim_end,bool checksump,int chimerapos,int chimeraequivpos,double donor_prob,double acceptor_prob,int chimera_cdna_direction,char * strain,int cdna_direction)9185 Pair_print_compressed (Filestring_T fp, int pathnum, int npaths, T start, T end, Sequence_T queryseq, char *dbversion,
9186 		       Sequence_T usersegment, int nexons, double fracidentity,
9187 		       struct T *pairs, int npairs, Chrnum_T chrnum,
9188 		       Univcoord_T chroffset, Univ_IIT_T chromosome_iit, int querylength_given,
9189 		       int skiplength, int trim_start, int trim_end, bool checksump,
9190 		       int chimerapos, int chimeraequivpos, double donor_prob, double acceptor_prob,
9191 		       int chimera_cdna_direction, char *strain, int cdna_direction) {
9192   Chrpos_T chrpos1, chrpos2;
9193   Univcoord_T position1, position2;
9194 
9195   struct T *ptr = pairs, *this = NULL;
9196   int querypos1, querypos2;
9197   int i;
9198   char *chr;
9199   double coverage;
9200   /* double trimmed_coverage; */
9201   /* int last_querypos = -1; */
9202   /* Chrpos_T last_genomepos = (Chrpos_T) -1; */
9203 
9204   querypos1 = start->querypos;
9205   querypos2 = end->querypos;
9206 
9207   FPRINTF(fp,">%s ",Sequence_accession(queryseq));
9208   if (dbversion != NULL) {
9209     FPRINTF(fp,"%s ",dbversion);
9210   } else if (usersegment != NULL && Sequence_accession(usersegment) != NULL) {
9211     FPRINTF(fp,"%s ",Sequence_accession(usersegment));
9212   } else {
9213     FPRINTF(fp,"user-provided ");
9214   }
9215 #ifdef PMAP
9216   FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,(querylength_given+skiplength)*3,nexons);
9217   coverage = (double) (querypos2 - querypos1 + 1)/(double) ((querylength_given+skiplength)*3);
9218   FPRINTF(fp," %.1f",((double) rint(1000.0*coverage)));
9219 #else
9220   coverage = (double) (querypos2 - querypos1 + 1)/(double) (querylength_given+skiplength);
9221   if (end->querypos + 1 > trim_end) {
9222     trim_end = end->querypos + 1;
9223   }
9224   if (start->querypos < trim_start) {
9225     trim_start = start->querypos;
9226   }
9227   /*
9228   trimmed_coverage = (double) (end->querypos - start->querypos + 1)/(double) (trim_end - trim_start + skiplength);
9229   FPRINTF(fp,">%s %s %d/%d %d(%d) %d",
9230 	 Sequence_accession(queryseq),dbversion,pathnum,npaths,
9231 	 querylength_given+skiplength,trim_end-trim_start,nexons);
9232   FPRINTF(fp," %.1f(%.1f)",((double) rint(1000.0*coverage))/10.0,((double) rint(1000.0*trimmed_coverage))/10.0);
9233   */
9234   FPRINTF(fp,"%d/%d %d %d",pathnum,npaths,querylength_given+skiplength,nexons);
9235   FPRINTF(fp," %.1f",((double) rint(1000.0*coverage))/10.0);
9236 #endif
9237   FPRINTF(fp," %.1f",((double) rint(1000.0*fracidentity))/10.0);
9238 
9239   start = &(pairs[0]);
9240   end = &(pairs[npairs-1]);
9241   FPRINTF(fp," %d%s%d",start->querypos + ONEBASEDP,"..",end->querypos + ONEBASEDP);
9242 
9243   chrpos1 = start->genomepos;
9244   chrpos2 = end->genomepos;
9245   position1 = chroffset + chrpos1;
9246   position2 = chroffset + chrpos2;
9247   FPRINTF(fp," %u%s%u",position1 + ONEBASEDP,"..",position2 + ONEBASEDP);
9248 
9249   if (chrnum == 0) {
9250     FPRINTF(fp," %u%s%u",chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
9251   } else {
9252     chr = Chrnum_to_string(chrnum,chromosome_iit);
9253     FPRINTF(fp," %s:%u%s%u",chr,chrpos1 + ONEBASEDP,"..",chrpos2 + ONEBASEDP);
9254     FREE(chr);
9255   }
9256 
9257   if (chrpos1 <= chrpos2) {
9258     FPRINTF(fp," +");
9259   } else {
9260     FPRINTF(fp," -");
9261   }
9262 
9263   if (cdna_direction > 0) {
9264     FPRINTF(fp," dir:sense");
9265   } else if (cdna_direction < 0) {
9266     FPRINTF(fp," dir:antisense");
9267   } else {
9268     FPRINTF(fp," dir:indet");
9269   }
9270 
9271   if (checksump == true) {
9272     FPRINTF(fp," md5:");
9273     Sequence_print_digest(fp,queryseq);
9274   }
9275 
9276   if (chimerapos >= 0) {
9277     if (chimeraequivpos == chimerapos) {
9278       if (donor_prob > 0.0 && acceptor_prob > 0.0) {
9279 	if (chimera_cdna_direction >= 0) {
9280 	  FPRINTF(fp," chimera:%d(>)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
9281 	} else {
9282 	  FPRINTF(fp," chimera:%d(<)/%.3f/%.3f",chimerapos + ONEBASEDP,donor_prob,acceptor_prob);
9283 	}
9284       } else {
9285 	FPRINTF(fp," chimera:%d",chimerapos + ONEBASEDP);
9286       }
9287     } else {
9288       FPRINTF(fp," chimera:%d..%d",chimerapos + ONEBASEDP,chimeraequivpos + ONEBASEDP);
9289     }
9290   }
9291 
9292   if (strain != NULL) {
9293     FPRINTF(fp," strain:%s",strain);
9294   }
9295 
9296   PUTC('\n',fp);
9297 
9298   for (i = 0; i < npairs; i++) {
9299     /* prev = this; */
9300     this = ptr++;
9301     if (this->cdna != ' ') {
9302       PUTC(this->cdna,fp);
9303     }
9304   }
9305 
9306   PUTC('\n',fp);
9307 
9308   return;
9309 }
9310 
9311 
9312 void
Pair_print_iit_map(Filestring_T fp,Sequence_T queryseq,char * accession,T start,T end,Chrnum_T chrnum,Univ_IIT_T chromosome_iit)9313 Pair_print_iit_map (Filestring_T fp, Sequence_T queryseq, char *accession,
9314 		    T start, T end, Chrnum_T chrnum, Univ_IIT_T chromosome_iit) {
9315   char *chrstring = NULL;
9316   Chrpos_T chrpos1, chrpos2;
9317 
9318   if (chrnum == 0) {
9319     chrstring = "";
9320   } else {
9321     chrstring = Chrnum_to_string(chrnum,chromosome_iit);
9322   }
9323 
9324   /* Made identical to code for Pair_print_iit_exon_map */
9325   chrpos1 = start->genomepos + ONEBASEDP;
9326   chrpos2 = end->genomepos + ONEBASEDP;
9327   FPRINTF(fp,">%s %s:%u..%u\n",accession,chrstring,chrpos1,chrpos2);
9328   Sequence_print_header(fp,queryseq,/*checksump*/false);
9329 
9330   if (chrnum != 0) {
9331     FREE(chrstring);
9332   }
9333 
9334   return;
9335 }
9336 
9337 
9338 void
Pair_print_iit_exon_map(Filestring_T fp,struct T * pairs,int npairs,Sequence_T queryseq,char * accession,T start,T end,Chrnum_T chrnum,Univ_IIT_T chromosome_iit)9339 Pair_print_iit_exon_map (Filestring_T fp, struct T *pairs, int npairs, Sequence_T queryseq, char *accession,
9340 			 T start, T end, Chrnum_T chrnum, Univ_IIT_T chromosome_iit) {
9341   int i;
9342   bool in_exon = false;
9343   struct T *ptr = pairs, *this = NULL;
9344   Chrpos_T exon_genomestart = 0, exon_genomeend;
9345   char *chrstring = NULL;
9346   Chrpos_T chrpos1, chrpos2;
9347   Chrpos_T last_genomepos = (Chrpos_T) -1;
9348 
9349   if (chrnum == 0) {
9350     chrstring = "";
9351   } else {
9352     chrstring = Chrnum_to_string(chrnum,chromosome_iit);
9353   }
9354 
9355   chrpos1 = start->genomepos + ONEBASEDP;
9356   chrpos2 = end->genomepos + ONEBASEDP;
9357   FPRINTF(fp,">%s %s:%u..%u\n",accession,chrstring,chrpos1,chrpos2);
9358   Sequence_print_header(fp,queryseq,/*checksump*/false);
9359 
9360   for (i = 0; i < npairs; i++) {
9361     /* prev = this; */
9362     this = ptr++;
9363 
9364     if (this->gapp) {
9365       if (in_exon == true) {
9366 	/* Beginning of gap */
9367 	exon_genomeend = last_genomepos + ONEBASEDP;
9368 	FPRINTF(fp,"%u %u\n",exon_genomestart,exon_genomeend);
9369 	in_exon = false;
9370       }
9371     } else if (this->comp == INTRONGAP_COMP) {
9372       /* Do nothing */
9373     } else {
9374       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
9375 	 SHORTGAP_COMP, or MISMATCH_COMP */
9376       if (in_exon == false) {
9377 	exon_genomestart = this->genomepos + ONEBASEDP;
9378 	in_exon = true;
9379       }
9380     }
9381     if (this->genome != ' ') {
9382       last_genomepos = this->genomepos;
9383     }
9384   }
9385 
9386   /* prev = this; */
9387   exon_genomeend = last_genomepos + ONEBASEDP;
9388 
9389   FPRINTF(fp,"%u %u\n",exon_genomestart,exon_genomeend);
9390 
9391   if (chrnum != 0) {
9392     FREE(chrstring);
9393   }
9394 
9395   return;
9396 }
9397 
9398 
9399 void
Pair_print_splicesites(Filestring_T fp,struct T * pairs,int npairs,char * accession,int nexons,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,bool watsonp)9400 Pair_print_splicesites (Filestring_T fp, struct T *pairs, int npairs, char *accession,
9401 			int nexons, Chrnum_T chrnum, Univ_IIT_T chromosome_iit, bool watsonp) {
9402   int exoni = 0, i;
9403   bool in_exon = false;
9404   struct T *ptr = pairs, *this = NULL;
9405   Chrpos_T exon_genomestart = 0, exon_genomeend;
9406   char *chrstring = NULL;
9407   Chrpos_T last_genomepos = (Chrpos_T) -1, intron_length;
9408 
9409   if (chrnum == 0) {
9410     chrstring = "";
9411   } else {
9412     chrstring = Chrnum_to_string(chrnum,chromosome_iit);
9413   }
9414 
9415   for (i = 0; i < npairs; i++) {
9416     /* prev = this; */
9417     this = ptr++;
9418 
9419     if (this->gapp) {
9420       if (in_exon == true) {
9421 	/* Beginning of gap */
9422 	exon_genomeend = last_genomepos + ONEBASEDP;
9423 	if (watsonp) {
9424 	  FPRINTF(fp,">%s.exon%d/%d %s:%u..%u donor",accession,exoni,nexons,chrstring,exon_genomeend,exon_genomeend+1);
9425 	} else {
9426 	  FPRINTF(fp,">%s.exon%d/%d %s:%u..%u donor",accession,exoni,nexons,chrstring,exon_genomeend,exon_genomeend-1);
9427 	}
9428 	in_exon = false;
9429       }
9430     } else if (this->comp == INTRONGAP_COMP) {
9431       /* Do nothing */
9432     } else {
9433       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
9434 	 SHORTGAP_COMP, or MISMATCH_COMP */
9435       if (in_exon == false) {
9436 	exoni++;
9437 	if (exoni > 1) {
9438 	  exon_genomestart = this->genomepos + ONEBASEDP;
9439 	  if (watsonp) {
9440 	    intron_length = exon_genomestart - exon_genomeend - 1;
9441 	    FPRINTF(fp," %u\n",intron_length); /* For previous donor */
9442 	    FPRINTF(fp,">%s.exon%d/%d %s:%u..%u acceptor",accession,exoni,nexons,chrstring,exon_genomestart-1,exon_genomestart);
9443 	    FPRINTF(fp," %u\n",intron_length);
9444 	  } else {
9445 	    intron_length = exon_genomeend - exon_genomestart - 1;
9446 	    FPRINTF(fp," %u\n",intron_length); /* For previous donor */
9447 	    FPRINTF(fp,">%s.exon%d/%d %s:%u..%u acceptor",accession,exoni,nexons,chrstring,exon_genomestart+1,exon_genomestart);
9448 	    FPRINTF(fp," %u\n",intron_length);
9449 	  }
9450 	}
9451 
9452 	in_exon = true;
9453       }
9454     }
9455     if (this->genome != ' ') {
9456       last_genomepos = this->genomepos;
9457     }
9458   }
9459 
9460   if (chrnum != 0) {
9461     FREE(chrstring);
9462   }
9463 
9464   return;
9465 }
9466 
9467 
9468 void
Pair_print_introns(Filestring_T fp,struct T * pairs,int npairs,char * accession,int nexons,Chrnum_T chrnum,Univ_IIT_T chromosome_iit)9469 Pair_print_introns (Filestring_T fp, struct T *pairs, int npairs, char *accession,
9470 		    int nexons, Chrnum_T chrnum, Univ_IIT_T chromosome_iit) {
9471   int exoni = 0, i;
9472   bool in_exon = false;
9473   struct T *ptr = pairs, *this = NULL;
9474   Chrpos_T exon_genomestart = 0, exon_genomeend;
9475   char *chrstring = NULL;
9476   Chrpos_T last_genomepos = (Chrpos_T) -1;
9477 
9478   if (chrnum == 0) {
9479     chrstring = "";
9480   } else {
9481     chrstring = Chrnum_to_string(chrnum,chromosome_iit);
9482   }
9483 
9484   for (i = 0; i < npairs; i++) {
9485     /* prev = this; */
9486     this = ptr++;
9487 
9488     if (this->gapp) {
9489       if (in_exon == true) {
9490 	/* Beginning of gap */
9491 	exon_genomeend = last_genomepos + ONEBASEDP;
9492 	in_exon = false;
9493       }
9494     } else if (this->comp == INTRONGAP_COMP) {
9495       /* Do nothing */
9496     } else {
9497       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
9498 	 SHORTGAP_COMP, or MISMATCH_COMP */
9499       if (in_exon == false) {
9500 	exoni++;
9501 	if (exoni > 1) {
9502 	  exon_genomestart = this->genomepos + ONEBASEDP;
9503 	  FPRINTF(fp,">%s.intron%d/%d %s:%u..%u\n",accession,exoni-1,nexons-1,chrstring,exon_genomeend,exon_genomestart);
9504 	}
9505 
9506 	in_exon = true;
9507       }
9508     }
9509     if (this->genome != ' ') {
9510       last_genomepos = this->genomepos;
9511     }
9512   }
9513 
9514   if (chrnum != 0) {
9515     FREE(chrstring);
9516   }
9517 
9518   return;
9519 }
9520 
9521 
9522 static int
print_Ns(Filestring_T fp,int column,int n,int wraplength)9523 print_Ns (Filestring_T fp, int column, int n, int wraplength) {
9524   int i;
9525 
9526   for (i = 0; i < n; i++) {
9527     PUTC('N',fp);
9528     if (++column % wraplength == 0) {
9529       PUTC('\n',fp);
9530       column = 0;
9531     }
9532   }
9533 
9534   return column;
9535 }
9536 
9537 
9538 void
Pair_print_mask_introns(Filestring_T fp,struct T * pairs,int npairs,Chrpos_T chrlength,int wraplength,bool include_utr_p)9539 Pair_print_mask_introns (Filestring_T fp, struct T *pairs, int npairs,
9540 			 Chrpos_T chrlength, int wraplength, bool include_utr_p) {
9541   int exoni = 0, column = 0, i;
9542   bool in_exon = false;
9543   struct T *ptr = pairs, *this = NULL;
9544   Chrpos_T exon_genomestart = 0, exon_genomeend;
9545   Chrpos_T last_genomepos = (Chrpos_T) -1;
9546 
9547   assert(pairs != NULL);
9548   if (include_utr_p == true) {
9549     column = print_Ns(fp,column,pairs->genomepos,wraplength);
9550   }
9551 
9552   for (i = 0; i < npairs; i++) {
9553     /* prev = this; */
9554     this = ptr++;
9555 
9556     if (this->gapp) {
9557       if (in_exon == true) {
9558 	/* Beginning of gap */
9559 	exon_genomeend = last_genomepos + ONEBASEDP;
9560 	in_exon = false;
9561       }
9562     } else if (this->comp == INTRONGAP_COMP) {
9563       /* Do nothing */
9564     } else {
9565       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
9566 	 SHORTGAP_COMP, or MISMATCH_COMP */
9567       if (in_exon == false) {
9568 	exoni++;
9569 	if (exoni > 1) {
9570 	  exon_genomestart = this->genomepos + ONEBASEDP;
9571 	  column = print_Ns(fp,column,exon_genomestart - exon_genomeend - 1,wraplength);
9572 	}
9573 
9574 	in_exon = true;
9575       }
9576       if (this->genome != ' ') {
9577 	PUTC(this->genome,fp);
9578 	if (++column % wraplength == 0) {
9579 	  PUTC('\n',fp);
9580 	  column = 0;
9581 	}
9582       }
9583     }
9584 
9585     if (this->genome != ' ') {
9586       last_genomepos = this->genomepos;
9587     }
9588   }
9589 
9590   if (include_utr_p == true) {
9591     column = print_Ns(fp,column,chrlength - last_genomepos - 1,wraplength);
9592   }
9593 
9594   if (column != 0) {
9595     PUTC('\n',fp);
9596   }
9597 
9598   return;
9599 }
9600 
9601 
9602 #if 0
/* goal_start < goal_end */
/* Binary search over pairarray between lowi and highi for a pair whose
   genomepos lies within goal_start..goal_end.  On success, stores that
   pair's querypos in *querypos and returns its genomepos; otherwise
   returns 0U (and leaves *querypos untouched).  The comparisons assume
   that genomepos increases with the array index.

   NOTE(review): compiled out by the surrounding #if 0; debug10 is not
   defined in the part of the file visible here. */
Chrpos_T
Pair_binary_search_ascending (int *querypos, int lowi, int highi, struct T *pairarray,
			      Chrpos_T goal_start, Chrpos_T goal_end) {
  int middlei;

  debug10(printf("entered binary search_ascending with lowi=%d, highi=%d, goal=%u..%u\n",
		 lowi,highi,goal_start,goal_end));

  while (lowi < highi) {
    /* Pick the midpoint, then slide off any pair whose cdna is blank:
       first forward, and if that runs into highi, backward instead */
    middlei = lowi + ((highi - lowi) / 2);
    while (middlei < highi && pairarray[middlei].cdna == ' ') {
      /* Go forward past pairs corresponding to gaps */
      middlei++;
    }
    if (middlei >= highi) {
      middlei = lowi + ((highi - lowi) / 2);
      while (middlei >= lowi && pairarray[middlei].cdna == ' ') {
	/* Go backward past pairs corresponding to gaps */
	middlei--;
      }
      if (middlei < lowi) {
	debug10(printf("all intermediate pairs are gaps\n"));
#if 0
	*querypos = pairarray[lowi].querypos;
	return pairarray[lowi].genomepos;
#else
	return 0U;
#endif
      }
    }

    debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u..%u\n",
		   lowi,pairarray[lowi].genomepos,middlei,pairarray[middlei].genomepos,
		   highi,pairarray[highi].genomepos,goal_start,goal_end));
    if (goal_end < pairarray[middlei].genomepos) {
      /* Goal lies entirely below the probe */
      highi = middlei;
    } else if (goal_start > pairarray[middlei].genomepos) {
      /* Goal lies entirely above the probe */
      lowi = middlei + 1;
    } else {
      debug10(printf("binary search returns %d\n",middlei));
      *querypos = pairarray[middlei].querypos;
      return pairarray[middlei].genomepos;
    }
  }

  debug10(printf("binary search returns %d\n",highi));
  return 0U;
}
9652 #endif
9653 
9654 #if 0
/* goal_start > goal_end */
/* Binary search over pairarray between lowi and highi for a pair whose
   genomepos lies within goal_end..goal_start.  On success, stores that
   pair's querypos in *querypos and returns its genomepos; otherwise
   returns 0U (and leaves *querypos untouched).  The comparisons assume
   that genomepos decreases as the array index increases.

   NOTE(review): compiled out by the surrounding #if 0; debug10 is not
   defined in the part of the file visible here. */
Chrpos_T
Pair_binary_search_descending (int *querypos, int lowi, int highi, struct T *pairarray,
			       Chrpos_T goal_start, Chrpos_T goal_end) {
  int middlei;

  debug10(printf("entered binary search_descending with lowi=%d, highi=%d, goal=%u..%u\n",
		 lowi,highi,goal_start,goal_end));

  while (lowi < highi) {
    /* Pick the midpoint, then slide off any pair whose cdna is blank:
       first forward, and if that runs into highi, backward instead */
    middlei = lowi + ((highi - lowi) / 2);
    while (middlei < highi && pairarray[middlei].cdna == ' ') {
      /* Go forward past pairs corresponding to gaps */
      middlei++;
    }
    if (middlei >= highi) {
      middlei = lowi + ((highi - lowi) / 2);
      while (middlei >= lowi && pairarray[middlei].cdna == ' ') {
	/* Go backward past pairs corresponding to gaps */
	middlei--;
      }
      if (middlei < lowi) {
	debug10(printf("all intermediate pairs are gaps\n"));
#if 0
	*querypos = pairarray[lowi].querypos;
	return pairarray[lowi].genomepos;
#else
	return 0U;
#endif
      }
    }

    debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u..%u\n",
		   lowi,pairarray[lowi].genomepos,middlei,pairarray[middlei].genomepos,
		   highi,pairarray[highi].genomepos,goal_start,goal_end));
    if (goal_end > pairarray[middlei].genomepos) {
      /* Probe is below the goal, so the goal is at a lower index */
      highi = middlei;
    } else if (goal_start < pairarray[middlei].genomepos) {
      /* Probe is above the goal, so the goal is at a higher index */
      lowi = middlei + 1;
    } else {
      debug10(printf("binary search returns %d\n",middlei));
      *querypos = pairarray[middlei].querypos;
      return pairarray[middlei].genomepos;
    }
  }

  debug10(printf("binary search returns %d\n",highi));
  return 0U;
}
9704 #endif
9705 
9706 
9707 #if 0
/* Assumes querypos is in ascending order.  Need to look for worst
   case, so go to querypos, and then check all pairs for that
   querypos.  This also guarantees that the querypos value is unique,
   since a second value must be due to an indel. */
/* Returns false when no pair has the given querypos, or when any pair
   at that querypos is a gap or has a blank cdna or genome character;
   returns true otherwise.
   NOTE(review): compiled out by the surrounding #if 0. */
bool
Pairarray_contains_p (struct T *pairarray, int npairs, int querypos) {
  int i;

  /* Linear scan to the first pair at or beyond querypos */
  i = 0;
  while (i < npairs && pairarray[i].querypos < querypos) {
    i++;
  }

  if (i >= npairs || pairarray[i].querypos > querypos) {
    return false;
  } else {
    /* Every pair sharing this querypos must be a full (non-indel) pair */
    while (i < npairs && pairarray[i].querypos == querypos) {
      if (pairarray[i].gapp == true) {
	return false;
      } else if (pairarray[i].cdna == ' ') {
	return false;
      } else if (pairarray[i].genome == ' ') {
	return false;
      } else {
	/* Withhold judgement */
	i++;
      }
    }

    return true;
  }
}
9740 #endif
9741 
9742 
9743 #if 0
/* Returns the genomepos of the first pair in pairarray whose querypos
   equals the given querypos and which is not a gap and has non-blank
   cdna and genome characters.  Returns 0 when there is no such pair.
   The whole array is scanned linearly.
   NOTE(review): compiled out by the surrounding #if 0. */
Chrpos_T
Pairarray_lookup (struct T *pairarray, int npairs, int querypos) {
  int i;
  T pair;

  for (i = 0; i < npairs; i++) {
    pair = &(pairarray[i]);
    if (pair->querypos > querypos) {
      /* continue */
    } else if (pair->querypos < querypos) {
      /* continue */
    } else if (pair->gapp == true) {
      /* continue */
    } else if (pair->cdna == ' ') {
      /* continue */
    } else if (pair->genome == ' ') {
      /* continue */
    } else {
      return pair->genomepos;
    }
  }

  return 0;
}
9768 #endif
9769 
9770 
9771 void
Pairarray_chrpos_bounds(Chrpos_T * chrpos_start,Chrpos_T * chrpos_end,struct T * pairarray,int npairs)9772 Pairarray_chrpos_bounds (Chrpos_T *chrpos_start, Chrpos_T *chrpos_end,
9773 			 struct T *pairarray, int npairs) {
9774   T start, end;
9775 
9776   start = &(pairarray[0]);
9777   end = &(pairarray[npairs-1]);
9778   *chrpos_start = start->genomepos;
9779   *chrpos_end = end->genomepos;
9780 
9781   return;
9782 }
9783 
9784 
9785 
9786 
9787 Chrpos_T
Pairarray_genomicbound_from_start(struct T * pairarray,int npairs,int overlap)9788 Pairarray_genomicbound_from_start (struct T *pairarray, int npairs, int overlap) {
9789   int i;
9790   struct T pair;
9791 
9792   i = 0;
9793   pair = pairarray[i];
9794   while (i < npairs && overlap > 0) {
9795     pair = pairarray[i];
9796     if (pair.cdna != ' ') {
9797       overlap--;
9798     }
9799     i++;
9800   }
9801 
9802   return pair.genomepos;
9803 }
9804 
9805 Chrpos_T
Pairarray_genomicbound_from_end(struct T * pairarray,int npairs,int overlap)9806 Pairarray_genomicbound_from_end (struct T *pairarray, int npairs, int overlap) {
9807   int i;
9808   struct T pair;
9809 
9810   i = npairs-1;
9811   pair = pairarray[i];
9812   while (i >= 0 && overlap > 0) {
9813     pair = pairarray[i];
9814     if (pair.cdna != ' ') {
9815       overlap--;
9816     }
9817     i--;
9818   }
9819 
9820   return pair.genomepos;
9821 }
9822 
9823 
9824 char *
Pairarray_genomic_sequence(int * seqlength,struct T * pairarray,int npairs)9825 Pairarray_genomic_sequence (int *seqlength, struct T *pairarray, int npairs) {
9826   char *genomic, g;
9827   int i, k;
9828 
9829   for (i = 0, k = 0; i < npairs; i++) {
9830     if (pairarray[i].gapp == true) {
9831       /* Skip */
9832     } else if (pairarray[i].genome == ' ') {
9833       /* Skip */
9834     } else {
9835       k++;
9836     }
9837   }
9838 
9839   genomic = (char *) MALLOC((k+1) * sizeof(char));
9840   for (i = 0, k = 0; i < npairs; i++) {
9841     if (pairarray[i].gapp == true) {
9842       /* Skip.  Apparently, pairarray can have gap characters at introns */
9843     } else if ((g = pairarray[i].genome) == ' ') {
9844       /* Skip */
9845     } else {
9846       genomic[k++] = g;
9847     }
9848   }
9849   genomic[k] = '\0';
9850 
9851   *seqlength = k;
9852   return genomic;
9853 }
9854 
9855 
9856 
9857 int
Pair_cdna_direction(List_T pairs)9858 Pair_cdna_direction (List_T pairs) {
9859   int cdna_direction = 0;
9860   bool in_intron = false;
9861   T this;
9862   List_T p;
9863 
9864   for (p = pairs; p != NULL; p = List_next(p)) {
9865     this = (T) List_head(p);
9866     if (this->gapp) {
9867       if (!in_intron) {
9868 	if (this->comp == FWD_CANONICAL_INTRON_COMP) {
9869 	  cdna_direction += 1;
9870 	} else if (this->comp == REV_CANONICAL_INTRON_COMP) {
9871 	  cdna_direction -= 1;
9872 	}
9873 	in_intron = true;
9874       }
9875     } else {
9876       if (in_intron) {
9877 	in_intron = false;
9878       }
9879     }
9880   }
9881 
9882   return cdna_direction;
9883 }
9884 
9885 
9886 /* Returns first pair that exceeds breakpoint */
9887 T
Pair_start_bound(int * cdna_direction,List_T pairs,int breakpoint)9888 Pair_start_bound (int *cdna_direction, List_T pairs, int breakpoint) {
9889   T start = NULL, this;
9890   bool in_intron = false;
9891   List_T p;
9892 
9893   debug9(printf("Entering Pair_start_bound with breakpoint %d\n",breakpoint));
9894 
9895   *cdna_direction = 0;
9896 
9897   if ((p = pairs) != NULL) {
9898     start = this = (T) p->first;
9899   }
9900 
9901   while (p != NULL) {
9902     this = (T) p->first;
9903     debug9(Pair_dump_one(this,true));
9904     debug9(printf("\n"));
9905 
9906 
9907     if (this->gapp == true) {
9908       /* Skip */
9909     } else if (this->querypos > breakpoint) {
9910       while (p != NULL) {
9911 	this = (T) List_head(p);
9912 
9913 	if (this->gapp) {
9914 	  debug9(printf("For start bound, saw gap with comp %c\n",this->comp));
9915 	  if (!in_intron) {
9916 	    if (this->comp == FWD_CANONICAL_INTRON_COMP) {
9917 	      *cdna_direction += 1;
9918 	    } else if (this->comp == REV_CANONICAL_INTRON_COMP) {
9919 	      *cdna_direction -= 1;
9920 	    }
9921 	    in_intron = true;
9922 	  }
9923 	} else {
9924 	  if (in_intron) {
9925 	    in_intron = false;
9926 	  }
9927 	}
9928 
9929 	p = p->rest;
9930       }
9931 
9932       if (*cdna_direction > 0) {
9933 	*cdna_direction = +1;
9934       } else if (*cdna_direction < 0) {
9935 	*cdna_direction = -1;
9936       }
9937       return start;
9938 
9939     } else {
9940       start = this;
9941     }
9942 
9943     p = p->rest;
9944   }
9945 
9946 #if 0
9947   /* Found no gap beyond start */
9948   if (*cdna_direction > 0) {
9949     *cdna_direction = +1;
9950   } else if (*cdna_direction < 0) {
9951     *cdna_direction = -1;
9952   }
9953 #endif
9954 
9955   return start;
9956 }
9957 
9958 
9959 /* Returns last pair that exceeds breakpoint */
9960 T
Pair_end_bound(int * cdna_direction,List_T pairs,int breakpoint)9961 Pair_end_bound (int *cdna_direction, List_T pairs, int breakpoint) {
9962   T end = NULL, this;
9963   bool in_intron = false;
9964   List_T p;
9965 
9966   debug9(printf("Entering Pair_end_bound with breakpoint %d\n",breakpoint));
9967 
9968   *cdna_direction = 0;
9969 
9970   if ((p = pairs) != NULL) {
9971     end = this = (T) p->first;
9972   }
9973 
9974   while (p != NULL) {
9975     this = (T) p->first;
9976     debug9(Pair_dump_one(this,true));
9977     debug9(printf("\n"));
9978     if (this->gapp) {
9979       debug9(printf("For end bound, saw gap with comp %c\n",this->comp));
9980       if (!in_intron) {
9981 	if (this->comp == FWD_CANONICAL_INTRON_COMP) {
9982 	  *cdna_direction += 1;
9983 	} else if (this->comp == REV_CANONICAL_INTRON_COMP) {
9984 	  *cdna_direction -= 1;
9985 	}
9986 	in_intron = true;
9987       }
9988 
9989     } else {
9990       if (in_intron) {
9991 	in_intron = false;
9992       }
9993 
9994       if (this->querypos > breakpoint) {
9995 
9996 	if (*cdna_direction > 0) {
9997 	  *cdna_direction = +1;
9998 	} else if (*cdna_direction < 0) {
9999 	  *cdna_direction = -1;
10000 	}
10001 	return end;
10002 
10003       } else {
10004 	end = this;
10005       }
10006     }
10007 
10008     p = p->rest;
10009   }
10010 
10011   if (*cdna_direction > 0) {
10012     *cdna_direction = +1;
10013   } else if (*cdna_direction < 0) {
10014     *cdna_direction = -1;
10015   }
10016   return end;
10017 }
10018 
10019 
#if 0
/* Previously used for Stage3end_new_gmap */
/* Counts the leading pairs whose genomepos is >= chrbound.  When that
   run stops before the end of the array, the count is reduced by the
   bad pairs (gap, or blank cdna or genome) at the tail of the run. */
int
Pair_count_ge_fromstart (struct T *pairs, int npairs, Chrpos_T chrbound) {
  int count = 0, i;

  while (count < npairs && pairs[count].genomepos >= chrbound) {
    count++;
  }

  if (count < npairs) {
    /* Trim bad pairs */
    for (i = count - 1;
	 i >= 0 && (pairs[i].gapp || pairs[i].cdna == ' ' || pairs[i].genome == ' ');
	 i--) {
      count--;
    }
  }

  return count;
}
#endif
10042 
#if 0
/* Previously used for Stage3end_new_gmap */
/* Counts the trailing pairs whose genomepos is >= chrbound.  When that
   run stops before the start of the array, the count is reduced by the
   bad pairs (gap, or blank cdna or genome) at the inner edge of the run. */
int
Pair_count_ge_fromend (struct T *pairs, int npairs, Chrpos_T chrbound) {
  int count = 0, i = npairs - 1;

  while (i >= 0 && pairs[i].genomepos >= chrbound) {
    count++;
    i--;
  }

  if (i >= 0) {
    /* Trim bad pairs */
    while (++i < npairs && (pairs[i].gapp || pairs[i].cdna == ' ' || pairs[i].genome == ' ')) {
      count--;
    }
  }

  return count;
}
#endif
10065 
#if 0
/* Previously used for Stage3end_new_gmap */
/* Counts the leading pairs whose genomepos is < chrbound.  When that
   run stops before the end of the array, the count is reduced by the
   bad pairs (gap, or blank cdna or genome) at the tail of the run. */
int
Pair_count_lt_fromstart (struct T *pairs, int npairs, Chrpos_T chrbound) {
  int count = 0, i;

  while (count < npairs && pairs[count].genomepos < chrbound) {
    count++;
  }

  if (count < npairs) {
    for (i = count - 1;
	 i >= 0 && (pairs[i].gapp || pairs[i].cdna == ' ' || pairs[i].genome == ' ');
	 i--) {
      count--;
    }
  }

  return count;
}
#endif
10087 
#if 0
/* Previously used for Stage3end_new_gmap */
/* Counts the trailing pairs whose genomepos is < chrbound.  When that
   run stops before the start of the array, the count is reduced by the
   bad pairs (gap, or blank cdna or genome) at the inner edge of the run. */
int
Pair_count_lt_fromend (struct T *pairs, int npairs, Chrpos_T chrbound) {
  int count = 0, i = npairs - 1;

  while (i >= 0 && pairs[i].genomepos < chrbound) {
    count++;
    i--;
  }

  if (i >= 0) {
    while (++i < npairs && (pairs[i].gapp || pairs[i].cdna == ' ' || pairs[i].genome == ' ')) {
      count--;
    }
  }

  return count;
}
#endif
10109 
10110 
10111 
10112 void
Pair_trim_distances(int * trim5,int * trim3,List_T pairs)10113 Pair_trim_distances (int *trim5, int *trim3, List_T pairs) {
10114   int trim_right = 0, trim_left = -1; /* Needs to be -1 to avoid trimming when pairs is NULL */
10115   int bestscore, score, nmismatches = 0;
10116   int pairi;
10117   List_T p;
10118   T this;
10119   bool in_indelp;
10120 
10121   debug8(printf("Entered Pair_trim_distances\n"));
10122   if (pairs == NULL) {
10123     *trim5 = *trim3 = 0;
10124     return;
10125   }
10126 
10127 
10128   /* Find trim_right */
10129   bestscore = 0;
10130   score = 0;
10131   in_indelp = false;
10132   this = (T) NULL;
10133   for (p = pairs, pairi = 0; p != NULL; p = p->rest, pairi++) {
10134     this = p->first;
10135 
10136     if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
10137       if (in_indelp == false) {
10138 	score += trim_indel_score;
10139 	if (score < 0) {
10140 	  score = 0;
10141 	}
10142 	in_indelp = true;
10143       }
10144 
10145     } else {
10146       in_indelp = false;
10147       if (this->gapp) {
10148 	/* Don't count */
10149 
10150       } else if (this->comp == INTRONGAP_COMP) {
10151 	/* Do nothing */
10152 
10153       } else if (
10154 	 /* cdna of N is used commonly in PMAP */
10155 #ifndef PMAP
10156 		 this->cdna == 'N' ||
10157 #endif
10158 		 this->comp == MISMATCH_COMP) {
10159 	if (nmismatches++ == 0) {
10160 	  score += TRIM_MISMATCH_SCORE;
10161 	} else {
10162 	  score += TRIM_MISMATCH_SCORE - 1; /* Penalize multiple mismatches */
10163 	}
10164 	if (score < 0) {
10165 	  score = 0;
10166 	} else if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10167 	  bestscore = score;
10168 	  trim_right = pairi;
10169 	}
10170 
10171       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
10172 	score += TRIM_MATCH_SCORE;
10173 	if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10174 	  bestscore = score;
10175 	  trim_right = pairi;
10176 	}
10177 
10178       } else {
10179 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
10180 	abort();
10181       }
10182     }
10183 
10184     debug8(printf("pairi %d, querypos %d, genomepos %u, comp %c: Trim right score %d, trim_right %d, protectedp %d\n",
10185 		  pairi,this->querypos,this->genomepos,this->comp,score,trim_right,this->protectedp));
10186   }
10187 
10188   *trim3 = pairi - 1 - trim_right;
10189   debug8(printf("Final: Trim right pairi %d, score %d, trim3 %d\n",pairi,score,*trim3));
10190 
10191 
10192   /* Find trim_left */
10193   pairs = List_reverse(pairs);
10194   bestscore = 0;
10195   score = 0;
10196   in_indelp = false;
10197   this = (T) NULL;
10198   for (p = pairs, pairi = 0; p != NULL; p = p->rest, pairi++) {
10199     this = p->first;
10200 
10201     if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
10202       if (in_indelp == false) {
10203 	score += trim_indel_score;
10204 	if (score < 0) {
10205 	  score = 0;
10206 	}
10207 	in_indelp = true;
10208       }
10209 
10210     } else {
10211       in_indelp = false;
10212 
10213       if (this->gapp) {
10214 	/* Don't count */
10215 
10216       } else if (this->comp == INTRONGAP_COMP) {
10217 	/* Do nothing */
10218 
10219       } else if (
10220 	 /* cdna of N is used commonly in PMAP */
10221 #ifndef PMAP
10222 		 this->cdna == 'N' ||
10223 #endif
10224 		 this->comp == MISMATCH_COMP) {
10225 	if (nmismatches++ == 0) {
10226 	  score += TRIM_MISMATCH_SCORE;
10227 	} else {
10228 	  score += TRIM_MISMATCH_SCORE - 1; /* Penalize multiple mismatches */
10229 	}
10230 	if (score < 0) {
10231 	  score = 0;
10232 	} else if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10233 	  bestscore = score;
10234 	  trim_left = pairi;
10235 	}
10236 
10237       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
10238 	score += TRIM_MATCH_SCORE;
10239 	if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10240 	  bestscore = score;
10241 	  trim_left = pairi;
10242 	}
10243 
10244       } else {
10245 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
10246 	abort();
10247       }
10248     }
10249 
10250     debug8(printf("pairi %d, querypos %d, genomepos %u, comp %c: Trim left score %d, trim_left %d, protectedp %d\n",
10251 		  pairi,this->querypos,this->genomepos,this->comp,score,trim_left,this->protectedp));
10252   }
10253 
10254   *trim5 = pairi - 1 - trim_left;
10255   debug8(printf("Final: Trim left pairi %d, score %d, trim5 %d\n",pairi,score,*trim5));
10256 
10257   /* Restore original order */
10258   pairs = List_reverse(pairs);
10259   return;
10260 }
10261 
10262 
10263 List_T
Pair_trim_ends(bool * trim5p,bool * trim3p,List_T pairs,int ambig_end_length_5,int ambig_end_length_3)10264 Pair_trim_ends (bool *trim5p, bool *trim3p, List_T pairs, int ambig_end_length_5, int ambig_end_length_3) {
10265   List_T trimmed = NULL;
10266   int trim_right = 0, trim_left = -1; /* Needs to be -1 to avoid trimming when pairs is NULL */
10267   int bestscore, score, nmismatches = 0;
10268   int pairi;
10269   List_T p, pairptr;
10270   T this;
10271   int i;
10272   bool in_indelp;
10273 
10274   debug8(printf("Entered trim_ends\n"));
10275   if (pairs == NULL) {
10276     *trim5p = *trim3p = false;
10277     return (List_T) NULL;
10278   }
10279 
10280 
10281   /* Find trim_right */
10282   bestscore = 0;
10283   score = 0;
10284   in_indelp = false;
10285   this = (T) NULL;
10286   for (p = pairs, pairi = 0; p != NULL; p = p->rest, pairi++) {
10287     this = p->first;
10288 
10289     if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
10290       if (in_indelp == false) {
10291 	score += trim_indel_score;
10292 	if (score < 0) {
10293 	  score = 0;
10294 	}
10295 	in_indelp = true;
10296       }
10297 
10298     } else {
10299       in_indelp = false;
10300       if (this->gapp) {
10301 	/* Don't count */
10302 
10303       } else if (this->comp == INTRONGAP_COMP) {
10304 	/* Do nothing */
10305 
10306       } else if (
10307 	 /* cdna of N is used commonly in PMAP */
10308 #ifndef PMAP
10309 		 this->cdna == 'N' ||
10310 #endif
10311 		 this->comp == MISMATCH_COMP) {
10312 	if (nmismatches++ == 0) {
10313 	  score += TRIM_MISMATCH_SCORE;
10314 	} else {
10315 	  score += TRIM_MISMATCH_SCORE - 1; /* Penalize multiple mismatches */
10316 	}
10317 	if (score < 0) {
10318 	  score = 0;
10319 	} else if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10320 	  bestscore = score;
10321 	  trim_right = pairi;
10322 	}
10323 
10324       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
10325 	score += TRIM_MATCH_SCORE;
10326 	if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10327 	  bestscore = score;
10328 	  trim_right = pairi;
10329 	}
10330 
10331       } else {
10332 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
10333 	abort();
10334       }
10335     }
10336 
10337     debug8(printf("pairi %d, querypos %d, genomepos %u, comp %c: Trim right score %d, trim_right %d, protectedp %d\n",
10338 		  pairi,this->querypos,this->genomepos,this->comp,score,trim_right,this->protectedp));
10339   }
10340 
10341   if (this == NULL) {
10342     fprintf(stderr,"check for trim_right yields this == NULL\n");
10343     abort();
10344   } else if (ambig_end_length_3 > 0) {
10345     debug8(printf("Not disturbing ambiguous end on right\n"));
10346     trim_right = 0;
10347   } else if (this->protectedp == true) {
10348     debug8(printf("Protected against trim_right\n"));
10349     trim_right = 0;
10350   } else {
10351     trim_right = pairi - 1 - trim_right;
10352     debug8(printf("Final: Trim right pairi %d, score %d, trim_right %d\n",pairi,score,trim_right));
10353   }
10354   debug8(printf("\n"));
10355 
10356 
10357   /* Find trim_left */
10358   pairs = List_reverse(pairs);
10359   bestscore = 0;
10360   score = 0;
10361   in_indelp = false;
10362   this = (T) NULL;
10363   for (p = pairs, pairi = 0; p != NULL; p = p->rest, pairi++) {
10364     this = p->first;
10365 
10366     if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
10367       if (in_indelp == false) {
10368 	score += trim_indel_score;
10369 	if (score < 0) {
10370 	  score = 0;
10371 	}
10372 	in_indelp = true;
10373       }
10374 
10375     } else {
10376       in_indelp = false;
10377 
10378       if (this->gapp) {
10379 	/* Don't count */
10380 
10381       } else if (this->comp == INTRONGAP_COMP) {
10382 	/* Do nothing */
10383 
10384       } else if (
10385 	 /* cdna of N is used commonly in PMAP */
10386 #ifndef PMAP
10387 		 this->cdna == 'N' ||
10388 #endif
10389 		 this->comp == MISMATCH_COMP) {
10390 	if (nmismatches++ == 0) {
10391 	  score += TRIM_MISMATCH_SCORE;
10392 	} else {
10393 	  score += TRIM_MISMATCH_SCORE - 1; /* Penalize multiple mismatches */
10394 	}
10395 	if (score < 0) {
10396 	  score = 0;
10397 	} else if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10398 	  bestscore = score;
10399 	  trim_left = pairi;
10400 	}
10401 
10402       } else if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
10403 	score += TRIM_MATCH_SCORE;
10404 	if (score >= bestscore) { /* Want >= and not >, so extend to ends */
10405 	  bestscore = score;
10406 	  trim_left = pairi;
10407 	}
10408 
10409       } else {
10410 	fprintf(stderr,"Can't parse comp %c, gapp %d\n",this->comp,this->gapp);
10411 	abort();
10412       }
10413     }
10414 
10415     debug8(printf("pairi %d, querypos %d, genomepos %u, comp %c: Trim left score %d, trim_left %d, protectedp %d\n",
10416 		  pairi,this->querypos,this->genomepos,this->comp,score,trim_left,this->protectedp));
10417   }
10418 
10419   if (this == NULL) {
10420     fprintf(stderr,"check for trim_left yields this == NULL\n");
10421     abort();
10422   } else if (ambig_end_length_5 > 0) {
10423     debug8(printf("Not disturbing ambiguous end on left\n"));
10424     trim_left = pairi - 1;
10425   } else if (this->protectedp == true) {
10426     debug8(printf("Protected against trim_left\n"));
10427     trim_left = pairi - 1;
10428   } else {
10429     debug8(printf("Final: Trim left pairi %d, score %d, trim_left %d\n",pairi,score,trim_left));
10430   }
10431   debug8(printf("\n"));
10432 
10433 
10434   /* trim */
10435   if (trim_right == 0) {
10436     *trim3p = false;
10437   } else {
10438     *trim3p = true;
10439   }
10440 
10441   if (trim_left == 0) {
10442     *trim5p = false;
10443   } else {
10444     *trim5p = true;
10445   }
10446 
10447   i = 0;
10448   while (i < trim_right) {
10449     pairs = Pairpool_pop(pairs,&this);
10450     i++;
10451   }
10452 
10453   while (i <= trim_left) {
10454     pairptr = pairs;
10455     pairs = Pairpool_pop(pairs,&this);
10456 #ifdef WASTE
10457     path = Pairpool_push_existing(path,pairpool,pair);
10458 #else
10459     trimmed = List_push_existing(trimmed,pairptr);
10460 #endif
10461     i++;
10462   }
10463 
10464   debug8(Pair_dump_list(trimmed,/*zerobasedp*/true));
10465 
10466   return trimmed;
10467 }
10468 
10469 
#if 0
/* Subtracts chrlength from every genomepos in pairarray that is
   greater than chrlength */
void
Pairarray_unalias (struct T *pairarray, int npairs, Chrpos_T chrlength) {
  int i = 0;

  while (i < npairs) {
    if (pairarray[i].genomepos > chrlength) {
      pairarray[i].genomepos -= chrlength;
    }
    i++;
  }
  return;
}
#endif
10483 
10484 
10485 void
Pair_split_circular(List_T * pairs_below,List_T * pairs_above,List_T pairs,Chrpos_T chrlength,Pairpool_T pairpool,bool plusp)10486 Pair_split_circular (List_T *pairs_below, List_T *pairs_above, List_T pairs,
10487 		     Chrpos_T chrlength, Pairpool_T pairpool, bool plusp) {
10488   List_T below = NULL, above = NULL, *dest, p = pairs;
10489   T pair;
10490 
10491   if (plusp == true) {
10492     dest = &below;
10493     while (p != NULL) {
10494       pair = (T) List_head(p);
10495       if (pair->gapp == true) {
10496 	/* Skip */
10497       } else if (pair->genomepos >= chrlength) {
10498 	dest = &above;
10499       }
10500       *dest = Pairpool_push_existing(*dest,pairpool,pair);
10501       p = List_next(p);
10502     }
10503 
10504     /* Unalias pairs above */
10505     for (p = above; p != NULL; p = List_next(p)) {
10506       pair = (T) List_head(p);
10507       pair->genomepos -= chrlength;
10508     }
10509 
10510   } else {
10511     dest = &above;
10512     while (p != NULL) {
10513       pair = (T) List_head(p);
10514       if (pair->gapp == true) {
10515 	/* Skip */
10516       } else if (pair->genomepos > chrlength) {
10517 	dest = &below;
10518       }
10519       *dest = Pairpool_push_existing(*dest,pairpool,pair);
10520       p = List_next(p);
10521     }
10522 
10523     /* Unalias pairs above */
10524     for (p = below; p != NULL; p = List_next(p)) {
10525       pair = (T) List_head(p);
10526       pair->genomepos -= chrlength;
10527     }
10528   }
10529 
10530   *pairs_below = List_reverse(below);
10531   *pairs_above = List_reverse(above);
10532 
10533   return;
10534 }
10535 
10536 
10537 
10538