1 static char rcsid[] = "$Id: simplepair.c 222730 2020-05-29 13:10:09Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #include "simplepair.h"
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>		/* For memcpy */
16 #include <math.h>		/* For rint(), abs() */
17 #include <ctype.h>		/* For toupper */
18 
19 #include "assert.h"
20 #include "except.h"
21 #include "mem.h"
22 #include "comp.h"
23 #include "complement.h"
24 #include "transcript.h"
25 #include "method.h"
26 
27 
28 
29 #ifdef DEBUG1
30 #define debug1(x) x
31 #else
32 #define debug1(x)
33 #endif
34 
35 #ifdef DEBUG4
36 #define debug4(x) x
37 #else
38 #define debug4(x)
39 #endif
40 
41 /* hardclip_pairarray */
42 #ifdef DEBUG10
43 #define debug10(x) x
44 #else
45 #define debug10(x)
46 #endif
47 
48 
49 static bool splicingp;
50 static Univ_IIT_T transcript_iit;
51 
52 static bool sam_insert_0M_p = false;
53 static bool md_lowercase_variant_p;
54 static bool snps_p;
55 
56 static bool cigar_extended_p;
57 static Cigar_action_T cigar_action;
58 
59 
60 
61 #define T Simplepair_T
62 
63 
64 Chrpos_T
Simplepair_head_genomepos(List_T pairs)65 Simplepair_head_genomepos (List_T pairs) {
66   return ((T) List_head(pairs))->genomepos;
67 }
68 
69 Chrpos_T
Simplepair_last_genomepos(List_T pairs)70 Simplepair_last_genomepos (List_T pairs) {
71   return ((T) List_last_value(pairs))->genomepos;
72 }
73 
74 
75 /* For GSNAP, generate for output thread only.  For GMAP, pairs needed by worker threads are made in pairpool.c */
76 T
Simplepair_new_out(int querypos,Chrpos_T genomepos,char cdna,char comp,char genome)77 Simplepair_new_out (int querypos, Chrpos_T genomepos, char cdna, char comp, char genome) {
78   T new = (T) MALLOC_OUT(sizeof(*new));
79 
80   new->querypos = querypos;
81   new->genomepos = genomepos;
82   new->cdna = cdna;
83   new->comp = comp;
84   new->genome = genome;
85   new->genomealt = genome;
86 
87   switch (comp) {
88   case FWD_CANONICAL_INTRON_COMP /*'>'*/:
89   case REV_CANONICAL_INTRON_COMP /*'<'*/:
90   case NONINTRON_COMP /*'='*/:
91   case SHORTGAP_COMP /*'~'*/:
92   case INTRONGAP_COMP /*'.'*/:
93   case FWD_GCAG_INTRON_COMP /*')'*/:
94   case REV_GCAG_INTRON_COMP /*'('*/:
95   case FWD_ATAC_INTRON_COMP /*']'*/:
96   case REV_ATAC_INTRON_COMP /*'['*/:
97   case DUALBREAK_COMP /*'#'*/:
98     new->gapp = true; break;
99   default: new->gapp = false;
100   }
101 
102   return new;
103 }
104 
105 void
Simplepair_free_out(T * old)106 Simplepair_free_out (T *old) {
107   if (*old) {
108     FREE_OUT(*old);
109   }
110   return;
111 }
112 
113 
114 static void
dump_one(T this,bool zerobasedp)115 dump_one (T this, bool zerobasedp) {
116 
117   debug1(printf("%p ",this));
118 
119   if (this->gapp == true) {
120     printf("*** Gap: type: ");
121     switch (this->comp) {
122     case FWD_CANONICAL_INTRON_COMP: printf("> GT-AG"); break;
123     case FWD_GCAG_INTRON_COMP: printf(") GC-AG"); break;
124     case FWD_ATAC_INTRON_COMP: printf("] AT-AC"); break;
125     case REV_ATAC_INTRON_COMP: printf("[ AT-AC"); break;
126     case REV_GCAG_INTRON_COMP: printf("( GC-AG"); break;
127     case REV_CANONICAL_INTRON_COMP: printf("< GT-AG"); break;
128     case SHORTGAP_COMP: printf("~ shortgap"); break;
129     case NONINTRON_COMP: printf("= nonintron"); break;
130     default: printf("? unknown"); break;
131     }
132 
133     printf(" ***");
134 
135   } else {
136     printf("%d %d %c ",
137 	   this->querypos + !zerobasedp,this->genomepos + !zerobasedp,this->cdna);
138 
139     /* Subtract 1 because dynprogindices start at +1 and -1 */
140     putchar(this->comp);
141     printf(" %c",this->genome);
142     if (this->genomealt != this->genome) {
143       printf(" alt:%c",this->genomealt);
144     }
145   }
146 
147   return;
148 }
149 
150 
151 /* Useful for debugging */
152 void
Simplepair_dump_list(List_T pairs,bool zerobasedp)153 Simplepair_dump_list (List_T pairs, bool zerobasedp) {
154   T this, prev = NULL, old = NULL;
155   List_T p;
156 
157   printf("***Start of list***\n");
158   for (p = pairs; p != NULL; p = List_next(p)) {
159     this = List_head(p);
160     dump_one(this,zerobasedp);
161     printf("\n");
162 
163     if (this->querypos != -1) {
164       if (old != NULL) {
165 	if (old->querypos > prev->querypos) {
166 	  if (prev->querypos < this->querypos) {
167 	    fprintf(stderr,"%d %d %d\n",old->querypos,prev->querypos,this->querypos);
168 	    abort();
169 	  }
170 	} else if (old->querypos < prev->querypos) {
171 	  if (prev->querypos > this->querypos) {
172 	    fprintf(stderr,"%d %d %d\n",old->querypos,prev->querypos,this->querypos);
173 	    abort();
174 	  }
175 	}
176       }
177 
178       old = prev;
179       prev = this;
180     }
181 
182   }
183   printf("***End of list***\n");
184   return;
185 }
186 
187 
188 List_T
Simplepair_strip_gaps_at_head(List_T pairs)189 Simplepair_strip_gaps_at_head (List_T pairs) {
190   T pair;
191 
192   while (pairs != NULL && (pair = pairs->first) != NULL &&
193 	 (pair->gapp == true || pair->cdna == ' ' || pair->genome == ' ')) {
194     pairs = List_pop_out(pairs,(void **) &pair);
195     Simplepair_free_out(&pair);
196   }
197 
198   return pairs;
199 }
200 
201 List_T
Simplepair_strip_gaps_at_tail(List_T pairs)202 Simplepair_strip_gaps_at_tail (List_T pairs) {
203   T pair;
204 
205   if (pairs != NULL) {
206     pairs = List_reverse(pairs);
207 
208     while (pairs != NULL && (pair = pairs->first) != NULL &&
209 	   (pair->gapp == true || pair->cdna == ' ' || pair->genome == ' ')) {
210       pairs = List_pop_out(pairs,(void **) &pair);
211       Simplepair_free_out(&pair);
212     }
213 
214     pairs = List_reverse(pairs);
215   }
216 
217   return pairs;
218 }
219 
220 
221 
222 static struct T *
hardclip_pairarray(int * clipped_npairs,int hardclip_start,int hardclip_end,struct T * pairs,int npairs,int querylength)223 hardclip_pairarray (int *clipped_npairs, int hardclip_start, int hardclip_end,
224 		    struct T *pairs, int npairs, int querylength) {
225   struct T *clipped_pairs, *ptr;
226   int i, starti;
227 
228   debug10(printf("Entered hardclip_pairarray with hardclip_start %d, hardclip_end %d, querylength %d\n",
229 		 hardclip_start,hardclip_end,querylength));
230   debug10(Simplepair_dump_array(pairs,npairs,true));
231   debug10(printf("Starting with %d pairs\n",npairs));
232 
233   i = 0;
234   ptr = pairs;
235   while (i < npairs && ptr->querypos < hardclip_start) {
236     i++;
237     ptr++;
238   }
239   while (i < npairs && (ptr->gapp == true || ptr->cdna == ' ' || ptr->genome == ' ')) {
240     i++;
241     ptr++;
242   }
243 
244   if (i >= npairs) {
245     /* hardclip_start passes right end of read, so invalid */
246     debug10(printf("i = %d, so passed end of read\n",i));
247     hardclip_start = 0;
248   } else if (hardclip_start > 0) {
249     hardclip_start = ptr->querypos;
250   }
251 
252   starti = i;
253   debug10(printf("starti is %d\n",starti));
254 
255   clipped_pairs = ptr;
256 
257   while (i < npairs && ptr->querypos < querylength - hardclip_end) {
258     i++;
259     ptr++;
260   }
261 
262   i--;
263   ptr--;
264   while (i >= starti && (ptr->gapp == true || ptr->cdna == ' ' || ptr->genome == ' ')) {
265     i--;
266     ptr--;
267   }
268 
269   if (i < 0) {
270     /* hardclip_end passes left end of read, so invalid */
271     debug10(printf("i = %d, so passed left end of read\n",i));
272     hardclip_end = 0;
273   } else if (hardclip_end > 0) {
274     hardclip_end = querylength - 1 - ptr->querypos;
275   }
276 
277   if (hardclip_start == 0 && hardclip_end == 0) {
278     debug10(printf("Unable to hard clip\n"));
279     *clipped_npairs = npairs;
280     clipped_pairs = pairs;
281   } else {
282     *clipped_npairs = i - starti + 1;
283   }
284 
285   debug10(printf("Ending with %d pairs\n",*clipped_npairs));
286   debug10(printf("Exiting hardclip_pairs with hardclip_start %d, hardclip_end %d\n",
287 		 hardclip_start,hardclip_end));
288 
289   return clipped_pairs;
290 }
291 
292 
293 Chrpos_T
Simplepair_genomicpos_low(int hardclip_low,int hardclip_high,struct T * pairarray,int npairs,int querylength,bool plusp,bool hide_soft_clips_p)294 Simplepair_genomicpos_low (int hardclip_low, int hardclip_high,
295 			   struct T *pairarray, int npairs, int querylength,
296 			   bool plusp, bool hide_soft_clips_p) {
297   struct T *clipped_pairs;
298   int clipped_npairs;
299   T lowpair, highpair;
300 
301 #if 0
302   if (clipdir >= 0) {
303     if (plusp == true) {
304       if (first_read_p == true) {
305 	hardclip_high = hardclip5;
306 	hardclip_low = 0;
307       } else {
308 	hardclip_high = 0;
309 	hardclip_low = hardclip3;
310       }
311     } else {
312       if (first_read_p == true) {
313 	hardclip_low = hardclip5;
314 	hardclip_high = 0;
315       } else {
316 	hardclip_low = 0;
317 	hardclip_high = hardclip3;
318       }
319     }
320   } else {
321     if (plusp == true) {
322       if (first_read_p == true) {
323 	hardclip_low = hardclip5;
324 	hardclip_high = 0;
325       } else {
326 	hardclip_low = 0;
327 	hardclip_high = hardclip3;
328       }
329     } else {
330       if (first_read_p == true) {
331 	hardclip_high = hardclip5;
332 	hardclip_low = 0;
333       } else {
334 	hardclip_high = 0;
335 	hardclip_low = hardclip3;
336       }
337     }
338   }
339 #endif
340 
341   if (plusp == true) {
342     clipped_pairs = hardclip_pairarray(&clipped_npairs,hardclip_low,hardclip_high,
343 				       pairarray,npairs,querylength);
344     lowpair = &(clipped_pairs[0]);
345     highpair = &(clipped_pairs[clipped_npairs-1]);
346     if (hide_soft_clips_p == true) {
347       assert(lowpair->querypos == 0);
348       assert(highpair->querypos == querylength - 1);
349       /* *chrpos_high = highpair->genomepos + 1 - (querylength - 1 - highpair->querypos); */
350       return lowpair->genomepos + 1 - lowpair->querypos;
351     } else {
352       /* *chrpos_high = highpair->genomepos + 1; */
353       return lowpair->genomepos + 1;
354     }
355   } else {
356     /* Swap hardclip_low and hardclip_high */
357     clipped_pairs = hardclip_pairarray(&clipped_npairs,hardclip_high,hardclip_low,
358 				       pairarray,npairs,querylength);
359     lowpair = &(clipped_pairs[clipped_npairs-1]);
360     highpair = &(clipped_pairs[0]);
361     if (hide_soft_clips_p == true) {
362       assert(lowpair->querypos == querylength - 1);
363       assert(highpair->querypos == 0);
364       /* *chrpos_high = highpair->genomepos + 1 - highpair->querypos; */
365       return lowpair->genomepos + 1 + (querylength - 1 - lowpair->querypos);
366     } else {
367       /* *chrpos_high = highpair->genomepos + 1; */
368       return lowpair->genomepos + 1;
369     }
370   }
371 }
372 
373 
374 static List_T
push_token(List_T tokens,char * token)375 push_token (List_T tokens, char *token) {
376   char *copy;
377 
378   copy = (char *) MALLOC_OUT((strlen(token)+1) * sizeof(char));
379   strcpy(copy,token);
380   return List_push_out(tokens,(void *) copy);
381 }
382 
383 static void
tokens_free(List_T * tokens)384 tokens_free (List_T *tokens) {
385   List_T p;
386   char *token;
387 
388   for (p = *tokens; p != NULL; p = List_next(p)) {
389     token = (char *) List_head(p);
390     FREE_OUT(token);
391   }
392   List_free_out(&(*tokens));
393 
394   return;
395 }
396 
397 static void
print_tokens(Filestring_T fp,List_T tokens)398 print_tokens (Filestring_T fp, List_T tokens) {
399   List_T p;
400   char *token;
401 
402   for (p = tokens; p != NULL; p = List_next(p)) {
403     token = (char *) List_head(p);
404     FPRINTF(fp,"%s",token);
405     /* FREE_OUT(token); -- Now freed within Stage3end_free or Stage3_free */
406   }
407 
408   return;
409 }
410 
411 static int
cigar_length(List_T tokens)412 cigar_length (List_T tokens) {
413   int length = 0, tokenlength;
414   List_T p;
415   char *token;
416   char type;
417 
418   for (p = tokens; p != NULL; p = List_next(p)) {
419     token = (char *) List_head(p);
420     type = token[strlen(token)-1];
421     /* Should include 'H', but that gets added according to hardclip_low and hardclip_high */
422     if (type == 'S' || type == 'I' || type == 'M' || type == 'X' || type == '=') {
423       sscanf(token,"%d",&tokenlength);
424       length += tokenlength;
425     }
426   }
427 
428   return length;
429 }
430 
431 
432 static List_T
clean_cigar(List_T tokens,bool watsonp)433 clean_cigar (List_T tokens, bool watsonp) {
434   List_T clean, unique = NULL, p;
435   char token[11], *curr_token, *last_token;
436   int length = 0;
437   char type, last_type = ' ';
438   bool duplicatep = false;
439 
440   for (p = tokens; p != NULL; p = List_next(p)) {
441     curr_token = (char *) List_head(p);
442     type = curr_token[strlen(curr_token)-1];
443     if (type == last_type) {
444       length += atoi(last_token);
445       FREE_OUT(last_token);
446       duplicatep = true;
447     } else {
448       if (last_type == ' ') {
449 	/* Skip */
450       } else if (duplicatep == false) {
451 	unique = List_push_out(unique,(void *) last_token);
452       } else {
453 	length += atoi(last_token);
454 	FREE_OUT(last_token);
455 	sprintf(token,"%d%c",length,last_type);
456 	unique = push_token(unique,token);
457       }
458       last_type = type;
459       duplicatep = false;
460       length = 0;
461     }
462     last_token = curr_token;
463   }
464   if (last_type == ' ') {
465     /* Skip */
466   } else if (duplicatep == false) {
467     unique = List_push_out(unique,(void *) last_token);
468   } else {
469     length += atoi(last_token);
470     FREE_OUT(last_token);
471     sprintf(token,"%d%c",length,last_type);
472     unique = push_token(unique,token);
473   }
474   List_free_out(&tokens);
475 
476 
477   if (sam_insert_0M_p == false) {
478     /* Return result */
479     if (watsonp) {
480       /* Put tokens in forward order */
481       return unique;
482     } else {
483       /* Keep tokens in reverse order */
484       return List_reverse(unique);
485     }
486 
487   } else {
488     /* Insert "0M" between adjacent I and D operations */
489     last_type = ' ';
490     clean = (List_T) NULL;
491     for (p = unique; p != NULL; p = List_next(p)) {
492       curr_token = (char *) List_head(p);
493       type = curr_token[strlen(curr_token)-1];
494       if (last_type == 'I' && type == 'D') {
495 	clean = push_token(clean,"0M");
496       } else if (last_type == 'D' && type == 'I') {
497 	clean = push_token(clean,"0M");
498       }
499       clean = List_push_out(clean,(void *) curr_token);
500       last_type = type;
501     }
502     List_free_out(&unique);
503 
504     /* Return result */
505     if (watsonp) {
506       /* Put tokens in forward order */
507       return List_reverse(clean);
508     } else {
509       /* Keep tokens in reverse order */
510       return clean;
511     }
512   }
513 }
514 
515 
516 static List_T
compute_cigar_standard(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int sensedir,int chimera_part)517 compute_cigar_standard (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
518 			bool watsonp,
519 #ifdef CONVERT_INTRONS_TO_DELETIONS
520 			int sensedir,
521 #endif
522 			int chimera_part) {
523   List_T tokens = NULL;
524   char token[11];
525   int Mlength = 0, Ilength = 0, Dlength = 0;
526   bool in_exon = false, deletionp;
527   struct T *ptr, *prev, *this = NULL;
528   int exon_queryend = -1;
529   Chrpos_T exon_genomestart = 0;
530   Chrpos_T exon_genomeend, genome_gap;
531   int query_gap;
532   int last_querypos = -1;
533   Chrpos_T last_genomepos = (Chrpos_T) -1;
534   int i;
535 
536   /* *chimera_hardclip_start = *chimera_hardclip_high = 0; */
537   *intronp = false;
538 
539   ptr = pairs;
540 
541   if (chimera_part == +1) {
542     if (ptr->querypos > *hardclip_start) {
543       if (ptr->querypos > 0) {
544 	/* Clip to beginning */
545 	*hardclip_start = ptr->querypos;
546 	sprintf(token,"%dH",*hardclip_start);
547 	tokens = push_token(tokens,token);
548       }
549     } else {
550       if (*hardclip_start > 0) {
551 	/* Clip to hard clip boundary */
552 	sprintf(token,"%dH",*hardclip_start);
553 	tokens = push_token(tokens,token);
554       }
555     }
556   } else {
557     if (*hardclip_start > 0) {
558       sprintf(token,"%dH",*hardclip_start);
559       tokens = push_token(tokens,token);
560     }
561     if (ptr->querypos > (*hardclip_start)) {
562       sprintf(token,"%dS",ptr->querypos - (*hardclip_start));
563       tokens = push_token(tokens,token);
564     }
565   }
566 
567   this = (T) NULL;
568   for (i = 0; i < npairs; i++) {
569     prev = this;
570     this = ptr++;
571 
572 #if 0
573     /* Cigar_print_tokens(stdout,tokens); */
574     Pair_dump_one(this,true);
575     printf("\n");
576 #endif
577 
578     if (this->gapp) {
579       if (in_exon == true) {
580 	exon_queryend = last_querypos + 1;
581 	exon_genomeend = last_genomepos + 1;
582 #if 0
583 	if (watsonp) {
584 	  intron_start = exon_genomeend + 1;
585 	} else {
586 	  intron_start = exon_genomeend - 1;
587 	}
588 #endif
589 
590 	if (Mlength > 0) {
591 	  sprintf(token,"%dM",Mlength);
592 	  tokens = push_token(tokens,token);
593 	} else if (Ilength > 0) {
594 	  sprintf(token,"%dI",Ilength);
595 	  tokens = push_token(tokens,token);
596 	} else if (Dlength > 0) {
597 	  sprintf(token,"%dD",Dlength);
598 	  tokens = push_token(tokens,token);
599 	}
600 
601 	Mlength = Ilength = Dlength = 0;
602 
603 	in_exon = false;
604       }
605 
606     } else if (this->comp == INTRONGAP_COMP) {
607       /* Do nothing */
608 
609     } else {
610       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
611 	 SHORTGAP_COMP, or MISMATCH_COMP */
612       if (in_exon == false) {
613 	/* exon_querystart = this->querypos + 1; */
614 	exon_genomestart = this->genomepos + 1;
615 
616 	if (prev != NULL) {
617 	  /* Gap */
618 	  /* abs() gives a large value when flag -m64 is specified */
619 	  /* genome_gap = abs(intron_end - intron_start) + 1; */
620 	  if (watsonp) {
621 	    /* intron_end = exon_genomestart - 1; */
622 	    /* genome_gap = (intron_end - intron_start) + 1; */
623 	    genome_gap = exon_genomestart - exon_genomeend - 1;
624 	  } else {
625 	    /* intron_end = exon_genomestart + 1; */
626 	    /* genome_gap = (intron_start - intron_end) + 1; */
627 	    genome_gap = exon_genomeend - exon_genomestart - 1;
628 	  }
629 
630 	  deletionp = false;
631 #ifdef CONVERT_INTRONS_TO_DELETIONS
632 	  if (sensedir == SENSE_FORWARD) {
633 	    if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
634 		prev->comp == FWD_GCAG_INTRON_COMP ||
635 		prev->comp == FWD_ATAC_INTRON_COMP) {
636 	      sprintf(token,"%uN",genome_gap);
637 	      *intronp = true;
638 	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
639 	      sprintf(token,"%uN",genome_gap);
640 	      *intronp = true;
641 	    } else {
642 	      sprintf(token,"%uD",genome_gap);
643 	      deletionp = true;
644 	    }
645 	  } else if (sensedir == SENSE_ANTI) {
646 	    if (prev->comp == REV_CANONICAL_INTRON_COMP ||
647 		prev->comp == REV_GCAG_INTRON_COMP ||
648 		prev->comp == REV_ATAC_INTRON_COMP) {
649 	      sprintf(token,"%uN",genome_gap);
650 	      *intronp = true;
651 	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
652 	      sprintf(token,"%uN",genome_gap);
653 	      *intronp = true;
654 	    } else {
655 	      sprintf(token,"%uD",genome_gap);
656 	      deletionp = true;
657 	    }
658 	  } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
659 	    sprintf(token,"%uN",genome_gap);
660 	    *intronp = true;
661 	  } else {
662 	    sprintf(token,"%uD",genome_gap);
663 	    deletionp = true;
664 	  }
665 #else
666 	  sprintf(token,"%uN",genome_gap);
667 	  *intronp = true;
668 #endif
669 	  tokens = push_token(tokens,token);
670 
671 	  /* Check for dual gap.  Doesn't work for hard clipping. */
672 	  /* assert(exon_queryend >= 0); */
673 
674 	  query_gap = this->querypos - exon_queryend;
675 	  assert(query_gap >= 0);
676 	  if (query_gap > 0) {
677 	    if (deletionp == true && sam_insert_0M_p == true) {
678 	      /* Put zero matches between deletion and insertion, since some programs will complain */
679 	      sprintf(token,"0M");
680 	      tokens = push_token(tokens,token);
681 	    }
682 
683 	    sprintf(token,"%uI",query_gap);
684 	    tokens = push_token(tokens,token);
685 	  }
686 	}
687 
688 	in_exon = true;
689       }
690 
691       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
692 	/* Gap in upper or lower sequence */
693 	if (this->genome == ' ') {
694 	  /* Insertion relative to genome */
695 	  if (Mlength > 0) {
696 	    sprintf(token,"%dM",Mlength);
697 	    tokens = push_token(tokens,token);
698 	    Mlength = 0;
699 	  } else if (Dlength > 0) {
700 	    /* unlikely */
701 	    sprintf(token,"%dD",Dlength);
702 	    tokens = push_token(tokens,token);
703 	    Dlength = 0;
704 	  }
705 	  Ilength++;
706 	} else if (this->cdna == ' ') {
707 	  /* Deletion relative to genome */
708 	  if (Mlength > 0) {
709 	    sprintf(token,"%dM",Mlength);
710 	    tokens = push_token(tokens,token);
711 	    Mlength = 0;
712 	  } else if (Ilength > 0) {
713 	    sprintf(token,"%dI",Ilength);
714 	    tokens = push_token(tokens,token);
715 	    Ilength = 0;
716 	  }
717 	  Dlength++;
718 	} else {
719 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
720 	  exit(9);
721 	}
722 
723       } else {
724 	/* Count even if unknown base */
725 
726 	if (Ilength > 0) {
727 	  sprintf(token,"%dI",Ilength);
728 	  tokens = push_token(tokens,token);
729 	  Ilength = 0;
730 	} else if (Dlength > 0) {
731 	  sprintf(token,"%dD",Dlength);
732 	  tokens = push_token(tokens,token);
733 	  Dlength = 0;
734 	}
735 	Mlength++;
736 
737       }
738     }
739 
740     if (this != NULL) {
741       if (this->cdna != ' ') {
742 	last_querypos = this->querypos;
743       }
744       if (this->genome != ' ') {
745 	last_genomepos = this->genomepos;
746       }
747     }
748   }
749 
750   /* prev = this; */
751   /* exon_queryend = last_querypos + 1; */
752   /* exon_genomeend = last_genomepos + 1; */
753 
754   if (Mlength > 0) {
755     sprintf(token,"%dM",Mlength);
756     tokens = push_token(tokens,token);
757   } else if (Ilength > 0) {
758     sprintf(token,"%dI",Ilength);
759     tokens = push_token(tokens,token);
760   } else if (Dlength > 0) {
761     sprintf(token,"%dD",Dlength);
762     tokens = push_token(tokens,token);
763   }
764 
765 
766   /* Terminal clipping */
767   if (chimera_part == -1) {
768     if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
769       if (last_querypos < querylength_given - 1) {
770 	/* Clip to end */
771 	*hardclip_end = querylength_given - 1 - last_querypos;
772 	sprintf(token,"%dH",*hardclip_end);
773 	tokens = push_token(tokens,token);
774       }
775     } else {
776       if (*hardclip_end > 0) {
777 	/* Clip to hard clip boundary */
778 	sprintf(token,"%dH",*hardclip_end);
779 	tokens = push_token(tokens,token);
780       }
781     }
782   } else {
783     if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
784       sprintf(token,"%dS",querylength_given - 1 - (*hardclip_end) - last_querypos);
785       tokens = push_token(tokens,token);
786     }
787     if (*hardclip_end > 0) {
788       sprintf(token,"%dH",*hardclip_end);
789       tokens = push_token(tokens,token);
790     }
791   }
792 
793   return clean_cigar(tokens,watsonp);
794 }
795 
796 
797 static List_T
compute_cigar_extended(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int sensedir,int chimera_part)798 compute_cigar_extended (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
799 			bool watsonp,
800 #ifdef CONVERT_INTRONS_TO_DELETIONS
801 			int sensedir,
802 #endif
803 			int chimera_part) {
804   List_T tokens = NULL;
805   char token[11];
806   int Elength = 0, Xlength = 0, Ilength = 0, Dlength = 0;
807   bool in_exon = false, deletionp;
808   struct T *ptr, *prev, *this = NULL;
809   int exon_queryend = -1;
810   Chrpos_T exon_genomestart = 0;
811   Chrpos_T exon_genomeend, genome_gap;
812   int query_gap;
813   int last_querypos = -1;
814   Chrpos_T last_genomepos = (Chrpos_T) -1;
815   int i;
816 
817   /* *chimera_hardclip_start = *chimera_hardclip_high = 0; */
818   *intronp = false;
819 
820   ptr = pairs;
821 
822   if (chimera_part == +1) {
823     if (ptr->querypos > *hardclip_start) {
824       if (ptr->querypos > 0) {
825 	/* Clip to beginning */
826 	*hardclip_start = ptr->querypos;
827 	sprintf(token,"%dH",*hardclip_start);
828 	tokens = push_token(tokens,token);
829       }
830     } else {
831       if (*hardclip_start > 0) {
832 	/* Clip to hard clip boundary */
833 	sprintf(token,"%dH",*hardclip_start);
834 	tokens = push_token(tokens,token);
835       }
836     }
837   } else {
838     if (*hardclip_start > 0) {
839       sprintf(token,"%dH",*hardclip_start);
840       tokens = push_token(tokens,token);
841     }
842     if (ptr->querypos > (*hardclip_start)) {
843       sprintf(token,"%dS",ptr->querypos - (*hardclip_start));
844       tokens = push_token(tokens,token);
845     }
846   }
847 
848   this = (T) NULL;
849   for (i = 0; i < npairs; i++) {
850     prev = this;
851     this = ptr++;
852 
853 #if 0
854     /* Cigar_print_tokens(stdout,tokens); */
855     Pair_dump_one(this,true);
856     printf("\n");
857 #endif
858 
859     if (this->gapp) {
860       if (in_exon == true) {
861 	exon_queryend = last_querypos + 1;
862 	exon_genomeend = last_genomepos + 1;
863 #if 0
864 	if (watsonp) {
865 	  intron_start = exon_genomeend + 1;
866 	} else {
867 	  intron_start = exon_genomeend - 1;
868 	}
869 #endif
870 
871 	if (Elength > 0) {
872 	  sprintf(token,"%d=",Elength);
873 	  tokens = push_token(tokens,token);
874 	} else if (Xlength > 0) {
875 	  sprintf(token,"%dX",Xlength);
876 	  tokens = push_token(tokens,token);
877 	} else if (Ilength > 0) {
878 	  sprintf(token,"%dI",Ilength);
879 	  tokens = push_token(tokens,token);
880 	} else if (Dlength > 0) {
881 	  sprintf(token,"%dD",Dlength);
882 	  tokens = push_token(tokens,token);
883 	}
884 
885 	Elength = Xlength = Ilength = Dlength = 0;
886 
887 	in_exon = false;
888       }
889 
890     } else if (this->comp == INTRONGAP_COMP) {
891       /* Do nothing */
892 
893     } else {
894       /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
895 	 SHORTGAP_COMP, or MISMATCH_COMP */
896       if (in_exon == false) {
897 	/* exon_querystart = this->querypos + 1; */
898 	exon_genomestart = this->genomepos + 1;
899 
900 	if (prev != NULL) {
901 	  /* Gap */
902 	  /* abs() gives a large value when flag -m64 is specified */
903 	  /* genome_gap = abs(intron_end - intron_start) + 1; */
904 	  if (watsonp) {
905 	    /* intron_end = exon_genomestart - 1; */
906 	    /* genome_gap = (intron_end - intron_start) + 1; */
907 	    genome_gap = exon_genomestart - exon_genomeend - 1;
908 	  } else {
909 	    /* intron_end = exon_genomestart + 1; */
910 	    /* genome_gap = (intron_start - intron_end) + 1; */
911 	    genome_gap = exon_genomeend - exon_genomestart - 1;
912 	  }
913 
914 	  deletionp = false;
915 #ifdef CONVERT_INTRONS_TO_DELETIONS
916 	  if (sensedir == SENSE_FORWARD) {
917 	    if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
918 		prev->comp == FWD_GCAG_INTRON_COMP ||
919 		prev->comp == FWD_ATAC_INTRON_COMP) {
920 	      sprintf(token,"%uN",genome_gap);
921 	      *intronp = true;
922 	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
923 	      sprintf(token,"%uN",genome_gap);
924 	      *intronp = true;
925 	    } else {
926 	      sprintf(token,"%uD",genome_gap);
927 	      deletionp = true;
928 	    }
929 	  } else if (sensedir == SENSE_ANTI) {
930 	    if (prev->comp == REV_CANONICAL_INTRON_COMP ||
931 		prev->comp == REV_GCAG_INTRON_COMP ||
932 		prev->comp == REV_ATAC_INTRON_COMP) {
933 	      sprintf(token,"%uN",genome_gap);
934 	      *intronp = true;
935 	    } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
936 	      sprintf(token,"%uN",genome_gap);
937 	      *intronp = true;
938 	    } else {
939 	      sprintf(token,"%uD",genome_gap);
940 	      deletionp = true;
941 	    }
942 	  } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
943 	    sprintf(token,"%uN",genome_gap);
944 	    *intronp = true;
945 	  } else {
946 	    sprintf(token,"%uD",genome_gap);
947 	    deletionp = true;
948 	  }
949 #else
950 	  sprintf(token,"%uN",genome_gap);
951 	  *intronp = true;
952 #endif
953 	  tokens = push_token(tokens,token);
954 
955 	  /* Check for dual gap.  Doesn't work for hard clipping. */
956 	  /* assert(exon_queryend >= 0); */
957 
958 	  query_gap = this->querypos - exon_queryend;
959 	  assert(query_gap >= 0);
960 	  if (query_gap > 0) {
961 	    if (deletionp == true && sam_insert_0M_p == true) {
962 	      /* Put zero matches between deletion and insertion, since some programs will complain */
963 	      sprintf(token,"0M");
964 	      tokens = push_token(tokens,token);
965 	    }
966 
967 	    sprintf(token,"%uI",query_gap);
968 	    tokens = push_token(tokens,token);
969 	  }
970 	}
971 
972 	in_exon = true;
973       }
974 
975       if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
976 	/* Gap in upper or lower sequence */
977 	if (this->genome == ' ') {
978 	  /* Insertion relative to genome */
979 	  if (Elength > 0) {
980 	    sprintf(token,"%d=",Elength);
981 	    tokens = push_token(tokens,token);
982 	    Elength = 0;
983 	  } else if (Xlength > 0) {
984 	    sprintf(token,"%dX",Xlength);
985 	    tokens = push_token(tokens,token);
986 	    Xlength = 0;
987 	  } else if (Dlength > 0) {
988 	    /* unlikely */
989 	    sprintf(token,"%dD",Dlength);
990 	    tokens = push_token(tokens,token);
991 	    Dlength = 0;
992 	  }
993 	  Ilength++;
994 	} else if (this->cdna == ' ') {
995 	  /* Deletion relative to genome */
996 	  if (Elength > 0) {
997 	    sprintf(token,"%d=",Elength);
998 	    tokens = push_token(tokens,token);
999 	    Elength = 0;
1000 	  } else if (Xlength > 0) {
1001 	    sprintf(token,"%dX",Xlength);
1002 	    tokens = push_token(tokens,token);
1003 	    Xlength = 0;
1004 	  } else if (Ilength > 0) {
1005 	    sprintf(token,"%dI",Ilength);
1006 	    tokens = push_token(tokens,token);
1007 	    Ilength = 0;
1008 	  }
1009 	  Dlength++;
1010 	} else {
1011 	  fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
1012 	  exit(9);
1013 	}
1014 
1015       } else {
1016 	/* Count even if unknown base */
1017 
1018 	if (Ilength > 0) {
1019 	  sprintf(token,"%dI",Ilength);
1020 	  tokens = push_token(tokens,token);
1021 	  Ilength = 0;
1022 	} else if (Dlength > 0) {
1023 	  sprintf(token,"%dD",Dlength);
1024 	  tokens = push_token(tokens,token);
1025 	  Dlength = 0;
1026 	}
1027 
1028 	if (prev == NULL || prev->gapp || prev->comp == INDEL_COMP || prev->comp == SHORTGAP_COMP) {
1029 	  if (this->cdna == this->genome) {
1030 	    Elength++;
1031 	  } else {
1032 	    Xlength++;
1033 	  }
1034 
1035 	} else if (prev->cdna == prev->genome) {
1036 	  if (this->cdna == this->genome) {
1037 	    Elength++;
1038 	  } else {
1039 	    if (Elength > 0) {
1040 	      sprintf(token,"%d=",Elength);
1041 	      tokens = push_token(tokens,token);
1042 	      Elength = 0;
1043 	    }
1044 	    Xlength++;
1045 	  }
1046 
1047 	} else {
1048 	  if (this->cdna != this->genome) {
1049 	    Xlength++;
1050 	  } else {
1051 	    if (Xlength > 0) {
1052 	      sprintf(token,"%dX",Xlength);
1053 	      tokens = push_token(tokens,token);
1054 	      Xlength = 0;
1055 	    }
1056 	    Elength++;
1057 	  }
1058 	}
1059       }
1060     }
1061 
1062     if (this != NULL) {
1063       if (this->cdna != ' ') {
1064 	last_querypos = this->querypos;
1065       }
1066       if (this->genome != ' ') {
1067 	last_genomepos = this->genomepos;
1068       }
1069     }
1070   }
1071 
1072   /* prev = this; */
1073   /* exon_queryend = last_querypos + 1; */
1074   /* exon_genomeend = last_genomepos + 1; */
1075 
1076   if (Elength > 0) {
1077     sprintf(token,"%d=",Elength);
1078     tokens = push_token(tokens,token);
1079   } else if (Xlength > 0) {
1080     sprintf(token,"%dX",Xlength);
1081     tokens = push_token(tokens,token);
1082   } else if (Ilength > 0) {
1083     sprintf(token,"%dI",Ilength);
1084     tokens = push_token(tokens,token);
1085   } else if (Dlength > 0) {
1086     sprintf(token,"%dD",Dlength);
1087     tokens = push_token(tokens,token);
1088   }
1089 
1090 
1091   /* Terminal clipping */
1092   if (chimera_part == -1) {
1093     if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
1094       if (last_querypos < querylength_given - 1) {
1095 	/* Clip to end */
1096 	*hardclip_end = querylength_given - 1 - last_querypos;
1097 	sprintf(token,"%dH",*hardclip_end);
1098 	tokens = push_token(tokens,token);
1099       }
1100     } else {
1101       if (*hardclip_end > 0) {
1102 	/* Clip to hard clip boundary */
1103 	sprintf(token,"%dH",*hardclip_end);
1104 	tokens = push_token(tokens,token);
1105       }
1106     }
1107   } else {
1108     if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
1109       sprintf(token,"%dS",querylength_given - 1 - (*hardclip_end) - last_querypos);
1110       tokens = push_token(tokens,token);
1111     }
1112     if (*hardclip_end > 0) {
1113       sprintf(token,"%dH",*hardclip_end);
1114       tokens = push_token(tokens,token);
1115     }
1116   }
1117 
1118   return clean_cigar(tokens,watsonp);
1119 }
1120 
1121 
1122 static List_T
compute_cigar(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int chimera_part)1123 compute_cigar (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
1124 	       bool watsonp, int chimera_part) {
1125   if (cigar_extended_p == true) {
1126     return compute_cigar_extended(&(*intronp),&(*hardclip_start),&(*hardclip_end),pairs,npairs,querylength_given,
1127 				  watsonp,chimera_part);
1128   } else {
1129     return compute_cigar_standard(&(*intronp),&(*hardclip_start),&(*hardclip_end),pairs,npairs,querylength_given,
1130 				  watsonp,chimera_part);
1131   }
1132 }
1133 
1134 
1135 typedef enum {IN_MATCHES, IN_MISMATCHES, IN_DELETION} MD_state_T;
1136 
1137 static char complCode[128] = COMPLEMENT_LC;
1138 
1139 static List_T
compute_md_string(int * nmismatches_refdiff,int * nmismatches_bothdiff,int * nindels,struct T * pairs,int npairs,bool watsonp,List_T cigar_tokens)1140 compute_md_string (int *nmismatches_refdiff, int *nmismatches_bothdiff, int *nindels,
1141 		   struct T *pairs, int npairs, bool watsonp, List_T cigar_tokens) {
1142   List_T md_tokens = NULL, p;
1143   char *cigar_token, token[11], *first_token, type;
1144   T this;
1145   int nmatches = 0, length;
1146   MD_state_T state = IN_MISMATCHES;
1147   int i, k = 0;
1148 
1149   *nmismatches_refdiff = *nmismatches_bothdiff = *nindels = 0;
1150 
1151   debug4(Pair_dump_array(pairs,npairs,true));
1152   debug4(printf("watsonp %d\n",watsonp));
1153 
1154   if (watsonp == true) {
1155     for (p = cigar_tokens; p != NULL; p = List_next(p)) {
1156       cigar_token = (char *) List_head(p);
1157       debug4(printf("token is %s\n",cigar_token));
1158       type = cigar_token[strlen(cigar_token)-1];
1159       length = atoi(cigar_token);
1160 
1161       if (type == 'H') {
1162 	/* k += length; */
1163 
1164       } else if (type == 'S') {
1165 	/* k += length; */
1166 
1167       } else if (type == 'M' || type == 'X' || type == '=') {
1168 	for (i = 0; i < length; i++, k++) {
1169 	  this = &(pairs[k]);
1170 	  debug4(printf("M %d/%d comp %c\n",i,length,this->comp));
1171 	  if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
1172 	    nmatches++;
1173 	    state = IN_MATCHES;
1174 
1175 	  } else if (this->comp == MISMATCH_COMP) {
1176 	    if (state == IN_MATCHES) {
1177 	      sprintf(token,"%d",nmatches);
1178 	      md_tokens = push_token(md_tokens,token);
1179 	      nmatches = 0;
1180 	    } else if (state == IN_DELETION) {
1181 	      md_tokens = push_token(md_tokens,"0");
1182 	    }
1183 	    state = IN_MISMATCHES;
1184 
1185 	    *nmismatches_refdiff += 1;
1186 	    if (md_lowercase_variant_p && this->cdna == this->genomealt) {
1187 	      /* A mismatch against the reference only => alternate variant */
1188 	      sprintf(token,"%c",tolower(this->genome));
1189 	    } else {
1190 	      /* A true mismatch against both variants */
1191 	      *nmismatches_bothdiff += 1;
1192 	      sprintf(token,"%c",this->genome);
1193 	    }
1194 	    md_tokens = push_token(md_tokens,token);
1195 
1196 	  } else {
1197 	    fprintf(stderr,"Unexpected comp '%c'\n",this->comp);
1198 	    abort();
1199 	  }
1200 	}
1201 
1202       } else if (type == 'I') {
1203 	while (k < npairs && pairs[k].comp == INDEL_COMP && pairs[k].genome == ' ') {
1204 	  *nindels += 1;
1205 	  k++;
1206 	}
1207 	state = IN_MATCHES;
1208 
1209       } else if (type == 'N') {
1210 	while (k < npairs && pairs[k].gapp == true) {
1211 	  k++;
1212 	}
1213 
1214       } else if (type == 'D') {
1215 	if (state == IN_MATCHES) {
1216 	  if (nmatches > 0) {
1217 	    sprintf(token,"%d",nmatches);
1218 	    md_tokens = push_token(md_tokens,token);
1219 	    nmatches = 0;
1220 	  }
1221 	}
1222 
1223 	if (state != IN_DELETION) {
1224 	  md_tokens = push_token(md_tokens,"^");
1225 	}
1226 	for (i = 0; i < length; i++, k++) {
1227 	  this = &(pairs[k]);
1228 	  sprintf(token,"%c",this->genome);
1229 	  md_tokens = push_token(md_tokens,token);
1230 	  *nindels += 1;
1231 	}
1232 
1233 	state = IN_DELETION;
1234 
1235       } else {
1236 	fprintf(stderr,"Don't recognize type %c\n",type);
1237 	abort();
1238       }
1239     }
1240 
1241     if (nmatches > 0) {
1242       sprintf(token,"%d",nmatches);
1243       md_tokens = push_token(md_tokens,token);
1244     }
1245 
1246     md_tokens = List_reverse(md_tokens);
1247 
1248   } else {
1249     cigar_tokens = List_reverse(cigar_tokens);
1250     for (p = cigar_tokens; p != NULL; p = List_next(p)) {
1251       cigar_token = (char *) List_head(p);
1252       debug4(printf("token is %s\n",cigar_token));
1253       type = cigar_token[strlen(cigar_token)-1];
1254       length = atoi(cigar_token);
1255 
1256       if (type == 'H') {
1257 	/* k += length; */
1258 
1259       } else if (type == 'S') {
1260 	/* k += length; */
1261 
1262       } else if (type == 'M' || type == 'X' || type == '=') {
1263 	if (state == IN_DELETION) {
1264 	  md_tokens = push_token(md_tokens,"^");
1265 	}
1266 
1267 	for (i = 0; i < length; i++, k++) {
1268 	  this = &(pairs[k]);
1269 	  debug4(printf("M %d/%d comp %c\n",i,length,this->comp));
1270 	  if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
1271 	    nmatches++;
1272 	    state = IN_MATCHES;
1273 
1274 	  } else if (this->comp == MISMATCH_COMP) {
1275 	    if (state == IN_MATCHES) {
1276 	      sprintf(token,"%d",nmatches);
1277 	      md_tokens = push_token(md_tokens,token);
1278 	      nmatches = 0;
1279 	    }
1280 	    state = IN_MISMATCHES;
1281 
1282 	    *nmismatches_refdiff += 1;
1283 
1284 	    if (md_lowercase_variant_p && this->cdna == this->genomealt) {
1285 	      /* A mismatch against the reference only => alternate variant */
1286 	      sprintf(token,"%c",tolower(complCode[(int) this->genome]));
1287 	    } else {
1288 	      *nmismatches_bothdiff += 1;
1289 	      sprintf(token,"%c",complCode[(int) this->genome]);
1290 	    }
1291 	    md_tokens = push_token(md_tokens,token);
1292 
1293 
1294 	  } else {
1295 	    fprintf(stderr,"Unexpected comp '%c'\n",this->comp);
1296 	    abort();
1297 	  }
1298 	}
1299 
1300       } else if (type == 'I') {
1301 	if (state == IN_DELETION) {
1302 	  md_tokens = push_token(md_tokens,"^");
1303 	}
1304 
1305 	while (k < npairs && pairs[k].comp == INDEL_COMP && pairs[k].genome == ' ') {
1306 	  *nindels += 1;
1307 	  k++;
1308 	}
1309 	state = IN_MATCHES;
1310 
1311       } else if (type == 'N') {
1312 #if 0
1313 	/* Ignore deletion adjacent to intron, to avoid double ^^ */
1314 	if (state == IN_DELETION) {
1315 	  md_tokens = push_token(md_tokens,"^");
1316 	}
1317 #endif
1318 
1319 	while (k < npairs && pairs[k].gapp == true) {
1320 	  k++;
1321 	}
1322 
1323       } else if (type == 'D') {
1324 	if (state == IN_MATCHES) {
1325 	  if (nmatches > 0) {
1326 	    sprintf(token,"%d",nmatches);
1327 	    md_tokens = push_token(md_tokens,token);
1328 	    nmatches = 0;
1329 	  }
1330 	} else if (state == IN_MISMATCHES) {
1331 	  md_tokens = push_token(md_tokens,"0");
1332 	}
1333 
1334 	for (i = 0; i < length; i++, k++) {
1335 	  this = &(pairs[k]);
1336 	  sprintf(token,"%c",complCode[(int) this->genome]);
1337 	  md_tokens = push_token(md_tokens,token);
1338 	  *nindels += 1;
1339 	}
1340 	state = IN_DELETION;
1341 
1342       } else {
1343 	fprintf(stderr,"Don't recognize type %c\n",type);
1344 	abort();
1345       }
1346     }
1347 
1348     if (nmatches > 0) {
1349       sprintf(token,"%d",nmatches);
1350       md_tokens = push_token(md_tokens,token);
1351     }
1352 
1353     /* Restore cigar_tokens */
1354     cigar_tokens = List_reverse(cigar_tokens);
1355   }
1356 
1357   assert(k == npairs);
1358 
1359   /* Insert initial 0 token if necessary */
1360   if (md_tokens != NULL) {
1361     first_token = (char *) List_head(md_tokens);
1362     if (!isdigit(first_token[0])) {
1363       md_tokens = push_token(md_tokens,"0");
1364     }
1365   }
1366 
1367   return md_tokens;
1368 }
1369 
1370 
1371 /* Modeled after Shortread_print_chopped */
1372 static void
print_chopped(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)1373 print_chopped (Filestring_T fp, char *contents, int querylength,
1374 	       int hardclip_start, int hardclip_end) {
1375   int i;
1376 
1377   for (i = hardclip_start; i < querylength - hardclip_end; i++) {
1378     PUTC(contents[i],fp);
1379   }
1380   return;
1381 }
1382 
1383 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
1384 static void
print_chopped_revcomp(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)1385 print_chopped_revcomp (Filestring_T fp, char *contents, int querylength,
1386 		       int hardclip_start, int hardclip_end) {
1387   int i;
1388 
1389   for (i = querylength - 1 - hardclip_end; i >= hardclip_start; --i) {
1390     PUTC(complCode[(int) contents[i]],fp);
1391   }
1392   return;
1393 }
1394 
1395 
1396 static void
print_chopped_end(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)1397 print_chopped_end (Filestring_T fp, char *contents, int querylength,
1398 		   int hardclip_start, int hardclip_end) {
1399   int i;
1400 
1401   for (i = 0; i < hardclip_start; i++) {
1402     PUTC(contents[i],fp);
1403   }
1404 
1405   /* No separator */
1406 
1407   for (i = querylength - hardclip_end; i < querylength; i++) {
1408     PUTC(contents[i],fp);
1409   }
1410 
1411   return;
1412 }
1413 
1414 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
1415 static void
print_chopped_end_revcomp(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)1416 print_chopped_end_revcomp (Filestring_T fp, char *contents, int querylength,
1417 			   int hardclip_start, int hardclip_end) {
1418   int i;
1419 
1420   for (i = querylength - 1; i >= querylength - hardclip_end; --i) {
1421     PUTC(complCode[(int) contents[i]],fp);
1422   }
1423 
1424   /* No separator */
1425 
1426   for (i = hardclip_start - 1; i >= 0; --i) {
1427     PUTC(complCode[(int) contents[i]],fp);
1428   }
1429 
1430   return;
1431 }
1432 
1433 
1434 static void
print_chopped_end_quality(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end)1435 print_chopped_end_quality (Filestring_T fp, char *quality, int querylength,
1436 			   int hardclip_start, int hardclip_end) {
1437   int i;
1438 
1439   if (hardclip_start > 0) {
1440     for (i = 0; i < hardclip_start; i++) {
1441       PUTC(quality[i],fp);
1442     }
1443     return;
1444 
1445   } else {
1446     for (i = querylength - hardclip_end; i < querylength; i++) {
1447       PUTC(quality[i],fp);
1448     }
1449     return;
1450   }
1451 }
1452 
1453 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
1454 static void
print_chopped_end_quality_reverse(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end)1455 print_chopped_end_quality_reverse (Filestring_T fp, char *quality, int querylength,
1456 				   int hardclip_start, int hardclip_end) {
1457   int i;
1458 
1459   if (hardclip_start > 0) {
1460     for (i = hardclip_start - 1; i >= 0; --i) {
1461       PUTC(quality[i],fp);
1462     }
1463     return;
1464 
1465   } else {
1466     for (i = querylength - 1; i >= querylength - hardclip_end; --i) {
1467       PUTC(quality[i],fp);
1468     }
1469     return;
1470   }
1471 }
1472 
1473 
1474 
1475 /* Modeled after Shortread_print_quality */
1476 static void
print_quality(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end,int shift)1477 print_quality (Filestring_T fp, char *quality, int querylength,
1478 	       int hardclip_start, int hardclip_end, int shift) {
1479   int i;
1480   int c;
1481 
1482   if (quality == NULL) {
1483     PUTC('*',fp);
1484   } else {
1485     for (i = hardclip_start; i < querylength - hardclip_end; i++) {
1486       if ((c = quality[i] + shift) <= 32) {
1487 	fprintf(stderr,"Warning: With a quality-print-shift of %d, QC score %c becomes non-printable.  May need to specify --quality-protocol or --quality-print-shift\n",
1488 		shift,quality[i]);
1489 	abort();
1490       } else {
1491 	PUTC(c,fp);
1492       }
1493     }
1494   }
1495   return;
1496 }
1497 
1498 
1499 static void
print_quality_revcomp(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end,int shift)1500 print_quality_revcomp (Filestring_T fp, char *quality, int querylength,
1501 		       int hardclip_start, int hardclip_end, int shift) {
1502   int i;
1503   int c;
1504 
1505   if (quality == NULL) {
1506     PUTC('*',fp);
1507   } else {
1508     for (i = querylength - 1 - hardclip_end; i >= hardclip_start; --i) {
1509       if ((c = quality[i] + shift) <= 32) {
1510 	fprintf(stderr,"Warning: With a quality-print-shift of %d, QC score %c becomes non-printable.  May need to specify --quality-protocol or --quality-print-shift\n",
1511 		shift,quality[i]);
1512 	abort();
1513       } else {
1514 	PUTC(c,fp);
1515       }
1516     }
1517   }
1518 
1519   return;
1520 }
1521 
1522 /* Derived from print_gff3_cdna_match */
1523 /* Assumes pairarray has been hard clipped already */
1524 static void
print_overlap_sam_line(Filestring_T fp,char * abbrev,char * acc1,char * acc2,char * chrstring,bool watsonp,int sensedir,List_T cigar_tokens,List_T md_tokens,int nmismatches_refdiff,int nmismatches_bothdiff,int nindels,bool intronp,char * queryseq_ptr,char * quality_string,int hardclip_start,int hardclip_end,int querylength,int quality_shift,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,unsigned int flag,Chrpos_T chrpos,Chrpos_T chrlength,Shortread_T queryseq,int pair_mapq_score,int end_mapq_score,Stage3pair_T stage3pair,Stage3end_T stage3hit,bool first_read_p,char * sam_read_group_id)1525 print_overlap_sam_line (Filestring_T fp, char *abbrev, char *acc1, char *acc2, char *chrstring,
1526 			bool watsonp, int sensedir, List_T cigar_tokens, List_T md_tokens,
1527 			int nmismatches_refdiff, int nmismatches_bothdiff, int nindels,
1528 			bool intronp, char *queryseq_ptr, char *quality_string,
1529 			int hardclip_start, int hardclip_end,
1530 			int querylength, int quality_shift,
1531 			int pathnum, int npaths_primary, int npaths_altloc, int absmq_score, int second_absmq, unsigned int flag,
1532 			Chrpos_T chrpos, Chrpos_T chrlength,
1533 
1534 			Shortread_T queryseq, int pair_mapq_score, int end_mapq_score,
1535 			Stage3pair_T stage3pair, Stage3end_T stage3hit,	bool first_read_p,
1536 			char *sam_read_group_id) {
1537   bool invertp = false;
1538 
1539   /* Should already be checked when Stage3_T or Stage3end_T object was created */
1540   if (cigar_action == CIGAR_ACTION_IGNORE) {
1541     /* Don't check */
1542   } else if (cigar_length(cigar_tokens) + hardclip_start + hardclip_end == querylength) {
1543     /* Okay */
1544   } else if (cigar_action == CIGAR_ACTION_WARNING) {
1545     fprintf(stderr,"Warning: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
1546 	    acc1,cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
1547   } else if (cigar_action == CIGAR_ACTION_NOPRINT) {
1548     fprintf(stderr,"Warning: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
1549 	    acc1,cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
1550     return;
1551   } else {
1552     /* CIGAR_ACTION_ABORT */
1553     fprintf(stderr,"Error: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
1554 	    acc1,cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
1555     abort();
1556   }
1557 
1558   /* 1. QNAME or Accession */
1559   if (acc2 == NULL) {
1560     FPRINTF(fp,"%s\t",acc1);
1561   } else {
1562     FPRINTF(fp,"%s,%s\t",acc1,acc2);
1563   }
1564 
1565   /* 2. Flags */
1566   FPRINTF(fp,"%u\t",flag);
1567 
1568   /* 3. RNAME or Chrstring */
1569   /* 4. POS or Chrlow */
1570   /* Taken from GMAP part of SAM_chromosomal_pos */
1571   if (chrpos > chrlength) {
1572     FPRINTF(fp,"%s\t%u\t",chrstring,chrpos - chrlength /*+ 1*/);
1573   } else {
1574     FPRINTF(fp,"%s\t%u\t",chrstring,chrpos /*+ 1*/);
1575   }
1576 
1577   /* 5. MAPQ or Mapping quality */
1578   FPRINTF(fp,"%d\t",pair_mapq_score);
1579 
1580   /* 6. CIGAR */
1581   print_tokens(fp,cigar_tokens);
1582 
1583   /* 7. MRNM: Mate chr */
1584   /* 8. MPOS: Mate chrpos */
1585   FPRINTF(fp,"\t*\t0");		/* Because sequence and mate are merged */
1586 
1587   /* 9. ISIZE: Insert size */
1588   FPRINTF(fp,"\t0");		/* Because sequence and mate are merged */
1589 
1590   /* 10. SEQ: queryseq and 11. QUAL: quality_scores */
1591   FPRINTF(fp,"\t");
1592   if (watsonp == true) {
1593     print_chopped(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
1594     FPRINTF(fp,"\t");
1595     print_quality(fp,quality_string,querylength,hardclip_start,hardclip_end,
1596 		  quality_shift);
1597   } else {
1598     print_chopped_revcomp(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
1599     FPRINTF(fp,"\t");
1600     print_quality_revcomp(fp,quality_string,querylength,hardclip_start,hardclip_end,
1601 			  quality_shift);
1602   }
1603 
1604 
1605   /* 12. TAGS: RG */
1606   if (sam_read_group_id != NULL) {
1607     FPRINTF(fp,"\tRG:Z:%s",sam_read_group_id);
1608   }
1609 
1610   /* 12. TAGS: XH and XI */
1611   if (hardclip_start > 0 || hardclip_end > 0) {
1612     FPRINTF(fp,"\tXH:Z:");
1613     if (watsonp == true) {
1614       print_chopped_end(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
1615     } else {
1616       print_chopped_end_revcomp(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
1617     }
1618 
1619     if (quality_string != NULL) {
1620       FPRINTF(fp,"\tXI:Z:");
1621       if (watsonp == true) {
1622 	print_chopped_end_quality(fp,quality_string,querylength,hardclip_start,hardclip_end);
1623       } else {
1624 	print_chopped_end_quality_reverse(fp,quality_string,querylength,hardclip_start,hardclip_end);
1625       }
1626     }
1627   }
1628 
1629   if (queryseq != NULL) {
1630     /* 12. TAGS: XB */
1631     Shortread_print_barcode(fp,queryseq);
1632 
1633     /* 12. TAGS: XP.  Logically should be last in reconstructing a read. */
1634     Shortread_print_chop(fp,queryseq,invertp);
1635   }
1636 
1637   /* 12. TAGS: MD string */
1638   FPRINTF(fp,"\tMD:Z:");
1639   print_tokens(fp,md_tokens);
1640 
1641   /* 12. TAGS: NH */
1642   FPRINTF(fp,"\tNH:i:%d",npaths_primary + npaths_altloc);
1643 
1644   /* 12. TAGS: HI */
1645   FPRINTF(fp,"\tHI:i:%d",pathnum);
1646 
1647   /* 12. TAGS: NM */
1648   FPRINTF(fp,"\tNM:i:%d",nmismatches_refdiff + nindels);
1649 
1650   if (snps_p) {
1651     /* 12. TAGS: XW and XV */
1652     FPRINTF(fp,"\tXW:i:%d",nmismatches_bothdiff);
1653     FPRINTF(fp,"\tXV:i:%d",nmismatches_refdiff - nmismatches_bothdiff);
1654   }
1655 
1656 
1657   /* 12. TAGS: SM */
1658   FPRINTF(fp,"\tSM:i:%d",end_mapq_score);
1659 
1660   /* 12. TAGS: XQ */
1661   FPRINTF(fp,"\tXQ:i:%d",absmq_score);
1662 
1663   /* 12. TAGS: X2 */
1664   FPRINTF(fp,"\tX2:i:%d",second_absmq);
1665 
1666   /* 12. TAGS: XO */
1667   FPRINTF(fp,"\tXO:Z:%s",abbrev);
1668 
1669   /* 12. TAGS: XS */
1670   if (splicingp == false) {
1671     /* Do not print XS field */
1672 
1673   } else if (sensedir == SENSE_FORWARD) {
1674     if (watsonp == true) {
1675       FPRINTF(fp,"\tXS:A:+");
1676     } else {
1677       FPRINTF(fp,"\tXS:A:-");
1678     }
1679 
1680   } else if (sensedir == SENSE_ANTI) {
1681     if (watsonp == true) {
1682       FPRINTF(fp,"\tXS:A:-");
1683     } else {
1684       FPRINTF(fp,"\tXS:A:+");
1685     }
1686 
1687   } else if (intronp == false) {
1688     /* Skip.  No intron in this end and mate is not revealing. */
1689 
1690 #if 0
1691   } else if (force_xs_direction_p == true) {
1692     /* Don't print XS field for SENSE_NULL */
1693     /* Could not determine sense, so just report arbitrarily as + */
1694     /* This option provided for users of Cufflinks, which cannot handle XS:A:? */
1695     FPRINTF(fp,"\tXS:A:+");
1696 
1697   } else {
1698     /* Non-canonical.  Don't report. */
1699     FPRINTF(fp,"\tXS:A:?");
1700 #endif
1701   }
1702 
1703 
1704   /* 12. TAGS: XX, XY */
1705   /* TODO: Merge transcripts */
1706 
1707 #if 0
1708   /* TAGS: XZ */
1709   if (Stage3end_transcripts_other(stage3hit) != NULL) {
1710     FPRINTF(fp,"\tXZ:Z:");
1711     Transcript_print_info(fp,Stage3end_transcripts_other(stage3hit),transcript_iit,invertp);
1712   }
1713 #endif
1714 
1715   /* 12. TAGS: XC (circular).  Not currently handled for GMAP */
1716 
1717   /* 12. TAGS: XG.  Not currently handled for GMAP */
1718   Method_samprint(fp,Stage3end_method(stage3hit));
1719 
1720 
1721   FPRINTF(fp,"\n");
1722 
1723   return;
1724 }
1725 
1726 
1727 /* Modified from Pair_print_sam */
1728 void
Simplepair_overlap_print_sam(Filestring_T fp,char * abbrev,struct T * pairarray,int npairs,char * acc1,char * acc2,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,char * queryseq_ptr,char * quality_string,int hardclip_low,int hardclip_high,int querylength_given,bool watsonp,int sensedir,int quality_shift,bool first_read_p,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,Chrpos_T chrpos,Chrpos_T chrlength,Shortread_T queryseq,unsigned int flag,int pair_mapq_score,int end_mapq_score,Stage3pair_T stage3pair,Stage3end_T stage3hit,char * sam_read_group_id)1729 Simplepair_overlap_print_sam (Filestring_T fp, char *abbrev, struct T *pairarray, int npairs,
1730 			      char *acc1, char *acc2, Chrnum_T chrnum, Univ_IIT_T chromosome_iit,
1731 			      char *queryseq_ptr, char *quality_string,
1732 			      int hardclip_low, int hardclip_high, int querylength_given,
1733 			      bool watsonp, int sensedir,
1734 			      int quality_shift, bool first_read_p, int pathnum, int npaths_primary, int npaths_altloc,
1735 			      int absmq_score, int second_absmq, Chrpos_T chrpos, Chrpos_T chrlength,
1736 
1737 			      Shortread_T queryseq, unsigned int flag,
1738 			      int pair_mapq_score, int end_mapq_score,
1739 			      Stage3pair_T stage3pair, Stage3end_T stage3hit,
1740 
1741 			      char *sam_read_group_id) {
1742   char *chrstring = NULL;
1743   int chimera_part = 0;
1744 #if 0
1745   char *mate_chrstring, *mate_chrstring_alloc = NULL;
1746 #elif defined(GSNAP)
1747 #else
1748   unsigned int flag;
1749 #endif
1750 
1751   List_T cigar_tokens, md_tokens = NULL;
1752   int nmismatches_refdiff, nmismatches_bothdiff, nindels;
1753   bool intronp;
1754   int hardclip_start, hardclip_end;
1755   /* int hardclip_start_zero = 0, hardclip_end_zero = 0; */
1756   struct T *clipped_pairarray;
1757   int clipped_npairs;
1758   bool cigar_tokens_alloc;
1759 
1760 
1761   if (chrnum == 0) {
1762     /* chrstring = Sequence_accession(usersegment); */
1763     fprintf(stderr,"Error in Simplepair_overlap_print_sam: chrnum == 0\n");
1764     exit(9);
1765   } else {
1766     chrstring = Chrnum_to_string(chrnum,chromosome_iit);
1767   }
1768 
1769 #if 0
1770   /* Previously this was code for GSNAP.  However for GSNAP, this is called only when the sequence and mate are merged */
1771   if (mate_chrpos_low == 0U) {
1772     mate_chrstring = "*";
1773   } else if (mate_chrnum == 0) {
1774     abort();
1775   } else if (/* chrpos > 0U && chrnum > 0 && */ mate_chrnum == chrnum) {
1776     mate_chrstring = "=";
1777   } else {
1778     mate_chrstring = mate_chrstring_alloc = Chrnum_to_string(mate_chrnum,chromosome_iit);
1779   }
1780 #elif defined(GSNAP)
1781   /* flag is given as a parameter */
1782 #else
1783   flag = compute_sam_flag_nomate(npaths_primary + npaths_altloc,first_read_p,watsonp,sam_paired_p);
1784 #endif
1785 
1786   debug4(printf("Entered SAM_print_pairs with watsonp %d, first_read_p %d, hardclip_low %d, and hardclip_high %d\n",
1787 		watsonp,first_read_p,hardclip_low,hardclip_high));
1788 
1789   if (watsonp == true) {
1790     hardclip_start = hardclip_low;
1791     hardclip_end = hardclip_high;
1792   } else {
1793     hardclip_start = hardclip_high;
1794     hardclip_end = hardclip_low;
1795   }
1796   debug4(printf("hardclip_start %d, hardclip_end %d\n",hardclip_start,hardclip_end));
1797 
1798 
1799   /* Because merged_overlap_p is true */
1800   /* clipped_pairarray = pairarray; */
1801   /* clipped_npairs = npairs; */
1802   clipped_pairarray = hardclip_pairarray(&clipped_npairs,hardclip_start,hardclip_end,
1803 					 pairarray,npairs,querylength_given);
1804   cigar_tokens = compute_cigar(&intronp,&hardclip_start,&hardclip_end,clipped_pairarray,clipped_npairs,querylength_given,
1805 			       watsonp,chimera_part);
1806   cigar_tokens_alloc = true;
1807 
1808 
1809   /* Cigar updates hardclip5 and hardclip3 for chimeras */
1810   md_tokens = compute_md_string(&nmismatches_refdiff,&nmismatches_bothdiff,&nindels,
1811 				clipped_pairarray,clipped_npairs,watsonp,cigar_tokens);
1812 
1813   print_overlap_sam_line(fp,abbrev,acc1,acc2,chrstring,
1814 			 watsonp,sensedir,cigar_tokens,md_tokens,
1815 			 nmismatches_refdiff,nmismatches_bothdiff,nindels,
1816 			 intronp,queryseq_ptr,quality_string,hardclip_start,hardclip_end,
1817 			 querylength_given,quality_shift,pathnum,npaths_primary,npaths_altloc,
1818 			 absmq_score,second_absmq,flag,chrpos,chrlength,
1819 			 queryseq,pair_mapq_score,end_mapq_score,stage3pair,stage3hit,first_read_p,
1820 			 sam_read_group_id);
1821 
1822   /* Print procedures free the character strings */
1823   tokens_free(&md_tokens);
1824   if (cigar_tokens_alloc == true) {
1825     tokens_free(&cigar_tokens);
1826   }
1827 
1828   if (chrnum != 0) {
1829     FREE(chrstring);
1830   }
1831 
1832   return;
1833 }
1834 
1835 
1836 void
Simplepair_setup(bool splicingp_in,Univ_IIT_T transcript_iit_in,bool sam_insert_0M_p_in,bool md_lowercase_variant_p_in,bool snps_p_in,bool cigar_extended_p_in,Cigar_action_T cigar_action_in)1837 Simplepair_setup (bool splicingp_in,
1838 		  Univ_IIT_T transcript_iit_in, bool sam_insert_0M_p_in,
1839 		  bool md_lowercase_variant_p_in, bool snps_p_in,
1840 		  bool cigar_extended_p_in, Cigar_action_T cigar_action_in) {
1841 
1842   splicingp = splicingp_in;
1843   transcript_iit = transcript_iit_in;
1844 
1845   sam_insert_0M_p = sam_insert_0M_p_in;
1846   md_lowercase_variant_p = md_lowercase_variant_p_in;
1847   snps_p = snps_p_in;
1848 
1849   cigar_extended_p = cigar_extended_p_in;
1850   cigar_action = cigar_action_in;
1851 
1852   return;
1853 }
1854