1 static char rcsid[] = "$Id: simplepair.c 222730 2020-05-29 13:10:09Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #include "simplepair.h"
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h> /* For memcpy */
16 #include <math.h> /* For rint(), abs() */
17 #include <ctype.h> /* For toupper */
18
19 #include "assert.h"
20 #include "except.h"
21 #include "mem.h"
22 #include "comp.h"
23 #include "complement.h"
24 #include "transcript.h"
25 #include "method.h"
26
27
28
29 #ifdef DEBUG1
30 #define debug1(x) x
31 #else
32 #define debug1(x)
33 #endif
34
35 #ifdef DEBUG4
36 #define debug4(x) x
37 #else
38 #define debug4(x)
39 #endif
40
41 /* hardclip_pairarray */
42 #ifdef DEBUG10
43 #define debug10(x) x
44 #else
45 #define debug10(x)
46 #endif
47
48
49 static bool splicingp;
50 static Univ_IIT_T transcript_iit;
51
52 static bool sam_insert_0M_p = false;
53 static bool md_lowercase_variant_p;
54 static bool snps_p;
55
56 static bool cigar_extended_p;
57 static Cigar_action_T cigar_action;
58
59
60
61 #define T Simplepair_T
62
63
64 Chrpos_T
Simplepair_head_genomepos(List_T pairs)65 Simplepair_head_genomepos (List_T pairs) {
66 return ((T) List_head(pairs))->genomepos;
67 }
68
69 Chrpos_T
Simplepair_last_genomepos(List_T pairs)70 Simplepair_last_genomepos (List_T pairs) {
71 return ((T) List_last_value(pairs))->genomepos;
72 }
73
74
75 /* For GSNAP, generate for output thread only. For GMAP, pairs needed by worker threads are made in pairpool.c */
76 T
Simplepair_new_out(int querypos,Chrpos_T genomepos,char cdna,char comp,char genome)77 Simplepair_new_out (int querypos, Chrpos_T genomepos, char cdna, char comp, char genome) {
78 T new = (T) MALLOC_OUT(sizeof(*new));
79
80 new->querypos = querypos;
81 new->genomepos = genomepos;
82 new->cdna = cdna;
83 new->comp = comp;
84 new->genome = genome;
85 new->genomealt = genome;
86
87 switch (comp) {
88 case FWD_CANONICAL_INTRON_COMP /*'>'*/:
89 case REV_CANONICAL_INTRON_COMP /*'<'*/:
90 case NONINTRON_COMP /*'='*/:
91 case SHORTGAP_COMP /*'~'*/:
92 case INTRONGAP_COMP /*'.'*/:
93 case FWD_GCAG_INTRON_COMP /*')'*/:
94 case REV_GCAG_INTRON_COMP /*'('*/:
95 case FWD_ATAC_INTRON_COMP /*']'*/:
96 case REV_ATAC_INTRON_COMP /*'['*/:
97 case DUALBREAK_COMP /*'#'*/:
98 new->gapp = true; break;
99 default: new->gapp = false;
100 }
101
102 return new;
103 }
104
105 void
Simplepair_free_out(T * old)106 Simplepair_free_out (T *old) {
107 if (*old) {
108 FREE_OUT(*old);
109 }
110 return;
111 }
112
113
114 static void
dump_one(T this,bool zerobasedp)115 dump_one (T this, bool zerobasedp) {
116
117 debug1(printf("%p ",this));
118
119 if (this->gapp == true) {
120 printf("*** Gap: type: ");
121 switch (this->comp) {
122 case FWD_CANONICAL_INTRON_COMP: printf("> GT-AG"); break;
123 case FWD_GCAG_INTRON_COMP: printf(") GC-AG"); break;
124 case FWD_ATAC_INTRON_COMP: printf("] AT-AC"); break;
125 case REV_ATAC_INTRON_COMP: printf("[ AT-AC"); break;
126 case REV_GCAG_INTRON_COMP: printf("( GC-AG"); break;
127 case REV_CANONICAL_INTRON_COMP: printf("< GT-AG"); break;
128 case SHORTGAP_COMP: printf("~ shortgap"); break;
129 case NONINTRON_COMP: printf("= nonintron"); break;
130 default: printf("? unknown"); break;
131 }
132
133 printf(" ***");
134
135 } else {
136 printf("%d %d %c ",
137 this->querypos + !zerobasedp,this->genomepos + !zerobasedp,this->cdna);
138
139 /* Subtract 1 because dynprogindices start at +1 and -1 */
140 putchar(this->comp);
141 printf(" %c",this->genome);
142 if (this->genomealt != this->genome) {
143 printf(" alt:%c",this->genomealt);
144 }
145 }
146
147 return;
148 }
149
150
151 /* Useful for debugging */
152 void
Simplepair_dump_list(List_T pairs,bool zerobasedp)153 Simplepair_dump_list (List_T pairs, bool zerobasedp) {
154 T this, prev = NULL, old = NULL;
155 List_T p;
156
157 printf("***Start of list***\n");
158 for (p = pairs; p != NULL; p = List_next(p)) {
159 this = List_head(p);
160 dump_one(this,zerobasedp);
161 printf("\n");
162
163 if (this->querypos != -1) {
164 if (old != NULL) {
165 if (old->querypos > prev->querypos) {
166 if (prev->querypos < this->querypos) {
167 fprintf(stderr,"%d %d %d\n",old->querypos,prev->querypos,this->querypos);
168 abort();
169 }
170 } else if (old->querypos < prev->querypos) {
171 if (prev->querypos > this->querypos) {
172 fprintf(stderr,"%d %d %d\n",old->querypos,prev->querypos,this->querypos);
173 abort();
174 }
175 }
176 }
177
178 old = prev;
179 prev = this;
180 }
181
182 }
183 printf("***End of list***\n");
184 return;
185 }
186
187
188 List_T
Simplepair_strip_gaps_at_head(List_T pairs)189 Simplepair_strip_gaps_at_head (List_T pairs) {
190 T pair;
191
192 while (pairs != NULL && (pair = pairs->first) != NULL &&
193 (pair->gapp == true || pair->cdna == ' ' || pair->genome == ' ')) {
194 pairs = List_pop_out(pairs,(void **) &pair);
195 Simplepair_free_out(&pair);
196 }
197
198 return pairs;
199 }
200
201 List_T
Simplepair_strip_gaps_at_tail(List_T pairs)202 Simplepair_strip_gaps_at_tail (List_T pairs) {
203 T pair;
204
205 if (pairs != NULL) {
206 pairs = List_reverse(pairs);
207
208 while (pairs != NULL && (pair = pairs->first) != NULL &&
209 (pair->gapp == true || pair->cdna == ' ' || pair->genome == ' ')) {
210 pairs = List_pop_out(pairs,(void **) &pair);
211 Simplepair_free_out(&pair);
212 }
213
214 pairs = List_reverse(pairs);
215 }
216
217 return pairs;
218 }
219
220
221
222 static struct T *
hardclip_pairarray(int * clipped_npairs,int hardclip_start,int hardclip_end,struct T * pairs,int npairs,int querylength)223 hardclip_pairarray (int *clipped_npairs, int hardclip_start, int hardclip_end,
224 struct T *pairs, int npairs, int querylength) {
225 struct T *clipped_pairs, *ptr;
226 int i, starti;
227
228 debug10(printf("Entered hardclip_pairarray with hardclip_start %d, hardclip_end %d, querylength %d\n",
229 hardclip_start,hardclip_end,querylength));
230 debug10(Simplepair_dump_array(pairs,npairs,true));
231 debug10(printf("Starting with %d pairs\n",npairs));
232
233 i = 0;
234 ptr = pairs;
235 while (i < npairs && ptr->querypos < hardclip_start) {
236 i++;
237 ptr++;
238 }
239 while (i < npairs && (ptr->gapp == true || ptr->cdna == ' ' || ptr->genome == ' ')) {
240 i++;
241 ptr++;
242 }
243
244 if (i >= npairs) {
245 /* hardclip_start passes right end of read, so invalid */
246 debug10(printf("i = %d, so passed end of read\n",i));
247 hardclip_start = 0;
248 } else if (hardclip_start > 0) {
249 hardclip_start = ptr->querypos;
250 }
251
252 starti = i;
253 debug10(printf("starti is %d\n",starti));
254
255 clipped_pairs = ptr;
256
257 while (i < npairs && ptr->querypos < querylength - hardclip_end) {
258 i++;
259 ptr++;
260 }
261
262 i--;
263 ptr--;
264 while (i >= starti && (ptr->gapp == true || ptr->cdna == ' ' || ptr->genome == ' ')) {
265 i--;
266 ptr--;
267 }
268
269 if (i < 0) {
270 /* hardclip_end passes left end of read, so invalid */
271 debug10(printf("i = %d, so passed left end of read\n",i));
272 hardclip_end = 0;
273 } else if (hardclip_end > 0) {
274 hardclip_end = querylength - 1 - ptr->querypos;
275 }
276
277 if (hardclip_start == 0 && hardclip_end == 0) {
278 debug10(printf("Unable to hard clip\n"));
279 *clipped_npairs = npairs;
280 clipped_pairs = pairs;
281 } else {
282 *clipped_npairs = i - starti + 1;
283 }
284
285 debug10(printf("Ending with %d pairs\n",*clipped_npairs));
286 debug10(printf("Exiting hardclip_pairs with hardclip_start %d, hardclip_end %d\n",
287 hardclip_start,hardclip_end));
288
289 return clipped_pairs;
290 }
291
292
293 Chrpos_T
Simplepair_genomicpos_low(int hardclip_low,int hardclip_high,struct T * pairarray,int npairs,int querylength,bool plusp,bool hide_soft_clips_p)294 Simplepair_genomicpos_low (int hardclip_low, int hardclip_high,
295 struct T *pairarray, int npairs, int querylength,
296 bool plusp, bool hide_soft_clips_p) {
297 struct T *clipped_pairs;
298 int clipped_npairs;
299 T lowpair, highpair;
300
301 #if 0
302 if (clipdir >= 0) {
303 if (plusp == true) {
304 if (first_read_p == true) {
305 hardclip_high = hardclip5;
306 hardclip_low = 0;
307 } else {
308 hardclip_high = 0;
309 hardclip_low = hardclip3;
310 }
311 } else {
312 if (first_read_p == true) {
313 hardclip_low = hardclip5;
314 hardclip_high = 0;
315 } else {
316 hardclip_low = 0;
317 hardclip_high = hardclip3;
318 }
319 }
320 } else {
321 if (plusp == true) {
322 if (first_read_p == true) {
323 hardclip_low = hardclip5;
324 hardclip_high = 0;
325 } else {
326 hardclip_low = 0;
327 hardclip_high = hardclip3;
328 }
329 } else {
330 if (first_read_p == true) {
331 hardclip_high = hardclip5;
332 hardclip_low = 0;
333 } else {
334 hardclip_high = 0;
335 hardclip_low = hardclip3;
336 }
337 }
338 }
339 #endif
340
341 if (plusp == true) {
342 clipped_pairs = hardclip_pairarray(&clipped_npairs,hardclip_low,hardclip_high,
343 pairarray,npairs,querylength);
344 lowpair = &(clipped_pairs[0]);
345 highpair = &(clipped_pairs[clipped_npairs-1]);
346 if (hide_soft_clips_p == true) {
347 assert(lowpair->querypos == 0);
348 assert(highpair->querypos == querylength - 1);
349 /* *chrpos_high = highpair->genomepos + 1 - (querylength - 1 - highpair->querypos); */
350 return lowpair->genomepos + 1 - lowpair->querypos;
351 } else {
352 /* *chrpos_high = highpair->genomepos + 1; */
353 return lowpair->genomepos + 1;
354 }
355 } else {
356 /* Swap hardclip_low and hardclip_high */
357 clipped_pairs = hardclip_pairarray(&clipped_npairs,hardclip_high,hardclip_low,
358 pairarray,npairs,querylength);
359 lowpair = &(clipped_pairs[clipped_npairs-1]);
360 highpair = &(clipped_pairs[0]);
361 if (hide_soft_clips_p == true) {
362 assert(lowpair->querypos == querylength - 1);
363 assert(highpair->querypos == 0);
364 /* *chrpos_high = highpair->genomepos + 1 - highpair->querypos; */
365 return lowpair->genomepos + 1 + (querylength - 1 - lowpair->querypos);
366 } else {
367 /* *chrpos_high = highpair->genomepos + 1; */
368 return lowpair->genomepos + 1;
369 }
370 }
371 }
372
373
374 static List_T
push_token(List_T tokens,char * token)375 push_token (List_T tokens, char *token) {
376 char *copy;
377
378 copy = (char *) MALLOC_OUT((strlen(token)+1) * sizeof(char));
379 strcpy(copy,token);
380 return List_push_out(tokens,(void *) copy);
381 }
382
383 static void
tokens_free(List_T * tokens)384 tokens_free (List_T *tokens) {
385 List_T p;
386 char *token;
387
388 for (p = *tokens; p != NULL; p = List_next(p)) {
389 token = (char *) List_head(p);
390 FREE_OUT(token);
391 }
392 List_free_out(&(*tokens));
393
394 return;
395 }
396
397 static void
print_tokens(Filestring_T fp,List_T tokens)398 print_tokens (Filestring_T fp, List_T tokens) {
399 List_T p;
400 char *token;
401
402 for (p = tokens; p != NULL; p = List_next(p)) {
403 token = (char *) List_head(p);
404 FPRINTF(fp,"%s",token);
405 /* FREE_OUT(token); -- Now freed within Stage3end_free or Stage3_free */
406 }
407
408 return;
409 }
410
411 static int
cigar_length(List_T tokens)412 cigar_length (List_T tokens) {
413 int length = 0, tokenlength;
414 List_T p;
415 char *token;
416 char type;
417
418 for (p = tokens; p != NULL; p = List_next(p)) {
419 token = (char *) List_head(p);
420 type = token[strlen(token)-1];
421 /* Should include 'H', but that gets added according to hardclip_low and hardclip_high */
422 if (type == 'S' || type == 'I' || type == 'M' || type == 'X' || type == '=') {
423 sscanf(token,"%d",&tokenlength);
424 length += tokenlength;
425 }
426 }
427
428 return length;
429 }
430
431
432 static List_T
clean_cigar(List_T tokens,bool watsonp)433 clean_cigar (List_T tokens, bool watsonp) {
434 List_T clean, unique = NULL, p;
435 char token[11], *curr_token, *last_token;
436 int length = 0;
437 char type, last_type = ' ';
438 bool duplicatep = false;
439
440 for (p = tokens; p != NULL; p = List_next(p)) {
441 curr_token = (char *) List_head(p);
442 type = curr_token[strlen(curr_token)-1];
443 if (type == last_type) {
444 length += atoi(last_token);
445 FREE_OUT(last_token);
446 duplicatep = true;
447 } else {
448 if (last_type == ' ') {
449 /* Skip */
450 } else if (duplicatep == false) {
451 unique = List_push_out(unique,(void *) last_token);
452 } else {
453 length += atoi(last_token);
454 FREE_OUT(last_token);
455 sprintf(token,"%d%c",length,last_type);
456 unique = push_token(unique,token);
457 }
458 last_type = type;
459 duplicatep = false;
460 length = 0;
461 }
462 last_token = curr_token;
463 }
464 if (last_type == ' ') {
465 /* Skip */
466 } else if (duplicatep == false) {
467 unique = List_push_out(unique,(void *) last_token);
468 } else {
469 length += atoi(last_token);
470 FREE_OUT(last_token);
471 sprintf(token,"%d%c",length,last_type);
472 unique = push_token(unique,token);
473 }
474 List_free_out(&tokens);
475
476
477 if (sam_insert_0M_p == false) {
478 /* Return result */
479 if (watsonp) {
480 /* Put tokens in forward order */
481 return unique;
482 } else {
483 /* Keep tokens in reverse order */
484 return List_reverse(unique);
485 }
486
487 } else {
488 /* Insert "0M" between adjacent I and D operations */
489 last_type = ' ';
490 clean = (List_T) NULL;
491 for (p = unique; p != NULL; p = List_next(p)) {
492 curr_token = (char *) List_head(p);
493 type = curr_token[strlen(curr_token)-1];
494 if (last_type == 'I' && type == 'D') {
495 clean = push_token(clean,"0M");
496 } else if (last_type == 'D' && type == 'I') {
497 clean = push_token(clean,"0M");
498 }
499 clean = List_push_out(clean,(void *) curr_token);
500 last_type = type;
501 }
502 List_free_out(&unique);
503
504 /* Return result */
505 if (watsonp) {
506 /* Put tokens in forward order */
507 return List_reverse(clean);
508 } else {
509 /* Keep tokens in reverse order */
510 return clean;
511 }
512 }
513 }
514
515
516 static List_T
compute_cigar_standard(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int sensedir,int chimera_part)517 compute_cigar_standard (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
518 bool watsonp,
519 #ifdef CONVERT_INTRONS_TO_DELETIONS
520 int sensedir,
521 #endif
522 int chimera_part) {
523 List_T tokens = NULL;
524 char token[11];
525 int Mlength = 0, Ilength = 0, Dlength = 0;
526 bool in_exon = false, deletionp;
527 struct T *ptr, *prev, *this = NULL;
528 int exon_queryend = -1;
529 Chrpos_T exon_genomestart = 0;
530 Chrpos_T exon_genomeend, genome_gap;
531 int query_gap;
532 int last_querypos = -1;
533 Chrpos_T last_genomepos = (Chrpos_T) -1;
534 int i;
535
536 /* *chimera_hardclip_start = *chimera_hardclip_high = 0; */
537 *intronp = false;
538
539 ptr = pairs;
540
541 if (chimera_part == +1) {
542 if (ptr->querypos > *hardclip_start) {
543 if (ptr->querypos > 0) {
544 /* Clip to beginning */
545 *hardclip_start = ptr->querypos;
546 sprintf(token,"%dH",*hardclip_start);
547 tokens = push_token(tokens,token);
548 }
549 } else {
550 if (*hardclip_start > 0) {
551 /* Clip to hard clip boundary */
552 sprintf(token,"%dH",*hardclip_start);
553 tokens = push_token(tokens,token);
554 }
555 }
556 } else {
557 if (*hardclip_start > 0) {
558 sprintf(token,"%dH",*hardclip_start);
559 tokens = push_token(tokens,token);
560 }
561 if (ptr->querypos > (*hardclip_start)) {
562 sprintf(token,"%dS",ptr->querypos - (*hardclip_start));
563 tokens = push_token(tokens,token);
564 }
565 }
566
567 this = (T) NULL;
568 for (i = 0; i < npairs; i++) {
569 prev = this;
570 this = ptr++;
571
572 #if 0
573 /* Cigar_print_tokens(stdout,tokens); */
574 Pair_dump_one(this,true);
575 printf("\n");
576 #endif
577
578 if (this->gapp) {
579 if (in_exon == true) {
580 exon_queryend = last_querypos + 1;
581 exon_genomeend = last_genomepos + 1;
582 #if 0
583 if (watsonp) {
584 intron_start = exon_genomeend + 1;
585 } else {
586 intron_start = exon_genomeend - 1;
587 }
588 #endif
589
590 if (Mlength > 0) {
591 sprintf(token,"%dM",Mlength);
592 tokens = push_token(tokens,token);
593 } else if (Ilength > 0) {
594 sprintf(token,"%dI",Ilength);
595 tokens = push_token(tokens,token);
596 } else if (Dlength > 0) {
597 sprintf(token,"%dD",Dlength);
598 tokens = push_token(tokens,token);
599 }
600
601 Mlength = Ilength = Dlength = 0;
602
603 in_exon = false;
604 }
605
606 } else if (this->comp == INTRONGAP_COMP) {
607 /* Do nothing */
608
609 } else {
610 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
611 SHORTGAP_COMP, or MISMATCH_COMP */
612 if (in_exon == false) {
613 /* exon_querystart = this->querypos + 1; */
614 exon_genomestart = this->genomepos + 1;
615
616 if (prev != NULL) {
617 /* Gap */
618 /* abs() gives a large value when flag -m64 is specified */
619 /* genome_gap = abs(intron_end - intron_start) + 1; */
620 if (watsonp) {
621 /* intron_end = exon_genomestart - 1; */
622 /* genome_gap = (intron_end - intron_start) + 1; */
623 genome_gap = exon_genomestart - exon_genomeend - 1;
624 } else {
625 /* intron_end = exon_genomestart + 1; */
626 /* genome_gap = (intron_start - intron_end) + 1; */
627 genome_gap = exon_genomeend - exon_genomestart - 1;
628 }
629
630 deletionp = false;
631 #ifdef CONVERT_INTRONS_TO_DELETIONS
632 if (sensedir == SENSE_FORWARD) {
633 if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
634 prev->comp == FWD_GCAG_INTRON_COMP ||
635 prev->comp == FWD_ATAC_INTRON_COMP) {
636 sprintf(token,"%uN",genome_gap);
637 *intronp = true;
638 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
639 sprintf(token,"%uN",genome_gap);
640 *intronp = true;
641 } else {
642 sprintf(token,"%uD",genome_gap);
643 deletionp = true;
644 }
645 } else if (sensedir == SENSE_ANTI) {
646 if (prev->comp == REV_CANONICAL_INTRON_COMP ||
647 prev->comp == REV_GCAG_INTRON_COMP ||
648 prev->comp == REV_ATAC_INTRON_COMP) {
649 sprintf(token,"%uN",genome_gap);
650 *intronp = true;
651 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
652 sprintf(token,"%uN",genome_gap);
653 *intronp = true;
654 } else {
655 sprintf(token,"%uD",genome_gap);
656 deletionp = true;
657 }
658 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
659 sprintf(token,"%uN",genome_gap);
660 *intronp = true;
661 } else {
662 sprintf(token,"%uD",genome_gap);
663 deletionp = true;
664 }
665 #else
666 sprintf(token,"%uN",genome_gap);
667 *intronp = true;
668 #endif
669 tokens = push_token(tokens,token);
670
671 /* Check for dual gap. Doesn't work for hard clipping. */
672 /* assert(exon_queryend >= 0); */
673
674 query_gap = this->querypos - exon_queryend;
675 assert(query_gap >= 0);
676 if (query_gap > 0) {
677 if (deletionp == true && sam_insert_0M_p == true) {
678 /* Put zero matches between deletion and insertion, since some programs will complain */
679 sprintf(token,"0M");
680 tokens = push_token(tokens,token);
681 }
682
683 sprintf(token,"%uI",query_gap);
684 tokens = push_token(tokens,token);
685 }
686 }
687
688 in_exon = true;
689 }
690
691 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
692 /* Gap in upper or lower sequence */
693 if (this->genome == ' ') {
694 /* Insertion relative to genome */
695 if (Mlength > 0) {
696 sprintf(token,"%dM",Mlength);
697 tokens = push_token(tokens,token);
698 Mlength = 0;
699 } else if (Dlength > 0) {
700 /* unlikely */
701 sprintf(token,"%dD",Dlength);
702 tokens = push_token(tokens,token);
703 Dlength = 0;
704 }
705 Ilength++;
706 } else if (this->cdna == ' ') {
707 /* Deletion relative to genome */
708 if (Mlength > 0) {
709 sprintf(token,"%dM",Mlength);
710 tokens = push_token(tokens,token);
711 Mlength = 0;
712 } else if (Ilength > 0) {
713 sprintf(token,"%dI",Ilength);
714 tokens = push_token(tokens,token);
715 Ilength = 0;
716 }
717 Dlength++;
718 } else {
719 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
720 exit(9);
721 }
722
723 } else {
724 /* Count even if unknown base */
725
726 if (Ilength > 0) {
727 sprintf(token,"%dI",Ilength);
728 tokens = push_token(tokens,token);
729 Ilength = 0;
730 } else if (Dlength > 0) {
731 sprintf(token,"%dD",Dlength);
732 tokens = push_token(tokens,token);
733 Dlength = 0;
734 }
735 Mlength++;
736
737 }
738 }
739
740 if (this != NULL) {
741 if (this->cdna != ' ') {
742 last_querypos = this->querypos;
743 }
744 if (this->genome != ' ') {
745 last_genomepos = this->genomepos;
746 }
747 }
748 }
749
750 /* prev = this; */
751 /* exon_queryend = last_querypos + 1; */
752 /* exon_genomeend = last_genomepos + 1; */
753
754 if (Mlength > 0) {
755 sprintf(token,"%dM",Mlength);
756 tokens = push_token(tokens,token);
757 } else if (Ilength > 0) {
758 sprintf(token,"%dI",Ilength);
759 tokens = push_token(tokens,token);
760 } else if (Dlength > 0) {
761 sprintf(token,"%dD",Dlength);
762 tokens = push_token(tokens,token);
763 }
764
765
766 /* Terminal clipping */
767 if (chimera_part == -1) {
768 if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
769 if (last_querypos < querylength_given - 1) {
770 /* Clip to end */
771 *hardclip_end = querylength_given - 1 - last_querypos;
772 sprintf(token,"%dH",*hardclip_end);
773 tokens = push_token(tokens,token);
774 }
775 } else {
776 if (*hardclip_end > 0) {
777 /* Clip to hard clip boundary */
778 sprintf(token,"%dH",*hardclip_end);
779 tokens = push_token(tokens,token);
780 }
781 }
782 } else {
783 if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
784 sprintf(token,"%dS",querylength_given - 1 - (*hardclip_end) - last_querypos);
785 tokens = push_token(tokens,token);
786 }
787 if (*hardclip_end > 0) {
788 sprintf(token,"%dH",*hardclip_end);
789 tokens = push_token(tokens,token);
790 }
791 }
792
793 return clean_cigar(tokens,watsonp);
794 }
795
796
797 static List_T
compute_cigar_extended(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int sensedir,int chimera_part)798 compute_cigar_extended (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
799 bool watsonp,
800 #ifdef CONVERT_INTRONS_TO_DELETIONS
801 int sensedir,
802 #endif
803 int chimera_part) {
804 List_T tokens = NULL;
805 char token[11];
806 int Elength = 0, Xlength = 0, Ilength = 0, Dlength = 0;
807 bool in_exon = false, deletionp;
808 struct T *ptr, *prev, *this = NULL;
809 int exon_queryend = -1;
810 Chrpos_T exon_genomestart = 0;
811 Chrpos_T exon_genomeend, genome_gap;
812 int query_gap;
813 int last_querypos = -1;
814 Chrpos_T last_genomepos = (Chrpos_T) -1;
815 int i;
816
817 /* *chimera_hardclip_start = *chimera_hardclip_high = 0; */
818 *intronp = false;
819
820 ptr = pairs;
821
822 if (chimera_part == +1) {
823 if (ptr->querypos > *hardclip_start) {
824 if (ptr->querypos > 0) {
825 /* Clip to beginning */
826 *hardclip_start = ptr->querypos;
827 sprintf(token,"%dH",*hardclip_start);
828 tokens = push_token(tokens,token);
829 }
830 } else {
831 if (*hardclip_start > 0) {
832 /* Clip to hard clip boundary */
833 sprintf(token,"%dH",*hardclip_start);
834 tokens = push_token(tokens,token);
835 }
836 }
837 } else {
838 if (*hardclip_start > 0) {
839 sprintf(token,"%dH",*hardclip_start);
840 tokens = push_token(tokens,token);
841 }
842 if (ptr->querypos > (*hardclip_start)) {
843 sprintf(token,"%dS",ptr->querypos - (*hardclip_start));
844 tokens = push_token(tokens,token);
845 }
846 }
847
848 this = (T) NULL;
849 for (i = 0; i < npairs; i++) {
850 prev = this;
851 this = ptr++;
852
853 #if 0
854 /* Cigar_print_tokens(stdout,tokens); */
855 Pair_dump_one(this,true);
856 printf("\n");
857 #endif
858
859 if (this->gapp) {
860 if (in_exon == true) {
861 exon_queryend = last_querypos + 1;
862 exon_genomeend = last_genomepos + 1;
863 #if 0
864 if (watsonp) {
865 intron_start = exon_genomeend + 1;
866 } else {
867 intron_start = exon_genomeend - 1;
868 }
869 #endif
870
871 if (Elength > 0) {
872 sprintf(token,"%d=",Elength);
873 tokens = push_token(tokens,token);
874 } else if (Xlength > 0) {
875 sprintf(token,"%dX",Xlength);
876 tokens = push_token(tokens,token);
877 } else if (Ilength > 0) {
878 sprintf(token,"%dI",Ilength);
879 tokens = push_token(tokens,token);
880 } else if (Dlength > 0) {
881 sprintf(token,"%dD",Dlength);
882 tokens = push_token(tokens,token);
883 }
884
885 Elength = Xlength = Ilength = Dlength = 0;
886
887 in_exon = false;
888 }
889
890 } else if (this->comp == INTRONGAP_COMP) {
891 /* Do nothing */
892
893 } else {
894 /* Remaining possibilities are MATCH_COMP, DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, INDEL_COMP,
895 SHORTGAP_COMP, or MISMATCH_COMP */
896 if (in_exon == false) {
897 /* exon_querystart = this->querypos + 1; */
898 exon_genomestart = this->genomepos + 1;
899
900 if (prev != NULL) {
901 /* Gap */
902 /* abs() gives a large value when flag -m64 is specified */
903 /* genome_gap = abs(intron_end - intron_start) + 1; */
904 if (watsonp) {
905 /* intron_end = exon_genomestart - 1; */
906 /* genome_gap = (intron_end - intron_start) + 1; */
907 genome_gap = exon_genomestart - exon_genomeend - 1;
908 } else {
909 /* intron_end = exon_genomestart + 1; */
910 /* genome_gap = (intron_start - intron_end) + 1; */
911 genome_gap = exon_genomeend - exon_genomestart - 1;
912 }
913
914 deletionp = false;
915 #ifdef CONVERT_INTRONS_TO_DELETIONS
916 if (sensedir == SENSE_FORWARD) {
917 if (prev->comp == FWD_CANONICAL_INTRON_COMP ||
918 prev->comp == FWD_GCAG_INTRON_COMP ||
919 prev->comp == FWD_ATAC_INTRON_COMP) {
920 sprintf(token,"%uN",genome_gap);
921 *intronp = true;
922 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
923 sprintf(token,"%uN",genome_gap);
924 *intronp = true;
925 } else {
926 sprintf(token,"%uD",genome_gap);
927 deletionp = true;
928 }
929 } else if (sensedir == SENSE_ANTI) {
930 if (prev->comp == REV_CANONICAL_INTRON_COMP ||
931 prev->comp == REV_GCAG_INTRON_COMP ||
932 prev->comp == REV_ATAC_INTRON_COMP) {
933 sprintf(token,"%uN",genome_gap);
934 *intronp = true;
935 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN) {
936 sprintf(token,"%uN",genome_gap);
937 *intronp = true;
938 } else {
939 sprintf(token,"%uD",genome_gap);
940 deletionp = true;
941 }
942 } else if (cigar_noncanonical_splices_p == true && genome_gap >= MIN_INTRONLEN){
943 sprintf(token,"%uN",genome_gap);
944 *intronp = true;
945 } else {
946 sprintf(token,"%uD",genome_gap);
947 deletionp = true;
948 }
949 #else
950 sprintf(token,"%uN",genome_gap);
951 *intronp = true;
952 #endif
953 tokens = push_token(tokens,token);
954
955 /* Check for dual gap. Doesn't work for hard clipping. */
956 /* assert(exon_queryend >= 0); */
957
958 query_gap = this->querypos - exon_queryend;
959 assert(query_gap >= 0);
960 if (query_gap > 0) {
961 if (deletionp == true && sam_insert_0M_p == true) {
962 /* Put zero matches between deletion and insertion, since some programs will complain */
963 sprintf(token,"0M");
964 tokens = push_token(tokens,token);
965 }
966
967 sprintf(token,"%uI",query_gap);
968 tokens = push_token(tokens,token);
969 }
970 }
971
972 in_exon = true;
973 }
974
975 if (this->comp == INDEL_COMP || this->comp == SHORTGAP_COMP) {
976 /* Gap in upper or lower sequence */
977 if (this->genome == ' ') {
978 /* Insertion relative to genome */
979 if (Elength > 0) {
980 sprintf(token,"%d=",Elength);
981 tokens = push_token(tokens,token);
982 Elength = 0;
983 } else if (Xlength > 0) {
984 sprintf(token,"%dX",Xlength);
985 tokens = push_token(tokens,token);
986 Xlength = 0;
987 } else if (Dlength > 0) {
988 /* unlikely */
989 sprintf(token,"%dD",Dlength);
990 tokens = push_token(tokens,token);
991 Dlength = 0;
992 }
993 Ilength++;
994 } else if (this->cdna == ' ') {
995 /* Deletion relative to genome */
996 if (Elength > 0) {
997 sprintf(token,"%d=",Elength);
998 tokens = push_token(tokens,token);
999 Elength = 0;
1000 } else if (Xlength > 0) {
1001 sprintf(token,"%dX",Xlength);
1002 tokens = push_token(tokens,token);
1003 Xlength = 0;
1004 } else if (Ilength > 0) {
1005 sprintf(token,"%dI",Ilength);
1006 tokens = push_token(tokens,token);
1007 Ilength = 0;
1008 }
1009 Dlength++;
1010 } else {
1011 fprintf(stderr,"Error at %c%c%c\n",this->genome,this->comp,this->cdna);
1012 exit(9);
1013 }
1014
1015 } else {
1016 /* Count even if unknown base */
1017
1018 if (Ilength > 0) {
1019 sprintf(token,"%dI",Ilength);
1020 tokens = push_token(tokens,token);
1021 Ilength = 0;
1022 } else if (Dlength > 0) {
1023 sprintf(token,"%dD",Dlength);
1024 tokens = push_token(tokens,token);
1025 Dlength = 0;
1026 }
1027
1028 if (prev == NULL || prev->gapp || prev->comp == INDEL_COMP || prev->comp == SHORTGAP_COMP) {
1029 if (this->cdna == this->genome) {
1030 Elength++;
1031 } else {
1032 Xlength++;
1033 }
1034
1035 } else if (prev->cdna == prev->genome) {
1036 if (this->cdna == this->genome) {
1037 Elength++;
1038 } else {
1039 if (Elength > 0) {
1040 sprintf(token,"%d=",Elength);
1041 tokens = push_token(tokens,token);
1042 Elength = 0;
1043 }
1044 Xlength++;
1045 }
1046
1047 } else {
1048 if (this->cdna != this->genome) {
1049 Xlength++;
1050 } else {
1051 if (Xlength > 0) {
1052 sprintf(token,"%dX",Xlength);
1053 tokens = push_token(tokens,token);
1054 Xlength = 0;
1055 }
1056 Elength++;
1057 }
1058 }
1059 }
1060 }
1061
1062 if (this != NULL) {
1063 if (this->cdna != ' ') {
1064 last_querypos = this->querypos;
1065 }
1066 if (this->genome != ' ') {
1067 last_genomepos = this->genomepos;
1068 }
1069 }
1070 }
1071
1072 /* prev = this; */
1073 /* exon_queryend = last_querypos + 1; */
1074 /* exon_genomeend = last_genomepos + 1; */
1075
1076 if (Elength > 0) {
1077 sprintf(token,"%d=",Elength);
1078 tokens = push_token(tokens,token);
1079 } else if (Xlength > 0) {
1080 sprintf(token,"%dX",Xlength);
1081 tokens = push_token(tokens,token);
1082 } else if (Ilength > 0) {
1083 sprintf(token,"%dI",Ilength);
1084 tokens = push_token(tokens,token);
1085 } else if (Dlength > 0) {
1086 sprintf(token,"%dD",Dlength);
1087 tokens = push_token(tokens,token);
1088 }
1089
1090
1091 /* Terminal clipping */
1092 if (chimera_part == -1) {
1093 if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
1094 if (last_querypos < querylength_given - 1) {
1095 /* Clip to end */
1096 *hardclip_end = querylength_given - 1 - last_querypos;
1097 sprintf(token,"%dH",*hardclip_end);
1098 tokens = push_token(tokens,token);
1099 }
1100 } else {
1101 if (*hardclip_end > 0) {
1102 /* Clip to hard clip boundary */
1103 sprintf(token,"%dH",*hardclip_end);
1104 tokens = push_token(tokens,token);
1105 }
1106 }
1107 } else {
1108 if (last_querypos < querylength_given - 1 - (*hardclip_end)) {
1109 sprintf(token,"%dS",querylength_given - 1 - (*hardclip_end) - last_querypos);
1110 tokens = push_token(tokens,token);
1111 }
1112 if (*hardclip_end > 0) {
1113 sprintf(token,"%dH",*hardclip_end);
1114 tokens = push_token(tokens,token);
1115 }
1116 }
1117
1118 return clean_cigar(tokens,watsonp);
1119 }
1120
1121
1122 static List_T
compute_cigar(bool * intronp,int * hardclip_start,int * hardclip_end,struct T * pairs,int npairs,int querylength_given,bool watsonp,int chimera_part)1123 compute_cigar (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given,
1124 bool watsonp, int chimera_part) {
1125 if (cigar_extended_p == true) {
1126 return compute_cigar_extended(&(*intronp),&(*hardclip_start),&(*hardclip_end),pairs,npairs,querylength_given,
1127 watsonp,chimera_part);
1128 } else {
1129 return compute_cigar_standard(&(*intronp),&(*hardclip_start),&(*hardclip_end),pairs,npairs,querylength_given,
1130 watsonp,chimera_part);
1131 }
1132 }
1133
1134
1135 typedef enum {IN_MATCHES, IN_MISMATCHES, IN_DELETION} MD_state_T;
1136
1137 static char complCode[128] = COMPLEMENT_LC;
1138
1139 static List_T
compute_md_string(int * nmismatches_refdiff,int * nmismatches_bothdiff,int * nindels,struct T * pairs,int npairs,bool watsonp,List_T cigar_tokens)1140 compute_md_string (int *nmismatches_refdiff, int *nmismatches_bothdiff, int *nindels,
1141 struct T *pairs, int npairs, bool watsonp, List_T cigar_tokens) {
1142 List_T md_tokens = NULL, p;
1143 char *cigar_token, token[11], *first_token, type;
1144 T this;
1145 int nmatches = 0, length;
1146 MD_state_T state = IN_MISMATCHES;
1147 int i, k = 0;
1148
1149 *nmismatches_refdiff = *nmismatches_bothdiff = *nindels = 0;
1150
1151 debug4(Pair_dump_array(pairs,npairs,true));
1152 debug4(printf("watsonp %d\n",watsonp));
1153
1154 if (watsonp == true) {
1155 for (p = cigar_tokens; p != NULL; p = List_next(p)) {
1156 cigar_token = (char *) List_head(p);
1157 debug4(printf("token is %s\n",cigar_token));
1158 type = cigar_token[strlen(cigar_token)-1];
1159 length = atoi(cigar_token);
1160
1161 if (type == 'H') {
1162 /* k += length; */
1163
1164 } else if (type == 'S') {
1165 /* k += length; */
1166
1167 } else if (type == 'M' || type == 'X' || type == '=') {
1168 for (i = 0; i < length; i++, k++) {
1169 this = &(pairs[k]);
1170 debug4(printf("M %d/%d comp %c\n",i,length,this->comp));
1171 if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
1172 nmatches++;
1173 state = IN_MATCHES;
1174
1175 } else if (this->comp == MISMATCH_COMP) {
1176 if (state == IN_MATCHES) {
1177 sprintf(token,"%d",nmatches);
1178 md_tokens = push_token(md_tokens,token);
1179 nmatches = 0;
1180 } else if (state == IN_DELETION) {
1181 md_tokens = push_token(md_tokens,"0");
1182 }
1183 state = IN_MISMATCHES;
1184
1185 *nmismatches_refdiff += 1;
1186 if (md_lowercase_variant_p && this->cdna == this->genomealt) {
1187 /* A mismatch against the reference only => alternate variant */
1188 sprintf(token,"%c",tolower(this->genome));
1189 } else {
1190 /* A true mismatch against both variants */
1191 *nmismatches_bothdiff += 1;
1192 sprintf(token,"%c",this->genome);
1193 }
1194 md_tokens = push_token(md_tokens,token);
1195
1196 } else {
1197 fprintf(stderr,"Unexpected comp '%c'\n",this->comp);
1198 abort();
1199 }
1200 }
1201
1202 } else if (type == 'I') {
1203 while (k < npairs && pairs[k].comp == INDEL_COMP && pairs[k].genome == ' ') {
1204 *nindels += 1;
1205 k++;
1206 }
1207 state = IN_MATCHES;
1208
1209 } else if (type == 'N') {
1210 while (k < npairs && pairs[k].gapp == true) {
1211 k++;
1212 }
1213
1214 } else if (type == 'D') {
1215 if (state == IN_MATCHES) {
1216 if (nmatches > 0) {
1217 sprintf(token,"%d",nmatches);
1218 md_tokens = push_token(md_tokens,token);
1219 nmatches = 0;
1220 }
1221 }
1222
1223 if (state != IN_DELETION) {
1224 md_tokens = push_token(md_tokens,"^");
1225 }
1226 for (i = 0; i < length; i++, k++) {
1227 this = &(pairs[k]);
1228 sprintf(token,"%c",this->genome);
1229 md_tokens = push_token(md_tokens,token);
1230 *nindels += 1;
1231 }
1232
1233 state = IN_DELETION;
1234
1235 } else {
1236 fprintf(stderr,"Don't recognize type %c\n",type);
1237 abort();
1238 }
1239 }
1240
1241 if (nmatches > 0) {
1242 sprintf(token,"%d",nmatches);
1243 md_tokens = push_token(md_tokens,token);
1244 }
1245
1246 md_tokens = List_reverse(md_tokens);
1247
1248 } else {
1249 cigar_tokens = List_reverse(cigar_tokens);
1250 for (p = cigar_tokens; p != NULL; p = List_next(p)) {
1251 cigar_token = (char *) List_head(p);
1252 debug4(printf("token is %s\n",cigar_token));
1253 type = cigar_token[strlen(cigar_token)-1];
1254 length = atoi(cigar_token);
1255
1256 if (type == 'H') {
1257 /* k += length; */
1258
1259 } else if (type == 'S') {
1260 /* k += length; */
1261
1262 } else if (type == 'M' || type == 'X' || type == '=') {
1263 if (state == IN_DELETION) {
1264 md_tokens = push_token(md_tokens,"^");
1265 }
1266
1267 for (i = 0; i < length; i++, k++) {
1268 this = &(pairs[k]);
1269 debug4(printf("M %d/%d comp %c\n",i,length,this->comp));
1270 if (this->comp == MATCH_COMP || this->comp == DYNPROG_MATCH_COMP || this->comp == AMBIGUOUS_COMP) {
1271 nmatches++;
1272 state = IN_MATCHES;
1273
1274 } else if (this->comp == MISMATCH_COMP) {
1275 if (state == IN_MATCHES) {
1276 sprintf(token,"%d",nmatches);
1277 md_tokens = push_token(md_tokens,token);
1278 nmatches = 0;
1279 }
1280 state = IN_MISMATCHES;
1281
1282 *nmismatches_refdiff += 1;
1283
1284 if (md_lowercase_variant_p && this->cdna == this->genomealt) {
1285 /* A mismatch against the reference only => alternate variant */
1286 sprintf(token,"%c",tolower(complCode[(int) this->genome]));
1287 } else {
1288 *nmismatches_bothdiff += 1;
1289 sprintf(token,"%c",complCode[(int) this->genome]);
1290 }
1291 md_tokens = push_token(md_tokens,token);
1292
1293
1294 } else {
1295 fprintf(stderr,"Unexpected comp '%c'\n",this->comp);
1296 abort();
1297 }
1298 }
1299
1300 } else if (type == 'I') {
1301 if (state == IN_DELETION) {
1302 md_tokens = push_token(md_tokens,"^");
1303 }
1304
1305 while (k < npairs && pairs[k].comp == INDEL_COMP && pairs[k].genome == ' ') {
1306 *nindels += 1;
1307 k++;
1308 }
1309 state = IN_MATCHES;
1310
1311 } else if (type == 'N') {
1312 #if 0
1313 /* Ignore deletion adjacent to intron, to avoid double ^^ */
1314 if (state == IN_DELETION) {
1315 md_tokens = push_token(md_tokens,"^");
1316 }
1317 #endif
1318
1319 while (k < npairs && pairs[k].gapp == true) {
1320 k++;
1321 }
1322
1323 } else if (type == 'D') {
1324 if (state == IN_MATCHES) {
1325 if (nmatches > 0) {
1326 sprintf(token,"%d",nmatches);
1327 md_tokens = push_token(md_tokens,token);
1328 nmatches = 0;
1329 }
1330 } else if (state == IN_MISMATCHES) {
1331 md_tokens = push_token(md_tokens,"0");
1332 }
1333
1334 for (i = 0; i < length; i++, k++) {
1335 this = &(pairs[k]);
1336 sprintf(token,"%c",complCode[(int) this->genome]);
1337 md_tokens = push_token(md_tokens,token);
1338 *nindels += 1;
1339 }
1340 state = IN_DELETION;
1341
1342 } else {
1343 fprintf(stderr,"Don't recognize type %c\n",type);
1344 abort();
1345 }
1346 }
1347
1348 if (nmatches > 0) {
1349 sprintf(token,"%d",nmatches);
1350 md_tokens = push_token(md_tokens,token);
1351 }
1352
1353 /* Restore cigar_tokens */
1354 cigar_tokens = List_reverse(cigar_tokens);
1355 }
1356
1357 assert(k == npairs);
1358
1359 /* Insert initial 0 token if necessary */
1360 if (md_tokens != NULL) {
1361 first_token = (char *) List_head(md_tokens);
1362 if (!isdigit(first_token[0])) {
1363 md_tokens = push_token(md_tokens,"0");
1364 }
1365 }
1366
1367 return md_tokens;
1368 }
1369
1370
1371 /* Modeled after Shortread_print_chopped */
1372 static void
print_chopped(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)1373 print_chopped (Filestring_T fp, char *contents, int querylength,
1374 int hardclip_start, int hardclip_end) {
1375 int i;
1376
1377 for (i = hardclip_start; i < querylength - hardclip_end; i++) {
1378 PUTC(contents[i],fp);
1379 }
1380 return;
1381 }
1382
1383 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
1384 static void
print_chopped_revcomp(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)1385 print_chopped_revcomp (Filestring_T fp, char *contents, int querylength,
1386 int hardclip_start, int hardclip_end) {
1387 int i;
1388
1389 for (i = querylength - 1 - hardclip_end; i >= hardclip_start; --i) {
1390 PUTC(complCode[(int) contents[i]],fp);
1391 }
1392 return;
1393 }
1394
1395
1396 static void
print_chopped_end(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)1397 print_chopped_end (Filestring_T fp, char *contents, int querylength,
1398 int hardclip_start, int hardclip_end) {
1399 int i;
1400
1401 for (i = 0; i < hardclip_start; i++) {
1402 PUTC(contents[i],fp);
1403 }
1404
1405 /* No separator */
1406
1407 for (i = querylength - hardclip_end; i < querylength; i++) {
1408 PUTC(contents[i],fp);
1409 }
1410
1411 return;
1412 }
1413
1414 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
1415 static void
print_chopped_end_revcomp(Filestring_T fp,char * contents,int querylength,int hardclip_start,int hardclip_end)1416 print_chopped_end_revcomp (Filestring_T fp, char *contents, int querylength,
1417 int hardclip_start, int hardclip_end) {
1418 int i;
1419
1420 for (i = querylength - 1; i >= querylength - hardclip_end; --i) {
1421 PUTC(complCode[(int) contents[i]],fp);
1422 }
1423
1424 /* No separator */
1425
1426 for (i = hardclip_start - 1; i >= 0; --i) {
1427 PUTC(complCode[(int) contents[i]],fp);
1428 }
1429
1430 return;
1431 }
1432
1433
1434 static void
print_chopped_end_quality(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end)1435 print_chopped_end_quality (Filestring_T fp, char *quality, int querylength,
1436 int hardclip_start, int hardclip_end) {
1437 int i;
1438
1439 if (hardclip_start > 0) {
1440 for (i = 0; i < hardclip_start; i++) {
1441 PUTC(quality[i],fp);
1442 }
1443 return;
1444
1445 } else {
1446 for (i = querylength - hardclip_end; i < querylength; i++) {
1447 PUTC(quality[i],fp);
1448 }
1449 return;
1450 }
1451 }
1452
1453 /* Differs from Shortread version, in that hardclip_high and hardclip_low are not reversed */
1454 static void
print_chopped_end_quality_reverse(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end)1455 print_chopped_end_quality_reverse (Filestring_T fp, char *quality, int querylength,
1456 int hardclip_start, int hardclip_end) {
1457 int i;
1458
1459 if (hardclip_start > 0) {
1460 for (i = hardclip_start - 1; i >= 0; --i) {
1461 PUTC(quality[i],fp);
1462 }
1463 return;
1464
1465 } else {
1466 for (i = querylength - 1; i >= querylength - hardclip_end; --i) {
1467 PUTC(quality[i],fp);
1468 }
1469 return;
1470 }
1471 }
1472
1473
1474
1475 /* Modeled after Shortread_print_quality */
1476 static void
print_quality(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end,int shift)1477 print_quality (Filestring_T fp, char *quality, int querylength,
1478 int hardclip_start, int hardclip_end, int shift) {
1479 int i;
1480 int c;
1481
1482 if (quality == NULL) {
1483 PUTC('*',fp);
1484 } else {
1485 for (i = hardclip_start; i < querylength - hardclip_end; i++) {
1486 if ((c = quality[i] + shift) <= 32) {
1487 fprintf(stderr,"Warning: With a quality-print-shift of %d, QC score %c becomes non-printable. May need to specify --quality-protocol or --quality-print-shift\n",
1488 shift,quality[i]);
1489 abort();
1490 } else {
1491 PUTC(c,fp);
1492 }
1493 }
1494 }
1495 return;
1496 }
1497
1498
1499 static void
print_quality_revcomp(Filestring_T fp,char * quality,int querylength,int hardclip_start,int hardclip_end,int shift)1500 print_quality_revcomp (Filestring_T fp, char *quality, int querylength,
1501 int hardclip_start, int hardclip_end, int shift) {
1502 int i;
1503 int c;
1504
1505 if (quality == NULL) {
1506 PUTC('*',fp);
1507 } else {
1508 for (i = querylength - 1 - hardclip_end; i >= hardclip_start; --i) {
1509 if ((c = quality[i] + shift) <= 32) {
1510 fprintf(stderr,"Warning: With a quality-print-shift of %d, QC score %c becomes non-printable. May need to specify --quality-protocol or --quality-print-shift\n",
1511 shift,quality[i]);
1512 abort();
1513 } else {
1514 PUTC(c,fp);
1515 }
1516 }
1517 }
1518
1519 return;
1520 }
1521
1522 /* Derived from print_gff3_cdna_match */
1523 /* Assumes pairarray has been hard clipped already */
1524 static void
print_overlap_sam_line(Filestring_T fp,char * abbrev,char * acc1,char * acc2,char * chrstring,bool watsonp,int sensedir,List_T cigar_tokens,List_T md_tokens,int nmismatches_refdiff,int nmismatches_bothdiff,int nindels,bool intronp,char * queryseq_ptr,char * quality_string,int hardclip_start,int hardclip_end,int querylength,int quality_shift,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,unsigned int flag,Chrpos_T chrpos,Chrpos_T chrlength,Shortread_T queryseq,int pair_mapq_score,int end_mapq_score,Stage3pair_T stage3pair,Stage3end_T stage3hit,bool first_read_p,char * sam_read_group_id)1525 print_overlap_sam_line (Filestring_T fp, char *abbrev, char *acc1, char *acc2, char *chrstring,
1526 bool watsonp, int sensedir, List_T cigar_tokens, List_T md_tokens,
1527 int nmismatches_refdiff, int nmismatches_bothdiff, int nindels,
1528 bool intronp, char *queryseq_ptr, char *quality_string,
1529 int hardclip_start, int hardclip_end,
1530 int querylength, int quality_shift,
1531 int pathnum, int npaths_primary, int npaths_altloc, int absmq_score, int second_absmq, unsigned int flag,
1532 Chrpos_T chrpos, Chrpos_T chrlength,
1533
1534 Shortread_T queryseq, int pair_mapq_score, int end_mapq_score,
1535 Stage3pair_T stage3pair, Stage3end_T stage3hit, bool first_read_p,
1536 char *sam_read_group_id) {
1537 bool invertp = false;
1538
1539 /* Should already be checked when Stage3_T or Stage3end_T object was created */
1540 if (cigar_action == CIGAR_ACTION_IGNORE) {
1541 /* Don't check */
1542 } else if (cigar_length(cigar_tokens) + hardclip_start + hardclip_end == querylength) {
1543 /* Okay */
1544 } else if (cigar_action == CIGAR_ACTION_WARNING) {
1545 fprintf(stderr,"Warning: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
1546 acc1,cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
1547 } else if (cigar_action == CIGAR_ACTION_NOPRINT) {
1548 fprintf(stderr,"Warning: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
1549 acc1,cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
1550 return;
1551 } else {
1552 /* CIGAR_ACTION_ABORT */
1553 fprintf(stderr,"Error: for %s, CIGAR length %d plus hardclips %d and %d do not match sequence length %d\n",
1554 acc1,cigar_length(cigar_tokens),hardclip_start,hardclip_end,querylength);
1555 abort();
1556 }
1557
1558 /* 1. QNAME or Accession */
1559 if (acc2 == NULL) {
1560 FPRINTF(fp,"%s\t",acc1);
1561 } else {
1562 FPRINTF(fp,"%s,%s\t",acc1,acc2);
1563 }
1564
1565 /* 2. Flags */
1566 FPRINTF(fp,"%u\t",flag);
1567
1568 /* 3. RNAME or Chrstring */
1569 /* 4. POS or Chrlow */
1570 /* Taken from GMAP part of SAM_chromosomal_pos */
1571 if (chrpos > chrlength) {
1572 FPRINTF(fp,"%s\t%u\t",chrstring,chrpos - chrlength /*+ 1*/);
1573 } else {
1574 FPRINTF(fp,"%s\t%u\t",chrstring,chrpos /*+ 1*/);
1575 }
1576
1577 /* 5. MAPQ or Mapping quality */
1578 FPRINTF(fp,"%d\t",pair_mapq_score);
1579
1580 /* 6. CIGAR */
1581 print_tokens(fp,cigar_tokens);
1582
1583 /* 7. MRNM: Mate chr */
1584 /* 8. MPOS: Mate chrpos */
1585 FPRINTF(fp,"\t*\t0"); /* Because sequence and mate are merged */
1586
1587 /* 9. ISIZE: Insert size */
1588 FPRINTF(fp,"\t0"); /* Because sequence and mate are merged */
1589
1590 /* 10. SEQ: queryseq and 11. QUAL: quality_scores */
1591 FPRINTF(fp,"\t");
1592 if (watsonp == true) {
1593 print_chopped(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
1594 FPRINTF(fp,"\t");
1595 print_quality(fp,quality_string,querylength,hardclip_start,hardclip_end,
1596 quality_shift);
1597 } else {
1598 print_chopped_revcomp(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
1599 FPRINTF(fp,"\t");
1600 print_quality_revcomp(fp,quality_string,querylength,hardclip_start,hardclip_end,
1601 quality_shift);
1602 }
1603
1604
1605 /* 12. TAGS: RG */
1606 if (sam_read_group_id != NULL) {
1607 FPRINTF(fp,"\tRG:Z:%s",sam_read_group_id);
1608 }
1609
1610 /* 12. TAGS: XH and XI */
1611 if (hardclip_start > 0 || hardclip_end > 0) {
1612 FPRINTF(fp,"\tXH:Z:");
1613 if (watsonp == true) {
1614 print_chopped_end(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
1615 } else {
1616 print_chopped_end_revcomp(fp,queryseq_ptr,querylength,hardclip_start,hardclip_end);
1617 }
1618
1619 if (quality_string != NULL) {
1620 FPRINTF(fp,"\tXI:Z:");
1621 if (watsonp == true) {
1622 print_chopped_end_quality(fp,quality_string,querylength,hardclip_start,hardclip_end);
1623 } else {
1624 print_chopped_end_quality_reverse(fp,quality_string,querylength,hardclip_start,hardclip_end);
1625 }
1626 }
1627 }
1628
1629 if (queryseq != NULL) {
1630 /* 12. TAGS: XB */
1631 Shortread_print_barcode(fp,queryseq);
1632
1633 /* 12. TAGS: XP. Logically should be last in reconstructing a read. */
1634 Shortread_print_chop(fp,queryseq,invertp);
1635 }
1636
1637 /* 12. TAGS: MD string */
1638 FPRINTF(fp,"\tMD:Z:");
1639 print_tokens(fp,md_tokens);
1640
1641 /* 12. TAGS: NH */
1642 FPRINTF(fp,"\tNH:i:%d",npaths_primary + npaths_altloc);
1643
1644 /* 12. TAGS: HI */
1645 FPRINTF(fp,"\tHI:i:%d",pathnum);
1646
1647 /* 12. TAGS: NM */
1648 FPRINTF(fp,"\tNM:i:%d",nmismatches_refdiff + nindels);
1649
1650 if (snps_p) {
1651 /* 12. TAGS: XW and XV */
1652 FPRINTF(fp,"\tXW:i:%d",nmismatches_bothdiff);
1653 FPRINTF(fp,"\tXV:i:%d",nmismatches_refdiff - nmismatches_bothdiff);
1654 }
1655
1656
1657 /* 12. TAGS: SM */
1658 FPRINTF(fp,"\tSM:i:%d",end_mapq_score);
1659
1660 /* 12. TAGS: XQ */
1661 FPRINTF(fp,"\tXQ:i:%d",absmq_score);
1662
1663 /* 12. TAGS: X2 */
1664 FPRINTF(fp,"\tX2:i:%d",second_absmq);
1665
1666 /* 12. TAGS: XO */
1667 FPRINTF(fp,"\tXO:Z:%s",abbrev);
1668
1669 /* 12. TAGS: XS */
1670 if (splicingp == false) {
1671 /* Do not print XS field */
1672
1673 } else if (sensedir == SENSE_FORWARD) {
1674 if (watsonp == true) {
1675 FPRINTF(fp,"\tXS:A:+");
1676 } else {
1677 FPRINTF(fp,"\tXS:A:-");
1678 }
1679
1680 } else if (sensedir == SENSE_ANTI) {
1681 if (watsonp == true) {
1682 FPRINTF(fp,"\tXS:A:-");
1683 } else {
1684 FPRINTF(fp,"\tXS:A:+");
1685 }
1686
1687 } else if (intronp == false) {
1688 /* Skip. No intron in this end and mate is not revealing. */
1689
1690 #if 0
1691 } else if (force_xs_direction_p == true) {
1692 /* Don't print XS field for SENSE_NULL */
1693 /* Could not determine sense, so just report arbitrarily as + */
1694 /* This option provided for users of Cufflinks, which cannot handle XS:A:? */
1695 FPRINTF(fp,"\tXS:A:+");
1696
1697 } else {
1698 /* Non-canonical. Don't report. */
1699 FPRINTF(fp,"\tXS:A:?");
1700 #endif
1701 }
1702
1703
1704 /* 12. TAGS: XX, XY */
1705 /* TODO: Merge transcripts */
1706
1707 #if 0
1708 /* TAGS: XZ */
1709 if (Stage3end_transcripts_other(stage3hit) != NULL) {
1710 FPRINTF(fp,"\tXZ:Z:");
1711 Transcript_print_info(fp,Stage3end_transcripts_other(stage3hit),transcript_iit,invertp);
1712 }
1713 #endif
1714
1715 /* 12. TAGS: XC (circular). Not currently handled for GMAP */
1716
1717 /* 12. TAGS: XG. Not currently handled for GMAP */
1718 Method_samprint(fp,Stage3end_method(stage3hit));
1719
1720
1721 FPRINTF(fp,"\n");
1722
1723 return;
1724 }
1725
1726
1727 /* Modified from Pair_print_sam */
1728 void
Simplepair_overlap_print_sam(Filestring_T fp,char * abbrev,struct T * pairarray,int npairs,char * acc1,char * acc2,Chrnum_T chrnum,Univ_IIT_T chromosome_iit,char * queryseq_ptr,char * quality_string,int hardclip_low,int hardclip_high,int querylength_given,bool watsonp,int sensedir,int quality_shift,bool first_read_p,int pathnum,int npaths_primary,int npaths_altloc,int absmq_score,int second_absmq,Chrpos_T chrpos,Chrpos_T chrlength,Shortread_T queryseq,unsigned int flag,int pair_mapq_score,int end_mapq_score,Stage3pair_T stage3pair,Stage3end_T stage3hit,char * sam_read_group_id)1729 Simplepair_overlap_print_sam (Filestring_T fp, char *abbrev, struct T *pairarray, int npairs,
1730 char *acc1, char *acc2, Chrnum_T chrnum, Univ_IIT_T chromosome_iit,
1731 char *queryseq_ptr, char *quality_string,
1732 int hardclip_low, int hardclip_high, int querylength_given,
1733 bool watsonp, int sensedir,
1734 int quality_shift, bool first_read_p, int pathnum, int npaths_primary, int npaths_altloc,
1735 int absmq_score, int second_absmq, Chrpos_T chrpos, Chrpos_T chrlength,
1736
1737 Shortread_T queryseq, unsigned int flag,
1738 int pair_mapq_score, int end_mapq_score,
1739 Stage3pair_T stage3pair, Stage3end_T stage3hit,
1740
1741 char *sam_read_group_id) {
1742 char *chrstring = NULL;
1743 int chimera_part = 0;
1744 #if 0
1745 char *mate_chrstring, *mate_chrstring_alloc = NULL;
1746 #elif defined(GSNAP)
1747 #else
1748 unsigned int flag;
1749 #endif
1750
1751 List_T cigar_tokens, md_tokens = NULL;
1752 int nmismatches_refdiff, nmismatches_bothdiff, nindels;
1753 bool intronp;
1754 int hardclip_start, hardclip_end;
1755 /* int hardclip_start_zero = 0, hardclip_end_zero = 0; */
1756 struct T *clipped_pairarray;
1757 int clipped_npairs;
1758 bool cigar_tokens_alloc;
1759
1760
1761 if (chrnum == 0) {
1762 /* chrstring = Sequence_accession(usersegment); */
1763 fprintf(stderr,"Error in Simplepair_overlap_print_sam: chrnum == 0\n");
1764 exit(9);
1765 } else {
1766 chrstring = Chrnum_to_string(chrnum,chromosome_iit);
1767 }
1768
1769 #if 0
1770 /* Previously this was code for GSNAP. However for GSNAP, this is called only when the sequence and mate are merged */
1771 if (mate_chrpos_low == 0U) {
1772 mate_chrstring = "*";
1773 } else if (mate_chrnum == 0) {
1774 abort();
1775 } else if (/* chrpos > 0U && chrnum > 0 && */ mate_chrnum == chrnum) {
1776 mate_chrstring = "=";
1777 } else {
1778 mate_chrstring = mate_chrstring_alloc = Chrnum_to_string(mate_chrnum,chromosome_iit);
1779 }
1780 #elif defined(GSNAP)
1781 /* flag is given as a parameter */
1782 #else
1783 flag = compute_sam_flag_nomate(npaths_primary + npaths_altloc,first_read_p,watsonp,sam_paired_p);
1784 #endif
1785
1786 debug4(printf("Entered SAM_print_pairs with watsonp %d, first_read_p %d, hardclip_low %d, and hardclip_high %d\n",
1787 watsonp,first_read_p,hardclip_low,hardclip_high));
1788
1789 if (watsonp == true) {
1790 hardclip_start = hardclip_low;
1791 hardclip_end = hardclip_high;
1792 } else {
1793 hardclip_start = hardclip_high;
1794 hardclip_end = hardclip_low;
1795 }
1796 debug4(printf("hardclip_start %d, hardclip_end %d\n",hardclip_start,hardclip_end));
1797
1798
1799 /* Because merged_overlap_p is true */
1800 /* clipped_pairarray = pairarray; */
1801 /* clipped_npairs = npairs; */
1802 clipped_pairarray = hardclip_pairarray(&clipped_npairs,hardclip_start,hardclip_end,
1803 pairarray,npairs,querylength_given);
1804 cigar_tokens = compute_cigar(&intronp,&hardclip_start,&hardclip_end,clipped_pairarray,clipped_npairs,querylength_given,
1805 watsonp,chimera_part);
1806 cigar_tokens_alloc = true;
1807
1808
1809 /* Cigar updates hardclip5 and hardclip3 for chimeras */
1810 md_tokens = compute_md_string(&nmismatches_refdiff,&nmismatches_bothdiff,&nindels,
1811 clipped_pairarray,clipped_npairs,watsonp,cigar_tokens);
1812
1813 print_overlap_sam_line(fp,abbrev,acc1,acc2,chrstring,
1814 watsonp,sensedir,cigar_tokens,md_tokens,
1815 nmismatches_refdiff,nmismatches_bothdiff,nindels,
1816 intronp,queryseq_ptr,quality_string,hardclip_start,hardclip_end,
1817 querylength_given,quality_shift,pathnum,npaths_primary,npaths_altloc,
1818 absmq_score,second_absmq,flag,chrpos,chrlength,
1819 queryseq,pair_mapq_score,end_mapq_score,stage3pair,stage3hit,first_read_p,
1820 sam_read_group_id);
1821
1822 /* Print procedures free the character strings */
1823 tokens_free(&md_tokens);
1824 if (cigar_tokens_alloc == true) {
1825 tokens_free(&cigar_tokens);
1826 }
1827
1828 if (chrnum != 0) {
1829 FREE(chrstring);
1830 }
1831
1832 return;
1833 }
1834
1835
1836 void
Simplepair_setup(bool splicingp_in,Univ_IIT_T transcript_iit_in,bool sam_insert_0M_p_in,bool md_lowercase_variant_p_in,bool snps_p_in,bool cigar_extended_p_in,Cigar_action_T cigar_action_in)1837 Simplepair_setup (bool splicingp_in,
1838 Univ_IIT_T transcript_iit_in, bool sam_insert_0M_p_in,
1839 bool md_lowercase_variant_p_in, bool snps_p_in,
1840 bool cigar_extended_p_in, Cigar_action_T cigar_action_in) {
1841
1842 splicingp = splicingp_in;
1843 transcript_iit = transcript_iit_in;
1844
1845 sam_insert_0M_p = sam_insert_0M_p_in;
1846 md_lowercase_variant_p = md_lowercase_variant_p_in;
1847 snps_p = snps_p_in;
1848
1849 cigar_extended_p = cigar_extended_p_in;
1850 cigar_action = cigar_action_in;
1851
1852 return;
1853 }
1854