1 /*  bam_mate.c -- fix mate pairing information and clean up flags.
2 
3     Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd.
4     Portions copyright (C) 2011 Broad Institute.
5     Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
6 
7     Author: Heng Li <lh3@sanger.ac.uk>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notices and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE.  */
26 
27 #include <config.h>
28 
29 #include <assert.h>
30 #include <stdbool.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include "htslib/thread_pool.h"
35 #include "sam_opts.h"
36 #include "htslib/kstring.h"
37 #include "htslib/sam.h"
38 #include "samtools.h"
39 
40 
41 #define MD_MIN_QUALITY 15
42 
43 /*
44  * This function calculates ct tag for two bams, it assumes they are from the same template and
45  * writes the tag to the first read in position terms.
46  */
bam_template_cigar(bam1_t * b1,bam1_t * b2,kstring_t * str)47 static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
48 {
49     bam1_t *swap;
50     int i;
51     hts_pos_t end;
52     uint32_t *cigar;
53     str->l = 0;
54     if (b1->core.tid != b2->core.tid || b1->core.tid < 0 || b1->core.pos < 0 || b2->core.pos < 0 || b1->core.flag&BAM_FUNMAP || b2->core.flag&BAM_FUNMAP) return; // coordinateless or not on the same chr; skip
55     if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate
56     kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
57     kputc((b1->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
58     for (i = 0, cigar = bam_get_cigar(b1); i < b1->core.n_cigar; ++i) {
59         kputw(bam_cigar_oplen(cigar[i]), str);
60         kputc(bam_cigar_opchr(cigar[i]), str);
61     }
62     end = bam_endpos(b1);
63     kputw(b2->core.pos - end, str);
64     kputc('T', str);
65     kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
66     kputc((b2->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
67     for (i = 0, cigar = bam_get_cigar(b2); i < b2->core.n_cigar; ++i) {
68         kputw(bam_cigar_oplen(cigar[i]), str);
69         kputc(bam_cigar_opchr(cigar[i]), str);
70     }
71 
72     uint8_t* data;
73     if ((data = bam_aux_get(b1,"ct")) != NULL) bam_aux_del(b1, data);
74     if ((data = bam_aux_get(b2,"ct")) != NULL) bam_aux_del(b2, data);
75 
76     bam_aux_append(b1, "ct", 'Z', str->l+1, (uint8_t*)str->s);
77 }
78 
79 /*
80  * What This Program is Supposed To Do:
81  * Fill in mate coordinates, ISIZE and mate related flags from a name-sorted
82  * alignment.
83  *
84  * How We Handle Input
85  *
86  * Secondary and supplementary Reads:
87  * -write to output unchanged
88  * All Reads:
89  * -if pos == 0 (1 based), tid == -1 set UNMAPPED flag
90  * single Reads:
91  * -if pos == 0 (1 based), tid == -1, or UNMAPPED then set UNMAPPED, pos = 0,
92  *  tid = -1
93  * -clear bad flags (PAIRED, MREVERSE, PROPER_PAIR)
94  * -set mpos = 0 (1 based), mtid = -1 and isize = 0
95  * -write to output
96  * Paired Reads:
97  * -if read is unmapped and mate is not, set pos and tid to equal that of mate
98  * -sync mate flags (MREVERSE, MUNMAPPED), mpos, mtid
99  * -recalculate ISIZE if possible, otherwise set it to 0
100  * -optionally clear PROPER_PAIR flag from reads where mapping or orientation
101  *  indicate this is not possible (Illumina orientation only)
102  * -calculate ct and apply to lowest positioned read
103  * -write to output
104  * Limitations
105  * -Does not handle tandem reads
106  * -Should mark supplementary reads the same as primary.
107  * Notes
108  * -CT definition appears to be something else in spec, this was in here before
109  *  I started tampering with it, anyone know what is going on here? To work
110  *  around this I have demoted the CT this tool generates to ct.
111  */
112 
sync_unmapped_pos_inner(bam1_t * src,bam1_t * dest)113 static void sync_unmapped_pos_inner(bam1_t* src, bam1_t* dest) {
114     if ((dest->core.flag & BAM_FUNMAP) && !(src->core.flag & BAM_FUNMAP)) {
115         // Set unmapped read's RNAME and POS to those of its mapped mate
116         // (recommended best practice, ensures if coord sort will be together)
117         dest->core.tid = src->core.tid;
118         dest->core.pos = src->core.pos;
119     }
120 }
121 
sync_mate_inner(bam1_t * src,bam1_t * dest)122 static void sync_mate_inner(bam1_t* src, bam1_t* dest)
123 {
124     // sync mate pos information
125     dest->core.mtid = src->core.tid; dest->core.mpos = src->core.pos;
126     // sync flag info
127     if (src->core.flag&BAM_FREVERSE)
128         dest->core.flag |= BAM_FMREVERSE;
129     else
130         dest->core.flag &= ~BAM_FMREVERSE;
131     if (src->core.flag & BAM_FUNMAP) {
132         dest->core.flag |= BAM_FMUNMAP;
133     }
134 }
135 
136 // Is it plausible that these reads are properly paired?
137 // Can't really give definitive answer without checking isize
plausibly_properly_paired(bam1_t * a,bam1_t * b)138 static bool plausibly_properly_paired(bam1_t* a, bam1_t* b)
139 {
140     if ((a->core.flag & BAM_FUNMAP) || (b->core.flag & BAM_FUNMAP)) return false;
141     assert(a->core.tid >= 0); // This should never happen if FUNMAP is set correctly
142 
143     if (a->core.tid != b->core.tid) return false;
144 
145     bam1_t* first = a;
146     bam1_t* second = b;
147     hts_pos_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos;
148     hts_pos_t  b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos;
149     if (a_pos > b_pos) {
150         first = b;
151         second = a;
152     } else {
153         first = a;
154         second = b;
155     }
156 
157     if (!(first->core.flag&BAM_FREVERSE) && (second->core.flag&BAM_FREVERSE))
158         return true;
159     else
160         return false;
161 }
162 
163 // Returns 0 on success, -1 on failure.
bam_format_cigar(const bam1_t * b,kstring_t * str)164 static int bam_format_cigar(const bam1_t* b, kstring_t* str)
165 {
166     // An empty cigar is a special case return "*" rather than ""
167     if (b->core.n_cigar == 0) {
168         return (kputc('*', str) == EOF) ? -1 : 0;
169     }
170 
171     const uint32_t *cigar = bam_get_cigar(b);
172     uint32_t i;
173 
174     for (i = 0; i < b->core.n_cigar; ++i) {
175         if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1;
176         if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1;
177     }
178 
179     return 0;
180 }
181 
182 // Returns 0 on success, -1 on failure.
sync_mq_mc(bam1_t * src,bam1_t * dest)183 static int sync_mq_mc(bam1_t* src, bam1_t* dest)
184 {
185     if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped
186         // Copy Mate Mapping Quality
187         uint32_t mq = src->core.qual;
188         uint8_t* data;
189         if ((data = bam_aux_get(dest,"MQ")) != NULL) {
190             bam_aux_del(dest, data);
191         }
192 
193         bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq);
194     }
195     // Copy mate cigar if either read is mapped
196     if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) {
197         uint8_t* data_mc;
198         if ((data_mc = bam_aux_get(dest,"MC")) != NULL) {
199             bam_aux_del(dest, data_mc);
200         }
201 
202         // Convert cigar to string
203         kstring_t mc = { 0, 0, NULL };
204         if (bam_format_cigar(src, &mc) < 0) return -1;
205 
206         bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc));
207         free(mc.s);
208     }
209     return 0;
210 }
211 
212 // Copy flags.
213 // Returns 0 on success, -1 on failure.
sync_mate(bam1_t * a,bam1_t * b)214 static int sync_mate(bam1_t* a, bam1_t* b)
215 {
216     sync_unmapped_pos_inner(a,b);
217     sync_unmapped_pos_inner(b,a);
218     sync_mate_inner(a,b);
219     sync_mate_inner(b,a);
220     if (sync_mq_mc(a,b) < 0) return -1;
221     if (sync_mq_mc(b,a) < 0) return -1;
222     return 0;
223 }
224 
225 
calc_mate_score(bam1_t * b)226 static uint32_t calc_mate_score(bam1_t *b)
227 {
228     uint32_t score = 0;
229     uint8_t  *qual = bam_get_qual(b);
230     int i;
231 
232     for (i = 0; i < b->core.l_qseq; i++) {
233         if (qual[i] >= MD_MIN_QUALITY) score += qual[i];
234     }
235 
236     return score;
237 }
238 
239 
add_mate_score(bam1_t * src,bam1_t * dest)240 static int add_mate_score(bam1_t *src, bam1_t *dest)
241 {
242     uint8_t *data_ms;
243     uint32_t mate_score = calc_mate_score(src);
244 
245     if ((data_ms = bam_aux_get(dest, "ms")) != NULL) {
246         bam_aux_del(dest, data_ms);
247     }
248 
249     if (bam_aux_append(dest, "ms", 'i', sizeof(uint32_t), (uint8_t*)&mate_score) == -1) {
250         return -1;
251     }
252 
253     return 0;
254 }
255 
256 // currently, this function ONLY works if each read has one hit
bam_mating_core(samFile * in,samFile * out,int remove_reads,int proper_pair_check,int add_ct,int do_mate_scoring,char * arg_list,int no_pg)257 static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg)
258 {
259     sam_hdr_t *header;
260     bam1_t *b[2] = { NULL, NULL };
261     int curr, has_prev, result;
262     hts_pos_t pre_end = 0, cur_end = 0;
263     kstring_t str = KS_INITIALIZE;
264 
265     header = sam_hdr_read(in);
266     if (header == NULL) {
267         fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n");
268         return 1;
269     }
270 
271     // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted.
272     if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "coordinate")) {
273         fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n");
274         goto fail;
275     }
276     ks_free(&str);
277 
278     if (!no_pg && sam_hdr_add_pg(header, "samtools",
279                                  "VN", samtools_version(),
280                                  arg_list ? "CL": NULL,
281                                  arg_list ? arg_list : NULL,
282                                  NULL))
283         goto fail;
284 
285     if (sam_hdr_write(out, header) < 0) goto write_fail;
286 
287     b[0] = bam_init1();
288     b[1] = bam_init1();
289     curr = 0; has_prev = 0;
290     while ((result = sam_read1(in, header, b[curr])) >= 0) {
291         bam1_t *cur = b[curr], *pre = b[1-curr];
292         if (cur->core.flag & BAM_FSECONDARY)
293         {
294             if ( !remove_reads ) {
295                 if (sam_write1(out, header, cur) < 0) goto write_fail;
296             }
297             continue; // skip secondary alignments
298         }
299         if (cur->core.flag & BAM_FSUPPLEMENTARY)
300         {
301             if (sam_write1(out, header, cur) < 0) goto write_fail;
302             continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
303         }
304         if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag
305         {
306             cur->core.flag |= BAM_FUNMAP;
307         }
308         if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end
309         {
310             cur_end = bam_endpos(cur);
311 
312             // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag
313             if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP;
314         }
315         if (has_prev) { // do we have a pair of reads to examine?
316             if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
317                 pre->core.flag |= BAM_FPAIRED;
318                 cur->core.flag |= BAM_FPAIRED;
319                 if (sync_mate(pre, cur)) goto fail;
320 
321                 if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
322                     && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
323                 {
324                     hts_pos_t cur5, pre5;
325                     cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
326                     pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
327                     cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
328                 } else cur->core.isize = pre->core.isize = 0;
329                 if (add_ct) bam_template_cigar(pre, cur, &str);
330                 // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution
331                 if (proper_pair_check && !plausibly_properly_paired(pre,cur)) {
332                     pre->core.flag &= ~BAM_FPROPER_PAIR;
333                     cur->core.flag &= ~BAM_FPROPER_PAIR;
334                 }
335 
336                 if (do_mate_scoring) {
337                     if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) {
338                         fprintf(stderr, "[bam_mating_core] ERROR: unable to add mate score.\n");
339                         goto fail;
340                     }
341                 }
342 
343                 // Write out result
344                 if ( !remove_reads ) {
345                     if (sam_write1(out, header, pre) < 0) goto write_fail;
346                     if (sam_write1(out, header, cur) < 0) goto write_fail;
347                 } else {
348                     // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags
349                     if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
350                     if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
351                     if(!(pre->core.flag&BAM_FUNMAP)) {
352                         if (sam_write1(out, header, pre) < 0) goto write_fail;
353                     }
354                     if(!(cur->core.flag&BAM_FUNMAP)) {
355                         if (sam_write1(out, header, cur) < 0) goto write_fail;
356                     }
357                 }
358                 has_prev = 0;
359             } else { // unpaired?  clear bad info and write it out
360                 if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
361                     pre->core.flag |= BAM_FUNMAP;
362                     pre->core.tid = -1;
363                     pre->core.pos = -1;
364                 }
365                 pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
366                 pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
367                 if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) {
368                     if (sam_write1(out, header, pre) < 0) goto write_fail;
369                 }
370             }
371         } else has_prev = 1;
372         curr = 1 - curr;
373         pre_end = cur_end;
374     }
375     if (result < -1) goto read_fail;
376     if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
377         bam1_t *pre = b[1-curr];
378         if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
379             pre->core.flag |= BAM_FUNMAP;
380             pre->core.tid = -1;
381             pre->core.pos = -1;
382         }
383         pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
384         pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
385 
386         if (sam_write1(out, header, pre) < 0) goto write_fail;
387     }
388     sam_hdr_destroy(header);
389     bam_destroy1(b[0]);
390     bam_destroy1(b[1]);
391     ks_free(&str);
392     return 0;
393 
394  read_fail:
395     print_error("fixmate", "Couldn't read from input file");
396     goto fail;
397 
398  write_fail:
399     print_error_errno("fixmate", "Couldn't write to output file");
400  fail:
401     sam_hdr_destroy(header);
402     bam_destroy1(b[0]);
403     bam_destroy1(b[1]);
404     ks_free(&str);
405     return 1;
406 }
407 
usage(FILE * where)408 void usage(FILE* where)
409 {
410     fprintf(where,
411 "Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n"
412 "Options:\n"
413 "  -r           Remove unmapped reads and secondary alignments\n"
414 "  -p           Disable FR proper pair check\n"
415 "  -c           Add template cigar ct tag\n"
416 "  -m           Add mate score tag\n"
417 "  -u           Uncompressed output\n"
418 "  --no-PG      do not add a PG line\n");
419 
420     sam_global_opt_help(where, "-.O..@-.");
421 
422     fprintf(where,
423 "\n"
424 "As elsewhere in samtools, use '-' as the filename for stdin/stdout. The input\n"
425 "file must be grouped by read name (e.g. sorted by name). Coordinated sorted\n"
426 "input is not accepted.\n");
427 }
428 
bam_mating(int argc,char * argv[])429 int bam_mating(int argc, char *argv[])
430 {
431     htsThreadPool p = {NULL, 0};
432     samFile *in = NULL, *out = NULL;
433     int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0;
434     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
435     char wmode[4] = {'w', 'b', 0, 0};
436     static const struct option lopts[] = {
437         SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
438         {"no-PG", no_argument, NULL, 1},
439         { NULL, 0, NULL, 0 }
440     };
441     char *arg_list = NULL;
442 
443     // parse args
444     if (argc == 1) { usage(stdout); return 0; }
445     while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) {
446         switch (c) {
447             case 'r': remove_reads = 1; break;
448             case 'p': proper_pair_check = 0; break;
449             case 'c': add_ct = 1; break;
450             case 'm': mate_score = 1; break;
451             case 'u': wmode[2] = '0'; break;
452             case 1: no_pg = 1; break;
453             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
454                       /* else fall-through */
455             case '?': usage(stderr); goto fail;
456         }
457     }
458     if (optind+1 >= argc) { usage(stderr); goto fail; }
459 
460     if (!no_pg && !(arg_list =  stringify_argv(argc+1, argv-1)))
461         goto fail;
462 
463     // init
464     if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
465         print_error_errno("fixmate", "cannot open input file");
466         goto fail;
467     }
468     sam_open_mode(wmode+1, argv[optind+1], NULL);
469     if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) {
470         print_error_errno("fixmate", "cannot open output file");
471         goto fail;
472     }
473 
474     if (ga.nthreads > 0) {
475         if (!(p.pool = hts_tpool_init(ga.nthreads))) {
476             fprintf(stderr, "Error creating thread pool\n");
477             goto fail;
478         }
479         hts_set_opt(in,  HTS_OPT_THREAD_POOL, &p);
480         hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
481     }
482 
483     // run
484     res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg);
485 
486     // cleanup
487     sam_close(in);
488     if (sam_close(out) < 0) {
489         fprintf(stderr, "[bam_mating] error while closing output file\n");
490         res = 1;
491     }
492 
493     if (p.pool) hts_tpool_destroy(p.pool);
494     free(arg_list);
495     sam_global_args_free(&ga);
496     return res;
497 
498  fail:
499     if (in) sam_close(in);
500     if (out) sam_close(out);
501     if (p.pool) hts_tpool_destroy(p.pool);
502     free(arg_list);
503     sam_global_args_free(&ga);
504     return 1;
505 }
506 
507 
508