1 /* bam_mate.c -- fix mate pairing information and clean up flags.
2
3 Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd.
4 Portions copyright (C) 2011 Broad Institute.
5 Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
6
7 Author: Heng Li <lh3@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notices and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
26
27 #include <config.h>
28
29 #include <assert.h>
30 #include <stdbool.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include "htslib/thread_pool.h"
35 #include "sam_opts.h"
36 #include "htslib/kstring.h"
37 #include "htslib/sam.h"
38 #include "samtools.h"
39
40
41 #define MD_MIN_QUALITY 15
42
43 /*
44 * This function calculates ct tag for two bams, it assumes they are from the same template and
45 * writes the tag to the first read in position terms.
46 */
bam_template_cigar(bam1_t * b1,bam1_t * b2,kstring_t * str)47 static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
48 {
49 bam1_t *swap;
50 int i;
51 hts_pos_t end;
52 uint32_t *cigar;
53 str->l = 0;
54 if (b1->core.tid != b2->core.tid || b1->core.tid < 0 || b1->core.pos < 0 || b2->core.pos < 0 || b1->core.flag&BAM_FUNMAP || b2->core.flag&BAM_FUNMAP) return; // coordinateless or not on the same chr; skip
55 if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate
56 kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
57 kputc((b1->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
58 for (i = 0, cigar = bam_get_cigar(b1); i < b1->core.n_cigar; ++i) {
59 kputw(bam_cigar_oplen(cigar[i]), str);
60 kputc(bam_cigar_opchr(cigar[i]), str);
61 }
62 end = bam_endpos(b1);
63 kputw(b2->core.pos - end, str);
64 kputc('T', str);
65 kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
66 kputc((b2->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
67 for (i = 0, cigar = bam_get_cigar(b2); i < b2->core.n_cigar; ++i) {
68 kputw(bam_cigar_oplen(cigar[i]), str);
69 kputc(bam_cigar_opchr(cigar[i]), str);
70 }
71
72 uint8_t* data;
73 if ((data = bam_aux_get(b1,"ct")) != NULL) bam_aux_del(b1, data);
74 if ((data = bam_aux_get(b2,"ct")) != NULL) bam_aux_del(b2, data);
75
76 bam_aux_append(b1, "ct", 'Z', str->l+1, (uint8_t*)str->s);
77 }
78
79 /*
80 * What This Program is Supposed To Do:
81 * Fill in mate coordinates, ISIZE and mate related flags from a name-sorted
82 * alignment.
83 *
84 * How We Handle Input
85 *
86 * Secondary and supplementary Reads:
87 * -write to output unchanged
88 * All Reads:
89 * -if pos == 0 (1 based), tid == -1 set UNMAPPED flag
90 * single Reads:
91 * -if pos == 0 (1 based), tid == -1, or UNMAPPED then set UNMAPPED, pos = 0,
92 * tid = -1
93 * -clear bad flags (PAIRED, MREVERSE, PROPER_PAIR)
94 * -set mpos = 0 (1 based), mtid = -1 and isize = 0
95 * -write to output
96 * Paired Reads:
97 * -if read is unmapped and mate is not, set pos and tid to equal that of mate
98 * -sync mate flags (MREVERSE, MUNMAPPED), mpos, mtid
99 * -recalculate ISIZE if possible, otherwise set it to 0
100 * -optionally clear PROPER_PAIR flag from reads where mapping or orientation
101 * indicate this is not possible (Illumina orientation only)
102 * -calculate ct and apply to lowest positioned read
103 * -write to output
104 * Limitations
105 * -Does not handle tandem reads
106 * -Should mark supplementary reads the same as primary.
107 * Notes
108 * -CT definition appears to be something else in spec, this was in here before
109 * I started tampering with it, anyone know what is going on here? To work
110 * around this I have demoted the CT this tool generates to ct.
111 */
112
sync_unmapped_pos_inner(bam1_t * src,bam1_t * dest)113 static void sync_unmapped_pos_inner(bam1_t* src, bam1_t* dest) {
114 if ((dest->core.flag & BAM_FUNMAP) && !(src->core.flag & BAM_FUNMAP)) {
115 // Set unmapped read's RNAME and POS to those of its mapped mate
116 // (recommended best practice, ensures if coord sort will be together)
117 dest->core.tid = src->core.tid;
118 dest->core.pos = src->core.pos;
119 }
120 }
121
sync_mate_inner(bam1_t * src,bam1_t * dest)122 static void sync_mate_inner(bam1_t* src, bam1_t* dest)
123 {
124 // sync mate pos information
125 dest->core.mtid = src->core.tid; dest->core.mpos = src->core.pos;
126 // sync flag info
127 if (src->core.flag&BAM_FREVERSE)
128 dest->core.flag |= BAM_FMREVERSE;
129 else
130 dest->core.flag &= ~BAM_FMREVERSE;
131 if (src->core.flag & BAM_FUNMAP) {
132 dest->core.flag |= BAM_FMUNMAP;
133 }
134 }
135
136 // Is it plausible that these reads are properly paired?
137 // Can't really give definitive answer without checking isize
plausibly_properly_paired(bam1_t * a,bam1_t * b)138 static bool plausibly_properly_paired(bam1_t* a, bam1_t* b)
139 {
140 if ((a->core.flag & BAM_FUNMAP) || (b->core.flag & BAM_FUNMAP)) return false;
141 assert(a->core.tid >= 0); // This should never happen if FUNMAP is set correctly
142
143 if (a->core.tid != b->core.tid) return false;
144
145 bam1_t* first = a;
146 bam1_t* second = b;
147 hts_pos_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos;
148 hts_pos_t b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos;
149 if (a_pos > b_pos) {
150 first = b;
151 second = a;
152 } else {
153 first = a;
154 second = b;
155 }
156
157 if (!(first->core.flag&BAM_FREVERSE) && (second->core.flag&BAM_FREVERSE))
158 return true;
159 else
160 return false;
161 }
162
163 // Returns 0 on success, -1 on failure.
bam_format_cigar(const bam1_t * b,kstring_t * str)164 static int bam_format_cigar(const bam1_t* b, kstring_t* str)
165 {
166 // An empty cigar is a special case return "*" rather than ""
167 if (b->core.n_cigar == 0) {
168 return (kputc('*', str) == EOF) ? -1 : 0;
169 }
170
171 const uint32_t *cigar = bam_get_cigar(b);
172 uint32_t i;
173
174 for (i = 0; i < b->core.n_cigar; ++i) {
175 if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1;
176 if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1;
177 }
178
179 return 0;
180 }
181
182 // Returns 0 on success, -1 on failure.
sync_mq_mc(bam1_t * src,bam1_t * dest)183 static int sync_mq_mc(bam1_t* src, bam1_t* dest)
184 {
185 if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped
186 // Copy Mate Mapping Quality
187 uint32_t mq = src->core.qual;
188 uint8_t* data;
189 if ((data = bam_aux_get(dest,"MQ")) != NULL) {
190 bam_aux_del(dest, data);
191 }
192
193 bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq);
194 }
195 // Copy mate cigar if either read is mapped
196 if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) {
197 uint8_t* data_mc;
198 if ((data_mc = bam_aux_get(dest,"MC")) != NULL) {
199 bam_aux_del(dest, data_mc);
200 }
201
202 // Convert cigar to string
203 kstring_t mc = { 0, 0, NULL };
204 if (bam_format_cigar(src, &mc) < 0) return -1;
205
206 bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc));
207 free(mc.s);
208 }
209 return 0;
210 }
211
212 // Copy flags.
213 // Returns 0 on success, -1 on failure.
sync_mate(bam1_t * a,bam1_t * b)214 static int sync_mate(bam1_t* a, bam1_t* b)
215 {
216 sync_unmapped_pos_inner(a,b);
217 sync_unmapped_pos_inner(b,a);
218 sync_mate_inner(a,b);
219 sync_mate_inner(b,a);
220 if (sync_mq_mc(a,b) < 0) return -1;
221 if (sync_mq_mc(b,a) < 0) return -1;
222 return 0;
223 }
224
225
calc_mate_score(bam1_t * b)226 static uint32_t calc_mate_score(bam1_t *b)
227 {
228 uint32_t score = 0;
229 uint8_t *qual = bam_get_qual(b);
230 int i;
231
232 for (i = 0; i < b->core.l_qseq; i++) {
233 if (qual[i] >= MD_MIN_QUALITY) score += qual[i];
234 }
235
236 return score;
237 }
238
239
add_mate_score(bam1_t * src,bam1_t * dest)240 static int add_mate_score(bam1_t *src, bam1_t *dest)
241 {
242 uint8_t *data_ms;
243 uint32_t mate_score = calc_mate_score(src);
244
245 if ((data_ms = bam_aux_get(dest, "ms")) != NULL) {
246 bam_aux_del(dest, data_ms);
247 }
248
249 if (bam_aux_append(dest, "ms", 'i', sizeof(uint32_t), (uint8_t*)&mate_score) == -1) {
250 return -1;
251 }
252
253 return 0;
254 }
255
256 // currently, this function ONLY works if each read has one hit
bam_mating_core(samFile * in,samFile * out,int remove_reads,int proper_pair_check,int add_ct,int do_mate_scoring,char * arg_list,int no_pg)257 static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg)
258 {
259 sam_hdr_t *header;
260 bam1_t *b[2] = { NULL, NULL };
261 int curr, has_prev, result;
262 hts_pos_t pre_end = 0, cur_end = 0;
263 kstring_t str = KS_INITIALIZE;
264
265 header = sam_hdr_read(in);
266 if (header == NULL) {
267 fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n");
268 return 1;
269 }
270
271 // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted.
272 if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "coordinate")) {
273 fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n");
274 goto fail;
275 }
276 ks_free(&str);
277
278 if (!no_pg && sam_hdr_add_pg(header, "samtools",
279 "VN", samtools_version(),
280 arg_list ? "CL": NULL,
281 arg_list ? arg_list : NULL,
282 NULL))
283 goto fail;
284
285 if (sam_hdr_write(out, header) < 0) goto write_fail;
286
287 b[0] = bam_init1();
288 b[1] = bam_init1();
289 curr = 0; has_prev = 0;
290 while ((result = sam_read1(in, header, b[curr])) >= 0) {
291 bam1_t *cur = b[curr], *pre = b[1-curr];
292 if (cur->core.flag & BAM_FSECONDARY)
293 {
294 if ( !remove_reads ) {
295 if (sam_write1(out, header, cur) < 0) goto write_fail;
296 }
297 continue; // skip secondary alignments
298 }
299 if (cur->core.flag & BAM_FSUPPLEMENTARY)
300 {
301 if (sam_write1(out, header, cur) < 0) goto write_fail;
302 continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
303 }
304 if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag
305 {
306 cur->core.flag |= BAM_FUNMAP;
307 }
308 if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end
309 {
310 cur_end = bam_endpos(cur);
311
312 // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag
313 if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP;
314 }
315 if (has_prev) { // do we have a pair of reads to examine?
316 if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
317 pre->core.flag |= BAM_FPAIRED;
318 cur->core.flag |= BAM_FPAIRED;
319 if (sync_mate(pre, cur)) goto fail;
320
321 if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
322 && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
323 {
324 hts_pos_t cur5, pre5;
325 cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
326 pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
327 cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
328 } else cur->core.isize = pre->core.isize = 0;
329 if (add_ct) bam_template_cigar(pre, cur, &str);
330 // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution
331 if (proper_pair_check && !plausibly_properly_paired(pre,cur)) {
332 pre->core.flag &= ~BAM_FPROPER_PAIR;
333 cur->core.flag &= ~BAM_FPROPER_PAIR;
334 }
335
336 if (do_mate_scoring) {
337 if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) {
338 fprintf(stderr, "[bam_mating_core] ERROR: unable to add mate score.\n");
339 goto fail;
340 }
341 }
342
343 // Write out result
344 if ( !remove_reads ) {
345 if (sam_write1(out, header, pre) < 0) goto write_fail;
346 if (sam_write1(out, header, cur) < 0) goto write_fail;
347 } else {
348 // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags
349 if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
350 if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
351 if(!(pre->core.flag&BAM_FUNMAP)) {
352 if (sam_write1(out, header, pre) < 0) goto write_fail;
353 }
354 if(!(cur->core.flag&BAM_FUNMAP)) {
355 if (sam_write1(out, header, cur) < 0) goto write_fail;
356 }
357 }
358 has_prev = 0;
359 } else { // unpaired? clear bad info and write it out
360 if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
361 pre->core.flag |= BAM_FUNMAP;
362 pre->core.tid = -1;
363 pre->core.pos = -1;
364 }
365 pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
366 pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
367 if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) {
368 if (sam_write1(out, header, pre) < 0) goto write_fail;
369 }
370 }
371 } else has_prev = 1;
372 curr = 1 - curr;
373 pre_end = cur_end;
374 }
375 if (result < -1) goto read_fail;
376 if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
377 bam1_t *pre = b[1-curr];
378 if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
379 pre->core.flag |= BAM_FUNMAP;
380 pre->core.tid = -1;
381 pre->core.pos = -1;
382 }
383 pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
384 pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
385
386 if (sam_write1(out, header, pre) < 0) goto write_fail;
387 }
388 sam_hdr_destroy(header);
389 bam_destroy1(b[0]);
390 bam_destroy1(b[1]);
391 ks_free(&str);
392 return 0;
393
394 read_fail:
395 print_error("fixmate", "Couldn't read from input file");
396 goto fail;
397
398 write_fail:
399 print_error_errno("fixmate", "Couldn't write to output file");
400 fail:
401 sam_hdr_destroy(header);
402 bam_destroy1(b[0]);
403 bam_destroy1(b[1]);
404 ks_free(&str);
405 return 1;
406 }
407
usage(FILE * where)408 void usage(FILE* where)
409 {
410 fprintf(where,
411 "Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n"
412 "Options:\n"
413 " -r Remove unmapped reads and secondary alignments\n"
414 " -p Disable FR proper pair check\n"
415 " -c Add template cigar ct tag\n"
416 " -m Add mate score tag\n"
417 " -u Uncompressed output\n"
418 " --no-PG do not add a PG line\n");
419
420 sam_global_opt_help(where, "-.O..@-.");
421
422 fprintf(where,
423 "\n"
424 "As elsewhere in samtools, use '-' as the filename for stdin/stdout. The input\n"
425 "file must be grouped by read name (e.g. sorted by name). Coordinated sorted\n"
426 "input is not accepted.\n");
427 }
428
bam_mating(int argc,char * argv[])429 int bam_mating(int argc, char *argv[])
430 {
431 htsThreadPool p = {NULL, 0};
432 samFile *in = NULL, *out = NULL;
433 int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0;
434 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
435 char wmode[4] = {'w', 'b', 0, 0};
436 static const struct option lopts[] = {
437 SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
438 {"no-PG", no_argument, NULL, 1},
439 { NULL, 0, NULL, 0 }
440 };
441 char *arg_list = NULL;
442
443 // parse args
444 if (argc == 1) { usage(stdout); return 0; }
445 while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) {
446 switch (c) {
447 case 'r': remove_reads = 1; break;
448 case 'p': proper_pair_check = 0; break;
449 case 'c': add_ct = 1; break;
450 case 'm': mate_score = 1; break;
451 case 'u': wmode[2] = '0'; break;
452 case 1: no_pg = 1; break;
453 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
454 /* else fall-through */
455 case '?': usage(stderr); goto fail;
456 }
457 }
458 if (optind+1 >= argc) { usage(stderr); goto fail; }
459
460 if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1)))
461 goto fail;
462
463 // init
464 if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
465 print_error_errno("fixmate", "cannot open input file");
466 goto fail;
467 }
468 sam_open_mode(wmode+1, argv[optind+1], NULL);
469 if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) {
470 print_error_errno("fixmate", "cannot open output file");
471 goto fail;
472 }
473
474 if (ga.nthreads > 0) {
475 if (!(p.pool = hts_tpool_init(ga.nthreads))) {
476 fprintf(stderr, "Error creating thread pool\n");
477 goto fail;
478 }
479 hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
480 hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
481 }
482
483 // run
484 res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg);
485
486 // cleanup
487 sam_close(in);
488 if (sam_close(out) < 0) {
489 fprintf(stderr, "[bam_mating] error while closing output file\n");
490 res = 1;
491 }
492
493 if (p.pool) hts_tpool_destroy(p.pool);
494 free(arg_list);
495 sam_global_args_free(&ga);
496 return res;
497
498 fail:
499 if (in) sam_close(in);
500 if (out) sam_close(out);
501 if (p.pool) hts_tpool_destroy(p.pool);
502 free(arg_list);
503 sam_global_args_free(&ga);
504 return 1;
505 }
506
507
508