1 /* bam_md.c -- calmd subcommand.
2
3 Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd.
4 Portions copyright (C) 2009-2011 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <limits.h>
33 #include <errno.h>
34 #include "htslib/faidx.h"
35 #include "htslib/sam.h"
36 #include "htslib/kstring.h"
37 #include "htslib/thread_pool.h"
38 #include "sam_opts.h"
39 #include "samtools.h"
40
41 #define USE_EQUAL 1
42 #define DROP_TAG 2
43 #define BIN_QUAL 4
44 #define UPDATE_NM 8
45 #define UPDATE_MD 16
46 #define HASH_QNM 32
47
48 int bam_aux_drop_other(bam1_t *b, uint8_t *s);
49
bam_fillmd1_core(const char * ref_name,bam1_t * b,char * ref,hts_pos_t ref_len,int flag,int max_nm,int quiet_mode,uint32_t * skipped)50 static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref,
51 hts_pos_t ref_len, int flag, int max_nm,
52 int quiet_mode, uint32_t *skipped)
53 {
54 uint8_t *seq = bam_get_seq(b);
55 uint32_t *cigar = bam_get_cigar(b);
56 bam1_core_t *c = &b->core;
57 int i, qpos, matched = 0;
58 hts_pos_t rpos;
59 kstring_t str = KS_INITIALIZE;
60 int32_t old_nm_i = -1, nm = 0;
61 uint32_t err = 0;
62
63 if (c->l_qseq == 0) {
64 if (!quiet_mode) {
65 if (ref_name) {
66 fprintf(stderr, "[bam_fillmd1] no sequence in alignment "
67 "record for '%s' at %s:%"PRIhts_pos", skipped\n",
68 bam_get_qname(b), ref_name, c->pos + 1);
69 } else {
70 fprintf(stderr, "[bam_fillmd1] no sequence in alignment "
71 "record for '%s', skipped", bam_get_qname(b));
72 }
73 }
74 if (skipped) (*skipped)++;
75 return 0;
76 }
77
78 for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
79 int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
80 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
81 for (j = 0; j < oplen; ++j) {
82 int c1, c2, z = qpos + j;
83 if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
84 break; // out of bounds
85 c1 = bam_seqi(seq, z);
86 c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
87 if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
88 if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
89 ++matched;
90 } else {
91 err |= kputw(matched, &str) < 0;
92 err |= kputc(toupper(ref[rpos+j]), &str) < 0;
93 matched = 0; ++nm;
94 }
95 }
96 if (j < oplen) break;
97 rpos += oplen; qpos += oplen;
98 } else if (op == BAM_CDEL) {
99 err |= kputw(matched, &str) < 0;
100 err |= kputc('^', &str) < 0;
101 for (j = 0; j < oplen; ++j) {
102 if (rpos+j >= ref_len || ref[rpos+j] == '\0') break;
103 err |= kputc(toupper(ref[rpos+j]), &str) < 0;
104 }
105 matched = 0;
106 rpos += j; nm += j;
107 if (j < oplen) break;
108 } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
109 qpos += oplen;
110 if (op == BAM_CINS) nm += oplen;
111 } else if (op == BAM_CREF_SKIP) {
112 rpos += oplen;
113 }
114 }
115 err |= kputw(matched, &str) < 0;
116 if (err) {
117 print_error_errno("calmd", "Couldn't build new MD string");
118 goto fail;
119 }
120 // apply max_nm
121 if (max_nm > 0 && nm >= max_nm) {
122 for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
123 int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
124 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
125 for (j = 0; j < oplen; ++j) {
126 int c1, c2, z = qpos + j;
127 if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
128 break; // out of bounds
129 c1 = bam_seqi(seq, z);
130 c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
131 if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
132 seq[z/2] |= (z&1)? 0x0f : 0xf0;
133 bam_get_qual(b)[z] = 0;
134 }
135 }
136 if (j < oplen) break;
137 rpos += oplen; qpos += oplen;
138 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rpos += oplen;
139 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) qpos += oplen;
140 }
141 }
142 // update NM
143 if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
144 uint8_t *old_nm = bam_aux_get(b, "NM");
145 if (old_nm) old_nm_i = bam_aux2i(old_nm);
146 if (!old_nm) {
147 if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
148 goto aux_fail;
149 }
150 else if (nm != old_nm_i) {
151 if (!quiet_mode) {
152 fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
153 }
154 if (bam_aux_del(b, old_nm) < 0) goto aux_fail;
155 if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
156 goto aux_fail;
157 }
158 }
159 // update MD
160 if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
161 uint8_t *old_md = bam_aux_get(b, "MD");
162 if (!old_md) {
163 if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
164 goto aux_fail;
165 } else {
166 int is_diff = 0;
167 if (strlen((char*)old_md+1) == str.l) {
168 for (i = 0; i < str.l; ++i)
169 if (toupper(old_md[i+1]) != toupper(str.s[i]))
170 break;
171 if (i < str.l) is_diff = 1;
172 } else is_diff = 1;
173 if (is_diff) {
174 if (!quiet_mode) {
175 fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s);
176 }
177 if (bam_aux_del(b, old_md) < 0) goto aux_fail;
178 if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
179 goto aux_fail;
180 }
181 }
182 }
183
184 // drop all tags but RG
185 if (flag&DROP_TAG) {
186 uint8_t *q = bam_aux_get(b, "RG");
187 bam_aux_drop_other(b, q);
188 }
189 // reduce the resolution of base quality
190 if (flag&BIN_QUAL) {
191 uint8_t *qual = bam_get_qual(b);
192 for (i = 0; i < b->core.l_qseq; ++i)
193 if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
194 }
195
196 free(str.s);
197 return 0;
198
199 aux_fail:
200 if (errno == ENOMEM) {
201 print_error("calmd", "Couldn't add aux tag (too long)");
202 } else if (errno == EINVAL) {
203 print_error("calmd", "Corrupt aux data");
204 } else {
205 print_error_errno("calmd", "Couldn't add aux tag");
206 }
207 fail:
208 free(str.s);
209 return -1;
210 }
211
bam_fillmd1(bam1_t * b,char * ref,int flag,int quiet_mode)212 int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
213 {
214 return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL);
215 }
216
calmd_usage()217 int calmd_usage() {
218 fprintf(stderr,
219 "Usage: samtools calmd [-eubrAESQ] <aln.bam> <ref.fasta>\n"
220 "Options:\n"
221 " -e change identical bases to '='\n"
222 " -u uncompressed BAM output (for piping)\n"
223 " -b compressed BAM output\n"
224 " -S ignored (input format is auto-detected)\n"
225 " -A modify the quality string\n"
226 " -Q use quiet mode to output less debug info to stdout\n"
227 " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
228 " -E extended BAQ for better sensitivity but lower specificity\n"
229 " --no-PG do not add a PG line\n");
230
231 sam_global_opt_help(stderr, "-....@-.");
232 return 1;
233 }
234
bam_fillmd(int argc,char * argv[])235 int bam_fillmd(int argc, char *argv[])
236 {
237 int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0;
238 hts_pos_t len;
239 htsThreadPool p = {NULL, 0};
240 samFile *fp = NULL, *fpout = NULL;
241 sam_hdr_t *header = NULL;
242 faidx_t *fai = NULL;
243 char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL;
244 const char *ref_name = NULL;
245 bam1_t *b = NULL;
246 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
247 uint32_t skipped = 0;
248
249 static const struct option lopts[] = {
250 SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
251 {"no-PG", no_argument, NULL, 1},
252 { NULL, 0, NULL, 0 }
253 };
254
255 flt_flag = UPDATE_NM | UPDATE_MD;
256 is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = quiet_mode = 0;
257 strcpy(mode_w, "w");
258 while ((c = getopt_long(argc, argv, "EqQreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) {
259 switch (c) {
260 case 'r': is_realn = 1; break;
261 case 'e': flt_flag |= USE_EQUAL; break;
262 case 'd': flt_flag |= DROP_TAG; break;
263 case 'q': flt_flag |= BIN_QUAL; break;
264 case 'h': flt_flag |= HASH_QNM; break;
265 case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
266 case 'b': is_bam_out = 1; break;
267 case 'u': is_uncompressed = is_bam_out = 1; break;
268 case 'S': break;
269 case 'n': max_nm = atoi(optarg); break;
270 case 'C': capQ = atoi(optarg); break;
271 case 'A': baq_flag |= 1; break;
272 case 'E': baq_flag |= 2; break;
273 case 'Q': quiet_mode = 1; break;
274 case 1: no_pg = 1; break;
275 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
276 fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
277 /* else fall-through */
278 case '?': return calmd_usage();
279 }
280 }
281 if (is_bam_out) strcat(mode_w, "b");
282 else strcat(mode_w, "h");
283 if (is_uncompressed) strcat(mode_w, "0");
284 if (optind + (ga.reference == NULL) >= argc)
285 return calmd_usage();
286 fp = sam_open_format(argv[optind], "r", &ga.in);
287 if (fp == NULL) {
288 print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]);
289 return 1;
290 }
291
292 if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) {
293 print_error("calmd", "failed to create arg_list");
294 return 1;
295 }
296
297 header = sam_hdr_read(fp);
298 if (header == NULL || sam_hdr_nref(header) == 0) {
299 fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
300 goto fail;
301 }
302
303 fpout = sam_open_format("-", mode_w, &ga.out);
304 if (fpout == NULL) {
305 print_error_errno("calmd", "Failed to open output");
306 goto fail;
307 }
308 if (!no_pg && sam_hdr_add_pg(header, "samtools",
309 "VN", samtools_version(),
310 arg_list ? "CL": NULL,
311 arg_list ? arg_list : NULL,
312 NULL)) {
313 print_error("calmd", "failed to add PG line to header");
314 goto fail;
315 }
316 if (sam_hdr_write(fpout, header) < 0) {
317 print_error_errno("calmd", "Failed to write sam header");
318 goto fail;
319 }
320
321 if (ga.nthreads > 0) {
322 if (!(p.pool = hts_tpool_init(ga.nthreads))) {
323 fprintf(stderr, "Error creating thread pool\n");
324 goto fail;
325 }
326 hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
327 hts_set_opt(fpout, HTS_OPT_THREAD_POOL, &p);
328 }
329
330 ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
331 fai = fai_load(ref_file);
332
333 if (!fai) {
334 print_error_errno("calmd", "Failed to open reference file '%s'", ref_file);
335 goto fail;
336 }
337
338 b = bam_init1();
339 if (!b) {
340 fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n");
341 goto fail;
342 }
343 while ((ret = sam_read1(fp, header, b)) >= 0) {
344 if (b->core.tid >= 0) {
345 if (tid != b->core.tid) {
346 free(ref);
347 ref = NULL;
348 len = 0;
349 ref_name = sam_hdr_tid2name(header, b->core.tid);
350 if (ref_name) {
351 ref = fai_fetch64(fai, ref_name, &len);
352 }
353 tid = b->core.tid;
354 if (ref == 0) { // FIXME: Should this always be fatal?
355 fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
356 ref_name ? ref_name : "(unknown)");
357 if (is_realn || capQ > 10) goto fail; // Would otherwise crash
358 }
359 }
360 if (is_realn) {
361 if (sam_prob_realn(b, ref, len, baq_flag) < -3) {
362 print_error_errno("calmd", "BAQ alignment failed");
363 goto fail;
364 }
365 }
366 if (capQ > 10) {
367 int q = sam_cap_mapq(b, ref, len, capQ);
368 if (b->core.qual > q) b->core.qual = q;
369 }
370 if (ref) {
371 if (bam_fillmd1_core(ref_name, b, ref, len, flt_flag, max_nm,
372 quiet_mode, &skipped) < 0)
373 goto fail;
374 }
375 }
376 if (sam_write1(fpout, header, b) < 0) {
377 print_error_errno("calmd", "failed to write to output file");
378 goto fail;
379 }
380 }
381 if (ret < -1) {
382 fprintf(stderr, "[bam_fillmd] Error reading input.\n");
383 goto fail;
384 }
385
386 if (skipped) {
387 fprintf(stderr, "[calmd] Warning: %"PRIu32" records skipped due "
388 "to no query sequence\n",
389 skipped);
390 }
391
392 bam_destroy1(b);
393 sam_hdr_destroy(header);
394
395 free(arg_list);
396 free(ref);
397 fai_destroy(fai);
398 sam_close(fp);
399 if (sam_close(fpout) < 0) {
400 fprintf(stderr, "[bam_fillmd] error when closing output file\n");
401 return 1;
402 }
403 if (p.pool) hts_tpool_destroy(p.pool);
404
405 return 0;
406
407 fail:
408 free(arg_list);
409 free(ref);
410 if (b) bam_destroy1(b);
411 if (header) sam_hdr_destroy(header);
412 if (fai) fai_destroy(fai);
413 if (fp) sam_close(fp);
414 if (fpout) sam_close(fpout);
415 if (p.pool) hts_tpool_destroy(p.pool);
416
417 return 1;
418 }
419