1 /*  bam_md.c -- calmd subcommand.
2 
3     Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd.
4     Portions copyright (C) 2009-2011 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <limits.h>
33 #include <errno.h>
34 #include "htslib/faidx.h"
35 #include "htslib/sam.h"
36 #include "htslib/kstring.h"
37 #include "htslib/thread_pool.h"
38 #include "sam_opts.h"
39 #include "samtools.h"
40 
41 #define USE_EQUAL 1
42 #define DROP_TAG  2
43 #define BIN_QUAL  4
44 #define UPDATE_NM 8
45 #define UPDATE_MD 16
46 #define HASH_QNM  32
47 
48 int bam_aux_drop_other(bam1_t *b, uint8_t *s);
49 
bam_fillmd1_core(const char * ref_name,bam1_t * b,char * ref,hts_pos_t ref_len,int flag,int max_nm,int quiet_mode,uint32_t * skipped)50 static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref,
51                             hts_pos_t ref_len, int flag, int max_nm,
52                             int quiet_mode, uint32_t *skipped)
53 {
54     uint8_t *seq = bam_get_seq(b);
55     uint32_t *cigar = bam_get_cigar(b);
56     bam1_core_t *c = &b->core;
57     int i, qpos, matched = 0;
58     hts_pos_t rpos;
59     kstring_t str = KS_INITIALIZE;
60     int32_t old_nm_i = -1, nm = 0;
61     uint32_t err = 0;
62 
63     if (c->l_qseq == 0) {
64         if (!quiet_mode) {
65             if (ref_name) {
66                 fprintf(stderr, "[bam_fillmd1] no sequence in alignment "
67                         "record for '%s' at %s:%"PRIhts_pos", skipped\n",
68                         bam_get_qname(b), ref_name, c->pos + 1);
69             } else {
70                 fprintf(stderr, "[bam_fillmd1] no sequence in alignment "
71                         "record for '%s', skipped", bam_get_qname(b));
72             }
73         }
74         if (skipped) (*skipped)++;
75         return 0;
76     }
77 
78     for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
79         int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
80         if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
81             for (j = 0; j < oplen; ++j) {
82                 int c1, c2, z = qpos + j;
83                 if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
84                     break; // out of bounds
85                 c1 = bam_seqi(seq, z);
86                 c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
87                 if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
88                     if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
89                     ++matched;
90                 } else {
91                     err |= kputw(matched, &str) < 0;
92                     err |= kputc(toupper(ref[rpos+j]), &str) < 0;
93                     matched = 0; ++nm;
94                 }
95             }
96             if (j < oplen) break;
97             rpos += oplen; qpos += oplen;
98         } else if (op == BAM_CDEL) {
99             err |= kputw(matched, &str) < 0;
100             err |= kputc('^', &str) < 0;
101             for (j = 0; j < oplen; ++j) {
102                 if (rpos+j >= ref_len || ref[rpos+j] == '\0') break;
103                 err |= kputc(toupper(ref[rpos+j]), &str) < 0;
104             }
105             matched = 0;
106             rpos += j; nm += j;
107             if (j < oplen) break;
108         } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
109             qpos += oplen;
110             if (op == BAM_CINS) nm += oplen;
111         } else if (op == BAM_CREF_SKIP) {
112             rpos += oplen;
113         }
114     }
115     err |= kputw(matched, &str) < 0;
116     if (err) {
117         print_error_errno("calmd", "Couldn't build new MD string");
118         goto fail;
119     }
120     // apply max_nm
121     if (max_nm > 0 && nm >= max_nm) {
122         for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
123             int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
124             if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
125                 for (j = 0; j < oplen; ++j) {
126                     int c1, c2, z = qpos + j;
127                     if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
128                         break; // out of bounds
129                     c1 = bam_seqi(seq, z);
130                     c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
131                     if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
132                         seq[z/2] |= (z&1)? 0x0f : 0xf0;
133                         bam_get_qual(b)[z] = 0;
134                     }
135                 }
136                 if (j < oplen) break;
137                 rpos += oplen; qpos += oplen;
138             } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rpos += oplen;
139             else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) qpos += oplen;
140         }
141     }
142     // update NM
143     if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
144         uint8_t *old_nm = bam_aux_get(b, "NM");
145         if (old_nm) old_nm_i = bam_aux2i(old_nm);
146         if (!old_nm) {
147             if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
148                 goto aux_fail;
149         }
150         else if (nm != old_nm_i) {
151             if (!quiet_mode) {
152                 fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
153             }
154             if (bam_aux_del(b, old_nm) < 0) goto aux_fail;
155             if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
156                 goto aux_fail;
157         }
158     }
159     // update MD
160     if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
161         uint8_t *old_md = bam_aux_get(b, "MD");
162         if (!old_md) {
163             if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
164                 goto aux_fail;
165         } else {
166             int is_diff = 0;
167             if (strlen((char*)old_md+1) == str.l) {
168                 for (i = 0; i < str.l; ++i)
169                     if (toupper(old_md[i+1]) != toupper(str.s[i]))
170                         break;
171                 if (i < str.l) is_diff = 1;
172             } else is_diff = 1;
173             if (is_diff) {
174                 if (!quiet_mode) {
175                     fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s);
176                 }
177                 if (bam_aux_del(b, old_md) < 0) goto aux_fail;
178                 if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
179                     goto aux_fail;
180             }
181         }
182     }
183 
184     // drop all tags but RG
185     if (flag&DROP_TAG) {
186         uint8_t *q = bam_aux_get(b, "RG");
187         bam_aux_drop_other(b, q);
188     }
189     // reduce the resolution of base quality
190     if (flag&BIN_QUAL) {
191         uint8_t *qual = bam_get_qual(b);
192         for (i = 0; i < b->core.l_qseq; ++i)
193             if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
194     }
195 
196     free(str.s);
197     return 0;
198 
199  aux_fail:
200     if (errno == ENOMEM) {
201         print_error("calmd", "Couldn't add aux tag (too long)");
202     } else if (errno == EINVAL) {
203         print_error("calmd", "Corrupt aux data");
204     } else {
205         print_error_errno("calmd", "Couldn't add aux tag");
206     }
207  fail:
208     free(str.s);
209     return -1;
210 }
211 
bam_fillmd1(bam1_t * b,char * ref,int flag,int quiet_mode)212 int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
213 {
214     return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL);
215 }
216 
calmd_usage()217 int calmd_usage() {
218     fprintf(stderr,
219 "Usage: samtools calmd [-eubrAESQ] <aln.bam> <ref.fasta>\n"
220 "Options:\n"
221 "  -e       change identical bases to '='\n"
222 "  -u       uncompressed BAM output (for piping)\n"
223 "  -b       compressed BAM output\n"
224 "  -S       ignored (input format is auto-detected)\n"
225 "  -A       modify the quality string\n"
226 "  -Q       use quiet mode to output less debug info to stdout\n"
227 "  -r       compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
228 "  -E       extended BAQ for better sensitivity but lower specificity\n"
229 "  --no-PG  do not add a PG line\n");
230 
231     sam_global_opt_help(stderr, "-....@-.");
232     return 1;
233 }
234 
bam_fillmd(int argc,char * argv[])235 int bam_fillmd(int argc, char *argv[])
236 {
237     int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0;
238     hts_pos_t len;
239     htsThreadPool p = {NULL, 0};
240     samFile *fp = NULL, *fpout = NULL;
241     sam_hdr_t *header = NULL;
242     faidx_t *fai = NULL;
243     char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL;
244     const char *ref_name = NULL;
245     bam1_t *b = NULL;
246     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
247     uint32_t skipped = 0;
248 
249     static const struct option lopts[] = {
250         SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
251         {"no-PG", no_argument, NULL, 1},
252         { NULL, 0, NULL, 0 }
253     };
254 
255     flt_flag = UPDATE_NM | UPDATE_MD;
256     is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = quiet_mode = 0;
257     strcpy(mode_w, "w");
258     while ((c = getopt_long(argc, argv, "EqQreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) {
259         switch (c) {
260         case 'r': is_realn = 1; break;
261         case 'e': flt_flag |= USE_EQUAL; break;
262         case 'd': flt_flag |= DROP_TAG; break;
263         case 'q': flt_flag |= BIN_QUAL; break;
264         case 'h': flt_flag |= HASH_QNM; break;
265         case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
266         case 'b': is_bam_out = 1; break;
267         case 'u': is_uncompressed = is_bam_out = 1; break;
268         case 'S': break;
269         case 'n': max_nm = atoi(optarg); break;
270         case 'C': capQ = atoi(optarg); break;
271         case 'A': baq_flag |= 1; break;
272         case 'E': baq_flag |= 2; break;
273         case 'Q': quiet_mode = 1; break;
274         case 1: no_pg = 1; break;
275         default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
276             fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
277             /* else fall-through */
278         case '?': return calmd_usage();
279         }
280     }
281     if (is_bam_out) strcat(mode_w, "b");
282     else strcat(mode_w, "h");
283     if (is_uncompressed) strcat(mode_w, "0");
284     if (optind + (ga.reference == NULL) >= argc)
285         return calmd_usage();
286     fp = sam_open_format(argv[optind], "r", &ga.in);
287     if (fp == NULL) {
288         print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]);
289         return 1;
290     }
291 
292     if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) {
293         print_error("calmd", "failed to create arg_list");
294         return 1;
295     }
296 
297     header = sam_hdr_read(fp);
298     if (header == NULL || sam_hdr_nref(header) == 0) {
299         fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
300         goto fail;
301     }
302 
303     fpout = sam_open_format("-", mode_w, &ga.out);
304     if (fpout == NULL) {
305         print_error_errno("calmd", "Failed to open output");
306         goto fail;
307     }
308     if (!no_pg && sam_hdr_add_pg(header, "samtools",
309                                  "VN", samtools_version(),
310                                  arg_list ? "CL": NULL,
311                                  arg_list ? arg_list : NULL,
312                                  NULL)) {
313         print_error("calmd", "failed to add PG line to header");
314         goto fail;
315     }
316     if (sam_hdr_write(fpout, header) < 0) {
317         print_error_errno("calmd", "Failed to write sam header");
318         goto fail;
319     }
320 
321     if (ga.nthreads > 0) {
322         if (!(p.pool = hts_tpool_init(ga.nthreads))) {
323             fprintf(stderr, "Error creating thread pool\n");
324             goto fail;
325         }
326         hts_set_opt(fp,    HTS_OPT_THREAD_POOL, &p);
327         hts_set_opt(fpout, HTS_OPT_THREAD_POOL, &p);
328     }
329 
330     ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
331     fai = fai_load(ref_file);
332 
333     if (!fai) {
334         print_error_errno("calmd", "Failed to open reference file '%s'", ref_file);
335         goto fail;
336     }
337 
338     b = bam_init1();
339     if (!b) {
340         fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n");
341         goto fail;
342     }
343     while ((ret = sam_read1(fp, header, b)) >= 0) {
344         if (b->core.tid >= 0) {
345             if (tid != b->core.tid) {
346                 free(ref);
347                 ref = NULL;
348                 len = 0;
349                 ref_name = sam_hdr_tid2name(header, b->core.tid);
350                 if (ref_name) {
351                     ref = fai_fetch64(fai, ref_name, &len);
352                 }
353                 tid = b->core.tid;
354                 if (ref == 0) { // FIXME: Should this always be fatal?
355                     fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
356                             ref_name ? ref_name : "(unknown)");
357                     if (is_realn || capQ > 10) goto fail; // Would otherwise crash
358                 }
359             }
360             if (is_realn) {
361                 if (sam_prob_realn(b, ref, len, baq_flag) < -3) {
362                     print_error_errno("calmd", "BAQ alignment failed");
363                     goto fail;
364                 }
365             }
366             if (capQ > 10) {
367                 int q = sam_cap_mapq(b, ref, len, capQ);
368                 if (b->core.qual > q) b->core.qual = q;
369             }
370             if (ref) {
371                 if (bam_fillmd1_core(ref_name, b, ref, len, flt_flag, max_nm,
372                                      quiet_mode, &skipped) < 0)
373                     goto fail;
374             }
375         }
376         if (sam_write1(fpout, header, b) < 0) {
377             print_error_errno("calmd", "failed to write to output file");
378             goto fail;
379         }
380     }
381     if (ret < -1) {
382         fprintf(stderr, "[bam_fillmd] Error reading input.\n");
383         goto fail;
384     }
385 
386     if (skipped) {
387         fprintf(stderr, "[calmd] Warning: %"PRIu32" records skipped due "
388                 "to no query sequence\n",
389                 skipped);
390     }
391 
392     bam_destroy1(b);
393     sam_hdr_destroy(header);
394 
395     free(arg_list);
396     free(ref);
397     fai_destroy(fai);
398     sam_close(fp);
399     if (sam_close(fpout) < 0) {
400         fprintf(stderr, "[bam_fillmd] error when closing output file\n");
401         return 1;
402     }
403     if (p.pool) hts_tpool_destroy(p.pool);
404 
405     return 0;
406 
407  fail:
408     free(arg_list);
409     free(ref);
410     if (b) bam_destroy1(b);
411     if (header) sam_hdr_destroy(header);
412     if (fai) fai_destroy(fai);
413     if (fp) sam_close(fp);
414     if (fpout) sam_close(fpout);
415     if (p.pool) hts_tpool_destroy(p.pool);
416 
417     return 1;
418 }
419