1 /* bedcov.c -- bedcov subcommand.
2
3 Copyright (C) 2012 Broad Institute.
4 Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <zlib.h>
29 #include <stdio.h>
30 #include <ctype.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include "htslib/kstring.h"
35 #include "htslib/sam.h"
36 #include "htslib/thread_pool.h"
37 #include "samtools.h"
38 #include "sam_opts.h"
39
40 #include "htslib/kseq.h"
41 KSTREAM_INIT(gzFile, gzread, 16384)
42
43 #define DEFAULT_DEPTH 64000
44
45 typedef struct {
46 htsFile *fp;
47 sam_hdr_t *header;
48 hts_itr_t *iter;
49 int min_mapQ;
50 uint32_t flags; // read filtering flags
51 } aux_t;
52
read_bam(void * data,bam1_t * b)53 static int read_bam(void *data, bam1_t *b)
54 {
55 aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
56 int ret;
57 while (1)
58 {
59 ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b);
60 if ( ret<0 ) break;
61 if ( b->core.flag & aux->flags ) continue;
62 if ( (int)b->core.qual < aux->min_mapQ ) continue;
63 break;
64 }
65 return ret;
66 }
67
main_bedcov(int argc,char * argv[])68 int main_bedcov(int argc, char *argv[])
69 {
70 gzFile fp;
71 kstring_t str;
72 kstream_t *ks;
73 hts_idx_t **idx;
74 aux_t **aux;
75 int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
76 int64_t *cnt, *pcov = NULL;;
77 const bam_pileup1_t **plp;
78 int usage = 0, has_index_file = 0;
79 uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
80 int tflags = 0, min_depth = -1;
81
82 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
83 static const struct option lopts[] = {
84 SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
85 { NULL, 0, NULL, 0 }
86 };
87
88 while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
89 switch (c) {
90 case 'Q': min_mapQ = atoi(optarg); break;
91 case 'X': has_index_file = 1; break;
92 case 'g':
93 tflags = bam_str2flag(optarg);
94 if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
95 print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
96 return 1;
97 }
98 flags &= ~tflags;
99 break;
100 case 'G':
101 tflags = bam_str2flag(optarg);
102 if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
103 print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
104 return 1;
105 }
106 flags |= tflags;
107 break;
108 case 'j': skip_DN = 1; break;
109 case 'd': min_depth = atoi(optarg); break;
110 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
111 /* else fall-through */
112 case '?': usage = 1; break;
113 }
114 if (usage) break;
115 }
116 if (usage || optind + 2 > argc) {
117 fprintf(stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
118 fprintf(stderr, "Options:\n");
119 fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
120 fprintf(stderr, " -X use customized index files\n");
121 fprintf(stderr, " -g <flags> remove the specified flags from the set used to filter out reads\n");
122 fprintf(stderr, " -G <flags> add the specified flags to the set used to filter out reads\n"
123 " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
124 fprintf(stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n");
125 fprintf(stderr, " -d <int> depth threshold. Number of reference bases with coverage above and"
126 " including this value will be displayed in a separate column\n");
127 sam_global_opt_help(stderr, "-.--.--.");
128 return 1;
129 }
130 if (has_index_file) {
131 if ((argc - optind - 1) % 2 != 0) { // Calculate # of input BAM files
132 fprintf(stderr, "ERROR: odd number of filenames detected! Each BAM file should have an index file\n");
133 return 1;
134 }
135 n = (argc - optind - 1) / 2;
136 } else {
137 n = argc - optind - 1;
138 }
139
140 memset(&str, 0, sizeof(kstring_t));
141 aux = calloc(n, sizeof(aux_t*));
142 idx = calloc(n, sizeof(hts_idx_t*));
143 for (i = 0; i < n; ++i) {
144 aux[i] = calloc(1, sizeof(aux_t));
145 aux[i]->min_mapQ = min_mapQ;
146 aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in);
147 if (aux[i]->fp) {
148 // If index filename has not been specfied, look in BAM folder
149 if (has_index_file) {
150 idx[i] = sam_index_load2(aux[i]->fp, argv[i+optind+1], argv[i+optind+n+1]);
151 } else {
152 idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
153 }
154 }
155 if (aux[i]->fp == 0 || idx[i] == 0) {
156 fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
157 return 2;
158 }
159 // TODO bgzf_set_cache_size(aux[i]->fp, 20);
160 aux[i]->header = sam_hdr_read(aux[i]->fp);
161 if (aux[i]->header == NULL) {
162 fprintf(stderr, "ERROR: failed to read header for '%s'\n",
163 argv[i+optind+1]);
164 return 2;
165 }
166 aux[i]->flags = flags;
167 }
168 cnt = calloc(n, sizeof(*cnt));
169 if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
170 if (!cnt || (min_depth >= 0 && !pcov)) return 2;
171
172 fp = gzopen(argv[optind], "rb");
173 if (fp == NULL) {
174 print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]);
175 return 2;
176 }
177 ks = ks_init(fp);
178 n_plp = calloc(n, sizeof(int));
179 plp = calloc(n, sizeof(bam_pileup1_t*));
180 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
181 char *p, *q;
182 int tid, pos, num = 0;
183 int64_t beg = 0, end = 0;
184 bam_mplp_t mplp;
185
186 if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */
187 /* Track and browser lines. Also look for a trailing *space* in
188 case someone has badly-chosen a chromosome name (it would
189 be followed by a tab in that case). */
190 if (strncmp(str.s, "track ", 6) == 0) continue;
191 if (strncmp(str.s, "browser ", 8) == 0) continue;
192 for (p = q = str.s; *p && !isspace(*p); ++p);
193 if (*p == 0) goto bed_error;
194 char c = *p;
195 *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c;
196 if (tid < 0) goto bed_error;
197 num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end);
198 if (num < 2 || end < beg) goto bed_error;
199
200 for (i = 0; i < n; ++i) {
201 if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
202 aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
203 }
204
205 mplp = bam_mplp_init(n, read_bam, (void**)aux);
206 if (min_depth > DEFAULT_DEPTH)
207 bam_mplp_set_maxcnt(mplp, min_depth);
208 else
209 bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
210
211 memset(cnt, 0, sizeof(*cnt) * n);
212 if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
213
214 while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
215 if (pos >= beg && pos < end) {
216 for (i = 0; i < n; ++i) {
217 m = 0;
218 if (skip_DN || min_depth >= 0) {
219 for (j = 0; j < n_plp[i]; ++j) {
220 const bam_pileup1_t *pi = plp[i] + j;
221 if (pi->is_del || pi->is_refskip) ++m;
222 }
223 }
224 int pd = n_plp[i] - m;
225 cnt[i] += pd;
226 if (min_depth >= 0 && pd >= min_depth) pcov[i]++;
227 }
228 }
229
230 if (ret < 0) {
231 print_error("bedcov", "error reading from input file");
232 status = 2;
233 bam_mplp_destroy(mplp);
234 break;
235 }
236
237 for (i = 0; i < n; ++i) {
238 kputc('\t', &str);
239 kputl(cnt[i], &str);
240 }
241 if (min_depth >= 0) {
242 for (i = 0; i < n; ++i) {
243 kputc('\t', &str);
244 kputl(pcov[i], &str);
245 }
246 }
247 puts(str.s);
248 bam_mplp_destroy(mplp);
249 continue;
250
251 bed_error:
252 fprintf(stderr, "Errors in BED line '%s'\n", str.s);
253 status = 2;
254 }
255 free(n_plp); free(plp);
256 ks_destroy(ks);
257 gzclose(fp);
258
259 free(cnt);
260 free(pcov);
261 for (i = 0; i < n; ++i) {
262 if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
263 hts_idx_destroy(idx[i]);
264 sam_hdr_destroy(aux[i]->header);
265 sam_close(aux[i]->fp);
266 free(aux[i]);
267 }
268 free(aux); free(idx);
269 free(str.s);
270 sam_global_args_free(&ga);
271 return status;
272 }
273