1 #include "samtools.pysam.h"
2
3 /* bedcov.c -- bedcov subcommand.
4
5 Copyright (C) 2012 Broad Institute.
6 Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
7
8 Author: Heng Li <lh3@sanger.ac.uk>
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE. */
27
28 #include <config.h>
29
30 #include <zlib.h>
31 #include <stdio.h>
32 #include <ctype.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <unistd.h>
36 #include "htslib/kstring.h"
37 #include "htslib/sam.h"
38 #include "htslib/thread_pool.h"
39 #include "samtools.h"
40 #include "sam_opts.h"
41
42 #include "htslib/kseq.h"
43 KSTREAM_INIT(gzFile, gzread, 16384)
44
45 #define DEFAULT_DEPTH 64000
46
47 typedef struct {
48 htsFile *fp;
49 sam_hdr_t *header;
50 hts_itr_t *iter;
51 int min_mapQ;
52 uint32_t flags; // read filtering flags
53 } aux_t;
54
read_bam(void * data,bam1_t * b)55 static int read_bam(void *data, bam1_t *b)
56 {
57 aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
58 int ret;
59 while (1)
60 {
61 ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b);
62 if ( ret<0 ) break;
63 if ( b->core.flag & aux->flags ) continue;
64 if ( (int)b->core.qual < aux->min_mapQ ) continue;
65 break;
66 }
67 return ret;
68 }
69
main_bedcov(int argc,char * argv[])70 int main_bedcov(int argc, char *argv[])
71 {
72 gzFile fp;
73 kstring_t str;
74 kstream_t *ks;
75 hts_idx_t **idx;
76 aux_t **aux;
77 int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
78 int64_t *cnt, *pcov = NULL;;
79 const bam_pileup1_t **plp;
80 int usage = 0, has_index_file = 0;
81 uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
82 int tflags = 0, min_depth = -1;
83
84 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
85 static const struct option lopts[] = {
86 SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
87 { NULL, 0, NULL, 0 }
88 };
89
90 while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
91 switch (c) {
92 case 'Q': min_mapQ = atoi(optarg); break;
93 case 'X': has_index_file = 1; break;
94 case 'g':
95 tflags = bam_str2flag(optarg);
96 if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
97 print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
98 return 1;
99 }
100 flags &= ~tflags;
101 break;
102 case 'G':
103 tflags = bam_str2flag(optarg);
104 if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
105 print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
106 return 1;
107 }
108 flags |= tflags;
109 break;
110 case 'j': skip_DN = 1; break;
111 case 'd': min_depth = atoi(optarg); break;
112 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
113 /* else fall-through */
114 case '?': usage = 1; break;
115 }
116 if (usage) break;
117 }
118 if (usage || optind + 2 > argc) {
119 fprintf(samtools_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
120 fprintf(samtools_stderr, "Options:\n");
121 fprintf(samtools_stderr, " -Q <int> mapping quality threshold [0]\n");
122 fprintf(samtools_stderr, " -X use customized index files\n");
123 fprintf(samtools_stderr, " -g <flags> remove the specified flags from the set used to filter out reads\n");
124 fprintf(samtools_stderr, " -G <flags> add the specified flags to the set used to filter out reads\n"
125 " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
126 fprintf(samtools_stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n");
127 fprintf(samtools_stderr, " -d <int> depth threshold. Number of reference bases with coverage above and"
128 " including this value will be displayed in a separate column\n");
129 sam_global_opt_help(samtools_stderr, "-.--.--.");
130 return 1;
131 }
132 if (has_index_file) {
133 if ((argc - optind - 1) % 2 != 0) { // Calculate # of input BAM files
134 fprintf(samtools_stderr, "ERROR: odd number of filenames detected! Each BAM file should have an index file\n");
135 return 1;
136 }
137 n = (argc - optind - 1) / 2;
138 } else {
139 n = argc - optind - 1;
140 }
141
142 memset(&str, 0, sizeof(kstring_t));
143 aux = calloc(n, sizeof(aux_t*));
144 idx = calloc(n, sizeof(hts_idx_t*));
145 for (i = 0; i < n; ++i) {
146 aux[i] = calloc(1, sizeof(aux_t));
147 aux[i]->min_mapQ = min_mapQ;
148 aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in);
149 if (aux[i]->fp) {
150 // If index filename has not been specfied, look in BAM folder
151 if (has_index_file) {
152 idx[i] = sam_index_load2(aux[i]->fp, argv[i+optind+1], argv[i+optind+n+1]);
153 } else {
154 idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
155 }
156 }
157 if (aux[i]->fp == 0 || idx[i] == 0) {
158 fprintf(samtools_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
159 return 2;
160 }
161 // TODO bgzf_set_cache_size(aux[i]->fp, 20);
162 aux[i]->header = sam_hdr_read(aux[i]->fp);
163 if (aux[i]->header == NULL) {
164 fprintf(samtools_stderr, "ERROR: failed to read header for '%s'\n",
165 argv[i+optind+1]);
166 return 2;
167 }
168 aux[i]->flags = flags;
169 }
170 cnt = calloc(n, sizeof(*cnt));
171 if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
172 if (!cnt || (min_depth >= 0 && !pcov)) return 2;
173
174 fp = gzopen(argv[optind], "rb");
175 if (fp == NULL) {
176 print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]);
177 return 2;
178 }
179 ks = ks_init(fp);
180 n_plp = calloc(n, sizeof(int));
181 plp = calloc(n, sizeof(bam_pileup1_t*));
182 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
183 char *p, *q;
184 int tid, pos, num = 0;
185 int64_t beg = 0, end = 0;
186 bam_mplp_t mplp;
187
188 if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */
189 /* Track and browser lines. Also look for a trailing *space* in
190 case someone has badly-chosen a chromosome name (it would
191 be followed by a tab in that case). */
192 if (strncmp(str.s, "track ", 6) == 0) continue;
193 if (strncmp(str.s, "browser ", 8) == 0) continue;
194 for (p = q = str.s; *p && !isspace(*p); ++p);
195 if (*p == 0) goto bed_error;
196 char c = *p;
197 *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c;
198 if (tid < 0) goto bed_error;
199 num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end);
200 if (num < 2 || end < beg) goto bed_error;
201
202 for (i = 0; i < n; ++i) {
203 if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
204 aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
205 }
206
207 mplp = bam_mplp_init(n, read_bam, (void**)aux);
208 if (min_depth > DEFAULT_DEPTH)
209 bam_mplp_set_maxcnt(mplp, min_depth);
210 else
211 bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
212
213 memset(cnt, 0, sizeof(*cnt) * n);
214 if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
215
216 while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
217 if (pos >= beg && pos < end) {
218 for (i = 0; i < n; ++i) {
219 m = 0;
220 if (skip_DN || min_depth >= 0) {
221 for (j = 0; j < n_plp[i]; ++j) {
222 const bam_pileup1_t *pi = plp[i] + j;
223 if (pi->is_del || pi->is_refskip) ++m;
224 }
225 }
226 int pd = n_plp[i] - m;
227 cnt[i] += pd;
228 if (min_depth >= 0 && pd >= min_depth) pcov[i]++;
229 }
230 }
231
232 if (ret < 0) {
233 print_error("bedcov", "error reading from input file");
234 status = 2;
235 bam_mplp_destroy(mplp);
236 break;
237 }
238
239 for (i = 0; i < n; ++i) {
240 kputc('\t', &str);
241 kputl(cnt[i], &str);
242 }
243 if (min_depth >= 0) {
244 for (i = 0; i < n; ++i) {
245 kputc('\t', &str);
246 kputl(pcov[i], &str);
247 }
248 }
249 samtools_puts(str.s);
250 bam_mplp_destroy(mplp);
251 continue;
252
253 bed_error:
254 fprintf(samtools_stderr, "Errors in BED line '%s'\n", str.s);
255 status = 2;
256 }
257 free(n_plp); free(plp);
258 ks_destroy(ks);
259 gzclose(fp);
260
261 free(cnt);
262 free(pcov);
263 for (i = 0; i < n; ++i) {
264 if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
265 hts_idx_destroy(idx[i]);
266 sam_hdr_destroy(aux[i]->header);
267 sam_close(aux[i]->fp);
268 free(aux[i]);
269 }
270 free(aux); free(idx);
271 free(str.s);
272 sam_global_args_free(&ga);
273 return status;
274 }
275