1 /*  bedcov.c -- bedcov subcommand.
2 
3     Copyright (C) 2012 Broad Institute.
4     Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <zlib.h>
29 #include <stdio.h>
30 #include <ctype.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include "htslib/kstring.h"
35 #include "htslib/sam.h"
36 #include "htslib/thread_pool.h"
37 #include "samtools.h"
38 #include "sam_opts.h"
39 
40 #include "htslib/kseq.h"
41 KSTREAM_INIT(gzFile, gzread, 16384)
42 
43 #define DEFAULT_DEPTH 64000
44 
45 typedef struct {
46     htsFile *fp;
47     sam_hdr_t *header;
48     hts_itr_t *iter;
49     int min_mapQ;
50     uint32_t flags;  // read filtering flags
51 } aux_t;
52 
read_bam(void * data,bam1_t * b)53 static int read_bam(void *data, bam1_t *b)
54 {
55     aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
56     int ret;
57     while (1)
58     {
59         ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b);
60         if ( ret<0 ) break;
61         if ( b->core.flag & aux->flags ) continue;
62         if ( (int)b->core.qual < aux->min_mapQ ) continue;
63         break;
64     }
65     return ret;
66 }
67 
main_bedcov(int argc,char * argv[])68 int main_bedcov(int argc, char *argv[])
69 {
70     gzFile fp;
71     kstring_t str;
72     kstream_t *ks;
73     hts_idx_t **idx;
74     aux_t **aux;
75     int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
76     int64_t *cnt, *pcov = NULL;;
77     const bam_pileup1_t **plp;
78     int usage = 0, has_index_file = 0;
79     uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
80     int tflags = 0, min_depth = -1;
81 
82     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
83     static const struct option lopts[] = {
84         SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
85         { NULL, 0, NULL, 0 }
86     };
87 
88     while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
89         switch (c) {
90         case 'Q': min_mapQ = atoi(optarg); break;
91         case 'X': has_index_file = 1; break;
92         case 'g':
93             tflags = bam_str2flag(optarg);
94             if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
95                 print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
96                 return 1;
97             }
98             flags &= ~tflags;
99             break;
100         case 'G':
101             tflags = bam_str2flag(optarg);
102             if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
103                 print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
104                 return 1;
105             }
106             flags |= tflags;
107             break;
108         case 'j': skip_DN = 1; break;
109         case 'd': min_depth = atoi(optarg); break;
110         default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
111                   /* else fall-through */
112         case '?': usage = 1; break;
113         }
114         if (usage) break;
115     }
116     if (usage || optind + 2 > argc) {
117         fprintf(stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
118         fprintf(stderr, "Options:\n");
119         fprintf(stderr, "      -Q <int>            mapping quality threshold [0]\n");
120         fprintf(stderr, "      -X                  use customized index files\n");
121         fprintf(stderr, "      -g <flags>          remove the specified flags from the set used to filter out reads\n");
122         fprintf(stderr, "      -G <flags>          add the specified flags to the set used to filter out reads\n"
123                         "                          The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
124         fprintf(stderr, "      -j                  do not include deletions (D) and ref skips (N) in bedcov computation\n");
125         fprintf(stderr, "      -d <int>            depth threshold. Number of reference bases with coverage above and"
126                         "                          including this value will be displayed in a separate column\n");
127         sam_global_opt_help(stderr, "-.--.--.");
128         return 1;
129     }
130     if (has_index_file) {
131         if ((argc - optind - 1) % 2 != 0) { // Calculate # of input BAM files
132             fprintf(stderr, "ERROR: odd number of filenames detected! Each BAM file should have an index file\n");
133             return 1;
134         }
135         n = (argc - optind - 1) / 2;
136     } else {
137         n = argc - optind - 1;
138     }
139 
140     memset(&str, 0, sizeof(kstring_t));
141     aux = calloc(n, sizeof(aux_t*));
142     idx = calloc(n, sizeof(hts_idx_t*));
143     for (i = 0; i < n; ++i) {
144         aux[i] = calloc(1, sizeof(aux_t));
145         aux[i]->min_mapQ = min_mapQ;
146         aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in);
147         if (aux[i]->fp) {
148             // If index filename has not been specfied, look in BAM folder
149             if (has_index_file) {
150                 idx[i] = sam_index_load2(aux[i]->fp, argv[i+optind+1], argv[i+optind+n+1]);
151             } else {
152                 idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
153             }
154         }
155         if (aux[i]->fp == 0 || idx[i] == 0) {
156             fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
157             return 2;
158         }
159         // TODO bgzf_set_cache_size(aux[i]->fp, 20);
160         aux[i]->header = sam_hdr_read(aux[i]->fp);
161         if (aux[i]->header == NULL) {
162             fprintf(stderr, "ERROR: failed to read header for '%s'\n",
163                     argv[i+optind+1]);
164             return 2;
165         }
166         aux[i]->flags = flags;
167     }
168     cnt = calloc(n, sizeof(*cnt));
169     if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
170     if (!cnt || (min_depth >= 0 && !pcov)) return 2;
171 
172     fp = gzopen(argv[optind], "rb");
173     if (fp == NULL) {
174         print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]);
175         return 2;
176     }
177     ks = ks_init(fp);
178     n_plp = calloc(n, sizeof(int));
179     plp = calloc(n, sizeof(bam_pileup1_t*));
180     while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
181         char *p, *q;
182         int tid, pos, num = 0;
183         int64_t beg = 0, end = 0;
184         bam_mplp_t mplp;
185 
186         if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */
187         /* Track and browser lines.  Also look for a trailing *space* in
188            case someone has badly-chosen a chromosome name (it would
189            be followed by a tab in that case). */
190         if (strncmp(str.s, "track ", 6) == 0) continue;
191         if (strncmp(str.s, "browser ", 8) == 0) continue;
192         for (p = q = str.s; *p && !isspace(*p); ++p);
193         if (*p == 0) goto bed_error;
194         char c = *p;
195         *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c;
196         if (tid < 0) goto bed_error;
197         num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end);
198         if (num < 2 || end < beg) goto bed_error;
199 
200         for (i = 0; i < n; ++i) {
201             if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
202             aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
203         }
204 
205         mplp = bam_mplp_init(n, read_bam, (void**)aux);
206         if (min_depth > DEFAULT_DEPTH)
207             bam_mplp_set_maxcnt(mplp, min_depth);
208         else
209             bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
210 
211         memset(cnt, 0, sizeof(*cnt) * n);
212         if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
213 
214         while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
215             if (pos >= beg && pos < end) {
216                 for (i = 0; i < n; ++i) {
217                     m = 0;
218                     if (skip_DN || min_depth >= 0) {
219                         for (j = 0; j < n_plp[i]; ++j) {
220                             const bam_pileup1_t *pi = plp[i] + j;
221                             if (pi->is_del || pi->is_refskip) ++m;
222                         }
223                     }
224                     int pd = n_plp[i] - m;
225                     cnt[i] += pd;
226                     if (min_depth >= 0 && pd >= min_depth) pcov[i]++;
227                 }
228             }
229 
230         if (ret < 0) {
231             print_error("bedcov", "error reading from input file");
232             status = 2;
233             bam_mplp_destroy(mplp);
234             break;
235         }
236 
237         for (i = 0; i < n; ++i) {
238             kputc('\t', &str);
239             kputl(cnt[i], &str);
240         }
241         if (min_depth >= 0) {
242             for (i = 0; i < n; ++i) {
243                 kputc('\t', &str);
244                 kputl(pcov[i], &str);
245             }
246         }
247         puts(str.s);
248         bam_mplp_destroy(mplp);
249         continue;
250 
251 bed_error:
252         fprintf(stderr, "Errors in BED line '%s'\n", str.s);
253         status = 2;
254     }
255     free(n_plp); free(plp);
256     ks_destroy(ks);
257     gzclose(fp);
258 
259     free(cnt);
260     free(pcov);
261     for (i = 0; i < n; ++i) {
262         if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
263         hts_idx_destroy(idx[i]);
264         sam_hdr_destroy(aux[i]->header);
265         sam_close(aux[i]->fp);
266         free(aux[i]);
267     }
268     free(aux); free(idx);
269     free(str.s);
270     sam_global_args_free(&ga);
271     return status;
272 }
273