1 #include "samtools.pysam.h"
2 
3 /*  bedcov.c -- bedcov subcommand.
4 
5     Copyright (C) 2012 Broad Institute.
6     Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
7 
8     Author: Heng Li <lh3@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.  */
27 
28 #include <config.h>
29 
30 #include <zlib.h>
31 #include <stdio.h>
32 #include <ctype.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <unistd.h>
36 #include "htslib/kstring.h"
37 #include "htslib/sam.h"
38 #include "htslib/thread_pool.h"
39 #include "samtools.h"
40 #include "sam_opts.h"
41 
42 #include "htslib/kseq.h"
43 KSTREAM_INIT(gzFile, gzread, 16384)
44 
45 #define DEFAULT_DEPTH 64000
46 
47 typedef struct {
48     htsFile *fp;
49     sam_hdr_t *header;
50     hts_itr_t *iter;
51     int min_mapQ;
52     uint32_t flags;  // read filtering flags
53 } aux_t;
54 
read_bam(void * data,bam1_t * b)55 static int read_bam(void *data, bam1_t *b)
56 {
57     aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
58     int ret;
59     while (1)
60     {
61         ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b);
62         if ( ret<0 ) break;
63         if ( b->core.flag & aux->flags ) continue;
64         if ( (int)b->core.qual < aux->min_mapQ ) continue;
65         break;
66     }
67     return ret;
68 }
69 
main_bedcov(int argc,char * argv[])70 int main_bedcov(int argc, char *argv[])
71 {
72     gzFile fp;
73     kstring_t str;
74     kstream_t *ks;
75     hts_idx_t **idx;
76     aux_t **aux;
77     int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
78     int64_t *cnt, *pcov = NULL;;
79     const bam_pileup1_t **plp;
80     int usage = 0, has_index_file = 0;
81     uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
82     int tflags = 0, min_depth = -1;
83 
84     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
85     static const struct option lopts[] = {
86         SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
87         { NULL, 0, NULL, 0 }
88     };
89 
90     while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
91         switch (c) {
92         case 'Q': min_mapQ = atoi(optarg); break;
93         case 'X': has_index_file = 1; break;
94         case 'g':
95             tflags = bam_str2flag(optarg);
96             if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
97                 print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
98                 return 1;
99             }
100             flags &= ~tflags;
101             break;
102         case 'G':
103             tflags = bam_str2flag(optarg);
104             if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
105                 print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
106                 return 1;
107             }
108             flags |= tflags;
109             break;
110         case 'j': skip_DN = 1; break;
111         case 'd': min_depth = atoi(optarg); break;
112         default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
113                   /* else fall-through */
114         case '?': usage = 1; break;
115         }
116         if (usage) break;
117     }
118     if (usage || optind + 2 > argc) {
119         fprintf(samtools_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
120         fprintf(samtools_stderr, "Options:\n");
121         fprintf(samtools_stderr, "      -Q <int>            mapping quality threshold [0]\n");
122         fprintf(samtools_stderr, "      -X                  use customized index files\n");
123         fprintf(samtools_stderr, "      -g <flags>          remove the specified flags from the set used to filter out reads\n");
124         fprintf(samtools_stderr, "      -G <flags>          add the specified flags to the set used to filter out reads\n"
125                         "                          The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
126         fprintf(samtools_stderr, "      -j                  do not include deletions (D) and ref skips (N) in bedcov computation\n");
127         fprintf(samtools_stderr, "      -d <int>            depth threshold. Number of reference bases with coverage above and"
128                         "                          including this value will be displayed in a separate column\n");
129         sam_global_opt_help(samtools_stderr, "-.--.--.");
130         return 1;
131     }
132     if (has_index_file) {
133         if ((argc - optind - 1) % 2 != 0) { // Calculate # of input BAM files
134             fprintf(samtools_stderr, "ERROR: odd number of filenames detected! Each BAM file should have an index file\n");
135             return 1;
136         }
137         n = (argc - optind - 1) / 2;
138     } else {
139         n = argc - optind - 1;
140     }
141 
142     memset(&str, 0, sizeof(kstring_t));
143     aux = calloc(n, sizeof(aux_t*));
144     idx = calloc(n, sizeof(hts_idx_t*));
145     for (i = 0; i < n; ++i) {
146         aux[i] = calloc(1, sizeof(aux_t));
147         aux[i]->min_mapQ = min_mapQ;
148         aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in);
149         if (aux[i]->fp) {
150             // If index filename has not been specfied, look in BAM folder
151             if (has_index_file) {
152                 idx[i] = sam_index_load2(aux[i]->fp, argv[i+optind+1], argv[i+optind+n+1]);
153             } else {
154                 idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
155             }
156         }
157         if (aux[i]->fp == 0 || idx[i] == 0) {
158             fprintf(samtools_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
159             return 2;
160         }
161         // TODO bgzf_set_cache_size(aux[i]->fp, 20);
162         aux[i]->header = sam_hdr_read(aux[i]->fp);
163         if (aux[i]->header == NULL) {
164             fprintf(samtools_stderr, "ERROR: failed to read header for '%s'\n",
165                     argv[i+optind+1]);
166             return 2;
167         }
168         aux[i]->flags = flags;
169     }
170     cnt = calloc(n, sizeof(*cnt));
171     if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
172     if (!cnt || (min_depth >= 0 && !pcov)) return 2;
173 
174     fp = gzopen(argv[optind], "rb");
175     if (fp == NULL) {
176         print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]);
177         return 2;
178     }
179     ks = ks_init(fp);
180     n_plp = calloc(n, sizeof(int));
181     plp = calloc(n, sizeof(bam_pileup1_t*));
182     while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
183         char *p, *q;
184         int tid, pos, num = 0;
185         int64_t beg = 0, end = 0;
186         bam_mplp_t mplp;
187 
188         if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */
189         /* Track and browser lines.  Also look for a trailing *space* in
190            case someone has badly-chosen a chromosome name (it would
191            be followed by a tab in that case). */
192         if (strncmp(str.s, "track ", 6) == 0) continue;
193         if (strncmp(str.s, "browser ", 8) == 0) continue;
194         for (p = q = str.s; *p && !isspace(*p); ++p);
195         if (*p == 0) goto bed_error;
196         char c = *p;
197         *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c;
198         if (tid < 0) goto bed_error;
199         num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end);
200         if (num < 2 || end < beg) goto bed_error;
201 
202         for (i = 0; i < n; ++i) {
203             if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
204             aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
205         }
206 
207         mplp = bam_mplp_init(n, read_bam, (void**)aux);
208         if (min_depth > DEFAULT_DEPTH)
209             bam_mplp_set_maxcnt(mplp, min_depth);
210         else
211             bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
212 
213         memset(cnt, 0, sizeof(*cnt) * n);
214         if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
215 
216         while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
217             if (pos >= beg && pos < end) {
218                 for (i = 0; i < n; ++i) {
219                     m = 0;
220                     if (skip_DN || min_depth >= 0) {
221                         for (j = 0; j < n_plp[i]; ++j) {
222                             const bam_pileup1_t *pi = plp[i] + j;
223                             if (pi->is_del || pi->is_refskip) ++m;
224                         }
225                     }
226                     int pd = n_plp[i] - m;
227                     cnt[i] += pd;
228                     if (min_depth >= 0 && pd >= min_depth) pcov[i]++;
229                 }
230             }
231 
232         if (ret < 0) {
233             print_error("bedcov", "error reading from input file");
234             status = 2;
235             bam_mplp_destroy(mplp);
236             break;
237         }
238 
239         for (i = 0; i < n; ++i) {
240             kputc('\t', &str);
241             kputl(cnt[i], &str);
242         }
243         if (min_depth >= 0) {
244             for (i = 0; i < n; ++i) {
245                 kputc('\t', &str);
246                 kputl(pcov[i], &str);
247             }
248         }
249         samtools_puts(str.s);
250         bam_mplp_destroy(mplp);
251         continue;
252 
253 bed_error:
254         fprintf(samtools_stderr, "Errors in BED line '%s'\n", str.s);
255         status = 2;
256     }
257     free(n_plp); free(plp);
258     ks_destroy(ks);
259     gzclose(fp);
260 
261     free(cnt);
262     free(pcov);
263     for (i = 0; i < n; ++i) {
264         if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
265         hts_idx_destroy(idx[i]);
266         sam_hdr_destroy(aux[i]->header);
267         sam_close(aux[i]->fp);
268         free(aux[i]);
269     }
270     free(aux); free(idx);
271     free(str.s);
272     sam_global_args_free(&ga);
273     return status;
274 }
275