1 /* coverage.c -- samtools coverage subcommand
2
3 Copyright (C) 2018,2019 Florian Breitwieser
4 Portions copyright (C) 2019-2021 Genome Research Ltd.
5
6 Author: Florian P Breitwieser <florian.bw@gmail.com>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 /* This program calculates coverage from multiple BAMs
27 * simultaneously, to achieve random access and to use the BED interface.
28 * To compile this program separately, you may:
29 *
30 * gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz
31 */
32
33 // C headers
34 #include <config.h>
35
36 #include <ctype.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <stdarg.h> // variadic functions
40 #include <limits.h> // INT_MAX
41 #include <math.h> // round
42 #include <stdbool.h>
43 #include <string.h>
44 #include <unistd.h>
45 #include <assert.h>
46
47 #ifdef _WIN32
48 #include <windows.h>
49 #else
50 #include <sys/ioctl.h>
51 #endif
52
53 #include "htslib/sam.h"
54 #include "htslib/hts.h"
55 #include "samtools.h"
56 #include "sam_opts.h"
57
58 const char *VERSION = "0.1";
59
60 typedef struct { // auxiliary data structure to hold stats on coverage
61 unsigned long long n_covered_bases;
62 unsigned long long summed_coverage;
63 unsigned long long summed_baseQ;
64 unsigned long long summed_mapQ;
65 unsigned int n_reads;
66 unsigned int n_selected_reads;
67 bool covered;
68 hts_pos_t beg;
69 hts_pos_t end;
70 int64_t bin_width;
71 } stats_aux_t;
72
73 typedef struct { // auxiliary data structure to hold a BAM file
74 samFile *fp; // file handle
75 sam_hdr_t *hdr; // file header
76 hts_itr_t *iter; // iterator to a region - NULL for us by default
77 int min_mapQ; // mapQ filter
78 int min_len; // length filter
79 int fail_flags;
80 int required_flags;
81 stats_aux_t *stats;
82 } bam_aux_t;
83
84 #if __STDC_VERSION__ >= 199901L
85 #define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL
86
87 // UTF8 specifies block characters in eights going from \u2581 (lower one eight block) to \u2588 (full block)
88 // https://en.wikipedia.org/wiki/Block_Elements
89 // LOWER ONE EIGHTH BLOCK … FULL BLOCK
90 static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"};
91 // In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those
92 static const char *const BLOCK_CHARS2[2] = {".", ":"};
93
94 #else
95
96 // Fall back to explicit UTF-8 encodings of the same characters
97 #define VERTICAL_LINE "\xE2\x94\x82"
98
99 static const char *const BLOCK_CHARS8[8] = {
100 "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84",
101 "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" };
102
103 static const char *const BLOCK_CHARS2[2] = {".", ":"};
104
105 #endif
106
107 // in bam_plcmd.c
108 int read_file_list(const char *file_list, int *n, char **argv[]);
109
usage()110 static int usage() {
111 fprintf(stdout, "Usage: samtools coverage [options] in1.bam [in2.bam [...]]\n\n"
112 "Input options:\n"
113 " -b, --bam-list FILE list of input BAM filenames, one per line\n"
114 " -l, --min-read-len INT ignore reads shorter than INT bp [0]\n"
115 " -q, --min-MQ INT mapping quality threshold [0]\n"
116 " -Q, --min-BQ INT base quality threshold [0]\n"
117 " --rf <int|str> required flags: skip reads with mask bits unset []\n"
118 " --ff <int|str> filter flags: skip reads with mask bits set \n"
119 " [UNMAP,SECONDARY,QCFAIL,DUP]\n"
120 " -d, --depth INT maximum allowed coverage depth [1000000].\n"
121 " If 0, depth is set to the maximum integer value,\n"
122 " effectively removing any depth limit.\n"
123 "Output options:\n"
124 " -m, --histogram show histogram instead of tabular output\n"
125 " -A, --ascii show only ASCII characters in histogram\n"
126 " -o, --output FILE write output to FILE [stdout]\n"
127 " -H, --no-header don't print a header in tabular mode\n"
128 " -w, --n-bins INT number of bins in histogram [terminal width - 40]\n"
129 " -r, --region REG show specified region. Format: chr:start-end. \n"
130 " -h, --help help (this page)\n");
131
132 fprintf(stdout, "\nGeneric options:\n");
133 sam_global_opt_help(stdout, "-.--.--.");
134
135 fprintf(stdout,
136 "\nSee manpage for additional details.\n"
137 " rname Reference name / chromosome\n"
138 " startpos Start position\n"
139 " endpos End position (or sequence length)\n"
140 " numreads Number reads aligned to the region (after filtering)\n"
141 " covbases Number of covered bases with depth >= 1\n"
142 " coverage Percentage of covered bases [0..100]\n"
143 " meandepth Mean depth of coverage\n"
144 " meanbaseq Mean baseQ in covered region\n"
145 " meanmapq Mean mapQ of selected reads\n"
146 );
147
148 return EXIT_SUCCESS;
149 }
150
center_text(char * text,char * buf,int width)151 static char* center_text(char *text, char *buf, int width) {
152 int len = strlen(text);
153 assert(len <= width);
154 int padding = (width - len) / 2;
155 int padding_ex = (width - len) % 2;
156 if (padding >= 1)
157 sprintf(buf, " %*s%*s", len+padding, text, padding-1+padding_ex, " ");
158 else
159 sprintf(buf, "%s", text);
160
161 return buf;
162 }
163
readable_bps(double base_pairs,char * buf)164 static char* readable_bps(double base_pairs, char *buf) {
165 const char* units[] = {"", "K", "M", "G", "T"};
166 int i = 0;
167 while (base_pairs >= 1000 && i < (sizeof(units)/sizeof(units[0]) - 1)) {
168 base_pairs /= 1000;
169 i++;
170 }
171 sprintf(buf, "%.*f%s", i, base_pairs, units[i]);
172 return buf;
173 }
174
175 // read one alignment from one BAM file
read_bam(void * data,bam1_t * b)176 static int read_bam(void *data, bam1_t *b) {
177 bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure
178 int nref = sam_hdr_nref(aux->hdr);
179 int ret;
180 while (1) {
181 if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break;
182 if (b->core.tid >= 0 && b->core.tid < nref)
183 aux->stats[b->core.tid].n_reads++;
184
185 if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue;
186 if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue;
187 if ( b->core.qual < aux->min_mapQ ) continue;
188 if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
189 if (b->core.tid >= 0 && b->core.tid < nref) {
190 aux->stats[b->core.tid].n_selected_reads++;
191 aux->stats[b->core.tid].summed_mapQ += b->core.qual;
192 }
193 break;
194 }
195 return ret;
196 }
197
print_tabular_line(FILE * file_out,const sam_hdr_t * h,const stats_aux_t * stats,int tid)198 void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid) {
199 fputs(sam_hdr_tid2name(h, tid), file_out);
200 double region_len = (double) stats[tid].end - stats[tid].beg;
201 fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n",
202 stats[tid].beg+1,
203 stats[tid].end,
204 stats[tid].n_selected_reads,
205 stats[tid].n_covered_bases,
206 100.0 * stats[tid].n_covered_bases / region_len,
207 stats[tid].summed_coverage / region_len,
208 stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0,
209 stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0
210 );
211 }
212
print_hist(FILE * file_out,const sam_hdr_t * h,const stats_aux_t * stats,int tid,const uint32_t * hist,const int hist_size,const bool full_utf)213 void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
214 const int hist_size, const bool full_utf) {
215 int i, col;
216 bool show_percentiles = false;
217 const int n_rows = 10;
218 const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2;
219 const int blockchar_len = full_utf? 8 : 2;
220 double region_len = stats[tid].end - stats[tid].beg;
221
222 // Calculate histogram that contains percent covered
223 double hist_data[hist_size];
224 double max_val = 0.0;
225 for (i = 0; i < hist_size; ++i) {
226 hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
227 if (hist_data[i] > max_val) max_val = hist_data[i];
228 }
229
230 char buf[30];
231 fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, tid), readable_bps(sam_hdr_tid2len(h, tid), buf));
232
233 double row_bin_size = max_val / (double) n_rows;
234 for (i = n_rows-1; i >= 0; --i) {
235 double current_bin = row_bin_size * i;
236 if (show_percentiles) {
237 fprintf(file_out, ">%3i%% ", i*10);
238 } else {
239 fprintf(file_out, ">%7.2f%% ", current_bin);
240 }
241 fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
242 for (col = 0; col < hist_size; ++col) {
243 // get the difference in eights, or halfs when full UTF8 is not supported
244 int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1;
245 if (cur_val_diff < 0) {
246 fputc(' ', file_out);
247 } else {
248 if (cur_val_diff >= blockchar_len)
249 cur_val_diff = blockchar_len - 1;
250
251 fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]);
252 }
253 }
254 fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
255 fputc(' ', file_out);
256 switch (i) {
257 case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break;
258 case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break;
259 case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break;
260 case 6: fprintf(file_out, "Percent covered: %.4g%%",
261 100.0 * stats[tid].n_covered_bases / region_len); break;
262 case 5: fprintf(file_out, "Mean coverage: %.3gx",
263 stats[tid].summed_coverage / region_len); break;
264 case 4: fprintf(file_out, "Mean baseQ: %.3g",
265 stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break;
266 case 3: fprintf(file_out, "Mean mapQ: %.3g",
267 stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
268 case 1: fprintf(file_out, "Histo bin width: %sbp",
269 readable_bps(stats[tid].bin_width, buf)); break;
270 case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break;
271 };
272 fputc('\n', file_out);
273 }
274
275 // print x axis. Could be made pretty for widths that are not divisible
276 // by 10 by variable spacing of the labels, instead of placing a label every 10 characters
277 char buf2[50];
278 fprintf(file_out, " %s", center_text(readable_bps(stats[tid].beg + 1, buf), buf2, 10));
279 int rest;
280 for (rest = 10; rest < 10*(hist_size/10); rest += 10) {
281 fprintf(file_out, "%s", center_text(readable_bps(stats[tid].beg + stats[tid].bin_width*rest, buf), buf2, 10));
282 }
283 int last_padding = hist_size%10;
284 fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats[tid].end, buf), buf2, 10));
285 fprintf(file_out, "\n");
286 }
287
main_coverage(int argc,char * argv[])288 int main_coverage(int argc, char *argv[]) {
289 int status = EXIT_SUCCESS;
290
291 int ret, tid = -1, old_tid = -1, pos, i, j;
292
293 int max_depth = 1000000;
294 int opt_min_baseQ = 0;
295 int opt_min_mapQ = 0;
296 int opt_min_len = 0;
297 int opt_n_bins = 50;
298 bool opt_full_width = true;
299 char *opt_output_file = NULL;
300 bam_aux_t **data = NULL;
301 bam_mplp_t mplp = NULL;
302 const bam_pileup1_t **plp = NULL;
303 uint32_t *hist = NULL;
304 stats_aux_t *stats = NULL;
305 char *opt_reg = 0; // specified region
306 char *opt_file_list = NULL;
307 int n_bam_files = 0;
308 char **fn = NULL;
309 int fail_flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); // Default fail flags
310 int required_flags = 0;
311
312 int *n_plp = NULL;
313 sam_hdr_t *h = NULL; // BAM header of the 1st input
314
315 bool opt_print_header = true;
316 bool opt_print_tabular = true;
317 bool opt_print_histogram = false;
318 bool opt_full_utf = true;
319
320 FILE *file_out = stdout;
321
322 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
323 static const struct option lopts[] = {
324 SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
325 {"rf", required_argument, NULL, 1}, // require flag
326 {"ff", required_argument, NULL, 2}, // filter flag
327 {"incl-flags", required_argument, NULL, 1}, // require flag
328 {"excl-flags", required_argument, NULL, 2}, // filter flag
329 {"bam-list", required_argument, NULL, 'b'},
330 {"min-read-len", required_argument, NULL, 'l'},
331 {"min-MQ", required_argument, NULL, 'q'},
332 {"min-mq", required_argument, NULL, 'q'},
333 {"min-BQ", required_argument, NULL, 'Q'},
334 {"min-bq", required_argument, NULL, 'Q'},
335 {"histogram", no_argument, NULL, 'm'},
336 {"ascii", no_argument, NULL, 'A'},
337 {"output", required_argument, NULL, 'o'},
338 {"no-header", no_argument, NULL, 'H'},
339 {"n-bins", required_argument, NULL, 'w'},
340 {"region", required_argument, NULL, 'r'},
341 {"help", no_argument, NULL, 'h'},
342 {"depth", required_argument, NULL, 'd'},
343 { NULL, 0, NULL, 0 }
344 };
345
346 // parse the command line
347 int c;
348 opterr = 0;
349 while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
350 switch (c) {
351 case 1:
352 if ((required_flags = bam_str2flag(optarg)) < 0) {
353 fprintf(stderr,"Could not parse --rf %s\n", optarg); return EXIT_FAILURE;
354 }; break;
355 case 2:
356 if ((fail_flags = bam_str2flag(optarg)) < 0) {
357 fprintf(stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE;
358 }; break;
359 case 'o': opt_output_file = optarg; opt_full_width = false; break;
360 case 'l': opt_min_len = atoi(optarg); break;
361 case 'q': opt_min_mapQ = atoi(optarg); break;
362 case 'Q': opt_min_baseQ = atoi(optarg); break;
363 case 'd': max_depth = atoi(optarg); break; // maximum coverage depth
364 case 'w': opt_n_bins = atoi(optarg); opt_full_width = false;
365 opt_print_histogram = true; opt_print_tabular = false;
366 break;
367 case 'r': opt_reg = optarg; break; // parsing a region requires a BAM header (strdup unnecessary)
368 case 'b': opt_file_list = optarg; break;
369 case 'm': opt_print_histogram = true; opt_print_tabular = false; break;
370 case 'A': opt_full_utf = false;
371 opt_print_histogram = true; opt_print_tabular = false;
372 break;
373 case 'H': opt_print_header = false; break;
374 case 'h': return usage();
375 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
376 /* else fall-through */
377 case '?':
378 if (optopt != '?') { // '-?' appeared on command line
379 if (optopt) { // Bad short option
380 print_error("coverage", "invalid option -- '%c'", optopt);
381 } else { // Bad long option
382 // Do our best. There is no good solution to finding
383 // out what the bad option was.
384 // See, e.g. https://stackoverflow.com/questions/2723888/where-does-getopt-long-store-an-unrecognized-option
385 if (optind > 0 && strncmp(argv[optind - 1], "--", 2) == 0) {
386 print_error("coverage", "unrecognised option '%s'",
387 argv[optind - 1]);
388 }
389 }
390 }
391 return usage();
392 }
393 }
394 if (optind == argc && !opt_file_list)
395 return usage();
396
397 // output file provided by user
398 if (opt_output_file != NULL && strcmp(opt_output_file,"-")!=0) {
399 file_out = fopen( opt_output_file, "w" );
400 if (file_out == NULL) {
401 print_error_errno("coverage", "Cannot open \"%s\" for writing.", opt_output_file);
402 return EXIT_FAILURE;
403 }
404 }
405
406 if (opt_n_bins <= 0 || opt_full_width) {
407 // get number of columns of terminal
408 const char* env_columns = getenv("COLUMNS");
409 int columns = 0;
410 if (env_columns == NULL) {
411 #ifdef _WIN32
412 CONSOLE_SCREEN_BUFFER_INFO csbi;
413 if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) {
414 columns = csbi.srWindow.Right - csbi.srWindow.Left + 1;
415 }
416 #elif defined TIOCGWINSZ
417 struct winsize w;
418 if (ioctl(2, TIOCGWINSZ, &w) == 0)
419 columns = w.ws_col;
420 #endif
421 } else {
422 columns = atoi(env_columns); // atoi(NULL) returns 0
423 }
424
425 if (columns > 60) {
426 opt_n_bins = columns - 40;
427 } else {
428 opt_n_bins = 40;
429 }
430 }
431
432 // setvbuf(file_out, NULL, _IONBF, 0); //turn off buffering
433
434 // Open all BAM files
435 if (opt_file_list) {
436 // Read file names from opt_file_list into argv, and record the number of files in n_bam_files
437 if (read_file_list(opt_file_list, &n_bam_files, &fn)) {
438 print_error_errno("coverage", "Cannot open file list \"%s\".", opt_file_list);
439 return EXIT_FAILURE;
440 }
441 argv = fn;
442 optind = 0;
443 } else {
444 n_bam_files = argc - optind; // the number of BAMs on the command line
445 }
446
447 data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file
448 if (!data) {
449 print_error_errno("coverage", "Failed to allocate memory");
450 status = EXIT_FAILURE;
451 goto coverage_end;
452 }
453
454 for (i = 0; i < n_bam_files; ++i) {
455 int rf;
456 data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t));
457 if (!data[i]) {
458 print_error_errno("coverage", "Failed to allocate memory");
459 status = EXIT_FAILURE;
460 goto coverage_end;
461 }
462 data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
463
464 if (data[i]->fp == NULL) {
465 print_error_errno("coverage", "Could not open \"%s\"", argv[optind+i]);
466 status = EXIT_FAILURE;
467 goto coverage_end;
468 }
469 rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
470 if (opt_min_baseQ) rf |= SAM_QUAL;
471
472 // Set CRAM options on file handle - returns 0 on success
473 if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
474 print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
475 status = EXIT_FAILURE;
476 goto coverage_end;
477 }
478 if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
479 print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
480 status = EXIT_FAILURE;
481 goto coverage_end;
482 }
483 data[i]->min_mapQ = opt_min_mapQ; // set the mapQ filter
484 data[i]->min_len = opt_min_len; // set the qlen filter
485 data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header
486 data[i]->fail_flags = fail_flags;
487 data[i]->required_flags = required_flags;
488 if (data[i]->hdr == NULL) {
489 print_error_errno("coverage", "Could not read header for \"%s\"", argv[optind+i]);
490 status = EXIT_FAILURE;
491 goto coverage_end;
492 }
493
494 // Lookup region if specified
495 if (opt_reg) { // if a region is specified
496 hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index
497 if (idx == NULL) {
498 print_error_errno("coverage", "Failed to load index for \"%s\"", argv[optind+i]);
499 status = EXIT_FAILURE;
500 goto coverage_end;
501 }
502 data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator
503 hts_idx_destroy(idx); // the index is not needed any more; free the memory
504 if (data[i]->iter == NULL) {
505 print_error("coverage", "Failed to parse region \"%s\". Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]);
506 status = EXIT_FAILURE;
507 goto coverage_end;
508 }
509 }
510 }
511
512 if (opt_print_tabular && opt_print_header)
513 fputs("#rname\tstartpos\tendpos\tnumreads\tcovbases\tcoverage\tmeandepth\tmeanbaseq\tmeanmapq\n", file_out);
514
515 h = data[0]->hdr; // easy access to the header of the 1st BAM
516 int n_targets = sam_hdr_nref(h);
517 stats = calloc(n_targets, sizeof(stats_aux_t));
518 if (!stats) {
519 print_error_errno("coverage", "Failed to allocate memory");
520 status = EXIT_FAILURE;
521 goto coverage_end;
522 }
523
524 int64_t n_bins = opt_n_bins;
525 if (opt_reg) {
526 stats_aux_t *s = stats + data[0]->iter->tid;
527 s->beg = data[0]->iter->beg; // and to the parsed region coordinates
528 s->end = data[0]->iter->end;
529 if (s->end == HTS_POS_MAX) {
530 s->end = sam_hdr_tid2len(h, data[0]->iter->tid);
531 }
532 if (opt_n_bins > s->end - s->beg) {
533 n_bins = s->end - s->beg;
534 }
535 s->bin_width = (s->end-s->beg) / (n_bins > 0 ? n_bins : 1);
536 }
537
538 for (i=0; i<n_bam_files; i++)
539 data[i]->stats = stats;
540
541 int64_t current_bin = 0;
542
543 // the core multi-pileup loop
544 mplp = bam_mplp_init(n_bam_files, read_bam, (void**)data); // initialization
545 if (max_depth > 0)
546 bam_mplp_set_maxcnt(mplp, max_depth); // set maximum coverage depth
547 else if (!max_depth)
548 bam_mplp_set_maxcnt(mplp, INT_MAX);
549
550
551 // Extra info for histogram and coverage counting
552 hist = (uint32_t*) calloc(opt_n_bins, sizeof(uint32_t));
553 n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM
554 plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
555 if (!hist || !n_plp || !plp) {
556 print_error_errno("coverage", "Failed to allocate memory");
557 status = EXIT_FAILURE;
558 goto coverage_end;
559 }
560 while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
561
562 if (tid != old_tid) { // Next target sequence
563 if (old_tid >= 0) {
564 if (opt_print_histogram) {
565 print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
566 fputc('\n', file_out);
567 } else if (opt_print_tabular) {
568 print_tabular_line(file_out, h, stats, old_tid);
569 }
570
571 if (opt_print_histogram)
572 memset(hist, 0, n_bins*sizeof(uint32_t));
573 }
574
575 stats[tid].covered = true;
576 if (!opt_reg)
577 stats[tid].end = sam_hdr_tid2len(h, tid);
578
579 if (opt_print_histogram) {
580 n_bins = opt_n_bins > stats[tid].end-stats[tid].beg? stats[tid].end-stats[tid].beg : opt_n_bins;
581 stats[tid].bin_width = (stats[tid].end-stats[tid].beg) / n_bins;
582 }
583
584 old_tid = tid;
585 }
586 if (pos < stats[tid].beg || pos >= stats[tid].end) continue; // out of range; skip
587 if (tid >= n_targets) continue; // diff number of @SQ lines per file?
588
589 if (opt_print_histogram) {
590 current_bin = (pos - stats[tid].beg) / stats[tid].bin_width;
591 }
592
593 bool count_base = false;
594 for (i = 0; i < n_bam_files; ++i) { // base level filters have to go here
595 int depth_at_pos = n_plp[i];
596 for (j = 0; j < n_plp[i]; ++j) {
597 const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know
598
599 if (p->is_del || p->is_refskip) --depth_at_pos; // having dels or refskips at tid:pos
600 else if (p->qpos < p->b->core.l_qseq &&
601 bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality
602 else
603 stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
604 }
605 if (depth_at_pos > 0) {
606 count_base = true;
607 stats[tid].summed_coverage += depth_at_pos;
608 }
609 // hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage
610 //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
611 }
612 if (count_base) {
613 stats[tid].n_covered_bases++;
614 if (opt_print_histogram && current_bin < n_bins)
615 ++(hist[current_bin]); // Histogram based on breadth of coverage
616 }
617 }
618
619 if (tid == -1 && opt_reg && *opt_reg != '*')
620 // Region specified but no data covering it.
621 tid = data[0]->iter->tid;
622
623 if (tid < n_targets && tid >=0) {
624 if (opt_print_histogram) {
625 print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
626 } else if (opt_print_tabular) {
627 print_tabular_line(file_out, h, stats, tid);
628 }
629 }
630
631
632 if (!opt_reg && opt_print_tabular) {
633 for (i = 0; i < n_targets; ++i) {
634 if (!stats[i].covered) {
635 stats[i].end = sam_hdr_tid2len(h, i);
636 print_tabular_line(file_out, h, stats, i);
637 }
638 }
639 }
640
641 if (ret < 0) status = EXIT_FAILURE;
642
643 coverage_end:
644 if (n_plp) free(n_plp);
645 if (plp) free(plp);
646 if (mplp) bam_mplp_destroy(mplp);
647
648 if (hist) free(hist);
649 if (stats) free(stats);
650
651 // Close files and free data structures
652 if (!(file_out == stdout || fclose(file_out) == 0)) {
653 if (status == EXIT_SUCCESS) {
654 print_error_errno("coverage", "error on closing \"%s\"",
655 (opt_output_file && strcmp(opt_output_file, "-") != 0?
656 opt_output_file : "stdout"));
657 status = EXIT_FAILURE;
658 }
659 }
660
661 if (data) {
662 for (i = 0; i < n_bam_files && data[i]; ++i) {
663 sam_hdr_destroy(data[i]->hdr);
664 if (data[i]->fp) sam_close(data[i]->fp);
665 hts_itr_destroy(data[i]->iter);
666 free(data[i]);
667 }
668 free(data);
669 }
670
671 if (opt_file_list && fn) {
672 for (i = 0; i < n_bam_files; ++i)
673 free(fn[i]);
674 free(fn);
675 }
676 sam_global_args_free(&ga);
677
678 return status;
679 }
680
681 #ifdef _MAIN_BAMCOV
main(int argc,char * argv[])682 int main(int argc, char *argv[]) {
683 return main_coverage(argc, argv);
684 }
685 #endif
686