1 /* coverage.c -- samtools coverage subcommand
2 
3     Copyright (C) 2018,2019 Florian Breitwieser
4     Portions copyright (C) 2019-2021 Genome Research Ltd.
5 
6     Author: Florian P Breitwieser <florian.bw@gmail.com>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 /* This program calculates coverage from multiple BAMs
27  * simultaneously, to achieve random access and to use the BED interface.
28  * To compile this program separately, you may:
29  *
30  *   gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz
31  */
32 
33 // C headers
34 #include <config.h>
35 
36 #include <ctype.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <stdarg.h>  // variadic functions
40 #include <limits.h>  // INT_MAX
41 #include <math.h>    // round
42 #include <stdbool.h>
43 #include <string.h>
44 #include <unistd.h>
45 #include <assert.h>
46 
47 #ifdef _WIN32
48 #include <windows.h>
49 #else
50 #include <sys/ioctl.h>
51 #endif
52 
53 #include "htslib/sam.h"
54 #include "htslib/hts.h"
55 #include "samtools.h"
56 #include "sam_opts.h"
57 
58 const char *VERSION = "0.1";
59 
60 typedef struct {  // auxiliary data structure to hold stats on coverage
61     unsigned long long n_covered_bases;
62     unsigned long long summed_coverage;
63     unsigned long long summed_baseQ;
64     unsigned long long summed_mapQ;
65     unsigned int n_reads;
66     unsigned int n_selected_reads;
67     bool covered;
68     hts_pos_t beg;
69     hts_pos_t end;
70     int64_t bin_width;
71 } stats_aux_t;
72 
73 typedef struct {  // auxiliary data structure to hold a BAM file
74     samFile *fp;     // file handle
75     sam_hdr_t *hdr;  // file header
76     hts_itr_t *iter; // iterator to a region - NULL for us by default
77     int min_mapQ;    // mapQ filter
78     int min_len;     // length filter
79     int fail_flags;
80     int required_flags;
81     stats_aux_t *stats;
82 } bam_aux_t;
83 
84 #if __STDC_VERSION__ >= 199901L
85 #define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL
86 
87 // UTF8 specifies block characters in eights going from \u2581 (lower one eight block) to \u2588 (full block)
88 //   https://en.wikipedia.org/wiki/Block_Elements
89 // LOWER ONE EIGHTH BLOCK … FULL BLOCK
90 static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"};
91 // In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those
92 static const char *const BLOCK_CHARS2[2] = {".", ":"};
93 
94 #else
95 
96 // Fall back to explicit UTF-8 encodings of the same characters
97 #define VERTICAL_LINE "\xE2\x94\x82"
98 
99 static const char *const BLOCK_CHARS8[8] = {
100     "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84",
101     "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" };
102 
103 static const char *const BLOCK_CHARS2[2] = {".", ":"};
104 
105 #endif
106 
107 // in bam_plcmd.c
108 int read_file_list(const char *file_list, int *n, char **argv[]);
109 
usage()110 static int usage() {
111     fprintf(stdout, "Usage: samtools coverage [options] in1.bam [in2.bam [...]]\n\n"
112             "Input options:\n"
113             "  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
114             "  -l, --min-read-len INT  ignore reads shorter than INT bp [0]\n"
115             "  -q, --min-MQ INT        mapping quality threshold [0]\n"
116             "  -Q, --min-BQ INT        base quality threshold [0]\n"
117             "  --rf <int|str>          required flags: skip reads with mask bits unset []\n"
118             "  --ff <int|str>          filter flags: skip reads with mask bits set \n"
119             "                                      [UNMAP,SECONDARY,QCFAIL,DUP]\n"
120             "  -d, --depth INT         maximum allowed coverage depth [1000000].\n"
121             "                          If 0, depth is set to the maximum integer value,\n"
122             "                          effectively removing any depth limit.\n"
123             "Output options:\n"
124             "  -m, --histogram         show histogram instead of tabular output\n"
125             "  -A, --ascii             show only ASCII characters in histogram\n"
126             "  -o, --output FILE       write output to FILE [stdout]\n"
127             "  -H, --no-header         don't print a header in tabular mode\n"
128             "  -w, --n-bins INT        number of bins in histogram [terminal width - 40]\n"
129             "  -r, --region REG        show specified region. Format: chr:start-end. \n"
130             "  -h, --help              help (this page)\n");
131 
132     fprintf(stdout, "\nGeneric options:\n");
133     sam_global_opt_help(stdout, "-.--.--.");
134 
135     fprintf(stdout,
136             "\nSee manpage for additional details.\n"
137             "  rname       Reference name / chromosome\n"
138             "  startpos    Start position\n"
139             "  endpos      End position (or sequence length)\n"
140             "  numreads    Number reads aligned to the region (after filtering)\n"
141             "  covbases    Number of covered bases with depth >= 1\n"
142             "  coverage    Percentage of covered bases [0..100]\n"
143             "  meandepth   Mean depth of coverage\n"
144             "  meanbaseq   Mean baseQ in covered region\n"
145             "  meanmapq    Mean mapQ of selected reads\n"
146            );
147 
148     return EXIT_SUCCESS;
149 }
150 
center_text(char * text,char * buf,int width)151 static char* center_text(char *text, char *buf, int width) {
152     int len = strlen(text);
153     assert(len <= width);
154     int padding = (width - len) / 2;
155     int padding_ex = (width - len) % 2;
156     if (padding >= 1)
157         sprintf(buf, " %*s%*s", len+padding, text, padding-1+padding_ex, " ");
158     else
159         sprintf(buf, "%s", text);
160 
161     return buf;
162 }
163 
readable_bps(double base_pairs,char * buf)164 static char* readable_bps(double base_pairs, char *buf) {
165     const char* units[] = {"", "K", "M", "G", "T"};
166     int i = 0;
167     while (base_pairs >= 1000 && i < (sizeof(units)/sizeof(units[0]) - 1)) {
168         base_pairs /= 1000;
169         i++;
170     }
171     sprintf(buf, "%.*f%s", i, base_pairs, units[i]);
172     return buf;
173 }
174 
175 // read one alignment from one BAM file
read_bam(void * data,bam1_t * b)176 static int read_bam(void *data, bam1_t *b) {
177     bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure
178     int nref = sam_hdr_nref(aux->hdr);
179     int ret;
180     while (1) {
181         if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break;
182         if (b->core.tid >= 0 && b->core.tid < nref)
183             aux->stats[b->core.tid].n_reads++;
184 
185         if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue;
186         if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue;
187         if ( b->core.qual < aux->min_mapQ ) continue;
188         if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
189         if (b->core.tid >= 0 && b->core.tid < nref) {
190             aux->stats[b->core.tid].n_selected_reads++;
191             aux->stats[b->core.tid].summed_mapQ += b->core.qual;
192         }
193         break;
194     }
195     return ret;
196 }
197 
print_tabular_line(FILE * file_out,const sam_hdr_t * h,const stats_aux_t * stats,int tid)198 void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid) {
199     fputs(sam_hdr_tid2name(h, tid), file_out);
200     double region_len = (double) stats[tid].end - stats[tid].beg;
201     fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n",
202             stats[tid].beg+1,
203             stats[tid].end,
204             stats[tid].n_selected_reads,
205             stats[tid].n_covered_bases,
206             100.0 * stats[tid].n_covered_bases / region_len,
207             stats[tid].summed_coverage / region_len,
208             stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0,
209             stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0
210            );
211 }
212 
print_hist(FILE * file_out,const sam_hdr_t * h,const stats_aux_t * stats,int tid,const uint32_t * hist,const int hist_size,const bool full_utf)213 void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
214         const int hist_size, const bool full_utf) {
215     int i, col;
216     bool show_percentiles = false;
217     const int n_rows = 10;
218     const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2;
219     const int blockchar_len = full_utf? 8 : 2;
220     double region_len = stats[tid].end - stats[tid].beg;
221 
222     // Calculate histogram that contains percent covered
223     double hist_data[hist_size];
224     double max_val = 0.0;
225     for (i = 0; i < hist_size; ++i) {
226         hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
227         if (hist_data[i] > max_val) max_val = hist_data[i];
228     }
229 
230     char buf[30];
231     fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, tid), readable_bps(sam_hdr_tid2len(h, tid), buf));
232 
233     double row_bin_size = max_val / (double) n_rows;
234     for (i = n_rows-1; i >= 0; --i) {
235         double current_bin = row_bin_size * i;
236         if (show_percentiles) {
237             fprintf(file_out, ">%3i%% ", i*10);
238         } else {
239             fprintf(file_out, ">%7.2f%% ", current_bin);
240         }
241         fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
242         for (col = 0; col < hist_size; ++col) {
243             // get the difference in eights, or halfs when full UTF8 is not supported
244             int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1;
245             if (cur_val_diff < 0) {
246                 fputc(' ', file_out);
247             } else {
248                 if (cur_val_diff >= blockchar_len)
249                     cur_val_diff = blockchar_len - 1;
250 
251                 fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]);
252             }
253         }
254         fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
255         fputc(' ', file_out);
256         switch (i) {
257             case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break;
258             case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, "    (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break;
259             case 7: fprintf(file_out, "Covered bases:   %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break;
260             case 6: fprintf(file_out, "Percent covered: %.4g%%",
261                             100.0 * stats[tid].n_covered_bases / region_len); break;
262             case 5: fprintf(file_out, "Mean coverage:   %.3gx",
263                             stats[tid].summed_coverage / region_len); break;
264             case 4: fprintf(file_out, "Mean baseQ:      %.3g",
265                             stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break;
266             case 3: fprintf(file_out, "Mean mapQ:       %.3g",
267                             stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
268             case 1: fprintf(file_out, "Histo bin width: %sbp",
269                             readable_bps(stats[tid].bin_width, buf)); break;
270             case 0: fprintf(file_out, "Histo max bin:   %.5g%%", max_val); break;
271         };
272         fputc('\n', file_out);
273     }
274 
275     // print x axis. Could be made pretty for widths that are not divisible
276     // by 10 by variable spacing of the labels, instead of placing a label every 10 characters
277     char buf2[50];
278     fprintf(file_out, "     %s", center_text(readable_bps(stats[tid].beg + 1, buf), buf2, 10));
279     int rest;
280     for (rest = 10; rest < 10*(hist_size/10); rest += 10) {
281         fprintf(file_out, "%s", center_text(readable_bps(stats[tid].beg + stats[tid].bin_width*rest, buf), buf2, 10));
282     }
283     int last_padding = hist_size%10;
284     fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats[tid].end, buf), buf2, 10));
285     fprintf(file_out, "\n");
286 }
287 
main_coverage(int argc,char * argv[])288 int main_coverage(int argc, char *argv[]) {
289     int status = EXIT_SUCCESS;
290 
291     int ret, tid = -1, old_tid = -1, pos, i, j;
292 
293     int max_depth = 1000000;
294     int opt_min_baseQ = 0;
295     int opt_min_mapQ = 0;
296     int opt_min_len = 0;
297     int opt_n_bins = 50;
298     bool opt_full_width = true;
299     char *opt_output_file = NULL;
300     bam_aux_t **data = NULL;
301     bam_mplp_t mplp = NULL;
302     const bam_pileup1_t **plp = NULL;
303     uint32_t *hist = NULL;
304     stats_aux_t *stats = NULL;
305     char *opt_reg = 0; // specified region
306     char *opt_file_list = NULL;
307     int n_bam_files = 0;
308     char **fn = NULL;
309     int fail_flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); // Default fail flags
310     int required_flags = 0;
311 
312     int *n_plp = NULL;
313     sam_hdr_t *h = NULL; // BAM header of the 1st input
314 
315     bool opt_print_header = true;
316     bool opt_print_tabular = true;
317     bool opt_print_histogram = false;
318     bool opt_full_utf = true;
319 
320     FILE *file_out = stdout;
321 
322     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
323     static const struct option lopts[] = {
324         SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
325         {"rf", required_argument, NULL, 1}, // require flag
326         {"ff", required_argument, NULL, 2}, // filter flag
327         {"incl-flags", required_argument, NULL, 1}, // require flag
328         {"excl-flags", required_argument, NULL, 2}, // filter flag
329         {"bam-list", required_argument, NULL, 'b'},
330         {"min-read-len", required_argument, NULL, 'l'},
331         {"min-MQ", required_argument, NULL, 'q'},
332         {"min-mq", required_argument, NULL, 'q'},
333         {"min-BQ", required_argument, NULL, 'Q'},
334         {"min-bq", required_argument, NULL, 'Q'},
335         {"histogram", no_argument, NULL, 'm'},
336         {"ascii", no_argument, NULL, 'A'},
337         {"output", required_argument, NULL, 'o'},
338         {"no-header", no_argument, NULL, 'H'},
339         {"n-bins", required_argument, NULL, 'w'},
340         {"region", required_argument, NULL, 'r'},
341         {"help", no_argument, NULL, 'h'},
342         {"depth", required_argument, NULL, 'd'},
343         { NULL, 0, NULL, 0 }
344     };
345 
346     // parse the command line
347     int c;
348     opterr = 0;
349     while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
350         switch (c) {
351             case 1:
352                 if ((required_flags = bam_str2flag(optarg)) < 0) {
353                     fprintf(stderr,"Could not parse --rf %s\n", optarg); return EXIT_FAILURE;
354                 }; break;
355             case 2:
356                 if ((fail_flags = bam_str2flag(optarg)) < 0) {
357                     fprintf(stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE;
358                 }; break;
359             case 'o': opt_output_file = optarg; opt_full_width = false; break;
360             case 'l': opt_min_len = atoi(optarg); break;
361             case 'q': opt_min_mapQ = atoi(optarg); break;
362             case 'Q': opt_min_baseQ = atoi(optarg); break;
363             case 'd': max_depth = atoi(optarg); break; // maximum coverage depth
364             case 'w': opt_n_bins = atoi(optarg); opt_full_width = false;
365                       opt_print_histogram = true; opt_print_tabular = false;
366                       break;
367             case 'r': opt_reg = optarg; break;   // parsing a region requires a BAM header (strdup unnecessary)
368             case 'b': opt_file_list = optarg; break;
369             case 'm': opt_print_histogram = true; opt_print_tabular = false; break;
370             case 'A': opt_full_utf = false;
371                       opt_print_histogram = true; opt_print_tabular = false;
372                       break;
373             case 'H': opt_print_header = false; break;
374             case 'h': return usage();
375             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
376                           /* else fall-through */
377             case '?':
378                 if (optopt != '?') {  // '-?' appeared on command line
379                     if (optopt) { // Bad short option
380                         print_error("coverage", "invalid option -- '%c'", optopt);
381                     } else { // Bad long option
382                         // Do our best.  There is no good solution to finding
383                         // out what the bad option was.
384                         // See, e.g. https://stackoverflow.com/questions/2723888/where-does-getopt-long-store-an-unrecognized-option
385                         if (optind > 0 && strncmp(argv[optind - 1], "--", 2) == 0) {
386                             print_error("coverage", "unrecognised option '%s'",
387                                         argv[optind - 1]);
388                         }
389                     }
390                 }
391                 return usage();
392         }
393     }
394     if (optind == argc && !opt_file_list)
395         return usage();
396 
397     // output file provided by user
398     if (opt_output_file != NULL && strcmp(opt_output_file,"-")!=0) {
399         file_out = fopen( opt_output_file, "w" );
400         if (file_out == NULL) {
401             print_error_errno("coverage", "Cannot open \"%s\" for writing.", opt_output_file);
402             return EXIT_FAILURE;
403         }
404     }
405 
406     if (opt_n_bins <= 0 || opt_full_width) {
407         // get number of columns of terminal
408         const char* env_columns = getenv("COLUMNS");
409         int columns = 0;
410         if (env_columns == NULL) {
411 #ifdef _WIN32
412             CONSOLE_SCREEN_BUFFER_INFO csbi;
413             if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) {
414                 columns = csbi.srWindow.Right - csbi.srWindow.Left + 1;
415             }
416 #elif defined TIOCGWINSZ
417             struct winsize w;
418             if (ioctl(2, TIOCGWINSZ, &w) == 0)
419                 columns = w.ws_col;
420 #endif
421         } else {
422             columns = atoi(env_columns); // atoi(NULL) returns 0
423         }
424 
425         if (columns > 60) {
426             opt_n_bins = columns - 40;
427         } else {
428             opt_n_bins = 40;
429         }
430     }
431 
432     // setvbuf(file_out, NULL, _IONBF, 0); //turn off buffering
433 
434     // Open all BAM files
435     if (opt_file_list) {
436         // Read file names from opt_file_list into argv, and record the number of files in n_bam_files
437         if (read_file_list(opt_file_list, &n_bam_files, &fn)) {
438             print_error_errno("coverage", "Cannot open file list \"%s\".", opt_file_list);
439             return EXIT_FAILURE;
440         }
441         argv = fn;
442         optind = 0;
443     } else {
444         n_bam_files = argc - optind; // the number of BAMs on the command line
445     }
446 
447     data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file
448     if (!data) {
449         print_error_errno("coverage", "Failed to allocate memory");
450         status = EXIT_FAILURE;
451         goto coverage_end;
452     }
453 
454     for (i = 0; i < n_bam_files; ++i) {
455         int rf;
456         data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t));
457         if (!data[i]) {
458             print_error_errno("coverage", "Failed to allocate memory");
459             status = EXIT_FAILURE;
460             goto coverage_end;
461         }
462         data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
463 
464         if (data[i]->fp == NULL) {
465             print_error_errno("coverage", "Could not open \"%s\"", argv[optind+i]);
466             status = EXIT_FAILURE;
467             goto coverage_end;
468         }
469         rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
470         if (opt_min_baseQ) rf |= SAM_QUAL;
471 
472         // Set CRAM options on file handle - returns 0 on success
473         if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
474             print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
475             status = EXIT_FAILURE;
476             goto coverage_end;
477         }
478         if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
479             print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
480             status = EXIT_FAILURE;
481             goto coverage_end;
482         }
483         data[i]->min_mapQ = opt_min_mapQ;            // set the mapQ filter
484         data[i]->min_len  = opt_min_len;             // set the qlen filter
485         data[i]->hdr = sam_hdr_read(data[i]->fp);    // read the BAM header
486         data[i]->fail_flags = fail_flags;
487         data[i]->required_flags = required_flags;
488         if (data[i]->hdr == NULL) {
489             print_error_errno("coverage", "Could not read header for \"%s\"", argv[optind+i]);
490             status = EXIT_FAILURE;
491             goto coverage_end;
492         }
493 
494         // Lookup region if specified
495         if (opt_reg) { // if a region is specified
496             hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]);  // load the index
497             if (idx == NULL) {
498                 print_error_errno("coverage", "Failed to load index for \"%s\"", argv[optind+i]);
499                 status = EXIT_FAILURE;
500                 goto coverage_end;
501             }
502             data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator
503             hts_idx_destroy(idx); // the index is not needed any more; free the memory
504             if (data[i]->iter == NULL) {
505                 print_error("coverage", "Failed to parse region \"%s\". Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]);
506                 status = EXIT_FAILURE;
507                 goto coverage_end;
508             }
509         }
510     }
511 
512     if (opt_print_tabular && opt_print_header)
513         fputs("#rname\tstartpos\tendpos\tnumreads\tcovbases\tcoverage\tmeandepth\tmeanbaseq\tmeanmapq\n", file_out);
514 
515     h = data[0]->hdr; // easy access to the header of the 1st BAM
516     int n_targets = sam_hdr_nref(h);
517     stats = calloc(n_targets, sizeof(stats_aux_t));
518     if (!stats) {
519         print_error_errno("coverage", "Failed to allocate memory");
520         status = EXIT_FAILURE;
521         goto coverage_end;
522     }
523 
524     int64_t n_bins = opt_n_bins;
525     if (opt_reg) {
526         stats_aux_t *s = stats + data[0]->iter->tid;
527         s->beg = data[0]->iter->beg; // and to the parsed region coordinates
528         s->end = data[0]->iter->end;
529         if (s->end == HTS_POS_MAX) {
530             s->end = sam_hdr_tid2len(h, data[0]->iter->tid);
531         }
532         if (opt_n_bins > s->end - s->beg) {
533             n_bins = s->end - s->beg;
534         }
535         s->bin_width = (s->end-s->beg) / (n_bins > 0 ? n_bins : 1);
536     }
537 
538     for (i=0; i<n_bam_files; i++)
539         data[i]->stats = stats;
540 
541     int64_t current_bin = 0;
542 
543     // the core multi-pileup loop
544     mplp = bam_mplp_init(n_bam_files, read_bam, (void**)data); // initialization
545     if (max_depth > 0)
546         bam_mplp_set_maxcnt(mplp, max_depth);  // set maximum coverage depth
547     else if (!max_depth)
548         bam_mplp_set_maxcnt(mplp, INT_MAX);
549 
550 
551     // Extra info for histogram and coverage counting
552     hist = (uint32_t*) calloc(opt_n_bins, sizeof(uint32_t));
553     n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM
554     plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
555     if (!hist || !n_plp || !plp) {
556         print_error_errno("coverage", "Failed to allocate memory");
557         status = EXIT_FAILURE;
558         goto coverage_end;
559     }
560     while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
561 
562         if (tid != old_tid) { // Next target sequence
563             if (old_tid >= 0) {
564                 if (opt_print_histogram) {
565                     print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
566                     fputc('\n', file_out);
567                 } else if (opt_print_tabular) {
568                     print_tabular_line(file_out, h, stats, old_tid);
569                 }
570 
571                 if (opt_print_histogram)
572                     memset(hist, 0, n_bins*sizeof(uint32_t));
573             }
574 
575             stats[tid].covered = true;
576             if (!opt_reg)
577                 stats[tid].end = sam_hdr_tid2len(h, tid);
578 
579             if (opt_print_histogram) {
580                 n_bins = opt_n_bins > stats[tid].end-stats[tid].beg? stats[tid].end-stats[tid].beg : opt_n_bins;
581                 stats[tid].bin_width = (stats[tid].end-stats[tid].beg) / n_bins;
582             }
583 
584             old_tid = tid;
585         }
586         if (pos < stats[tid].beg || pos >= stats[tid].end) continue; // out of range; skip
587         if (tid >= n_targets) continue;     // diff number of @SQ lines per file?
588 
589         if (opt_print_histogram) {
590             current_bin = (pos - stats[tid].beg) / stats[tid].bin_width;
591         }
592 
593         bool count_base = false;
594         for (i = 0; i < n_bam_files; ++i) { // base level filters have to go here
595             int depth_at_pos = n_plp[i];
596             for (j = 0; j < n_plp[i]; ++j) {
597                 const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know
598 
599                 if (p->is_del || p->is_refskip) --depth_at_pos; // having dels or refskips at tid:pos
600                 else if (p->qpos < p->b->core.l_qseq &&
601                         bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality
602                 else
603                     stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
604             }
605             if (depth_at_pos > 0) {
606                 count_base = true;
607                 stats[tid].summed_coverage += depth_at_pos;
608             }
609             // hist[current_bin] += depth_at_pos;  // Add counts to the histogram here to have one based on coverage
610             //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
611         }
612         if (count_base) {
613             stats[tid].n_covered_bases++;
614             if (opt_print_histogram && current_bin < n_bins)
615                 ++(hist[current_bin]); // Histogram based on breadth of coverage
616         }
617     }
618 
619     if (tid == -1 && opt_reg && *opt_reg != '*')
620         // Region specified but no data covering it.
621         tid = data[0]->iter->tid;
622 
623     if (tid < n_targets && tid >=0) {
624         if (opt_print_histogram) {
625             print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
626         } else if (opt_print_tabular) {
627             print_tabular_line(file_out, h, stats, tid);
628         }
629     }
630 
631 
632     if (!opt_reg && opt_print_tabular) {
633         for (i = 0; i < n_targets; ++i) {
634             if (!stats[i].covered) {
635                 stats[i].end = sam_hdr_tid2len(h, i);
636                 print_tabular_line(file_out, h, stats, i);
637             }
638         }
639     }
640 
641     if (ret < 0) status = EXIT_FAILURE;
642 
643 coverage_end:
644     if (n_plp) free(n_plp);
645     if (plp) free(plp);
646     if (mplp) bam_mplp_destroy(mplp);
647 
648     if (hist) free(hist);
649     if (stats) free(stats);
650 
651     // Close files and free data structures
652     if (!(file_out == stdout || fclose(file_out) == 0)) {
653         if (status == EXIT_SUCCESS) {
654             print_error_errno("coverage", "error on closing \"%s\"",
655                     (opt_output_file && strcmp(opt_output_file, "-") != 0?
656                      opt_output_file : "stdout"));
657             status = EXIT_FAILURE;
658         }
659     }
660 
661     if (data) {
662         for (i = 0; i < n_bam_files && data[i]; ++i) {
663             sam_hdr_destroy(data[i]->hdr);
664             if (data[i]->fp) sam_close(data[i]->fp);
665             hts_itr_destroy(data[i]->iter);
666             free(data[i]);
667         }
668         free(data);
669     }
670 
671     if (opt_file_list && fn) {
672         for (i = 0; i < n_bam_files; ++i)
673             free(fn[i]);
674         free(fn);
675     }
676     sam_global_args_free(&ga);
677 
678     return status;
679 }
680 
681 #ifdef _MAIN_BAMCOV
main(int argc,char * argv[])682 int main(int argc, char *argv[]) {
683     return main_coverage(argc, argv);
684 }
685 #endif
686