1 /*  mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
2 
3     Copyright (C) 2008-2021 Genome Research Ltd.
4     Portions copyright (C) 2009-2012 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <math.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <unistd.h>
30 #include <ctype.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <limits.h>
34 #include <inttypes.h>
35 #include <errno.h>
36 #include <sys/stat.h>
37 #include <getopt.h>
38 #include <htslib/sam.h>
39 #include <htslib/faidx.h>
40 #include <htslib/kstring.h>
41 #include <htslib/khash_str2int.h>
42 #include <htslib/hts_os.h>
43 #include <assert.h>
44 #include "regidx.h"
45 #include "bcftools.h"
46 #include "bam2bcf.h"
47 #include "bam_sample.h"
48 #include "gvcf.h"
49 
50 #define MPLP_BCF        1
51 #define MPLP_VCF        (1<<1)
52 #define MPLP_NO_COMP    (1<<2)
53 #define MPLP_NO_ORPHAN  (1<<3)
54 #define MPLP_REALN      (1<<4)
55 #define MPLP_NO_INDEL   (1<<5)
56 #define MPLP_REDO_BAQ   (1<<6)
57 #define MPLP_ILLUMINA13 (1<<7)
58 #define MPLP_IGNORE_RG  (1<<8)
59 #define MPLP_PRINT_POS  (1<<9)
60 #define MPLP_PRINT_MAPQ (1<<10)
61 #define MPLP_PER_SAMPLE (1<<11)
62 #define MPLP_SMART_OVERLAPS (1<<12)
63 #define MPLP_REALN_PARTIAL  (1<<13)
64 
65 typedef struct _mplp_aux_t mplp_aux_t;
66 typedef struct _mplp_pileup_t mplp_pileup_t;
67 
68 // Data shared by all bam files
69 typedef struct {
70     int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
71         max_indel_depth, max_read_len, fmt_flag, ambig_reads;
72     int rflag_require, rflag_filter, output_type;
73     int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
74     double min_frac; // for indels
75     double indel_bias;
76     char *reg_fname, *pl_list, *fai_fname, *output_fname;
77     int reg_is_file, record_cmd_line, n_threads, clevel;
78     faidx_t *fai;
79     regidx_t *bed, *reg;    // bed: skipping regions, reg: index-jump to regions
80     regitr_t *bed_itr, *reg_itr;
81     int bed_logic;          // 1: include region, 0: exclude region
82     gvcf_t *gvcf;
83 
84     // auxiliary structures for calling
85     bcf_callaux_t *bca;
86     bcf_callret1_t *bcr;
87     bcf_call_t bc;
88     bam_mplp_t iter;
89     mplp_aux_t **mplp_data;
90     int nfiles;
91     char **files;
92     mplp_pileup_t *gplp;
93     int *n_plp;
94     const bam_pileup1_t **plp;
95     bam_smpl_t *bsmpl;
96     kstring_t buf;
97     bcf1_t *bcf_rec;
98     htsFile *bcf_fp;
99     bcf_hdr_t *bcf_hdr;
100     int argc;
101     char **argv;
102 } mplp_conf_t;
103 
104 typedef struct {
105     char *ref[2];
106     int ref_id[2];
107     int ref_len[2];
108 } mplp_ref_t;
109 
110 #define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}}
111 
112 // Data specific to each bam file
113 struct _mplp_aux_t {
114     samFile *fp;
115     hts_itr_t *iter;
116     bam_hdr_t *h;
117     mplp_ref_t *ref;
118     const mplp_conf_t *conf;
119     int bam_id;
120     hts_idx_t *idx;     // maintained only with more than one -r regions
121 };
122 
123 // Data passed to htslib/mpileup
124 struct _mplp_pileup_t {
125     int n;
126     int *n_plp, *m_plp;
127     bam_pileup1_t **plp;
128 };
129 
mplp_get_ref(mplp_aux_t * ma,int tid,char ** ref,int * ref_len)130 static int mplp_get_ref(mplp_aux_t *ma, int tid,  char **ref, int *ref_len) {
131     mplp_ref_t *r = ma->ref;
132 
133     //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);
134 
135     if (!r || !ma->conf->fai) {
136         *ref = NULL;
137         return 0;
138     }
139 
140     // Do we need to reference count this so multiple mplp_aux_t can
141     // track which references are in use?
142     // For now we just cache the last two. Sufficient?
143     if (tid == r->ref_id[0]) {
144         *ref = r->ref[0];
145         *ref_len = r->ref_len[0];
146         return 1;
147     }
148     if (tid == r->ref_id[1]) {
149         // Last, swap over
150         int tmp;
151         tmp = r->ref_id[0];  r->ref_id[0]  = r->ref_id[1];  r->ref_id[1]  = tmp;
152         tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
153 
154         char *tc;
155         tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
156         *ref = r->ref[0];
157         *ref_len = r->ref_len[0];
158         return 1;
159     }
160 
161     // New, so migrate to old and load new
162     free(r->ref[1]);
163     r->ref[1]     = r->ref[0];
164     r->ref_id[1]  = r->ref_id[0];
165     r->ref_len[1] = r->ref_len[0];
166 
167     r->ref_id[0] = tid;
168     r->ref[0] = faidx_fetch_seq(ma->conf->fai,
169                                 ma->h->target_name[r->ref_id[0]],
170                                 0,
171                                 INT_MAX,
172                                 &r->ref_len[0]);
173 
174     if (!r->ref[0]) {
175         r->ref[0] = NULL;
176         r->ref_id[0] = -1;
177         r->ref_len[0] = 0;
178         *ref = NULL;
179         return 0;
180     }
181 
182     *ref = r->ref[0];
183     *ref_len = r->ref_len[0];
184     return 1;
185 }
186 
mplp_func(void * data,bam1_t * b)187 static int mplp_func(void *data, bam1_t *b)
188 {
189     char *ref;
190     mplp_aux_t *ma = (mplp_aux_t*)data;
191     int ret, ref_len;
192     while (1)
193     {
194         int has_ref;
195         ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
196         if (ret < 0) break;
197         // The 'B' cigar operation is not part of the specification, considering as obsolete.
198         //  bam_remove_B(b);
199         if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads
200         if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue;
201         if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue;
202         if (ma->conf->bed)
203         {
204             // test overlap
205             regitr_t *itr = ma->conf->bed_itr;
206             int beg = b->core.pos, end = bam_endpos(b)-1;
207             int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr);
208             if ( !ma->conf->bed_logic && !overlap )
209             {
210                 // exclude only reads which are fully contained in the region
211                 while ( regitr_overlap(itr) )
212                 {
213                     if ( beg < itr->beg ) { overlap = 1; break; }
214                     if ( end > itr->end ) { overlap = 1; break; }
215                 }
216             }
217             if ( !overlap ) continue;
218         }
219         if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue;
220         if (ma->conf->flag & MPLP_ILLUMINA13) {
221             int i;
222             uint8_t *qual = bam_get_qual(b);
223             for (i = 0; i < b->core.l_qseq; ++i)
224                 qual[i] = qual[i] > 31? qual[i] - 31 : 0;
225         }
226 
227         if (ma->conf->fai && b->core.tid >= 0) {
228             has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
229             if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
230                 fprintf(stderr,"[%s] Skipping because %"PRId64" is outside of %d [ref:%d]\n",
231                         __func__, (int64_t) b->core.pos, ref_len, b->core.tid);
232                 continue;
233             }
234         } else {
235             has_ref = 0;
236         }
237 
238         // Allow sufficient room for bam_aux_append of ZQ tag without
239         // a realloc and consequent breakage of pileup's cached pointers.
240         if (has_ref && (ma->conf->flag &MPLP_REALN) && !bam_aux_get(b, "ZQ")) {
241             // Doing sam_prob_realn later is problematic as it adds to
242             // the tag list (ZQ or BQ), which causes a realloc of b->data.
243             // This happens after pileup has built a hash table on the
244             // read name.  It's a deficiency in pileup IMO.
245 
246             // We could implement a new sam_prob_realn that returns ZQ
247             // somewhere else and cache it ourselves (pileup clientdata),
248             // but for now we simply use a workaround.
249             //
250             // We create a fake tag of the correct length, which we remove
251             // just prior calling sam_prob_realn so we can guarantee there is
252             // room. (We can't just make room now as bam_copy1 removes it
253             // again).
254             if (b->core.l_qseq > 500) {
255                 uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1);
256                 memset(ZQ, '@', b->core.l_qseq);
257                 ZQ[b->core.l_qseq] = 0;
258                 bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
259                 free(ZQ);
260             } else {
261                 static uint8_t ZQ[501] =
262                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
263                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
264                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
265                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
266                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
267                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
268                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
269                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
270                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
271                     "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
272                 ZQ[b->core.l_qseq] = 0;
273                 bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
274                 ZQ[b->core.l_qseq] = '@';
275             }
276         }
277 
278         if (has_ref && ma->conf->capQ_thres > 10) {
279             int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
280             if (q < 0) continue;    // skip
281             else if (b->core.qual > q) b->core.qual = q;
282         }
283         if (b->core.qual < ma->conf->min_mq) continue;
284         else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue;
285 
286         return ret;
287     };
288     return ret;
289 }
290 
291 // Called once per new bam added to the pileup.
292 // We cache sample information here so we don't have to keep recomputing this
293 // on each and every pileup column. If FMT/SCR annotation is requested, a flag
294 // is set to indicate the presence of a soft clip.
295 //
296 // Cd is an arbitrary block of data we can write into, which ends up in
297 // the pileup structures. We stash the sample ID there:
298 //      has_soft_clip .. cd->i & 1
299 //      sample_id     .. cd->i >> 1
pileup_constructor(void * data,const bam1_t * b,bam_pileup_cd * cd)300 static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
301 {
302     mplp_aux_t *ma = (mplp_aux_t *)data;
303     int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
304     cd->i = 0;
305     PLP_SET_SAMPLE_ID(cd->i, n);
306     // Whether read has a soft-clip is used in mplp_realn's heuristics.
307     // TODO: consider whether clip length is beneficial to use?
308     int i;
309     for (i=0; i<b->core.n_cigar; i++) {
310         int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
311         if (cig == BAM_CSOFT_CLIP) {
312             PLP_SET_SOFT_CLIP(cd->i);
313             break;
314         }
315     }
316 
317     if (ma->conf->flag & MPLP_REALN) {
318         int i;
319         // int tot_ins = 0;
320         // int p = 0;
321         uint32_t *cigar = bam_get_cigar(b);
322         for (i=0; i<b->core.n_cigar; i++) {
323             int cig = cigar[i] & BAM_CIGAR_MASK;
324             // if (bam_cigar_type(cig) & 2)
325             //     p += cigar[i] >> BAM_CIGAR_SHIFT;
326             if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) {
327                 // tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
328                 // Possible further optimsation, check tot_ins==1 later
329                 // (and remove break) so we can detect single bp indels.
330                 // We may want to focus BAQ on more complex regions only.
331                 PLP_SET_INDEL(cd->i);
332                 break;
333             }
334 
335             // TODO: proper p->cd struct and have cd->i as a size rather
336             // than a flag.
337 
338             // Then aggregate together the sizes and if just 1 size for all
339             // reads or 2 sizes for approx 50/50 split in all reads, then
340             // treat this as a well-aligned variant and don't run BAQ.
341         }
342     }
343 
344     return 0;
345 }
346 
group_smpl(mplp_pileup_t * m,bam_smpl_t * bsmpl,int n,int * n_plp,const bam_pileup1_t ** plp)347 static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp)
348 {
349     int i, j;
350     memset(m->n_plp, 0, m->n * sizeof(int));
351     for (i = 0; i < n; ++i) // iterate over all bams
352     {
353         for (j = 0; j < n_plp[i]; ++j)  // iterate over all reads available at this position
354         {
355             const bam_pileup1_t *p = plp[i] + j;
356             int id = PLP_SAMPLE_ID(p->cd.i);
357             if (m->n_plp[id] == m->m_plp[id])
358             {
359                 m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
360                 m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
361             }
362             m->plp[id][m->n_plp[id]++] = *p;
363         }
364     }
365 }
366 
flush_bcf_records(mplp_conf_t * conf,htsFile * fp,bcf_hdr_t * hdr,bcf1_t * rec)367 static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec)
368 {
369     if ( !conf->gvcf )
370     {
371         if ( rec && bcf_write1(fp, hdr, rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output");
372         return;
373     }
374 
375     if ( !rec )
376     {
377         gvcf_write(conf->gvcf, fp, hdr, NULL, 0);
378         return;
379     }
380 
381     int is_ref = 0;
382     if ( rec->n_allele==1 ) is_ref = 1;
383     else if ( rec->n_allele==2 )
384     {
385         // second allele is mpileup's X, not a variant
386         if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1;
387     }
388     rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref);
389     if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output");
390 }
391 
392 /*
393  * Loops for an indel at this position.
394  *
395  * Only reads that overlap an indel loci get realigned.  This considerably
396  * reduces the cost of running BAQ while keeping the main benefits.
397  *
398  * TODO: also consider only realigning reads that don't span the indel
399  * by more than a certain amount either-side.  Ie focus BAQ only on reads
400  * ending adjacent to the indel, where the alignment is most likely to
401  * be wrong.  (2nd TODO: do this based on sequence context; STRs bad, unique
402  * data good.)
403  *
404  * NB: this may sadly realign after we've already used the data.  Hmm...
405  */
mplp_realn(int n,int * n_plp,const bam_pileup1_t ** plp,int flag,int max_read_len,char * ref,int ref_len,int pos)406 static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
407                        int flag, int max_read_len,
408                        char *ref, int ref_len, int pos) {
409     int i, j, has_indel = 0, has_clip = 0, nt = 0;
410     int min_indel = INT_MAX, max_indel = INT_MIN;
411 
412     // Is an indel present.
413     // NB: don't bother even checking if very long as almost guaranteed
414     // to have indel (and likely soft-clips too).
415     for (i = 0; i < n; i++) { // iterate over bams
416         nt += n_plp[i];
417         for (j = 0; j < n_plp[i]; j++) { // iterate over reads
418             bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
419             has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0;
420             // Has_clip is almost always true for very long reads
421             // (eg PacBio CCS), but these rarely matter as the clip
422             // is likely a long way from this indel.
423             has_clip  += (PLP_HAS_SOFT_CLIP(p->cd.i))         ? 1 : 0;
424             if (max_indel < p->indel)
425                 max_indel = p->indel;
426             if (min_indel > p->indel)
427                 min_indel = p->indel;
428         }
429     }
430 
431     if (flag & MPLP_REALN_PARTIAL) {
432         if (has_indel == 0 ||
433             (has_clip < 0.2*nt && max_indel == min_indel &&
434              (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1)))
435             return;
436     }
437 
438     // Realign
439     for (i = 0; i < n; i++) { // iterate over bams
440         for (j = 0; j < n_plp[i]; j++) { // iterate over reads
441             const bam_pileup1_t *p = plp[i] + j;
442             bam1_t *b = p->b;
443 
444             // Avoid doing multiple times.
445             //
446             // Note we cannot modify p->cd.i here with a PLP_SET macro
447             // because the cd item is held by mpileup in an lbnode_t
448             // struct and copied over to the pileup struct for each
449             // iteration, essentially making p->cd.i read only.
450             //
451             // We could use our own structure (p->cd.p), allocated during
452             // the constructor, but for simplicity we play dirty and
453             // abuse an unused flag bit instead.
454             if (b->core.flag & 32768)
455                 continue;
456             b->core.flag |= 32768;
457 
458             if (b->core.l_qseq > max_read_len)
459                 continue;
460 
461             // Check p->cigar_ind and see what cigar elements are before
462             // and after.  How close is this location to the end of the
463             // read?  Only realign if we don't span by more than X bases.
464             //
465             // Again, best only done on deeper data as BAQ helps
466             // disproportionately more on shallow data sets.
467             //
468             // This rescues some of the false negatives that are caused by
469             // systematic reduction in quality due to sample vs ref alignment.
470 
471 // At deep coverage we skip realigning more reads as we have sufficient depth.
472 // This rescues for false negatives.  At shallow depth we pay for this with
473 // more FP so are more stringent on spanning size.
474 #define REALN_DIST (40+10*(nt<40)+10*(nt<20))
475             uint32_t *cig = bam_get_cigar(b);
476             int ncig = b->core.n_cigar;
477 
478             // Don't realign reads where indel is in middle?
479             // On long read data we don't care about soft-clips at the ends.
480             // For short read data, we always calc BAQ on these as they're
481             // a common source of false positives.
482             if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
483                 // Left & right cigar op match.
484                 int lr = b->core.l_qseq > 500;
485                 int lm = 0, rm = 0, k;
486                 for (k = 0; k < ncig; k++) {
487                     int cop = bam_cigar_op(cig[k]);
488                     if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
489                         continue;
490 
491                     if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
492                         cop == BAM_CEQUAL)
493                         lm += bam_cigar_oplen(cig[k]);
494                     else
495                         break;
496                 }
497 
498                 for (k = ncig-1; k >= 0; k--) {
499                     int cop = bam_cigar_op(cig[k]);
500                     if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
501                         continue;
502 
503                     if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
504                         cop == BAM_CEQUAL)
505                         rm += bam_cigar_oplen(cig[k]);
506                     else
507                         break;
508                 }
509 
510                 if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
511                     continue;
512 
513                 if (lm >= REALN_DIST && rm >= REALN_DIST &&
514                     has_clip < (0.15+0.05*(nt>20))*nt)
515                     continue;
516             }
517 
518             if (b->core.l_qseq > 500) {
519                 // don't do BAQ on long-read data if it's going to
520                 // cause us to have a large band-with and costly in CPU
521                 int rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
522                 if (abs(rl - b->core.l_qseq) * b->core.l_qseq >= 500000)
523                     continue;
524             }
525 
526             // Fudge: make room for ZQ tag.
527             uint8_t *_Q = bam_aux_get(b, "_Q");
528             if (_Q) bam_aux_del(b, _Q);
529             sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3);
530         }
531     }
532 
533     return;
534 }
535 
mpileup_reg(mplp_conf_t * conf,uint32_t beg,uint32_t end)536 static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
537 {
538     bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
539 
540     int ret, i, tid, pos, ref_len;
541     char *ref;
542 
543     while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
544     {
545         if ( pos<beg || pos>end ) continue;
546         if ( conf->bed && tid >= 0 )
547         {
548             int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL);
549             if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
550             if ( !overlap ) continue;
551         }
552         int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
553         if (has_ref && (conf->flag & MPLP_REALN))
554             mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag,
555                        conf->max_read_len, ref, ref_len, pos);
556 
557         int total_depth, _ref0, ref16;
558         for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
559         group_smpl(conf->gplp, conf->bsmpl, conf->nfiles, conf->n_plp, conf->plp);
560         _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
561         ref16 = seq_nt16_table[_ref0];
562         bcf_callaux_clean(conf->bca, &conf->bc);
563         for (i = 0; i < conf->gplp->n; ++i)
564             bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], ref16, conf->bca, conf->bcr + i);
565         conf->bc.tid = tid; conf->bc.pos = pos;
566         bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
567         bcf_clear1(conf->bcf_rec);
568         bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
569                      conf->bca, 0);
570         flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
571 
572         // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
573         // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
574         if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
575             && (bcf_callaux_clean(conf->bca, &conf->bc),
576                 bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0))
577         {
578             for (i = 0; i < conf->gplp->n; ++i)
579                 bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
580             if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
581             {
582                 bcf_clear1(conf->bcf_rec);
583                 bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
584                 flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
585             }
586         }
587     }
588     return 0;
589 }
590 
mpileup(mplp_conf_t * conf)591 static int mpileup(mplp_conf_t *conf)
592 {
593     if (conf->nfiles == 0) {
594         fprintf(stderr,"[%s] no input file/data given\n", __func__);
595         exit(EXIT_FAILURE);
596     }
597 
598     mplp_ref_t mp_ref = MPLP_REF_INIT;
599     conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
600     conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
601     conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
602     conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));
603 
604     // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
605     // must be kept in the memory for the whole time which can be a problem with many bams.
606     // Therefore if none or only one region is requested, we initialize the bam iterator as
607     // before and free the index. Only when multiple regions are queried, we keep the index.
608     int nregs = 0;
609     if ( conf->reg_fname )
610     {
611         if ( conf->reg_is_file )
612         {
613             conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
614             if ( !conf->reg ) {
615                 fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
616                 exit(EXIT_FAILURE);
617             }
618         }
619         else
620         {
621             conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
622             if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
623                 fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
624                 exit(EXIT_FAILURE);
625             }
626         }
627         nregs = regidx_nregs(conf->reg);
628         conf->reg_itr = regitr_init(conf->reg);
629         regitr_loop(conf->reg_itr);   // region iterator now positioned at the first region
630     }
631 
632     // read the header of each file in the list and initialize data
633     // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
634     bam_hdr_t *hdr = NULL;      // header of first file in input list
635     int i;
636     for (i = 0; i < conf->nfiles; ++i) {
637         bam_hdr_t *h_tmp;
638         conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
639         conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
640         if ( !conf->mplp_data[i]->fp )
641         {
642             fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
643             exit(EXIT_FAILURE);
644         }
645         if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
646             fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
647             exit(EXIT_FAILURE);
648         }
649         if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
650             fprintf(stderr, "[%s] failed to process %s: %s\n",
651                     __func__, conf->fai_fname, strerror(errno));
652             exit(EXIT_FAILURE);
653         }
654         conf->mplp_data[i]->conf = conf;
655         conf->mplp_data[i]->ref = &mp_ref;
656         h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
657         if ( !h_tmp ) {
658             fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
659             exit(EXIT_FAILURE);
660         }
661         conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
662         conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
663         if ( conf->mplp_data[i]->bam_id<0 )
664         {
665             // no usable readgroups in this bam, it can be skipped
666             sam_close(conf->mplp_data[i]->fp);
667             free(conf->mplp_data[i]);
668             bam_hdr_destroy(h_tmp);
669             free(conf->files[i]);
670             if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
671             conf->nfiles--;
672             i--;
673             continue;
674         }
675         if (conf->reg) {
676             hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
677             if (idx == NULL) {
678                 fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
679                 exit(EXIT_FAILURE);
680             }
681             conf->buf.l = 0;
682             ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
683             conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
684             if ( !conf->mplp_data[i]->iter )
685             {
686                 conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
687                 if ( conf->mplp_data[i]->iter ) {
688                     fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
689                     exit(EXIT_FAILURE);
690                 }
691                 fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
692                 exit(EXIT_FAILURE);
693             }
694             if ( nregs==1 ) // no need to keep the index in memory
695                hts_idx_destroy(idx);
696             else
697                 conf->mplp_data[i]->idx = idx;
698         }
699 
700         if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
701         else {
702             // FIXME: check consistency between h and h_tmp
703             bam_hdr_destroy(h_tmp);
704 
705             // we store only the first file's header; it's (alleged to be)
706             // compatible with the i-th file's target_name lookup needs
707             conf->mplp_data[i]->h = hdr;
708         }
709     }
710     if ( !hdr ) {
711         fprintf(stderr, "[%s] failed to find a file header with usable read groups\n", __func__);
712         exit(EXIT_FAILURE);
713     }
714     // allocate data storage proportionate to number of samples being studied sm->n
715     bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
716     conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
717     conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
718     conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
719 
720     fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
721     // write the VCF header
722     char wmode[8];
723     set_wmode(wmode,conf->output_type,conf->output_fname,conf->clevel);
724     conf->bcf_fp = hts_open(conf->output_fname ? conf->output_fname : "-", wmode);
725     if (conf->bcf_fp == NULL) {
726         fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
727         exit(EXIT_FAILURE);
728     }
729     if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);
730 
731     // BCF header creation
732     conf->bcf_hdr = bcf_hdr_init("w");
733     conf->buf.l = 0;
734 
735     if (conf->record_cmd_line)
736     {
737         ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
738         bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
739 
740         conf->buf.l = 0;
741         ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
742         for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
743         kputc('\n', &conf->buf);
744         bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
745     }
746 
747     if (conf->fai_fname)
748     {
749         conf->buf.l = 0;
750         ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
751         bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
752     }
753 
754     // Translate BAM @SQ tags to BCF ##contig tags
755     // todo: use/write new BAM header manipulation routines, fill also UR, M5
756     for (i=0; i<hdr->n_targets; i++)
757     {
758         conf->buf.l = 0;
759         ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
760         bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
761     }
762     conf->buf.l = 0;
763 
764     bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
765     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
766     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of raw reads supporting an indel\">");
767     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of raw reads supporting an indel\">");
768     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
769     if ( conf->fmt_flag&B2B_INFO_VDB )
770         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
771 
772     if (conf->fmt_flag & B2B_INFO_ZSCORE) {
773         if ( conf->fmt_flag&B2B_INFO_RPB )
774             bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
775         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
776         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
777         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
778         if ( conf->fmt_flag&B2B_INFO_SCB )
779             bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
780     } else {
781         if ( conf->fmt_flag&B2B_INFO_RPB )
782             bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
783         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
784         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
785         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
786     }
787 
788     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
789 #if CDF_MWU_TESTS
790     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
791     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
792     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
793     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
794 #endif
795     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
796     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
797     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
798     bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
799     bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
800     if ( conf->fmt_flag&B2B_FMT_DP )
801         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
802     if ( conf->fmt_flag&B2B_FMT_DV )
803         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
804     if ( conf->fmt_flag&B2B_FMT_DPR )
805         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
806     if ( conf->fmt_flag&B2B_INFO_DPR )
807         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
808     if ( conf->fmt_flag&B2B_FMT_DP4 )
809         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
810     if ( conf->fmt_flag&B2B_FMT_SP )
811         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
812     if ( conf->fmt_flag&B2B_FMT_AD )
813         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths (high-quality bases)\">");
814     if ( conf->fmt_flag&B2B_FMT_ADF )
815         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand (high-quality bases)\">");
816     if ( conf->fmt_flag&B2B_FMT_ADR )
817         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand (high-quality bases)\">");
818     if ( conf->fmt_flag&B2B_FMT_QS )
819         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=QS,Number=R,Type=Integer,Description=\"Phred-score allele quality sum used by `call -mG` and `+trio-dnm`\">");
820     if ( conf->fmt_flag&B2B_INFO_AD )
821         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths (high-quality bases)\">");
822     if ( conf->fmt_flag&B2B_INFO_ADF )
823         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand (high-quality bases)\">");
824     if ( conf->fmt_flag&B2B_INFO_SCR )
825         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCR,Number=1,Type=Integer,Description=\"Number of soft-clipped reads (at high-quality bases)\">");
826     if ( conf->fmt_flag&B2B_FMT_SCR )
827         bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SCR,Number=1,Type=Integer,Description=\"Per-sample number of soft-clipped reads (at high-quality bases)\">");
828     if ( conf->fmt_flag&B2B_INFO_ADR )
829         bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand (high-quality bases)\">");
830     if ( conf->gvcf )
831         gvcf_update_header(conf->gvcf, conf->bcf_hdr);
832 
833     int nsmpl;
834     const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
835     for (i=0; i<nsmpl; i++)
836         bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
837     if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
838 
839     conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
840                               conf->delta_baseQ);
841     conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
842     conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
843     conf->bca->indel_bias = conf->indel_bias;
844     conf->bca->min_frac = conf->min_frac;
845     conf->bca->min_support = conf->min_support;
846     conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
847     conf->bca->fmt_flag = conf->fmt_flag;
848     conf->bca->ambig_reads = conf->ambig_reads;
849     conf->bca->indel_win_size = conf->indel_win_size;
850 
851     conf->bc.bcf_hdr = conf->bcf_hdr;
852     conf->bc.n  = nsmpl;
853     conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
854     conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES);
855     for (i=0; i<nsmpl; i++)
856         conf->bcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES;
857     if (conf->fmt_flag)
858     {
859         assert( sizeof(float)==sizeof(int32_t) );
860         conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
861         conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
862         if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
863         {
864             // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
865             conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
866             conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
867             for (i=0; i<nsmpl; i++)
868             {
869                 conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
870                 conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
871             }
872         }
873         if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) )
874             conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR));
875     }
876 
877     // init mpileup
878     conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
879     if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
880     fprintf(stderr, "[%s] maximum number of reads per input file set to -d %d\n",  __func__, conf->max_depth);
881     if ( (double)conf->max_depth * conf->nfiles > 1<<20)
882         fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
883     if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
884         fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
885     bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
886     conf->max_indel_depth = conf->max_indel_depth * nsmpl;
887     conf->bcf_rec = bcf_init1();
888     bam_mplp_constructor(conf->iter, pileup_constructor);
889 
890     // Run mpileup for multiple regions
891     if ( nregs )
892     {
893         int ireg = 0;
894         do
895         {
896             // first region is already positioned
897             if ( ireg++ > 0 )
898             {
899                 conf->buf.l = 0;
900                 ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
901 
902                 for (i=0; i<conf->nfiles; i++)
903                 {
904                     hts_itr_destroy(conf->mplp_data[i]->iter);
905                     conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
906                     if ( !conf->mplp_data[i]->iter )
907                     {
908                         conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
909                         if ( conf->mplp_data[i]->iter ) {
910                             fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
911                             exit(EXIT_FAILURE);
912                         }
913                         fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
914                         exit(EXIT_FAILURE);
915                     }
916                     bam_mplp_reset(conf->iter);
917                 }
918             }
919             mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
920         }
921         while ( regitr_loop(conf->reg_itr) );
922     }
923     else
924         mpileup_reg(conf,0,UINT32_MAX);
925 
926     flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);
927 
928     // clean up
929     free(conf->bc.tmp.s);
930     bcf_destroy1(conf->bcf_rec);
931     if (conf->bcf_fp)
932     {
933         if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
934         bcf_hdr_destroy(conf->bcf_hdr);
935         bcf_call_destroy(conf->bca);
936         free(conf->bc.PL);
937         free(conf->bc.DP4);
938         free(conf->bc.ADR);
939         free(conf->bc.ADF);
940         free(conf->bc.SCR);
941         free(conf->bc.QS);
942         free(conf->bc.fmt_arr);
943         free(conf->bcr);
944     }
945     if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
946     free(conf->buf.s);
947     for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
948     free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp);
949     bam_mplp_destroy(conf->iter);
950     bam_hdr_destroy(hdr);
951     for (i = 0; i < conf->nfiles; ++i) {
952         if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
953         sam_close(conf->mplp_data[i]->fp);
954         if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
955         free(conf->mplp_data[i]);
956     }
957     if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
958     free(conf->mplp_data); free(conf->plp); free(conf->n_plp);
959     free(mp_ref.ref[0]);
960     free(mp_ref.ref[1]);
961     return 0;
962 }
963 
is_url(const char * s)964 static int is_url(const char *s)
965 {
966     static const char uri_scheme_chars[] =
967         "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
968     return s[strspn(s, uri_scheme_chars)] == ':';
969 }
970 
971 #define MAX_PATH_LEN 1024
read_file_list(const char * file_list,int * n,char ** argv[])972 int read_file_list(const char *file_list,int *n,char **argv[])
973 {
974     char buf[MAX_PATH_LEN];
975     int len, nfiles = 0;
976     char **files = NULL;
977     struct stat sb;
978 
979     *n = 0;
980     *argv = NULL;
981 
982     FILE *fh = fopen(file_list,"r");
983     if ( !fh )
984     {
985         fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
986         return 1;
987     }
988 
989     files = (char**) calloc(nfiles,sizeof(char*));
990     nfiles = 0;
991     while ( fgets(buf,MAX_PATH_LEN,fh) )
992     {
993         // allow empty lines and trailing spaces
994         len = strlen(buf);
995         while ( len>0 && isspace(buf[len-1]) ) len--;
996         if ( !len ) continue;
997 
998         // check sanity of the file list
999         buf[len] = 0;
1000         if (! (is_url(buf) || stat(buf, &sb) == 0))
1001         {
1002             // no such file, check if it is safe to print its name
1003             int i, safe_to_print = 1;
1004             for (i=0; i<len; i++)
1005                 if (!isprint(buf[i])) { safe_to_print = 0; break; }
1006             if ( safe_to_print )
1007                 fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
1008             else
1009                 fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
1010             return 1;
1011         }
1012 
1013         nfiles++;
1014         files = (char**) realloc(files,nfiles*sizeof(char*));
1015         files[nfiles-1] = strdup(buf);
1016     }
1017     if ( fclose(fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,file_list);
1018     if ( !nfiles )
1019     {
1020         fprintf(stderr,"No files read from %s\n", file_list);
1021         return 1;
1022     }
1023     *argv = files;
1024     *n    = nfiles;
1025     return 0;
1026 }
1027 #undef MAX_PATH_LEN
1028 
parse_format_flag(const char * str)1029 int parse_format_flag(const char *str)
1030 {
1031     int i, flag = 0, n_tags;
1032     char **tags = hts_readlist(str, 0, &n_tags);
1033     for(i=0; i<n_tags; i++)
1034     {
1035         if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
1036         else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
1037         else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
1038         else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
1039         else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
1040         else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
1041         else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
1042         else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
1043         else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
1044         else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
1045         else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
1046         else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
1047         else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
1048         else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
1049         else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
1050         else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
1051         else
1052         {
1053             fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
1054             exit(EXIT_FAILURE);
1055         }
1056         free(tags[i]);
1057     }
1058     if (n_tags) free(tags);
1059     return flag;
1060 }
1061 
1062 // todo: make it possible to turn off some annotations or change the defaults,
1063 //      specifically RPB, VDB, MWU, SGB tests. It would be good to do some
1064 //      benchmarking first to see if it's worth it.
list_annotations(FILE * fp)1065 static void list_annotations(FILE *fp)
1066 {
1067     fprintf(fp,
1068 "\n"
1069 "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
1070 "\n"
1071 "  FORMAT/AD  .. Allelic depth (Number=R,Type=Integer)\n"
1072 "  FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
1073 "  FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
1074 "  FORMAT/DP  .. Number of high-quality bases (Number=1,Type=Integer)\n"
1075 "  FORMAT/QS  .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
1076 "  FORMAT/SP  .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
1077 "  FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
1078 "\n"
1079 "INFO annotation tags available:\n"
1080 "\n"
1081 "  INFO/AD  .. Total allelic depth (Number=R,Type=Integer)\n"
1082 "  INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
1083 "  INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
1084 "  INFO/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
1085 "\n");
1086 }
1087 
print_usage(FILE * fp,const mplp_conf_t * mplp)1088 static void print_usage(FILE *fp, const mplp_conf_t *mplp)
1089 {
1090     char *tmp_require = bam_flag2str(mplp->rflag_require);
1091     char *tmp_filter  = bam_flag2str(mplp->rflag_filter);
1092 
1093     // Display usage information, formatted for the standard 80 columns.
1094     // (The unusual string formatting here aids the readability of this
1095     // source code in 80 columns, to the extent that's possible.)
1096 
1097     fprintf(fp,
1098         "\n"
1099         "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
1100         "\n"
1101         "Input options:\n"
1102         "  -6, --illumina1.3+      Quality is in the Illumina-1.3+ encoding\n"
1103         "  -A, --count-orphans     Do not discard anomalous read pairs\n"
1104         "  -b, --bam-list FILE     List of input BAM filenames, one per line\n"
1105         "  -B, --no-BAQ            Disable BAQ (per-Base Alignment Quality)\n"
1106         "  -C, --adjust-MQ INT     Adjust mapping quality [0]\n"
1107         "  -D, --full-BAQ          Apply BAQ everywhere, not just in problematic regions\n"
1108         "  -d, --max-depth INT     Max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
1109             fprintf(fp,
1110         "  -E, --redo-BAQ          Recalculate BAQ on the fly, ignore existing BQs\n"
1111         "  -f, --fasta-ref FILE    Faidx indexed reference sequence file\n"
1112         "      --no-reference      Do not require fasta reference file\n"
1113         "  -G, --read-groups FILE  Select or exclude read groups listed in the file\n"
1114         "  -q, --min-MQ INT        Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
1115     fprintf(fp,
1116         "  -Q, --min-BQ INT        Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
1117     fprintf(fp,
1118         "      --max-BQ INT        Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
1119     fprintf(fp,
1120         "      --delta-BQ INT      Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ);
1121     fprintf(fp,
1122         "  -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n"
1123         "  -R, --regions-file FILE Restrict to regions listed in a file\n"
1124         "      --ignore-RG         Ignore RG tags (one BAM = one sample)\n"
1125         "  --rf, --incl-flags STR|INT  Required flags: skip reads with mask bits unset [%s]\n", tmp_require);
1126     fprintf(fp,
1127         "  --ff, --excl-flags STR|INT  Filter flags: skip reads with mask bits set\n"
1128         "                                            [%s]\n", tmp_filter);
1129     fprintf(fp,
1130         "  -s, --samples LIST      Comma separated list of samples to include\n"
1131         "  -S, --samples-file FILE File of samples to include\n"
1132         "  -t, --targets REG[,...] Similar to -r but streams rather than index-jumps\n"
1133         "  -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
1134         "  -x, --ignore-overlaps   Disable read-pair overlap detection\n"
1135         "      --seed INT          Random number seed used for sampling deep regions [0]\n"
1136         "\n"
1137         "Output options:\n"
1138         "  -a, --annotate LIST     Optional tags to output; '?' to list available tags []\n"
1139         "  -g, --gvcf INT[,...]    Group non-variant sites into gVCF blocks according\n"
1140         "                          To minimum per-sample DP\n"
1141         "      --no-version        Do not append version and command line to the header\n"
1142         "  -o, --output FILE       Write output to FILE [standard output]\n"
1143         "  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
1144         "                          'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
1145         "  -U, --mwu-u             Use older probability scale for Mann-Whitney U test\n"
1146         "      --threads INT       Use multithreading with INT worker threads [0]\n"
1147         "\n"
1148         "SNP/INDEL genotype likelihoods options:\n"
1149         "  -X, --config STR        Specify platform specific profiles (see below)\n"
1150         "  -e, --ext-prob INT      Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
1151     fprintf(fp,
1152         "  -F, --gap-frac FLOAT    Minimum fraction of gapped reads [%g]\n", mplp->min_frac);
1153     fprintf(fp,
1154         "  -h, --tandem-qual INT   Coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
1155     fprintf(fp,
1156         "  -I, --skip-indels       Do not perform indel calling\n"
1157         "  -L, --max-idepth INT    Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
1158     fprintf(fp,
1159         "  -m, --min-ireads INT    Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
1160     fprintf(fp,
1161         "  -M, --max-read-len INT  Maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
1162     fprintf(fp,
1163         "  -o, --open-prob INT     Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
1164     fprintf(fp,
1165         "  -p, --per-sample-mF     Apply -m and -F per-sample for increased sensitivity\n"
1166         "  -P, --platforms STR     Comma separated list of platforms for indels [all]\n"
1167         "  --ar, --ambig-reads STR   What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
1168     fprintf(fp,
1169         "      --indel-bias FLOAT  Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
1170     fprintf(fp,
1171         "      --indel-size INT    Approximate maximum indel size considered [%d]\n", mplp->indel_win_size);
1172     fprintf(fp,"\n");
1173     fprintf(fp,
1174         "Configuration profiles activated with -X, --config:\n"
1175         "    1.12:        -Q13 -h100 -m1 -F0.002\n"
1176         "    illumina:    [ default values ]\n"
1177         "    ont:         -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
1178         "    pacbio-ccs:  -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
1179         "\n"
1180         "Notes: Assuming diploid individuals.\n"
1181         "\n"
1182         "Example:\n"
1183         "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
1184         "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
1185         "\n");
1186 
1187     free(tmp_require);
1188     free(tmp_filter);
1189 }
1190 
main_mpileup(int argc,char * argv[])1191 int main_mpileup(int argc, char *argv[])
1192 {
1193     int c;
1194     const char *file_list = NULL;
1195     char **fn = NULL;
1196     int nfiles = 0, use_orphan = 0, noref = 0;
1197     mplp_conf_t mplp;
1198     memset(&mplp, 0, sizeof(mplp_conf_t));
1199     mplp.min_baseQ = 1;
1200     mplp.max_baseQ = 60;
1201     mplp.delta_baseQ = 30;
1202     mplp.capQ_thres = 0;
1203     mplp.max_depth = 250; mplp.max_indel_depth = 250;
1204     mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
1205     mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2;
1206     mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL
1207               | MPLP_SMART_OVERLAPS;
1208     mplp.argc = argc; mplp.argv = argv;
1209     mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
1210     mplp.output_fname = NULL;
1211     mplp.output_type = FT_VCF;
1212     mplp.record_cmd_line = 1;
1213     mplp.n_threads = 0;
1214     mplp.bsmpl = bam_smpl_init();
1215     // the default to be changed in future, see also parse_format_flag()
1216     mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
1217     mplp.max_read_len = 500;
1218     mplp.ambig_reads = B2B_DROP;
1219     mplp.indel_win_size = 110;
1220     mplp.clevel = -1;
1221     hts_srand48(0);
1222 
1223     static const struct option lopts[] =
1224     {
1225         {"rf", required_argument, NULL, 1},   // require flag
1226         {"ff", required_argument, NULL, 2},   // filter flag
1227         {"incl-flags", required_argument, NULL, 1},
1228         {"excl-flags", required_argument, NULL, 2},
1229         {"output", required_argument, NULL, 3},
1230         {"open-prob", required_argument, NULL, 4},
1231         {"ignore-RG", no_argument, NULL, 5},
1232         {"ignore-rg", no_argument, NULL, 5},
1233         {"gvcf", required_argument, NULL, 'g'},
1234         {"no-reference", no_argument, NULL, 7},
1235         {"no-version", no_argument, NULL, 8},
1236         {"threads",required_argument,NULL,9},
1237         {"illumina1.3+", no_argument, NULL, '6'},
1238         {"count-orphans", no_argument, NULL, 'A'},
1239         {"bam-list", required_argument, NULL, 'b'},
1240         {"no-BAQ", no_argument, NULL, 'B'},
1241         {"no-baq", no_argument, NULL, 'B'},
1242         {"full-BAQ", no_argument, NULL, 'D'},
1243         {"full-baq", no_argument, NULL, 'D'},
1244         {"adjust-MQ", required_argument, NULL, 'C'},
1245         {"adjust-mq", required_argument, NULL, 'C'},
1246         {"max-depth", required_argument, NULL, 'd'},
1247         {"redo-BAQ", no_argument, NULL, 'E'},
1248         {"redo-baq", no_argument, NULL, 'E'},
1249         {"fasta-ref", required_argument, NULL, 'f'},
1250         {"read-groups", required_argument, NULL, 'G'},
1251         {"region", required_argument, NULL, 'r'},
1252         {"regions", required_argument, NULL, 'r'},
1253         {"regions-file", required_argument, NULL, 'R'},
1254         {"targets", required_argument, NULL, 't'},
1255         {"targets-file", required_argument, NULL, 'T'},
1256         {"min-MQ", required_argument, NULL, 'q'},
1257         {"min-mq", required_argument, NULL, 'q'},
1258         {"min-BQ", required_argument, NULL, 'Q'},
1259         {"min-bq", required_argument, NULL, 'Q'},
1260         {"max-bq", required_argument, NULL, 11},
1261         {"max-BQ", required_argument, NULL, 11},
1262         {"delta-BQ", required_argument, NULL, 12},
1263         {"ignore-overlaps", no_argument, NULL, 'x'},
1264         {"output-type", required_argument, NULL, 'O'},
1265         {"samples", required_argument, NULL, 's'},
1266         {"samples-file", required_argument, NULL, 'S'},
1267         {"annotate", required_argument, NULL, 'a'},
1268         {"ext-prob", required_argument, NULL, 'e'},
1269         {"gap-frac", required_argument, NULL, 'F'},
1270         {"indel-bias", required_argument, NULL, 10},
1271         {"indel-size", required_argument, NULL, 15},
1272         {"tandem-qual", required_argument, NULL, 'h'},
1273         {"skip-indels", no_argument, NULL, 'I'},
1274         {"max-idepth", required_argument, NULL, 'L'},
1275         {"min-ireads", required_argument, NULL, 'm'},
1276         {"per-sample-mF", no_argument, NULL, 'p'},
1277         {"per-sample-mf", no_argument, NULL, 'p'},
1278         {"platforms", required_argument, NULL, 'P'},
1279         {"max-read-len", required_argument, NULL, 'M'},
1280         {"config", required_argument, NULL, 'X'},
1281         {"mwu-u", no_argument, NULL, 'U'},
1282         {"seed", required_argument, NULL, 13},
1283         {"ambig-reads", required_argument, NULL, 14},
1284         {"ar", required_argument, NULL, 14},
1285         {NULL, 0, NULL, 0}
1286     };
1287     while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
1288         switch (c) {
1289         case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
1290         case  1 :
1291             mplp.rflag_require = bam_str2flag(optarg);
1292             if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; }
1293             break;
1294         case  2 :
1295             mplp.rflag_filter = bam_str2flag(optarg);
1296             if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; }
1297             break;
1298         case  3 : mplp.output_fname = optarg; break;
1299         case  4 : mplp.openQ = atoi(optarg); break;
1300         case  5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break;
1301         case 'g':
1302             mplp.gvcf = gvcf_init(optarg);
1303             if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
1304             break;
1305         case 'f':
1306             mplp.fai = fai_load(optarg);
1307             if (mplp.fai == NULL) return 1;
1308             mplp.fai_fname = optarg;
1309             break;
1310         case  7 : noref = 1; break;
1311         case  8 : mplp.record_cmd_line = 0; break;
1312         case  9 : mplp.n_threads = strtol(optarg, 0, 0); break;
1313         case 'd': mplp.max_depth = atoi(optarg); break;
1314         case 'r': mplp.reg_fname = strdup(optarg); break;
1315         case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break;
1316         case 't':
1317                   // In the original version the whole BAM was streamed which is inefficient
1318                   //  with few BED intervals and big BAMs. Todo: devise a heuristic to determine
1319                   //  best strategy, that is streaming or jumping.
1320                   if ( optarg[0]=='^' ) optarg++;
1321                   else mplp.bed_logic = 1;
1322                   mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL);
1323                   mplp.bed_itr = regitr_init(mplp.bed);
1324                   if ( regidx_insert_list(mplp.bed,optarg,',') !=0 )
1325                   {
1326                       fprintf(stderr,"Could not parse the targets: %s\n", optarg);
1327                       exit(EXIT_FAILURE);
1328                   }
1329                   break;
1330         case 'T':
1331                   if ( optarg[0]=='^' ) optarg++;
1332                   else mplp.bed_logic = 1;
1333                   mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL);
1334                   if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; }
1335                   break;
1336         case 'P': mplp.pl_list = strdup(optarg); break;
1337         case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
1338         case 'B': mplp.flag &= ~MPLP_REALN; break;
1339         case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break;
1340         case 'I': mplp.flag |= MPLP_NO_INDEL; break;
1341         case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
1342         case '6': mplp.flag |= MPLP_ILLUMINA13; break;
1343         case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
1344         case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
1345         case 'O':
1346             switch (optarg[0]) {
1347                 case 'b': mplp.output_type = FT_BCF_GZ; break;
1348                 case 'u': mplp.output_type = FT_BCF; break;
1349                 case 'z': mplp.output_type = FT_VCF_GZ; break;
1350                 case 'v': mplp.output_type = FT_VCF; break;
1351                 default:
1352                 {
1353                     char *tmp;
1354                     mplp.clevel = strtol(optarg,&tmp,10);
1355                     if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
1356                 }
1357             }
1358             if ( optarg[1] )
1359             {
1360                 char *tmp;
1361                 mplp.clevel = strtol(optarg+1,&tmp,10);
1362                 if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
1363             }
1364             break;
1365         case 'C': mplp.capQ_thres = atoi(optarg); break;
1366         case 'q': mplp.min_mq = atoi(optarg); break;
1367         case 'Q': mplp.min_baseQ = atoi(optarg); break;
1368         case  11: mplp.max_baseQ = atoi(optarg); break;
1369         case  12: mplp.delta_baseQ = atoi(optarg); break;
1370         case 'b': file_list = optarg; break;
1371         case 'o': {
1372                 char *end;
1373                 long value = strtol(optarg, &end, 10);
1374                 // Distinguish between -o INT and -o FILE (a bit of a hack!)
1375                 if (*end == '\0') mplp.openQ = value;
1376                 else mplp.output_fname = optarg;
1377             }
1378             break;
1379         case 'e': mplp.extQ = atoi(optarg); break;
1380         case 'h': mplp.tandemQ = atoi(optarg); break;
1381         case 10: // --indel-bias (inverted so higher => more indels called)
1382             if (atof(optarg) < 1e-2)
1383                 mplp.indel_bias = 1/1e2;
1384             else
1385                 mplp.indel_bias = 1/atof(optarg);
1386             break;
1387         case  15: {
1388                 char *tmp;
1389                 mplp.indel_win_size = strtol(optarg,&tmp,10);
1390                 if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg);
1391                 if ( mplp.indel_win_size < 110 )
1392                 {
1393                     mplp.indel_win_size = 110;
1394                     fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size);
1395                 }
1396             }
1397             break;
1398         case 'A': use_orphan = 1; break;
1399         case 'F': mplp.min_frac = atof(optarg); break;
1400         case 'm': mplp.min_support = atoi(optarg); break;
1401         case 'L': mplp.max_indel_depth = atoi(optarg); break;
1402         case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break;
1403         case 'a':
1404             if (optarg[0]=='?') {
1405                 list_annotations(stderr);
1406                 return 1;
1407             }
1408             mplp.fmt_flag |= parse_format_flag(optarg);
1409         break;
1410         case 'M': mplp.max_read_len = atoi(optarg); break;
1411         case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
1412         case 'X':
1413             if (strcasecmp(optarg, "pacbio-ccs") == 0) {
1414                 mplp.min_frac = 0.1;
1415                 mplp.min_baseQ = 5;
1416                 mplp.max_baseQ = 50;
1417                 mplp.delta_baseQ = 10;
1418                 mplp.openQ = 25;
1419                 mplp.extQ = 1;
1420                 mplp.flag |= MPLP_REALN_PARTIAL;
1421                 mplp.max_read_len = 99999;
1422             } else if (strcasecmp(optarg, "ont") == 0) {
1423                 fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with "
1424                         "a higher -P, eg -P0.01 or -P 0.1\n");
1425                 mplp.min_baseQ = 5;
1426                 mplp.max_baseQ = 30;
1427                 mplp.flag &= ~MPLP_REALN;
1428                 mplp.flag |= MPLP_NO_INDEL;
1429             } else if (strcasecmp(optarg, "1.12") == 0) {
1430                 // 1.12 and earlier
1431                 mplp.min_frac = 0.002;
1432                 mplp.min_support = 1;
1433                 mplp.min_baseQ = 13;
1434                 mplp.tandemQ = 100;
1435                 mplp.flag &= ~MPLP_REALN_PARTIAL;
1436                 mplp.flag |= MPLP_REALN;
1437             } else if (strcasecmp(optarg, "illumina") == 0) {
1438                 mplp.flag |= MPLP_REALN_PARTIAL;
1439             } else {
1440                 fprintf(stderr, "Unknown configuration name '%s'\n"
1441                         "Please choose from 1.12, illumina, pacbio-ccs or ont\n",
1442                         optarg);
1443                 return 1;
1444             }
1445             break;
1446         case 13: hts_srand48(atoi(optarg)); break;
1447         case 14:
1448             if ( !strcasecmp(optarg,"drop") ) mplp.ambig_reads = B2B_DROP;
1449             else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD;
1450             else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0;
1451             else error("The option to --ambig-reads not recognised: %s\n",optarg);
1452             break;
1453         default:
1454             fprintf(stderr,"Invalid option: '%c'\n", c);
1455             return 1;
1456         }
1457     }
1458 
1459     if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) )
1460     {
1461         fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n");
1462         mplp.fmt_flag |= B2B_FMT_DP;
1463     }
1464     if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) )
1465     {
1466         if ( mplp.flag&MPLP_VCF )
1467         {
1468             if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF;
1469             else mplp.output_type = FT_VCF_GZ;
1470         }
1471         else if ( mplp.flag&MPLP_BCF )
1472         {
1473             if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF;
1474             else mplp.output_type = FT_BCF_GZ;
1475         }
1476     }
1477     if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
1478     {
1479         fprintf(stderr,"Error: The -B option cannot be combined with -E\n");
1480         return 1;
1481     }
1482     if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
1483     if (argc == 1)
1484     {
1485         print_usage(stderr, &mplp);
1486         return 1;
1487     }
1488     if (!mplp.fai && !noref) {
1489         fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n");
1490         return 1;
1491     }
1492     int ret,i;
1493     if (file_list)
1494     {
1495         if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
1496         mplp.files  = fn;
1497         mplp.nfiles = nfiles;
1498     }
1499     else
1500     {
1501         mplp.nfiles = argc - optind;
1502         mplp.files  = (char**) malloc(mplp.nfiles*sizeof(char*));
1503         for (i=0; i<mplp.nfiles; i++) mplp.files[i] = strdup(argv[optind+i]);
1504     }
1505     ret = mpileup(&mplp);
1506 
1507     for (i=0; i<mplp.nfiles; i++) free(mplp.files[i]);
1508     free(mplp.files);
1509     free(mplp.reg_fname); free(mplp.pl_list);
1510     if (mplp.fai) fai_destroy(mplp.fai);
1511     if (mplp.bed) regidx_destroy(mplp.bed);
1512     if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
1513     if (mplp.reg) regidx_destroy(mplp.reg);
1514     bam_smpl_destroy(mplp.bsmpl);
1515 
1516     return ret;
1517 }
1518