1 /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
2
3 Copyright (C) 2008-2021 Genome Research Ltd.
4 Portions copyright (C) 2009-2012 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <math.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <unistd.h>
30 #include <ctype.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <limits.h>
34 #include <inttypes.h>
35 #include <errno.h>
36 #include <sys/stat.h>
37 #include <getopt.h>
38 #include <htslib/sam.h>
39 #include <htslib/faidx.h>
40 #include <htslib/kstring.h>
41 #include <htslib/khash_str2int.h>
42 #include <htslib/hts_os.h>
43 #include <assert.h>
44 #include "regidx.h"
45 #include "bcftools.h"
46 #include "bam2bcf.h"
47 #include "bam_sample.h"
48 #include "gvcf.h"
49
50 #define MPLP_BCF 1
51 #define MPLP_VCF (1<<1)
52 #define MPLP_NO_COMP (1<<2)
53 #define MPLP_NO_ORPHAN (1<<3)
54 #define MPLP_REALN (1<<4)
55 #define MPLP_NO_INDEL (1<<5)
56 #define MPLP_REDO_BAQ (1<<6)
57 #define MPLP_ILLUMINA13 (1<<7)
58 #define MPLP_IGNORE_RG (1<<8)
59 #define MPLP_PRINT_POS (1<<9)
60 #define MPLP_PRINT_MAPQ (1<<10)
61 #define MPLP_PER_SAMPLE (1<<11)
62 #define MPLP_SMART_OVERLAPS (1<<12)
63 #define MPLP_REALN_PARTIAL (1<<13)
64
65 typedef struct _mplp_aux_t mplp_aux_t;
66 typedef struct _mplp_pileup_t mplp_pileup_t;
67
68 // Data shared by all bam files
69 typedef struct {
70 int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
71 max_indel_depth, max_read_len, fmt_flag, ambig_reads;
72 int rflag_require, rflag_filter, output_type;
73 int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
74 double min_frac; // for indels
75 double indel_bias;
76 char *reg_fname, *pl_list, *fai_fname, *output_fname;
77 int reg_is_file, record_cmd_line, n_threads, clevel;
78 faidx_t *fai;
79 regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions
80 regitr_t *bed_itr, *reg_itr;
81 int bed_logic; // 1: include region, 0: exclude region
82 gvcf_t *gvcf;
83
84 // auxiliary structures for calling
85 bcf_callaux_t *bca;
86 bcf_callret1_t *bcr;
87 bcf_call_t bc;
88 bam_mplp_t iter;
89 mplp_aux_t **mplp_data;
90 int nfiles;
91 char **files;
92 mplp_pileup_t *gplp;
93 int *n_plp;
94 const bam_pileup1_t **plp;
95 bam_smpl_t *bsmpl;
96 kstring_t buf;
97 bcf1_t *bcf_rec;
98 htsFile *bcf_fp;
99 bcf_hdr_t *bcf_hdr;
100 int argc;
101 char **argv;
102 } mplp_conf_t;
103
104 typedef struct {
105 char *ref[2];
106 int ref_id[2];
107 int ref_len[2];
108 } mplp_ref_t;
109
110 #define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}}
111
112 // Data specific to each bam file
113 struct _mplp_aux_t {
114 samFile *fp;
115 hts_itr_t *iter;
116 bam_hdr_t *h;
117 mplp_ref_t *ref;
118 const mplp_conf_t *conf;
119 int bam_id;
120 hts_idx_t *idx; // maintained only with more than one -r regions
121 };
122
123 // Data passed to htslib/mpileup
124 struct _mplp_pileup_t {
125 int n;
126 int *n_plp, *m_plp;
127 bam_pileup1_t **plp;
128 };
129
mplp_get_ref(mplp_aux_t * ma,int tid,char ** ref,int * ref_len)130 static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
131 mplp_ref_t *r = ma->ref;
132
133 //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);
134
135 if (!r || !ma->conf->fai) {
136 *ref = NULL;
137 return 0;
138 }
139
140 // Do we need to reference count this so multiple mplp_aux_t can
141 // track which references are in use?
142 // For now we just cache the last two. Sufficient?
143 if (tid == r->ref_id[0]) {
144 *ref = r->ref[0];
145 *ref_len = r->ref_len[0];
146 return 1;
147 }
148 if (tid == r->ref_id[1]) {
149 // Last, swap over
150 int tmp;
151 tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp;
152 tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
153
154 char *tc;
155 tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
156 *ref = r->ref[0];
157 *ref_len = r->ref_len[0];
158 return 1;
159 }
160
161 // New, so migrate to old and load new
162 free(r->ref[1]);
163 r->ref[1] = r->ref[0];
164 r->ref_id[1] = r->ref_id[0];
165 r->ref_len[1] = r->ref_len[0];
166
167 r->ref_id[0] = tid;
168 r->ref[0] = faidx_fetch_seq(ma->conf->fai,
169 ma->h->target_name[r->ref_id[0]],
170 0,
171 INT_MAX,
172 &r->ref_len[0]);
173
174 if (!r->ref[0]) {
175 r->ref[0] = NULL;
176 r->ref_id[0] = -1;
177 r->ref_len[0] = 0;
178 *ref = NULL;
179 return 0;
180 }
181
182 *ref = r->ref[0];
183 *ref_len = r->ref_len[0];
184 return 1;
185 }
186
mplp_func(void * data,bam1_t * b)187 static int mplp_func(void *data, bam1_t *b)
188 {
189 char *ref;
190 mplp_aux_t *ma = (mplp_aux_t*)data;
191 int ret, ref_len;
192 while (1)
193 {
194 int has_ref;
195 ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
196 if (ret < 0) break;
197 // The 'B' cigar operation is not part of the specification, considering as obsolete.
198 // bam_remove_B(b);
199 if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads
200 if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue;
201 if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue;
202 if (ma->conf->bed)
203 {
204 // test overlap
205 regitr_t *itr = ma->conf->bed_itr;
206 int beg = b->core.pos, end = bam_endpos(b)-1;
207 int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr);
208 if ( !ma->conf->bed_logic && !overlap )
209 {
210 // exclude only reads which are fully contained in the region
211 while ( regitr_overlap(itr) )
212 {
213 if ( beg < itr->beg ) { overlap = 1; break; }
214 if ( end > itr->end ) { overlap = 1; break; }
215 }
216 }
217 if ( !overlap ) continue;
218 }
219 if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue;
220 if (ma->conf->flag & MPLP_ILLUMINA13) {
221 int i;
222 uint8_t *qual = bam_get_qual(b);
223 for (i = 0; i < b->core.l_qseq; ++i)
224 qual[i] = qual[i] > 31? qual[i] - 31 : 0;
225 }
226
227 if (ma->conf->fai && b->core.tid >= 0) {
228 has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
229 if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
230 fprintf(stderr,"[%s] Skipping because %"PRId64" is outside of %d [ref:%d]\n",
231 __func__, (int64_t) b->core.pos, ref_len, b->core.tid);
232 continue;
233 }
234 } else {
235 has_ref = 0;
236 }
237
238 // Allow sufficient room for bam_aux_append of ZQ tag without
239 // a realloc and consequent breakage of pileup's cached pointers.
240 if (has_ref && (ma->conf->flag &MPLP_REALN) && !bam_aux_get(b, "ZQ")) {
241 // Doing sam_prob_realn later is problematic as it adds to
242 // the tag list (ZQ or BQ), which causes a realloc of b->data.
243 // This happens after pileup has built a hash table on the
244 // read name. It's a deficiency in pileup IMO.
245
246 // We could implement a new sam_prob_realn that returns ZQ
247 // somewhere else and cache it ourselves (pileup clientdata),
248 // but for now we simply use a workaround.
249 //
250 // We create a fake tag of the correct length, which we remove
251 // just prior calling sam_prob_realn so we can guarantee there is
252 // room. (We can't just make room now as bam_copy1 removes it
253 // again).
254 if (b->core.l_qseq > 500) {
255 uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1);
256 memset(ZQ, '@', b->core.l_qseq);
257 ZQ[b->core.l_qseq] = 0;
258 bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
259 free(ZQ);
260 } else {
261 static uint8_t ZQ[501] =
262 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
263 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
264 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
265 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
266 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
267 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
268 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
269 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
270 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
271 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
272 ZQ[b->core.l_qseq] = 0;
273 bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
274 ZQ[b->core.l_qseq] = '@';
275 }
276 }
277
278 if (has_ref && ma->conf->capQ_thres > 10) {
279 int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
280 if (q < 0) continue; // skip
281 else if (b->core.qual > q) b->core.qual = q;
282 }
283 if (b->core.qual < ma->conf->min_mq) continue;
284 else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue;
285
286 return ret;
287 };
288 return ret;
289 }
290
291 // Called once per new bam added to the pileup.
292 // We cache sample information here so we don't have to keep recomputing this
293 // on each and every pileup column. If FMT/SCR annotation is requested, a flag
294 // is set to indicate the presence of a soft clip.
295 //
296 // Cd is an arbitrary block of data we can write into, which ends up in
297 // the pileup structures. We stash the sample ID there:
298 // has_soft_clip .. cd->i & 1
299 // sample_id .. cd->i >> 1
pileup_constructor(void * data,const bam1_t * b,bam_pileup_cd * cd)300 static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
301 {
302 mplp_aux_t *ma = (mplp_aux_t *)data;
303 int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
304 cd->i = 0;
305 PLP_SET_SAMPLE_ID(cd->i, n);
306 // Whether read has a soft-clip is used in mplp_realn's heuristics.
307 // TODO: consider whether clip length is beneficial to use?
308 int i;
309 for (i=0; i<b->core.n_cigar; i++) {
310 int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
311 if (cig == BAM_CSOFT_CLIP) {
312 PLP_SET_SOFT_CLIP(cd->i);
313 break;
314 }
315 }
316
317 if (ma->conf->flag & MPLP_REALN) {
318 int i;
319 // int tot_ins = 0;
320 // int p = 0;
321 uint32_t *cigar = bam_get_cigar(b);
322 for (i=0; i<b->core.n_cigar; i++) {
323 int cig = cigar[i] & BAM_CIGAR_MASK;
324 // if (bam_cigar_type(cig) & 2)
325 // p += cigar[i] >> BAM_CIGAR_SHIFT;
326 if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) {
327 // tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
328 // Possible further optimsation, check tot_ins==1 later
329 // (and remove break) so we can detect single bp indels.
330 // We may want to focus BAQ on more complex regions only.
331 PLP_SET_INDEL(cd->i);
332 break;
333 }
334
335 // TODO: proper p->cd struct and have cd->i as a size rather
336 // than a flag.
337
338 // Then aggregate together the sizes and if just 1 size for all
339 // reads or 2 sizes for approx 50/50 split in all reads, then
340 // treat this as a well-aligned variant and don't run BAQ.
341 }
342 }
343
344 return 0;
345 }
346
group_smpl(mplp_pileup_t * m,bam_smpl_t * bsmpl,int n,int * n_plp,const bam_pileup1_t ** plp)347 static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp)
348 {
349 int i, j;
350 memset(m->n_plp, 0, m->n * sizeof(int));
351 for (i = 0; i < n; ++i) // iterate over all bams
352 {
353 for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position
354 {
355 const bam_pileup1_t *p = plp[i] + j;
356 int id = PLP_SAMPLE_ID(p->cd.i);
357 if (m->n_plp[id] == m->m_plp[id])
358 {
359 m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
360 m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
361 }
362 m->plp[id][m->n_plp[id]++] = *p;
363 }
364 }
365 }
366
flush_bcf_records(mplp_conf_t * conf,htsFile * fp,bcf_hdr_t * hdr,bcf1_t * rec)367 static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec)
368 {
369 if ( !conf->gvcf )
370 {
371 if ( rec && bcf_write1(fp, hdr, rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output");
372 return;
373 }
374
375 if ( !rec )
376 {
377 gvcf_write(conf->gvcf, fp, hdr, NULL, 0);
378 return;
379 }
380
381 int is_ref = 0;
382 if ( rec->n_allele==1 ) is_ref = 1;
383 else if ( rec->n_allele==2 )
384 {
385 // second allele is mpileup's X, not a variant
386 if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1;
387 }
388 rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref);
389 if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output");
390 }
391
392 /*
393 * Loops for an indel at this position.
394 *
395 * Only reads that overlap an indel loci get realigned. This considerably
396 * reduces the cost of running BAQ while keeping the main benefits.
397 *
398 * TODO: also consider only realigning reads that don't span the indel
399 * by more than a certain amount either-side. Ie focus BAQ only on reads
400 * ending adjacent to the indel, where the alignment is most likely to
401 * be wrong. (2nd TODO: do this based on sequence context; STRs bad, unique
402 * data good.)
403 *
404 * NB: this may sadly realign after we've already used the data. Hmm...
405 */
mplp_realn(int n,int * n_plp,const bam_pileup1_t ** plp,int flag,int max_read_len,char * ref,int ref_len,int pos)406 static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
407 int flag, int max_read_len,
408 char *ref, int ref_len, int pos) {
409 int i, j, has_indel = 0, has_clip = 0, nt = 0;
410 int min_indel = INT_MAX, max_indel = INT_MIN;
411
412 // Is an indel present.
413 // NB: don't bother even checking if very long as almost guaranteed
414 // to have indel (and likely soft-clips too).
415 for (i = 0; i < n; i++) { // iterate over bams
416 nt += n_plp[i];
417 for (j = 0; j < n_plp[i]; j++) { // iterate over reads
418 bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
419 has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0;
420 // Has_clip is almost always true for very long reads
421 // (eg PacBio CCS), but these rarely matter as the clip
422 // is likely a long way from this indel.
423 has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0;
424 if (max_indel < p->indel)
425 max_indel = p->indel;
426 if (min_indel > p->indel)
427 min_indel = p->indel;
428 }
429 }
430
431 if (flag & MPLP_REALN_PARTIAL) {
432 if (has_indel == 0 ||
433 (has_clip < 0.2*nt && max_indel == min_indel &&
434 (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1)))
435 return;
436 }
437
438 // Realign
439 for (i = 0; i < n; i++) { // iterate over bams
440 for (j = 0; j < n_plp[i]; j++) { // iterate over reads
441 const bam_pileup1_t *p = plp[i] + j;
442 bam1_t *b = p->b;
443
444 // Avoid doing multiple times.
445 //
446 // Note we cannot modify p->cd.i here with a PLP_SET macro
447 // because the cd item is held by mpileup in an lbnode_t
448 // struct and copied over to the pileup struct for each
449 // iteration, essentially making p->cd.i read only.
450 //
451 // We could use our own structure (p->cd.p), allocated during
452 // the constructor, but for simplicity we play dirty and
453 // abuse an unused flag bit instead.
454 if (b->core.flag & 32768)
455 continue;
456 b->core.flag |= 32768;
457
458 if (b->core.l_qseq > max_read_len)
459 continue;
460
461 // Check p->cigar_ind and see what cigar elements are before
462 // and after. How close is this location to the end of the
463 // read? Only realign if we don't span by more than X bases.
464 //
465 // Again, best only done on deeper data as BAQ helps
466 // disproportionately more on shallow data sets.
467 //
468 // This rescues some of the false negatives that are caused by
469 // systematic reduction in quality due to sample vs ref alignment.
470
471 // At deep coverage we skip realigning more reads as we have sufficient depth.
472 // This rescues for false negatives. At shallow depth we pay for this with
473 // more FP so are more stringent on spanning size.
474 #define REALN_DIST (40+10*(nt<40)+10*(nt<20))
475 uint32_t *cig = bam_get_cigar(b);
476 int ncig = b->core.n_cigar;
477
478 // Don't realign reads where indel is in middle?
479 // On long read data we don't care about soft-clips at the ends.
480 // For short read data, we always calc BAQ on these as they're
481 // a common source of false positives.
482 if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
483 // Left & right cigar op match.
484 int lr = b->core.l_qseq > 500;
485 int lm = 0, rm = 0, k;
486 for (k = 0; k < ncig; k++) {
487 int cop = bam_cigar_op(cig[k]);
488 if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
489 continue;
490
491 if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
492 cop == BAM_CEQUAL)
493 lm += bam_cigar_oplen(cig[k]);
494 else
495 break;
496 }
497
498 for (k = ncig-1; k >= 0; k--) {
499 int cop = bam_cigar_op(cig[k]);
500 if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
501 continue;
502
503 if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
504 cop == BAM_CEQUAL)
505 rm += bam_cigar_oplen(cig[k]);
506 else
507 break;
508 }
509
510 if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
511 continue;
512
513 if (lm >= REALN_DIST && rm >= REALN_DIST &&
514 has_clip < (0.15+0.05*(nt>20))*nt)
515 continue;
516 }
517
518 if (b->core.l_qseq > 500) {
519 // don't do BAQ on long-read data if it's going to
520 // cause us to have a large band-with and costly in CPU
521 int rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
522 if (abs(rl - b->core.l_qseq) * b->core.l_qseq >= 500000)
523 continue;
524 }
525
526 // Fudge: make room for ZQ tag.
527 uint8_t *_Q = bam_aux_get(b, "_Q");
528 if (_Q) bam_aux_del(b, _Q);
529 sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3);
530 }
531 }
532
533 return;
534 }
535
mpileup_reg(mplp_conf_t * conf,uint32_t beg,uint32_t end)536 static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
537 {
538 bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
539
540 int ret, i, tid, pos, ref_len;
541 char *ref;
542
543 while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
544 {
545 if ( pos<beg || pos>end ) continue;
546 if ( conf->bed && tid >= 0 )
547 {
548 int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL);
549 if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
550 if ( !overlap ) continue;
551 }
552 int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
553 if (has_ref && (conf->flag & MPLP_REALN))
554 mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag,
555 conf->max_read_len, ref, ref_len, pos);
556
557 int total_depth, _ref0, ref16;
558 for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
559 group_smpl(conf->gplp, conf->bsmpl, conf->nfiles, conf->n_plp, conf->plp);
560 _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
561 ref16 = seq_nt16_table[_ref0];
562 bcf_callaux_clean(conf->bca, &conf->bc);
563 for (i = 0; i < conf->gplp->n; ++i)
564 bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], ref16, conf->bca, conf->bcr + i);
565 conf->bc.tid = tid; conf->bc.pos = pos;
566 bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
567 bcf_clear1(conf->bcf_rec);
568 bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
569 conf->bca, 0);
570 flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
571
572 // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
573 // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
574 if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
575 && (bcf_callaux_clean(conf->bca, &conf->bc),
576 bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0))
577 {
578 for (i = 0; i < conf->gplp->n; ++i)
579 bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
580 if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
581 {
582 bcf_clear1(conf->bcf_rec);
583 bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
584 flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
585 }
586 }
587 }
588 return 0;
589 }
590
mpileup(mplp_conf_t * conf)591 static int mpileup(mplp_conf_t *conf)
592 {
593 if (conf->nfiles == 0) {
594 fprintf(stderr,"[%s] no input file/data given\n", __func__);
595 exit(EXIT_FAILURE);
596 }
597
598 mplp_ref_t mp_ref = MPLP_REF_INIT;
599 conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
600 conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
601 conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
602 conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));
603
604 // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
605 // must be kept in the memory for the whole time which can be a problem with many bams.
606 // Therefore if none or only one region is requested, we initialize the bam iterator as
607 // before and free the index. Only when multiple regions are queried, we keep the index.
608 int nregs = 0;
609 if ( conf->reg_fname )
610 {
611 if ( conf->reg_is_file )
612 {
613 conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
614 if ( !conf->reg ) {
615 fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
616 exit(EXIT_FAILURE);
617 }
618 }
619 else
620 {
621 conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
622 if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
623 fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
624 exit(EXIT_FAILURE);
625 }
626 }
627 nregs = regidx_nregs(conf->reg);
628 conf->reg_itr = regitr_init(conf->reg);
629 regitr_loop(conf->reg_itr); // region iterator now positioned at the first region
630 }
631
632 // read the header of each file in the list and initialize data
633 // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
634 bam_hdr_t *hdr = NULL; // header of first file in input list
635 int i;
636 for (i = 0; i < conf->nfiles; ++i) {
637 bam_hdr_t *h_tmp;
638 conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
639 conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
640 if ( !conf->mplp_data[i]->fp )
641 {
642 fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
643 exit(EXIT_FAILURE);
644 }
645 if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
646 fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
647 exit(EXIT_FAILURE);
648 }
649 if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
650 fprintf(stderr, "[%s] failed to process %s: %s\n",
651 __func__, conf->fai_fname, strerror(errno));
652 exit(EXIT_FAILURE);
653 }
654 conf->mplp_data[i]->conf = conf;
655 conf->mplp_data[i]->ref = &mp_ref;
656 h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
657 if ( !h_tmp ) {
658 fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
659 exit(EXIT_FAILURE);
660 }
661 conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
662 conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
663 if ( conf->mplp_data[i]->bam_id<0 )
664 {
665 // no usable readgroups in this bam, it can be skipped
666 sam_close(conf->mplp_data[i]->fp);
667 free(conf->mplp_data[i]);
668 bam_hdr_destroy(h_tmp);
669 free(conf->files[i]);
670 if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
671 conf->nfiles--;
672 i--;
673 continue;
674 }
675 if (conf->reg) {
676 hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
677 if (idx == NULL) {
678 fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
679 exit(EXIT_FAILURE);
680 }
681 conf->buf.l = 0;
682 ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
683 conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
684 if ( !conf->mplp_data[i]->iter )
685 {
686 conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
687 if ( conf->mplp_data[i]->iter ) {
688 fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
689 exit(EXIT_FAILURE);
690 }
691 fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
692 exit(EXIT_FAILURE);
693 }
694 if ( nregs==1 ) // no need to keep the index in memory
695 hts_idx_destroy(idx);
696 else
697 conf->mplp_data[i]->idx = idx;
698 }
699
700 if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
701 else {
702 // FIXME: check consistency between h and h_tmp
703 bam_hdr_destroy(h_tmp);
704
705 // we store only the first file's header; it's (alleged to be)
706 // compatible with the i-th file's target_name lookup needs
707 conf->mplp_data[i]->h = hdr;
708 }
709 }
710 if ( !hdr ) {
711 fprintf(stderr, "[%s] failed to find a file header with usable read groups\n", __func__);
712 exit(EXIT_FAILURE);
713 }
714 // allocate data storage proportionate to number of samples being studied sm->n
715 bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
716 conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
717 conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
718 conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
719
720 fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
721 // write the VCF header
722 char wmode[8];
723 set_wmode(wmode,conf->output_type,conf->output_fname,conf->clevel);
724 conf->bcf_fp = hts_open(conf->output_fname ? conf->output_fname : "-", wmode);
725 if (conf->bcf_fp == NULL) {
726 fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
727 exit(EXIT_FAILURE);
728 }
729 if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);
730
731 // BCF header creation
732 conf->bcf_hdr = bcf_hdr_init("w");
733 conf->buf.l = 0;
734
735 if (conf->record_cmd_line)
736 {
737 ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
738 bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
739
740 conf->buf.l = 0;
741 ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
742 for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
743 kputc('\n', &conf->buf);
744 bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
745 }
746
747 if (conf->fai_fname)
748 {
749 conf->buf.l = 0;
750 ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
751 bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
752 }
753
754 // Translate BAM @SQ tags to BCF ##contig tags
755 // todo: use/write new BAM header manipulation routines, fill also UR, M5
756 for (i=0; i<hdr->n_targets; i++)
757 {
758 conf->buf.l = 0;
759 ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
760 bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
761 }
762 conf->buf.l = 0;
763
764 bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
765 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
766 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of raw reads supporting an indel\">");
767 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of raw reads supporting an indel\">");
768 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
769 if ( conf->fmt_flag&B2B_INFO_VDB )
770 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
771
772 if (conf->fmt_flag & B2B_INFO_ZSCORE) {
773 if ( conf->fmt_flag&B2B_INFO_RPB )
774 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
775 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
776 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
777 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
778 if ( conf->fmt_flag&B2B_INFO_SCB )
779 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
780 } else {
781 if ( conf->fmt_flag&B2B_INFO_RPB )
782 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
783 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
784 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
785 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
786 }
787
788 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
789 #if CDF_MWU_TESTS
790 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
791 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
792 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
793 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
794 #endif
795 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
796 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
797 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
798 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
799 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
800 if ( conf->fmt_flag&B2B_FMT_DP )
801 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
802 if ( conf->fmt_flag&B2B_FMT_DV )
803 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
804 if ( conf->fmt_flag&B2B_FMT_DPR )
805 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
806 if ( conf->fmt_flag&B2B_INFO_DPR )
807 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
808 if ( conf->fmt_flag&B2B_FMT_DP4 )
809 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
810 if ( conf->fmt_flag&B2B_FMT_SP )
811 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
812 if ( conf->fmt_flag&B2B_FMT_AD )
813 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths (high-quality bases)\">");
814 if ( conf->fmt_flag&B2B_FMT_ADF )
815 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand (high-quality bases)\">");
816 if ( conf->fmt_flag&B2B_FMT_ADR )
817 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand (high-quality bases)\">");
818 if ( conf->fmt_flag&B2B_FMT_QS )
819 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=QS,Number=R,Type=Integer,Description=\"Phred-score allele quality sum used by `call -mG` and `+trio-dnm`\">");
820 if ( conf->fmt_flag&B2B_INFO_AD )
821 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths (high-quality bases)\">");
822 if ( conf->fmt_flag&B2B_INFO_ADF )
823 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand (high-quality bases)\">");
824 if ( conf->fmt_flag&B2B_INFO_SCR )
825 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCR,Number=1,Type=Integer,Description=\"Number of soft-clipped reads (at high-quality bases)\">");
826 if ( conf->fmt_flag&B2B_FMT_SCR )
827 bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SCR,Number=1,Type=Integer,Description=\"Per-sample number of soft-clipped reads (at high-quality bases)\">");
828 if ( conf->fmt_flag&B2B_INFO_ADR )
829 bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand (high-quality bases)\">");
830 if ( conf->gvcf )
831 gvcf_update_header(conf->gvcf, conf->bcf_hdr);
832
833 int nsmpl;
834 const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
835 for (i=0; i<nsmpl; i++)
836 bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
837 if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
838
839 conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
840 conf->delta_baseQ);
841 conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
842 conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
843 conf->bca->indel_bias = conf->indel_bias;
844 conf->bca->min_frac = conf->min_frac;
845 conf->bca->min_support = conf->min_support;
846 conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
847 conf->bca->fmt_flag = conf->fmt_flag;
848 conf->bca->ambig_reads = conf->ambig_reads;
849 conf->bca->indel_win_size = conf->indel_win_size;
850
851 conf->bc.bcf_hdr = conf->bcf_hdr;
852 conf->bc.n = nsmpl;
853 conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
854 conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES);
855 for (i=0; i<nsmpl; i++)
856 conf->bcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES;
857 if (conf->fmt_flag)
858 {
859 assert( sizeof(float)==sizeof(int32_t) );
860 conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
861 conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
862 if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
863 {
864 // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
865 conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
866 conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
867 for (i=0; i<nsmpl; i++)
868 {
869 conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
870 conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
871 }
872 }
873 if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) )
874 conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR));
875 }
876
877 // init mpileup
878 conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
879 if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
880 fprintf(stderr, "[%s] maximum number of reads per input file set to -d %d\n", __func__, conf->max_depth);
881 if ( (double)conf->max_depth * conf->nfiles > 1<<20)
882 fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
883 if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
884 fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
885 bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
886 conf->max_indel_depth = conf->max_indel_depth * nsmpl;
887 conf->bcf_rec = bcf_init1();
888 bam_mplp_constructor(conf->iter, pileup_constructor);
889
890 // Run mpileup for multiple regions
891 if ( nregs )
892 {
893 int ireg = 0;
894 do
895 {
896 // first region is already positioned
897 if ( ireg++ > 0 )
898 {
899 conf->buf.l = 0;
900 ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
901
902 for (i=0; i<conf->nfiles; i++)
903 {
904 hts_itr_destroy(conf->mplp_data[i]->iter);
905 conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
906 if ( !conf->mplp_data[i]->iter )
907 {
908 conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
909 if ( conf->mplp_data[i]->iter ) {
910 fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
911 exit(EXIT_FAILURE);
912 }
913 fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
914 exit(EXIT_FAILURE);
915 }
916 bam_mplp_reset(conf->iter);
917 }
918 }
919 mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
920 }
921 while ( regitr_loop(conf->reg_itr) );
922 }
923 else
924 mpileup_reg(conf,0,UINT32_MAX);
925
926 flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);
927
928 // clean up
929 free(conf->bc.tmp.s);
930 bcf_destroy1(conf->bcf_rec);
931 if (conf->bcf_fp)
932 {
933 if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
934 bcf_hdr_destroy(conf->bcf_hdr);
935 bcf_call_destroy(conf->bca);
936 free(conf->bc.PL);
937 free(conf->bc.DP4);
938 free(conf->bc.ADR);
939 free(conf->bc.ADF);
940 free(conf->bc.SCR);
941 free(conf->bc.QS);
942 free(conf->bc.fmt_arr);
943 free(conf->bcr);
944 }
945 if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
946 free(conf->buf.s);
947 for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
948 free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp);
949 bam_mplp_destroy(conf->iter);
950 bam_hdr_destroy(hdr);
951 for (i = 0; i < conf->nfiles; ++i) {
952 if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
953 sam_close(conf->mplp_data[i]->fp);
954 if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
955 free(conf->mplp_data[i]);
956 }
957 if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
958 free(conf->mplp_data); free(conf->plp); free(conf->n_plp);
959 free(mp_ref.ref[0]);
960 free(mp_ref.ref[1]);
961 return 0;
962 }
963
is_url(const char * s)964 static int is_url(const char *s)
965 {
966 static const char uri_scheme_chars[] =
967 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
968 return s[strspn(s, uri_scheme_chars)] == ':';
969 }
970
971 #define MAX_PATH_LEN 1024
read_file_list(const char * file_list,int * n,char ** argv[])972 int read_file_list(const char *file_list,int *n,char **argv[])
973 {
974 char buf[MAX_PATH_LEN];
975 int len, nfiles = 0;
976 char **files = NULL;
977 struct stat sb;
978
979 *n = 0;
980 *argv = NULL;
981
982 FILE *fh = fopen(file_list,"r");
983 if ( !fh )
984 {
985 fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
986 return 1;
987 }
988
989 files = (char**) calloc(nfiles,sizeof(char*));
990 nfiles = 0;
991 while ( fgets(buf,MAX_PATH_LEN,fh) )
992 {
993 // allow empty lines and trailing spaces
994 len = strlen(buf);
995 while ( len>0 && isspace(buf[len-1]) ) len--;
996 if ( !len ) continue;
997
998 // check sanity of the file list
999 buf[len] = 0;
1000 if (! (is_url(buf) || stat(buf, &sb) == 0))
1001 {
1002 // no such file, check if it is safe to print its name
1003 int i, safe_to_print = 1;
1004 for (i=0; i<len; i++)
1005 if (!isprint(buf[i])) { safe_to_print = 0; break; }
1006 if ( safe_to_print )
1007 fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
1008 else
1009 fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
1010 return 1;
1011 }
1012
1013 nfiles++;
1014 files = (char**) realloc(files,nfiles*sizeof(char*));
1015 files[nfiles-1] = strdup(buf);
1016 }
1017 if ( fclose(fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,file_list);
1018 if ( !nfiles )
1019 {
1020 fprintf(stderr,"No files read from %s\n", file_list);
1021 return 1;
1022 }
1023 *argv = files;
1024 *n = nfiles;
1025 return 0;
1026 }
1027 #undef MAX_PATH_LEN
1028
parse_format_flag(const char * str)1029 int parse_format_flag(const char *str)
1030 {
1031 int i, flag = 0, n_tags;
1032 char **tags = hts_readlist(str, 0, &n_tags);
1033 for(i=0; i<n_tags; i++)
1034 {
1035 if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
1036 else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
1037 else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
1038 else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
1039 else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
1040 else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
1041 else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
1042 else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
1043 else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
1044 else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
1045 else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
1046 else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
1047 else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
1048 else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
1049 else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
1050 else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
1051 else
1052 {
1053 fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
1054 exit(EXIT_FAILURE);
1055 }
1056 free(tags[i]);
1057 }
1058 if (n_tags) free(tags);
1059 return flag;
1060 }
1061
1062 // todo: make it possible to turn off some annotations or change the defaults,
1063 // specifically RPB, VDB, MWU, SGB tests. It would be good to do some
1064 // benchmarking first to see if it's worth it.
list_annotations(FILE * fp)1065 static void list_annotations(FILE *fp)
1066 {
1067 fprintf(fp,
1068 "\n"
1069 "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
1070 "\n"
1071 " FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
1072 " FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
1073 " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
1074 " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
1075 " FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
1076 " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
1077 " FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
1078 "\n"
1079 "INFO annotation tags available:\n"
1080 "\n"
1081 " INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
1082 " INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
1083 " INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
1084 " INFO/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
1085 "\n");
1086 }
1087
print_usage(FILE * fp,const mplp_conf_t * mplp)1088 static void print_usage(FILE *fp, const mplp_conf_t *mplp)
1089 {
1090 char *tmp_require = bam_flag2str(mplp->rflag_require);
1091 char *tmp_filter = bam_flag2str(mplp->rflag_filter);
1092
1093 // Display usage information, formatted for the standard 80 columns.
1094 // (The unusual string formatting here aids the readability of this
1095 // source code in 80 columns, to the extent that's possible.)
1096
1097 fprintf(fp,
1098 "\n"
1099 "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
1100 "\n"
1101 "Input options:\n"
1102 " -6, --illumina1.3+ Quality is in the Illumina-1.3+ encoding\n"
1103 " -A, --count-orphans Do not discard anomalous read pairs\n"
1104 " -b, --bam-list FILE List of input BAM filenames, one per line\n"
1105 " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n"
1106 " -C, --adjust-MQ INT Adjust mapping quality [0]\n"
1107 " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n"
1108 " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
1109 fprintf(fp,
1110 " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n"
1111 " -f, --fasta-ref FILE Faidx indexed reference sequence file\n"
1112 " --no-reference Do not require fasta reference file\n"
1113 " -G, --read-groups FILE Select or exclude read groups listed in the file\n"
1114 " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
1115 fprintf(fp,
1116 " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
1117 fprintf(fp,
1118 " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
1119 fprintf(fp,
1120 " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ);
1121 fprintf(fp,
1122 " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n"
1123 " -R, --regions-file FILE Restrict to regions listed in a file\n"
1124 " --ignore-RG Ignore RG tags (one BAM = one sample)\n"
1125 " --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset [%s]\n", tmp_require);
1126 fprintf(fp,
1127 " --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set\n"
1128 " [%s]\n", tmp_filter);
1129 fprintf(fp,
1130 " -s, --samples LIST Comma separated list of samples to include\n"
1131 " -S, --samples-file FILE File of samples to include\n"
1132 " -t, --targets REG[,...] Similar to -r but streams rather than index-jumps\n"
1133 " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
1134 " -x, --ignore-overlaps Disable read-pair overlap detection\n"
1135 " --seed INT Random number seed used for sampling deep regions [0]\n"
1136 "\n"
1137 "Output options:\n"
1138 " -a, --annotate LIST Optional tags to output; '?' to list available tags []\n"
1139 " -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n"
1140 " To minimum per-sample DP\n"
1141 " --no-version Do not append version and command line to the header\n"
1142 " -o, --output FILE Write output to FILE [standard output]\n"
1143 " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
1144 " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
1145 " -U, --mwu-u Use older probability scale for Mann-Whitney U test\n"
1146 " --threads INT Use multithreading with INT worker threads [0]\n"
1147 "\n"
1148 "SNP/INDEL genotype likelihoods options:\n"
1149 " -X, --config STR Specify platform specific profiles (see below)\n"
1150 " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
1151 fprintf(fp,
1152 " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac);
1153 fprintf(fp,
1154 " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
1155 fprintf(fp,
1156 " -I, --skip-indels Do not perform indel calling\n"
1157 " -L, --max-idepth INT Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
1158 fprintf(fp,
1159 " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
1160 fprintf(fp,
1161 " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
1162 fprintf(fp,
1163 " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
1164 fprintf(fp,
1165 " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n"
1166 " -P, --platforms STR Comma separated list of platforms for indels [all]\n"
1167 " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
1168 fprintf(fp,
1169 " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
1170 fprintf(fp,
1171 " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size);
1172 fprintf(fp,"\n");
1173 fprintf(fp,
1174 "Configuration profiles activated with -X, --config:\n"
1175 " 1.12: -Q13 -h100 -m1 -F0.002\n"
1176 " illumina: [ default values ]\n"
1177 " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
1178 " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
1179 "\n"
1180 "Notes: Assuming diploid individuals.\n"
1181 "\n"
1182 "Example:\n"
1183 " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
1184 " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
1185 "\n");
1186
1187 free(tmp_require);
1188 free(tmp_filter);
1189 }
1190
main_mpileup(int argc,char * argv[])1191 int main_mpileup(int argc, char *argv[])
1192 {
1193 int c;
1194 const char *file_list = NULL;
1195 char **fn = NULL;
1196 int nfiles = 0, use_orphan = 0, noref = 0;
1197 mplp_conf_t mplp;
1198 memset(&mplp, 0, sizeof(mplp_conf_t));
1199 mplp.min_baseQ = 1;
1200 mplp.max_baseQ = 60;
1201 mplp.delta_baseQ = 30;
1202 mplp.capQ_thres = 0;
1203 mplp.max_depth = 250; mplp.max_indel_depth = 250;
1204 mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
1205 mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2;
1206 mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL
1207 | MPLP_SMART_OVERLAPS;
1208 mplp.argc = argc; mplp.argv = argv;
1209 mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
1210 mplp.output_fname = NULL;
1211 mplp.output_type = FT_VCF;
1212 mplp.record_cmd_line = 1;
1213 mplp.n_threads = 0;
1214 mplp.bsmpl = bam_smpl_init();
1215 // the default to be changed in future, see also parse_format_flag()
1216 mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
1217 mplp.max_read_len = 500;
1218 mplp.ambig_reads = B2B_DROP;
1219 mplp.indel_win_size = 110;
1220 mplp.clevel = -1;
1221 hts_srand48(0);
1222
1223 static const struct option lopts[] =
1224 {
1225 {"rf", required_argument, NULL, 1}, // require flag
1226 {"ff", required_argument, NULL, 2}, // filter flag
1227 {"incl-flags", required_argument, NULL, 1},
1228 {"excl-flags", required_argument, NULL, 2},
1229 {"output", required_argument, NULL, 3},
1230 {"open-prob", required_argument, NULL, 4},
1231 {"ignore-RG", no_argument, NULL, 5},
1232 {"ignore-rg", no_argument, NULL, 5},
1233 {"gvcf", required_argument, NULL, 'g'},
1234 {"no-reference", no_argument, NULL, 7},
1235 {"no-version", no_argument, NULL, 8},
1236 {"threads",required_argument,NULL,9},
1237 {"illumina1.3+", no_argument, NULL, '6'},
1238 {"count-orphans", no_argument, NULL, 'A'},
1239 {"bam-list", required_argument, NULL, 'b'},
1240 {"no-BAQ", no_argument, NULL, 'B'},
1241 {"no-baq", no_argument, NULL, 'B'},
1242 {"full-BAQ", no_argument, NULL, 'D'},
1243 {"full-baq", no_argument, NULL, 'D'},
1244 {"adjust-MQ", required_argument, NULL, 'C'},
1245 {"adjust-mq", required_argument, NULL, 'C'},
1246 {"max-depth", required_argument, NULL, 'd'},
1247 {"redo-BAQ", no_argument, NULL, 'E'},
1248 {"redo-baq", no_argument, NULL, 'E'},
1249 {"fasta-ref", required_argument, NULL, 'f'},
1250 {"read-groups", required_argument, NULL, 'G'},
1251 {"region", required_argument, NULL, 'r'},
1252 {"regions", required_argument, NULL, 'r'},
1253 {"regions-file", required_argument, NULL, 'R'},
1254 {"targets", required_argument, NULL, 't'},
1255 {"targets-file", required_argument, NULL, 'T'},
1256 {"min-MQ", required_argument, NULL, 'q'},
1257 {"min-mq", required_argument, NULL, 'q'},
1258 {"min-BQ", required_argument, NULL, 'Q'},
1259 {"min-bq", required_argument, NULL, 'Q'},
1260 {"max-bq", required_argument, NULL, 11},
1261 {"max-BQ", required_argument, NULL, 11},
1262 {"delta-BQ", required_argument, NULL, 12},
1263 {"ignore-overlaps", no_argument, NULL, 'x'},
1264 {"output-type", required_argument, NULL, 'O'},
1265 {"samples", required_argument, NULL, 's'},
1266 {"samples-file", required_argument, NULL, 'S'},
1267 {"annotate", required_argument, NULL, 'a'},
1268 {"ext-prob", required_argument, NULL, 'e'},
1269 {"gap-frac", required_argument, NULL, 'F'},
1270 {"indel-bias", required_argument, NULL, 10},
1271 {"indel-size", required_argument, NULL, 15},
1272 {"tandem-qual", required_argument, NULL, 'h'},
1273 {"skip-indels", no_argument, NULL, 'I'},
1274 {"max-idepth", required_argument, NULL, 'L'},
1275 {"min-ireads", required_argument, NULL, 'm'},
1276 {"per-sample-mF", no_argument, NULL, 'p'},
1277 {"per-sample-mf", no_argument, NULL, 'p'},
1278 {"platforms", required_argument, NULL, 'P'},
1279 {"max-read-len", required_argument, NULL, 'M'},
1280 {"config", required_argument, NULL, 'X'},
1281 {"mwu-u", no_argument, NULL, 'U'},
1282 {"seed", required_argument, NULL, 13},
1283 {"ambig-reads", required_argument, NULL, 14},
1284 {"ar", required_argument, NULL, 14},
1285 {NULL, 0, NULL, 0}
1286 };
1287 while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
1288 switch (c) {
1289 case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
1290 case 1 :
1291 mplp.rflag_require = bam_str2flag(optarg);
1292 if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; }
1293 break;
1294 case 2 :
1295 mplp.rflag_filter = bam_str2flag(optarg);
1296 if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; }
1297 break;
1298 case 3 : mplp.output_fname = optarg; break;
1299 case 4 : mplp.openQ = atoi(optarg); break;
1300 case 5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break;
1301 case 'g':
1302 mplp.gvcf = gvcf_init(optarg);
1303 if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
1304 break;
1305 case 'f':
1306 mplp.fai = fai_load(optarg);
1307 if (mplp.fai == NULL) return 1;
1308 mplp.fai_fname = optarg;
1309 break;
1310 case 7 : noref = 1; break;
1311 case 8 : mplp.record_cmd_line = 0; break;
1312 case 9 : mplp.n_threads = strtol(optarg, 0, 0); break;
1313 case 'd': mplp.max_depth = atoi(optarg); break;
1314 case 'r': mplp.reg_fname = strdup(optarg); break;
1315 case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break;
1316 case 't':
1317 // In the original version the whole BAM was streamed which is inefficient
1318 // with few BED intervals and big BAMs. Todo: devise a heuristic to determine
1319 // best strategy, that is streaming or jumping.
1320 if ( optarg[0]=='^' ) optarg++;
1321 else mplp.bed_logic = 1;
1322 mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL);
1323 mplp.bed_itr = regitr_init(mplp.bed);
1324 if ( regidx_insert_list(mplp.bed,optarg,',') !=0 )
1325 {
1326 fprintf(stderr,"Could not parse the targets: %s\n", optarg);
1327 exit(EXIT_FAILURE);
1328 }
1329 break;
1330 case 'T':
1331 if ( optarg[0]=='^' ) optarg++;
1332 else mplp.bed_logic = 1;
1333 mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL);
1334 if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; }
1335 break;
1336 case 'P': mplp.pl_list = strdup(optarg); break;
1337 case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
1338 case 'B': mplp.flag &= ~MPLP_REALN; break;
1339 case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break;
1340 case 'I': mplp.flag |= MPLP_NO_INDEL; break;
1341 case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
1342 case '6': mplp.flag |= MPLP_ILLUMINA13; break;
1343 case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
1344 case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
1345 case 'O':
1346 switch (optarg[0]) {
1347 case 'b': mplp.output_type = FT_BCF_GZ; break;
1348 case 'u': mplp.output_type = FT_BCF; break;
1349 case 'z': mplp.output_type = FT_VCF_GZ; break;
1350 case 'v': mplp.output_type = FT_VCF; break;
1351 default:
1352 {
1353 char *tmp;
1354 mplp.clevel = strtol(optarg,&tmp,10);
1355 if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
1356 }
1357 }
1358 if ( optarg[1] )
1359 {
1360 char *tmp;
1361 mplp.clevel = strtol(optarg+1,&tmp,10);
1362 if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
1363 }
1364 break;
1365 case 'C': mplp.capQ_thres = atoi(optarg); break;
1366 case 'q': mplp.min_mq = atoi(optarg); break;
1367 case 'Q': mplp.min_baseQ = atoi(optarg); break;
1368 case 11: mplp.max_baseQ = atoi(optarg); break;
1369 case 12: mplp.delta_baseQ = atoi(optarg); break;
1370 case 'b': file_list = optarg; break;
1371 case 'o': {
1372 char *end;
1373 long value = strtol(optarg, &end, 10);
1374 // Distinguish between -o INT and -o FILE (a bit of a hack!)
1375 if (*end == '\0') mplp.openQ = value;
1376 else mplp.output_fname = optarg;
1377 }
1378 break;
1379 case 'e': mplp.extQ = atoi(optarg); break;
1380 case 'h': mplp.tandemQ = atoi(optarg); break;
1381 case 10: // --indel-bias (inverted so higher => more indels called)
1382 if (atof(optarg) < 1e-2)
1383 mplp.indel_bias = 1/1e2;
1384 else
1385 mplp.indel_bias = 1/atof(optarg);
1386 break;
1387 case 15: {
1388 char *tmp;
1389 mplp.indel_win_size = strtol(optarg,&tmp,10);
1390 if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg);
1391 if ( mplp.indel_win_size < 110 )
1392 {
1393 mplp.indel_win_size = 110;
1394 fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size);
1395 }
1396 }
1397 break;
1398 case 'A': use_orphan = 1; break;
1399 case 'F': mplp.min_frac = atof(optarg); break;
1400 case 'm': mplp.min_support = atoi(optarg); break;
1401 case 'L': mplp.max_indel_depth = atoi(optarg); break;
1402 case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break;
1403 case 'a':
1404 if (optarg[0]=='?') {
1405 list_annotations(stderr);
1406 return 1;
1407 }
1408 mplp.fmt_flag |= parse_format_flag(optarg);
1409 break;
1410 case 'M': mplp.max_read_len = atoi(optarg); break;
1411 case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
1412 case 'X':
1413 if (strcasecmp(optarg, "pacbio-ccs") == 0) {
1414 mplp.min_frac = 0.1;
1415 mplp.min_baseQ = 5;
1416 mplp.max_baseQ = 50;
1417 mplp.delta_baseQ = 10;
1418 mplp.openQ = 25;
1419 mplp.extQ = 1;
1420 mplp.flag |= MPLP_REALN_PARTIAL;
1421 mplp.max_read_len = 99999;
1422 } else if (strcasecmp(optarg, "ont") == 0) {
1423 fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with "
1424 "a higher -P, eg -P0.01 or -P 0.1\n");
1425 mplp.min_baseQ = 5;
1426 mplp.max_baseQ = 30;
1427 mplp.flag &= ~MPLP_REALN;
1428 mplp.flag |= MPLP_NO_INDEL;
1429 } else if (strcasecmp(optarg, "1.12") == 0) {
1430 // 1.12 and earlier
1431 mplp.min_frac = 0.002;
1432 mplp.min_support = 1;
1433 mplp.min_baseQ = 13;
1434 mplp.tandemQ = 100;
1435 mplp.flag &= ~MPLP_REALN_PARTIAL;
1436 mplp.flag |= MPLP_REALN;
1437 } else if (strcasecmp(optarg, "illumina") == 0) {
1438 mplp.flag |= MPLP_REALN_PARTIAL;
1439 } else {
1440 fprintf(stderr, "Unknown configuration name '%s'\n"
1441 "Please choose from 1.12, illumina, pacbio-ccs or ont\n",
1442 optarg);
1443 return 1;
1444 }
1445 break;
1446 case 13: hts_srand48(atoi(optarg)); break;
1447 case 14:
1448 if ( !strcasecmp(optarg,"drop") ) mplp.ambig_reads = B2B_DROP;
1449 else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD;
1450 else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0;
1451 else error("The option to --ambig-reads not recognised: %s\n",optarg);
1452 break;
1453 default:
1454 fprintf(stderr,"Invalid option: '%c'\n", c);
1455 return 1;
1456 }
1457 }
1458
1459 if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) )
1460 {
1461 fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n");
1462 mplp.fmt_flag |= B2B_FMT_DP;
1463 }
1464 if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) )
1465 {
1466 if ( mplp.flag&MPLP_VCF )
1467 {
1468 if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF;
1469 else mplp.output_type = FT_VCF_GZ;
1470 }
1471 else if ( mplp.flag&MPLP_BCF )
1472 {
1473 if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF;
1474 else mplp.output_type = FT_BCF_GZ;
1475 }
1476 }
1477 if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
1478 {
1479 fprintf(stderr,"Error: The -B option cannot be combined with -E\n");
1480 return 1;
1481 }
1482 if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
1483 if (argc == 1)
1484 {
1485 print_usage(stderr, &mplp);
1486 return 1;
1487 }
1488 if (!mplp.fai && !noref) {
1489 fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n");
1490 return 1;
1491 }
1492 int ret,i;
1493 if (file_list)
1494 {
1495 if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
1496 mplp.files = fn;
1497 mplp.nfiles = nfiles;
1498 }
1499 else
1500 {
1501 mplp.nfiles = argc - optind;
1502 mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*));
1503 for (i=0; i<mplp.nfiles; i++) mplp.files[i] = strdup(argv[optind+i]);
1504 }
1505 ret = mpileup(&mplp);
1506
1507 for (i=0; i<mplp.nfiles; i++) free(mplp.files[i]);
1508 free(mplp.files);
1509 free(mplp.reg_fname); free(mplp.pl_list);
1510 if (mplp.fai) fai_destroy(mplp.fai);
1511 if (mplp.bed) regidx_destroy(mplp.bed);
1512 if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
1513 if (mplp.reg) regidx_destroy(mplp.reg);
1514 bam_smpl_destroy(mplp.bsmpl);
1515
1516 return ret;
1517 }
1518