1 /* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
2
3 Copyright (C) 2013-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE. */
24
25 #include <stdarg.h>
26 #include <string.h>
27 #include <strings.h>
28 #include <assert.h>
29 #include <errno.h>
30 #include <unistd.h>
31 #include <getopt.h>
32 #include <math.h>
33 #include <htslib/vcf.h>
34 #include <time.h>
35 #include <zlib.h>
36 #include <stdarg.h>
37 #include <htslib/kfunc.h>
38 #include <htslib/synced_bcf_reader.h>
39 #include <htslib/khash_str2int.h>
40 #include <ctype.h>
41 #include "bcftools.h"
42 #include "call.h"
43 #include "prob1.h"
44 #include "ploidy.h"
45 #include "gvcf.h"
46 #include "regidx.h"
47 #include "vcfbuf.h"
48
49 void error(const char *format, ...);
50
51 #define CF_NO_GENO 1
52 #define CF_INS_MISSED (1<<1)
53 #define CF_CCALL (1<<2)
54 // (1<<3)
55 // (1<<4)
56 // (1<<5)
57 #define CF_ACGT_ONLY (1<<6)
58 #define CF_QCALL (1<<7)
59 #define CF_ADJLD (1<<8)
60 #define CF_NO_INDEL (1<<9)
61 #define CF_ANNO_MAX (1<<10)
62 #define CF_MCALL (1<<11)
63 #define CF_PAIRCALL (1<<12)
64 #define CF_QCNT (1<<13)
65 #define CF_INDEL_ONLY (1<<14)
66
67 typedef struct
68 {
69 tgt_als_t *als;
70 int nmatch_als, ibuf;
71 }
72 rec_tgt_t;
73
74 typedef struct
75 {
76 int flag; // combination of CF_* flags above
77 int output_type, n_threads, record_cmd_line, clevel;
78 htsFile *bcf_in, *out_fh;
79 char *bcf_fname, *output_fname;
80 char **samples; // for subsampling and ploidy
81 int nsamples, *samples_map; // mapping from output sample names to original VCF
82 char *regions, *targets; // regions to process
83 int regions_is_file, targets_is_file, regions_overlap;
84 regidx_t *tgt_idx;
85 regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp;
86 vcfbuf_t *vcfbuf;
87
88 char *samples_fname;
89 int samples_is_file;
90 int *sample2sex; // mapping for ploidy. If negative, interpreted as -1*ploidy
91 int *sex2ploidy, *sex2ploidy_prev, nsex;
92 ploidy_t *ploidy;
93 gvcf_t *gvcf;
94
95 bcf1_t *missed_line;
96 call_t aux; // parameters and temporary data
97 kstring_t str;
98
99 int argc;
100 char **argv;
101
102 // int flag, prior_type, n1, n_sub, *sublist, n_perm;
103 // uint32_t *trio_aux;
104 // char *prior_file, **subsam;
105 // uint8_t *ploidy;
106 // double theta, pref, indel_frac, min_smpl_frac, min_lrt;
107 // Permutation tests
108 // int n_perm, *seeds;
109 // double min_perm_p;
110 // void *bed;
111 }
112 args_t;
113
add_sample(void * name2idx,char ** lines,int * nlines,int * mlines,char * name,char sex,int * ith)114 static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char sex, int *ith)
115 {
116 int ret = khash_str2int_get(name2idx, name, ith);
117 if ( ret==0 ) return lines;
118
119 hts_expand(char*,(*nlines+1),*mlines,lines);
120 int len = strlen(name);
121 lines[*nlines] = (char*) malloc(len+3);
122 memcpy(lines[*nlines],name,len);
123 lines[*nlines][len] = ' ';
124 lines[*nlines][len+1] = sex;
125 lines[*nlines][len+2] = 0;
126 *ith = *nlines;
127 (*nlines)++;
128 khash_str2int_set(name2idx, strdup(name), *ith);
129 return lines;
130 }
131
132 typedef struct
133 {
134 const char *alias, *about, *ploidy;
135 }
136 ploidy_predef_t;
137
138 static ploidy_predef_t ploidy_predefs[] =
139 {
140 { .alias = "GRCh37",
141 .about = "Human Genome reference assembly GRCh37 / hg19",
142 .ploidy =
143 "X 1 60000 M 1\n"
144 "X 2699521 154931043 M 1\n"
145 "Y 1 59373566 M 1\n"
146 "Y 1 59373566 F 0\n"
147 "MT 1 16569 M 1\n"
148 "MT 1 16569 F 1\n"
149 "chrX 1 60000 M 1\n"
150 "chrX 2699521 154931043 M 1\n"
151 "chrY 1 59373566 M 1\n"
152 "chrY 1 59373566 F 0\n"
153 "chrM 1 16569 M 1\n"
154 "chrM 1 16569 F 1\n"
155 "* * * M 2\n"
156 "* * * F 2\n"
157 },
158 { .alias = "GRCh38",
159 .about = "Human Genome reference assembly GRCh38 / hg38",
160 .ploidy =
161 "X 1 9999 M 1\n"
162 "X 2781480 155701381 M 1\n"
163 "Y 1 57227415 M 1\n"
164 "Y 1 57227415 F 0\n"
165 "MT 1 16569 M 1\n"
166 "MT 1 16569 F 1\n"
167 "chrX 1 9999 M 1\n"
168 "chrX 2781480 155701381 M 1\n"
169 "chrY 1 57227415 M 1\n"
170 "chrY 1 57227415 F 0\n"
171 "chrM 1 16569 M 1\n"
172 "chrM 1 16569 F 1\n"
173 "* * * M 2\n"
174 "* * * F 2\n"
175 },
176 { .alias = "X",
177 .about = "Treat male samples as haploid and female as diploid regardless of the chromosome name",
178 .ploidy =
179 "* * * M 1\n"
180 "* * * F 2\n"
181 },
182 { .alias = "Y",
183 .about = "Treat male samples as haploid and female as no-copy, regardless of the chromosome name",
184 .ploidy =
185 "* * * M 1\n"
186 "* * * F 0\n"
187 },
188 { .alias = "1",
189 .about = "Treat all samples as haploid",
190 .ploidy =
191 "* * * * 1\n"
192 },
193 { .alias = "2",
194 .about = "Treat all samples as diploid",
195 .ploidy =
196 "* * * * 2\n"
197 },
198 {
199 .alias = NULL,
200 .about = NULL,
201 .ploidy = NULL,
202 }
203 };
204
205 // only 5 columns are required and the first is ignored:
206 // ignored,sample,father(or 0),mother(or 0),sex(1=M,2=F)
parse_ped_samples(call_t * call,char ** vals,int nvals,int * nsmpl)207 static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl)
208 {
209 int i, j, mlines = 0, nlines = 0;
210 kstring_t str = {0,0,0}, fam_str = {0,0,0};
211 void *name2idx = khash_str2int_init();
212 char **lines = NULL;
213 for (i=0; i<nvals; i++)
214 {
215 str.l = 0;
216 kputs(vals[i], &str);
217 char *col_ends[5], *tmp = str.s;
218 j = 0;
219 while ( *tmp && j<5 )
220 {
221 if ( isspace(*tmp) )
222 {
223 *tmp = 0;
224 ++tmp;
225 while ( isspace(*tmp) ) tmp++; // allow multiple spaces
226 col_ends[j] = tmp-1;
227 j++;
228 continue;
229 }
230 tmp++;
231 }
232 if ( j!=5 ) break;
233
234 char sex = col_ends[3][1]=='1' ? 'M' : 'F';
235 lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j);
236 if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother
237 {
238 call->nfams++;
239 hts_expand(family_t, call->nfams, call->mfams, call->fams);
240 family_t *fam = &call->fams[call->nfams-1];
241 fam_str.l = 0;
242 ksprintf(&fam_str,"father=%s, mother=%s, child=%s", col_ends[1]+1,col_ends[2]+1,col_ends[0]+1);
243 fam->name = strdup(fam_str.s);
244
245 if ( !khash_str2int_has_key(name2idx, col_ends[1]+1) )
246 lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, 'M', &fam->sample[FATHER]);
247 if ( !khash_str2int_has_key(name2idx, col_ends[2]+1) )
248 lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, 'F', &fam->sample[MOTHER]);
249
250 khash_str2int_get(name2idx, col_ends[0]+1, &fam->sample[CHILD]);
251 khash_str2int_get(name2idx, col_ends[1]+1, &fam->sample[FATHER]);
252 khash_str2int_get(name2idx, col_ends[2]+1, &fam->sample[MOTHER]);
253 }
254 }
255 free(str.s);
256 free(fam_str.s);
257 khash_str2int_destroy_free(name2idx);
258
259 if ( i!=nvals ) // not a ped file
260 {
261 if ( i>0 ) error("Could not parse samples, not a PED format.\n");
262 return NULL;
263 }
264 *nsmpl = nlines;
265 return lines;
266 }
267
268
269 /*
270 * Reads sample names and their ploidy (optional) from a file.
271 * Alternatively, if no such file exists, the file name is interpreted
272 * as a comma-separated list of samples. When ploidy is not present,
273 * the default ploidy 2 is assumed.
274 */
set_samples(args_t * args,const char * fn,int is_file)275 static void set_samples(args_t *args, const char *fn, int is_file)
276 {
277 int i, nlines;
278 char **lines = hts_readlist(fn, is_file, &nlines);
279 if ( !lines ) error("Could not read the file: %s\n", fn);
280
281 int nsmpls;
282 char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls);
283 if ( smpls )
284 {
285 for (i=0; i<nlines; i++) free(lines[i]);
286 free(lines);
287 lines = smpls;
288 nlines = nsmpls;
289 }
290
291 args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
292 args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
293 int dflt_sex_id = ploidy_nsex(args->ploidy) - 1;
294 for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
295
296 int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
297 for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) old2new[i] = -1;
298
299 int nsmpl = 0, map_needed = 0;
300 for (i=0; i<nlines; i++)
301 {
302 char *ss = lines[i];
303 while ( *ss && isspace(*ss) ) ss++;
304 if ( !*ss ) error("Could not parse: %s\n", lines[i]);
305 if ( *ss=='#' ) continue;
306 char *se = ss;
307 while ( *se && !isspace(*se) ) se++;
308 char x = *se, *xptr = se; *se = 0;
309
310 int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
311 if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
312 if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
313
314 ss = se+(x != '\0');
315 while ( *ss && isspace(*ss) ) ss++;
316 if ( !*ss ) ss = "2"; // default ploidy
317 se = ss;
318 while ( *se && !isspace(*se) ) se++;
319 if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }
320
321 if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') )
322 args->sample2sex[nsmpl] = -1*(ss[0]-'0');
323 else
324 args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss);
325
326 if ( ismpl!=nsmpl ) map_needed = 1;
327 args->samples_map[nsmpl] = ismpl;
328 old2new[ismpl] = nsmpl;
329 nsmpl++;
330 }
331
332 for (i=0; i<args->aux.nfams; i++)
333 {
334 int j, nmiss = 0;
335 family_t *fam = &args->aux.fams[i];
336 for (j=0; j<3; j++)
337 {
338 fam->sample[i] = old2new[fam->sample[i]];
339 if ( fam->sample[i]<0 ) nmiss++;
340 }
341 assert( nmiss==0 || nmiss==3 );
342 }
343 free(old2new);
344
345 if ( !map_needed ) { free(args->samples_map); args->samples_map = NULL; }
346
347 args->nsamples = nsmpl;
348 args->samples = lines;
349 }
350
init_missed_line(args_t * args)351 static void init_missed_line(args_t *args)
352 {
353 int i;
354 for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
355 {
356 args->aux.gts[i*2] = bcf_gt_missing;
357 args->aux.gts[i*2+1] = bcf_int32_vector_end;
358 }
359 args->missed_line = bcf_init1();
360 bcf_update_genotypes(args->aux.hdr, args->missed_line, args->aux.gts, 2*bcf_hdr_nsamples(args->aux.hdr));
361 bcf_float_set_missing(args->missed_line->qual);
362 }
363
tgt_parse(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)364 static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
365 {
366 char *ss = (char*) line;
367 while ( *ss && isspace(*ss) ) ss++;
368 if ( !*ss ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; }
369 if ( *ss=='#' ) return -1; // skip comments
370
371 char *se = ss;
372 while ( *se && !isspace(*se) ) se++;
373
374 *chr_beg = ss;
375 *chr_end = se-1;
376
377 if ( !*se ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; }
378
379 ss = se+1;
380 *beg = strtod(ss, &se);
381 if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
382 if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
383 (*beg)--;
384 *end = *beg;
385
386 if ( !usr ) return 0; // allele information not required
387
388 ss = se+1;
389 tgt_als_t *als = (tgt_als_t*)payload;
390 als->used = 0;
391 als->n = 0;
392 als->allele = NULL;
393 while ( *ss )
394 {
395 se = ss;
396 while ( *se && *se!=',' ) se++;
397 als->n++;
398 als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele));
399 als->allele[als->n-1] = (char*)malloc(se-ss+1);
400 memcpy(als->allele[als->n-1],ss,se-ss);
401 als->allele[als->n-1][se-ss] = 0;
402 ss = se+1;
403 if ( !*se ) break;
404 }
405 return 0;
406 }
tgt_free(void * payload)407 static void tgt_free(void *payload)
408 {
409 tgt_als_t *als = (tgt_als_t*)payload;
410 int i;
411 for (i=0; i<als->n; i++) free(als->allele[i]);
412 free(als->allele);
413 }
tgt_flush_region(args_t * args,char * chr,uint32_t beg,uint32_t end)414 static void tgt_flush_region(args_t *args, char *chr, uint32_t beg, uint32_t end)
415 {
416 if ( !regidx_overlap(args->tgt_idx, chr,beg,end,args->tgt_itr_tmp) ) return;
417 while ( regitr_overlap(args->tgt_itr_tmp) )
418 {
419 if ( args->tgt_itr_tmp->beg < beg ) continue;
420
421 tgt_als_t *tgt_als = ®itr_payload(args->tgt_itr_tmp,tgt_als_t);
422 if ( tgt_als->used ) continue;
423
424 args->missed_line->rid = bcf_hdr_name2id(args->aux.hdr,chr);
425 args->missed_line->pos = args->tgt_itr_tmp->beg;
426 bcf_unpack(args->missed_line,BCF_UN_ALL);
427 bcf_update_alleles(args->aux.hdr, args->missed_line, (const char**)tgt_als->allele, tgt_als->n);
428 tgt_als->used = 1;
429 if ( bcf_write1(args->out_fh, args->aux.hdr, args->missed_line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
430 }
431 }
tgt_flush(args_t * args,bcf1_t * rec)432 static void tgt_flush(args_t *args, bcf1_t *rec)
433 {
434 if ( rec )
435 {
436 char *chr = (char*)bcf_seqname(args->aux.hdr,rec);
437
438 if ( !args->tgt_itr_prev ) // first record
439 tgt_flush_region(args,chr,0,rec->pos-1);
440
441 else if ( strcmp(chr,args->tgt_itr_prev->seq) ) // first record on a new chromosome
442 {
443 tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,REGIDX_MAX);
444 tgt_flush_region(args,chr,0,rec->pos-1);
445 }
446 else // another record on the same chromosome
447 tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,rec->pos-1);
448 }
449 else
450 {
451 // flush everything
452 if ( args->tgt_itr_prev )
453 tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,REGIDX_MAX);
454
455 int i, nchr = 0;
456 char **chr = regidx_seq_names(args->tgt_idx, &nchr);
457 for (i=0; i<nchr; i++)
458 tgt_flush_region(args,chr[i],0,REGIDX_MAX);
459 }
460 }
is_indel(int nals,char ** als)461 inline static int is_indel(int nals, char **als)
462 {
463 // This is mpileup output, we can make some assumption:
464 // - no MNPs
465 // - "<*>" is not present at indels sites and there are no other symbolic alleles than <*>
466 if ( als[1][0]=='<' ) return 0;
467
468 int i;
469 for (i=0; i<nals; i++)
470 {
471 if ( als[i][0]=='<' ) continue;
472 if ( als[i][1] ) return 1;
473 }
474 return 0;
475 }
next_line(args_t * args)476 bcf1_t *next_line(args_t *args)
477 {
478 bcf1_t *rec = NULL;
479 if ( !args->vcfbuf )
480 {
481 while ( bcf_sr_next_line(args->aux.srs) )
482 {
483 rec = args->aux.srs->readers[0].buffer[0];
484 if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n");
485 if ( args->tgt_idx )
486 {
487 if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue;
488
489 // For backward compatibility: require the exact position, not an interval overlap
490 int pos_match = 0;
491 while ( regitr_overlap(args->tgt_itr) )
492 {
493 if ( args->tgt_itr->beg != rec->pos ) continue;
494 pos_match = 1;
495 break;
496 }
497 if ( !pos_match ) continue;
498 }
499 if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map);
500 bcf_unpack(rec, BCF_UN_STR);
501 return rec;
502 }
503 return NULL;
504 }
505
506 // If we are here,-C alleles was given and vcfbuf and tgt_idx are set
507
508 // Fill the buffer with duplicate lines
509 int vcfbuf_full = 1;
510 int nbuf = vcfbuf_nsites(args->vcfbuf);
511 bcf1_t *rec0 = NULL, *recN = NULL;
512 if ( nbuf==0 ) vcfbuf_full = 0;
513 else if ( nbuf==1 )
514 {
515 vcfbuf_full = 0;
516 rec0 = vcfbuf_peek(args->vcfbuf, 0);
517 }
518 else
519 {
520 rec0 = vcfbuf_peek(args->vcfbuf, 0);
521 recN = vcfbuf_peek(args->vcfbuf, nbuf-1);
522 if ( rec0->rid == recN->rid && rec0->pos == recN->pos ) vcfbuf_full = 0;
523 }
524 if ( !vcfbuf_full )
525 {
526 while ( bcf_sr_next_line(args->aux.srs) )
527 {
528 rec = args->aux.srs->readers[0].buffer[0];
529 if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n");
530 if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue;
531 // as above: require the exact position, not an interval overlap
532 int exact_match = 0;
533 while ( regitr_overlap(args->tgt_itr) )
534 {
535 if ( args->tgt_itr->beg != rec->pos ) continue;
536 exact_match = 1;
537 break;
538 }
539 if ( !exact_match ) continue;
540
541 if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map);
542 bcf_unpack(rec, BCF_UN_STR);
543 if ( !rec0 ) rec0 = rec;
544 recN = rec;
545 args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec);
546 if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break;
547 }
548 }
549
550 nbuf = vcfbuf_nsites(args->vcfbuf);
551 int n, i,j;
552 for (n=nbuf; n>1; n--)
553 {
554 recN = vcfbuf_peek(args->vcfbuf, n-1);
555 if ( rec0->rid==recN->rid && rec0->pos==recN->pos ) break;
556 }
557 if ( n==0 )
558 {
559 assert( !nbuf );
560 return NULL;
561 }
562
563 // Find the VCF and tab record with the best matching combination of alleles, prioritize
564 // records of the same type (snp vs indel)
565 rec_tgt_t rec_tgt;
566 memset(&rec_tgt,0,sizeof(rec_tgt));
567 regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr);
568 regitr_t *tmp_itr = regitr_init(args->tgt_idx);
569 regitr_copy(tmp_itr, args->tgt_itr);
570 for (i=0; i<n; i++)
571 {
572 rec = vcfbuf_peek(args->vcfbuf, i);
573 int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1;
574 while ( regitr_overlap(tmp_itr) )
575 {
576 if ( tmp_itr->beg != rec->pos ) continue;
577 tgt_als_t *als = ®itr_payload(tmp_itr,tgt_als_t);
578 if ( als->used ) continue;
579 int nmatch_als = 0;
580 vcmp_t *vcmp = vcmp_init();
581 int ret = vcmp_set_ref(vcmp, rec->d.allele[0], als->allele[0]);
582 if ( ret==0 )
583 {
584 nmatch_als++;
585 if ( rec->n_allele > 1 && als->n > 1 )
586 {
587 for (j=1; j<als->n; j++)
588 {
589 if ( vcmp_find_allele(vcmp, rec->d.allele+1, rec->n_allele-1, als->allele[j])>=0 ) nmatch_als++;
590 }
591 }
592 }
593 int als_indel = is_indel(als->n, als->allele) ? 1 : -1;
594 nmatch_als *= rec_indel*als_indel;
595 if ( nmatch_als > rec_tgt.nmatch_als || !rec_tgt.als )
596 {
597 rec_tgt.nmatch_als = nmatch_als;
598 rec_tgt.als = als;
599 rec_tgt.ibuf = i;
600 }
601 vcmp_destroy(vcmp);
602 }
603 }
604 regitr_destroy(tmp_itr);
605
606 args->aux.tgt_als = rec_tgt.als;
607 if ( rec_tgt.als ) rec_tgt.als->used = 1;
608
609 rec = vcfbuf_remove(args->vcfbuf, rec_tgt.ibuf);
610 return rec;
611 }
612
init_data(args_t * args)613 static void init_data(args_t *args)
614 {
615 args->aux.srs = bcf_sr_init();
616
617 // Open files for input and output, initialize structures
618 if ( args->targets )
619 {
620 args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : (regidx_free_f) NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
621 args->tgt_itr = regitr_init(args->tgt_idx);
622 args->tgt_itr_tmp = regitr_init(args->tgt_idx);
623 }
624
625 if ( args->regions )
626 {
627 bcf_sr_set_opt(args->aux.srs,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
628 if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
629 error("Failed to read the regions: %s\n", args->regions);
630 }
631
632 if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) )
633 error("Failed to read from %s: %s\n", !strcmp("-",args->bcf_fname)?"standard input":args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
634 args->aux.hdr = bcf_sr_get_header(args->aux.srs,0);
635
636 int i;
637 if ( args->samples_fname )
638 {
639 set_samples(args, args->samples_fname, args->samples_is_file);
640 if ( args->aux.flag&CALL_CONSTR_TRIO )
641 {
642 if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
643 fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
644 }
645 }
646 if ( args->ploidy )
647 {
648 args->nsex = ploidy_nsex(args->ploidy);
649 args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
650 args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
651 if ( !args->nsamples )
652 {
653 args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
654 args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
655 for (i=0; i<args->nsamples; i++) args->sample2sex[i] = args->nsex - 1;
656 }
657 }
658 if ( args->nsamples )
659 {
660 args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
661 for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy);
662 for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy);
663 for (i=0; i<args->nsamples; i++)
664 if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1;
665 }
666
667 if ( args->gvcf )
668 {
669 int id = bcf_hdr_id2int(args->aux.hdr,BCF_DT_ID,"DP");
670 if ( id<0 || !bcf_hdr_idinfo_exists(args->aux.hdr,BCF_HL_FMT,id) ) error("--gvcf output mode requires FORMAT/DP tag, which is not present in the input header\n");
671 gvcf_update_header(args->gvcf, args->aux.hdr);
672 }
673
674 if ( args->samples_map )
675 {
676 args->aux.hdr = bcf_hdr_subset(bcf_sr_get_header(args->aux.srs,0), args->nsamples, args->samples, args->samples_map);
677 if ( !args->aux.hdr ) error("Error occurred while subsetting samples\n");
678 for (i=0; i<args->nsamples; i++)
679 if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
680 if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
681 }
682 else
683 {
684 args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
685 if ( args->samples )
686 {
687 for (i=0; i<args->nsamples; i++)
688 if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
689 error("No such sample: %s\n", args->samples[i]);
690 }
691 }
692
693 if ( args->aux.flag & CALL_CONSTR_ALLELES )
694 args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
695
696 char wmode[8];
697 set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
698 args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
699 if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno));
700 if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
701
702 if ( args->flag & CF_QCALL )
703 return;
704
705 if ( args->flag & CF_MCALL )
706 mcall_init(&args->aux);
707
708 if ( args->flag & CF_CCALL )
709 ccall_init(&args->aux);
710
711 bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
712 bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");
713
714 if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
715 if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
716
717 if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
718 }
719
destroy_data(args_t * args)720 static void destroy_data(args_t *args)
721 {
722 if ( args->vcfbuf ) vcfbuf_destroy(args->vcfbuf);
723 if ( args->tgt_idx )
724 {
725 regidx_destroy(args->tgt_idx);
726 regitr_destroy(args->tgt_itr);
727 regitr_destroy(args->tgt_itr_tmp);
728 if ( args->tgt_itr_prev ) regitr_destroy(args->tgt_itr_prev);
729 }
730 if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux);
731 else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux);
732 else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux);
733 int i;
734 if ( args->samples )
735 {
736 for (i=0; i<args->nsamples; i++) free(args->samples[i]);
737 }
738 if ( args->aux.fams )
739 {
740 for (i=0; i<args->aux.nfams; i++) free(args->aux.fams[i].name);
741 free(args->aux.fams);
742 }
743 if ( args->missed_line ) bcf_destroy(args->missed_line);
744 ploidy_destroy(args->ploidy);
745 free(args->sex2ploidy);
746 free(args->sex2ploidy_prev);
747 free(args->samples);
748 free(args->samples_map);
749 free(args->sample2sex);
750 free(args->aux.ploidy);
751 free(args->str.s);
752 if ( args->gvcf ) gvcf_destroy(args->gvcf);
753 bcf_hdr_destroy(args->aux.hdr);
754 if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
755 bcf_sr_destroy(args->aux.srs);
756 }
757
parse_novel_rate(args_t * args,const char * str)758 void parse_novel_rate(args_t *args, const char *str)
759 {
760 if ( sscanf(str,"%le,%le,%le",&args->aux.trio_Pm_SNPs,&args->aux.trio_Pm_del,&args->aux.trio_Pm_ins)==3 ) // explicit for all
761 {
762 args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
763 args->aux.trio_Pm_del = 1 - args->aux.trio_Pm_del;
764 args->aux.trio_Pm_ins = 1 - args->aux.trio_Pm_ins;
765 }
766 else if ( sscanf(str,"%le,%le",&args->aux.trio_Pm_SNPs,&args->aux.trio_Pm_del)==2 ) // dynamic for indels
767 {
768 args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
769 args->aux.trio_Pm_ins = -1; // negative value for dynamic calculation
770 }
771 else if ( sscanf(str,"%le",&args->aux.trio_Pm_SNPs)==1 ) // same for all
772 {
773 args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
774 args->aux.trio_Pm_del = -1;
775 args->aux.trio_Pm_ins = -1;
776 }
777 else error("Could not parse --novel-rate %s\n", str);
778 }
779
list_annotations(FILE * fp)780 static void list_annotations(FILE *fp)
781 {
782 fprintf(fp,
783 "\n"
784 "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n"
785 " INFO/PV4 .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n"
786 "\n"
787 "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n"
788 " FORMAT/GQ .. Phred-scaled genotype quality (Number=1,Type=Integer)\n"
789 " FORMAT/GP .. Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n"
790 "\n");
791 }
792
parse_output_tags(const char * str)793 static int parse_output_tags(const char *str)
794 {
795 int flag = 0;
796 const char *ss = str;
797 while ( *ss )
798 {
799 const char *se = ss;
800 while ( *se && *se!=',' ) se++;
801 if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss) ) flag |= CALL_FMT_GQ;
802 else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP;
803 else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4;
804 else
805 {
806 fprintf(stderr,"Could not parse \"%s\"\n", str);
807 exit(1);
808 }
809 if ( !*se ) break;
810 ss = se + 1;
811 }
812 return flag;
813 }
814
set_ploidy(args_t * args,bcf1_t * rec)815 static void set_ploidy(args_t *args, bcf1_t *rec)
816 {
817 ploidy_query(args->ploidy,(char*)bcf_seqname(args->aux.hdr,rec),rec->pos,args->sex2ploidy,NULL,NULL);
818
819 int i;
820 for (i=0; i<args->nsex; i++)
821 if ( args->sex2ploidy[i]!=args->sex2ploidy_prev[i] ) break;
822
823 if ( i==args->nsex ) return; // ploidy same as previously
824
825 for (i=0; i<args->nsamples; i++)
826 {
827 if ( args->sample2sex[i]<0 )
828 args->aux.ploidy[i] = -1*args->sample2sex[i];
829 else
830 args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
831 }
832 int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
833 }
834
init_ploidy(char * alias)835 ploidy_t *init_ploidy(char *alias)
836 {
837 const ploidy_predef_t *pld = ploidy_predefs;
838
839 int detailed = 0, len = strlen(alias);
840 if ( alias[len-1]=='?' ) { detailed = 1; alias[len-1] = 0; }
841
842 while ( pld->alias && strcasecmp(alias,pld->alias) ) pld++;
843
844 if ( !pld->alias )
845 {
846 fprintf(stderr,"\nPRE-DEFINED PLOIDY FILES\n\n");
847 fprintf(stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n");
848 fprintf(stderr," * Coordinates are 1-based inclusive.\n");
849 fprintf(stderr," * A '*' means any value not otherwise defined.\n\n");
850 pld = ploidy_predefs;
851 while ( pld->alias )
852 {
853 fprintf(stderr,"%s\n .. %s\n\n", pld->alias,pld->about);
854 if ( detailed )
855 fprintf(stderr,"%s\n", pld->ploidy);
856 pld++;
857 }
858 fprintf(stderr,"Run as --ploidy <alias> (e.g. --ploidy GRCh37).\n");
859 fprintf(stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n");
860 fprintf(stderr,"\n");
861 exit(-1);
862 }
863 else if ( detailed )
864 {
865 fprintf(stderr,"%s", pld->ploidy);
866 exit(-1);
867 }
868 return ploidy_init_string(pld->ploidy,2);
869 }
870
usage(args_t * args)871 static void usage(args_t *args)
872 {
873 fprintf(stderr, "\n");
874 fprintf(stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with bcftools mpileup.\n");
875 fprintf(stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n");
876 fprintf(stderr, " functionality has been temporarily lost in the process of transition to htslib,\n");
877 fprintf(stderr, " but will be added back on popular demand. The original calling model can be\n");
878 fprintf(stderr, " invoked with the -c option.\n");
879 fprintf(stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
880 fprintf(stderr, "\n");
881 fprintf(stderr, "File format options:\n");
882 fprintf(stderr, " --no-version Do not append version and command line to the header\n");
883 fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
884 fprintf(stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
885 fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
886 fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
887 fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
888 fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
889 fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
890 fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
891 fprintf(stderr, " -s, --samples LIST List of samples to include [all samples]\n");
892 fprintf(stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
893 fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
894 fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
895 fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
896 fprintf(stderr, "\n");
897 fprintf(stderr, "Input/output options:\n");
898 fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n");
899 fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
900 fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
901 fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
902 fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
903 fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
904 fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
905 fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
906 fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
907 fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
908 fprintf(stderr, " -v, --variants-only Output variant sites only\n");
909 fprintf(stderr, "\n");
910 fprintf(stderr, "Consensus/variant calling options:\n");
911 fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
912 fprintf(stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n");
913 fprintf(stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
914 fprintf(stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
915 fprintf(stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)<FLOAT with -c [0.5]\n");
916 fprintf(stderr, " -P, --prior FLOAT Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
917 fprintf(stderr, "\n");
918 fprintf(stderr, "Example:\n");
919 fprintf(stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n");
920 fprintf(stderr, " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
921
922 // todo (and more)
923 // fprintf(stderr, "\nContrast calling and association test options:\n");
924 // fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
925 // fprintf(stderr, " -C FLOAT posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
926 // fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
927 // fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
928 fprintf(stderr, "\n");
929 exit(-1);
930 }
931
main_vcfcall(int argc,char * argv[])932 int main_vcfcall(int argc, char *argv[])
933 {
934 char *ploidy_fname = NULL, *ploidy = NULL;
935 args_t args;
936 memset(&args, 0, sizeof(args_t));
937 args.argc = argc; args.argv = argv;
938 args.aux.prior_type = -1;
939 args.aux.indel_frac = -1;
940 args.aux.theta = 1.1e-3;
941 args.aux.pref = 0.5;
942 args.aux.min_perm_p = 0.01;
943 args.aux.min_lrt = 1;
944 args.flag = CF_ACGT_ONLY;
945 args.output_fname = "-";
946 args.output_type = FT_VCF;
947 args.n_threads = 0;
948 args.record_cmd_line = 1;
949 args.aux.trio_Pm_SNPs = 1 - 1e-8;
950 args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9;
951 args.regions_overlap = 1;
952 args.clevel = -1;
953
954 int c;
955 static struct option loptions[] =
956 {
957 {"help",no_argument,NULL,'h'},
958 {"format-fields",required_argument,NULL,'f'},
959 {"annotate",required_argument,NULL,'a'},
960 {"prior-freqs",required_argument,NULL,'F'},
961 {"gvcf",required_argument,NULL,'g'},
962 {"group-samples",required_argument,NULL,'G'},
963 {"group-samples-tag",required_argument,NULL,3},
964 {"output",required_argument,NULL,'o'},
965 {"output-type",required_argument,NULL,'O'},
966 {"regions",required_argument,NULL,'r'},
967 {"regions-file",required_argument,NULL,'R'},
968 {"regions-overlap",required_argument,NULL,4},
969 {"samples",required_argument,NULL,'s'},
970 {"samples-file",required_argument,NULL,'S'},
971 {"targets",required_argument,NULL,'t'},
972 {"targets-file",required_argument,NULL,'T'},
973 {"threads",required_argument,NULL,9},
974 {"keep-alts",no_argument,NULL,'A'},
975 {"insert-missed",no_argument,NULL,'i'},
976 {"skip-Ns",no_argument,NULL,'N'}, // now the new default
977 {"keep-masked-refs",no_argument,NULL,'M'},
978 {"skip-variants",required_argument,NULL,'V'},
979 {"variants-only",no_argument,NULL,'v'},
980 {"consensus-caller",no_argument,NULL,'c'},
981 {"constrain",required_argument,NULL,'C'},
982 {"multiallelic-caller",no_argument,NULL,'m'},
983 {"pval-threshold",required_argument,NULL,'p'},
984 {"prior",required_argument,NULL,'P'},
985 {"novel-rate",required_argument,NULL,'n'},
986 {"ploidy",required_argument,NULL,1},
987 {"ploidy-file",required_argument,NULL,2},
988 {"chromosome-X",no_argument,NULL,'X'},
989 {"chromosome-Y",no_argument,NULL,'Y'},
990 {"no-version",no_argument,NULL,8},
991 {NULL,0,NULL,0}
992 };
993
994 char *tmp = NULL;
995 while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
996 {
997 switch (c)
998 {
999 case 2 : ploidy_fname = optarg; break;
1000 case 1 : ploidy = optarg; break;
1001 case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
1002 case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
1003 case 'G': args.aux.sample_groups = optarg; break;
1004 case 3 : args.aux.sample_groups_tag = optarg; break;
1005 case 'f': fprintf(stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n");
1006 case 'a':
1007 if (optarg[0]=='?') { list_annotations(stderr); return 1; }
1008 args.aux.output_tags |= parse_output_tags(optarg);
1009 break;
1010 case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N
1011 case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default)
1012 case 'A': args.aux.flag |= CALL_KEEPALT; break;
1013 case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
1014 case 'i': args.flag |= CF_INS_MISSED; break;
1015 case 'v': args.aux.flag |= CALL_VARONLY; break;
1016 case 'F':
1017 args.aux.prior_AN = optarg;
1018 args.aux.prior_AC = strchr(optarg,',');
1019 if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg);
1020 *args.aux.prior_AC = 0;
1021 args.aux.prior_AC++;
1022 break;
1023 case 'g':
1024 args.gvcf = gvcf_init(optarg);
1025 if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
1026 break;
1027 case 'o': args.output_fname = optarg; break;
1028 case 'O':
1029 switch (optarg[0]) {
1030 case 'b': args.output_type = FT_BCF_GZ; break;
1031 case 'u': args.output_type = FT_BCF; break;
1032 case 'z': args.output_type = FT_VCF_GZ; break;
1033 case 'v': args.output_type = FT_VCF; break;
1034 default:
1035 {
1036 args.clevel = strtol(optarg,&tmp,10);
1037 if ( *tmp || args.clevel<0 || args.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
1038 }
1039 }
1040 if ( optarg[1] )
1041 {
1042 args.clevel = strtol(optarg+1,&tmp,10);
1043 if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
1044 }
1045 break;
1046 case 'C':
1047 if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES;
1048 else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO;
1049 else error("Unknown argument to -C: \"%s\"\n", optarg);
1050 break;
1051 case 'V':
1052 if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY;
1053 else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL;
1054 else error("Unknown skip category \"%s\" (-S argument must be \"snps\" or \"indels\")\n", optarg);
1055 break;
1056 case 'm': args.flag |= CF_MCALL; break; // multiallelic calling method
1057 case 'p':
1058 args.aux.pref = strtod(optarg,&tmp);
1059 if ( *tmp ) error("Could not parse: --pval-threshold %s\n", optarg);
1060 break;
1061 case 'P': args.aux.theta = strtod(optarg,&tmp);
1062 if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg);
1063 break;
1064 case 'n': parse_novel_rate(&args,optarg); break;
1065 case 'r': args.regions = optarg; break;
1066 case 'R': args.regions = optarg; args.regions_is_file = 1; break;
1067 case 't': args.targets = optarg; break;
1068 case 'T': args.targets = optarg; args.targets_is_file = 1; break;
1069 case 's': args.samples_fname = optarg; break;
1070 case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
1071 case 9 : args.n_threads = strtol(optarg, 0, 0); break;
1072 case 8 : args.record_cmd_line = 0; break;
1073 case 4 :
1074 if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0;
1075 else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1;
1076 else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2;
1077 else error("Could not parse: --regions-overlap %s\n",optarg);
1078 break;
1079 default: usage(&args);
1080 }
1081 }
1082 // Sanity check options and initialize
1083 if ( ploidy_fname ) args.ploidy = ploidy_init(ploidy_fname, 2);
1084 else if ( ploidy ) args.ploidy = init_ploidy(ploidy);
1085
1086 if ( optind>=argc )
1087 {
1088 if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-"; // reading from stdin
1089 else usage(&args);
1090 }
1091 else args.bcf_fname = argv[optind++];
1092
1093 if ( !ploidy_fname && !ploidy )
1094 {
1095 if ( !args.samples_is_file ) fprintf(stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n");
1096 args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2);
1097 }
1098
1099 if ( !args.ploidy ) error("Could not initialize ploidy\n");
1100 if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n");
1101 if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n");
1102 if ( (args.flag & CF_CCALL ? 1: 0) && args.gvcf ) error("gvcf -g option not functional with -c calling mode yet\n");
1103 if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n"); // not sure about this, please fix
1104 if ( args.aux.flag & CALL_CONSTR_ALLELES )
1105 {
1106 if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n");
1107 if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n");
1108 }
1109 if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n");
1110 if ( args.aux.flag&CALL_VARONLY && args.gvcf ) error("The two options cannot be combined: --variants-only and --gvcf\n");
1111 if ( args.aux.sample_groups && !(args.flag & CF_MCALL) ) error("The -G feature is supported only with the -m calling mode\n");
1112 init_data(&args);
1113
1114 bcf1_t *bcf_rec;
1115 while ( (bcf_rec = next_line(&args)) )
1116 {
1117 // Skip duplicate positions with all matching `-C alleles -T` used up
1118 if ( args.aux.flag&CALL_CONSTR_ALLELES && !args.aux.tgt_als ) continue;
1119
1120 // Skip unwanted sites
1121 int i, is_indel = bcf_is_snp(bcf_rec) ? 0 : 1;
1122 if ( (args.flag & CF_INDEL_ONLY) && !is_indel ) continue;
1123 if ( (args.flag & CF_NO_INDEL) && is_indel ) continue;
1124 if ( (args.flag & CF_ACGT_ONLY) && (bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue; // REF[0] is 'N'
1125
1126 // Which allele is symbolic? All SNPs should have it, but not indels
1127 args.aux.unseen = 0;
1128 for (i=1; i<bcf_rec->n_allele; i++)
1129 {
1130 if ( bcf_rec->d.allele[i][0]=='X' ) { args.aux.unseen = i; break; } // old X
1131 if ( bcf_rec->d.allele[i][0]=='<' )
1132 {
1133 if ( bcf_rec->d.allele[i][1]=='X' && bcf_rec->d.allele[i][2]=='>' ) { args.aux.unseen = i; break; } // old <X>
1134 if ( bcf_rec->d.allele[i][1]=='*' && bcf_rec->d.allele[i][2]=='>' ) { args.aux.unseen = i; break; } // new <*>
1135 }
1136 }
1137 int is_ref = (bcf_rec->n_allele==1 || (bcf_rec->n_allele==2 && args.aux.unseen>0)) ? 1 : 0;
1138
1139 if ( is_ref && args.aux.flag&CALL_VARONLY )
1140 continue;
1141
1142 bcf_unpack(bcf_rec, BCF_UN_ALL);
1143 if ( args.nsex ) set_ploidy(&args, bcf_rec);
1144
1145 // Various output modes: QCall output (todo)
1146 if ( args.flag & CF_QCALL )
1147 {
1148 qcall(&args.aux, bcf_rec);
1149 continue;
1150 }
1151
1152 if ( args.flag & CF_INS_MISSED )
1153 {
1154 tgt_flush(&args,bcf_rec);
1155 if ( !args.tgt_itr_prev ) args.tgt_itr_prev = regitr_init(args.tgt_idx);
1156 regitr_copy(args.tgt_itr_prev, args.tgt_itr);
1157 }
1158
1159 // Calling modes which output VCFs
1160 int ret;
1161 if ( args.flag & CF_MCALL )
1162 ret = mcall(&args.aux, bcf_rec);
1163 else
1164 ret = ccall(&args.aux, bcf_rec);
1165 if ( ret==-1 ) error("Something is wrong\n");
1166 else if ( ret==-2 ) continue; // skip the site
1167
1168 // Normal output
1169 if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant
1170 if ( args.gvcf )
1171 bcf_rec = gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, bcf_rec, ret==1?1:0);
1172 if ( bcf_rec && bcf_write1(args.out_fh, args.aux.hdr, bcf_rec)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args.output_fname);
1173 }
1174 if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0);
1175 if ( args.flag & CF_INS_MISSED ) tgt_flush(&args,NULL);
1176 destroy_data(&args);
1177 return 0;
1178 }
1179
1180