1 /*  vcfcall.c -- SNP/indel variant calling from VCF/BCF.
2 
3     Copyright (C) 2013-2021 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.  */
24 
25 #include <stdarg.h>
26 #include <string.h>
27 #include <strings.h>
28 #include <assert.h>
29 #include <errno.h>
30 #include <unistd.h>
31 #include <getopt.h>
32 #include <math.h>
33 #include <htslib/vcf.h>
34 #include <time.h>
35 #include <zlib.h>
36 #include <stdarg.h>
37 #include <htslib/kfunc.h>
38 #include <htslib/synced_bcf_reader.h>
39 #include <htslib/khash_str2int.h>
40 #include <ctype.h>
41 #include "bcftools.h"
42 #include "call.h"
43 #include "prob1.h"
44 #include "ploidy.h"
45 #include "gvcf.h"
46 #include "regidx.h"
47 #include "vcfbuf.h"
48 
49 void error(const char *format, ...);
50 
51 #define CF_NO_GENO      1
52 #define CF_INS_MISSED   (1<<1)
53 #define CF_CCALL        (1<<2)
54 //                      (1<<3)
55 //                      (1<<4)
56 //                      (1<<5)
57 #define CF_ACGT_ONLY    (1<<6)
58 #define CF_QCALL        (1<<7)
59 #define CF_ADJLD        (1<<8)
60 #define CF_NO_INDEL     (1<<9)
61 #define CF_ANNO_MAX     (1<<10)
62 #define CF_MCALL        (1<<11)
63 #define CF_PAIRCALL     (1<<12)
64 #define CF_QCNT         (1<<13)
65 #define CF_INDEL_ONLY   (1<<14)
66 
67 typedef struct
68 {
69     tgt_als_t *als;
70     int nmatch_als, ibuf;
71 }
72 rec_tgt_t;
73 
74 typedef struct
75 {
76     int flag;   // combination of CF_* flags above
77     int output_type, n_threads, record_cmd_line, clevel;
78     htsFile *bcf_in, *out_fh;
79     char *bcf_fname, *output_fname;
80     char **samples;             // for subsampling and ploidy
81     int nsamples, *samples_map; // mapping from output sample names to original VCF
82     char *regions, *targets;    // regions to process
83     int regions_is_file, targets_is_file, regions_overlap;
84     regidx_t *tgt_idx;
85     regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp;
86     vcfbuf_t *vcfbuf;
87 
88     char *samples_fname;
89     int samples_is_file;
90     int *sample2sex;    // mapping for ploidy. If negative, interpreted as -1*ploidy
91     int *sex2ploidy, *sex2ploidy_prev, nsex;
92     ploidy_t *ploidy;
93     gvcf_t *gvcf;
94 
95     bcf1_t *missed_line;
96     call_t aux;     // parameters and temporary data
97     kstring_t str;
98 
99     int argc;
100     char **argv;
101 
102     //  int flag, prior_type, n1, n_sub, *sublist, n_perm;
103     //  uint32_t *trio_aux;
104     //  char *prior_file, **subsam;
105     //  uint8_t *ploidy;
106     //  double theta, pref, indel_frac, min_smpl_frac, min_lrt;
107     // Permutation tests
108     //  int n_perm, *seeds;
109     //  double min_perm_p;
110     //  void *bed;
111 }
112 args_t;
113 
add_sample(void * name2idx,char ** lines,int * nlines,int * mlines,char * name,char sex,int * ith)114 static char **add_sample(void *name2idx, char **lines, int *nlines, int *mlines, char *name, char sex, int *ith)
115 {
116     int ret = khash_str2int_get(name2idx, name, ith);
117     if ( ret==0 ) return lines;
118 
119     hts_expand(char*,(*nlines+1),*mlines,lines);
120     int len = strlen(name);
121     lines[*nlines] = (char*) malloc(len+3);
122     memcpy(lines[*nlines],name,len);
123     lines[*nlines][len]   = ' ';
124     lines[*nlines][len+1] = sex;
125     lines[*nlines][len+2] = 0;
126     *ith = *nlines;
127     (*nlines)++;
128     khash_str2int_set(name2idx, strdup(name), *ith);
129     return lines;
130 }
131 
132 typedef struct
133 {
134     const char *alias, *about, *ploidy;
135 }
136 ploidy_predef_t;
137 
138 static ploidy_predef_t ploidy_predefs[] =
139 {
140     { .alias  = "GRCh37",
141       .about  = "Human Genome reference assembly GRCh37 / hg19",
142       .ploidy =
143           "X 1 60000 M 1\n"
144           "X 2699521 154931043 M 1\n"
145           "Y 1 59373566 M 1\n"
146           "Y 1 59373566 F 0\n"
147           "MT 1 16569 M 1\n"
148           "MT 1 16569 F 1\n"
149           "chrX 1 60000 M 1\n"
150           "chrX 2699521 154931043 M 1\n"
151           "chrY 1 59373566 M 1\n"
152           "chrY 1 59373566 F 0\n"
153           "chrM 1 16569 M 1\n"
154           "chrM 1 16569 F 1\n"
155           "*  * *     M 2\n"
156           "*  * *     F 2\n"
157     },
158     { .alias  = "GRCh38",
159       .about  = "Human Genome reference assembly GRCh38 / hg38",
160       .ploidy =
161           "X 1 9999 M 1\n"
162           "X 2781480 155701381 M 1\n"
163           "Y 1 57227415 M 1\n"
164           "Y 1 57227415 F 0\n"
165           "MT 1 16569 M 1\n"
166           "MT 1 16569 F 1\n"
167           "chrX 1 9999 M 1\n"
168           "chrX 2781480 155701381 M 1\n"
169           "chrY 1 57227415 M 1\n"
170           "chrY 1 57227415 F 0\n"
171           "chrM 1 16569 M 1\n"
172           "chrM 1 16569 F 1\n"
173           "*  * *     M 2\n"
174           "*  * *     F 2\n"
175     },
176     { .alias  = "X",
177       .about  = "Treat male samples as haploid and female as diploid regardless of the chromosome name",
178       .ploidy =
179           "*  * *     M 1\n"
180           "*  * *     F 2\n"
181     },
182     { .alias  = "Y",
183       .about  = "Treat male samples as haploid and female as no-copy, regardless of the chromosome name",
184       .ploidy =
185           "*  * *     M 1\n"
186           "*  * *     F 0\n"
187     },
188     { .alias  = "1",
189       .about  = "Treat all samples as haploid",
190       .ploidy =
191           "*  * *     * 1\n"
192     },
193     { .alias  = "2",
194       .about  = "Treat all samples as diploid",
195       .ploidy =
196           "*  * *     * 2\n"
197     },
198     {
199         .alias  = NULL,
200         .about  = NULL,
201         .ploidy = NULL,
202     }
203 };
204 
205 // only 5 columns are required and the first is ignored:
206 //  ignored,sample,father(or 0),mother(or 0),sex(1=M,2=F)
parse_ped_samples(call_t * call,char ** vals,int nvals,int * nsmpl)207 static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl)
208 {
209     int i, j, mlines = 0, nlines = 0;
210     kstring_t str = {0,0,0}, fam_str = {0,0,0};
211     void *name2idx = khash_str2int_init();
212     char **lines = NULL;
213     for (i=0; i<nvals; i++)
214     {
215         str.l = 0;
216         kputs(vals[i], &str);
217         char *col_ends[5], *tmp = str.s;
218         j = 0;
219         while ( *tmp && j<5 )
220         {
221             if ( isspace(*tmp) )
222             {
223                 *tmp = 0;
224                 ++tmp;
225                 while ( isspace(*tmp) ) tmp++;  // allow multiple spaces
226                 col_ends[j] = tmp-1;
227                 j++;
228                 continue;
229             }
230             tmp++;
231         }
232         if ( j!=5 ) break;
233 
234         char sex = col_ends[3][1]=='1' ? 'M' : 'F';
235         lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j);
236         if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") )   // father and mother
237         {
238             call->nfams++;
239             hts_expand(family_t, call->nfams, call->mfams, call->fams);
240             family_t *fam = &call->fams[call->nfams-1];
241             fam_str.l = 0;
242             ksprintf(&fam_str,"father=%s, mother=%s, child=%s", col_ends[1]+1,col_ends[2]+1,col_ends[0]+1);
243             fam->name = strdup(fam_str.s);
244 
245             if ( !khash_str2int_has_key(name2idx, col_ends[1]+1) )
246                 lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[1]+1, 'M', &fam->sample[FATHER]);
247             if ( !khash_str2int_has_key(name2idx, col_ends[2]+1) )
248                 lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[2]+1, 'F', &fam->sample[MOTHER]);
249 
250             khash_str2int_get(name2idx, col_ends[0]+1, &fam->sample[CHILD]);
251             khash_str2int_get(name2idx, col_ends[1]+1, &fam->sample[FATHER]);
252             khash_str2int_get(name2idx, col_ends[2]+1, &fam->sample[MOTHER]);
253         }
254     }
255     free(str.s);
256     free(fam_str.s);
257     khash_str2int_destroy_free(name2idx);
258 
259     if ( i!=nvals ) // not a ped file
260     {
261         if ( i>0 ) error("Could not parse samples, not a PED format.\n");
262         return NULL;
263     }
264     *nsmpl = nlines;
265     return lines;
266 }
267 
268 
269 /*
270  *  Reads sample names and their ploidy (optional) from a file.
271  *  Alternatively, if no such file exists, the file name is interpreted
272  *  as a comma-separated list of samples. When ploidy is not present,
273  *  the default ploidy 2 is assumed.
274  */
set_samples(args_t * args,const char * fn,int is_file)275 static void set_samples(args_t *args, const char *fn, int is_file)
276 {
277     int i, nlines;
278     char **lines = hts_readlist(fn, is_file, &nlines);
279     if ( !lines ) error("Could not read the file: %s\n", fn);
280 
281     int nsmpls;
282     char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls);
283     if ( smpls )
284     {
285         for (i=0; i<nlines; i++) free(lines[i]);
286         free(lines);
287         lines = smpls;
288         nlines = nsmpls;
289     }
290 
291     args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
292     args->sample2sex  = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
293     int dflt_sex_id = ploidy_nsex(args->ploidy) - 1;
294     for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
295 
296     int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
297     for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) old2new[i] = -1;
298 
299     int nsmpl = 0, map_needed = 0;
300     for (i=0; i<nlines; i++)
301     {
302         char *ss = lines[i];
303         while ( *ss && isspace(*ss) ) ss++;
304         if ( !*ss ) error("Could not parse: %s\n", lines[i]);
305         if ( *ss=='#' ) continue;
306         char *se = ss;
307         while ( *se && !isspace(*se) ) se++;
308         char x = *se, *xptr = se; *se = 0;
309 
310         int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
311         if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
312         if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
313 
314         ss = se+(x != '\0');
315         while ( *ss && isspace(*ss) ) ss++;
316         if ( !*ss ) ss = "2";   // default ploidy
317         se = ss;
318         while ( *se && !isspace(*se) ) se++;
319         if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }
320 
321         if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') )
322             args->sample2sex[nsmpl] = -1*(ss[0]-'0');
323         else
324             args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss);
325 
326         if ( ismpl!=nsmpl ) map_needed = 1;
327         args->samples_map[nsmpl] = ismpl;
328         old2new[ismpl] = nsmpl;
329         nsmpl++;
330     }
331 
332     for (i=0; i<args->aux.nfams; i++)
333     {
334         int j, nmiss = 0;
335         family_t *fam = &args->aux.fams[i];
336         for (j=0; j<3; j++)
337         {
338             fam->sample[i] = old2new[fam->sample[i]];
339             if ( fam->sample[i]<0 ) nmiss++;
340         }
341         assert( nmiss==0 || nmiss==3 );
342     }
343     free(old2new);
344 
345     if ( !map_needed ) { free(args->samples_map); args->samples_map = NULL; }
346 
347     args->nsamples = nsmpl;
348     args->samples = lines;
349 }
350 
init_missed_line(args_t * args)351 static void init_missed_line(args_t *args)
352 {
353     int i;
354     for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
355     {
356         args->aux.gts[i*2]   = bcf_gt_missing;
357         args->aux.gts[i*2+1] = bcf_int32_vector_end;
358     }
359     args->missed_line = bcf_init1();
360     bcf_update_genotypes(args->aux.hdr, args->missed_line, args->aux.gts, 2*bcf_hdr_nsamples(args->aux.hdr));
361     bcf_float_set_missing(args->missed_line->qual);
362 }
363 
tgt_parse(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)364 static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
365 {
366     char *ss = (char*) line;
367     while ( *ss && isspace(*ss) ) ss++;
368     if ( !*ss ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; }
369     if ( *ss=='#' ) return -1;  // skip comments
370 
371     char *se = ss;
372     while ( *se && !isspace(*se) ) se++;
373 
374     *chr_beg = ss;
375     *chr_end = se-1;
376 
377     if ( !*se ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; }
378 
379     ss = se+1;
380     *beg = strtod(ss, &se);
381     if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
382     if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
383     (*beg)--;
384     *end = *beg;
385 
386     if ( !usr ) return 0; // allele information not required
387 
388     ss = se+1;
389     tgt_als_t *als = (tgt_als_t*)payload;
390     als->used   = 0;
391     als->n      = 0;
392     als->allele = NULL;
393     while ( *ss )
394     {
395         se = ss;
396         while ( *se && *se!=',' ) se++;
397         als->n++;
398         als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele));
399         als->allele[als->n-1] = (char*)malloc(se-ss+1);
400         memcpy(als->allele[als->n-1],ss,se-ss);
401         als->allele[als->n-1][se-ss] = 0;
402         ss = se+1;
403         if ( !*se ) break;
404     }
405     return 0;
406 }
tgt_free(void * payload)407 static void tgt_free(void *payload)
408 {
409     tgt_als_t *als = (tgt_als_t*)payload;
410     int i;
411     for (i=0; i<als->n; i++) free(als->allele[i]);
412     free(als->allele);
413 }
tgt_flush_region(args_t * args,char * chr,uint32_t beg,uint32_t end)414 static void tgt_flush_region(args_t *args, char *chr, uint32_t beg, uint32_t end)
415 {
416     if ( !regidx_overlap(args->tgt_idx, chr,beg,end,args->tgt_itr_tmp) ) return;
417     while ( regitr_overlap(args->tgt_itr_tmp) )
418     {
419         if ( args->tgt_itr_tmp->beg < beg ) continue;
420 
421         tgt_als_t *tgt_als = &regitr_payload(args->tgt_itr_tmp,tgt_als_t);
422         if ( tgt_als->used ) continue;
423 
424         args->missed_line->rid  = bcf_hdr_name2id(args->aux.hdr,chr);
425         args->missed_line->pos  = args->tgt_itr_tmp->beg;
426         bcf_unpack(args->missed_line,BCF_UN_ALL);
427         bcf_update_alleles(args->aux.hdr, args->missed_line, (const char**)tgt_als->allele, tgt_als->n);
428         tgt_als->used = 1;
429         if ( bcf_write1(args->out_fh, args->aux.hdr, args->missed_line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
430     }
431 }
tgt_flush(args_t * args,bcf1_t * rec)432 static void tgt_flush(args_t *args, bcf1_t *rec)
433 {
434     if ( rec )
435     {
436         char *chr = (char*)bcf_seqname(args->aux.hdr,rec);
437 
438         if ( !args->tgt_itr_prev )                  // first record
439             tgt_flush_region(args,chr,0,rec->pos-1);
440 
441         else if ( strcmp(chr,args->tgt_itr_prev->seq) )  // first record on a new chromosome
442         {
443             tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,REGIDX_MAX);
444             tgt_flush_region(args,chr,0,rec->pos-1);
445         }
446         else                                        // another record on the same chromosome
447             tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,rec->pos-1);
448     }
449     else
450     {
451         // flush everything
452         if ( args->tgt_itr_prev )
453             tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,REGIDX_MAX);
454 
455         int i, nchr = 0;
456         char **chr = regidx_seq_names(args->tgt_idx, &nchr);
457         for (i=0; i<nchr; i++)
458             tgt_flush_region(args,chr[i],0,REGIDX_MAX);
459     }
460 }
is_indel(int nals,char ** als)461 inline static int is_indel(int nals, char **als)
462 {
463     // This is mpileup output, we can make some assumption:
464     //  - no MNPs
465     //  - "<*>" is not present at indels sites and there are no other symbolic alleles than <*>
466     if ( als[1][0]=='<' ) return 0;
467 
468     int i;
469     for (i=0; i<nals; i++)
470     {
471         if ( als[i][0]=='<' ) continue;
472         if ( als[i][1] ) return 1;
473     }
474     return 0;
475 }
next_line(args_t * args)476 bcf1_t *next_line(args_t *args)
477 {
478     bcf1_t *rec = NULL;
479     if ( !args->vcfbuf )
480     {
481         while ( bcf_sr_next_line(args->aux.srs) )
482         {
483             rec = args->aux.srs->readers[0].buffer[0];
484             if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n");
485             if ( args->tgt_idx )
486             {
487                 if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue;
488 
489                 // For backward compatibility: require the exact position, not an interval overlap
490                 int pos_match = 0;
491                 while ( regitr_overlap(args->tgt_itr) )
492                 {
493                     if ( args->tgt_itr->beg != rec->pos ) continue;
494                     pos_match = 1;
495                     break;
496                 }
497                 if ( !pos_match ) continue;
498             }
499             if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map);
500             bcf_unpack(rec, BCF_UN_STR);
501             return rec;
502         }
503         return NULL;
504     }
505 
506     // If we are here,-C alleles was given and vcfbuf and tgt_idx are set
507 
508     // Fill the buffer with duplicate lines
509     int vcfbuf_full = 1;
510     int nbuf = vcfbuf_nsites(args->vcfbuf);
511     bcf1_t *rec0 = NULL, *recN = NULL;
512     if ( nbuf==0 ) vcfbuf_full = 0;
513     else if ( nbuf==1 )
514     {
515         vcfbuf_full = 0;
516         rec0 = vcfbuf_peek(args->vcfbuf, 0);
517     }
518     else
519     {
520         rec0 = vcfbuf_peek(args->vcfbuf, 0);
521         recN = vcfbuf_peek(args->vcfbuf, nbuf-1);
522         if ( rec0->rid == recN->rid && rec0->pos == recN->pos ) vcfbuf_full = 0;
523     }
524     if ( !vcfbuf_full )
525     {
526         while ( bcf_sr_next_line(args->aux.srs) )
527         {
528             rec = args->aux.srs->readers[0].buffer[0];
529             if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n");
530             if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue;
531             // as above: require the exact position, not an interval overlap
532             int exact_match = 0;
533             while ( regitr_overlap(args->tgt_itr) )
534             {
535                 if ( args->tgt_itr->beg != rec->pos ) continue;
536                 exact_match = 1;
537                 break;
538             }
539             if ( !exact_match ) continue;
540 
541             if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map);
542             bcf_unpack(rec, BCF_UN_STR);
543             if ( !rec0 ) rec0 = rec;
544             recN = rec;
545             args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec);
546             if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break;
547         }
548     }
549 
550     nbuf = vcfbuf_nsites(args->vcfbuf);
551     int n, i,j;
552     for (n=nbuf; n>1; n--)
553     {
554         recN = vcfbuf_peek(args->vcfbuf, n-1);
555         if ( rec0->rid==recN->rid && rec0->pos==recN->pos ) break;
556     }
557     if ( n==0 )
558     {
559         assert( !nbuf );
560         return NULL;
561     }
562 
563     // Find the VCF and tab record with the best matching combination of alleles, prioritize
564     // records of the same type (snp vs indel)
565     rec_tgt_t rec_tgt;
566     memset(&rec_tgt,0,sizeof(rec_tgt));
567     regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr);
568     regitr_t *tmp_itr = regitr_init(args->tgt_idx);
569     regitr_copy(tmp_itr, args->tgt_itr);
570     for (i=0; i<n; i++)
571     {
572         rec = vcfbuf_peek(args->vcfbuf, i);
573         int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1;
574         while ( regitr_overlap(tmp_itr) )
575         {
576             if ( tmp_itr->beg != rec->pos ) continue;
577             tgt_als_t *als = &regitr_payload(tmp_itr,tgt_als_t);
578             if ( als->used ) continue;
579             int nmatch_als = 0;
580             vcmp_t *vcmp = vcmp_init();
581             int ret = vcmp_set_ref(vcmp, rec->d.allele[0], als->allele[0]);
582             if ( ret==0 )
583             {
584                 nmatch_als++;
585                 if ( rec->n_allele > 1 && als->n > 1 )
586                 {
587                     for (j=1; j<als->n; j++)
588                     {
589                         if ( vcmp_find_allele(vcmp, rec->d.allele+1, rec->n_allele-1, als->allele[j])>=0 ) nmatch_als++;
590                     }
591                 }
592             }
593             int als_indel = is_indel(als->n, als->allele) ? 1 : -1;
594             nmatch_als *= rec_indel*als_indel;
595             if ( nmatch_als > rec_tgt.nmatch_als || !rec_tgt.als )
596             {
597                 rec_tgt.nmatch_als = nmatch_als;
598                 rec_tgt.als  = als;
599                 rec_tgt.ibuf = i;
600             }
601             vcmp_destroy(vcmp);
602         }
603     }
604     regitr_destroy(tmp_itr);
605 
606     args->aux.tgt_als = rec_tgt.als;
607     if ( rec_tgt.als ) rec_tgt.als->used = 1;
608 
609     rec = vcfbuf_remove(args->vcfbuf, rec_tgt.ibuf);
610     return rec;
611 }
612 
init_data(args_t * args)613 static void init_data(args_t *args)
614 {
615     args->aux.srs = bcf_sr_init();
616 
617     // Open files for input and output, initialize structures
618     if ( args->targets )
619     {
620         args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : (regidx_free_f) NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
621         args->tgt_itr = regitr_init(args->tgt_idx);
622         args->tgt_itr_tmp = regitr_init(args->tgt_idx);
623     }
624 
625     if ( args->regions )
626     {
627         bcf_sr_set_opt(args->aux.srs,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
628         if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
629             error("Failed to read the regions: %s\n", args->regions);
630     }
631 
632     if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) )
633         error("Failed to read from %s: %s\n", !strcmp("-",args->bcf_fname)?"standard input":args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
634     args->aux.hdr = bcf_sr_get_header(args->aux.srs,0);
635 
636     int i;
637     if ( args->samples_fname )
638     {
639         set_samples(args, args->samples_fname, args->samples_is_file);
640         if ( args->aux.flag&CALL_CONSTR_TRIO )
641         {
642             if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
643             fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
644         }
645     }
646     if ( args->ploidy  )
647     {
648         args->nsex = ploidy_nsex(args->ploidy);
649         args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
650         args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
651         if ( !args->nsamples )
652         {
653             args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
654             args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
655             for (i=0; i<args->nsamples; i++) args->sample2sex[i] = args->nsex - 1;
656         }
657     }
658     if ( args->nsamples )
659     {
660         args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
661         for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy);
662         for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy);
663         for (i=0; i<args->nsamples; i++)
664             if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1;
665     }
666 
667     if ( args->gvcf )
668     {
669         int id = bcf_hdr_id2int(args->aux.hdr,BCF_DT_ID,"DP");
670         if ( id<0 || !bcf_hdr_idinfo_exists(args->aux.hdr,BCF_HL_FMT,id) ) error("--gvcf output mode requires FORMAT/DP tag, which is not present in the input header\n");
671         gvcf_update_header(args->gvcf, args->aux.hdr);
672     }
673 
674     if ( args->samples_map )
675     {
676         args->aux.hdr = bcf_hdr_subset(bcf_sr_get_header(args->aux.srs,0), args->nsamples, args->samples, args->samples_map);
677         if ( !args->aux.hdr ) error("Error occurred while subsetting samples\n");
678         for (i=0; i<args->nsamples; i++)
679             if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
680         if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
681     }
682     else
683     {
684         args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
685         if ( args->samples )
686         {
687             for (i=0; i<args->nsamples; i++)
688                 if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
689                     error("No such sample: %s\n", args->samples[i]);
690         }
691     }
692 
693     if ( args->aux.flag & CALL_CONSTR_ALLELES )
694         args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
695 
696     char wmode[8];
697     set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
698     args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
699     if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno));
700     if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
701 
702     if ( args->flag & CF_QCALL )
703         return;
704 
705     if ( args->flag & CF_MCALL )
706         mcall_init(&args->aux);
707 
708     if ( args->flag & CF_CCALL )
709         ccall_init(&args->aux);
710 
711     bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
712     bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");
713 
714     if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
715     if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
716 
717     if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
718 }
719 
destroy_data(args_t * args)720 static void destroy_data(args_t *args)
721 {
722     if ( args->vcfbuf ) vcfbuf_destroy(args->vcfbuf);
723     if ( args->tgt_idx )
724     {
725         regidx_destroy(args->tgt_idx);
726         regitr_destroy(args->tgt_itr);
727         regitr_destroy(args->tgt_itr_tmp);
728         if ( args->tgt_itr_prev ) regitr_destroy(args->tgt_itr_prev);
729     }
730     if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux);
731     else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux);
732     else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux);
733     int i;
734     if ( args->samples )
735     {
736         for (i=0; i<args->nsamples; i++) free(args->samples[i]);
737     }
738     if ( args->aux.fams )
739     {
740         for (i=0; i<args->aux.nfams; i++) free(args->aux.fams[i].name);
741         free(args->aux.fams);
742     }
743     if ( args->missed_line ) bcf_destroy(args->missed_line);
744     ploidy_destroy(args->ploidy);
745     free(args->sex2ploidy);
746     free(args->sex2ploidy_prev);
747     free(args->samples);
748     free(args->samples_map);
749     free(args->sample2sex);
750     free(args->aux.ploidy);
751     free(args->str.s);
752     if ( args->gvcf ) gvcf_destroy(args->gvcf);
753     bcf_hdr_destroy(args->aux.hdr);
754     if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
755     bcf_sr_destroy(args->aux.srs);
756 }
757 
parse_novel_rate(args_t * args,const char * str)758 void parse_novel_rate(args_t *args, const char *str)
759 {
760     if ( sscanf(str,"%le,%le,%le",&args->aux.trio_Pm_SNPs,&args->aux.trio_Pm_del,&args->aux.trio_Pm_ins)==3 )  // explicit for all
761     {
762         args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
763         args->aux.trio_Pm_del  = 1 - args->aux.trio_Pm_del;
764         args->aux.trio_Pm_ins  = 1 - args->aux.trio_Pm_ins;
765     }
766     else if ( sscanf(str,"%le,%le",&args->aux.trio_Pm_SNPs,&args->aux.trio_Pm_del)==2 )   // dynamic for indels
767     {
768         args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
769         args->aux.trio_Pm_ins  = -1;    // negative value for dynamic calculation
770     }
771     else if ( sscanf(str,"%le",&args->aux.trio_Pm_SNPs)==1 )  // same for all
772     {
773         args->aux.trio_Pm_SNPs = 1 - args->aux.trio_Pm_SNPs;
774         args->aux.trio_Pm_del  = -1;
775         args->aux.trio_Pm_ins  = -1;
776     }
777     else error("Could not parse --novel-rate %s\n", str);
778 }
779 
list_annotations(FILE * fp)780 static void list_annotations(FILE *fp)
781 {
782     fprintf(fp,
783         "\n"
784         "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n"
785         "  INFO/PV4   .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n"
786         "\n"
787         "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n"
788         "  FORMAT/GQ  .. Phred-scaled genotype quality (Number=1,Type=Integer)\n"
789         "  FORMAT/GP  .. Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n"
790         "\n");
791 }
792 
parse_output_tags(const char * str)793 static int parse_output_tags(const char *str)
794 {
795     int flag = 0;
796     const char *ss = str;
797     while ( *ss )
798     {
799         const char *se = ss;
800         while ( *se && *se!=',' ) se++;
801         if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss)  ) flag |= CALL_FMT_GQ;
802         else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP;
803         else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4;
804         else
805         {
806             fprintf(stderr,"Could not parse \"%s\"\n", str);
807             exit(1);
808         }
809         if ( !*se ) break;
810         ss = se + 1;
811     }
812     return flag;
813 }
814 
set_ploidy(args_t * args,bcf1_t * rec)815 static void set_ploidy(args_t *args, bcf1_t *rec)
816 {
817     ploidy_query(args->ploidy,(char*)bcf_seqname(args->aux.hdr,rec),rec->pos,args->sex2ploidy,NULL,NULL);
818 
819     int i;
820     for (i=0; i<args->nsex; i++)
821         if ( args->sex2ploidy[i]!=args->sex2ploidy_prev[i] ) break;
822 
823     if ( i==args->nsex ) return;    // ploidy same as previously
824 
825     for (i=0; i<args->nsamples; i++)
826     {
827         if ( args->sample2sex[i]<0 )
828             args->aux.ploidy[i] = -1*args->sample2sex[i];
829         else
830             args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
831     }
832     int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
833 }
834 
init_ploidy(char * alias)835 ploidy_t *init_ploidy(char *alias)
836 {
837     const ploidy_predef_t *pld = ploidy_predefs;
838 
839     int detailed = 0, len = strlen(alias);
840     if ( alias[len-1]=='?' ) { detailed = 1; alias[len-1] = 0; }
841 
842     while ( pld->alias && strcasecmp(alias,pld->alias) ) pld++;
843 
844     if ( !pld->alias )
845     {
846         fprintf(stderr,"\nPRE-DEFINED PLOIDY FILES\n\n");
847         fprintf(stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n");
848         fprintf(stderr," * Coordinates are 1-based inclusive.\n");
849         fprintf(stderr," * A '*' means any value not otherwise defined.\n\n");
850         pld = ploidy_predefs;
851         while ( pld->alias )
852         {
853             fprintf(stderr,"%s\n   .. %s\n\n", pld->alias,pld->about);
854             if ( detailed )
855                 fprintf(stderr,"%s\n", pld->ploidy);
856             pld++;
857         }
858         fprintf(stderr,"Run as --ploidy <alias> (e.g. --ploidy GRCh37).\n");
859         fprintf(stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n");
860         fprintf(stderr,"\n");
861         exit(-1);
862     }
863     else if ( detailed )
864     {
865         fprintf(stderr,"%s", pld->ploidy);
866         exit(-1);
867     }
868     return ploidy_init_string(pld->ploidy,2);
869 }
870 
usage(args_t * args)871 static void usage(args_t *args)
872 {
873     fprintf(stderr, "\n");
874     fprintf(stderr, "About:   SNP/indel variant calling from VCF/BCF. To be used in conjunction with bcftools mpileup.\n");
875     fprintf(stderr, "         This command replaces the former \"bcftools view\" caller. Some of the original\n");
876     fprintf(stderr, "         functionality has been temporarily lost in the process of transition to htslib,\n");
877     fprintf(stderr, "         but will be added back on popular demand. The original calling model can be\n");
878     fprintf(stderr, "         invoked with the -c option.\n");
879     fprintf(stderr, "Usage:   bcftools call [options] <in.vcf.gz>\n");
880     fprintf(stderr, "\n");
881     fprintf(stderr, "File format options:\n");
882     fprintf(stderr, "       --no-version                Do not append version and command line to the header\n");
883     fprintf(stderr, "   -o, --output FILE               Write output to a file [standard output]\n");
884     fprintf(stderr, "   -O, --output-type b|u|z|v       Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
885     fprintf(stderr, "   -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
886     fprintf(stderr, "       --ploidy ASSEMBLY[?]        Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
887     fprintf(stderr, "       --ploidy-file FILE          Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
888     fprintf(stderr, "   -r, --regions REGION            Restrict to comma-separated list of regions\n");
889     fprintf(stderr, "   -R, --regions-file FILE         Restrict to regions listed in a file\n");
890     fprintf(stderr, "       --regions-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
891     fprintf(stderr, "   -s, --samples LIST              List of samples to include [all samples]\n");
892     fprintf(stderr, "   -S, --samples-file FILE         PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
893     fprintf(stderr, "   -t, --targets REGION            Similar to -r but streams rather than index-jumps\n");
894     fprintf(stderr, "   -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n");
895     fprintf(stderr, "       --threads INT               Use multithreading with INT worker threads [0]\n");
896     fprintf(stderr, "\n");
897     fprintf(stderr, "Input/output options:\n");
898     fprintf(stderr, "   -A, --keep-alts                 Keep all possible alternate alleles at variant sites\n");
899     fprintf(stderr, "   -a, --annotate LIST             Optional tags to output (lowercase allowed); '?' to list available tags\n");
900     fprintf(stderr, "   -F, --prior-freqs AN,AC         Use prior allele frequencies, determined from these pre-filled tags\n");
901     fprintf(stderr, "   -G, --group-samples FILE|-      Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
902     fprintf(stderr, "                                   This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
903     fprintf(stderr, "       --group-samples-tag TAG     The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
904     fprintf(stderr, "   -g, --gvcf INT,[...]            Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
905     fprintf(stderr, "   -i, --insert-missed             Output also sites missed by mpileup but present in -T\n");
906     fprintf(stderr, "   -M, --keep-masked-ref           Keep sites with masked reference allele (REF=N)\n");
907     fprintf(stderr, "   -V, --skip-variants TYPE        Skip indels/snps\n");
908     fprintf(stderr, "   -v, --variants-only             Output variant sites only\n");
909     fprintf(stderr, "\n");
910     fprintf(stderr, "Consensus/variant calling options:\n");
911     fprintf(stderr, "   -c, --consensus-caller          The original calling method (conflicts with -m)\n");
912     fprintf(stderr, "   -C, --constrain STR             One of: alleles, trio (see manual)\n");
913     fprintf(stderr, "   -m, --multiallelic-caller       Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
914     fprintf(stderr, "   -n, --novel-rate FLOAT,[...]    Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
915     fprintf(stderr, "   -p, --pval-threshold FLOAT      Variant if P(ref|D)<FLOAT with -c [0.5]\n");
916     fprintf(stderr, "   -P, --prior FLOAT               Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
917     fprintf(stderr, "\n");
918     fprintf(stderr, "Example:\n");
919     fprintf(stderr, "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n");
920     fprintf(stderr, "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
921 
922     // todo (and more)
923     // fprintf(stderr, "\nContrast calling and association test options:\n");
924     // fprintf(stderr, "       -1 INT    number of group-1 samples [0]\n");
925     // fprintf(stderr, "       -C FLOAT  posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
926     // fprintf(stderr, "       -U INT    number of permutations for association testing (effective with -1) [0]\n");
927     // fprintf(stderr, "       -X FLOAT  only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
928     fprintf(stderr, "\n");
929     exit(-1);
930 }
931 
main_vcfcall(int argc,char * argv[])932 int main_vcfcall(int argc, char *argv[])
933 {
934     char *ploidy_fname = NULL, *ploidy = NULL;
935     args_t args;
936     memset(&args, 0, sizeof(args_t));
937     args.argc = argc; args.argv = argv;
938     args.aux.prior_type = -1;
939     args.aux.indel_frac = -1;
940     args.aux.theta      = 1.1e-3;
941     args.aux.pref       = 0.5;
942     args.aux.min_perm_p = 0.01;
943     args.aux.min_lrt    = 1;
944     args.flag           = CF_ACGT_ONLY;
945     args.output_fname   = "-";
946     args.output_type    = FT_VCF;
947     args.n_threads = 0;
948     args.record_cmd_line = 1;
949     args.aux.trio_Pm_SNPs = 1 - 1e-8;
950     args.aux.trio_Pm_ins  = args.aux.trio_Pm_del  = 1 - 1e-9;
951     args.regions_overlap = 1;
952     args.clevel = -1;
953 
954     int c;
955     static struct option loptions[] =
956     {
957         {"help",no_argument,NULL,'h'},
958         {"format-fields",required_argument,NULL,'f'},
959         {"annotate",required_argument,NULL,'a'},
960         {"prior-freqs",required_argument,NULL,'F'},
961         {"gvcf",required_argument,NULL,'g'},
962         {"group-samples",required_argument,NULL,'G'},
963         {"group-samples-tag",required_argument,NULL,3},
964         {"output",required_argument,NULL,'o'},
965         {"output-type",required_argument,NULL,'O'},
966         {"regions",required_argument,NULL,'r'},
967         {"regions-file",required_argument,NULL,'R'},
968         {"regions-overlap",required_argument,NULL,4},
969         {"samples",required_argument,NULL,'s'},
970         {"samples-file",required_argument,NULL,'S'},
971         {"targets",required_argument,NULL,'t'},
972         {"targets-file",required_argument,NULL,'T'},
973         {"threads",required_argument,NULL,9},
974         {"keep-alts",no_argument,NULL,'A'},
975         {"insert-missed",no_argument,NULL,'i'},
976         {"skip-Ns",no_argument,NULL,'N'},            // now the new default
977         {"keep-masked-refs",no_argument,NULL,'M'},
978         {"skip-variants",required_argument,NULL,'V'},
979         {"variants-only",no_argument,NULL,'v'},
980         {"consensus-caller",no_argument,NULL,'c'},
981         {"constrain",required_argument,NULL,'C'},
982         {"multiallelic-caller",no_argument,NULL,'m'},
983         {"pval-threshold",required_argument,NULL,'p'},
984         {"prior",required_argument,NULL,'P'},
985         {"novel-rate",required_argument,NULL,'n'},
986         {"ploidy",required_argument,NULL,1},
987         {"ploidy-file",required_argument,NULL,2},
988         {"chromosome-X",no_argument,NULL,'X'},
989         {"chromosome-Y",no_argument,NULL,'Y'},
990         {"no-version",no_argument,NULL,8},
991         {NULL,0,NULL,0}
992     };
993 
994     char *tmp = NULL;
995     while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
996     {
997         switch (c)
998         {
999             case  2 : ploidy_fname = optarg; break;
1000             case  1 : ploidy = optarg; break;
1001             case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
1002             case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
1003             case 'G': args.aux.sample_groups = optarg; break;
1004             case  3 : args.aux.sample_groups_tag = optarg; break;
1005             case 'f': fprintf(stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n");
1006             case 'a':
1007                       if (optarg[0]=='?') { list_annotations(stderr); return 1; }
1008                       args.aux.output_tags |= parse_output_tags(optarg);
1009                       break;
1010             case 'M': args.flag &= ~CF_ACGT_ONLY; break;     // keep sites where REF is N
1011             case 'N': args.flag |= CF_ACGT_ONLY; break;      // omit sites where first base in REF is N (the new default)
1012             case 'A': args.aux.flag |= CALL_KEEPALT; break;
1013             case 'c': args.flag |= CF_CCALL; break;          // the original EM based calling method
1014             case 'i': args.flag |= CF_INS_MISSED; break;
1015             case 'v': args.aux.flag |= CALL_VARONLY; break;
1016             case 'F':
1017                 args.aux.prior_AN = optarg;
1018                 args.aux.prior_AC = strchr(optarg,',');
1019                 if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg);
1020                 *args.aux.prior_AC = 0;
1021                 args.aux.prior_AC++;
1022                 break;
1023             case 'g':
1024                 args.gvcf = gvcf_init(optarg);
1025                 if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
1026                 break;
1027             case 'o': args.output_fname = optarg; break;
1028             case 'O':
1029                       switch (optarg[0]) {
1030                           case 'b': args.output_type = FT_BCF_GZ; break;
1031                           case 'u': args.output_type = FT_BCF; break;
1032                           case 'z': args.output_type = FT_VCF_GZ; break;
1033                           case 'v': args.output_type = FT_VCF; break;
1034                           default:
1035                           {
1036                               args.clevel = strtol(optarg,&tmp,10);
1037                               if ( *tmp || args.clevel<0 || args.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
1038                           }
1039                       }
1040                       if ( optarg[1] )
1041                       {
1042                           args.clevel = strtol(optarg+1,&tmp,10);
1043                           if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
1044                       }
1045                       break;
1046             case 'C':
1047                       if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES;
1048                       else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO;
1049                       else error("Unknown argument to -C: \"%s\"\n", optarg);
1050                       break;
1051             case 'V':
1052                       if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY;
1053                       else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL;
1054                       else error("Unknown skip category \"%s\" (-S argument must be \"snps\" or \"indels\")\n", optarg);
1055                       break;
1056             case 'm': args.flag |= CF_MCALL; break;         // multiallelic calling method
1057             case 'p':
1058                 args.aux.pref = strtod(optarg,&tmp);
1059                 if ( *tmp ) error("Could not parse: --pval-threshold %s\n", optarg);
1060                 break;
1061             case 'P': args.aux.theta = strtod(optarg,&tmp);
1062                       if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg);
1063                       break;
1064             case 'n': parse_novel_rate(&args,optarg); break;
1065             case 'r': args.regions = optarg; break;
1066             case 'R': args.regions = optarg; args.regions_is_file = 1; break;
1067             case 't': args.targets = optarg; break;
1068             case 'T': args.targets = optarg; args.targets_is_file = 1; break;
1069             case 's': args.samples_fname = optarg; break;
1070             case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
1071             case  9 : args.n_threads = strtol(optarg, 0, 0); break;
1072             case  8 : args.record_cmd_line = 0; break;
1073             case  4 :
1074                 if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0;
1075                 else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1;
1076                 else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2;
1077                 else error("Could not parse: --regions-overlap %s\n",optarg);
1078                 break;
1079             default: usage(&args);
1080         }
1081     }
1082     // Sanity check options and initialize
1083     if ( ploidy_fname ) args.ploidy = ploidy_init(ploidy_fname, 2);
1084     else if ( ploidy ) args.ploidy = init_ploidy(ploidy);
1085 
1086     if ( optind>=argc )
1087     {
1088         if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-";  // reading from stdin
1089         else usage(&args);
1090     }
1091     else args.bcf_fname = argv[optind++];
1092 
1093     if ( !ploidy_fname && !ploidy )
1094     {
1095         if ( !args.samples_is_file ) fprintf(stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n");
1096         args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2);
1097     }
1098 
1099     if ( !args.ploidy ) error("Could not initialize ploidy\n");
1100     if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n");
1101     if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n");
1102     if ( (args.flag & CF_CCALL ? 1: 0) && args.gvcf ) error("gvcf -g option not functional with -c calling mode yet\n");
1103     if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n");    // not sure about this, please fix
1104     if ( args.aux.flag & CALL_CONSTR_ALLELES )
1105     {
1106         if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n");
1107         if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n");
1108     }
1109     if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n");
1110     if ( args.aux.flag&CALL_VARONLY && args.gvcf ) error("The two options cannot be combined: --variants-only and --gvcf\n");
1111     if ( args.aux.sample_groups && !(args.flag & CF_MCALL) ) error("The -G feature is supported only with the -m calling mode\n");
1112     init_data(&args);
1113 
1114     bcf1_t *bcf_rec;
1115     while ( (bcf_rec = next_line(&args)) )
1116     {
1117         // Skip duplicate positions with all matching `-C alleles -T` used up
1118         if ( args.aux.flag&CALL_CONSTR_ALLELES && !args.aux.tgt_als ) continue;
1119 
1120         // Skip unwanted sites
1121         int i, is_indel = bcf_is_snp(bcf_rec) ? 0 : 1;
1122         if ( (args.flag & CF_INDEL_ONLY) && !is_indel ) continue;
1123         if ( (args.flag & CF_NO_INDEL) && is_indel ) continue;
1124         if ( (args.flag & CF_ACGT_ONLY) && (bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue;   // REF[0] is 'N'
1125 
1126         // Which allele is symbolic? All SNPs should have it, but not indels
1127         args.aux.unseen = 0;
1128         for (i=1; i<bcf_rec->n_allele; i++)
1129         {
1130             if ( bcf_rec->d.allele[i][0]=='X' ) { args.aux.unseen = i; break; }  // old X
1131             if ( bcf_rec->d.allele[i][0]=='<' )
1132             {
1133                 if ( bcf_rec->d.allele[i][1]=='X' && bcf_rec->d.allele[i][2]=='>' ) { args.aux.unseen = i; break; } // old <X>
1134                 if ( bcf_rec->d.allele[i][1]=='*' && bcf_rec->d.allele[i][2]=='>' ) { args.aux.unseen = i; break; } // new <*>
1135             }
1136         }
1137         int is_ref = (bcf_rec->n_allele==1 || (bcf_rec->n_allele==2 && args.aux.unseen>0)) ? 1 : 0;
1138 
1139         if ( is_ref && args.aux.flag&CALL_VARONLY )
1140             continue;
1141 
1142         bcf_unpack(bcf_rec, BCF_UN_ALL);
1143         if ( args.nsex ) set_ploidy(&args, bcf_rec);
1144 
1145         // Various output modes: QCall output (todo)
1146         if ( args.flag & CF_QCALL )
1147         {
1148             qcall(&args.aux, bcf_rec);
1149             continue;
1150         }
1151 
1152         if ( args.flag & CF_INS_MISSED )
1153         {
1154             tgt_flush(&args,bcf_rec);
1155             if ( !args.tgt_itr_prev ) args.tgt_itr_prev = regitr_init(args.tgt_idx);
1156             regitr_copy(args.tgt_itr_prev, args.tgt_itr);
1157         }
1158 
1159         // Calling modes which output VCFs
1160         int ret;
1161         if ( args.flag & CF_MCALL )
1162             ret = mcall(&args.aux, bcf_rec);
1163         else
1164             ret = ccall(&args.aux, bcf_rec);
1165         if ( ret==-1 ) error("Something is wrong\n");
1166         else if ( ret==-2 ) continue;   // skip the site
1167 
1168         // Normal output
1169         if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue;     // not a variant
1170         if ( args.gvcf )
1171             bcf_rec = gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, bcf_rec, ret==1?1:0);
1172         if ( bcf_rec && bcf_write1(args.out_fh, args.aux.hdr, bcf_rec)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args.output_fname);
1173     }
1174     if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0);
1175     if ( args.flag & CF_INS_MISSED ) tgt_flush(&args,NULL);
1176     destroy_data(&args);
1177     return 0;
1178 }
1179 
1180