1 #include "bcftools.pysam.h"
2 
3 /* The MIT License
4 
5    Copyright (c) 2014-2021 Genome Research Ltd.
6 
7    Author: Petr Danecek <pd3@sanger.ac.uk>
8 
9    Permission is hereby granted, free of charge, to any person obtaining a copy
10    of this software and associated documentation files (the "Software"), to deal
11    in the Software without restriction, including without limitation the rights
12    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13    copies of the Software, and to permit persons to whom the Software is
14    furnished to do so, subject to the following conditions:
15 
16    The above copyright notice and this permission notice shall be included in
17    all copies or substantial portions of the Software.
18 
19    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25    THE SOFTWARE.
26 
27  */
28 
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <assert.h>
34 #include <errno.h>
35 #include <getopt.h>
36 #include <unistd.h>
37 #include <ctype.h>
38 #include <htslib/vcf.h>
39 #include <htslib/kstring.h>
40 #include <htslib/synced_bcf_reader.h>
41 #include <htslib/kseq.h>
42 #include <htslib/bgzf.h>
43 #include "regidx.h"
44 #include "bcftools.h"
45 #include "rbuf.h"
46 #include "filter.h"
47 
48 // Logic of the filters: include or exclude sites which match the filters?
49 #define FLT_INCLUDE 1
50 #define FLT_EXCLUDE 2
51 
52 #define PICK_REF   1
53 #define PICK_ALT   2
54 #define PICK_LONG  4
55 #define PICK_SHORT 8
56 #define PICK_IUPAC 16
57 
58 #define TO_UPPER 0
59 #define TO_LOWER 1
60 
61 typedef struct
62 {
63     int num;                // number of ungapped blocks in this chain
64     int *block_lengths;     // length of the ungapped blocks in this chain
65     int *ref_gaps;          // length of the gaps on the reference sequence between blocks
66     int *alt_gaps;          // length of the gaps on the alternative sequence between blocks
67     int ori_pos;
68     int ref_last_block_ori; // start position on the reference sequence of the following ungapped block (0-based)
69     int alt_last_block_ori; // start position on the alternative sequence of the following ungapped block (0-based)
70 }
71 chain_t;
72 
73 #define MASK_LC 1
74 #define MASK_UC 2
75 #define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 1 : 0)
76 typedef struct
77 {
78     char *fname, with;
79     regidx_t *idx;
80     regitr_t *itr;
81 }
82 mask_t;
83 
84 typedef struct
85 {
86     kstring_t fa_buf;   // buffered reference sequence
87     int fa_ori_pos;     // start position of the fa_buffer (wrt original sequence)
88     int fa_frz_pos;     // protected position to avoid conflicting variants (last pos for SNPs/ins)
89     int fa_mod_off;     // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
90     int fa_frz_mod;     // the fa_buf offset of the protected fa_frz_pos position, includes the modified sequence
91     int fa_end_pos;     // region's end position in the original sequence
92     int fa_length;      // region's length in the original sequence (in case end_pos not provided in the FASTA header)
93     int fa_case;        // output upper case or lower case: TO_UPPER|TO_LOWER
94     int fa_src_pos;     // last genomic coordinate read from the input fasta (0-based)
95     char prev_base;     // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
96     int prev_base_pos;  // the position of prev_base
97     int prev_is_insert;
98 
99     rbuf_t vcf_rbuf;
100     bcf1_t **vcf_buf;
101     int nvcf_buf, rid;
102     char *chr, *chr_prefix;
103 
104     mask_t *mask;
105     int nmask;
106 
107     int chain_id;       // chain_id, to provide a unique ID to each chain in the chain output
108     chain_t *chain;     // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
109                         // Note that the chain is re-initialised for each chromosome/seq_region
110 
111     filter_t *filter;
112     char *filter_str;
113     int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
114 
115     bcf_srs_t *files;
116     bcf_hdr_t *hdr;
117     FILE *fp_out;
118     FILE *fp_chain;
119     char **argv;
120     int argc, output_iupac, haplotype, allele, isample, napplied;
121     uint8_t *iupac_bitmask;
122     int miupac_bitmask;
123     char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
124     char mark_del, mark_ins, mark_snv;
125 }
126 args_t;
127 
init_chain(chain_t * chain,int ref_ori_pos)128 static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
129 {
130 //     fprintf(bcftools_stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
131     chain = (chain_t*) calloc(1,sizeof(chain_t));
132     chain->num = 0;
133     chain->block_lengths = NULL;
134     chain->ref_gaps = NULL;
135     chain->alt_gaps = NULL;
136     chain->ori_pos = ref_ori_pos;
137     chain->ref_last_block_ori = ref_ori_pos;
138     chain->alt_last_block_ori = ref_ori_pos;
139     return chain;
140 }
141 
destroy_chain(args_t * args)142 static void destroy_chain(args_t *args)
143 {
144     chain_t *chain = args->chain;
145     free(chain->ref_gaps);
146     free(chain->alt_gaps);
147     free(chain->block_lengths);
148     free(chain);
149     chain = NULL;
150     free(args->chr);
151     args->chr = NULL;
152 }
153 
print_chain(args_t * args)154 static void print_chain(args_t *args)
155 {
156     /*
157         Example chain format (see: https://genome.ucsc.edu/goldenPath/help/chain.html):
158         chain 1 500 + 480 500 1 501 + 480 501 1
159         12 3 1
160         1 0 3
161         484
162 
163         chain line is:
164         - chain
165         - score (sum of the length of ungapped block in this case)
166         - ref_seqname (from the fasta header, parsed by htslib)
167         - ref_seqlength (from the fasta header)
168         - ref_strand (+ or -; always + for bcf-consensus)
169         - ref_start (as defined in the fasta header)
170         - ref_end (as defined in the fasta header)
171         - alt_seqname (same as ref_seqname as bcf-consensus only considers SNPs and indels)
172         - alt_seqlength (adjusted to match the length of the alt sequence)
173         - alt_strand (+ or -; always + for bcf-consensus)
174         - alt_start (same as ref_start, as no edits are recorded/applied before that position)
175         - alt_end (adjusted to match the length of the alt sequence)
176         - chain_num (just an auto-increment id)
177 
178         the other (sorted) lines are:
179         - length of the ungapped alignment block
180         - gap on the ref sequence between this and the next block (all but the last line)
181         - gap on the alt sequence between this and the next block (all but the last line)
182     */
183     chain_t *chain = args->chain;
184     int n = chain->num;
185     int ref_end_pos = args->fa_length + chain->ori_pos;
186     int last_block_size = ref_end_pos - chain->ref_last_block_ori;
187     int alt_end_pos = chain->alt_last_block_ori + last_block_size;
188     int score = 0;
189     for (n=0; n<chain->num; n++) {
190         score += chain->block_lengths[n];
191     }
192     score += last_block_size;
193     fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
194     for (n=0; n<chain->num; n++) {
195         fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
196     }
197     fprintf(args->fp_chain, "%d\n\n", last_block_size);
198 }
199 
push_chain_gap(chain_t * chain,int ref_start,int ref_len,int alt_start,int alt_len)200 static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
201 {
202 //     fprintf(bcftools_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
203     int num = chain->num;
204 
205     if (num && ref_start <= chain->ref_last_block_ori) {
206         // In case this variant is back-to-back with the previous one
207         chain->ref_last_block_ori = ref_start + ref_len;
208         chain->alt_last_block_ori = alt_start + alt_len;
209         chain->ref_gaps[num-1] += ref_len;
210         chain->alt_gaps[num-1] += alt_len;
211 
212     } else {
213         // Extend the ungapped blocks, store the gap length
214         chain->block_lengths = (int*) realloc(chain->block_lengths, (num + 1) * sizeof(int));
215         chain->ref_gaps = (int*) realloc(chain->ref_gaps, (num + 1) * sizeof(int));
216         chain->alt_gaps = (int*) realloc(chain->alt_gaps, (num + 1) * sizeof(int));
217         chain->block_lengths[num] = ref_start - chain->ref_last_block_ori;
218         chain->ref_gaps[num] = ref_len;
219         chain->alt_gaps[num] = alt_len;
220         // Update the start positions of the next block
221         chain->ref_last_block_ori = ref_start + ref_len;
222         chain->alt_last_block_ori = alt_start + alt_len;
223         // Increment the number of ungapped blocks
224         chain->num++;
225     }
226 }
227 
init_data(args_t * args)228 static void init_data(args_t *args)
229 {
230     args->files = bcf_sr_init();
231     args->files->require_index = 1;
232     if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum));
233     args->hdr = args->files->readers[0].header;
234     args->isample = -1;
235     if ( args->sample )
236     {
237         args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
238         if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
239     }
240     if ( (args->haplotype || args->allele) && args->isample<0 )
241     {
242         if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
243         args->isample = 0;
244     }
245     int i;
246     for (i=0; i<args->nmask; i++)
247     {
248         mask_t *mask = &args->mask[i];
249         mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL);
250         if ( !mask->idx ) error("Failed to initialize mask regions\n");
251         mask->itr = regitr_init(mask->idx);
252     }
253     // In case we want to store the chains
254     if ( args->chain_fname )
255     {
256         args->fp_chain = fopen(args->chain_fname,"w");
257         if ( ! args->fp_chain ) error("Failed to create %s: %s\n", args->chain_fname, strerror(errno));
258         args->chain_id = 0;
259     }
260     rbuf_init(&args->vcf_rbuf, 100);
261     args->vcf_buf = (bcf1_t**) calloc(args->vcf_rbuf.m, sizeof(bcf1_t*));
262     if ( args->output_fname ) {
263         args->fp_out = fopen(args->output_fname,"w");
264         if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
265     }
266     else args->fp_out = bcftools_stdout;
267     if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
268     if ( args->filter_str )
269         args->filter = filter_init(args->hdr, args->filter_str);
270     args->rid = -1;
271 }
add_mask(args_t * args,char * fname)272 static void add_mask(args_t *args, char *fname)
273 {
274     args->nmask++;
275     args->mask = (mask_t*)realloc(args->mask,args->nmask*sizeof(*args->mask));
276     mask_t *mask = &args->mask[args->nmask-1];
277     mask->fname = fname;
278     mask->with  = 'N';
279 }
add_mask_with(args_t * args,char * with)280 static void add_mask_with(args_t *args, char *with)
281 {
282     if ( !args->nmask ) error("The --mask-with option must follow --mask\n");
283     mask_t *mask = &args->mask[args->nmask-1];
284     if ( !strcasecmp(with,"uc") ) mask->with = MASK_UC;
285     else if ( !strcasecmp(with,"lc") ) mask->with = MASK_LC;
286     else if ( strlen(with)!=1 ) error("Expected \"lc\", \"uc\", or a single character with the --mask-with option\n");
287     else mask->with = *with;
288 }
destroy_data(args_t * args)289 static void destroy_data(args_t *args)
290 {
291     free(args->iupac_bitmask);
292     if (args->filter) filter_destroy(args->filter);
293     bcf_sr_destroy(args->files);
294     int i;
295     for (i=0; i<args->vcf_rbuf.m; i++)
296         if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
297     free(args->vcf_buf);
298     free(args->fa_buf.s);
299     free(args->chr);
300     for (i=0; i<args->nmask; i++)
301     {
302         mask_t *mask = &args->mask[i];
303         regidx_destroy(mask->idx);
304         regitr_destroy(mask->itr);
305     }
306     free(args->mask);
307     if ( args->chain_fname )
308         if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
309     if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
310 }
311 
init_region(args_t * args,char * line)312 static void init_region(args_t *args, char *line)
313 {
314     char *ss, *se = line;
315     while ( *se && !isspace(*se) && *se!=':' ) se++;
316     int from = 0, to = 0;
317     char tmp = 0, *tmp_ptr = NULL;
318     if ( *se )
319     {
320         tmp = *se; *se = 0; tmp_ptr = se;
321         ss = ++se;
322         from = strtol(ss,&se,10);
323         if ( ss==se || !*se || *se!='-' ) from = 0;
324         else
325         {
326             from--;
327             ss = ++se;
328             to = strtol(ss,&se,10);
329             if ( ss==se || (*se && !isspace(*se)) ) { from = 0; to = 0; }
330             else to--;
331         }
332     }
333     free(args->chr);
334     args->chr = strdup(line);
335     args->rid = bcf_hdr_name2id(args->hdr,line);
336     if ( args->rid<0 ) fprintf(bcftools_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
337     args->prev_base_pos = -1;
338     args->fa_buf.l  = 0;
339     args->fa_length = 0;
340     args->fa_end_pos = to;
341     args->fa_ori_pos = from;
342     args->fa_src_pos = from;
343     args->fa_mod_off = 0;
344     args->fa_frz_pos = -1;
345     args->fa_frz_mod = -1;
346     args->fa_case    = -1;
347     args->vcf_rbuf.n = 0;
348     bcf_sr_seek(args->files,line,args->fa_ori_pos);
349     if ( tmp_ptr ) *tmp_ptr = tmp;
350     fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
351     if (args->chain_fname )
352     {
353         args->chain = init_chain(args->chain, args->fa_ori_pos);
354     } else {
355         args->chain = NULL;
356     }
357 }
358 
next_vcf_line(args_t * args)359 static bcf1_t **next_vcf_line(args_t *args)
360 {
361     if ( args->vcf_rbuf.n )
362     {
363         int i = rbuf_shift(&args->vcf_rbuf);
364         return &args->vcf_buf[i];
365     }
366     while ( bcf_sr_next_line(args->files) )
367     {
368         if ( args->filter )
369         {
370             int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL);
371             if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
372             if ( !is_ok ) continue;
373         }
374         return &args->files->readers[0].buffer[0];
375     }
376     return NULL;
377 }
unread_vcf_line(args_t * args,bcf1_t ** rec_ptr)378 static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
379 {
380     bcf1_t *rec = *rec_ptr;
381     if ( args->vcf_rbuf.n >= args->vcf_rbuf.m )
382         error("FIXME: too many overlapping records near %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
383 
384     // Insert the new record in the buffer. The line would be overwritten in
385     // the next bcf_sr_next_line call, therefore we need to swap it with an
386     // unused one
387     int i = rbuf_append(&args->vcf_rbuf);
388     if ( !args->vcf_buf[i] ) args->vcf_buf[i] = bcf_init1();
389     bcf1_t *tmp = rec; *rec_ptr = args->vcf_buf[i]; args->vcf_buf[i] = tmp;
390 }
flush_fa_buffer(args_t * args,int len)391 static void flush_fa_buffer(args_t *args, int len)
392 {
393     if ( !args->fa_buf.l ) return;
394     int nwr = 0;
395     while ( nwr + 60 <= args->fa_buf.l )
396     {
397         if ( fwrite(args->fa_buf.s+nwr,1,60,args->fp_out) != 60 ) error("Could not write: %s\n", args->output_fname);
398         if ( fwrite("\n",1,1,args->fp_out) != 1 ) error("Could not write: %s\n", args->output_fname);
399         nwr += 60;
400     }
401     if ( nwr )
402         args->fa_ori_pos += nwr;
403 
404     args->fa_frz_mod -= nwr;
405 
406     if ( len )
407     {
408         // not finished on this chr yet and the buffer cannot be emptied completely
409         if ( nwr && nwr < args->fa_buf.l )
410             memmove(args->fa_buf.s,args->fa_buf.s+nwr,args->fa_buf.l-nwr);
411         args->fa_buf.l -= nwr;
412         return;
413     }
414 
415     // empty the whole buffer
416     if ( nwr == args->fa_buf.l ) { args->fa_buf.l = 0; return; }
417 
418     if ( fwrite(args->fa_buf.s+nwr,1,args->fa_buf.l - nwr,args->fp_out) != args->fa_buf.l - nwr ) error("Could not write: %s\n", args->output_fname);
419     if ( fwrite("\n",1,1,args->fp_out) != 1 ) error("Could not write: %s\n", args->output_fname);
420 
421     args->fa_ori_pos += args->fa_buf.l - nwr - args->fa_mod_off;
422     args->fa_mod_off = 0;
423     args->fa_buf.l = 0;
424 }
apply_absent(args_t * args,hts_pos_t pos)425 static void apply_absent(args_t *args, hts_pos_t pos)
426 {
427     if ( !args->fa_buf.l || pos <= args->fa_frz_pos + 1 || pos <= args->fa_ori_pos ) return;
428 
429     int ie = pos && pos - args->fa_ori_pos + args->fa_mod_off < args->fa_buf.l ? pos - args->fa_ori_pos + args->fa_mod_off : args->fa_buf.l;
430     int ib = args->fa_frz_mod < 0 ? 0 : args->fa_frz_mod;
431     int i;
432     for (i=ib; i<ie; i++)
433         args->fa_buf.s[i] = args->absent_allele;
434 }
freeze_ref(args_t * args,bcf1_t * rec)435 static void freeze_ref(args_t *args, bcf1_t *rec)
436 {
437     if ( args->fa_frz_pos >= rec->pos + rec->rlen - 1 ) return;
438     args->fa_frz_pos = rec->pos + rec->rlen - 1;
439     args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen;
440 }
mark_del(char * ref,int rlen,char * alt,int mark)441 static char *mark_del(char *ref, int rlen, char *alt, int mark)
442 {
443     char *out = malloc(rlen+1);
444     int i;
445     if ( alt )
446     {
447         int nalt = strlen(alt);
448         for (i=0; i<nalt; i++) out[i] = alt[i];
449     }
450     else    // symbolic <DEL>
451     {
452         int nref = strlen(ref);
453         for (i=0; i<nref; i++) out[i] = ref[i];
454     }
455     for (; i<rlen; i++) out[i] = mark;
456     out[rlen] = 0;
457     return out;
458 }
mark_ins(char * ref,char * alt,char mark)459 static void mark_ins(char *ref, char *alt, char mark)
460 {
461     int i, nref = strlen(ref), nalt = strlen(alt);
462     if ( mark=='l' )
463         for (i=nref; i<nalt; i++) alt[i] = tolower(alt[i]);
464     else
465         for (i=nref; i<nalt; i++) alt[i] = toupper(alt[i]);
466 }
mark_snv(char * ref,char * alt,char mark)467 static void mark_snv(char *ref, char *alt, char mark)
468 {
469     int i, nref = strlen(ref), nalt = strlen(alt);
470     int n = nref < nalt ? nref : nalt;
471     if ( mark=='l' )
472     {
473         for (i=0; i<n; i++)
474             if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = tolower(alt[i]);
475     }
476     else
477     {
478         for (i=0; i<n; i++)
479             if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
480     }
481 }
apply_variant(args_t * args,bcf1_t * rec)482 static void apply_variant(args_t *args, bcf1_t *rec)
483 {
484     static int warned_haplotype = 0;
485 
486     if ( args->absent_allele ) apply_absent(args, rec->pos);
487     if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; }
488 
489     int i,j;
490     if ( args->mask )
491     {
492         char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
493         int start = rec->pos;
494         int end   = rec->pos + rec->rlen - 1;
495         for (i=0; i<args->nmask; i++)
496         {
497             mask_t *mask = &args->mask[i];
498             if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return;
499         }
500     }
501 
502     int ialt = 1;    // the alternate allele
503     if ( args->isample >= 0 )
504     {
505         bcf_unpack(rec, BCF_UN_FMT);
506         bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
507         if ( !fmt ) return;
508 
509         if ( fmt->type!=BCF_BT_INT8 )
510             error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
511         uint8_t *ptr = fmt->p + fmt->size*args->isample;
512 
513         enum { use_hap, use_iupac, pick_one } action = use_hap;
514         if ( args->allele==PICK_IUPAC )
515         {
516             if ( !args->haplotype ) action = use_iupac;
517             if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac;
518         }
519         else if ( args->output_iupac ) action = use_iupac;
520         else if ( !args->haplotype ) action = pick_one;
521 
522         if ( action==use_hap )
523         {
524             if ( args->haplotype > fmt->n )
525             {
526                 if ( bcf_gt_is_missing(ptr[fmt->n-1]) || bcf_gt_is_missing(ptr[0]) )
527                 {
528                     if ( !args->missing_allele ) return;
529                     ialt = -1;
530                 }
531                 else
532                 {
533                     if ( !warned_haplotype )
534                     {
535                         fprintf(bcftools_stderr, "Can't apply %d-th haplotype at %s:%"PRId64". (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
536                         warned_haplotype = 1;
537                     }
538                     return;
539                 }
540             }
541             else
542             {
543                 ialt = (int8_t)ptr[args->haplotype-1];
544                 if ( bcf_gt_is_missing(ialt) || ialt==bcf_int8_vector_end )
545                 {
546                     if ( !args->missing_allele ) return;
547                     ialt = -1;
548                 }
549                 else
550                     ialt = bcf_gt_allele(ialt);
551             }
552         }
553         else if ( action==use_iupac )
554         {
555             ialt = -1;
556             int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
557             for (i=0; i<fmt->n; i++)
558             {
559                 if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; }
560                 if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
561                 int jalt = bcf_gt_allele(ptr[i]);
562                 if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
563                 if ( fallback_alt <= 0 ) fallback_alt = jalt;
564 
565                 int l = strlen(rec->d.allele[jalt]);
566                 for (j=0; j<l; j++)
567                     if ( iupac2bitmask(rec->d.allele[jalt][j]) < 0 ) break;
568                 if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
569 
570                 if ( l > mlen )
571                 {
572                     hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
573                     for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
574                     mlen = l;
575                 }
576                 if ( jalt>0 && l>alen )
577                 {
578                     alen = l;
579                     ialt = jalt;
580                 }
581                 for (j=0; j<l; j++)
582                     args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]);
583             }
584             if ( alen > 0 )
585                 for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
586             else if ( fallback_alt >= 0 )
587                 ialt = fallback_alt;
588             else if ( is_missing && !args->missing_allele ) return;
589         }
590         else
591         {
592             int is_hom = 1;
593             for (i=0; i<fmt->n; i++)
594             {
595                 if ( bcf_gt_is_missing(ptr[i]) )
596                 {
597                     if ( !args->missing_allele ) return;  // ignore missing or half-missing genotypes
598                     ialt = -1;
599                     break;
600                 }
601                 if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
602                 ialt = bcf_gt_allele(ptr[i]);
603                 if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
604             }
605             if ( !is_hom )
606             {
607                 int prev_len = 0, jalt;
608                 for (i=0; i<fmt->n; i++)
609                 {
610                     if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
611                     jalt = bcf_gt_allele(ptr[i]);
612                     if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
613                     if ( args->allele & (PICK_LONG|PICK_SHORT) )
614                     {
615                         int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]);
616                         if ( i==0 ) ialt = jalt, prev_len = len;
617                         else if ( len == prev_len )
618                         {
619                             if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len;
620                             else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len;
621                         }
622                         else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len;
623                         else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len;
624                     }
625                     else
626                     {
627                         if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt;
628                         else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt;
629                     }
630                 }
631             }
632         }
633         if ( !ialt )
634         {
635             // ref allele
636             if ( args->absent_allele ) freeze_ref(args,rec);
637             return;
638         }
639         if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
640     }
641     else if ( args->output_iupac && rec->n_allele>1 )
642     {
643         int ialt, alen = 0, mlen = 0;
644         for (i=0; i<rec->n_allele; i++)
645         {
646             int l = strlen(rec->d.allele[i]);
647             for (j=0; j<l; j++)
648                 if ( iupac2bitmask(rec->d.allele[i][j]) < 0 ) break;
649             if ( j<l ) continue;    // symbolic allele, breakpoint or invalid character in the allele
650 
651             if ( l > mlen )
652             {
653                 hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
654                 for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
655                 mlen = l;
656             }
657             if ( i>0 && l>alen )
658             {
659                 alen = l;
660                 ialt = i;
661             }
662             for (j=0; j<l; j++)
663                 args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]);
664         }
665         if ( alen > 0 )
666             for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
667         else
668             ialt = 1;
669     }
670 
671     if ( rec->n_allele==1 && ialt!=-1 )
672     {
673         // non-missing reference
674         if ( args->absent_allele ) freeze_ref(args,rec);
675         return;
676     }
677     if ( ialt==-1 )
678     {
679         char alleles[4];
680         alleles[0] = rec->d.allele[0][0];
681         alleles[1] = ',';
682         alleles[2] = args->missing_allele;
683         alleles[3] = 0;
684         bcf_update_alleles_str(args->hdr, rec, alleles);
685         ialt = 1;
686     }
687 
688     // For some variant types POS+REF refer to the base *before* the event; in such case set trim_beg
689     int trim_beg = 0;
690     int var_type = bcf_get_variant_type(rec,ialt);
691     int var_len  = rec->d.var[ialt].n;
692     if ( var_type & VCF_INDEL )
693     {
694         // normally indel starts one base after, but not if the first base of the fa reference is deleted
695         if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] )
696             trim_beg = 1;
697         else
698             trim_beg = 0;
699     }
700     else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"<DEL>") )
701     {
702         trim_beg = 1;
703         var_len  = 1 - rec->rlen;
704     }
705     else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"<INS",4) ) trim_beg = 1;
706 
707     // Overlapping variant?
708     if ( rec->pos <= args->fa_frz_pos )
709     {
710         // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
711         // This still may not be enough for more complicated cases with multiple duplicate positions
712         // and other types in between. In such case let the user normalize the VCF and remove duplicates.
713 
714         int overlap = 0;
715         if ( rec->pos < args->fa_frz_pos || !trim_beg || var_len==0 || args->prev_is_insert ) overlap = 1;
716 
717         if ( overlap )
718         {
719             fprintf(bcftools_stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
720             return;
721         }
722 
723     }
724 
725     char *alt_allele = rec->d.allele[ialt];
726     int rmme_alt = 0;
727 
728     int len_diff = 0, alen = 0;
729     int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
730     if ( idx<0 )
731     {
732         fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
733         return;
734     }
735     if ( rec->rlen > args->fa_buf.l - idx )
736     {
737         rec->rlen = args->fa_buf.l - idx;
738         alen = strlen(alt_allele);
739         if ( alen > rec->rlen )
740         {
741             alt_allele[rec->rlen] = 0;
742             fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
743         }
744     }
745     if ( idx>=args->fa_buf.l )
746         error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
747 
748     // sanity check the reference base
749     if ( alt_allele[0]=='<' )
750     {
751         // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG
752 
753         if ( strcasecmp(alt_allele,"<DEL>") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"<NON_REF>") )
754             error("Symbolic alleles other than <DEL>, <*> or <NON_REF> are currently not supported, e.g. %s at %s:%"PRId64".\n"
755                   "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n",
756                 alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
757         if ( !strcasecmp(alt_allele,"<DEL>") )
758         {
759             static int multibase_ref_del_warned = 0;
760             if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
761             {
762                 fprintf(bcftools_stderr,
763                     "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
764                     "         (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
765                 multibase_ref_del_warned = 1;
766             }
767             if ( args->mark_del )   // insert dashes instead of delete sequence
768             {
769                 alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
770                 alen = rec->rlen;
771                 len_diff = 0;
772                 rmme_alt = 1;
773             }
774             else
775             {
776                 len_diff = 1-rec->rlen;
777                 alt_allele = rec->d.allele[0];     // according to VCF spec, the first REF base must precede the event
778                 alen = 1;
779             }
780         }
781         else
782         {
783             // <*>  or <NON_REF> .. gVCF, evidence for the reference allele throughout the whole block
784             freeze_ref(args,rec);
785             return;
786         }
787     }
788     else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
789     {
790         // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA),
791         // the reference base in fa_buf is lost and the check fails. We do not keep a buffer
792         // with the original sequence as it should not be necessary, we should encounter max
793         // one base overlap
794 
795         int fail = 1;
796         if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) )
797         {
798             if ( rec->rlen==1 ) fail = 0;
799             else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
800         }
801 
802         if ( fail )
803         {
804             char tmp = 0;
805             if ( args->fa_buf.l - idx > rec->rlen )
806             {
807                 tmp = args->fa_buf.s[idx+rec->rlen];
808                 args->fa_buf.s[idx+rec->rlen] = 0;
809             }
810             error(
811                     "The fasta sequence does not match the REF allele at %s:%"PRId64":\n"
812                     "   REF .vcf: [%s]\n"
813                     "   ALT .vcf: [%s]\n"
814                     "   REF .fa : [%s]%c%s\n",
815                     bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
816                     tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
817                  );
818         }
819         alen = strlen(alt_allele);
820         len_diff = alen - rec->rlen;
821 
822         if ( args->mark_del && len_diff<0 )
823         {
824             alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
825             alen = rec->rlen;
826             len_diff = 0;
827             rmme_alt = 1;
828         }
829     }
830     else
831     {
832         alen = strlen(alt_allele);
833         len_diff = alen - rec->rlen;
834 
835         if ( args->mark_del && len_diff<0 )
836         {
837             alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
838             alen = rec->rlen;
839             len_diff = 0;
840             rmme_alt = 1;
841         }
842     }
843 
844     args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
845     if ( args->fa_case==TO_UPPER )
846         for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
847     else
848         for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);
849 
850     if ( args->mark_ins && len_diff>0 )
851         mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
852     if ( args->mark_snv )
853         mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
854 
855     if ( len_diff <= 0 )
856     {
857         // deletion or same size event
858         assert( args->fa_buf.l >= idx+rec->rlen );
859         args->prev_base = args->fa_buf.s[idx+rec->rlen-1];
860         args->prev_base_pos = rec->pos + rec->rlen - 1;
861         args->prev_is_insert = 0;
862         args->fa_frz_mod = idx + alen;
863 
864         for (i=trim_beg; i<alen; i++)
865             args->fa_buf.s[idx+i] = alt_allele[i];
866 
867         if ( len_diff )
868             memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
869     }
870     else
871     {
872         args->prev_is_insert = 1;
873         args->prev_base_pos = rec->pos;
874 
875         // insertion
876         ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
877         memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
878 
879         // This can get tricky, make sure the bases unchanged by the insertion do not overwrite preceeding variants.
880         // For example, here we want to get TAA:
881         //      POS REF ALT
882         //      1   C   T
883         //      1   C   CAA
884         int ibeg = 0;
885         while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
886         for (i=ibeg; i<alen; i++)
887             args->fa_buf.s[idx+i] = alt_allele[i];
888 
889         args->fa_frz_mod = idx + alen - ibeg + 1;
890     }
891     if (args->chain && len_diff != 0)
892     {
893         // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
894         if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
895         {
896             // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
897             push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
898         }
899         else
900         {
901             // otherwise, just the coordinates of the variant as given
902             push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
903         }
904     }
905     args->fa_buf.l += len_diff;
906     args->fa_mod_off += len_diff;
907     args->fa_frz_pos  = rec->pos + rec->rlen - 1;
908     args->napplied++;
909     if ( rmme_alt ) free(alt_allele);
910 }
911 
912 
mask_region(args_t * args,char * seq,int len)913 static void mask_region(args_t *args, char *seq, int len)
914 {
915     int start = args->fa_src_pos - len;
916     int end   = args->fa_src_pos;
917     int i;
918 
919     for (i=0; i<args->nmask; i++)
920     {
921         mask_t *mask = &args->mask[i];
922         if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue;
923 
924         int idx_start, idx_end, j;
925         while ( regitr_overlap(mask->itr) )
926         {
927             idx_start = mask->itr->beg - start;
928             idx_end   = mask->itr->end - start;
929             if ( idx_start < 0 ) idx_start = 0;
930             if ( idx_end >= len ) idx_end = len - 1;
931             if ( mask->with==MASK_UC )
932                 for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]);
933             else if ( mask->with==MASK_LC )
934                 for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]);
935             else
936                 for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with;
937         }
938     }
939 }
940 
consensus(args_t * args)941 static void consensus(args_t *args)
942 {
943     BGZF *fasta = bgzf_open(args->ref_fname, "r");
944     if ( !fasta ) error("Error reading %s\n", args->ref_fname);
945     kstring_t str = {0,0,0};
946     while ( bgzf_getline(fasta, '\n', &str) > 0 )
947     {
948         if ( str.s[0]=='>' )
949         {
950             // new sequence encountered
951             if (args->chain) {
952                 print_chain(args);
953                 destroy_chain(args);
954             }
955             // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
956             bcf1_t **rec_ptr = NULL;
957             while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
958             {
959                 bcf1_t *rec = *rec_ptr;
960                 if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
961                 apply_variant(args, rec);
962             }
963             if ( args->absent_allele )
964             {
965                 int pos = 0;
966                 if ( args->vcf_rbuf.n && args->vcf_buf[args->vcf_rbuf.f]->rid==args->rid )
967                     pos = args->vcf_buf[args->vcf_rbuf.f]->pos;
968                 apply_absent(args, pos);
969             }
970             flush_fa_buffer(args, 0);
971             init_region(args, str.s+1);
972             continue;
973         }
974         args->fa_length  += str.l;
975         args->fa_src_pos += str.l;
976 
977         // determine if uppercase or lowercase is used in this fasta file
978         if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0;
979 
980         if ( args->mask ) mask_region(args, str.s, str.l);
981         kputs(str.s, &args->fa_buf);
982 
983         bcf1_t **rec_ptr = NULL;
984         while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
985         {
986             bcf1_t *rec = *rec_ptr;
987 
988             // still the same chr and the same region? if not, fasta buf can be flushed
989             if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) )
990             {
991                 // save the vcf record until next time and flush
992                 unread_vcf_line(args, rec_ptr);
993                 rec_ptr = NULL;
994                 break;
995             }
996 
997             // is the vcf record well beyond cached fasta buffer? if yes, the buf can be flushed
998             if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos )
999             {
1000                 unread_vcf_line(args, rec_ptr);
1001                 rec_ptr = NULL;
1002                 break;
1003             }
1004 
1005             // is the cached fasta buffer full enough? if not, read more fasta, no flushing
1006             if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off < rec->pos + rec->rlen )
1007             {
1008                 unread_vcf_line(args, rec_ptr);
1009                 break;
1010             }
1011             apply_variant(args, rec);
1012         }
1013         if ( !rec_ptr )
1014         {
1015             if ( args->absent_allele ) apply_absent(args, args->fa_ori_pos - args->fa_mod_off + args->fa_buf.l);
1016             flush_fa_buffer(args, 60);
1017         }
1018     }
1019     bcf1_t **rec_ptr = NULL;
1020     while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
1021     {
1022         bcf1_t *rec = *rec_ptr;
1023         if ( rec->rid!=args->rid ) break;
1024         if ( args->fa_end_pos && rec->pos > args->fa_end_pos ) break;
1025         if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
1026         apply_variant(args, rec);
1027     }
1028     if (args->chain)
1029     {
1030         print_chain(args);
1031         destroy_chain(args);
1032     }
1033     if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
1034     flush_fa_buffer(args, 0);
1035     bgzf_close(fasta);
1036     free(str.s);
1037     fprintf(bcftools_stderr,"Applied %d variants\n", args->napplied);
1038 }
1039 
usage(args_t * args)1040 static void usage(args_t *args)
1041 {
1042     fprintf(bcftools_stderr, "\n");
1043     fprintf(bcftools_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n");
1044     fprintf(bcftools_stderr, "       file. By default, the program will apply all ALT variants. Using the\n");
1045     fprintf(bcftools_stderr, "       --sample (and, optionally, --haplotype) option will apply genotype\n");
1046     fprintf(bcftools_stderr, "       (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
1047     fprintf(bcftools_stderr, "       information, such as INFO/AD or FORMAT/AD.\n");
1048     fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
1049     fprintf(bcftools_stderr, "Options:\n");
1050     fprintf(bcftools_stderr, "    -c, --chain FILE               write a chain file for liftover\n");
1051     fprintf(bcftools_stderr, "    -a, --absent CHAR              replace positions absent from VCF with CHAR\n");
1052     fprintf(bcftools_stderr, "    -e, --exclude EXPR             exclude sites for which the expression is true (see man page for details)\n");
1053     fprintf(bcftools_stderr, "    -f, --fasta-ref FILE           reference sequence in fasta format\n");
1054     fprintf(bcftools_stderr, "    -H, --haplotype WHICH          choose which allele to use from the FORMAT/GT field, note\n");
1055     fprintf(bcftools_stderr, "                                   the codes are case-insensitive:\n");
1056     fprintf(bcftools_stderr, "                                       1: first allele from GT, regardless of phasing\n");
1057     fprintf(bcftools_stderr, "                                       2: second allele from GT, regardless of phasing\n");
1058     fprintf(bcftools_stderr, "                                       R: REF allele in het genotypes\n");
1059     fprintf(bcftools_stderr, "                                       A: ALT allele\n");
1060     fprintf(bcftools_stderr, "                                       I: IUPAC code for all genotypes\n");
1061     fprintf(bcftools_stderr, "                                       LR,LA: longer allele and REF/ALT if equal length\n");
1062     fprintf(bcftools_stderr, "                                       SR,SA: shorter allele and REF/ALT if equal length\n");
1063     fprintf(bcftools_stderr, "                                       1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
1064     fprintf(bcftools_stderr, "    -i, --include EXPR             select sites for which the expression is true (see man page for details)\n");
1065     fprintf(bcftools_stderr, "    -I, --iupac-codes              output variants in the form of IUPAC ambiguity codes\n");
1066     fprintf(bcftools_stderr, "        --mark-del CHAR            instead of removing sequence, insert CHAR for deletions\n");
1067     fprintf(bcftools_stderr, "        --mark-ins uc|lc           highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
1068     fprintf(bcftools_stderr, "        --mark-snv uc|lc           highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
1069     fprintf(bcftools_stderr, "    -m, --mask FILE                replace regions according to the next --mask-with option. The default is --mask-with N\n");
1070     fprintf(bcftools_stderr, "        --mask-with CHAR|uc|lc     replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
1071     fprintf(bcftools_stderr, "    -M, --missing CHAR             output CHAR instead of skipping a missing genotype \"./.\"\n");
1072     fprintf(bcftools_stderr, "    -o, --output FILE              write output to a file [standard output]\n");
1073     fprintf(bcftools_stderr, "    -p, --prefix STRING            prefix to add to output sequence names\n");
1074     fprintf(bcftools_stderr, "    -s, --sample NAME              apply variants of the given sample\n");
1075     fprintf(bcftools_stderr, "Examples:\n");
1076     fprintf(bcftools_stderr, "   # Get the consensus for one region. The fasta header lines are then expected\n");
1077     fprintf(bcftools_stderr, "   # in the form \">chr:from-to\".\n");
1078     fprintf(bcftools_stderr, "   samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
1079     fprintf(bcftools_stderr, "\n");
1080     bcftools_exit(1);
1081 }
1082 
main_consensus(int argc,char * argv[])1083 int main_consensus(int argc, char *argv[])
1084 {
1085     args_t *args = (args_t*) calloc(1,sizeof(args_t));
1086     args->argc   = argc; args->argv = argv;
1087 
1088     static struct option loptions[] =
1089     {
1090         {"mark-del",required_argument,NULL,1},
1091         {"mark-ins",required_argument,NULL,2},
1092         {"mark-snv",required_argument,NULL,3},
1093         {"mask-with",1,0,4},
1094         {"exclude",required_argument,NULL,'e'},
1095         {"include",required_argument,NULL,'i'},
1096         {"sample",1,0,'s'},
1097         {"iupac-codes",0,0,'I'},
1098         {"haplotype",1,0,'H'},
1099         {"output",1,0,'o'},
1100         {"fasta-ref",1,0,'f'},
1101         {"mask",1,0,'m'},
1102         {"missing",1,0,'M'},
1103         {"absent",1,0,'a'},
1104         {"chain",1,0,'c'},
1105         {"prefix",required_argument,0,'p'},
1106         {0,0,0,0}
1107     };
1108     int c;
1109     while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
1110     {
1111         switch (c)
1112         {
1113             case  1 : args->mark_del = optarg[0]; break;
1114             case  2 :
1115                 if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
1116                 else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
1117                 else error("The argument is not recognised: --mark-ins %s\n",optarg);
1118                 break;
1119             case  3 :
1120                 if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
1121                 else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
1122                 else error("The argument is not recognised: --mark-snv %s\n",optarg);
1123                 break;
1124             case 'p': args->chr_prefix = optarg; break;
1125             case 's': args->sample = optarg; break;
1126             case 'o': args->output_fname = optarg; break;
1127             case 'I': args->output_iupac = 1; break;
1128             case 'e':
1129                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1130                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
1131             case 'i':
1132                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1133                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
1134             case 'f': args->ref_fname = optarg; break;
1135             case 'm': add_mask(args,optarg); break;
1136             case  4 : add_mask_with(args,optarg); break;
1137             case 'a':
1138                 args->absent_allele = optarg[0];
1139                 if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
1140                 break;
1141             case 'M':
1142                 args->missing_allele = optarg[0];
1143                 if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
1144                 break;
1145             case 'c': args->chain_fname = optarg; break;
1146             case 'H':
1147                 if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
1148                 else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
1149                 else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
1150                 else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF;
1151                 else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF;
1152                 else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
1153                 else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
1154                 else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
1155                 else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC;
1156                 else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1;
1157                 else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2;
1158                 else
1159                 {
1160                     char *tmp;
1161                     args->haplotype = strtol(optarg, &tmp, 10);
1162                     if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg);
1163                     if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n");
1164                 }
1165                 break;
1166             default: usage(args); break;
1167         }
1168     }
1169     if ( optind>=argc ) usage(args);
1170     args->fname = argv[optind];
1171 
1172     if ( !args->ref_fname && !isatty(fileno((FILE *)stdin)) ) args->ref_fname = "-";
1173     if ( !args->ref_fname ) usage(args);
1174 
1175     init_data(args);
1176     consensus(args);
1177     destroy_data(args);
1178     free(args);
1179 
1180     return 0;
1181 }
1182 
1183 
1184