1 #include "bcftools.pysam.h"
2
3 /* The MIT License
4
5 Copyright (c) 2014-2021 Genome Research Ltd.
6
7 Author: Petr Danecek <pd3@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26
27 */
28
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <assert.h>
34 #include <errno.h>
35 #include <getopt.h>
36 #include <unistd.h>
37 #include <ctype.h>
38 #include <htslib/vcf.h>
39 #include <htslib/kstring.h>
40 #include <htslib/synced_bcf_reader.h>
41 #include <htslib/kseq.h>
42 #include <htslib/bgzf.h>
43 #include "regidx.h"
44 #include "bcftools.h"
45 #include "rbuf.h"
46 #include "filter.h"
47
48 // Logic of the filters: include or exclude sites which match the filters?
49 #define FLT_INCLUDE 1
50 #define FLT_EXCLUDE 2
51
52 #define PICK_REF 1
53 #define PICK_ALT 2
54 #define PICK_LONG 4
55 #define PICK_SHORT 8
56 #define PICK_IUPAC 16
57
58 #define TO_UPPER 0
59 #define TO_LOWER 1
60
61 typedef struct
62 {
63 int num; // number of ungapped blocks in this chain
64 int *block_lengths; // length of the ungapped blocks in this chain
65 int *ref_gaps; // length of the gaps on the reference sequence between blocks
66 int *alt_gaps; // length of the gaps on the alternative sequence between blocks
67 int ori_pos;
68 int ref_last_block_ori; // start position on the reference sequence of the following ungapped block (0-based)
69 int alt_last_block_ori; // start position on the alternative sequence of the following ungapped block (0-based)
70 }
71 chain_t;
72
73 #define MASK_LC 1
74 #define MASK_UC 2
75 #define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 1 : 0)
76 typedef struct
77 {
78 char *fname, with;
79 regidx_t *idx;
80 regitr_t *itr;
81 }
82 mask_t;
83
84 typedef struct
85 {
86 kstring_t fa_buf; // buffered reference sequence
87 int fa_ori_pos; // start position of the fa_buffer (wrt original sequence)
88 int fa_frz_pos; // protected position to avoid conflicting variants (last pos for SNPs/ins)
89 int fa_mod_off; // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
90 int fa_frz_mod; // the fa_buf offset of the protected fa_frz_pos position, includes the modified sequence
91 int fa_end_pos; // region's end position in the original sequence
92 int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
93 int fa_case; // output upper case or lower case: TO_UPPER|TO_LOWER
94 int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
95 char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
96 int prev_base_pos; // the position of prev_base
97 int prev_is_insert;
98
99 rbuf_t vcf_rbuf;
100 bcf1_t **vcf_buf;
101 int nvcf_buf, rid;
102 char *chr, *chr_prefix;
103
104 mask_t *mask;
105 int nmask;
106
107 int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
108 chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
109 // Note that the chain is re-initialised for each chromosome/seq_region
110
111 filter_t *filter;
112 char *filter_str;
113 int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
114
115 bcf_srs_t *files;
116 bcf_hdr_t *hdr;
117 FILE *fp_out;
118 FILE *fp_chain;
119 char **argv;
120 int argc, output_iupac, haplotype, allele, isample, napplied;
121 uint8_t *iupac_bitmask;
122 int miupac_bitmask;
123 char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
124 char mark_del, mark_ins, mark_snv;
125 }
126 args_t;
127
init_chain(chain_t * chain,int ref_ori_pos)128 static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
129 {
130 // fprintf(bcftools_stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
131 chain = (chain_t*) calloc(1,sizeof(chain_t));
132 chain->num = 0;
133 chain->block_lengths = NULL;
134 chain->ref_gaps = NULL;
135 chain->alt_gaps = NULL;
136 chain->ori_pos = ref_ori_pos;
137 chain->ref_last_block_ori = ref_ori_pos;
138 chain->alt_last_block_ori = ref_ori_pos;
139 return chain;
140 }
141
destroy_chain(args_t * args)142 static void destroy_chain(args_t *args)
143 {
144 chain_t *chain = args->chain;
145 free(chain->ref_gaps);
146 free(chain->alt_gaps);
147 free(chain->block_lengths);
148 free(chain);
149 chain = NULL;
150 free(args->chr);
151 args->chr = NULL;
152 }
153
print_chain(args_t * args)154 static void print_chain(args_t *args)
155 {
156 /*
157 Example chain format (see: https://genome.ucsc.edu/goldenPath/help/chain.html):
158 chain 1 500 + 480 500 1 501 + 480 501 1
159 12 3 1
160 1 0 3
161 484
162
163 chain line is:
164 - chain
165 - score (sum of the length of ungapped block in this case)
166 - ref_seqname (from the fasta header, parsed by htslib)
167 - ref_seqlength (from the fasta header)
168 - ref_strand (+ or -; always + for bcf-consensus)
169 - ref_start (as defined in the fasta header)
170 - ref_end (as defined in the fasta header)
171 - alt_seqname (same as ref_seqname as bcf-consensus only considers SNPs and indels)
172 - alt_seqlength (adjusted to match the length of the alt sequence)
173 - alt_strand (+ or -; always + for bcf-consensus)
174 - alt_start (same as ref_start, as no edits are recorded/applied before that position)
175 - alt_end (adjusted to match the length of the alt sequence)
176 - chain_num (just an auto-increment id)
177
178 the other (sorted) lines are:
179 - length of the ungapped alignment block
180 - gap on the ref sequence between this and the next block (all but the last line)
181 - gap on the alt sequence between this and the next block (all but the last line)
182 */
183 chain_t *chain = args->chain;
184 int n = chain->num;
185 int ref_end_pos = args->fa_length + chain->ori_pos;
186 int last_block_size = ref_end_pos - chain->ref_last_block_ori;
187 int alt_end_pos = chain->alt_last_block_ori + last_block_size;
188 int score = 0;
189 for (n=0; n<chain->num; n++) {
190 score += chain->block_lengths[n];
191 }
192 score += last_block_size;
193 fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
194 for (n=0; n<chain->num; n++) {
195 fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
196 }
197 fprintf(args->fp_chain, "%d\n\n", last_block_size);
198 }
199
push_chain_gap(chain_t * chain,int ref_start,int ref_len,int alt_start,int alt_len)200 static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
201 {
202 // fprintf(bcftools_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
203 int num = chain->num;
204
205 if (num && ref_start <= chain->ref_last_block_ori) {
206 // In case this variant is back-to-back with the previous one
207 chain->ref_last_block_ori = ref_start + ref_len;
208 chain->alt_last_block_ori = alt_start + alt_len;
209 chain->ref_gaps[num-1] += ref_len;
210 chain->alt_gaps[num-1] += alt_len;
211
212 } else {
213 // Extend the ungapped blocks, store the gap length
214 chain->block_lengths = (int*) realloc(chain->block_lengths, (num + 1) * sizeof(int));
215 chain->ref_gaps = (int*) realloc(chain->ref_gaps, (num + 1) * sizeof(int));
216 chain->alt_gaps = (int*) realloc(chain->alt_gaps, (num + 1) * sizeof(int));
217 chain->block_lengths[num] = ref_start - chain->ref_last_block_ori;
218 chain->ref_gaps[num] = ref_len;
219 chain->alt_gaps[num] = alt_len;
220 // Update the start positions of the next block
221 chain->ref_last_block_ori = ref_start + ref_len;
222 chain->alt_last_block_ori = alt_start + alt_len;
223 // Increment the number of ungapped blocks
224 chain->num++;
225 }
226 }
227
init_data(args_t * args)228 static void init_data(args_t *args)
229 {
230 args->files = bcf_sr_init();
231 args->files->require_index = 1;
232 if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum));
233 args->hdr = args->files->readers[0].header;
234 args->isample = -1;
235 if ( args->sample )
236 {
237 args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
238 if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
239 }
240 if ( (args->haplotype || args->allele) && args->isample<0 )
241 {
242 if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
243 args->isample = 0;
244 }
245 int i;
246 for (i=0; i<args->nmask; i++)
247 {
248 mask_t *mask = &args->mask[i];
249 mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL);
250 if ( !mask->idx ) error("Failed to initialize mask regions\n");
251 mask->itr = regitr_init(mask->idx);
252 }
253 // In case we want to store the chains
254 if ( args->chain_fname )
255 {
256 args->fp_chain = fopen(args->chain_fname,"w");
257 if ( ! args->fp_chain ) error("Failed to create %s: %s\n", args->chain_fname, strerror(errno));
258 args->chain_id = 0;
259 }
260 rbuf_init(&args->vcf_rbuf, 100);
261 args->vcf_buf = (bcf1_t**) calloc(args->vcf_rbuf.m, sizeof(bcf1_t*));
262 if ( args->output_fname ) {
263 args->fp_out = fopen(args->output_fname,"w");
264 if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
265 }
266 else args->fp_out = bcftools_stdout;
267 if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
268 if ( args->filter_str )
269 args->filter = filter_init(args->hdr, args->filter_str);
270 args->rid = -1;
271 }
add_mask(args_t * args,char * fname)272 static void add_mask(args_t *args, char *fname)
273 {
274 args->nmask++;
275 args->mask = (mask_t*)realloc(args->mask,args->nmask*sizeof(*args->mask));
276 mask_t *mask = &args->mask[args->nmask-1];
277 mask->fname = fname;
278 mask->with = 'N';
279 }
add_mask_with(args_t * args,char * with)280 static void add_mask_with(args_t *args, char *with)
281 {
282 if ( !args->nmask ) error("The --mask-with option must follow --mask\n");
283 mask_t *mask = &args->mask[args->nmask-1];
284 if ( !strcasecmp(with,"uc") ) mask->with = MASK_UC;
285 else if ( !strcasecmp(with,"lc") ) mask->with = MASK_LC;
286 else if ( strlen(with)!=1 ) error("Expected \"lc\", \"uc\", or a single character with the --mask-with option\n");
287 else mask->with = *with;
288 }
destroy_data(args_t * args)289 static void destroy_data(args_t *args)
290 {
291 free(args->iupac_bitmask);
292 if (args->filter) filter_destroy(args->filter);
293 bcf_sr_destroy(args->files);
294 int i;
295 for (i=0; i<args->vcf_rbuf.m; i++)
296 if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
297 free(args->vcf_buf);
298 free(args->fa_buf.s);
299 free(args->chr);
300 for (i=0; i<args->nmask; i++)
301 {
302 mask_t *mask = &args->mask[i];
303 regidx_destroy(mask->idx);
304 regitr_destroy(mask->itr);
305 }
306 free(args->mask);
307 if ( args->chain_fname )
308 if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
309 if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
310 }
311
init_region(args_t * args,char * line)312 static void init_region(args_t *args, char *line)
313 {
314 char *ss, *se = line;
315 while ( *se && !isspace(*se) && *se!=':' ) se++;
316 int from = 0, to = 0;
317 char tmp = 0, *tmp_ptr = NULL;
318 if ( *se )
319 {
320 tmp = *se; *se = 0; tmp_ptr = se;
321 ss = ++se;
322 from = strtol(ss,&se,10);
323 if ( ss==se || !*se || *se!='-' ) from = 0;
324 else
325 {
326 from--;
327 ss = ++se;
328 to = strtol(ss,&se,10);
329 if ( ss==se || (*se && !isspace(*se)) ) { from = 0; to = 0; }
330 else to--;
331 }
332 }
333 free(args->chr);
334 args->chr = strdup(line);
335 args->rid = bcf_hdr_name2id(args->hdr,line);
336 if ( args->rid<0 ) fprintf(bcftools_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
337 args->prev_base_pos = -1;
338 args->fa_buf.l = 0;
339 args->fa_length = 0;
340 args->fa_end_pos = to;
341 args->fa_ori_pos = from;
342 args->fa_src_pos = from;
343 args->fa_mod_off = 0;
344 args->fa_frz_pos = -1;
345 args->fa_frz_mod = -1;
346 args->fa_case = -1;
347 args->vcf_rbuf.n = 0;
348 bcf_sr_seek(args->files,line,args->fa_ori_pos);
349 if ( tmp_ptr ) *tmp_ptr = tmp;
350 fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
351 if (args->chain_fname )
352 {
353 args->chain = init_chain(args->chain, args->fa_ori_pos);
354 } else {
355 args->chain = NULL;
356 }
357 }
358
next_vcf_line(args_t * args)359 static bcf1_t **next_vcf_line(args_t *args)
360 {
361 if ( args->vcf_rbuf.n )
362 {
363 int i = rbuf_shift(&args->vcf_rbuf);
364 return &args->vcf_buf[i];
365 }
366 while ( bcf_sr_next_line(args->files) )
367 {
368 if ( args->filter )
369 {
370 int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL);
371 if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
372 if ( !is_ok ) continue;
373 }
374 return &args->files->readers[0].buffer[0];
375 }
376 return NULL;
377 }
unread_vcf_line(args_t * args,bcf1_t ** rec_ptr)378 static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
379 {
380 bcf1_t *rec = *rec_ptr;
381 if ( args->vcf_rbuf.n >= args->vcf_rbuf.m )
382 error("FIXME: too many overlapping records near %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
383
384 // Insert the new record in the buffer. The line would be overwritten in
385 // the next bcf_sr_next_line call, therefore we need to swap it with an
386 // unused one
387 int i = rbuf_append(&args->vcf_rbuf);
388 if ( !args->vcf_buf[i] ) args->vcf_buf[i] = bcf_init1();
389 bcf1_t *tmp = rec; *rec_ptr = args->vcf_buf[i]; args->vcf_buf[i] = tmp;
390 }
flush_fa_buffer(args_t * args,int len)391 static void flush_fa_buffer(args_t *args, int len)
392 {
393 if ( !args->fa_buf.l ) return;
394 int nwr = 0;
395 while ( nwr + 60 <= args->fa_buf.l )
396 {
397 if ( fwrite(args->fa_buf.s+nwr,1,60,args->fp_out) != 60 ) error("Could not write: %s\n", args->output_fname);
398 if ( fwrite("\n",1,1,args->fp_out) != 1 ) error("Could not write: %s\n", args->output_fname);
399 nwr += 60;
400 }
401 if ( nwr )
402 args->fa_ori_pos += nwr;
403
404 args->fa_frz_mod -= nwr;
405
406 if ( len )
407 {
408 // not finished on this chr yet and the buffer cannot be emptied completely
409 if ( nwr && nwr < args->fa_buf.l )
410 memmove(args->fa_buf.s,args->fa_buf.s+nwr,args->fa_buf.l-nwr);
411 args->fa_buf.l -= nwr;
412 return;
413 }
414
415 // empty the whole buffer
416 if ( nwr == args->fa_buf.l ) { args->fa_buf.l = 0; return; }
417
418 if ( fwrite(args->fa_buf.s+nwr,1,args->fa_buf.l - nwr,args->fp_out) != args->fa_buf.l - nwr ) error("Could not write: %s\n", args->output_fname);
419 if ( fwrite("\n",1,1,args->fp_out) != 1 ) error("Could not write: %s\n", args->output_fname);
420
421 args->fa_ori_pos += args->fa_buf.l - nwr - args->fa_mod_off;
422 args->fa_mod_off = 0;
423 args->fa_buf.l = 0;
424 }
apply_absent(args_t * args,hts_pos_t pos)425 static void apply_absent(args_t *args, hts_pos_t pos)
426 {
427 if ( !args->fa_buf.l || pos <= args->fa_frz_pos + 1 || pos <= args->fa_ori_pos ) return;
428
429 int ie = pos && pos - args->fa_ori_pos + args->fa_mod_off < args->fa_buf.l ? pos - args->fa_ori_pos + args->fa_mod_off : args->fa_buf.l;
430 int ib = args->fa_frz_mod < 0 ? 0 : args->fa_frz_mod;
431 int i;
432 for (i=ib; i<ie; i++)
433 args->fa_buf.s[i] = args->absent_allele;
434 }
freeze_ref(args_t * args,bcf1_t * rec)435 static void freeze_ref(args_t *args, bcf1_t *rec)
436 {
437 if ( args->fa_frz_pos >= rec->pos + rec->rlen - 1 ) return;
438 args->fa_frz_pos = rec->pos + rec->rlen - 1;
439 args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen;
440 }
mark_del(char * ref,int rlen,char * alt,int mark)441 static char *mark_del(char *ref, int rlen, char *alt, int mark)
442 {
443 char *out = malloc(rlen+1);
444 int i;
445 if ( alt )
446 {
447 int nalt = strlen(alt);
448 for (i=0; i<nalt; i++) out[i] = alt[i];
449 }
450 else // symbolic <DEL>
451 {
452 int nref = strlen(ref);
453 for (i=0; i<nref; i++) out[i] = ref[i];
454 }
455 for (; i<rlen; i++) out[i] = mark;
456 out[rlen] = 0;
457 return out;
458 }
mark_ins(char * ref,char * alt,char mark)459 static void mark_ins(char *ref, char *alt, char mark)
460 {
461 int i, nref = strlen(ref), nalt = strlen(alt);
462 if ( mark=='l' )
463 for (i=nref; i<nalt; i++) alt[i] = tolower(alt[i]);
464 else
465 for (i=nref; i<nalt; i++) alt[i] = toupper(alt[i]);
466 }
mark_snv(char * ref,char * alt,char mark)467 static void mark_snv(char *ref, char *alt, char mark)
468 {
469 int i, nref = strlen(ref), nalt = strlen(alt);
470 int n = nref < nalt ? nref : nalt;
471 if ( mark=='l' )
472 {
473 for (i=0; i<n; i++)
474 if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = tolower(alt[i]);
475 }
476 else
477 {
478 for (i=0; i<n; i++)
479 if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
480 }
481 }
apply_variant(args_t * args,bcf1_t * rec)482 static void apply_variant(args_t *args, bcf1_t *rec)
483 {
484 static int warned_haplotype = 0;
485
486 if ( args->absent_allele ) apply_absent(args, rec->pos);
487 if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; }
488
489 int i,j;
490 if ( args->mask )
491 {
492 char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
493 int start = rec->pos;
494 int end = rec->pos + rec->rlen - 1;
495 for (i=0; i<args->nmask; i++)
496 {
497 mask_t *mask = &args->mask[i];
498 if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return;
499 }
500 }
501
502 int ialt = 1; // the alternate allele
503 if ( args->isample >= 0 )
504 {
505 bcf_unpack(rec, BCF_UN_FMT);
506 bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
507 if ( !fmt ) return;
508
509 if ( fmt->type!=BCF_BT_INT8 )
510 error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
511 uint8_t *ptr = fmt->p + fmt->size*args->isample;
512
513 enum { use_hap, use_iupac, pick_one } action = use_hap;
514 if ( args->allele==PICK_IUPAC )
515 {
516 if ( !args->haplotype ) action = use_iupac;
517 if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac;
518 }
519 else if ( args->output_iupac ) action = use_iupac;
520 else if ( !args->haplotype ) action = pick_one;
521
522 if ( action==use_hap )
523 {
524 if ( args->haplotype > fmt->n )
525 {
526 if ( bcf_gt_is_missing(ptr[fmt->n-1]) || bcf_gt_is_missing(ptr[0]) )
527 {
528 if ( !args->missing_allele ) return;
529 ialt = -1;
530 }
531 else
532 {
533 if ( !warned_haplotype )
534 {
535 fprintf(bcftools_stderr, "Can't apply %d-th haplotype at %s:%"PRId64". (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
536 warned_haplotype = 1;
537 }
538 return;
539 }
540 }
541 else
542 {
543 ialt = (int8_t)ptr[args->haplotype-1];
544 if ( bcf_gt_is_missing(ialt) || ialt==bcf_int8_vector_end )
545 {
546 if ( !args->missing_allele ) return;
547 ialt = -1;
548 }
549 else
550 ialt = bcf_gt_allele(ialt);
551 }
552 }
553 else if ( action==use_iupac )
554 {
555 ialt = -1;
556 int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
557 for (i=0; i<fmt->n; i++)
558 {
559 if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; }
560 if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
561 int jalt = bcf_gt_allele(ptr[i]);
562 if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
563 if ( fallback_alt <= 0 ) fallback_alt = jalt;
564
565 int l = strlen(rec->d.allele[jalt]);
566 for (j=0; j<l; j++)
567 if ( iupac2bitmask(rec->d.allele[jalt][j]) < 0 ) break;
568 if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
569
570 if ( l > mlen )
571 {
572 hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
573 for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
574 mlen = l;
575 }
576 if ( jalt>0 && l>alen )
577 {
578 alen = l;
579 ialt = jalt;
580 }
581 for (j=0; j<l; j++)
582 args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]);
583 }
584 if ( alen > 0 )
585 for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
586 else if ( fallback_alt >= 0 )
587 ialt = fallback_alt;
588 else if ( is_missing && !args->missing_allele ) return;
589 }
590 else
591 {
592 int is_hom = 1;
593 for (i=0; i<fmt->n; i++)
594 {
595 if ( bcf_gt_is_missing(ptr[i]) )
596 {
597 if ( !args->missing_allele ) return; // ignore missing or half-missing genotypes
598 ialt = -1;
599 break;
600 }
601 if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
602 ialt = bcf_gt_allele(ptr[i]);
603 if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
604 }
605 if ( !is_hom )
606 {
607 int prev_len = 0, jalt;
608 for (i=0; i<fmt->n; i++)
609 {
610 if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
611 jalt = bcf_gt_allele(ptr[i]);
612 if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
613 if ( args->allele & (PICK_LONG|PICK_SHORT) )
614 {
615 int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]);
616 if ( i==0 ) ialt = jalt, prev_len = len;
617 else if ( len == prev_len )
618 {
619 if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len;
620 else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len;
621 }
622 else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len;
623 else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len;
624 }
625 else
626 {
627 if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt;
628 else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt;
629 }
630 }
631 }
632 }
633 if ( !ialt )
634 {
635 // ref allele
636 if ( args->absent_allele ) freeze_ref(args,rec);
637 return;
638 }
639 if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
640 }
641 else if ( args->output_iupac && rec->n_allele>1 )
642 {
643 int ialt, alen = 0, mlen = 0;
644 for (i=0; i<rec->n_allele; i++)
645 {
646 int l = strlen(rec->d.allele[i]);
647 for (j=0; j<l; j++)
648 if ( iupac2bitmask(rec->d.allele[i][j]) < 0 ) break;
649 if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
650
651 if ( l > mlen )
652 {
653 hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
654 for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
655 mlen = l;
656 }
657 if ( i>0 && l>alen )
658 {
659 alen = l;
660 ialt = i;
661 }
662 for (j=0; j<l; j++)
663 args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]);
664 }
665 if ( alen > 0 )
666 for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
667 else
668 ialt = 1;
669 }
670
671 if ( rec->n_allele==1 && ialt!=-1 )
672 {
673 // non-missing reference
674 if ( args->absent_allele ) freeze_ref(args,rec);
675 return;
676 }
677 if ( ialt==-1 )
678 {
679 char alleles[4];
680 alleles[0] = rec->d.allele[0][0];
681 alleles[1] = ',';
682 alleles[2] = args->missing_allele;
683 alleles[3] = 0;
684 bcf_update_alleles_str(args->hdr, rec, alleles);
685 ialt = 1;
686 }
687
688 // For some variant types POS+REF refer to the base *before* the event; in such case set trim_beg
689 int trim_beg = 0;
690 int var_type = bcf_get_variant_type(rec,ialt);
691 int var_len = rec->d.var[ialt].n;
692 if ( var_type & VCF_INDEL )
693 {
694 // normally indel starts one base after, but not if the first base of the fa reference is deleted
695 if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] )
696 trim_beg = 1;
697 else
698 trim_beg = 0;
699 }
700 else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"<DEL>") )
701 {
702 trim_beg = 1;
703 var_len = 1 - rec->rlen;
704 }
705 else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"<INS",4) ) trim_beg = 1;
706
707 // Overlapping variant?
708 if ( rec->pos <= args->fa_frz_pos )
709 {
710 // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
711 // This still may not be enough for more complicated cases with multiple duplicate positions
712 // and other types in between. In such case let the user normalize the VCF and remove duplicates.
713
714 int overlap = 0;
715 if ( rec->pos < args->fa_frz_pos || !trim_beg || var_len==0 || args->prev_is_insert ) overlap = 1;
716
717 if ( overlap )
718 {
719 fprintf(bcftools_stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
720 return;
721 }
722
723 }
724
725 char *alt_allele = rec->d.allele[ialt];
726 int rmme_alt = 0;
727
728 int len_diff = 0, alen = 0;
729 int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
730 if ( idx<0 )
731 {
732 fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
733 return;
734 }
735 if ( rec->rlen > args->fa_buf.l - idx )
736 {
737 rec->rlen = args->fa_buf.l - idx;
738 alen = strlen(alt_allele);
739 if ( alen > rec->rlen )
740 {
741 alt_allele[rec->rlen] = 0;
742 fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
743 }
744 }
745 if ( idx>=args->fa_buf.l )
746 error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
747
748 // sanity check the reference base
749 if ( alt_allele[0]=='<' )
750 {
751 // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG
752
753 if ( strcasecmp(alt_allele,"<DEL>") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"<NON_REF>") )
754 error("Symbolic alleles other than <DEL>, <*> or <NON_REF> are currently not supported, e.g. %s at %s:%"PRId64".\n"
755 "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n",
756 alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
757 if ( !strcasecmp(alt_allele,"<DEL>") )
758 {
759 static int multibase_ref_del_warned = 0;
760 if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
761 {
762 fprintf(bcftools_stderr,
763 "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
764 " (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
765 multibase_ref_del_warned = 1;
766 }
767 if ( args->mark_del ) // insert dashes instead of delete sequence
768 {
769 alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
770 alen = rec->rlen;
771 len_diff = 0;
772 rmme_alt = 1;
773 }
774 else
775 {
776 len_diff = 1-rec->rlen;
777 alt_allele = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event
778 alen = 1;
779 }
780 }
781 else
782 {
783 // <*> or <NON_REF> .. gVCF, evidence for the reference allele throughout the whole block
784 freeze_ref(args,rec);
785 return;
786 }
787 }
788 else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
789 {
790 // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA),
791 // the reference base in fa_buf is lost and the check fails. We do not keep a buffer
792 // with the original sequence as it should not be necessary, we should encounter max
793 // one base overlap
794
795 int fail = 1;
796 if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) )
797 {
798 if ( rec->rlen==1 ) fail = 0;
799 else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
800 }
801
802 if ( fail )
803 {
804 char tmp = 0;
805 if ( args->fa_buf.l - idx > rec->rlen )
806 {
807 tmp = args->fa_buf.s[idx+rec->rlen];
808 args->fa_buf.s[idx+rec->rlen] = 0;
809 }
810 error(
811 "The fasta sequence does not match the REF allele at %s:%"PRId64":\n"
812 " REF .vcf: [%s]\n"
813 " ALT .vcf: [%s]\n"
814 " REF .fa : [%s]%c%s\n",
815 bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
816 tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
817 );
818 }
819 alen = strlen(alt_allele);
820 len_diff = alen - rec->rlen;
821
822 if ( args->mark_del && len_diff<0 )
823 {
824 alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
825 alen = rec->rlen;
826 len_diff = 0;
827 rmme_alt = 1;
828 }
829 }
830 else
831 {
832 alen = strlen(alt_allele);
833 len_diff = alen - rec->rlen;
834
835 if ( args->mark_del && len_diff<0 )
836 {
837 alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
838 alen = rec->rlen;
839 len_diff = 0;
840 rmme_alt = 1;
841 }
842 }
843
844 args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
845 if ( args->fa_case==TO_UPPER )
846 for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
847 else
848 for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);
849
850 if ( args->mark_ins && len_diff>0 )
851 mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
852 if ( args->mark_snv )
853 mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
854
855 if ( len_diff <= 0 )
856 {
857 // deletion or same size event
858 assert( args->fa_buf.l >= idx+rec->rlen );
859 args->prev_base = args->fa_buf.s[idx+rec->rlen-1];
860 args->prev_base_pos = rec->pos + rec->rlen - 1;
861 args->prev_is_insert = 0;
862 args->fa_frz_mod = idx + alen;
863
864 for (i=trim_beg; i<alen; i++)
865 args->fa_buf.s[idx+i] = alt_allele[i];
866
867 if ( len_diff )
868 memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
869 }
870 else
871 {
872 args->prev_is_insert = 1;
873 args->prev_base_pos = rec->pos;
874
875 // insertion
876 ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
877 memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
878
879 // This can get tricky, make sure the bases unchanged by the insertion do not overwrite preceeding variants.
880 // For example, here we want to get TAA:
881 // POS REF ALT
882 // 1 C T
883 // 1 C CAA
884 int ibeg = 0;
885 while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++;
886 for (i=ibeg; i<alen; i++)
887 args->fa_buf.s[idx+i] = alt_allele[i];
888
889 args->fa_frz_mod = idx + alen - ibeg + 1;
890 }
891 if (args->chain && len_diff != 0)
892 {
893 // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
894 if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
895 {
896 // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
897 push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
898 }
899 else
900 {
901 // otherwise, just the coordinates of the variant as given
902 push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
903 }
904 }
905 args->fa_buf.l += len_diff;
906 args->fa_mod_off += len_diff;
907 args->fa_frz_pos = rec->pos + rec->rlen - 1;
908 args->napplied++;
909 if ( rmme_alt ) free(alt_allele);
910 }
911
912
mask_region(args_t * args,char * seq,int len)913 static void mask_region(args_t *args, char *seq, int len)
914 {
915 int start = args->fa_src_pos - len;
916 int end = args->fa_src_pos;
917 int i;
918
919 for (i=0; i<args->nmask; i++)
920 {
921 mask_t *mask = &args->mask[i];
922 if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue;
923
924 int idx_start, idx_end, j;
925 while ( regitr_overlap(mask->itr) )
926 {
927 idx_start = mask->itr->beg - start;
928 idx_end = mask->itr->end - start;
929 if ( idx_start < 0 ) idx_start = 0;
930 if ( idx_end >= len ) idx_end = len - 1;
931 if ( mask->with==MASK_UC )
932 for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]);
933 else if ( mask->with==MASK_LC )
934 for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]);
935 else
936 for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with;
937 }
938 }
939 }
940
consensus(args_t * args)941 static void consensus(args_t *args)
942 {
943 BGZF *fasta = bgzf_open(args->ref_fname, "r");
944 if ( !fasta ) error("Error reading %s\n", args->ref_fname);
945 kstring_t str = {0,0,0};
946 while ( bgzf_getline(fasta, '\n', &str) > 0 )
947 {
948 if ( str.s[0]=='>' )
949 {
950 // new sequence encountered
951 if (args->chain) {
952 print_chain(args);
953 destroy_chain(args);
954 }
955 // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
956 bcf1_t **rec_ptr = NULL;
957 while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
958 {
959 bcf1_t *rec = *rec_ptr;
960 if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
961 apply_variant(args, rec);
962 }
963 if ( args->absent_allele )
964 {
965 int pos = 0;
966 if ( args->vcf_rbuf.n && args->vcf_buf[args->vcf_rbuf.f]->rid==args->rid )
967 pos = args->vcf_buf[args->vcf_rbuf.f]->pos;
968 apply_absent(args, pos);
969 }
970 flush_fa_buffer(args, 0);
971 init_region(args, str.s+1);
972 continue;
973 }
974 args->fa_length += str.l;
975 args->fa_src_pos += str.l;
976
977 // determine if uppercase or lowercase is used in this fasta file
978 if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0;
979
980 if ( args->mask ) mask_region(args, str.s, str.l);
981 kputs(str.s, &args->fa_buf);
982
983 bcf1_t **rec_ptr = NULL;
984 while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
985 {
986 bcf1_t *rec = *rec_ptr;
987
988 // still the same chr and the same region? if not, fasta buf can be flushed
989 if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) )
990 {
991 // save the vcf record until next time and flush
992 unread_vcf_line(args, rec_ptr);
993 rec_ptr = NULL;
994 break;
995 }
996
997 // is the vcf record well beyond cached fasta buffer? if yes, the buf can be flushed
998 if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos )
999 {
1000 unread_vcf_line(args, rec_ptr);
1001 rec_ptr = NULL;
1002 break;
1003 }
1004
1005 // is the cached fasta buffer full enough? if not, read more fasta, no flushing
1006 if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off < rec->pos + rec->rlen )
1007 {
1008 unread_vcf_line(args, rec_ptr);
1009 break;
1010 }
1011 apply_variant(args, rec);
1012 }
1013 if ( !rec_ptr )
1014 {
1015 if ( args->absent_allele ) apply_absent(args, args->fa_ori_pos - args->fa_mod_off + args->fa_buf.l);
1016 flush_fa_buffer(args, 60);
1017 }
1018 }
1019 bcf1_t **rec_ptr = NULL;
1020 while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
1021 {
1022 bcf1_t *rec = *rec_ptr;
1023 if ( rec->rid!=args->rid ) break;
1024 if ( args->fa_end_pos && rec->pos > args->fa_end_pos ) break;
1025 if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
1026 apply_variant(args, rec);
1027 }
1028 if (args->chain)
1029 {
1030 print_chain(args);
1031 destroy_chain(args);
1032 }
1033 if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
1034 flush_fa_buffer(args, 0);
1035 bgzf_close(fasta);
1036 free(str.s);
1037 fprintf(bcftools_stderr,"Applied %d variants\n", args->napplied);
1038 }
1039
usage(args_t * args)1040 static void usage(args_t *args)
1041 {
1042 fprintf(bcftools_stderr, "\n");
1043 fprintf(bcftools_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n");
1044 fprintf(bcftools_stderr, " file. By default, the program will apply all ALT variants. Using the\n");
1045 fprintf(bcftools_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
1046 fprintf(bcftools_stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
1047 fprintf(bcftools_stderr, " information, such as INFO/AD or FORMAT/AD.\n");
1048 fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
1049 fprintf(bcftools_stderr, "Options:\n");
1050 fprintf(bcftools_stderr, " -c, --chain FILE write a chain file for liftover\n");
1051 fprintf(bcftools_stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n");
1052 fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
1053 fprintf(bcftools_stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n");
1054 fprintf(bcftools_stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n");
1055 fprintf(bcftools_stderr, " the codes are case-insensitive:\n");
1056 fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n");
1057 fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n");
1058 fprintf(bcftools_stderr, " R: REF allele in het genotypes\n");
1059 fprintf(bcftools_stderr, " A: ALT allele\n");
1060 fprintf(bcftools_stderr, " I: IUPAC code for all genotypes\n");
1061 fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
1062 fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
1063 fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
1064 fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
1065 fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
1066 fprintf(bcftools_stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n");
1067 fprintf(bcftools_stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
1068 fprintf(bcftools_stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
1069 fprintf(bcftools_stderr, " -m, --mask FILE replace regions according to the next --mask-with option. The default is --mask-with N\n");
1070 fprintf(bcftools_stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
1071 fprintf(bcftools_stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n");
1072 fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n");
1073 fprintf(bcftools_stderr, " -p, --prefix STRING prefix to add to output sequence names\n");
1074 fprintf(bcftools_stderr, " -s, --sample NAME apply variants of the given sample\n");
1075 fprintf(bcftools_stderr, "Examples:\n");
1076 fprintf(bcftools_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
1077 fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n");
1078 fprintf(bcftools_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
1079 fprintf(bcftools_stderr, "\n");
1080 bcftools_exit(1);
1081 }
1082
main_consensus(int argc,char * argv[])1083 int main_consensus(int argc, char *argv[])
1084 {
1085 args_t *args = (args_t*) calloc(1,sizeof(args_t));
1086 args->argc = argc; args->argv = argv;
1087
1088 static struct option loptions[] =
1089 {
1090 {"mark-del",required_argument,NULL,1},
1091 {"mark-ins",required_argument,NULL,2},
1092 {"mark-snv",required_argument,NULL,3},
1093 {"mask-with",1,0,4},
1094 {"exclude",required_argument,NULL,'e'},
1095 {"include",required_argument,NULL,'i'},
1096 {"sample",1,0,'s'},
1097 {"iupac-codes",0,0,'I'},
1098 {"haplotype",1,0,'H'},
1099 {"output",1,0,'o'},
1100 {"fasta-ref",1,0,'f'},
1101 {"mask",1,0,'m'},
1102 {"missing",1,0,'M'},
1103 {"absent",1,0,'a'},
1104 {"chain",1,0,'c'},
1105 {"prefix",required_argument,0,'p'},
1106 {0,0,0,0}
1107 };
1108 int c;
1109 while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
1110 {
1111 switch (c)
1112 {
1113 case 1 : args->mark_del = optarg[0]; break;
1114 case 2 :
1115 if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
1116 else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
1117 else error("The argument is not recognised: --mark-ins %s\n",optarg);
1118 break;
1119 case 3 :
1120 if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
1121 else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
1122 else error("The argument is not recognised: --mark-snv %s\n",optarg);
1123 break;
1124 case 'p': args->chr_prefix = optarg; break;
1125 case 's': args->sample = optarg; break;
1126 case 'o': args->output_fname = optarg; break;
1127 case 'I': args->output_iupac = 1; break;
1128 case 'e':
1129 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1130 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
1131 case 'i':
1132 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1133 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
1134 case 'f': args->ref_fname = optarg; break;
1135 case 'm': add_mask(args,optarg); break;
1136 case 4 : add_mask_with(args,optarg); break;
1137 case 'a':
1138 args->absent_allele = optarg[0];
1139 if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
1140 break;
1141 case 'M':
1142 args->missing_allele = optarg[0];
1143 if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
1144 break;
1145 case 'c': args->chain_fname = optarg; break;
1146 case 'H':
1147 if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
1148 else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
1149 else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
1150 else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF;
1151 else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF;
1152 else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
1153 else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
1154 else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
1155 else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC;
1156 else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1;
1157 else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2;
1158 else
1159 {
1160 char *tmp;
1161 args->haplotype = strtol(optarg, &tmp, 10);
1162 if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg);
1163 if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n");
1164 }
1165 break;
1166 default: usage(args); break;
1167 }
1168 }
1169 if ( optind>=argc ) usage(args);
1170 args->fname = argv[optind];
1171
1172 if ( !args->ref_fname && !isatty(fileno((FILE *)stdin)) ) args->ref_fname = "-";
1173 if ( !args->ref_fname ) usage(args);
1174
1175 init_data(args);
1176 consensus(args);
1177 destroy_data(args);
1178 free(args);
1179
1180 return 0;
1181 }
1182
1183
1184