1 #include "bcftools.pysam.h"
2
3 /* vcfconvert.c -- convert between VCF/BCF and related formats.
4
5 Copyright (C) 2013-2021 Genome Research Ltd.
6
7 Author: Petr Danecek <pd3@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE. */
26
27 #include <stdio.h>
28 #include <strings.h>
29 #include <unistd.h>
30 #include <getopt.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <inttypes.h>
37 #include <htslib/faidx.h>
38 #include <htslib/vcf.h>
39 #include <htslib/bgzf.h>
40 #include <htslib/synced_bcf_reader.h>
41 #include <htslib/vcfutils.h>
42 #include <htslib/kseq.h>
43 #include "bcftools.h"
44 #include "filter.h"
45 #include "convert.h"
46 #include "tsv2vcf.h"
47
48 // Logic of the filters: include or exclude sites which match the filters?
49 #define FLT_INCLUDE 1
50 #define FLT_EXCLUDE 2
51
52 typedef struct _args_t args_t;
53 struct _args_t
54 {
55 faidx_t *ref;
56 filter_t *filter;
57 char *filter_str;
58 int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
59 convert_t *convert;
60 bcf_srs_t *files;
61 bcf_hdr_t *header;
62 void (*convert_func)(struct _args_t *);
63 struct {
64 int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
65 } n;
66 kstring_t str;
67 int32_t *gts;
68 float *flt;
69 int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
70 int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
71 int regions_overlap, targets_overlap;
72 char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
73 char *outfname, *infname, *ref_fname, *sex_fname;
74 int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
75 };
76
destroy_data(args_t * args)77 static void destroy_data(args_t *args)
78 {
79 if ( args->ref ) fai_destroy(args->ref);
80 if ( args->convert) convert_destroy(args->convert);
81 if ( args->filter ) filter_destroy(args->filter);
82 free(args->samples);
83 if ( args->files ) bcf_sr_destroy(args->files);
84 }
85
open_vcf(args_t * args,const char * format_str)86 static void open_vcf(args_t *args, const char *format_str)
87 {
88 args->files = bcf_sr_init();
89 if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)!=0 )
90 error("Could not initialize --threads %d\n", args->n_threads);
91
92 if ( args->regions_list )
93 {
94 bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
95 if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
96 error("Failed to read the regions: %s\n", args->regions_list);
97 }
98 if ( args->targets_list )
99 {
100 bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
101 if ( bcf_sr_set_targets(args->files, args->targets_list, args->targets_is_file, 0)<0 )
102 error("Failed to read the targets: %s\n", args->targets_list);
103 }
104 if ( !bcf_sr_add_reader(args->files, args->infname) )
105 error("Failed to open %s: %s\n", args->infname,bcf_sr_strerror(args->files->errnum));
106
107 args->header = args->files->readers[0].header;
108
109 if ( args->filter_str )
110 args->filter = filter_init(args->header, args->filter_str);
111
112 int i, nsamples = 0, *samples = NULL;
113 if ( args->sample_list && strcmp("-",args->sample_list) )
114 {
115 for (i=0; i<args->files->nreaders; i++)
116 {
117 int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
118 if ( ret<0 ) error("Error parsing the sample list\n");
119 else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
120 }
121
122 if ( args->sample_list[0]!='^' )
123 {
124 // the sample ordering may be different if not negated
125 int n;
126 char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
127 if ( !smpls ) error("Could not parse %s\n", args->sample_list);
128 if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
129 error("The number of samples does not match, perhaps some are present multiple times?\n");
130 nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
131 samples = (int*) malloc(sizeof(int)*nsamples);
132 for (i=0; i<n; i++)
133 {
134 samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
135 free(smpls[i]);
136 }
137 free(smpls);
138 }
139 }
140 if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str);
141 free(samples);
142 }
143
tsv_setter_chrom_pos_ref_alt(tsv_t * tsv,bcf1_t * rec,void * usr)144 static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
145 {
146 args_t *args = (args_t*) usr;
147
148 char tmp, *se = tsv->ss, *ss = tsv->ss;
149 while ( se < tsv->se && *se!=':' ) se++;
150 if ( *se!=':' ) error("Could not parse CHROM in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
151 tmp = *se; *se = 0;
152 rec->rid = bcf_hdr_name2id(args->header,ss);
153 if ( rec->rid<0 ) error("Could not determine sequence name or multiple sequences present: %s\n", tsv->ss);
154 *se = tmp;
155
156 // POS
157 rec->pos = strtol(se+1,&ss,10);
158 if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss);
159 rec->pos--;
160
161 // ID
162 if ( args->output_vcf_ids )
163 {
164 char tmp = *tsv->se;
165 *tsv->se = 0;
166 bcf_update_id(args->header, rec, tsv->ss);
167 *tsv->se = tmp;
168 }
169
170 // REF,ALT
171 args->str.l = 0;
172 se = ++ss;
173 while ( se < tsv->se && *se!='_' ) se++;
174 if ( *se!='_' ) error("Could not parse REF in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
175 kputsn(ss,se-ss,&args->str);
176 ss = ++se;
177 while ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) se++;
178 if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) error("Could not parse ALT in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
179 kputc(',',&args->str);
180 kputsn(ss,se-ss,&args->str);
181 bcf_update_alleles_str(args->header, rec, args->str.s);
182
183 // END - optional
184 if (*se && *se=='_') {
185 long end = strtol(se+1,&ss,10);
186 if ( ss==se+1 ) error("Could not parse END in CHROM:POS_REF_ALT_END: %s\n", tsv->ss);
187 bcf_update_info_int32(args->header, rec, "END", &end, 1);
188 }
189
190 return 0;
191 }
tsv_setter_verify_pos(tsv_t * tsv,bcf1_t * rec,void * usr)192 static int tsv_setter_verify_pos(tsv_t *tsv, bcf1_t *rec, void *usr)
193 {
194 char *se;
195 int pos = strtol(tsv->ss,&se,10);
196 if ( tsv->ss==se ) error("Could not parse POS: %s\n", tsv->ss);
197 if ( rec->pos != pos-1 ) error("POS mismatch: %s\n", tsv->ss);
198 return 0;
199 }
tsv_setter_verify_ref_alt(tsv_t * tsv,bcf1_t * rec,void * usr)200 static int tsv_setter_verify_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
201 {
202 args_t *args = (args_t*) usr;
203 args->rev_als = 0;
204 char tmp = *tsv->se; *tsv->se = 0;
205 if ( strcmp(tsv->ss,rec->d.allele[0]) )
206 {
207 if ( strcmp(tsv->ss,rec->d.allele[1]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[1]); }
208 args->rev_als = 1;
209 }
210 *tsv->se = tmp;
211 while ( *tsv->se && isspace(*tsv->se) ) tsv->se++;
212 tsv->ss = tsv->se;
213 while ( *tsv->se && !isspace(*tsv->se) ) tsv->se++;
214 tmp = *tsv->se; *tsv->se = 0;
215 if ( !args->rev_als && strcmp(tsv->ss,rec->d.allele[1]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[1]); }
216 else if ( args->rev_als && strcmp(tsv->ss,rec->d.allele[0]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[0]); }
217 *tsv->se = tmp;
218 return 0;
219 }
tsv_setter_gt_gp(tsv_t * tsv,bcf1_t * rec,void * usr)220 static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
221 {
222 args_t *args = (args_t*) usr;
223 int i, nsamples = bcf_hdr_nsamples(args->header);
224 for (i=0; i<nsamples; i++)
225 {
226 float aa,ab,bb;
227 aa = strtod(tsv->ss, &tsv->se);
228 if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse first value of %d-th sample\n", i+1); return -1; }
229 tsv->ss = tsv->se+1;
230 ab = strtod(tsv->ss, &tsv->se);
231 if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse second value of %d-th sample\n", i+1); return -1; }
232 tsv->ss = tsv->se+1;
233 bb = strtod(tsv->ss, &tsv->se);
234 if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse third value of %d-th sample\n", i+1); return -1; }
235 tsv->ss = tsv->se+1;
236
237 if ( args->rev_als ) { float tmp = bb; bb = aa; aa = tmp; }
238 args->flt[3*i+0] = aa;
239 args->flt[3*i+1] = ab;
240 args->flt[3*i+2] = bb;
241
242 if ( aa >= ab )
243 {
244 if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
245 else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
246 }
247 else if ( ab >= bb )
248 {
249 args->gts[2*i+0] = bcf_gt_unphased(0);
250 args->gts[2*i+1] = bcf_gt_unphased(1);
251 }
252 else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
253 }
254 if ( *tsv->se ) error("Could not parse: %s\n", tsv->ss);
255 if ( bcf_update_genotypes(args->header,rec,args->gts,nsamples*2) ) error("Could not update GT field\n");
256 if ( bcf_update_format_float(args->header,rec,"GP",args->flt,nsamples*3) ) error("Could not update GP field\n");
257 return 0;
258 }
tsv_setter_haps(tsv_t * tsv,bcf1_t * rec,void * usr)259 static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
260 {
261 args_t *args = (args_t*) usr;
262 int i, nsamples = bcf_hdr_nsamples(args->header);
263
264 int32_t a0, a1;
265 if ( args->rev_als ) { a0 = bcf_gt_phased(1); a1 = bcf_gt_phased(0); }
266 else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
267
268 // up is short for "unphased"
269 int nup = 0;
270 for (i=0; i<nsamples; i++)
271 {
272 char *ss = tsv->ss + 4*i + nup;
273 int up = 0, all;
274
275 for (all=0; all < 2; all++){
276 // checking for premature ending
277 if ( !ss[0] || !ss[1] || !ss[2] ||
278 (up && (!ss[3] || !ss[4]) ) )
279 {
280 fprintf(bcftools_stderr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]);
281 return -1;
282 }
283
284 switch(ss[all*2+up]){
285 case '0':
286 args->gts[2*i+all] = a0;
287 break;
288 case '1' :
289 args->gts[2*i+all] = a1;
290 break;
291 case '?' :
292 // there is no macro to express phased missing allele
293 args->gts[2*i+all] = bcf_gt_phased(-1);
294 break;
295 case '-' :
296 args->gts[2*i+all] = bcf_int32_vector_end;
297 break;
298 default :
299 fprintf(bcftools_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
300 return -1;
301 }
302 if( ss[all*2+up+1]=='*' ) up = up + 1;
303 }
304
305 if(up && up != 2)
306 {
307 fprintf(bcftools_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
308 return -1;
309 }
310
311 // change alleles to unphased if the alleles are unphased
312 if ( up )
313 {
314 args->gts[2*i] = bcf_gt_unphased(bcf_gt_allele(args->gts[2*i]));
315 args->gts[2*i+1] = bcf_gt_unphased(bcf_gt_allele(args->gts[2*i+1]));
316 }
317 nup = nup + up;
318 }
319 if ( tsv->ss[(nsamples-1)*4+3+nup] )
320 {
321 fprintf(bcftools_stderr,"nup: %d", nup);
322 fprintf(bcftools_stderr,"Wrong number of fields (%d-th column = [%c]). ", nsamples*2,tsv->ss[(nsamples-1)*4+nup]);
323 return -1;
324 }
325
326 if ( bcf_update_genotypes(args->header,rec,args->gts,nsamples*2) ) error("Could not update GT field\n");
327 return 0;
328 }
gensample_to_vcf(args_t * args)329 static void gensample_to_vcf(args_t *args)
330 {
331 /*
332 * Inpute: IMPUTE2 output (indentation changed here for clarity):
333 *
334 * 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ...
335 * --- 20:62116698_C_A 62116698 C A 1 0 0 ...
336 *
337 * Second column is expected in the form of CHROM:POS_REF_ALT. We use second
338 * column because the first can be empty ("--") when filling sites from reference
339 * panel.
340 *
341 * Output: VCF with filled GT,GP
342 *
343 */
344 kstring_t line = {0,0,0};
345
346 char *gen_fname = NULL, *sample_fname = NULL;
347 sample_fname = strchr(args->infname,',');
348 if ( !sample_fname )
349 {
350 args->str.l = 0;
351 ksprintf(&args->str,"%s.gen.gz", args->infname);
352 gen_fname = strdup(args->str.s);
353 args->str.l = 0;
354 ksprintf(&args->str,"%s.samples", args->infname);
355 sample_fname = strdup(args->str.s);
356 }
357 else
358 {
359 *sample_fname = 0;
360 gen_fname = strdup(args->infname);
361 sample_fname = strdup(sample_fname+1);
362 }
363 htsFile *gen_fh = hts_open(gen_fname, "r");
364 if ( !gen_fh ) error("Could not read: %s\n", gen_fname);
365 if ( hts_getline(gen_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", gen_fname);
366
367 // Find out the chromosome name, sample names, init and print the VCF header
368 args->str.l = 0;
369 char *ss, *se = line.s;
370 while ( *se && !isspace(*se) ) se++;
371 if ( !*se ) error("Could not parse %s: %s\n", gen_fname,line.s);
372 ss = se+1;
373 se = strchr(ss,':');
374 if ( !se ) error("Expected CHROM:POS_REF_ALT in second column of %s\n", gen_fname);
375 kputsn(ss, se-ss, &args->str);
376
377 tsv_t *tsv = tsv_init("-,CHROM_POS_REF_ALT,POS,REF_ALT,GT_GP");
378 tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
379 tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL);
380 tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
381 tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args);
382
383 args->header = bcf_hdr_init("w");
384 bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
385 bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
386 bcf_hdr_append(args->header, "##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype Probabilities\">");
387 bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
388 if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
389
390 int i, nsamples;
391 char **samples = hts_readlist(sample_fname, 1, &nsamples);
392 if ( !samples ) error("Could not read %s\n", sample_fname);
393 for (i=2; i<nsamples; i++)
394 {
395 se = samples[i]; while ( *se && !isspace(*se) ) se++;
396 *se = 0;
397 bcf_hdr_add_sample(args->header,samples[i]);
398 }
399 for (i=0; i<nsamples; i++) free(samples[i]);
400 free(samples);
401
402 char wmode[8];
403 set_wmode(wmode,args->output_type,args->outfname,args->clevel);
404 htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
405 if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
406 if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
407 if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
408 bcf1_t *rec = bcf_init();
409
410 nsamples -= 2;
411 args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
412 args->flt = (float *) malloc(sizeof(float)*nsamples*3);
413
414 do
415 {
416 bcf_clear(rec);
417 args->n.total++;
418 if ( !tsv_parse(tsv, rec, line.s) )
419 {
420 if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
421 }
422 else
423 error("Error occurred while parsing: %s\n", line.s);
424 }
425 while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
426
427 if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
428 if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
429 bcf_hdr_destroy(args->header);
430 bcf_destroy(rec);
431 free(sample_fname);
432 free(gen_fname);
433 free(args->str.s);
434 free(line.s);
435 free(args->gts);
436 free(args->flt);
437 tsv_destroy(tsv);
438
439 fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total);
440 }
441
haplegendsample_to_vcf(args_t * args)442 static void haplegendsample_to_vcf(args_t *args)
443 {
444 /*
445 * Convert from IMPUTE2 hap/legend/sample output files to VCF
446 *
447 * hap:
448 * 0 1 0 1
449 * legend:
450 * id position a0 a1
451 * 1:186946386_G_T 186946386 G T
452 * sample:
453 * sample population group sex
454 * sample1 sample1 sample1 2
455 * sample2 sample2 sample2 2
456 *
457 * Output: VCF with filled GT
458 */
459 kstring_t line = {0,0,0};
460
461 char *hap_fname = NULL, *leg_fname = NULL, *sample_fname = NULL;
462 sample_fname = strchr(args->infname,',');
463 if ( !sample_fname )
464 {
465 args->str.l = 0;
466 ksprintf(&args->str,"%s.hap.gz", args->infname);
467 hap_fname = strdup(args->str.s);
468 args->str.l = 0;
469 ksprintf(&args->str,"%s.samples", args->infname);
470 sample_fname = strdup(args->str.s);
471 args->str.l = 0;
472 ksprintf(&args->str,"%s.legend.gz", args->infname);
473 leg_fname = strdup(args->str.s);
474 }
475 else
476 {
477 char *ss = sample_fname, *se = strchr(ss+1,',');
478 if ( !se ) error("Could not parse hap/legend/sample file names: %s\n", args->infname);
479 *ss = 0;
480 *se = 0;
481 hap_fname = strdup(args->infname);
482 leg_fname = strdup(ss+1);
483 sample_fname = strdup(se+1);
484 }
485 htsFile *hap_fh = hts_open(hap_fname, "r");
486 if ( !hap_fh ) error("Could not read: %s\n", hap_fname);
487
488 htsFile *leg_fh = hts_open(leg_fname,"r");
489 if ( !leg_fh ) error("Could not read: %s\n", leg_fname);
490
491 // Eat up first legend line, then determine chromosome name
492 if ( hts_getline(leg_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", leg_fname);
493 if ( hts_getline(leg_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", leg_fname);
494
495 // Find out the chromosome name, sample names, init and print the VCF header
496 args->str.l = 0;
497 char *se = strchr(line.s,':');
498 if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", leg_fname);
499 kputsn(line.s, se-line.s, &args->str);
500
501 tsv_t *leg_tsv = tsv_init("CHROM_POS_REF_ALT,POS,REF_ALT");
502 tsv_register(leg_tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
503 tsv_register(leg_tsv, "POS", tsv_setter_verify_pos, NULL);
504 tsv_register(leg_tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
505
506 tsv_t *hap_tsv = tsv_init("HAPS");
507 tsv_register(hap_tsv, "HAPS", tsv_setter_haps, args);
508
509 args->header = bcf_hdr_init("w");
510 bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
511 bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
512 bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
513 if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
514
515 int i, nrows, nsamples;
516 char **samples = hts_readlist(sample_fname, 1, &nrows);
517 if ( !samples ) error("Could not read %s\n", sample_fname);
518 nsamples = nrows - 1;
519
520 // sample_fname should contain a header line, so need to ignore first row
521 // returned from hts_readlist (i=1, and not i=0)
522 for (i=1; i<nrows; i++)
523 {
524 se = samples[i]; while ( *se && !isspace(*se) ) se++;
525 *se = 0;
526 bcf_hdr_add_sample(args->header,samples[i]);
527 }
528 bcf_hdr_add_sample(args->header,NULL);
529 for (i=0; i<nrows; i++) free(samples[i]);
530 free(samples);
531
532 char wmode[8];
533 set_wmode(wmode,args->output_type,args->outfname,args->clevel);
534 htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
535 if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
536 if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
537 if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
538 bcf1_t *rec = bcf_init();
539
540 args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
541
542 while (1)
543 {
544 bcf_clear(rec);
545 args->n.total++;
546 if ( tsv_parse(leg_tsv, rec, line.s) )
547 error("Error occurred while parsing %s: %s\n", leg_fname,line.s);
548
549 if ( hts_getline(hap_fh, KS_SEP_LINE, &line)<=0 )
550 error("Different number of records in %s and %s?\n", leg_fname,hap_fname);
551
552 if ( tsv_parse(hap_tsv, rec, line.s) )
553 error("Error occurred while parsing %s: %s\n", hap_fname,line.s);
554
555 if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
556
557 if ( hts_getline(leg_fh, KS_SEP_LINE, &line)<=0 )
558 {
559 if ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 )
560 error("Different number of records in %s and %s?\n", leg_fname,hap_fname);
561 break;
562 }
563 }
564
565 if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
566 if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
567 if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
568 bcf_hdr_destroy(args->header);
569 bcf_destroy(rec);
570 free(sample_fname);
571 free(hap_fname);
572 free(leg_fname);
573 free(args->str.s);
574 free(line.s);
575 free(args->gts);
576 tsv_destroy(hap_tsv);
577 tsv_destroy(leg_tsv);
578
579 fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total);
580 }
581
hapsample_to_vcf(args_t * args)582 static void hapsample_to_vcf(args_t *args)
583 {
584 /*
585 * Input: SHAPEIT output
586 *
587 * 20:19995888_A_G 20:19995888 19995888 A G 0 0 0 0 ...
588 *
589 * First column is expected in the form of CHROM:POS_REF_ALT
590 *
591 * Output: VCF with filled GT
592 *
593 */
594 kstring_t line = {0,0,0};
595
596 char *hap_fname = NULL, *sample_fname = NULL;
597 sample_fname = strchr(args->infname,',');
598 if ( !sample_fname )
599 {
600 args->str.l = 0;
601 ksprintf(&args->str,"%s.hap.gz", args->infname);
602 hap_fname = strdup(args->str.s);
603 args->str.l = 0;
604 ksprintf(&args->str,"%s.samples", args->infname);
605 sample_fname = strdup(args->str.s);
606 }
607 else
608 {
609 *sample_fname = 0;
610 hap_fname = strdup(args->infname);
611 sample_fname = strdup(sample_fname+1);
612 }
613 htsFile *hap_fh = hts_open(hap_fname, "r");
614 if ( !hap_fh ) error("Could not read: %s\n", hap_fname);
615 if ( hts_getline(hap_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", hap_fname);
616
617 // Find out the chromosome name, sample names, init and print the VCF header
618 args->str.l = 0;
619 char *se = strchr(line.s,':');
620 if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", hap_fname);
621 kputsn(line.s, se-line.s, &args->str);
622
623 tsv_t *tsv = tsv_init("CHROM_POS_REF_ALT,-,POS,REF_ALT,HAPS");
624 tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
625 tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL);
626 tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
627 tsv_register(tsv, "HAPS", tsv_setter_haps, args);
628
629 args->header = bcf_hdr_init("w");
630 bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
631 bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
632 bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
633 if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
634
635 int i, nsamples;
636 char **samples = hts_readlist(sample_fname, 1, &nsamples);
637 if ( !samples ) error("Could not read %s\n", sample_fname);
638 for (i=2; i<nsamples; i++)
639 {
640 se = samples[i]; while ( *se && !isspace(*se) ) se++;
641 *se = 0;
642 bcf_hdr_add_sample(args->header,samples[i]);
643 }
644 bcf_hdr_add_sample(args->header,NULL);
645 for (i=0; i<nsamples; i++) free(samples[i]);
646 free(samples);
647
648 char wmode[8];
649 set_wmode(wmode,args->output_type,args->outfname,args->clevel);
650 htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
651 if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
652 if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
653 if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
654 bcf1_t *rec = bcf_init();
655
656 nsamples -= 2;
657 args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
658
659 do
660 {
661 bcf_clear(rec);
662 args->n.total++;
663 if ( !tsv_parse(tsv, rec, line.s) )
664 {
665 if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
666 }
667 else
668 error("Error occurred while parsing: %s\n", line.s);
669 }
670 while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
671
672 if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
673 if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
674 bcf_hdr_destroy(args->header);
675 bcf_destroy(rec);
676 free(sample_fname);
677 free(hap_fname);
678 free(args->str.s);
679 free(line.s);
680 free(args->gts);
681 tsv_destroy(tsv);
682
683 fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total);
684 }
685
init_sample2sex(bcf_hdr_t * hdr,char * sex_fname)686 char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
687 {
688 int i, nlines;
689 char *sample2sex = (char*) calloc(bcf_hdr_nsamples(hdr),1);
690 char **lines = hts_readlist(sex_fname, 1, &nlines);
691 if ( !lines ) error("Could not read %s\n", sex_fname);
692 for (i=0; i<nlines; i++)
693 {
694 char *se = lines[i]; while ( *se && !isspace(*se) ) se++;
695 char tmp = *se;
696 *se = 0;
697 int id = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, lines[i]);
698 *se = tmp;
699 if ( id<0 ) continue;
700 while ( *se && isspace(*se) ) se++;
701 if ( *se=='M' ) sample2sex[id] = '1';
702 else if ( *se=='F' ) sample2sex[id] = '2';
703 else error("Could not parse %s: %s\n", sex_fname,lines[i]);
704 }
705 for (i=0; i<nlines; i++) free(lines[i]);
706 free(lines);
707 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
708 if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
709 return sample2sex;
710 }
711
vcf_to_gensample(args_t * args)712 static void vcf_to_gensample(args_t *args)
713 {
714 kstring_t str = {0,0,0};
715
716 // insert chrom as first column if needed
717 if(args->output_chrom_first_col)
718 kputs("%CHROM ", &str);
719 else
720 kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str);
721
722 // insert rsid as second column if needed
723 if(args->output_vcf_ids)
724 kputs("%ID ", &str);
725 else
726 kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str);
727
728 kputs("%POS %REF %FIRST_ALT", &str);
729 if ( !args->tag || !strcmp(args->tag,"GT") ) kputs("%_GT_TO_PROB3",&str);
730 else if ( !strcmp(args->tag,"PL") ) kputs("%_PL_TO_PROB3",&str);
731 else if ( !strcmp(args->tag,"GP") ) kputs("%_GP_TO_PROB3",&str);
732 else error("todo: --tag %s\n", args->tag);
733 kputs("\n", &str);
734 open_vcf(args,str.s);
735
736 int ret, gen_compressed = 1, sample_compressed = 0;
737 char *gen_fname = NULL, *sample_fname = NULL;
738 str.l = 0;
739 kputs(args->outfname,&str);
740 int n_files = 0, i;
741 char **files = hts_readlist(str.s, 0, &n_files);
742 if ( n_files==1 )
743 {
744 int l = str.l;
745 kputs(".samples",&str);
746 sample_fname = strdup(str.s);
747 str.l = l;
748 kputs(".gen.gz",&str);
749 gen_fname = strdup(str.s);
750 }
751 else if ( n_files==2 )
752 {
753 if (strlen(files[0]) && strcmp(files[0],".")!=0) gen_fname = strdup(files[0]);
754 if (strlen(files[1]) && strcmp(files[1],".")!=0) sample_fname = strdup(files[1]);
755 }
756 else
757 {
758 error("Error parsing --gensample filenames: %s\n", args->outfname);
759 }
760 for (i=0; i<n_files; i++) free(files[i]);
761 free(files);
762
763 if ( gen_fname && (strlen(gen_fname)<3 || strcasecmp(".gz",gen_fname+strlen(gen_fname)-3)) ) gen_compressed = 0;
764 if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
765
766 if (gen_fname) fprintf(bcftools_stderr, "Gen file: %s\n", gen_fname);
767 if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname);
768
769 // write samples file
770 if (sample_fname)
771 {
772 char *sample2sex = NULL;
773 if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
774
775 int i;
776 BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
777 str.l = 0;
778 kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
779 ret = bgzf_write(sout, str.s, str.l);
780 if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
781 for (i=0; i<bcf_hdr_nsamples(args->header); i++)
782 {
783 str.l = 0;
784 if ( sample2sex )
785 ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
786 else
787 ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
788 ret = bgzf_write(sout, str.s, str.l);
789 if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
790 }
791 if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
792 free(sample_fname);
793 free(sample2sex);
794 }
795 if (!gen_fname) {
796 if ( str.m ) free(str.s);
797 return;
798 }
799
800 int prev_rid = -1, prev_pos = -1;
801 int no_alt = 0, non_biallelic = 0, filtered = 0, ndup = 0, nok = 0;
802 BGZF *gout = bgzf_open(gen_fname, gen_compressed ? "wg" : "wu");
803 while ( bcf_sr_next_line(args->files) )
804 {
805 bcf1_t *line = bcf_sr_get_line(args->files,0);
806 if ( args->filter )
807 {
808 int pass = filter_test(args->filter, line, NULL);
809 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
810 if ( !pass ) { filtered++; continue; }
811 }
812
813 // ALT allele is required
814 if ( line->n_allele<2 ) { no_alt++; continue; }
815
816 // biallelic required
817 if ( line->n_allele>2 ) {
818 if (!non_biallelic)
819 fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
820 non_biallelic++;
821 continue;
822 }
823
824 // skip duplicate lines, or otherwise shapeit complains
825 if ( !args->keep_duplicates && prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
826 prev_rid = line->rid;
827 prev_pos = line->pos;
828
829 str.l = 0;
830 convert_line(args->convert, line, &str);
831 if ( str.l )
832 {
833 int ret = bgzf_write(gout, str.s, str.l);
834 if ( ret!= str.l ) error("Error writing %s: %s\n", gen_fname,strerror(errno));
835 nok++;
836 }
837 }
838 fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
839 nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
840
841 if ( str.m ) free(str.s);
842 if ( bgzf_close(gout)!=0 ) error("Error closing %s: %s\n", gen_fname,strerror(errno));
843 free(gen_fname);
844 }
845
vcf_to_haplegendsample(args_t * args)846 static void vcf_to_haplegendsample(args_t *args)
847 {
848 kstring_t str = {0,0,0};
849 if ( args->hap2dip )
850 kputs("%_GT_TO_HAP2\n", &str);
851 else
852 kputs("%_GT_TO_HAP\n", &str);
853 open_vcf(args,str.s);
854
855 int ret, hap_compressed = 1, legend_compressed = 1, sample_compressed = 0;
856 char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL;
857 str.l = 0;
858 kputs(args->outfname,&str);
859 int n_files = 0, i;
860 char **files = hts_readlist(str.s, 0, &n_files);
861 if ( n_files==1 )
862 {
863 int l = str.l;
864 kputs(".samples",&str);
865 sample_fname = strdup(str.s);
866 str.l = l;
867 kputs(".legend.gz",&str);
868 legend_fname = strdup(str.s);
869 str.l = l;
870 kputs(".hap.gz",&str);
871 hap_fname = strdup(str.s);
872 }
873 else if ( n_files==3 )
874 {
875 if (strlen(files[0]) && strcmp(files[0],".")!=0) hap_fname = strdup(files[0]);
876 if (strlen(files[1]) && strcmp(files[1],".")!=0) legend_fname = strdup(files[1]);
877 if (strlen(files[2]) && strcmp(files[2],".")!=0) sample_fname = strdup(files[2]);
878 }
879 else
880 {
881 error("Error parsing --hapslegendsample filenames: %s\n", args->outfname);
882 }
883 for (i=0; i<n_files; i++) free(files[i]);
884 free(files);
885
886 if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
887 if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0;
888 if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
889
890 if (hap_fname) fprintf(bcftools_stderr, "Hap file: %s\n", hap_fname);
891 if (legend_fname) fprintf(bcftools_stderr, "Legend file: %s\n", legend_fname);
892 if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname);
893
894 // write samples file
895 if (sample_fname)
896 {
897 char *sample2sex = NULL;
898 if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
899
900 int i;
901 BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
902 str.l = 0;
903 kputs("sample population group sex\n", &str);
904 ret = bgzf_write(sout, str.s, str.l);
905 if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
906 for (i=0; i<bcf_hdr_nsamples(args->header); i++)
907 {
908 str.l = 0;
909 ksprintf(&str, "%s %s %s %c\n", args->header->samples[i], args->header->samples[i], args->header->samples[i], sample2sex ? sample2sex[i] : '2');
910 ret = bgzf_write(sout, str.s, str.l);
911 if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
912 }
913 if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
914 free(sample_fname);
915 free(sample2sex);
916 }
917 if (!hap_fname && !legend_fname) {
918 if ( str.m ) free(str.s);
919 return;
920 }
921
922 // open haps and legend outputs
923 BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
924 if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
925 BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL;
926 if (legend_fname) {
927 str.l = 0;
928 kputs("id position a0 a1\n", &str);
929 ret = bgzf_write(lout, str.s, str.l);
930 if ( ret != str.l ) error("Error writing %s: %s\n", legend_fname, strerror(errno));
931 }
932
933 int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
934 while ( bcf_sr_next_line(args->files) )
935 {
936 bcf1_t *line = bcf_sr_get_line(args->files,0);
937 if ( args->filter )
938 {
939 int pass = filter_test(args->filter, line, NULL);
940 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
941 if ( !pass ) { filtered++; continue; }
942 }
943
944 // ALT allele is required
945 if ( line->n_allele<2 ) { no_alt++; continue; }
946 // biallelic required
947 if ( line->n_allele>2 ) {
948 if (!non_biallelic)
949 fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
950 non_biallelic++;
951 continue;
952 }
953
954 str.l = 0;
955 convert_line(args->convert, line, &str);
956 if ( !str.l ) continue;
957
958 // write haps file
959 if (hap_fname) {
960 ret = bgzf_write(hout, str.s, str.l); // write hap file
961 if ( ret != str.l ) error("Error writing %s: %s\n", hap_fname, strerror(errno));
962 }
963 if (legend_fname) {
964 str.l = 0;
965 if ( args->output_vcf_ids && (line->d.id[0]!='.' || line->d.id[1]!=0) )
966 ksprintf(&str, "%s %"PRId64" %s %s\n", line->d.id, (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]);
967 else
968 ksprintf(&str, "%s:%"PRId64"_%s_%s %"PRId64" %s %s\n", bcf_seqname(args->header, line), (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1], (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]);
969
970 // write legend file
971 ret = bgzf_write(lout, str.s, str.l);
972 if ( ret != str.l ) error("Error writing %s: %s\n", legend_fname, strerror(errno));
973 }
974 nok++;
975 }
976 fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
977 if ( str.m ) free(str.s);
978 if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
979 if ( lout && bgzf_close(lout)!=0 ) error("Error closing %s: %s\n", legend_fname, strerror(errno));
980 if (hap_fname) free(hap_fname);
981 if (legend_fname) free(legend_fname);
982 }
983
vcf_to_hapsample(args_t * args)984 static void vcf_to_hapsample(args_t *args)
985 {
986 /*
987 * WTCCC style haplotypes file
988 * see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#hapsample
989 *
990 * These are essentially the haplotypes from the impute2 format with some
991 * legend info tacked on to the first 5 columns
992 *
993 */
994 kstring_t str = {0,0,0};
995
996 // print ID instead of CHROM:POS_REF_ALT1
997 if ( args->output_vcf_ids )
998 kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str);
999 else
1000 kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
1001
1002 if ( args->hap2dip )
1003 kputs("%_GT_TO_HAP2\n", &str);
1004 else
1005 kputs("%_GT_TO_HAP\n", &str);
1006 open_vcf(args,str.s);
1007
1008 int ret, hap_compressed = 1, sample_compressed = 0;
1009 char *hap_fname = NULL, *sample_fname = NULL;
1010 str.l = 0;
1011 kputs(args->outfname,&str);
1012 int n_files = 0, i;
1013 char **files = hts_readlist(str.s, 0, &n_files);
1014 if ( n_files==1 )
1015 {
1016 int l = str.l;
1017 kputs(".samples",&str);
1018 sample_fname = strdup(str.s);
1019 str.l = l;
1020 kputs(".hap.gz",&str);
1021 hap_fname = strdup(str.s);
1022 }
1023 else if ( n_files==2 )
1024 {
1025 if (strlen(files[0]) && strcmp(files[0],".")!=0) hap_fname = strdup(files[0]);
1026 if (strlen(files[1]) && strcmp(files[1],".")!=0) sample_fname = strdup(files[1]);
1027 }
1028 else
1029 {
1030 error("Error parsing --hapsample filenames: %s\n", args->outfname);
1031 }
1032 for (i=0; i<n_files; i++) free(files[i]);
1033 free(files);
1034
1035 if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
1036 if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
1037
1038 if (hap_fname) fprintf(bcftools_stderr, "Hap file: %s\n", hap_fname);
1039 if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname);
1040
1041 // write samples file
1042 if (sample_fname)
1043 {
1044 char *sample2sex = NULL;
1045 if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
1046
1047 int i;
1048 BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
1049 str.l = 0;
1050 kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
1051 ret = bgzf_write(sout, str.s, str.l);
1052 if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
1053 for (i=0; i<bcf_hdr_nsamples(args->header); i++)
1054 {
1055 str.l = 0;
1056 if ( sample2sex )
1057 ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
1058 else
1059 ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
1060 ret = bgzf_write(sout, str.s, str.l);
1061 if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
1062 }
1063 if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
1064 free(sample_fname);
1065 free(sample2sex);
1066 }
1067 if (!hap_fname) {
1068 if ( str.m ) free(str.s);
1069 return;
1070 }
1071
1072 // open haps output
1073 BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
1074 if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
1075
1076 int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
1077 while ( bcf_sr_next_line(args->files) )
1078 {
1079 bcf1_t *line = bcf_sr_get_line(args->files,0);
1080 if ( args->filter )
1081 {
1082 int pass = filter_test(args->filter, line, NULL);
1083 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
1084 if ( !pass ) { filtered++; continue; }
1085 }
1086
1087 // ALT allele is required
1088 if ( line->n_allele<2 ) { no_alt++; continue; }
1089 // biallelic required
1090 if ( line->n_allele>2 ) {
1091 if (!non_biallelic)
1092 fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
1093 non_biallelic++;
1094 continue;
1095 }
1096
1097 str.l = 0;
1098 convert_line(args->convert, line, &str);
1099 if ( !str.l ) continue;
1100
1101 // write haps file
1102 if (hap_fname) {
1103 ret = bgzf_write(hout, str.s, str.l); // write hap file
1104 if ( ret != str.l ) error("Error writing %s: %s\n", hap_fname, strerror(errno));
1105 }
1106 nok++;
1107 }
1108 fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
1109 if ( str.m ) free(str.s);
1110 if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
1111 if (hap_fname) free(hap_fname);
1112 }
1113
bcf_hdr_set_chrs(bcf_hdr_t * hdr,faidx_t * fai)1114 static void bcf_hdr_set_chrs(bcf_hdr_t *hdr, faidx_t *fai)
1115 {
1116 int i, n = faidx_nseq(fai);
1117 for (i=0; i<n; i++)
1118 {
1119 const char *seq = faidx_iseq(fai,i);
1120 int len = faidx_seq_len(fai, seq);
1121 bcf_hdr_printf(hdr, "##contig=<ID=%s,length=%d>", seq,len);
1122 }
1123 }
acgt_to_5(char base)1124 static inline int acgt_to_5(char base)
1125 {
1126 if ( base=='A' ) return 0;
1127 if ( base=='C' ) return 1;
1128 if ( base=='G' ) return 2;
1129 if ( base=='T' ) return 3;
1130 return 4;
1131 }
tsv_setter_aa1(args_t * args,char * ss,char * se,int alleles[],int * nals,int ref,int32_t * gts)1132 static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[], int *nals, int ref, int32_t *gts)
1133 {
1134 if ( se - ss > 2 ) return -1; // currently only SNPs
1135
1136 if ( ss[0]=='-' )
1137 {
1138 // missing GT
1139 gts[0] = bcf_gt_missing;
1140 gts[1] = bcf_gt_missing;
1141 args->n.missing++;
1142 return 0;
1143 }
1144 if ( ss[0]=='I' ) return -2; // skip insertions/deletions for now
1145 if ( ss[0]=='D' ) return -2;
1146
1147 int a0 = acgt_to_5(toupper(ss[0]));
1148 int a1 = ss[1] ? acgt_to_5(toupper(ss[1])) : a0;
1149 if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
1150 if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
1151
1152 gts[0] = bcf_gt_unphased(alleles[a0]);
1153 gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
1154
1155 if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR
1156 else if ( ref==a0 ) args->n.het_ra++; // het: RA
1157 else if ( ref==a1 ) args->n.het_ra++; // het: AR
1158 else if ( a0==a1 ) args->n.hom_aa++; // hom-alt: AA
1159 else args->n.het_aa++; // non-ref het: AA
1160
1161 return 0;
1162 }
tsv_setter_aa(tsv_t * tsv,bcf1_t * rec,void * usr)1163 static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
1164 {
1165 args_t *args = (args_t*) usr;
1166
1167 int len;
1168 char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(args->header,rec->rid), rec->pos, rec->pos, &len);
1169 if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
1170
1171 int nals = 1, alleles[5] = { -1, -1, -1, -1, -1 }; // a,c,g,t,n
1172 ref[0] = toupper(ref[0]);
1173 int iref = acgt_to_5(ref[0]);
1174 alleles[iref] = 0;
1175
1176 rec->n_sample = bcf_hdr_nsamples(args->header);
1177
1178 int i, ret;
1179 for (i=0; i<rec->n_sample; i++)
1180 {
1181 if ( i>0 )
1182 {
1183 ret = tsv_next(tsv);
1184 if ( ret==-1 ) error("Too few columns for %d samples at %s:%"PRId64"\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
1185 }
1186 ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
1187 if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
1188 if ( ret==-2 )
1189 {
1190 // something else than a SNP
1191 free(ref);
1192 return -1;
1193 }
1194 }
1195
1196 args->str.l = 0;
1197 kputc(ref[0], &args->str);
1198 for (i=0; i<5; i++)
1199 {
1200 if ( alleles[i]>0 )
1201 {
1202 kputc(',', &args->str);
1203 kputc("ACGTN"[i], &args->str);
1204 }
1205 }
1206 bcf_update_alleles_str(args->header, rec, args->str.s);
1207 if ( bcf_update_genotypes(args->header,rec,args->gts,rec->n_sample*2) ) error("Could not update the GT field\n");
1208
1209 free(ref);
1210 return 0;
1211 }
1212
tsv_to_vcf(args_t * args)1213 static void tsv_to_vcf(args_t *args)
1214 {
1215 if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
1216 if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
1217
1218 args->ref = fai_load(args->ref_fname);
1219 if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
1220
1221 args->header = bcf_hdr_init("w");
1222 bcf_hdr_set_chrs(args->header, args->ref);
1223 bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
1224 if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
1225
1226 int i, n;
1227 char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
1228 if ( !smpls ) error("Could not parse %s\n", args->sample_list);
1229 for (i=0; i<n; i++)
1230 {
1231 bcf_hdr_add_sample(args->header, smpls[i]);
1232 free(smpls[i]);
1233 }
1234 free(smpls);
1235 bcf_hdr_add_sample(args->header, NULL);
1236 args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
1237
1238 char wmode[8];
1239 set_wmode(wmode,args->output_type,args->outfname,args->clevel);
1240 htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
1241 if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
1242 if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
1243 if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1244
1245 tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
1246 if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
1247 if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
1248 if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
1249 if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
1250
1251 bcf1_t *rec = bcf_init();
1252 bcf_float_set_missing(rec->qual);
1253
1254 kstring_t line = {0,0,0};
1255 htsFile *in_fh = hts_open(args->infname, "r");
1256 if ( !in_fh ) error("Could not read: %s\n", args->infname);
1257 while ( hts_getline(in_fh, KS_SEP_LINE, &line) > 0 )
1258 {
1259 if ( line.s[0]=='#' ) continue; // skip comments
1260 bcf_clear(rec);
1261
1262 args->n.total++;
1263 if ( !tsv_parse(tsv, rec, line.s) )
1264 {
1265 if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1266 }
1267 else
1268 args->n.skipped++;
1269 }
1270 if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
1271 free(line.s);
1272
1273 bcf_hdr_destroy(args->header);
1274 if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
1275 tsv_destroy(tsv);
1276 bcf_destroy(rec);
1277 free(args->str.s);
1278 free(args->gts);
1279
1280 fprintf(bcftools_stderr,"Rows total: \t%d\n", args->n.total);
1281 fprintf(bcftools_stderr,"Rows skipped: \t%d\n", args->n.skipped);
1282 fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing);
1283 fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr);
1284 fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra);
1285 fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa);
1286 fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa);
1287 }
1288
vcf_to_vcf(args_t * args)1289 static void vcf_to_vcf(args_t *args)
1290 {
1291 open_vcf(args,NULL);
1292 char wmode[8];
1293 set_wmode(wmode,args->output_type,args->outfname,args->clevel);
1294 htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
1295 if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
1296 if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
1297
1298 bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
1299 if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1300
1301 while ( bcf_sr_next_line(args->files) )
1302 {
1303 bcf1_t *line = bcf_sr_get_line(args->files,0);
1304 if ( args->filter )
1305 {
1306 int pass = filter_test(args->filter, line, NULL);
1307 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
1308 if ( !pass ) continue;
1309 }
1310 if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1311 }
1312 if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
1313 }
1314
gvcf_to_vcf(args_t * args)1315 static void gvcf_to_vcf(args_t *args)
1316 {
1317 if ( !args->ref_fname ) error("--gvcf2vcf requires the --fasta-ref option\n");
1318
1319 args->ref = fai_load(args->ref_fname);
1320 if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
1321
1322 open_vcf(args,NULL);
1323 char wmode[8];
1324 set_wmode(wmode,args->output_type,args->outfname,args->clevel);
1325 htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
1326 if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
1327 if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
1328
1329 bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
1330 if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
1331 if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1332
1333 int32_t *itmp = NULL, nitmp = 0;
1334
1335 while ( bcf_sr_next_line(args->files) )
1336 {
1337 bcf1_t *line = bcf_sr_get_line(args->files,0);
1338 if ( args->filter )
1339 {
1340 int pass = filter_test(args->filter, line, NULL);
1341 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
1342 if ( !pass )
1343 {
1344 if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1345 continue;
1346 }
1347 }
1348
1349 // check if alleles compatible with being a gVCF record
1350 // ALT must be one of ., <*>, <X>, <NON_REF>
1351 // check for INFO/END is below
1352 int i, gallele = -1;
1353 if (line->n_allele==1)
1354 gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
1355 else if ( line->d.allele[1][0]=='<' )
1356 {
1357 for (i=1; i<line->n_allele; i++)
1358 {
1359 if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
1360 if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF
1361 if ( strcmp(line->d.allele[i],"<NON_REF>")==0 ) { gallele = i; break; } // GATK gVCF
1362 }
1363 }
1364
1365 // no gVCF compatible alleles
1366 if (gallele<0)
1367 {
1368 if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1369 continue;
1370 }
1371
1372 int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
1373 if ( nend!=1 )
1374 {
1375 // No INFO/END => not gVCF record
1376 if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1377 continue;
1378 }
1379 bcf_update_info_int32(hdr,line,"END",NULL,0);
1380 int pos, len;
1381 for (pos=line->pos; pos<itmp[0]; pos++)
1382 {
1383 line->pos = pos;
1384 char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
1385 if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1);
1386 strncpy(line->d.allele[0],ref,len);
1387 if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1388 free(ref);
1389 }
1390 }
1391 free(itmp);
1392 if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
1393 }
1394
usage(void)1395 static void usage(void)
1396 {
1397 fprintf(bcftools_stderr, "\n");
1398 fprintf(bcftools_stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n");
1399 fprintf(bcftools_stderr, " formats details. When specifying output files explicitly instead\n");
1400 fprintf(bcftools_stderr, " of with PREFIX, one can use '-' for bcftools_stdout and '.' to suppress.\n");
1401 fprintf(bcftools_stderr, "Usage: bcftools convert [OPTIONS] INPUT_FILE\n");
1402 fprintf(bcftools_stderr, "\n");
1403 fprintf(bcftools_stderr, "VCF input options:\n");
1404 fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
1405 fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true\n");
1406 fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
1407 fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
1408 fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
1409 fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include\n");
1410 fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n");
1411 fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
1412 fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
1413 fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
1414 fprintf(bcftools_stderr, "\n");
1415 fprintf(bcftools_stderr, "VCF output options:\n");
1416 fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
1417 fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n");
1418 fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
1419 fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
1420 fprintf(bcftools_stderr, "\n");
1421 fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
1422 fprintf(bcftools_stderr, " -G, --gensample2vcf ... <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
1423 fprintf(bcftools_stderr, " -g, --gensample ... <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
1424 fprintf(bcftools_stderr, " --tag STRING Tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
1425 fprintf(bcftools_stderr, " --chrom Output chromosome in first column instead of CHROM:POS_REF_ALT\n");
1426 fprintf(bcftools_stderr, " --keep-duplicates Keep duplicate positions\n");
1427 fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
1428 fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
1429 fprintf(bcftools_stderr, "\n");
1430 fprintf(bcftools_stderr, "gVCF conversion:\n");
1431 fprintf(bcftools_stderr, " --gvcf2vcf Expand gVCF reference blocks\n");
1432 fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
1433 fprintf(bcftools_stderr, "\n");
1434 fprintf(bcftools_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
1435 fprintf(bcftools_stderr, " --hapsample2vcf ... <PREFIX>|<HAP-FILE>,<SAMPLE-FILE>\n");
1436 fprintf(bcftools_stderr, " --hapsample ... <PREFIX>|<HAP-FILE>,<SAMPLE-FILE>\n");
1437 fprintf(bcftools_stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n");
1438 fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
1439 fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n");
1440 fprintf(bcftools_stderr, "\n");
1441 fprintf(bcftools_stderr, "HAP/LEGEND/SAMPLE conversion:\n");
1442 fprintf(bcftools_stderr, " -H, --haplegendsample2vcf ... <PREFIX>|<HAP-FILE>,<LEGEND-FILE>,<SAMPLE-FILE>\n");
1443 fprintf(bcftools_stderr, " -h, --haplegendsample ... <PREFIX>|<HAP-FILE>,<LEGEND-FILE>,<SAMPLE-FILE>\n");
1444 fprintf(bcftools_stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n");
1445 fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
1446 fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n");
1447 fprintf(bcftools_stderr, "\n");
1448 fprintf(bcftools_stderr, "TSV conversion:\n");
1449 fprintf(bcftools_stderr, " --tsv2vcf FILE\n");
1450 fprintf(bcftools_stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n");
1451 fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
1452 fprintf(bcftools_stderr, " -s, --samples LIST List of sample names\n");
1453 fprintf(bcftools_stderr, " -S, --samples-file FILE File of sample names\n");
1454 fprintf(bcftools_stderr, "\n");
1455 // fprintf(bcftools_stderr, "PLINK options:\n");
1456 // fprintf(bcftools_stderr, " -p, --plink <prefix>|<ped>,<map>,<fam>|<bed>,<bim>,<fam>|<tped>,<tfam>\n");
1457 // fprintf(bcftools_stderr, " --tped make tped file instead\n");
1458 // fprintf(bcftools_stderr, " --bin make binary bed/fam/bim files\n");
1459 // fprintf(bcftools_stderr, "\n");
1460 // fprintf(bcftools_stderr, "PBWT options:\n");
1461 // fprintf(bcftools_stderr, " -b, --pbwt <prefix> or <pbwt>,<sites>,<sample>,<missing>\n");
1462 // fprintf(bcftools_stderr, "\n");
1463 bcftools_exit(1);
1464 }
1465
main_vcfconvert(int argc,char * argv[])1466 int main_vcfconvert(int argc, char *argv[])
1467 {
1468 int c;
1469 args_t *args = (args_t*) calloc(1,sizeof(args_t));
1470 args->argc = argc; args->argv = argv;
1471 args->outfname = "-";
1472 args->output_type = FT_VCF;
1473 args->n_threads = 0;
1474 args->record_cmd_line = 1;
1475 args->regions_overlap = 1;
1476 args->targets_overlap = 0;
1477 args->clevel = -1;
1478
1479 static struct option loptions[] =
1480 {
1481 {"include",required_argument,NULL,'i'},
1482 {"exclude",required_argument,NULL,'e'},
1483 {"output",required_argument,NULL,'o'},
1484 {"output-type",required_argument,NULL,'O'},
1485 {"threads",required_argument,NULL,9},
1486 {"regions",required_argument,NULL,'r'},
1487 {"regions-file",required_argument,NULL,'R'},
1488 {"regions-overlap",required_argument,NULL,13},
1489 {"targets",required_argument,NULL,'t'},
1490 {"targets-file",required_argument,NULL,'T'},
1491 {"targets-overlap",required_argument,NULL,14},
1492 {"samples",required_argument,NULL,'s'},
1493 {"samples-file",required_argument,NULL,'S'},
1494 {"sex",required_argument,NULL,11},
1495 {"gensample",required_argument,NULL,'g'},
1496 {"gensample2vcf",required_argument,NULL,'G'},
1497 {"tag",required_argument,NULL,1},
1498 {"chrom",no_argument,NULL,8},
1499 {"tsv2vcf",required_argument,NULL,2},
1500 {"hapsample",required_argument,NULL,7},
1501 {"hapsample2vcf",required_argument,NULL,3},
1502 {"vcf-ids",no_argument,NULL,4},
1503 {"haploid2diploid",no_argument,NULL,5},
1504 {"gvcf2vcf",no_argument,NULL,6},
1505 {"haplegendsample",required_argument,NULL,'h'},
1506 {"haplegendsample2vcf",required_argument,NULL,'H'},
1507 {"columns",required_argument,NULL,'c'},
1508 {"fasta-ref",required_argument,NULL,'f'},
1509 {"no-version",no_argument,NULL,10},
1510 {"keep-duplicates",no_argument,NULL,12},
1511 {NULL,0,NULL,0}
1512 };
1513 char *tmp;
1514 while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
1515 switch (c) {
1516 case 'e':
1517 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1518 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
1519 case 'i':
1520 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1521 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
1522 case 'r': args->regions_list = optarg; break;
1523 case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
1524 case 't': args->targets_list = optarg; break;
1525 case 'T': args->targets_list = optarg; args->targets_is_file = 1; break;
1526 case 's': args->sample_list = optarg; break;
1527 case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
1528 case 'g': args->convert_func = vcf_to_gensample; args->outfname = optarg; break;
1529 case 'G': args->convert_func = gensample_to_vcf; args->infname = optarg; break;
1530 case 1 : args->tag = optarg; break;
1531 case 2 : args->convert_func = tsv_to_vcf; args->infname = optarg; break;
1532 case 3 : args->convert_func = hapsample_to_vcf; args->infname = optarg; break;
1533 case 4 : args->output_vcf_ids = 1; break;
1534 case 5 : args->hap2dip = 1; break;
1535 case 6 : args->convert_func = gvcf_to_vcf; break;
1536 case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
1537 case 8 : args->output_chrom_first_col = 1; break;
1538 case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
1539 case 'f': args->ref_fname = optarg; break;
1540 case 'c': args->columns = optarg; break;
1541 case 'o': args->outfname = optarg; break;
1542 case 'O':
1543 switch (optarg[0]) {
1544 case 'b': args->output_type = FT_BCF_GZ; break;
1545 case 'u': args->output_type = FT_BCF; break;
1546 case 'z': args->output_type = FT_VCF_GZ; break;
1547 case 'v': args->output_type = FT_VCF; break;
1548 default:
1549 {
1550 args->clevel = strtol(optarg,&tmp,10);
1551 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
1552 }
1553 }
1554 if ( optarg[1] )
1555 {
1556 args->clevel = strtol(optarg+1,&tmp,10);
1557 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
1558 }
1559 break;
1560 case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
1561 case 9 : args->n_threads = strtol(optarg, 0, 0); break;
1562 case 10 : args->record_cmd_line = 0; break;
1563 case 11 : args->sex_fname = optarg; break;
1564 case 12 : args->keep_duplicates = 1; break;
1565 case 13 :
1566 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
1567 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
1568 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
1569 else error("Could not parse: --regions-overlap %s\n",optarg);
1570 break;
1571 case 14 :
1572 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
1573 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
1574 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
1575 else error("Could not parse: --targets-overlap %s\n",optarg);
1576 break;
1577 case '?': usage(); break;
1578 default: error("Unknown argument: %s\n", optarg);
1579 }
1580 }
1581
1582 if ( !args->infname )
1583 {
1584 if ( optind>=argc )
1585 {
1586 if ( !isatty(fileno((FILE *)stdin)) ) args->infname = "-";
1587 }
1588 else args->infname = argv[optind];
1589 }
1590 if ( !args->infname ) usage();
1591
1592 if ( args->convert_func ) args->convert_func(args);
1593 else vcf_to_vcf(args);
1594
1595 destroy_data(args);
1596 free(args);
1597 return 0;
1598 }
1599