1 #include "bcftools.pysam.h"
2 
3 /*  vcfconvert.c -- convert between VCF/BCF and related formats.
4 
5     Copyright (C) 2013-2021 Genome Research Ltd.
6 
7     Author: Petr Danecek <pd3@sanger.ac.uk>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.  */
26 
27 #include <stdio.h>
28 #include <strings.h>
29 #include <unistd.h>
30 #include <getopt.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <inttypes.h>
37 #include <htslib/faidx.h>
38 #include <htslib/vcf.h>
39 #include <htslib/bgzf.h>
40 #include <htslib/synced_bcf_reader.h>
41 #include <htslib/vcfutils.h>
42 #include <htslib/kseq.h>
43 #include "bcftools.h"
44 #include "filter.h"
45 #include "convert.h"
46 #include "tsv2vcf.h"
47 
48 // Logic of the filters: include or exclude sites which match the filters?
49 #define FLT_INCLUDE 1
50 #define FLT_EXCLUDE 2
51 
52 typedef struct _args_t args_t;
53 struct _args_t
54 {
55     faidx_t *ref;
56     filter_t *filter;
57     char *filter_str;
58     int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
59     convert_t *convert;
60     bcf_srs_t *files;
61     bcf_hdr_t *header;
62     void (*convert_func)(struct _args_t *);
63     struct {
64         int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
65     } n;
66     kstring_t str;
67     int32_t *gts;
68     float *flt;
69     int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
70     int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
71     int regions_overlap, targets_overlap;
72     char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
73     char *outfname, *infname, *ref_fname, *sex_fname;
74     int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
75 };
76 
destroy_data(args_t * args)77 static void destroy_data(args_t *args)
78 {
79     if ( args->ref ) fai_destroy(args->ref);
80     if ( args->convert) convert_destroy(args->convert);
81     if ( args->filter ) filter_destroy(args->filter);
82     free(args->samples);
83     if ( args->files ) bcf_sr_destroy(args->files);
84 }
85 
open_vcf(args_t * args,const char * format_str)86 static void open_vcf(args_t *args, const char *format_str)
87 {
88     args->files = bcf_sr_init();
89     if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)!=0 )
90         error("Could not initialize --threads %d\n", args->n_threads);
91 
92     if ( args->regions_list )
93     {
94         bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
95         if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
96             error("Failed to read the regions: %s\n", args->regions_list);
97     }
98     if ( args->targets_list )
99     {
100         bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
101         if ( bcf_sr_set_targets(args->files, args->targets_list, args->targets_is_file, 0)<0 )
102             error("Failed to read the targets: %s\n", args->targets_list);
103     }
104     if ( !bcf_sr_add_reader(args->files, args->infname) )
105         error("Failed to open %s: %s\n", args->infname,bcf_sr_strerror(args->files->errnum));
106 
107     args->header = args->files->readers[0].header;
108 
109     if ( args->filter_str )
110         args->filter = filter_init(args->header, args->filter_str);
111 
112     int i, nsamples = 0, *samples = NULL;
113     if ( args->sample_list && strcmp("-",args->sample_list) )
114     {
115         for (i=0; i<args->files->nreaders; i++)
116         {
117             int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
118             if ( ret<0 ) error("Error parsing the sample list\n");
119             else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
120         }
121 
122         if ( args->sample_list[0]!='^' )
123         {
124             // the sample ordering may be different if not negated
125             int n;
126             char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
127             if ( !smpls ) error("Could not parse %s\n", args->sample_list);
128             if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
129                 error("The number of samples does not match, perhaps some are present multiple times?\n");
130             nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
131             samples = (int*) malloc(sizeof(int)*nsamples);
132             for (i=0; i<n; i++)
133             {
134                 samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
135                 free(smpls[i]);
136             }
137             free(smpls);
138         }
139     }
140     if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str);
141     free(samples);
142 }
143 
tsv_setter_chrom_pos_ref_alt(tsv_t * tsv,bcf1_t * rec,void * usr)144 static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
145 {
146     args_t *args = (args_t*) usr;
147 
148     char tmp, *se = tsv->ss, *ss = tsv->ss;
149     while ( se < tsv->se && *se!=':' ) se++;
150     if ( *se!=':' ) error("Could not parse CHROM in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
151     tmp = *se; *se = 0;
152     rec->rid = bcf_hdr_name2id(args->header,ss);
153     if ( rec->rid<0 ) error("Could not determine sequence name or multiple sequences present: %s\n", tsv->ss);
154     *se = tmp;
155 
156     // POS
157     rec->pos = strtol(se+1,&ss,10);
158     if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss);
159     rec->pos--;
160 
161     // ID
162     if ( args->output_vcf_ids )
163     {
164         char tmp = *tsv->se;
165         *tsv->se = 0;
166         bcf_update_id(args->header, rec, tsv->ss);
167         *tsv->se = tmp;
168     }
169 
170     // REF,ALT
171     args->str.l = 0;
172     se = ++ss;
173     while ( se < tsv->se && *se!='_' ) se++;
174     if ( *se!='_' ) error("Could not parse REF in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
175     kputsn(ss,se-ss,&args->str);
176     ss = ++se;
177     while ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) se++;
178     if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) error("Could not parse ALT in CHROM:POS_REF_ALT id: %s\n", tsv->ss);
179     kputc(',',&args->str);
180     kputsn(ss,se-ss,&args->str);
181     bcf_update_alleles_str(args->header, rec, args->str.s);
182 
183     // END - optional
184     if (*se && *se=='_') {
185         long end = strtol(se+1,&ss,10);
186         if ( ss==se+1 ) error("Could not parse END in CHROM:POS_REF_ALT_END: %s\n", tsv->ss);
187         bcf_update_info_int32(args->header, rec, "END", &end, 1);
188     }
189 
190     return 0;
191 }
tsv_setter_verify_pos(tsv_t * tsv,bcf1_t * rec,void * usr)192 static int tsv_setter_verify_pos(tsv_t *tsv, bcf1_t *rec, void *usr)
193 {
194     char *se;
195     int pos = strtol(tsv->ss,&se,10);
196     if ( tsv->ss==se ) error("Could not parse POS: %s\n", tsv->ss);
197     if ( rec->pos != pos-1 ) error("POS mismatch: %s\n", tsv->ss);
198     return 0;
199 }
tsv_setter_verify_ref_alt(tsv_t * tsv,bcf1_t * rec,void * usr)200 static int tsv_setter_verify_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
201 {
202     args_t *args = (args_t*) usr;
203     args->rev_als = 0;
204     char tmp = *tsv->se; *tsv->se = 0;
205     if ( strcmp(tsv->ss,rec->d.allele[0]) )
206     {
207         if ( strcmp(tsv->ss,rec->d.allele[1]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[1]); }
208         args->rev_als = 1;
209     }
210     *tsv->se = tmp;
211     while ( *tsv->se && isspace(*tsv->se) ) tsv->se++;
212     tsv->ss = tsv->se;
213     while ( *tsv->se && !isspace(*tsv->se) ) tsv->se++;
214     tmp = *tsv->se; *tsv->se = 0;
215     if ( !args->rev_als && strcmp(tsv->ss,rec->d.allele[1]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[1]); }
216     else if ( args->rev_als && strcmp(tsv->ss,rec->d.allele[0]) ) { *tsv->se = tmp; error("REF/ALT mismatch: [%s][%s]\n", tsv->ss,rec->d.allele[0]); }
217     *tsv->se = tmp;
218     return 0;
219 }
tsv_setter_gt_gp(tsv_t * tsv,bcf1_t * rec,void * usr)220 static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
221 {
222     args_t *args = (args_t*) usr;
223     int i, nsamples = bcf_hdr_nsamples(args->header);
224     for (i=0; i<nsamples; i++)
225     {
226         float aa,ab,bb;
227         aa = strtod(tsv->ss, &tsv->se);
228         if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse first value of %d-th sample\n", i+1); return -1; }
229         tsv->ss = tsv->se+1;
230         ab = strtod(tsv->ss, &tsv->se);
231         if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse second value of %d-th sample\n", i+1); return -1; }
232         tsv->ss = tsv->se+1;
233         bb = strtod(tsv->ss, &tsv->se);
234         if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse third value of %d-th sample\n", i+1); return -1; }
235         tsv->ss = tsv->se+1;
236 
237         if ( args->rev_als ) { float tmp = bb; bb = aa; aa = tmp; }
238         args->flt[3*i+0] = aa;
239         args->flt[3*i+1] = ab;
240         args->flt[3*i+2] = bb;
241 
242         if ( aa >= ab )
243         {
244             if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
245             else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
246         }
247         else if ( ab >= bb )
248         {
249             args->gts[2*i+0] = bcf_gt_unphased(0);
250             args->gts[2*i+1] = bcf_gt_unphased(1);
251         }
252         else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
253     }
254     if ( *tsv->se ) error("Could not parse: %s\n", tsv->ss);
255     if ( bcf_update_genotypes(args->header,rec,args->gts,nsamples*2) ) error("Could not update GT field\n");
256     if ( bcf_update_format_float(args->header,rec,"GP",args->flt,nsamples*3) ) error("Could not update GP field\n");
257     return 0;
258 }
tsv_setter_haps(tsv_t * tsv,bcf1_t * rec,void * usr)259 static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
260 {
261     args_t *args = (args_t*) usr;
262     int i, nsamples = bcf_hdr_nsamples(args->header);
263 
264     int32_t a0, a1;
265     if ( args->rev_als ) { a0 = bcf_gt_phased(1); a1 = bcf_gt_phased(0); }
266     else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
267 
268     // up is short for "unphased"
269     int nup = 0;
270     for (i=0; i<nsamples; i++)
271     {
272         char *ss = tsv->ss + 4*i + nup;
273         int up = 0, all;
274 
275         for (all=0; all < 2; all++){
276             // checking for premature ending
277             if ( !ss[0] || !ss[1] || !ss[2] ||
278                  (up && (!ss[3] || !ss[4]) ) )
279             {
280                 fprintf(bcftools_stderr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]);
281                 return -1;
282             }
283 
284             switch(ss[all*2+up]){
285             case '0':
286                 args->gts[2*i+all] = a0;
287                 break;
288             case '1' :
289                 args->gts[2*i+all] = a1;
290                 break;
291             case '?' :
292                 // there is no macro to express phased missing allele
293                 args->gts[2*i+all] = bcf_gt_phased(-1);
294                 break;
295             case '-' :
296                 args->gts[2*i+all] = bcf_int32_vector_end;
297                 break;
298             default :
299                 fprintf(bcftools_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
300                 return -1;
301             }
302             if( ss[all*2+up+1]=='*' ) up = up + 1;
303         }
304 
305         if(up && up != 2)
306         {
307             fprintf(bcftools_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
308             return -1;
309         }
310 
311         // change alleles to unphased if the alleles are unphased
312         if ( up )
313         {
314             args->gts[2*i] = bcf_gt_unphased(bcf_gt_allele(args->gts[2*i]));
315             args->gts[2*i+1] = bcf_gt_unphased(bcf_gt_allele(args->gts[2*i+1]));
316         }
317         nup = nup + up;
318     }
319     if ( tsv->ss[(nsamples-1)*4+3+nup] )
320     {
321         fprintf(bcftools_stderr,"nup: %d", nup);
322         fprintf(bcftools_stderr,"Wrong number of fields (%d-th column = [%c]). ", nsamples*2,tsv->ss[(nsamples-1)*4+nup]);
323         return -1;
324     }
325 
326     if ( bcf_update_genotypes(args->header,rec,args->gts,nsamples*2) ) error("Could not update GT field\n");
327     return 0;
328 }
gensample_to_vcf(args_t * args)329 static void gensample_to_vcf(args_t *args)
330 {
331     /*
332      *  Inpute: IMPUTE2 output (indentation changed here for clarity):
333      *
334      *      20:62116619_C_T 20:62116619     62116619 C T 0.969 0.031 0 ...
335      *      ---             20:62116698_C_A 62116698 C A 1     0     0 ...
336      *
337      *  Second column is expected in the form of CHROM:POS_REF_ALT. We use second
338      *  column because the first can be empty ("--") when filling sites from reference
339      *  panel.
340      *
341      *  Output: VCF with filled GT,GP
342      *
343      */
344     kstring_t line = {0,0,0};
345 
346     char *gen_fname = NULL, *sample_fname = NULL;
347     sample_fname = strchr(args->infname,',');
348     if ( !sample_fname )
349     {
350         args->str.l = 0;
351         ksprintf(&args->str,"%s.gen.gz", args->infname);
352         gen_fname = strdup(args->str.s);
353         args->str.l = 0;
354         ksprintf(&args->str,"%s.samples", args->infname);
355         sample_fname = strdup(args->str.s);
356     }
357     else
358     {
359         *sample_fname = 0;
360         gen_fname = strdup(args->infname);
361         sample_fname = strdup(sample_fname+1);
362     }
363     htsFile *gen_fh = hts_open(gen_fname, "r");
364     if ( !gen_fh ) error("Could not read: %s\n", gen_fname);
365     if ( hts_getline(gen_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", gen_fname);
366 
367     // Find out the chromosome name, sample names, init and print the VCF header
368     args->str.l = 0;
369     char *ss, *se = line.s;
370     while ( *se && !isspace(*se) ) se++;
371     if ( !*se ) error("Could not parse %s: %s\n", gen_fname,line.s);
372     ss = se+1;
373     se = strchr(ss,':');
374     if ( !se ) error("Expected CHROM:POS_REF_ALT in second column of %s\n", gen_fname);
375     kputsn(ss, se-ss, &args->str);
376 
377     tsv_t *tsv = tsv_init("-,CHROM_POS_REF_ALT,POS,REF_ALT,GT_GP");
378     tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
379     tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL);
380     tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
381     tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args);
382 
383     args->header = bcf_hdr_init("w");
384     bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
385     bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
386     bcf_hdr_append(args->header, "##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype Probabilities\">");
387     bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff);   // MAX_CSI_COOR
388     if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
389 
390     int i, nsamples;
391     char **samples = hts_readlist(sample_fname, 1, &nsamples);
392     if ( !samples ) error("Could not read %s\n", sample_fname);
393     for (i=2; i<nsamples; i++)
394     {
395         se = samples[i]; while ( *se && !isspace(*se) ) se++;
396         *se = 0;
397         bcf_hdr_add_sample(args->header,samples[i]);
398     }
399     for (i=0; i<nsamples; i++) free(samples[i]);
400     free(samples);
401 
402     char wmode[8];
403     set_wmode(wmode,args->output_type,args->outfname,args->clevel);
404     htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
405     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
406     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
407     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
408     bcf1_t *rec = bcf_init();
409 
410     nsamples -= 2;
411     args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
412     args->flt = (float *) malloc(sizeof(float)*nsamples*3);
413 
414     do
415     {
416         bcf_clear(rec);
417         args->n.total++;
418         if ( !tsv_parse(tsv, rec, line.s) )
419         {
420             if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
421         }
422         else
423             error("Error occurred while parsing: %s\n", line.s);
424     }
425     while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
426 
427     if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
428     if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
429     bcf_hdr_destroy(args->header);
430     bcf_destroy(rec);
431     free(sample_fname);
432     free(gen_fname);
433     free(args->str.s);
434     free(line.s);
435     free(args->gts);
436     free(args->flt);
437     tsv_destroy(tsv);
438 
439     fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total);
440 }
441 
haplegendsample_to_vcf(args_t * args)442 static void haplegendsample_to_vcf(args_t *args)
443 {
444     /*
445      *  Convert from IMPUTE2 hap/legend/sample output files to VCF
446      *
447      *      hap:
448      *          0 1 0 1
449      *      legend:
450      *          id position a0 a1
451      *          1:186946386_G_T 186946386 G T
452      *      sample:
453      *          sample population group sex
454      *          sample1 sample1 sample1 2
455      *          sample2 sample2 sample2 2
456      *
457      *  Output: VCF with filled GT
458      */
459     kstring_t line = {0,0,0};
460 
461     char *hap_fname = NULL, *leg_fname = NULL, *sample_fname = NULL;
462     sample_fname = strchr(args->infname,',');
463     if ( !sample_fname )
464     {
465         args->str.l = 0;
466         ksprintf(&args->str,"%s.hap.gz", args->infname);
467         hap_fname = strdup(args->str.s);
468         args->str.l = 0;
469         ksprintf(&args->str,"%s.samples", args->infname);
470         sample_fname = strdup(args->str.s);
471         args->str.l = 0;
472         ksprintf(&args->str,"%s.legend.gz", args->infname);
473         leg_fname = strdup(args->str.s);
474     }
475     else
476     {
477         char *ss = sample_fname, *se = strchr(ss+1,',');
478         if ( !se ) error("Could not parse hap/legend/sample file names: %s\n", args->infname);
479         *ss = 0;
480         *se = 0;
481         hap_fname = strdup(args->infname);
482         leg_fname = strdup(ss+1);
483         sample_fname = strdup(se+1);
484     }
485     htsFile *hap_fh = hts_open(hap_fname, "r");
486     if ( !hap_fh ) error("Could not read: %s\n", hap_fname);
487 
488     htsFile *leg_fh = hts_open(leg_fname,"r");
489     if ( !leg_fh ) error("Could not read: %s\n", leg_fname);
490 
491     // Eat up first legend line, then determine chromosome name
492     if ( hts_getline(leg_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", leg_fname);
493     if ( hts_getline(leg_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", leg_fname);
494 
495     // Find out the chromosome name, sample names, init and print the VCF header
496     args->str.l = 0;
497     char *se = strchr(line.s,':');
498     if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", leg_fname);
499     kputsn(line.s, se-line.s, &args->str);
500 
501     tsv_t *leg_tsv = tsv_init("CHROM_POS_REF_ALT,POS,REF_ALT");
502     tsv_register(leg_tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
503     tsv_register(leg_tsv, "POS", tsv_setter_verify_pos, NULL);
504     tsv_register(leg_tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
505 
506     tsv_t *hap_tsv = tsv_init("HAPS");
507     tsv_register(hap_tsv, "HAPS", tsv_setter_haps, args);
508 
509     args->header = bcf_hdr_init("w");
510     bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
511     bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
512     bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff);   // MAX_CSI_COOR
513     if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
514 
515     int i, nrows, nsamples;
516     char **samples = hts_readlist(sample_fname, 1, &nrows);
517     if ( !samples ) error("Could not read %s\n", sample_fname);
518     nsamples = nrows - 1;
519 
520     // sample_fname should contain a header line, so need to ignore first row
521     // returned from hts_readlist (i=1, and not i=0)
522     for (i=1; i<nrows; i++)
523     {
524         se = samples[i]; while ( *se && !isspace(*se) ) se++;
525         *se = 0;
526         bcf_hdr_add_sample(args->header,samples[i]);
527     }
528     bcf_hdr_add_sample(args->header,NULL);
529     for (i=0; i<nrows; i++) free(samples[i]);
530     free(samples);
531 
532     char wmode[8];
533     set_wmode(wmode,args->output_type,args->outfname,args->clevel);
534     htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
535     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
536     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
537     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
538     bcf1_t *rec = bcf_init();
539 
540     args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
541 
542     while (1)
543     {
544         bcf_clear(rec);
545         args->n.total++;
546         if ( tsv_parse(leg_tsv, rec, line.s) )
547             error("Error occurred while parsing %s: %s\n", leg_fname,line.s);
548 
549         if ( hts_getline(hap_fh,  KS_SEP_LINE, &line)<=0 )
550             error("Different number of records in %s and %s?\n", leg_fname,hap_fname);
551 
552         if ( tsv_parse(hap_tsv, rec, line.s) )
553             error("Error occurred while parsing %s: %s\n", hap_fname,line.s);
554 
555         if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
556 
557         if ( hts_getline(leg_fh, KS_SEP_LINE, &line)<=0 )
558         {
559             if ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 )
560                 error("Different number of records in %s and %s?\n", leg_fname,hap_fname);
561             break;
562         }
563     }
564 
565     if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
566     if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
567     if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
568     bcf_hdr_destroy(args->header);
569     bcf_destroy(rec);
570     free(sample_fname);
571     free(hap_fname);
572     free(leg_fname);
573     free(args->str.s);
574     free(line.s);
575     free(args->gts);
576     tsv_destroy(hap_tsv);
577     tsv_destroy(leg_tsv);
578 
579     fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total);
580 }
581 
hapsample_to_vcf(args_t * args)582 static void hapsample_to_vcf(args_t *args)
583 {
584     /*
585      *  Input: SHAPEIT output
586      *
587      *      20:19995888_A_G 20:19995888 19995888 A G 0 0 0 0 ...
588      *
589      *  First column is expected in the form of CHROM:POS_REF_ALT
590      *
591      *  Output: VCF with filled GT
592      *
593      */
594     kstring_t line = {0,0,0};
595 
596     char *hap_fname = NULL, *sample_fname = NULL;
597     sample_fname = strchr(args->infname,',');
598     if ( !sample_fname )
599     {
600         args->str.l = 0;
601         ksprintf(&args->str,"%s.hap.gz", args->infname);
602         hap_fname = strdup(args->str.s);
603         args->str.l = 0;
604         ksprintf(&args->str,"%s.samples", args->infname);
605         sample_fname = strdup(args->str.s);
606     }
607     else
608     {
609         *sample_fname = 0;
610         hap_fname = strdup(args->infname);
611         sample_fname = strdup(sample_fname+1);
612     }
613     htsFile *hap_fh = hts_open(hap_fname, "r");
614     if ( !hap_fh ) error("Could not read: %s\n", hap_fname);
615     if ( hts_getline(hap_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", hap_fname);
616 
617     // Find out the chromosome name, sample names, init and print the VCF header
618     args->str.l = 0;
619     char *se = strchr(line.s,':');
620     if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", hap_fname);
621     kputsn(line.s, se-line.s, &args->str);
622 
623     tsv_t *tsv = tsv_init("CHROM_POS_REF_ALT,-,POS,REF_ALT,HAPS");
624     tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args);
625     tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL);
626     tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args);
627     tsv_register(tsv, "HAPS", tsv_setter_haps, args);
628 
629     args->header = bcf_hdr_init("w");
630     bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
631     bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
632     bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff);   // MAX_CSI_COOR
633     if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
634 
635     int i, nsamples;
636     char **samples = hts_readlist(sample_fname, 1, &nsamples);
637     if ( !samples ) error("Could not read %s\n", sample_fname);
638     for (i=2; i<nsamples; i++)
639     {
640         se = samples[i]; while ( *se && !isspace(*se) ) se++;
641         *se = 0;
642         bcf_hdr_add_sample(args->header,samples[i]);
643     }
644     bcf_hdr_add_sample(args->header,NULL);
645     for (i=0; i<nsamples; i++) free(samples[i]);
646     free(samples);
647 
648     char wmode[8];
649     set_wmode(wmode,args->output_type,args->outfname,args->clevel);
650     htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
651     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
652     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
653     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
654     bcf1_t *rec = bcf_init();
655 
656     nsamples -= 2;
657     args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
658 
659     do
660     {
661         bcf_clear(rec);
662         args->n.total++;
663         if ( !tsv_parse(tsv, rec, line.s) )
664         {
665             if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
666         }
667         else
668             error("Error occurred while parsing: %s\n", line.s);
669     }
670     while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
671 
672     if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
673     if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
674     bcf_hdr_destroy(args->header);
675     bcf_destroy(rec);
676     free(sample_fname);
677     free(hap_fname);
678     free(args->str.s);
679     free(line.s);
680     free(args->gts);
681     tsv_destroy(tsv);
682 
683     fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total);
684 }
685 
init_sample2sex(bcf_hdr_t * hdr,char * sex_fname)686 char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
687 {
688     int i, nlines;
689     char *sample2sex = (char*) calloc(bcf_hdr_nsamples(hdr),1);
690     char **lines = hts_readlist(sex_fname, 1, &nlines);
691     if ( !lines ) error("Could not read %s\n", sex_fname);
692     for (i=0; i<nlines; i++)
693     {
694         char *se = lines[i]; while ( *se && !isspace(*se) ) se++;
695         char tmp = *se;
696         *se = 0;
697         int id = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, lines[i]);
698         *se = tmp;
699         if ( id<0 ) continue;
700         while ( *se && isspace(*se) ) se++;
701         if ( *se=='M' ) sample2sex[id] = '1';
702         else if ( *se=='F' ) sample2sex[id] = '2';
703         else error("Could not parse %s: %s\n", sex_fname,lines[i]);
704     }
705     for (i=0; i<nlines; i++) free(lines[i]);
706     free(lines);
707     for (i=0; i<bcf_hdr_nsamples(hdr); i++)
708         if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
709     return sample2sex;
710 }
711 
vcf_to_gensample(args_t * args)712 static void vcf_to_gensample(args_t *args)
713 {
714     kstring_t str = {0,0,0};
715 
716     // insert chrom as first column if needed
717     if(args->output_chrom_first_col)
718         kputs("%CHROM ", &str);
719     else
720         kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str);
721 
722     // insert rsid as second column if needed
723     if(args->output_vcf_ids)
724         kputs("%ID ", &str);
725     else
726         kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str);
727 
728     kputs("%POS %REF %FIRST_ALT", &str);
729     if ( !args->tag || !strcmp(args->tag,"GT") ) kputs("%_GT_TO_PROB3",&str);
730     else if ( !strcmp(args->tag,"PL") ) kputs("%_PL_TO_PROB3",&str);
731     else if ( !strcmp(args->tag,"GP") ) kputs("%_GP_TO_PROB3",&str);
732     else error("todo: --tag %s\n", args->tag);
733     kputs("\n", &str);
734     open_vcf(args,str.s);
735 
736     int ret, gen_compressed = 1, sample_compressed = 0;
737     char *gen_fname = NULL, *sample_fname = NULL;
738     str.l = 0;
739     kputs(args->outfname,&str);
740     int n_files = 0, i;
741     char **files = hts_readlist(str.s, 0, &n_files);
742     if ( n_files==1 )
743     {
744         int l = str.l;
745         kputs(".samples",&str);
746         sample_fname = strdup(str.s);
747         str.l = l;
748         kputs(".gen.gz",&str);
749         gen_fname = strdup(str.s);
750     }
751     else if ( n_files==2 )
752     {
753         if (strlen(files[0]) && strcmp(files[0],".")!=0) gen_fname = strdup(files[0]);
754         if (strlen(files[1]) && strcmp(files[1],".")!=0) sample_fname = strdup(files[1]);
755     }
756     else
757     {
758         error("Error parsing --gensample filenames: %s\n", args->outfname);
759     }
760     for (i=0; i<n_files; i++) free(files[i]);
761     free(files);
762 
763     if ( gen_fname && (strlen(gen_fname)<3 || strcasecmp(".gz",gen_fname+strlen(gen_fname)-3)) ) gen_compressed = 0;
764     if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
765 
766     if (gen_fname) fprintf(bcftools_stderr, "Gen file: %s\n", gen_fname);
767     if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname);
768 
769     // write samples file
770     if (sample_fname)
771     {
772         char *sample2sex = NULL;
773         if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
774 
775         int i;
776         BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
777         str.l = 0;
778         kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
779         ret = bgzf_write(sout, str.s, str.l);
780         if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
781         for (i=0; i<bcf_hdr_nsamples(args->header); i++)
782         {
783             str.l = 0;
784             if ( sample2sex )
785                 ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
786             else
787                 ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
788             ret = bgzf_write(sout, str.s, str.l);
789             if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
790         }
791         if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
792         free(sample_fname);
793         free(sample2sex);
794     }
795     if (!gen_fname) {
796         if ( str.m ) free(str.s);
797         return;
798     }
799 
800     int prev_rid = -1, prev_pos = -1;
801     int no_alt = 0, non_biallelic = 0, filtered = 0, ndup = 0, nok = 0;
802     BGZF *gout = bgzf_open(gen_fname, gen_compressed ? "wg" : "wu");
803     while ( bcf_sr_next_line(args->files) )
804     {
805         bcf1_t *line = bcf_sr_get_line(args->files,0);
806         if ( args->filter )
807         {
808             int pass = filter_test(args->filter, line, NULL);
809             if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
810             if ( !pass ) { filtered++; continue; }
811         }
812 
813         // ALT allele is required
814         if ( line->n_allele<2 ) { no_alt++; continue; }
815 
816         // biallelic required
817         if ( line->n_allele>2 ) {
818             if (!non_biallelic)
819                 fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
820             non_biallelic++;
821             continue;
822         }
823 
824         // skip duplicate lines, or otherwise shapeit complains
825         if ( !args->keep_duplicates && prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
826         prev_rid = line->rid;
827         prev_pos = line->pos;
828 
829         str.l = 0;
830         convert_line(args->convert, line, &str);
831         if ( str.l )
832         {
833             int ret = bgzf_write(gout, str.s, str.l);
834             if ( ret!= str.l ) error("Error writing %s: %s\n", gen_fname,strerror(errno));
835             nok++;
836         }
837     }
838     fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
839         nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
840 
841     if ( str.m ) free(str.s);
842     if ( bgzf_close(gout)!=0 ) error("Error closing %s: %s\n", gen_fname,strerror(errno));
843     free(gen_fname);
844 }
845 
vcf_to_haplegendsample(args_t * args)846 static void vcf_to_haplegendsample(args_t *args)
847 {
848     kstring_t str = {0,0,0};
849     if ( args->hap2dip )
850         kputs("%_GT_TO_HAP2\n", &str);
851     else
852         kputs("%_GT_TO_HAP\n", &str);
853     open_vcf(args,str.s);
854 
855     int ret, hap_compressed = 1, legend_compressed = 1, sample_compressed = 0;
856     char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL;
857     str.l = 0;
858     kputs(args->outfname,&str);
859     int n_files = 0, i;
860     char **files = hts_readlist(str.s, 0, &n_files);
861     if ( n_files==1 )
862     {
863         int l = str.l;
864         kputs(".samples",&str);
865         sample_fname = strdup(str.s);
866         str.l = l;
867         kputs(".legend.gz",&str);
868         legend_fname = strdup(str.s);
869         str.l = l;
870         kputs(".hap.gz",&str);
871         hap_fname = strdup(str.s);
872     }
873     else if ( n_files==3 )
874     {
875         if (strlen(files[0]) && strcmp(files[0],".")!=0) hap_fname = strdup(files[0]);
876         if (strlen(files[1]) && strcmp(files[1],".")!=0) legend_fname = strdup(files[1]);
877         if (strlen(files[2]) && strcmp(files[2],".")!=0) sample_fname = strdup(files[2]);
878     }
879     else
880     {
881         error("Error parsing --hapslegendsample filenames: %s\n", args->outfname);
882     }
883     for (i=0; i<n_files; i++) free(files[i]);
884     free(files);
885 
886     if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
887     if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0;
888     if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
889 
890     if (hap_fname) fprintf(bcftools_stderr, "Hap file: %s\n", hap_fname);
891     if (legend_fname) fprintf(bcftools_stderr, "Legend file: %s\n", legend_fname);
892     if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname);
893 
894     // write samples file
895     if (sample_fname)
896     {
897         char *sample2sex = NULL;
898         if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
899 
900         int i;
901         BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
902         str.l = 0;
903         kputs("sample population group sex\n", &str);
904         ret = bgzf_write(sout, str.s, str.l);
905         if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
906         for (i=0; i<bcf_hdr_nsamples(args->header); i++)
907         {
908             str.l = 0;
909             ksprintf(&str, "%s %s %s %c\n", args->header->samples[i], args->header->samples[i], args->header->samples[i], sample2sex ? sample2sex[i] : '2');
910             ret = bgzf_write(sout, str.s, str.l);
911             if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
912         }
913         if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
914         free(sample_fname);
915         free(sample2sex);
916     }
917     if (!hap_fname && !legend_fname) {
918         if ( str.m ) free(str.s);
919         return;
920     }
921 
922     // open haps and legend outputs
923     BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
924     if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
925     BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL;
926     if (legend_fname) {
927         str.l = 0;
928         kputs("id position a0 a1\n", &str);
929         ret = bgzf_write(lout, str.s, str.l);
930         if ( ret != str.l ) error("Error writing %s: %s\n", legend_fname, strerror(errno));
931     }
932 
933     int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
934     while ( bcf_sr_next_line(args->files) )
935     {
936         bcf1_t *line = bcf_sr_get_line(args->files,0);
937         if ( args->filter )
938         {
939             int pass = filter_test(args->filter, line, NULL);
940             if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
941             if ( !pass ) { filtered++; continue; }
942         }
943 
944         // ALT allele is required
945         if ( line->n_allele<2 ) { no_alt++; continue; }
946         // biallelic required
947         if ( line->n_allele>2 ) {
948             if (!non_biallelic)
949                 fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
950             non_biallelic++;
951             continue;
952         }
953 
954         str.l = 0;
955         convert_line(args->convert, line, &str);
956         if ( !str.l ) continue;
957 
958         // write haps file
959         if (hap_fname) {
960             ret = bgzf_write(hout, str.s, str.l); // write hap file
961             if ( ret != str.l ) error("Error writing %s: %s\n", hap_fname, strerror(errno));
962         }
963         if (legend_fname) {
964             str.l = 0;
965             if ( args->output_vcf_ids && (line->d.id[0]!='.' || line->d.id[1]!=0) )
966                 ksprintf(&str, "%s %"PRId64" %s %s\n", line->d.id, (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]);
967             else
968                 ksprintf(&str, "%s:%"PRId64"_%s_%s %"PRId64" %s %s\n", bcf_seqname(args->header, line), (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1], (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]);
969 
970             // write legend file
971             ret = bgzf_write(lout, str.s, str.l);
972             if ( ret != str.l ) error("Error writing %s: %s\n", legend_fname, strerror(errno));
973         }
974         nok++;
975     }
976     fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
977     if ( str.m ) free(str.s);
978     if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
979     if ( lout && bgzf_close(lout)!=0 ) error("Error closing %s: %s\n", legend_fname, strerror(errno));
980     if (hap_fname) free(hap_fname);
981     if (legend_fname) free(legend_fname);
982 }
983 
vcf_to_hapsample(args_t * args)984 static void vcf_to_hapsample(args_t *args)
985 {
986     /*
987      *  WTCCC style haplotypes file
988      *  see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#hapsample
989      *
990      *  These are essentially the haplotypes from the impute2 format with some
991      *  legend info tacked on to the first 5 columns
992      *
993      */
994     kstring_t str = {0,0,0};
995 
996     // print ID instead of CHROM:POS_REF_ALT1
997     if ( args->output_vcf_ids )
998         kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str);
999     else
1000         kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
1001 
1002     if ( args->hap2dip )
1003         kputs("%_GT_TO_HAP2\n", &str);
1004     else
1005         kputs("%_GT_TO_HAP\n", &str);
1006     open_vcf(args,str.s);
1007 
1008     int ret, hap_compressed = 1, sample_compressed = 0;
1009     char *hap_fname = NULL, *sample_fname = NULL;
1010     str.l = 0;
1011     kputs(args->outfname,&str);
1012     int n_files = 0, i;
1013     char **files = hts_readlist(str.s, 0, &n_files);
1014     if ( n_files==1 )
1015     {
1016         int l = str.l;
1017         kputs(".samples",&str);
1018         sample_fname = strdup(str.s);
1019         str.l = l;
1020         kputs(".hap.gz",&str);
1021         hap_fname = strdup(str.s);
1022     }
1023     else if ( n_files==2 )
1024     {
1025         if (strlen(files[0]) && strcmp(files[0],".")!=0) hap_fname = strdup(files[0]);
1026         if (strlen(files[1]) && strcmp(files[1],".")!=0) sample_fname = strdup(files[1]);
1027     }
1028     else
1029     {
1030         error("Error parsing --hapsample filenames: %s\n", args->outfname);
1031     }
1032     for (i=0; i<n_files; i++) free(files[i]);
1033     free(files);
1034 
1035     if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
1036     if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
1037 
1038     if (hap_fname) fprintf(bcftools_stderr, "Hap file: %s\n", hap_fname);
1039     if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname);
1040 
1041     // write samples file
1042     if (sample_fname)
1043     {
1044         char *sample2sex = NULL;
1045         if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
1046 
1047         int i;
1048         BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
1049         str.l = 0;
1050         kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
1051         ret = bgzf_write(sout, str.s, str.l);
1052         if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
1053         for (i=0; i<bcf_hdr_nsamples(args->header); i++)
1054         {
1055             str.l = 0;
1056             if ( sample2sex )
1057                 ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
1058             else
1059                 ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
1060             ret = bgzf_write(sout, str.s, str.l);
1061             if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
1062         }
1063         if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
1064         free(sample_fname);
1065         free(sample2sex);
1066     }
1067     if (!hap_fname) {
1068         if ( str.m ) free(str.s);
1069         return;
1070     }
1071 
1072     // open haps output
1073     BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
1074     if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
1075 
1076     int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
1077     while ( bcf_sr_next_line(args->files) )
1078     {
1079         bcf1_t *line = bcf_sr_get_line(args->files,0);
1080         if ( args->filter )
1081         {
1082             int pass = filter_test(args->filter, line, NULL);
1083             if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
1084             if ( !pass ) { filtered++; continue; }
1085         }
1086 
1087         // ALT allele is required
1088         if ( line->n_allele<2 ) { no_alt++; continue; }
1089         // biallelic required
1090         if ( line->n_allele>2 ) {
1091             if (!non_biallelic)
1092                 fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n");
1093             non_biallelic++;
1094             continue;
1095         }
1096 
1097         str.l = 0;
1098         convert_line(args->convert, line, &str);
1099         if ( !str.l ) continue;
1100 
1101         // write haps file
1102         if (hap_fname) {
1103             ret = bgzf_write(hout, str.s, str.l); // write hap file
1104             if ( ret != str.l ) error("Error writing %s: %s\n", hap_fname, strerror(errno));
1105         }
1106         nok++;
1107     }
1108     fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered);
1109     if ( str.m ) free(str.s);
1110     if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno));
1111     if (hap_fname) free(hap_fname);
1112 }
1113 
bcf_hdr_set_chrs(bcf_hdr_t * hdr,faidx_t * fai)1114 static void bcf_hdr_set_chrs(bcf_hdr_t *hdr, faidx_t *fai)
1115 {
1116     int i, n = faidx_nseq(fai);
1117     for (i=0; i<n; i++)
1118     {
1119         const char *seq = faidx_iseq(fai,i);
1120         int len = faidx_seq_len(fai, seq);
1121         bcf_hdr_printf(hdr, "##contig=<ID=%s,length=%d>", seq,len);
1122     }
1123 }
acgt_to_5(char base)1124 static inline int acgt_to_5(char base)
1125 {
1126     if ( base=='A' ) return 0;
1127     if ( base=='C' ) return 1;
1128     if ( base=='G' ) return 2;
1129     if ( base=='T' ) return 3;
1130     return 4;
1131 }
tsv_setter_aa1(args_t * args,char * ss,char * se,int alleles[],int * nals,int ref,int32_t * gts)1132 static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[], int *nals, int ref, int32_t *gts)
1133 {
1134     if ( se - ss > 2 ) return -1;   // currently only SNPs
1135 
1136     if ( ss[0]=='-' )
1137     {
1138         // missing GT
1139         gts[0] = bcf_gt_missing;
1140         gts[1] = bcf_gt_missing;
1141         args->n.missing++;
1142         return 0;
1143     }
1144     if ( ss[0]=='I' ) return -2;    // skip insertions/deletions for now
1145     if ( ss[0]=='D' ) return -2;
1146 
1147     int a0 = acgt_to_5(toupper(ss[0]));
1148     int a1 = ss[1] ? acgt_to_5(toupper(ss[1])) : a0;
1149     if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
1150     if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
1151 
1152     gts[0] = bcf_gt_unphased(alleles[a0]);
1153     gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
1154 
1155     if ( ref==a0 && ref==a1  ) args->n.hom_rr++;    // hom ref: RR
1156     else if ( ref==a0 ) args->n.het_ra++;           // het: RA
1157     else if ( ref==a1 ) args->n.het_ra++;           // het: AR
1158     else if ( a0==a1 ) args->n.hom_aa++;            // hom-alt: AA
1159     else args->n.het_aa++;                          // non-ref het: AA
1160 
1161     return 0;
1162 }
tsv_setter_aa(tsv_t * tsv,bcf1_t * rec,void * usr)1163 static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
1164 {
1165     args_t *args = (args_t*) usr;
1166 
1167     int len;
1168     char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(args->header,rec->rid), rec->pos, rec->pos, &len);
1169     if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
1170 
1171     int nals = 1, alleles[5] = { -1, -1, -1, -1, -1 };    // a,c,g,t,n
1172     ref[0] = toupper(ref[0]);
1173     int iref = acgt_to_5(ref[0]);
1174     alleles[iref] = 0;
1175 
1176     rec->n_sample = bcf_hdr_nsamples(args->header);
1177 
1178     int i, ret;
1179     for (i=0; i<rec->n_sample; i++)
1180     {
1181         if ( i>0 )
1182         {
1183             ret = tsv_next(tsv);
1184             if ( ret==-1 ) error("Too few columns for %d samples at %s:%"PRId64"\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
1185         }
1186         ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
1187         if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
1188         if ( ret==-2 )
1189         {
1190             // something else than a SNP
1191             free(ref);
1192             return -1;
1193         }
1194     }
1195 
1196     args->str.l = 0;
1197     kputc(ref[0], &args->str);
1198     for (i=0; i<5; i++)
1199     {
1200         if ( alleles[i]>0 )
1201         {
1202             kputc(',', &args->str);
1203             kputc("ACGTN"[i], &args->str);
1204         }
1205     }
1206     bcf_update_alleles_str(args->header, rec, args->str.s);
1207     if ( bcf_update_genotypes(args->header,rec,args->gts,rec->n_sample*2) ) error("Could not update the GT field\n");
1208 
1209     free(ref);
1210     return 0;
1211 }
1212 
tsv_to_vcf(args_t * args)1213 static void tsv_to_vcf(args_t *args)
1214 {
1215     if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
1216     if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
1217 
1218     args->ref = fai_load(args->ref_fname);
1219     if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
1220 
1221     args->header = bcf_hdr_init("w");
1222     bcf_hdr_set_chrs(args->header, args->ref);
1223     bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
1224     if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
1225 
1226     int i, n;
1227     char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
1228     if ( !smpls ) error("Could not parse %s\n", args->sample_list);
1229     for (i=0; i<n; i++)
1230     {
1231         bcf_hdr_add_sample(args->header, smpls[i]);
1232         free(smpls[i]);
1233     }
1234     free(smpls);
1235     bcf_hdr_add_sample(args->header, NULL);
1236     args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
1237 
1238     char wmode[8];
1239     set_wmode(wmode,args->output_type,args->outfname,args->clevel);
1240     htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
1241     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
1242     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
1243     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1244 
1245     tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
1246     if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
1247     if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
1248     if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
1249     if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
1250 
1251     bcf1_t *rec = bcf_init();
1252     bcf_float_set_missing(rec->qual);
1253 
1254     kstring_t line = {0,0,0};
1255     htsFile *in_fh = hts_open(args->infname, "r");
1256     if ( !in_fh ) error("Could not read: %s\n", args->infname);
1257     while ( hts_getline(in_fh, KS_SEP_LINE, &line) > 0 )
1258     {
1259         if ( line.s[0]=='#' ) continue;     // skip comments
1260         bcf_clear(rec);
1261 
1262         args->n.total++;
1263         if ( !tsv_parse(tsv, rec, line.s) )
1264         {
1265             if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1266         }
1267         else
1268             args->n.skipped++;
1269     }
1270     if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
1271     free(line.s);
1272 
1273     bcf_hdr_destroy(args->header);
1274     if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
1275     tsv_destroy(tsv);
1276     bcf_destroy(rec);
1277     free(args->str.s);
1278     free(args->gts);
1279 
1280     fprintf(bcftools_stderr,"Rows total: \t%d\n", args->n.total);
1281     fprintf(bcftools_stderr,"Rows skipped: \t%d\n", args->n.skipped);
1282     fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing);
1283     fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr);
1284     fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra);
1285     fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa);
1286     fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa);
1287 }
1288 
vcf_to_vcf(args_t * args)1289 static void vcf_to_vcf(args_t *args)
1290 {
1291     open_vcf(args,NULL);
1292     char wmode[8];
1293     set_wmode(wmode,args->output_type,args->outfname,args->clevel);
1294     htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
1295     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
1296     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
1297 
1298     bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
1299     if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1300 
1301     while ( bcf_sr_next_line(args->files) )
1302     {
1303         bcf1_t *line = bcf_sr_get_line(args->files,0);
1304         if ( args->filter )
1305         {
1306             int pass = filter_test(args->filter, line, NULL);
1307             if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
1308             if ( !pass ) continue;
1309         }
1310         if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1311     }
1312     if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
1313 }
1314 
gvcf_to_vcf(args_t * args)1315 static void gvcf_to_vcf(args_t *args)
1316 {
1317     if ( !args->ref_fname ) error("--gvcf2vcf requires the --fasta-ref option\n");
1318 
1319     args->ref = fai_load(args->ref_fname);
1320     if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
1321 
1322     open_vcf(args,NULL);
1323     char wmode[8];
1324     set_wmode(wmode,args->output_type,args->outfname,args->clevel);
1325     htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
1326     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
1327     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
1328 
1329     bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
1330     if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
1331     if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1332 
1333     int32_t *itmp = NULL, nitmp = 0;
1334 
1335     while ( bcf_sr_next_line(args->files) )
1336     {
1337         bcf1_t *line = bcf_sr_get_line(args->files,0);
1338         if ( args->filter )
1339         {
1340             int pass = filter_test(args->filter, line, NULL);
1341             if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
1342             if ( !pass )
1343             {
1344                 if ( bcf_write(out_fh,hdr,line)!=0  ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1345                 continue;
1346             }
1347         }
1348 
1349         // check if alleles compatible with being a gVCF record
1350         // ALT must be one of ., <*>, <X>, <NON_REF>
1351         // check for INFO/END is below
1352         int i, gallele = -1;
1353         if (line->n_allele==1)
1354             gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
1355         else if ( line->d.allele[1][0]=='<' )
1356         {
1357             for (i=1; i<line->n_allele; i++)
1358             {
1359                 if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
1360                 if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF
1361                 if ( strcmp(line->d.allele[i],"<NON_REF>")==0 ) { gallele = i; break; }               // GATK gVCF
1362             }
1363         }
1364 
1365         // no gVCF compatible alleles
1366         if (gallele<0)
1367         {
1368             if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1369             continue;
1370         }
1371 
1372         int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
1373         if ( nend!=1 )
1374         {
1375             // No INFO/END => not gVCF record
1376             if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1377             continue;
1378         }
1379         bcf_update_info_int32(hdr,line,"END",NULL,0);
1380         int pos, len;
1381         for (pos=line->pos; pos<itmp[0]; pos++)
1382         {
1383             line->pos = pos;
1384             char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
1385             if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1);
1386             strncpy(line->d.allele[0],ref,len);
1387             if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1388             free(ref);
1389         }
1390     }
1391     free(itmp);
1392     if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
1393 }
1394 
usage(void)1395 static void usage(void)
1396 {
1397     fprintf(bcftools_stderr, "\n");
1398     fprintf(bcftools_stderr, "About:   Converts VCF/BCF to other formats and back. See man page for file\n");
1399     fprintf(bcftools_stderr, "         formats details. When specifying output files explicitly instead\n");
1400     fprintf(bcftools_stderr, "         of with PREFIX, one can use '-' for bcftools_stdout and '.' to suppress.\n");
1401     fprintf(bcftools_stderr, "Usage:   bcftools convert [OPTIONS] INPUT_FILE\n");
1402     fprintf(bcftools_stderr, "\n");
1403     fprintf(bcftools_stderr, "VCF input options:\n");
1404     fprintf(bcftools_stderr, "   -e, --exclude EXPR             Exclude sites for which the expression is true\n");
1405     fprintf(bcftools_stderr, "   -i, --include EXPR             Select sites for which the expression is true\n");
1406     fprintf(bcftools_stderr, "   -r, --regions REGION           Restrict to comma-separated list of regions\n");
1407     fprintf(bcftools_stderr, "   -R, --regions-file FILE        Restrict to regions listed in a file\n");
1408     fprintf(bcftools_stderr, "       --regions-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
1409     fprintf(bcftools_stderr, "   -s, --samples LIST             List of samples to include\n");
1410     fprintf(bcftools_stderr, "   -S, --samples-file FILE        File of samples to include\n");
1411     fprintf(bcftools_stderr, "   -t, --targets REGION           Similar to -r but streams rather than index-jumps\n");
1412     fprintf(bcftools_stderr, "   -T, --targets-file FILE        Similar to -R but streams rather than index-jumps\n");
1413     fprintf(bcftools_stderr, "       --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
1414     fprintf(bcftools_stderr, "\n");
1415     fprintf(bcftools_stderr, "VCF output options:\n");
1416     fprintf(bcftools_stderr, "       --no-version               Do not append version and command line to the header\n");
1417     fprintf(bcftools_stderr, "   -o, --output FILE              Output file name [bcftools_stdout]\n");
1418     fprintf(bcftools_stderr, "   -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
1419     fprintf(bcftools_stderr, "       --threads INT              Use multithreading with INT worker threads [0]\n");
1420     fprintf(bcftools_stderr, "\n");
1421     fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
1422     fprintf(bcftools_stderr, "   -G, --gensample2vcf ...        <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
1423     fprintf(bcftools_stderr, "   -g, --gensample ...            <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
1424     fprintf(bcftools_stderr, "       --tag STRING               Tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
1425     fprintf(bcftools_stderr, "       --chrom                    Output chromosome in first column instead of CHROM:POS_REF_ALT\n");
1426     fprintf(bcftools_stderr, "       --keep-duplicates          Keep duplicate positions\n");
1427     fprintf(bcftools_stderr, "       --sex FILE                 Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
1428     fprintf(bcftools_stderr, "       --vcf-ids                  Output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
1429     fprintf(bcftools_stderr, "\n");
1430     fprintf(bcftools_stderr, "gVCF conversion:\n");
1431     fprintf(bcftools_stderr, "       --gvcf2vcf                 Expand gVCF reference blocks\n");
1432     fprintf(bcftools_stderr, "   -f, --fasta-ref FILE           Reference sequence in fasta format\n");
1433     fprintf(bcftools_stderr, "\n");
1434     fprintf(bcftools_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
1435     fprintf(bcftools_stderr, "       --hapsample2vcf ...        <PREFIX>|<HAP-FILE>,<SAMPLE-FILE>\n");
1436     fprintf(bcftools_stderr, "       --hapsample ...            <PREFIX>|<HAP-FILE>,<SAMPLE-FILE>\n");
1437     fprintf(bcftools_stderr, "       --haploid2diploid          Convert haploid genotypes to diploid homozygotes\n");
1438     fprintf(bcftools_stderr, "       --sex FILE                 Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
1439     fprintf(bcftools_stderr, "       --vcf-ids                  Output VCF IDs instead of CHROM:POS_REF_ALT\n");
1440     fprintf(bcftools_stderr, "\n");
1441     fprintf(bcftools_stderr, "HAP/LEGEND/SAMPLE conversion:\n");
1442     fprintf(bcftools_stderr, "   -H, --haplegendsample2vcf ...  <PREFIX>|<HAP-FILE>,<LEGEND-FILE>,<SAMPLE-FILE>\n");
1443     fprintf(bcftools_stderr, "   -h, --haplegendsample ...      <PREFIX>|<HAP-FILE>,<LEGEND-FILE>,<SAMPLE-FILE>\n");
1444     fprintf(bcftools_stderr, "       --haploid2diploid          Convert haploid genotypes to diploid homozygotes\n");
1445     fprintf(bcftools_stderr, "       --sex FILE                 Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
1446     fprintf(bcftools_stderr, "       --vcf-ids                  Output VCF IDs instead of CHROM:POS_REF_ALT\n");
1447     fprintf(bcftools_stderr, "\n");
1448     fprintf(bcftools_stderr, "TSV conversion:\n");
1449     fprintf(bcftools_stderr, "       --tsv2vcf FILE\n");
1450     fprintf(bcftools_stderr, "   -c, --columns STRING           Columns of the input tsv file [ID,CHROM,POS,AA]\n");
1451     fprintf(bcftools_stderr, "   -f, --fasta-ref FILE           Reference sequence in fasta format\n");
1452     fprintf(bcftools_stderr, "   -s, --samples LIST             List of sample names\n");
1453     fprintf(bcftools_stderr, "   -S, --samples-file FILE        File of sample names\n");
1454     fprintf(bcftools_stderr, "\n");
1455     // fprintf(bcftools_stderr, "PLINK options:\n");
1456     // fprintf(bcftools_stderr, "   -p, --plink <prefix>|<ped>,<map>,<fam>|<bed>,<bim>,<fam>|<tped>,<tfam>\n");
1457     // fprintf(bcftools_stderr, "       --tped              make tped file instead\n");
1458     // fprintf(bcftools_stderr, "       --bin               make binary bed/fam/bim files\n");
1459     // fprintf(bcftools_stderr, "\n");
1460     // fprintf(bcftools_stderr, "PBWT options:\n");
1461     // fprintf(bcftools_stderr, "   -b, --pbwt          <prefix> or <pbwt>,<sites>,<sample>,<missing>\n");
1462     // fprintf(bcftools_stderr, "\n");
1463     bcftools_exit(1);
1464 }
1465 
main_vcfconvert(int argc,char * argv[])1466 int main_vcfconvert(int argc, char *argv[])
1467 {
1468     int c;
1469     args_t *args = (args_t*) calloc(1,sizeof(args_t));
1470     args->argc   = argc; args->argv = argv;
1471     args->outfname = "-";
1472     args->output_type = FT_VCF;
1473     args->n_threads = 0;
1474     args->record_cmd_line = 1;
1475     args->regions_overlap = 1;
1476     args->targets_overlap = 0;
1477     args->clevel = -1;
1478 
1479     static struct option loptions[] =
1480     {
1481         {"include",required_argument,NULL,'i'},
1482         {"exclude",required_argument,NULL,'e'},
1483         {"output",required_argument,NULL,'o'},
1484         {"output-type",required_argument,NULL,'O'},
1485         {"threads",required_argument,NULL,9},
1486         {"regions",required_argument,NULL,'r'},
1487         {"regions-file",required_argument,NULL,'R'},
1488         {"regions-overlap",required_argument,NULL,13},
1489         {"targets",required_argument,NULL,'t'},
1490         {"targets-file",required_argument,NULL,'T'},
1491         {"targets-overlap",required_argument,NULL,14},
1492         {"samples",required_argument,NULL,'s'},
1493         {"samples-file",required_argument,NULL,'S'},
1494         {"sex",required_argument,NULL,11},
1495         {"gensample",required_argument,NULL,'g'},
1496         {"gensample2vcf",required_argument,NULL,'G'},
1497         {"tag",required_argument,NULL,1},
1498         {"chrom",no_argument,NULL,8},
1499         {"tsv2vcf",required_argument,NULL,2},
1500         {"hapsample",required_argument,NULL,7},
1501         {"hapsample2vcf",required_argument,NULL,3},
1502         {"vcf-ids",no_argument,NULL,4},
1503         {"haploid2diploid",no_argument,NULL,5},
1504         {"gvcf2vcf",no_argument,NULL,6},
1505         {"haplegendsample",required_argument,NULL,'h'},
1506         {"haplegendsample2vcf",required_argument,NULL,'H'},
1507         {"columns",required_argument,NULL,'c'},
1508         {"fasta-ref",required_argument,NULL,'f'},
1509         {"no-version",no_argument,NULL,10},
1510         {"keep-duplicates",no_argument,NULL,12},
1511         {NULL,0,NULL,0}
1512     };
1513     char *tmp;
1514     while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
1515         switch (c) {
1516             case 'e':
1517                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1518                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
1519             case 'i':
1520                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
1521                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
1522             case 'r': args->regions_list = optarg; break;
1523             case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
1524             case 't': args->targets_list = optarg; break;
1525             case 'T': args->targets_list = optarg; args->targets_is_file = 1; break;
1526             case 's': args->sample_list = optarg; break;
1527             case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
1528             case 'g': args->convert_func = vcf_to_gensample; args->outfname = optarg; break;
1529             case 'G': args->convert_func = gensample_to_vcf; args->infname = optarg; break;
1530             case  1 : args->tag = optarg; break;
1531             case  2 : args->convert_func = tsv_to_vcf; args->infname = optarg; break;
1532             case  3 : args->convert_func = hapsample_to_vcf; args->infname = optarg; break;
1533             case  4 : args->output_vcf_ids = 1; break;
1534             case  5 : args->hap2dip = 1; break;
1535             case  6 : args->convert_func = gvcf_to_vcf; break;
1536             case  7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
1537             case  8 : args->output_chrom_first_col = 1; break;
1538             case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
1539             case 'f': args->ref_fname = optarg; break;
1540             case 'c': args->columns = optarg; break;
1541             case 'o': args->outfname = optarg; break;
1542             case 'O':
1543                 switch (optarg[0]) {
1544                     case 'b': args->output_type = FT_BCF_GZ; break;
1545                     case 'u': args->output_type = FT_BCF; break;
1546                     case 'z': args->output_type = FT_VCF_GZ; break;
1547                     case 'v': args->output_type = FT_VCF; break;
1548                     default:
1549                     {
1550                         args->clevel = strtol(optarg,&tmp,10);
1551                         if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
1552                     }
1553                 }
1554                 if ( optarg[1] )
1555                 {
1556                     args->clevel = strtol(optarg+1,&tmp,10);
1557                     if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
1558                 }
1559                 break;
1560             case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
1561             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
1562             case 10 : args->record_cmd_line = 0; break;
1563             case 11 : args->sex_fname = optarg; break;
1564             case 12 : args->keep_duplicates = 1; break;
1565             case 13 :
1566                 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
1567                 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
1568                 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
1569                 else error("Could not parse: --regions-overlap %s\n",optarg);
1570                 break;
1571             case 14 :
1572                 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
1573                 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
1574                 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
1575                 else error("Could not parse: --targets-overlap %s\n",optarg);
1576                 break;
1577             case '?': usage(); break;
1578             default: error("Unknown argument: %s\n", optarg);
1579         }
1580     }
1581 
1582     if ( !args->infname )
1583     {
1584         if ( optind>=argc )
1585         {
1586             if ( !isatty(fileno((FILE *)stdin)) ) args->infname = "-";
1587         }
1588         else args->infname = argv[optind];
1589     }
1590     if ( !args->infname ) usage();
1591 
1592     if ( args->convert_func ) args->convert_func(args);
1593     else vcf_to_vcf(args);
1594 
1595     destroy_data(args);
1596     free(args);
1597     return 0;
1598 }
1599