1 /*  test/test-vcf-api.c -- VCF test harness.
2 
3     Copyright (C) 2013, 2014 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #include <config.h>
26 
27 #include <stdio.h>
28 #include <htslib/hts.h>
29 #include <htslib/vcf.h>
30 #include <htslib/kstring.h>
31 #include <htslib/kseq.h>
32 
write_bcf(char * fname)33 void write_bcf(char *fname)
34 {
35     // Init
36     htsFile *fp    = hts_open(fname,"wb");
37     bcf_hdr_t *hdr = bcf_hdr_init("w");
38     bcf1_t *rec    = bcf_init1();
39 
40     // Create VCF header
41     kstring_t str = {0,0,0};
42     bcf_hdr_append(hdr, "##fileDate=20090805");
43     bcf_hdr_append(hdr, "##FORMAT=<ID=UF,Number=1,Type=Integer,Description=\"Unused FORMAT\">");
44     bcf_hdr_append(hdr, "##INFO=<ID=UI,Number=1,Type=Integer,Description=\"Unused INFO\">");
45     bcf_hdr_append(hdr, "##FILTER=<ID=Flt,Description=\"Unused FILTER\">");
46     bcf_hdr_append(hdr, "##unused=<XX=AA,Description=\"Unused generic\">");
47     bcf_hdr_append(hdr, "##unused=unformatted text 1");
48     bcf_hdr_append(hdr, "##unused=unformatted text 2");
49     bcf_hdr_append(hdr, "##contig=<ID=Unused,length=62435964>");
50     bcf_hdr_append(hdr, "##source=myImputationProgramV3.1");
51     bcf_hdr_append(hdr, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta");
52     bcf_hdr_append(hdr, "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>");
53     bcf_hdr_append(hdr, "##phasing=partial");
54     bcf_hdr_append(hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
55     bcf_hdr_append(hdr, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
56     bcf_hdr_append(hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
57     bcf_hdr_append(hdr, "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">");
58     bcf_hdr_append(hdr, "##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership, build 129\">");
59     bcf_hdr_append(hdr, "##INFO=<ID=H2,Number=0,Type=Flag,Description=\"HapMap2 membership\">");
60     bcf_hdr_append(hdr, "##FILTER=<ID=q10,Description=\"Quality below 10\">");
61     bcf_hdr_append(hdr, "##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">");
62     bcf_hdr_append(hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
63     bcf_hdr_append(hdr, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
64     bcf_hdr_append(hdr, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
65     bcf_hdr_append(hdr, "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">");
66     bcf_hdr_append(hdr, "##FORMAT=<ID=TS,Number=1,Type=String,Description=\"Test String\">");
67 
68     bcf_hdr_add_sample(hdr, "NA00001");
69     bcf_hdr_add_sample(hdr, "NA00002");
70     bcf_hdr_add_sample(hdr, "NA00003");
71     bcf_hdr_add_sample(hdr, NULL);      // to update internal structures
72     bcf_hdr_write(fp, hdr);
73 
74 
75     // Add a record
76     // 20     14370   rs6054257 G      A       29   PASS   NS=3;DP=14;AF=0.5;DB;H2           GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
77     // .. CHROM
78     rec->rid = bcf_hdr_name2id(hdr, "20");
79     // .. POS
80     rec->pos = 14369;
81     // .. ID
82     bcf_update_id(hdr, rec, "rs6054257");
83     // .. REF and ALT
84     bcf_update_alleles_str(hdr, rec, "G,A");
85     // .. QUAL
86     rec->qual = 29;
87     // .. FILTER
88     int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
89     bcf_update_filter(hdr, rec, &tmpi, 1);
90     // .. INFO
91     tmpi = 3;
92     bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);
93     tmpi = 14;
94     bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
95     float tmpf = 0.5;
96     bcf_update_info_float(hdr, rec, "AF", &tmpf, 1);
97     bcf_update_info_flag(hdr, rec, "DB", NULL, 1);
98     bcf_update_info_flag(hdr, rec, "H2", NULL, 1);
99     // .. FORMAT
100     int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int));
101     tmpia[0] = bcf_gt_phased(0);
102     tmpia[1] = bcf_gt_phased(0);
103     tmpia[2] = bcf_gt_phased(1);
104     tmpia[3] = bcf_gt_phased(0);
105     tmpia[4] = bcf_gt_unphased(1);
106     tmpia[5] = bcf_gt_unphased(1);
107     bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2);
108     tmpia[0] = 48;
109     tmpia[1] = 48;
110     tmpia[2] = 43;
111     bcf_update_format_int32(hdr, rec, "GQ", tmpia, bcf_hdr_nsamples(hdr));
112     tmpia[0] = 1;
113     tmpia[1] = 8;
114     tmpia[2] = 5;
115     bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr));
116     tmpia[0] = 51;
117     tmpia[1] = 51;
118     tmpia[2] = 51;
119     tmpia[3] = 51;
120     tmpia[4] = bcf_int32_missing;
121     tmpia[5] = bcf_int32_missing;
122     bcf_update_format_int32(hdr, rec, "HQ", tmpia, bcf_hdr_nsamples(hdr)*2);
123     char *tmp_str[] = {"String1","SomeOtherString2","YetAnotherString3"};
124     bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3);
125     bcf_write1(fp, hdr, rec);
126 
127     // 20     1110696 . A      G,T     67   .   NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1   ./.
128     bcf_clear1(rec);
129     rec->rid = bcf_hdr_name2id(hdr, "20");
130     rec->pos = 1110695;
131     bcf_update_alleles_str(hdr, rec, "A,G,T");
132     rec->qual = 67;
133     tmpi = 2;
134     bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);
135     tmpi = 10;
136     bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
137     float *tmpfa = (float*)malloc(2*sizeof(float));
138     tmpfa[0] = 0.333;
139     bcf_float_set_missing(tmpfa[1]);
140     bcf_update_info_float(hdr, rec, "AF", tmpfa, 2);
141     bcf_update_info_string(hdr, rec, "AA", "T");
142     bcf_update_info_flag(hdr, rec, "DB", NULL, 1);
143     tmpia[0] = bcf_gt_phased(2);
144     tmpia[1] = bcf_int32_vector_end;
145     tmpia[2] = bcf_gt_phased(1);
146     tmpia[3] = bcf_int32_vector_end;
147     tmpia[4] = bcf_gt_missing;
148     tmpia[5] = bcf_gt_missing;
149     bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2);
150     bcf_write1(fp, hdr, rec);
151 
152     free(tmpia);
153     free(tmpfa);
154 
155     // Clean
156     free(str.s);
157     bcf_destroy1(rec);
158     bcf_hdr_destroy(hdr);
159     int ret;
160     if ( (ret=hts_close(fp)) )
161     {
162         fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
163         exit(ret);
164     }
165 }
166 
bcf_to_vcf(char * fname)167 void bcf_to_vcf(char *fname)
168 {
169     htsFile *fp    = hts_open(fname,"rb");
170     bcf_hdr_t *hdr = bcf_hdr_read(fp);
171     bcf1_t *rec    = bcf_init1();
172 
173     char *gz_fname = (char*) malloc(strlen(fname)+4);
174     snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname);
175     htsFile *out   = hts_open(gz_fname,"wg");
176 
177     bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr);
178     bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused");
179     bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused");
180     bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt");
181     bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI");
182     bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF");
183     bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused");
184     bcf_hdr_write(out, hdr_out);
185 
186     while ( bcf_read1(fp, hdr, rec)>=0 )
187     {
188         bcf_write1(out, hdr_out, rec);
189 
190         // Test problems caused by bcf1_sync: the data block
191         // may be realloced, also the unpacked structures must
192         // get updated.
193         bcf_unpack(rec, BCF_UN_STR);
194         bcf_update_id(hdr, rec, 0);
195         bcf_update_format_int32(hdr, rec, "GQ", NULL, 0);
196 
197         bcf1_t *dup = bcf_dup(rec);     // force bcf1_sync call
198         bcf_write1(out, hdr_out, dup);
199         bcf_destroy1(dup);
200 
201         bcf_update_alleles_str(hdr_out, rec, "G,A");
202         int32_t tmpi = 99;
203         bcf_update_info_int32(hdr_out, rec, "DP", &tmpi, 1);
204         int32_t tmpia[] = {9,9,9};
205         bcf_update_format_int32(hdr_out, rec, "DP", tmpia, 3);
206 
207         bcf_write1(out, hdr_out, rec);
208     }
209 
210     bcf_destroy1(rec);
211     bcf_hdr_destroy(hdr);
212     bcf_hdr_destroy(hdr_out);
213     int ret;
214     if ( (ret=hts_close(fp)) )
215     {
216         fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
217         exit(ret);
218     }
219     if ( (ret=hts_close(out)) )
220     {
221         fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
222         exit(ret);
223     }
224 
225 
226     // read gzip, write stdout
227     htsFile *gz_in = hts_open(gz_fname, "r");
228     if ( !gz_in )
229     {
230         fprintf(stderr,"Could not read: %s\n", gz_fname);
231         exit(1);
232     }
233 
234     kstring_t line = {0,0,0};
235     while ( hts_getline(gz_in, KS_SEP_LINE, &line)>0 )
236     {
237         kputc('\n',&line);
238         fwrite(line.s,1,line.l,stdout);
239     }
240 
241     if ( (ret=hts_close(gz_in)) )
242     {
243         fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
244         exit(ret);
245     }
246     free(line.s);
247     free(gz_fname);
248 }
249 
iterator(const char * fname)250 void iterator(const char *fname)
251 {
252     htsFile *fp = hts_open(fname, "r");
253     bcf_hdr_t *hdr = bcf_hdr_read(fp);
254     hts_idx_t *idx;
255     hts_itr_t *iter;
256 
257     bcf_index_build(fname, 0);
258     idx = bcf_index_load(fname);
259 
260     iter = bcf_itr_queryi(idx, bcf_hdr_name2id(hdr, "20"), 1110600, 1110800);
261     bcf_itr_destroy(iter);
262 
263     iter = bcf_itr_querys(idx, hdr, "20:1110600-1110800");
264     bcf_itr_destroy(iter);
265 
266     hts_idx_destroy(idx);
267     bcf_hdr_destroy(hdr);
268     int ret;
269     if ( (ret=hts_close(fp)) )
270     {
271         fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
272         exit(ret);
273     }
274 }
275 
test_get_info_values(const char * fname)276 void test_get_info_values(const char *fname)
277 {
278     htsFile *fp = hts_open(fname, "r");
279     bcf_hdr_t *hdr = bcf_hdr_read(fp);
280     bcf1_t *line = bcf_init();
281 
282     while (bcf_read(fp, hdr, line) == 0)
283     {
284         float *afs = 0;
285         int count = 0;
286         int ret = bcf_get_info_float(hdr, line, "AF", &afs, &count);
287 
288         if (line->pos == 14369)
289         {
290             if (ret != 1 || afs[0] != 0.5f)
291             {
292                 fprintf(stderr, "AF on position 14370 should be 0.5\n");
293                 exit(-1);
294             }
295         }
296         else
297         {
298             if (ret != 2 || afs[0] != 0.333f || !bcf_float_is_missing(afs[1]))
299             {
300                 fprintf(stderr, "AF on position 1110696 should be 0.333, missing\n");
301                 exit(-1);
302             }
303         }
304 
305         free(afs);
306     }
307 
308     bcf_destroy(line);
309     bcf_hdr_destroy(hdr);
310     hts_close(fp);
311 }
312 
write_format_values(const char * fname)313 void write_format_values(const char *fname)
314 {
315     // Init
316     htsFile *fp = hts_open(fname, "wb");
317     bcf_hdr_t *hdr = bcf_hdr_init("w");
318     bcf1_t *rec = bcf_init1();
319 
320     // Create VCF header
321     bcf_hdr_append(hdr, "##contig=<ID=1>");
322     bcf_hdr_append(hdr, "##FORMAT=<ID=TF,Number=1,Type=Float,Description=\"Test Float\">");
323     bcf_hdr_add_sample(hdr, "S");
324     bcf_hdr_add_sample(hdr, NULL); // to update internal structures
325     bcf_hdr_write(fp, hdr);
326 
327     // Add a record
328     // .. FORMAT
329     float test[4];
330     bcf_float_set_missing(test[0]);
331     test[1] = 47.11f;
332     bcf_float_set_vector_end(test[2]);
333     test[3] = -1.2e-13;
334     bcf_update_format_float(hdr, rec, "TF", test, 4);
335     bcf_write1(fp, hdr, rec);
336 
337     bcf_destroy1(rec);
338     bcf_hdr_destroy(hdr);
339     int ret;
340     if ((ret = hts_close(fp)))
341     {
342         fprintf(stderr, "hts_close(%s): non-zero status %d\n", fname, ret);
343         exit(ret);
344     }
345 }
346 
check_format_values(const char * fname)347 void check_format_values(const char *fname)
348 {
349     htsFile *fp = hts_open(fname, "r");
350     bcf_hdr_t *hdr = bcf_hdr_read(fp);
351     bcf1_t *line = bcf_init();
352 
353     while (bcf_read(fp, hdr, line) == 0)
354     {
355         float *values = 0;
356         int count = 0;
357         int ret = bcf_get_format_float(hdr, line, "TF", &values, &count);
358 
359         // NOTE the return value from bcf_get_format_float is different from
360         // bcf_get_info_float in the sense that vector-end markers also count.
361         if (ret != 4 ||
362             count < ret ||
363             !bcf_float_is_missing(values[0]) ||
364             values[1] != 47.11f ||
365             !bcf_float_is_vector_end(values[2]) ||
366             !bcf_float_is_vector_end(values[3]))
367         {
368             fprintf(stderr, "bcf_get_format_float didn't produce the expected output.\n");
369             exit(-1);
370         }
371 
372         free(values);
373     }
374 
375     bcf_destroy(line);
376     bcf_hdr_destroy(hdr);
377     hts_close(fp);
378 }
379 
test_get_format_values(const char * fname)380 void test_get_format_values(const char *fname)
381 {
382     write_format_values(fname);
383     check_format_values(fname);
384 }
385 
main(int argc,char ** argv)386 int main(int argc, char **argv)
387 {
388     char *fname = argc>1 ? argv[1] : "rmme.bcf";
389 
390     // format test. quiet unless there's a failure
391     test_get_format_values(fname);
392 
393     // main test. writes to stdout
394     write_bcf(fname);
395     bcf_to_vcf(fname);
396     iterator(fname);
397 
398     // additional tests. quiet unless there's a failure.
399     test_get_info_values(fname);
400 
401     return 0;
402 }
403