1 /*
2 * bcf_file.cpp
3 *
4 * Created on: Dec 11, 2012
5 * Author: amarcketta
6 */
7
8 #include "bcf_file.h"
9
10 char bcf_21[5] = {'B','C','F',(int8_t)2,(int8_t)1};
11 char bcf_22[5] = {'B','C','F',(int8_t)2,(int8_t)2};
12
bcf_file(const parameters & p,bool diff)13 bcf_file::bcf_file(const parameters &p, bool diff)
14 {
15 if(!diff)
16 filename = p.vcf_filename;
17 else
18 filename = p.diff_file;
19
20 big_endian = is_big_endian();
21 is_BGZF = false; stream = p.stream_in;
22 N_entries = 0; N_kept_entries = 0;
23 meta_data = header();
24
25 if (stream)
26 {
27 is_BGZF = true;
28 open_gz();
29 }
30 else
31 open();
32
33 check_bcf();
34 read_header();
35
36 include_indv = vector<bool>(meta_data.N_indv,true);
37 }
38
~bcf_file()39 bcf_file::~bcf_file()
40 {
41 close();
42 }
43
open()44 void bcf_file::open()
45 {
46 int ret;
47
48 if (filename.substr(filename.size()-4) == ".vcf")
49 LOG.error("Filename ends in '.vcf'. Shouldn't you be using --vcf?\n");
50
51 if (filename.substr(filename.size()-7) == ".vcf.gz")
52 LOG.error("Filename ends in '.vcf.gz'. Shouldn't you be using --gzvcf?\n");
53
54 ret = bgzf_is_bgzf(filename.c_str());
55
56 if (ret == 1)
57 is_BGZF = true;
58 else
59 is_BGZF = false;
60
61 if (is_BGZF)
62 open_gz();
63 else
64 {
65 file_tmp.open(filename.c_str(), ios::in);
66 if (!file_tmp.is_open())
67 LOG.error("Could not open VCF file: " + filename, 0);
68 file_in = &file_tmp;
69 }
70 }
71
open_gz()72 void bcf_file::open_gz()
73 {
74 int ret;
75 gzMAX_LINE_LEN = 1024*1024;
76
77 if (stream)
78 gzfile_in = gzdopen(fileno(stdin), "r");
79 else
80 gzfile_in = gzopen(filename.c_str(), "rb");
81
82 if (gzfile_in == NULL)
83 LOG.error("Could not open BGZF BCF file: " + filename, 0);
84 #ifdef ZLIB_VERNUM
85 string tmp(ZLIB_VERSION);
86 LOG.printLOG("Using zlib version: " + tmp + "\n");
87 #if (ZLIB_VERNUM >= 0x1240)
88 ret = gzbuffer(gzfile_in, gzMAX_LINE_LEN); // Included in zlib v1.2.4 and makes things MUCH faster
89 if (ret != 0)
90 LOG.warning("Unable to change zlib buffer size.");
91 #else
92 LOG.printLOG("Versions of zlib >= 1.2.4 will be *much* faster when reading compressed BCF files.\n");
93 #endif
94 #endif
95 }
96
check_bcf()97 void bcf_file::check_bcf()
98 {
99 char magic[5];
100
101 read(magic, 5, 1);
102 if ((strcmp(magic, bcf_21) == 0) && (strcmp(magic, bcf_22) == 0))
103 LOG.error("Does not appear to be a BCF file\n");
104 }
105
close()106 void bcf_file::close()
107 {
108 if (!stream && is_BGZF)
109 gzclose(gzfile_in);
110 }
111
get_entry(vector<char> & out)112 void bcf_file::get_entry(vector<char> &out)
113 {
114 uint32_t size_int[2];
115 int ret, read_size = 0;
116
117 ret = read(&size_int[0], 2, sizeof(uint32_t) );
118 read_size = size_int[0] + size_int[1];
119
120 if (ret)
121 {
122 out.resize(read_size+2*sizeof(uint32_t));
123 memcpy(&out[0], size_int, 2*sizeof(uint32_t));
124 read(&out[2*sizeof(uint32_t)], 1, read_size);
125 }
126 else
127 out.resize(0);
128 }
129
get_entry_object()130 entry* bcf_file::get_entry_object()
131 {
132 return new bcf_entry(meta_data, include_indv);
133 }
134
read(void * buffer,unsigned int len,size_t size)135 int bcf_file::read(void *buffer, unsigned int len, size_t size)
136 {
137 int ret;
138
139 if (is_BGZF)
140 {
141 ret = gzread(gzfile_in, buffer, size*len);
142 ret = (ret == (int)(len*size) );
143 }
144 else
145 {
146 file_in->read((char*)buffer, size*len);
147 ret = !file_in->eof();
148 }
149
150 if ((big_endian) && (size > 1)) // Note: don't both swapping character arrays - BCF is defined as little endian.
151 {
152 unsigned int ui;
153 for (ui=0; ui<len; ui++)
154 ByteSwap((unsigned char *)buffer+(size*ui), size);
155 }
156 return ret;
157 }
158
read_header()159 void bcf_file::read_header()
160 {
161 uint32_t len_text;
162
163 read(&len_text, 1, sizeof(uint32_t));
164 char *header_array = new char[(uint32_t)len_text];
165 read(header_array, len_text, 1);
166
167 string header(header_array);
168 delete [] header_array;
169 string line;
170 unsigned int line_index = 0;
171 header.erase( header.find_last_not_of(" \f\n\r\t\v\0" ) + 1 );
172
173 istringstream iss(header);
174 line_index += meta_data.add_FILTER_descriptor("ID=PASS,Description=PASS", line_index);
175
176 while (getline(iss, line))
177 {
178 if (line[0] == '#')
179 {
180 if (line[1] == '#')
181 meta_data.parse_meta(line, line_index);
182 else
183 meta_data.parse_header(line);
184 }
185 }
186 }
187
eof()188 bool bcf_file::eof()
189 {
190 if (is_BGZF)
191 return gzeof(gzfile_in);
192 else
193 return(file_in->eof());
194 }
195
print(const parameters & params)196 void bcf_file::print(const parameters ¶ms)
197 {
198 LOG.printLOG("Outputting VCF file...\n");
199
200 string output_file = params.output_prefix + ".recode.vcf";
201 streambuf * buf;
202 ofstream temp_out;
203 if (!params.stream_out)
204 {
205 temp_out.open(output_file.c_str(), ios::out);
206 if (!temp_out.is_open()) LOG.error("Could not open VCF Output file: " + output_file, 3);
207 buf = temp_out.rdbuf();
208 }
209 else
210 buf = cout.rdbuf();
211
212 ostream out(buf);
213 if (meta_data.has_idx)
214 {
215 LOG.warning("BCF file contains IDX values in header. These are being removed for conversion to VCF.");
216 meta_data.reprint();
217 }
218 for (unsigned int ui=0; ui<meta_data.lines.size(); ui++)
219 out << meta_data.lines[ui] << endl;
220
221 out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
222 if (meta_data.N_indv > 0)
223 out << "\tFORMAT";
224 for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
225 if (include_indv[ui])
226 out << "\t" << meta_data.indv[ui];
227 out << endl;
228
229 vector<char> variant_line;
230 entry *e = new bcf_entry(meta_data, include_indv);
231 while(!eof())
232 {
233 get_entry(variant_line);
234 e->reset(variant_line);
235 N_entries += e->apply_filters(params);
236 if(!e->passed_filters)
237 continue;
238 N_kept_entries++;
239 e->parse_basic_entry(true, true, true);
240 e->parse_full_entry(true);
241 e->parse_genotype_entries(true);
242 e->print(out, params.recode_INFO_to_keep, params.recode_all_INFO);
243 }
244 delete e;
245 }
246
print_bcf(const parameters & params)247 void bcf_file::print_bcf(const parameters ¶ms)
248 {
249 LOG.printLOG("Outputting BCF file...\n");
250 BGZF * out;
251 if(!params.stream_out)
252 {
253 string output_file = params.output_prefix + ".recode.bcf";
254 out = bgzf_open(output_file.c_str(), "w");
255 }
256 else
257 out = bgzf_dopen(1, "w");
258
259 string header_str;
260 uint32_t len_text = 0;
261 vector<char> header;
262
263 char magic[5] = {'B','C','F','\2','\2'};
264 bgzf_write(out, magic, 5);
265
266 for (unsigned int ui=0; ui<meta_data.lines.size(); ui++)
267 {
268 for (unsigned int uj=0; uj<meta_data.lines[ui].length(); uj++)
269 header.push_back( meta_data.lines[ui][uj] );
270 header.push_back('\n');
271 }
272
273 header_str = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
274 if (meta_data.N_indv > 0)
275 header_str += "\tFORMAT";
276
277 for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
278 if (include_indv[ui])
279 {
280 header_str += "\t";
281 header_str += meta_data.indv[ui];
282 }
283 header_str += "\n";
284
285 for (unsigned int ui=0; ui<header_str.length(); ui++)
286 header.push_back( header_str[ui] );
287 header.push_back( '\0' );
288 len_text = header.size();
289
290 bgzf_write(out, (char *)&len_text, sizeof(len_text) );
291 bgzf_write(out, (char *)&header[0], len_text );
292 vector<char> variant_line;
293 entry * e = new bcf_entry(meta_data, include_indv);
294 while(!eof())
295 {
296 get_entry(variant_line);
297 e->reset(variant_line);
298 N_entries += e->apply_filters(params);
299 if(!e->passed_filters)
300 continue;
301 N_kept_entries++;
302 e->parse_basic_entry(true, true, true);
303 e->parse_full_entry(true);
304 e->parse_genotype_entries(true);
305 e->print_bcf(out, params.recode_INFO_to_keep, params.recode_all_INFO);
306 }
307 delete e;
308 bgzf_close(out);
309 }
310