1 /*
2  * bcf_file.cpp
3  *
4  *  Created on: Dec 11, 2012
5  *      Author: amarcketta
6  */
7 
8 #include "bcf_file.h"
9 
10 char bcf_21[5] = {'B','C','F',(int8_t)2,(int8_t)1};
11 char bcf_22[5] = {'B','C','F',(int8_t)2,(int8_t)2};
12 
bcf_file(const parameters & p,bool diff)13 bcf_file::bcf_file(const parameters &p, bool diff)
14 {
15 	if(!diff)
16 		filename = p.vcf_filename;
17 	else
18 		filename = p.diff_file;
19 
20 	big_endian = is_big_endian();
21 	is_BGZF = false; stream = p.stream_in;
22 	N_entries = 0; N_kept_entries = 0;
23 	meta_data = header();
24 
25 	if (stream)
26 	{
27 		is_BGZF = true;
28 		open_gz();
29 	}
30 	else
31 		open();
32 
33 	check_bcf();
34 	read_header();
35 
36 	include_indv = vector<bool>(meta_data.N_indv,true);
37 }
38 
~bcf_file()39 bcf_file::~bcf_file()
40 {
41 	close();
42 }
43 
open()44 void bcf_file::open()
45 {
46 	int ret;
47 
48 	if (filename.substr(filename.size()-4) == ".vcf")
49 			LOG.error("Filename ends in '.vcf'. Shouldn't you be using --vcf?\n");
50 
51 	if (filename.substr(filename.size()-7) == ".vcf.gz")
52 			LOG.error("Filename ends in '.vcf.gz'. Shouldn't you be using --gzvcf?\n");
53 
54 	ret = bgzf_is_bgzf(filename.c_str());
55 
56 	if (ret == 1)
57 		is_BGZF = true;
58 	else
59 		is_BGZF = false;
60 
61 	if (is_BGZF)
62 		open_gz();
63 	else
64 	{
65 		file_tmp.open(filename.c_str(), ios::in);
66 		if (!file_tmp.is_open())
67 			LOG.error("Could not open VCF file: " + filename, 0);
68 		file_in = &file_tmp;
69 	}
70 }
71 
open_gz()72 void bcf_file::open_gz()
73 {
74 	int ret;
75 	gzMAX_LINE_LEN = 1024*1024;
76 
77 	if (stream)
78 		gzfile_in = gzdopen(fileno(stdin), "r");
79 	else
80 		gzfile_in = gzopen(filename.c_str(), "rb");
81 
82 	if (gzfile_in == NULL)
83 		LOG.error("Could not open BGZF BCF file: " + filename, 0);
84 	#ifdef ZLIB_VERNUM
85 		string tmp(ZLIB_VERSION);
86 		LOG.printLOG("Using zlib version: " + tmp + "\n");
87 		#if (ZLIB_VERNUM >= 0x1240)
88 			ret = gzbuffer(gzfile_in, gzMAX_LINE_LEN); // Included in zlib v1.2.4 and makes things MUCH faster
89 			if (ret != 0)
90 				LOG.warning("Unable to change zlib buffer size.");
91 		#else
92 			LOG.printLOG("Versions of zlib >= 1.2.4 will be *much* faster when reading compressed BCF files.\n");
93 		#endif
94 	#endif
95 }
96 
check_bcf()97 void bcf_file::check_bcf()
98 {
99 	char magic[5];
100 
101 	read(magic, 5, 1);
102 	if ((strcmp(magic, bcf_21) == 0) && (strcmp(magic, bcf_22) == 0))
103 		LOG.error("Does not appear to be a BCF file\n");
104 }
105 
close()106 void bcf_file::close()
107 {
108 	if (!stream && is_BGZF)
109 		gzclose(gzfile_in);
110 }
111 
get_entry(vector<char> & out)112 void bcf_file::get_entry(vector<char> &out)
113 {
114 	uint32_t size_int[2];
115 	int ret, read_size = 0;
116 
117 	ret = read(&size_int[0], 2, sizeof(uint32_t) );
118 	read_size = size_int[0] + size_int[1];
119 
120 	if (ret)
121 	{
122 		out.resize(read_size+2*sizeof(uint32_t));
123 		memcpy(&out[0], size_int, 2*sizeof(uint32_t));
124 		read(&out[2*sizeof(uint32_t)], 1, read_size);
125 	}
126 	else
127 		out.resize(0);
128 }
129 
get_entry_object()130 entry* bcf_file::get_entry_object()
131 {
132 	return new bcf_entry(meta_data, include_indv);
133 }
134 
read(void * buffer,unsigned int len,size_t size)135 int bcf_file::read(void *buffer, unsigned int len, size_t size)
136 {
137 	int ret;
138 
139 	if (is_BGZF)
140 	{
141 		ret = gzread(gzfile_in, buffer, size*len);
142 		ret = (ret == (int)(len*size) );
143 	}
144 	else
145 	{
146 		file_in->read((char*)buffer, size*len);
147 		ret = !file_in->eof();
148 	}
149 
150 	if ((big_endian) && (size > 1)) // Note: don't both swapping character arrays - BCF is defined as little endian.
151 	{
152 		unsigned int ui;
153 		for (ui=0; ui<len; ui++)
154 			ByteSwap((unsigned char *)buffer+(size*ui), size);
155 	}
156 	return ret;
157 }
158 
read_header()159 void bcf_file::read_header()
160 {
161 	uint32_t len_text;
162 
163 	read(&len_text, 1, sizeof(uint32_t));
164 	char *header_array = new char[(uint32_t)len_text];
165 	read(header_array, len_text, 1);
166 
167 	string header(header_array);
168 	delete [] header_array;
169 	string line;
170 	unsigned int line_index = 0;
171 	header.erase( header.find_last_not_of(" \f\n\r\t\v\0" ) + 1 );
172 
173 	istringstream iss(header);
174 	line_index += meta_data.add_FILTER_descriptor("ID=PASS,Description=PASS", line_index);
175 
176 	while (getline(iss, line))
177 	{
178 		if (line[0] == '#')
179 		{
180 			if (line[1] == '#')
181 				meta_data.parse_meta(line, line_index);
182 			else
183 				meta_data.parse_header(line);
184 		}
185 	}
186 }
187 
eof()188 bool bcf_file::eof()
189 {
190 	if (is_BGZF)
191 		return gzeof(gzfile_in);
192 	else
193 		return(file_in->eof());
194 }
195 
print(const parameters & params)196 void bcf_file::print(const parameters &params)
197 {
198 	LOG.printLOG("Outputting VCF file...\n");
199 
200 	string output_file = params.output_prefix + ".recode.vcf";
201 	streambuf * buf;
202 	ofstream temp_out;
203 	if (!params.stream_out)
204 	{
205 		temp_out.open(output_file.c_str(), ios::out);
206 		if (!temp_out.is_open()) LOG.error("Could not open VCF Output file: " + output_file, 3);
207 		buf = temp_out.rdbuf();
208 	}
209 	else
210 		buf = cout.rdbuf();
211 
212 	ostream out(buf);
213 	if (meta_data.has_idx)
214 	{
215 		LOG.warning("BCF file contains IDX values in header. These are being removed for conversion to VCF.");
216 		meta_data.reprint();
217 	}
218 	for (unsigned int ui=0; ui<meta_data.lines.size(); ui++)
219 		out << meta_data.lines[ui] << endl;
220 
221 	out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
222 	if (meta_data.N_indv > 0)
223 		out << "\tFORMAT";
224 	for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
225 		if (include_indv[ui])
226 			out << "\t" << meta_data.indv[ui];
227 	out << endl;
228 
229 	vector<char> variant_line;
230 	entry *e = new bcf_entry(meta_data, include_indv);
231 	while(!eof())
232 	{
233 		get_entry(variant_line);
234 		e->reset(variant_line);
235 		N_entries += e->apply_filters(params);
236 		if(!e->passed_filters)
237 			continue;
238 		N_kept_entries++;
239 		e->parse_basic_entry(true, true, true);
240 		e->parse_full_entry(true);
241 		e->parse_genotype_entries(true);
242 		e->print(out, params.recode_INFO_to_keep, params.recode_all_INFO);
243 	}
244 	delete e;
245 }
246 
print_bcf(const parameters & params)247 void bcf_file::print_bcf(const parameters &params)
248 {
249 	LOG.printLOG("Outputting BCF file...\n");
250 	BGZF * out;
251 	if(!params.stream_out)
252 	{
253 		string output_file = params.output_prefix + ".recode.bcf";
254 		out = bgzf_open(output_file.c_str(), "w");
255 	}
256 	else
257 		out = bgzf_dopen(1, "w");
258 
259 	string header_str;
260 	uint32_t len_text = 0;
261 	vector<char> header;
262 
263 	char magic[5] = {'B','C','F','\2','\2'};
264 	bgzf_write(out, magic, 5);
265 
266 	for (unsigned int ui=0; ui<meta_data.lines.size(); ui++)
267 	{
268 		for (unsigned int uj=0; uj<meta_data.lines[ui].length(); uj++)
269 			header.push_back( meta_data.lines[ui][uj] );
270 		header.push_back('\n');
271 	}
272 
273 	header_str = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
274 	if (meta_data.N_indv > 0)
275 		header_str += "\tFORMAT";
276 
277 	for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
278 		if (include_indv[ui])
279 	{
280 		header_str += "\t";
281 		header_str += meta_data.indv[ui];
282 	}
283 	header_str += "\n";
284 
285 	for (unsigned int ui=0; ui<header_str.length(); ui++)
286 		header.push_back( header_str[ui] );
287 	header.push_back( '\0' );
288 	len_text = header.size();
289 
290 	bgzf_write(out, (char *)&len_text, sizeof(len_text) );
291 	bgzf_write(out, (char *)&header[0], len_text );
292 	vector<char> variant_line;
293 	entry * e = new bcf_entry(meta_data, include_indv);
294 	while(!eof())
295 	{
296 		get_entry(variant_line);
297 		e->reset(variant_line);
298 		N_entries += e->apply_filters(params);
299 		if(!e->passed_filters)
300 			continue;
301 		N_kept_entries++;
302 		e->parse_basic_entry(true, true, true);
303 		e->parse_full_entry(true);
304 		e->parse_genotype_entries(true);
305 		e->print_bcf(out, params.recode_INFO_to_keep, params.recode_all_INFO);
306 	}
307 	delete e;
308 	bgzf_close(out);
309 }
310