1 /*
2  * bcf_entry_setters.cpp
3  *
4  *  Created on: Sep 20, 2012
5  *      Author: Anthony Marcketta
6  *      ($Revision: 1 $)
7  */
8 
9 #include "bcf_entry.h"
10 
set_ALT(const int n_allele)11 void bcf_entry::set_ALT(const int n_allele)
12 {
13 	ALT.resize(n_allele-1);
14 	unsigned int pos = ALT_pos;
15 	string allele;
16 	for (int ui=0; ui<(n_allele-1); ui++)
17 	{
18 		allele = get_typed_string( &pos, line );
19 		std::transform(allele.begin(), allele.end(), allele.begin(), ::toupper);
20 		ALT[ui] = allele;
21 	}
22 
23 	parsed_ALT = true;
24 }
25 
set_ALT(const string & in)26 void bcf_entry::set_ALT(const string &in)
27 {
28 	istringstream ss(in);
29 	string tmpstr;
30 	ALT.resize(0);
31 	while(!ss.eof())
32 	{
33 		getline(ss, tmpstr, ',');
34 		add_ALT_allele(tmpstr);
35 	}
36 	parsed_ALT = true;
37 }
38 
set_INFO()39 void bcf_entry::set_INFO()
40 {
41 	int key;
42 	unsigned int size, type, i = INFO_pos;
43 	string data_type;
44 	INFO.resize(N_info);
45 	bool miss = true;
46 
47 	for (unsigned int ui=0; ui<N_info; ui++)
48 	{
49 		key = get_typed_int(&i, line, type, size);
50 		get_type(&i, line, type, size);
51 
52 		pair<string, string> INFO_entry(entry_header.INFO_map[key].ID, ".");
53 		data_type = entry_header.INFO_map[key].Type_str;
54 		ostringstream ss(ostringstream::out);
55 
56 		for (unsigned int uj=0; uj<size; uj++)
57 		{
58 			if (uj !=0 && type != 7)
59 				ss << ",";
60 
61 			if ( check_missing(i, type, line) )
62 				ss << ".";
63 
64 			else if(type==1)
65 			{
66 				int8_t tmp;
67 				memcpy(&tmp, &line[i], sizeof(tmp));
68 				i += sizeof(tmp);
69 				ss << (int)tmp;
70 				miss = false;
71 			}
72 			else if(type==2)
73 			{
74 				int16_t tmp;
75 				memcpy(&tmp, &line[i], sizeof(tmp));
76 				i += sizeof(tmp);
77 				ss << (int)tmp;
78 				miss = false;
79 			}
80 			else if(type==3)
81 			{
82 				int32_t tmp;
83 				memcpy(&tmp, &line[i], sizeof(tmp));
84 				i += sizeof(tmp);
85 				ss << (int)tmp;
86 				miss = false;
87 			}
88 			else if(type==5)
89 			{
90 				float tmp;
91 				memcpy(&tmp, &line[i], sizeof(tmp));
92 				i += sizeof(tmp);
93 				ss << tmp;
94 				miss = false;
95 			}
96 			else if(type==7)
97 			{
98 				char tmp;
99 				memcpy(&tmp, &line[i], sizeof(tmp));
100 				i += sizeof(tmp);
101 				ss << tmp;
102 				miss = false;
103 			}
104 			else
105 			{
106 				LOG.printLOG("Error: Unknown type: " + header::int2str(type) + "\n");
107 				parsed_INFO = false;
108 				return;
109 			}
110 		}
111 
112 		string value = ss.str();
113 		if (miss)
114 			INFO_entry.second = ".";
115 		else if (data_type == "Flag")
116 			INFO_entry.second = "1";
117 		else if (value != "")
118 			INFO_entry.second = ss.str();
119 		else
120 			INFO_entry.second = ".";
121 		INFO[ui] = INFO_entry;
122 	}
123 	parsed_INFO = true;
124 }
125 
set_FORMAT()126 void bcf_entry::set_FORMAT()
127 {
128 	FORMAT.resize(0);
129 	FORMAT_to_idx.clear();
130 	unsigned int l_pos = L_shared + 2*sizeof(uint32_t);
131 	unsigned int fmt_key, type, size, skip;
132 
133 	for (unsigned int ui=0; ui<N_format; ui++)
134 	{
135 		fmt_key = get_typed_int(&l_pos, line, type, size);
136 		get_type(&l_pos, line, type, size);
137 
138 		if ( (type == 1) or (type == 7) )
139 			skip = sizeof(int8_t)*size*N_indv;
140 		else if (type == 2)
141 			skip = sizeof(int16_t)*size*N_indv;
142 		else if ( (type == 3) or (type == 5) )
143 			skip = sizeof(int32_t)*size*N_indv;
144 		else
145 		{
146 			LOG.printLOG("Error: Unknown type: " + header::int2str(type) + "\n");
147 			parsed_FORMAT = false;
148 			return;
149 		}
150 		add_FORMAT_entry( entry_header.FORMAT_map[fmt_key].ID, fmt_key, ui, l_pos, type, size);
151 		l_pos += skip;
152 	}
153 
154 	GT_idx = -1;
155 	GQ_idx = -1;
156 	DP_idx = -1;
157 	FT_idx = -1;
158 
159 	if (FORMAT_to_idx.find("GT") != FORMAT_to_idx.end())
160 		GT_idx = FORMAT_to_idx["GT"];
161 	if (FORMAT_to_idx.find("GQ") != FORMAT_to_idx.end())
162 		GQ_idx = FORMAT_to_idx["GQ"];
163 	if (FORMAT_to_idx.find("DP") != FORMAT_to_idx.end())
164 		DP_idx = FORMAT_to_idx["DP"];
165 	if (FORMAT_to_idx.find("FT") != FORMAT_to_idx.end())
166 		FT_idx = FORMAT_to_idx["FT"];
167 	parsed_FORMAT = true;
168 }
169 
add_FORMAT_entry(const string & in,const unsigned int & fmt_key,const unsigned int & pos,const unsigned int & line_pos,const unsigned int & type,const unsigned int & size)170 void bcf_entry::add_FORMAT_entry(const string &in, const unsigned int &fmt_key, const unsigned int &pos, const unsigned int &line_pos, const unsigned int &type, const unsigned int &size)
171 {
172 	FORMAT.push_back(in);
173 	FORMAT_to_idx[in] = pos;
174 	FORMAT_positions[pos] = line_pos;
175 	FORMAT_types[pos] = type;
176 	FORMAT_sizes[pos] = size;
177 	FORMAT_keys[pos] = fmt_key;
178 
179 	if ( (type==1) or (type==7) )
180 		FORMAT_skip[pos] = ( sizeof(int8_t)*size );
181 	else if ( type==2 )
182 		FORMAT_skip[pos] = ( sizeof(int16_t)*size );
183 	else if ( (type==3) or (type==5) )
184 		FORMAT_skip[pos] = ( sizeof(int32_t)*size );
185 }
186 
set_indv_GENOTYPE_and_PHASE(unsigned int indv,const unsigned int & pos,const unsigned int & size)187 void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const unsigned int &pos, const unsigned int &size)
188 {
189 	int8_t tmp, tmp2;
190 	unsigned int cur_pos = pos;
191 	char phased[2] = {'/', '|'};
192 	ploidy.resize(N_indv);
193 	ploidy[indv] = 0;
194 
195 	for (unsigned int ui=0; ui<size; ui++)
196 	{
197 		tmp = *reinterpret_cast<const int8_t*>(&line[cur_pos]);
198 		if ( tmp == (int8_t)0x81 )
199 			break;
200 		ploidy[indv]++;
201 		cur_pos += sizeof(int8_t);
202 	}
203 
204 	if (ploidy[indv] == 0)
205 	{
206 		set_indv_GENOTYPE_alleles(indv, make_pair(-2, -2));
207 	}
208 	else if (ploidy[indv] == 1)
209 	{
210 		set_indv_PHASE(indv, '|');
211 		tmp = *reinterpret_cast<const int8_t*>(&line[pos]);
212 		if (tmp == (int8_t)0x80)
213 			tmp = -1;
214 		else
215 			tmp = (tmp >> 1) - 1;
216 		set_indv_GENOTYPE_alleles(indv, make_pair(tmp, -2));
217 	}
218 	else if (ploidy[indv] == 2)
219 	{
220 		tmp = *reinterpret_cast<const int8_t*>(&line[pos]);
221 		tmp2 = *reinterpret_cast<const int8_t*>(&line[pos+sizeof(int8_t)]);
222 
223 		if (tmp == (int8_t)0x80)
224 			tmp = -1;
225 		else
226 			tmp = (tmp >> 1) - 1;
227 
228 		if (tmp2 == (int8_t)0x80)
229 		{
230 			tmp2 = -1;
231 			set_indv_PHASE(indv, '/');
232 		}
233 		else
234 		{
235 			char phase = phased[ tmp2 & (int8_t)1 ];
236 			tmp2 = (tmp2 >> 1) - 1;
237 			set_indv_PHASE(indv, phase);
238 		}
239 		set_indv_GENOTYPE_alleles(indv, make_pair((int)tmp, (int)tmp2));
240 	}
241 	else if (ploidy[indv] > 2)
242 		LOG.error("Polyploidy found, and is not supported by vcftools: " + CHROM + ":" + header::int2str(POS));
243 	parsed_GT[indv] = true;
244 }
245 
set_indv_GENOTYPE_and_PHASE(unsigned int indv,const pair<int,int> & genotype,char phase)246 void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair<int, int> &genotype, char phase)
247 {
248 	set_indv_GENOTYPE_ids(indv, genotype);
249 	set_indv_PHASE(indv, phase);
250 	parsed_GT[indv] = true;
251 }
252 
set_indv_GENOTYPE_and_PHASE(unsigned int indv,const pair<string,string> & genotype,char phase)253 void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair<string, string> &genotype, char phase)
254 {
255 	pair<int, int> a(-1,-1);
256 	if (genotype.first != ".")
257 		a.first = header::str2int(genotype.first);
258 	if (genotype.second != ".")
259 		a.second = header::str2int(genotype.second);
260 
261 	set_indv_GENOTYPE_alleles(indv, a);
262 	set_indv_PHASE(indv, phase);
263 	parsed_GT[indv] = true;
264 }
265 
set_indv_GENOTYPE_alleles(unsigned int indv,const pair<int,int> & in)266 void bcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, const pair<int, int> &in)
267 {
268 	if (GENOTYPE.size() == 0)
269 		GENOTYPE.resize(N_indv, make_pair(-1,-1));
270 
271 	pair<int, int> a(-1,-1);
272 
273 	if (in.first == 0x81)
274 		a.first = -2;
275 	else if (in.first != 0x80)
276 		a.first = in.first;
277 
278 	if (in.second == 0x81)
279 		a.second = -2;
280 	else if (in.second != 0x80)
281 		a.second = in.second;
282 
283 	GENOTYPE[indv] = in;
284 	parsed_GT[indv] = true;
285 }
286 
set_indv_GENOTYPE_ids(unsigned int indv,const pair<int,int> & in)287 void bcf_entry::set_indv_GENOTYPE_ids(unsigned int indv, const pair<int, int> &in)
288 {
289 	if (GENOTYPE.size() == 0)
290 		GENOTYPE.resize(N_indv, make_pair(-2,-2));
291 	GENOTYPE[indv] = in;
292 }
293 
set_indv_PHASE(unsigned int indv,char in)294 void bcf_entry::set_indv_PHASE(unsigned int indv, char in)
295 {
296 	if (PHASE.size() == 0)
297 		PHASE.resize(N_indv, '/');
298 
299 	PHASE[indv] = in;
300 	parsed_GT[indv] = true;
301 }
302 
set_indv_GQUALITY(unsigned int indv,const vector<char> & in)303 void bcf_entry::set_indv_GQUALITY(unsigned int indv, const vector<char> &in)
304 {
305 	float tmp;
306 	memcpy(&tmp, &in[0], sizeof(tmp));
307 
308 	parsed_GQ[indv] = true;
309 	if (tmp == 0x7F800001)
310 	{
311 		if (GQUALITY.size() > 0)
312 			GQUALITY[indv] = -1;
313 		return;
314 	}
315 	if (GQUALITY.size() == 0)
316 		GQUALITY.resize(N_indv, -1);
317 
318 	if (tmp > 99.0)
319 		tmp = 99;
320 	GQUALITY[indv] = tmp;
321 }
322 
set_indv_GQUALITY(unsigned int indv,const float & in)323 void bcf_entry::set_indv_GQUALITY(unsigned int indv, const float &in)
324 {
325 	parsed_GQ[indv] = true;
326 	if ( (in == -1) or (in == 0x7F800001) )
327 	{
328 		if (GQUALITY.size() > 0)
329 			GQUALITY[indv] = -1;
330 		return;
331 	}
332 	if (GQUALITY.size() == 0)
333 		GQUALITY.resize(N_indv, -1);
334 
335 	if (in > 99)
336 		GQUALITY[indv] = 99;
337 	else
338 		GQUALITY[indv] = in;
339 }
340 
set_indv_GFILTER(unsigned int indv,const vector<char> & in)341 void bcf_entry::set_indv_GFILTER(unsigned int indv, const vector<char> &in)
342 {
343 	parsed_FT[indv] = true;
344 
345 	if (GFILTER.size() == 0)
346 		GFILTER.resize(N_indv);
347 
348 	GFILTER[indv].resize(0);
349 	if (in.empty())
350 		return;
351 	else if ((in.size() == 1) and (in[0] == '\0') )
352 		return;
353 
354 	ostringstream ss;
355 	string ith_FILTER;
356 	ss.clear();
357 	for (unsigned int ui=0; ui<in.size(); ui++)
358 	{
359 		if (in[ui] == ';')
360 		{
361 			ith_FILTER = ss.str();
362 			ss.clear();
363 
364 			if ((ith_FILTER.size()==0) || (ith_FILTER == "."))
365 				continue;	// Don't bother storing "unfiltered" state.
366 
367 			GFILTER[indv].push_back(ith_FILTER);
368 		}
369 		else
370 			ss << in[ui];
371 	}
372 	ith_FILTER = ss.str();
373 	ss.clear();
374 
375 	if ((ith_FILTER.size()!=0) || (ith_FILTER != "."))
376 		GFILTER[indv].push_back(ith_FILTER);
377 }
378 
set_indv_GFILTER(unsigned int indv,const string & in)379 void bcf_entry::set_indv_GFILTER(unsigned int indv, const string &in)
380 {
381 	parsed_FT[indv] = true;
382 
383 	if (GFILTER.size() == 0)
384 		GFILTER.resize(N_indv);
385 
386 	GFILTER[indv].resize(0);
387 	if ((in.size() == 0) || (in == "."))
388 		return;
389 
390 	static istringstream ss;
391 	static string ith_FILTER;
392 	ss.clear();
393 	ss.str(in);
394 	while (!ss.eof())
395 	{
396 		getline(ss, ith_FILTER, ';');
397 
398 		if ((ith_FILTER.size()==0) || (ith_FILTER == "."))
399 			continue;	// Don't bother storing "unfiltered" state.
400 
401 		GFILTER[indv].push_back(ith_FILTER);
402 	}
403 }
404 
set_FILTER()405 void bcf_entry::set_FILTER()
406 {
407 	FILTER_str = get_int_vector( &FILTER_pos, line );
408 	FILTER.resize(0);
409 
410 	if ( (FILTER_str.size() == 1) and entry_header.FILTER_map[ FILTER_str[0] ].ID == "")
411 	{
412 		FILTER.push_back("PASS");
413 		return;
414 	}
415 
416 	for (unsigned int ui=0; ui<FILTER_str.size(); ui++)
417 	{
418 		if ((int8_t)FILTER_str[ui] != (int8_t)0x80)
419 			FILTER.push_back( entry_header.FILTER_map[ FILTER_str[ui] ].ID );
420 		else
421 			FILTER.push_back( "." );
422 	}
423 	sort(FILTER.begin(), FILTER.end());
424 	parsed_FILTER = true;
425 }
426