1 /*
2  * vcf_entry.cpp
3  *
4  *  Created on: Aug 19, 2009
5  *      Author: Adam Auton
6  *      ($Revision: 230 $)
7  */
8 
9 #include "vcf_entry.h"
10 
11 string vcf_entry::convert_line;
12 
vcf_entry(header & meta_data,vector<bool> & include_individual)13 vcf_entry::vcf_entry(header &meta_data, vector<bool> &include_individual)
14 {
15 	N_indv = meta_data.N_indv;
16 	include_indv = include_individual;
17 	include_genotype = vector<bool>(N_indv, true);
18 	basic_parsed = false; fully_parsed = false;
19 	parsed_ALT = false; parsed_FILTER = false;
20 	parsed_INFO = false; parsed_FORMAT = false;
21 	CHROM = ""; POS = -1; REF = ""; QUAL = -1;
22 	passed_filters = true; parsed_FORMAT_binary = false;
23 	N_INFO_removed = 0; N_FORMAT_removed = 0;
24 	parsed_GT = vector<bool>(N_indv, false); parsed_GQ = vector<bool>(N_indv, false);
25 	parsed_DP = vector<bool>(N_indv, false); parsed_FT = vector<bool>(N_indv, false);
26 	GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1;
27 	FORMAT_positions.resize(N_indv); FORMAT_types.resize(N_indv); FORMAT_sizes.resize(N_indv); FORMAT_skip.resize(N_indv); FORMAT_keys.resize(N_indv);
28 	convert_line.clear();
29 	data_stream.str("");
30 	entry_header = meta_data;
31 }
32 
~vcf_entry()33 vcf_entry::~vcf_entry() {}
34 
35 // Reset the VCF entry object with a new data line
reset(const vector<char> & data_line)36 void vcf_entry::reset(const vector<char> &data_line)
37 {
38 	basic_parsed = false;
39 	fully_parsed = false;
40 	parsed_ALT = false;
41 	parsed_FILTER = false;
42 	parsed_INFO = false;
43 	parsed_FORMAT = false;
44 	parsed_FORMAT_binary = false;
45 	passed_filters = true;
46 
47 	data_stream.clear();
48 	line = data_line;
49 
50 	convert_line.assign(line.begin(), line.end());
51 	data_stream.str(convert_line);
52 
53 	fill(parsed_GT.begin(), parsed_GT.end(), 0);
54 	fill(parsed_GQ.begin(), parsed_GQ.end(), 0);
55 	fill(parsed_DP.begin(), parsed_DP.end(), 0);
56 	fill(parsed_FT.begin(), parsed_FT.end(), 0);
57 	fill(include_genotype.begin(), include_genotype.end(), 1);
58 
59 	N_INFO_removed = 0; N_FORMAT_removed = 0;
60 	FORMAT_positions.clear(); FORMAT_types.clear(); FORMAT_sizes.clear(); FORMAT_skip.clear(); FORMAT_keys.clear();
61 }
62 
63 // Tokenize the basic information in a VCF data line (at the tab level)
parse_basic_entry(bool parse_ALT,bool parse_FILTER,bool parse_INFO)64 void vcf_entry::parse_basic_entry(bool parse_ALT, bool parse_FILTER, bool parse_INFO)
65 {
66 	if(!basic_parsed)
67 	{
68 		getline(data_stream, CHROM, '\t');
69 
70 		getline(data_stream, ID, '\t');
71 		POS = atoi(ID.c_str());
72 		getline(data_stream, ID, '\t');
73 		getline(data_stream, REF, '\t');
74 		getline(data_stream, ALT_str, '\t');
75 		getline(data_stream, QUAL_str, '\t');
76 		getline(data_stream, FILTER_str, '\t');
77 		getline(data_stream, INFO_str, '\t');
78 
79 		QUAL = header::str2double(QUAL_str);
80 
81 		// Convert to uppercase for consistency
82 		// Note that VCF v4.1 allows mixtures of lower/upper case in REF and ALT.
83 		// However, the spec specifically states that tools using VCF are not required
84 		// to preserve the case.
85 		std::transform(REF.begin(), REF.end(), REF.begin(), ::toupper);
86 		std::transform(ALT_str.begin(), ALT_str.end(),ALT_str.begin(), ::toupper);
87 	}
88 	basic_parsed = true;
89 
90 	if (parse_ALT && !parsed_ALT)
91 		set_ALT(ALT_str);
92 	if (parse_FILTER && !parsed_FILTER)
93 		set_FILTER(FILTER_str);
94 	if (parse_INFO && !parsed_INFO)
95 		set_INFO(INFO_str);
96 }
97 
98 // Tokenize the genotype information (at the 'tab' level) in the VCF entry
parse_full_entry(bool parse_FORMAT)99 void vcf_entry::parse_full_entry(bool parse_FORMAT)
100 {
101 	if (fully_parsed)
102 		return;
103 
104 	if (basic_parsed == false)
105 		parse_basic_entry();
106 
107 	getline(data_stream, FORMAT_str, '\t');
108 
109 	if (parse_FORMAT)
110 		set_FORMAT(FORMAT_str);
111 
112 	string tmpstr; tmpstr.reserve(64);
113 	GENOTYPE_str.resize(N_indv, tmpstr);
114 
115 	for (unsigned int ui=0; ui<N_indv; ui++)
116 		getline(data_stream, GENOTYPE_str[ui], '\t');
117 
118 	fully_parsed = true;
119 }
120 
121 // Tokenize a given genotype entry into it's component parts
parse_genotype_entry(unsigned int indv,bool GT,bool GQ,bool DP,bool FT)122 void vcf_entry::parse_genotype_entry(unsigned int indv, bool GT, bool GQ, bool DP, bool FT)
123 {
124 	if (fully_parsed == false)
125 		parse_full_entry(true);
126 
127 	if (parsed_FORMAT == false)
128 		set_FORMAT(FORMAT_str);
129 
130 	static string tmpstr;
131 	int N_required = GT + GQ + DP + FT;
132 	int N_got = 0;
133 
134 	int i=0, start = 0, end = 0;
135 	while ((end = GENOTYPE_str[indv].find(':',start)) != string::npos)
136 	{
137 		if (GT && (i == GT_idx))
138 		{
139 			tmpstr = GENOTYPE_str[indv].substr(start, end - start);
140 			set_indv_GENOTYPE_and_PHASE(indv, tmpstr);
141 			N_got++;
142 		}
143 		else if (GQ && (i == GQ_idx))
144 		{
145 			tmpstr = GENOTYPE_str[indv].substr(start, end - start);
146 			set_indv_GQUALITY(indv, header::str2double(tmpstr));
147 			N_got++;
148 		}
149 		else if (DP && (i == DP_idx))
150 		{
151 			tmpstr = GENOTYPE_str[indv].substr(start, end - start);
152 			set_indv_DEPTH(indv, header::str2int(tmpstr));
153 			N_got++;
154 		}
155 		else if (FT && (i == FT_idx))
156 		{
157 			tmpstr = GENOTYPE_str[indv].substr(start, end - start);
158 			set_indv_GFILTER(indv, tmpstr);
159 			N_got++;
160 		}
161 
162 		if (N_got == N_required)
163 			break;
164 		i++;
165 		start = end + 1;
166 	}
167 	if (N_got != N_required)
168 	{
169 		if (GT && (i == GT_idx))
170 		{
171 			tmpstr = GENOTYPE_str[indv].substr(start);
172 			set_indv_GENOTYPE_and_PHASE(indv, tmpstr);
173 			N_got++;
174 		}
175 		else if (GQ && (i == GQ_idx))
176 		{
177 			tmpstr = GENOTYPE_str[indv].substr(start);
178 			set_indv_GQUALITY(indv, header::str2double(tmpstr));
179 			N_got++;
180 		}
181 		else if (DP && (i == DP_idx))
182 		{
183 			tmpstr = GENOTYPE_str[indv].substr(start);
184 			set_indv_DEPTH(indv, header::str2int(tmpstr));
185 			N_got++;
186 		}
187 		else if (FT && (i == FT_idx))
188 		{
189 			tmpstr = GENOTYPE_str[indv].substr(start);
190 			set_indv_GFILTER(indv, tmpstr);
191 			N_got++;
192 		}
193 	}
194 
195 	// Set missing return values if requested a value, but couldn't find it
196 	if (GT && (parsed_GT[indv] == false))
197 	{
198 		set_indv_GENOTYPE_and_PHASE(indv, make_pair(-1,-1), '/');
199 	}
200 	if (GQ && (parsed_GQ[indv] == false))
201 	{
202 		set_indv_GQUALITY(indv, -1);
203 	}
204 	if (DP && (parsed_DP[indv] == false))
205 	{
206 		set_indv_DEPTH(indv, -1);
207 	}
208 	if (FT && (parsed_FT[indv] == false))
209 	{
210 		set_indv_GFILTER(indv, "");
211 	}
212 }
213 
214 // Read the VCF entry and fully populate the object
parse_genotype_entries(bool GT,bool GQ,bool DP,bool FT)215 void vcf_entry::parse_genotype_entries(bool GT, bool GQ, bool DP, bool FT)
216 {
217 	for (unsigned int ui=0; ui<N_indv; ui++)
218 		parse_genotype_entry(ui, GT, GQ, DP, FT);
219 }
220 
parse_FORMAT()221 void vcf_entry::parse_FORMAT()
222 {
223 	FORMAT_binary.resize(0);
224 	vector<char> tmp_vector;
225 	vector<string> tmp_split;
226 	vector< vector<string> > format_matrix(N_indv);
227 
228 	unsigned int type, number, size, position=0;
229 	tmp_split.resize(FORMAT.size());
230 	for (unsigned int ui=0; ui<N_indv; ui++)
231 	{
232 		format_matrix[ui].resize(FORMAT.size());
233 		if ( (GENOTYPE_str[ui] != "") and (GENOTYPE_str[ui] != ".") )
234 			header::split(GENOTYPE_str[ui],':', tmp_split);
235 		else
236 			tmp_split.assign(FORMAT.size(),".");
237 
238 		for (unsigned int uj=0; uj<tmp_split.size(); uj++)
239 			format_matrix[ui][uj] = tmp_split[uj];
240 
241 		for (unsigned int uj=format_matrix[ui].size(); uj<FORMAT.size(); uj++)
242 			format_matrix[ui][uj] = ".";
243 	}
244 
245 	FORMAT_positions.resize(FORMAT.size()); FORMAT_types.resize(FORMAT.size()); FORMAT_skip.resize(FORMAT.size());
246 
247 	for (unsigned int ui=0; ui<FORMAT.size(); ui++)
248 	{
249 		if (entry_header.FORMAT_reverse_map.find(FORMAT[ui]) == entry_header.FORMAT_reverse_map.end())
250 		{
251 			LOG.one_off_warning("FORMAT value " + FORMAT[ui] + " is not defined in the header and will not be encoded.");
252 			N_FORMAT_removed++;
253 			continue;
254 		}
255 		FORMAT_positions[ui] = position;
256 		make_typed_int(tmp_vector, entry_header.FORMAT_reverse_map[ FORMAT[ui] ], true );
257 		FORMAT_binary.insert(FORMAT_binary.end(), tmp_vector.begin(), tmp_vector.end() );
258 
259 		tmp_vector.resize(0);
260 		tmp_split.resize(0);
261 
262 		type = entry_header.FORMAT_map[ entry_header.FORMAT_reverse_map[FORMAT[ui]] ].Type;
263 		number = entry_header.FORMAT_map[ entry_header.FORMAT_reverse_map[FORMAT[ui]] ].N_entries;
264 
265 		for (unsigned int uj=0; uj<N_indv; uj++)
266 			tmp_split.push_back( format_matrix[uj][ui] );
267 
268 		if ((int)ui == GT_idx)
269 			make_typed_GT_vector(tmp_vector, tmp_split );
270 		else if (type == Integer)
271 			make_typed_int_vector(tmp_vector, tmp_split, number );
272 		else if (type == Float)
273 			make_typed_float_vector(tmp_vector, tmp_split, number );
274 		else if ( (type == Character) or (type == String) )
275 			make_typed_string_vector(tmp_vector, tmp_split, number );
276 		else
277 			LOG.error("Invalid type in FORMAT definition", 0);
278 
279 		position = 0;
280 		get_type(&position, tmp_vector, type, size);
281 		FORMAT_types[ui] = type;
282 
283 		if ( (type == 1) || (type == 7) )
284 			FORMAT_skip[ui] = size*sizeof(int8_t);
285 		else if (type == 2)
286 			FORMAT_skip[ui] = size*sizeof(int16_t);
287 		else if ( (type == 3) || (type == 5) )
288 			FORMAT_skip[ui] = size*sizeof(int32_t);
289 
290 		FORMAT_binary.insert(FORMAT_binary.end(), tmp_vector.begin(), tmp_vector.end() );
291 		tmp_vector.resize(0);
292 		position = FORMAT_binary.size();
293 	}
294 	parsed_FORMAT_binary = true;
295 }
296 
297 // Output VCF entry to output stream
print(ostream & out,const set<string> & INFO_to_keep,bool keep_all_INFO)298 void vcf_entry::print(ostream &out, const set<string> &INFO_to_keep, bool keep_all_INFO)
299 {
300 	if (fully_parsed == false)
301 		parse_full_entry();
302 
303 	out << get_CHROM() << '\t' << POS << '\t' << get_ID() << '\t' << REF << '\t' << get_ALT();
304 
305 	out << '\t' << header::double2str(QUAL);
306 	out << '\t' << get_FILTER();
307 
308 	if (keep_all_INFO == false)
309 		out << '\t' << get_INFO(INFO_to_keep);
310 	else
311 		out << '\t' << INFO_str;
312 
313 	pair<int, int> genotype;
314 	string GFILTER_tmp;
315 	if (FORMAT.size() > 0)
316 	{
317 		char PHASE;
318 		out << '\t' << get_FORMAT();
319 
320 		for (unsigned int ui=0; ui<N_indv; ui++)
321 		{
322 			if (include_indv[ui] == false)
323 				continue;
324 			out << '\t';
325 			for (int count=0; count<(int)FORMAT.size(); count++)
326 			{
327 				if (count == GT_idx) // (FORMAT[count] == "GT")
328 				{
329 					if (count != 0)	out << ':';
330 					if (include_genotype[ui] == true)
331 					{
332 						get_indv_GENOTYPE_ids(ui, genotype);
333 						PHASE = get_indv_PHASE(ui);
334 						if ((genotype.first != -1) && (genotype.second != -1))
335 							out << header::int2str(genotype.first) << PHASE << header::int2str(genotype.second);
336 						else if ((PHASE == '|') && (genotype.second == -1))
337 							out << header::int2str(genotype.first);	// Handle haploid case
338 						else
339 							out << header::int2str(genotype.first) << PHASE << header::int2str(genotype.second);
340 					}
341 					else
342 						out << "./.";
343 				}
344 				else if (count == GQ_idx) //(FORMAT[count] == "GQ")
345 				{
346 					if (count != 0)	out << ':';
347 					out << header::double2str(get_indv_GQUALITY(ui));
348 				}
349 				else if (count == DP_idx) // (FORMAT[count] == "DP")
350 				{
351 					if (count != 0)	out << ':';
352 					out << header::int2str(get_indv_DEPTH(ui));
353 				}
354 				else if (count == FT_idx) // (FORMAT[count] == "FT")
355 				{
356 					if (count != 0)	out << ':';
357 					get_indv_GFILTER(ui, GFILTER_tmp);
358 					out << GFILTER_tmp;
359 				}
360 				else
361 				{	// Unknown FORMAT so just replicate original output
362 					if (count != 0)	out << ':';
363 					read_indv_generic_entry(ui, FORMAT[count], GFILTER_tmp);
364 					out << GFILTER_tmp;
365 				}
366 			}
367 		}
368 	}
369 	out << '\n';
370 }
371 
372 // Output VCF entry to output stream in binary
print_bcf(BGZF * out,const set<string> & INFO_to_keep,bool keep_all_INFO)373 void vcf_entry::print_bcf(BGZF* out, const set<string> &INFO_to_keep, bool keep_all_INFO)
374 {
375 	if (fully_parsed == false)
376 		parse_full_entry();
377 
378 	if (parsed_FORMAT_binary == false)
379 		parse_FORMAT();
380 
381 	vector<char> out_vector, tmp_vector;
382 	out_vector.resize(8*sizeof(int32_t));
383 	int vector_pos = 2*sizeof(uint32_t);
384 	string tmp_string;
385 	int index;
386 	vector<string> filter_vector;
387 	vector<pair< string, string > > tmp_info;
388 
389 	tmp_string = get_CHROM();
390 	if (tmp_string == "." or tmp_string == " " or tmp_string == "")
391 		LOG.error("CHROM value must be defined for all entries.",0);
392 
393 	if (entry_header.CONTIG_reverse_map.find(tmp_string) == entry_header.CONTIG_reverse_map.end() )
394 		LOG.error("CHROM value " + tmp_string + " is not defined on contig dictionary.",0);
395 
396 	int32_t chrom = (int32_t)entry_header.CONTIG_reverse_map[tmp_string];
397 	memcpy(&out_vector[vector_pos], &chrom, sizeof(chrom));
398 	vector_pos += sizeof(chrom);
399 
400 	get_POS_binary(tmp_vector);
401 	memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size());
402 	vector_pos += tmp_vector.size();
403 	tmp_vector.resize(0);
404 
405 	get_rlen(tmp_vector);
406 	memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size());
407 	vector_pos += tmp_vector.size();
408 	tmp_vector.resize(0);
409 
410 	get_QUAL_binary(tmp_vector);
411 	memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size());
412 	vector_pos += tmp_vector.size();
413 	tmp_vector.resize(0);
414 
415 	get_ID_binary(tmp_vector);
416 	out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end());
417 	tmp_vector.resize(0);
418 
419 	get_ALLELES_binary(tmp_vector);
420 	out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end());
421 	tmp_vector.resize(0);
422 
423 	get_FILTER_vector(filter_vector);
424 	if (filter_vector.empty())
425 		make_typed_int_vector(tmp_vector, filter_vector);
426 	else
427 	{
428 		vector<int> index_vector;
429 		for(unsigned int ui=0; ui<filter_vector.size(); ui++)
430 		{
431 			if ( entry_header.FILTER_reverse_map.find( filter_vector[ui] ) == entry_header.FILTER_reverse_map.end() )
432 				LOG.one_off_warning("FILTER value " + filter_vector[ui] + " is not defined in the header and will not be encoded.");
433 			else
434 				index_vector.push_back( entry_header.FILTER_reverse_map[ filter_vector[ui] ] );
435 		}
436 		make_typed_int_vector(tmp_vector, index_vector );
437 	}
438 
439 	if (tmp_vector.empty())
440 		make_typed_int(tmp_vector, 0, true);
441 
442 	out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end());
443 
444 	int map_type, number;
445 	tmp_info = get_INFO_vector(INFO_to_keep, keep_all_INFO);
446 
447 	for(unsigned int ui=0; ui<tmp_info.size(); ui++)
448 	{
449 		if (entry_header.INFO_reverse_map.find(tmp_info[ui].first) == entry_header.INFO_reverse_map.end())
450 		{
451 			LOG.one_off_warning("INFO value " + tmp_info[ui].first + " is not defined in the header and will not be encoded.");
452 			N_INFO_removed++;
453 			continue;
454 		}
455 
456 		tmp_vector.resize(0);
457 		index = entry_header.INFO_reverse_map[ tmp_info[ui].first ];
458 		make_typed_int(tmp_vector, index, true);
459 		out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end());
460 
461 		tmp_vector.resize(0);
462 		map_type = entry_header.INFO_map[ index ].Type;
463 		number = entry_header.INFO_map[ index ].N_entries;
464 
465 		if (map_type == Integer)
466 			make_typed_int_vector(tmp_vector, tmp_info[ui].second, number );
467 		else if (map_type == Float)
468 			make_typed_float_vector(tmp_vector, tmp_info[ui].second, number );
469 		else if ( (map_type == Character) or (map_type == String) )
470 			make_typed_string(tmp_vector, tmp_info[ui].second, true );
471 		else if (map_type == Flag)
472 			make_typed_int(tmp_vector, 1, true );
473 		else
474 			LOG.error("Invalid type in INFO definition", 0);
475 
476 		out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end());
477 	}
478 	tmp_vector.resize(0);
479 	uint32_t l_shared = (uint32_t)out_vector.size() - (uint32_t)(2*sizeof(uint32_t));
480 	memcpy(&out_vector[0], &l_shared, sizeof(l_shared));
481 
482 	tmp_vector.resize(0);
483 	get_n_allele_info(tmp_vector);
484 	memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size());
485 	vector_pos += tmp_vector.size();
486 
487 	tmp_vector.resize(0);
488 	get_FORMAT_binary(tmp_vector);
489 	unsigned int key_size, size, type, old_pos, out_size, format_pos = 0;
490 	unsigned int skip_size = 0;
491 	for (unsigned int ui=0; ui<(FORMAT.size()-N_FORMAT_removed); ui++)
492 	{
493 		format_pos = FORMAT_positions[ui];
494 		old_pos = format_pos;
495 
496 		get_typed_int(&format_pos, tmp_vector, type, size);
497 		get_type(&format_pos, tmp_vector, type, size);
498 		key_size = format_pos - old_pos;
499 		out_size = out_vector.size();
500 		out_vector.resize( out_size + key_size );
501 		memcpy(&out_vector[out_size], &tmp_vector[old_pos], key_size );
502 
503 		skip_size = FORMAT_skip[ui];
504 		for (unsigned int uj=0; uj<N_indv; uj++)
505 		{
506 			if (include_indv[uj] == false)
507 				continue;
508 
509 			format_pos = FORMAT_positions[ui]+key_size+skip_size*uj;
510 			out_size = out_vector.size();
511 			out_vector.resize( out_size + skip_size );
512 			if ( ((int)ui == GT_idx) and (include_genotype[ui] == false) )
513 			{
514 				for (int p = 0; p < (int)skip_size; p++)
515 				{
516 					if (p < ploidy[uj])
517 						out_vector[out_size] = (int8_t)0x80;
518 					else
519 						out_vector[out_size] = (int8_t)0x81;
520 					out_size++;
521 				}
522 			}
523 			else
524 			{
525 				memcpy(&out_vector[out_size], &tmp_vector[format_pos], skip_size);
526 				out_size += skip_size;
527 			}
528 		}
529 	}
530 	tmp_vector.resize(0);
531 	get_n_fmt_sample(tmp_vector);
532 	memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size());
533 
534 	uint32_t l_indv = (uint32_t)out_vector.size() - l_shared - (uint32_t)(2*sizeof(uint32_t));
535 	memcpy(&out_vector[sizeof(l_shared)], &l_indv, sizeof(l_indv));
536 	bgzf_write(out, &out_vector[0], out_vector.size());
537 }
538 
539 // Set the include_genotype flag on the basis of depth
filter_genotypes_by_depth(int min_depth,int max_depth)540 void vcf_entry::filter_genotypes_by_depth(int min_depth, int max_depth)
541 {
542 	if (fully_parsed == false)
543 		parse_full_entry();
544 
545 	if (DP_idx != -1)
546 	{	// Have depth info
547 		int depth;
548 		for (unsigned int ui=0; ui<N_indv; ui++)
549 		{
550 			if (parsed_DP[ui] == false)
551 				parse_genotype_entry(ui, false, false, true);
552 			depth = get_indv_DEPTH(ui);
553 			if ((depth < min_depth) || (depth > max_depth))
554 				include_genotype[ui] = false;
555 		}
556 	}
557 }
558 
559 // Filter specific genotypes by quality
filter_genotypes_by_quality(double min_genotype_quality)560 void vcf_entry::filter_genotypes_by_quality(double min_genotype_quality)
561 {
562 	if (fully_parsed == false)
563 		parse_full_entry();
564 
565 	if (GQ_idx != -1)
566 	{	// Have quality info
567 		double quality;
568 		for (unsigned int ui=0; ui<N_indv; ui++)
569 		{
570 			if (parsed_GQ[ui] == false)
571 				parse_genotype_entry(ui, false, true);
572 			quality = get_indv_GQUALITY(ui);
573 			if (quality < min_genotype_quality)
574 				include_genotype[ui] = false;
575 		}
576 	}
577 }
578 
579 // Exclude genotypes with a filter flag.
filter_genotypes_by_filter_status(const set<string> & filter_flags_to_remove,bool remove_all)580 void vcf_entry::filter_genotypes_by_filter_status(const set<string> &filter_flags_to_remove, bool remove_all)
581 {
582 	if (fully_parsed == false)
583 		parse_full_entry();
584 
585 	vector<string> GFILTERs;
586 	if (FT_idx != -1)
587 	{	// Have GFilter info
588 		for (unsigned int ui=0; ui<N_indv; ui++)
589 		{
590 			if (parsed_FT[ui] == false)
591 				parse_genotype_entry(ui, false, false, false, true);
592 			get_indv_GFILTER_vector(ui, GFILTERs);
593 
594 			if (remove_all == true)
595 			{	// If removing all filters, only keep things with label PASS
596 				if (!GFILTERs.empty())
597 					if ((GFILTERs[0] != "PASS") && (GFILTERs[0] != "."))
598 						include_genotype[ui] = false;
599 			}
600 			else
601 			{	// Only removing specific filters
602 				for (unsigned int uj=0; uj<GFILTERs.size(); uj++)
603 					if (filter_flags_to_remove.find(GFILTERs[uj]) != filter_flags_to_remove.end())
604 							include_genotype[ui] = false;
605 			}
606 		}
607 	}
608 }
609 
read_indv_generic_entry(unsigned int indv,const string & FORMAT_id,string & out)610 void vcf_entry::read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out)
611 {
612 	if (fully_parsed == false)
613 		parse_full_entry(true);
614 
615 	if (parsed_FORMAT == false)
616 		set_FORMAT(FORMAT_str);
617 
618 	out = ".";
619 
620 	if (FORMAT_to_idx.find(FORMAT_id) != FORMAT_to_idx.end())
621 	{
622 		unsigned int idx = FORMAT_to_idx[FORMAT_id];
623 		static string tmpstr;
624 		static istringstream ss;
625 		ss.clear();
626 		ss.str(GENOTYPE_str[indv]);
627 
628 		for (unsigned int ui=0; ui <= idx; ui++)
629 		{
630 			getline(ss, tmpstr, ':');
631 			if (ui == idx)
632 			{
633 				out = tmpstr;
634 				break;
635 			}
636 			if (!ss.good())
637 				break;
638 		}
639 	}
640 }
641