1 /*
2 	entry_getters.cpp
3  *
4  *  Created on: Nov 11, 2009
5  *      Author: Adam Auton
6  *      ($Revision: 230 $)
7  */
8 
9 #include "entry.h"
10 
11 // Return the CHROMosome name
get_CHROM() const12 string entry::get_CHROM() const
13 {
14 	return CHROM;
15 }
16 
17 // Return the CHROMosome name
get_CHROM(string & out) const18 void entry::get_CHROM(string &out) const
19 {
20 	out = CHROM;
21 }
22 
get_POS() const23 int entry::get_POS() const
24 {
25 	return POS;
26 }
27 
get_ID() const28 string entry::get_ID() const
29 {
30 	if (ID.size() == 0)
31 		return ".";
32 	return ID;
33 }
34 
get_REF() const35 string entry::get_REF() const
36 {
37 	if (REF == "")
38 		return ".";
39 	else
40 		return REF;
41 }
42 
get_ALT() const43 string entry::get_ALT() const
44 {
45 	assert(parsed_ALT == true);
46 
47 	string out;
48 	if (ALT.empty())
49 		out = ".";
50 	else if (ALT.size() == 1 && ALT[0] == "")
51 		out = ".";
52 	else
53 	{
54 		out = ALT[0];
55 		for (unsigned int ui=1; ui<ALT.size(); ui++)
56 			out += "," + ALT[ui];
57 	}
58 	return out;
59 }
60 
is_SNP() const61 bool entry::is_SNP() const
62 {
63 	assert(parsed_ALT == true);
64 
65 	if (REF.size() != 1)
66 		return false;	// Reference isn't a single base
67 
68 	if (ALT.empty())
69 		return false;	// No alternative allele
70 
71 	for (unsigned int ui=0; ui<ALT.size(); ui++)
72 		if (ALT[ui].size() != 1)
73 			return false;	// Alternative allele isn't a single base
74 
75 	return true;
76 }
77 
is_biallelic_SNP() const78 bool entry::is_biallelic_SNP() const
79 {
80 	assert(parsed_ALT == true);
81 
82 	if (REF.size() != 1)
83 		return false;	// Reference isn't a single base
84 
85 	if (ALT.size() != 1)
86 		return false;	// Not biallelic
87 
88 	if (ALT[0].size() != 1)
89 		return false;	// Alternative allele isn't a single base
90 
91 	return true;
92 }
93 
is_diploid() const94 bool entry::is_diploid() const
95 {
96 	for (unsigned int ui=0; ui<N_indv; ui++)
97 	{
98 		if ((include_indv[ui] == true) && (include_genotype[ui] == true))
99 		{
100 			assert(parsed_GT[ui] == true);
101 			if (ploidy[ui] != 2)
102 				return false;
103 		}
104 	}
105 	return true;
106 }
107 
get_allele(int allele_num,string & out) const108 void entry::get_allele(int allele_num, string &out) const
109 {
110 	assert(parsed_ALT == true);
111 
112 	if (allele_num == -2)
113 		out = "";
114 	else if (allele_num == 0)
115 		out = REF;
116 	else if ((allele_num == -1) || (unsigned(allele_num - 1) >=  ALT.size()))
117 		out = ".";
118 	else
119 		out = ALT[allele_num-1];
120 }
121 
get_allele(int allele_num) const122 string entry::get_allele(int allele_num) const
123 {
124 	assert(parsed_ALT == true);
125 
126 	if (allele_num == -2)
127 		return "";
128 	else if (allele_num == 0)
129 		return REF;
130 	else if ((allele_num < 0) || (unsigned(allele_num - 1) >=  ALT.size()))
131 		return ".";
132 	else
133 		return ALT[allele_num-1];
134 }
135 
get_ALT_allele(int allele_num) const136 string entry::get_ALT_allele(int allele_num) const
137 {
138 	assert(parsed_ALT == true);
139 
140 	if (allele_num == -2)
141 		return "";
142 	else if ((allele_num == -1) || (unsigned(allele_num) >=  ALT.size()))
143 		return ".";
144 	return ALT[allele_num];
145 }
146 
get_alleles_vector(vector<string> & out) const147 void entry::get_alleles_vector(vector<string> &out) const
148 {
149 	assert(parsed_ALT == true);
150 	out.resize(ALT.size()+1);
151 	out[0] = REF;
152 	copy(ALT.begin(), ALT.end(), out.begin()+1);
153 }
154 
get_QUAL() const155 double entry::get_QUAL() const
156 {
157 	return QUAL;
158 }
159 
get_FILTER() const160 string entry::get_FILTER() const
161 {
162 	assert(parsed_FILTER == true);
163 
164 	ostringstream out;
165 	if (FILTER.empty())
166 		out << ".";
167 	else
168 	{
169 		out << FILTER[0];
170 		for (unsigned int ui=1; ui<FILTER.size(); ui++)
171 			out << ";" << FILTER[ui];
172 	}
173 	return out.str();
174 }
175 
get_FILTER_vector(vector<string> & out) const176 void entry::get_FILTER_vector(vector<string> &out) const
177 {
178 	assert(parsed_FILTER == true);
179 	out = FILTER;
180 }
181 
get_INFO(const set<string> & INFO_to_keep,bool keep_all_INFO) const182 string entry::get_INFO(const set<string> &INFO_to_keep, bool keep_all_INFO) const
183 {
184 	assert(parsed_INFO == true);
185 
186 	ostringstream sout;
187 	sout.str("");
188 	sout.clear();
189 
190 	bool first=true;
191 	if ( ( (!INFO.empty()) && (!INFO_to_keep.empty())  ) || keep_all_INFO )
192 	{
193 		string key;
194 		for (unsigned int ui=0; ui<INFO.size();ui++)
195 		{
196 			key = INFO[ui].first;
197 			if ( keep_all_INFO or (INFO_to_keep.find(key) != INFO_to_keep.end() ) )
198 			{
199 				if (first != true)
200 					sout << ";";
201 
202 				sout << key;
203 				if (INFO[ui].second != "")
204 					sout << "=" << INFO[ui].second;
205 				first = false;
206 			}
207 		}
208 	}
209 
210 	if (first == true)
211 	{	// Didn't find any INFO fields to keep
212 		sout.str(".");
213 	}
214 	return sout.str();
215 }
216 
get_INFO_vector(const set<string> & INFO_to_keep,bool keep_all_INFO)217 vector<pair<string, string> > entry::get_INFO_vector(const set<string> &INFO_to_keep, bool keep_all_INFO)
218 {
219 	assert(parsed_INFO == true);
220 
221 	vector<pair<string, string> > out_vector;
222 	if (keep_all_INFO == true)
223 		return INFO;
224 
225 	if ( (!INFO.empty()) && (!INFO_to_keep.empty()) )
226 	{
227 		string key;
228 		for (unsigned int ui=0; ui<INFO.size();ui++)
229 		{
230 			key = INFO[ui].first;
231 			if ( keep_all_INFO or (INFO_to_keep.find(key) != INFO_to_keep.end() ) )
232 				out_vector.push_back( INFO[ui] );
233 			else
234 				N_INFO_removed++;
235 		}
236 	}
237 
238 	return out_vector;
239 }
240 
get_INFO_value(const string & key) const241 string entry::get_INFO_value(const string &key) const
242 {
243 	assert(parsed_INFO == true);
244 
245 	for (unsigned int ui=0; ui<INFO.size(); ui++)
246 	{
247 		if (INFO[ui].first == key)
248 			return INFO[ui].second;
249 	}
250 	return "?";
251 }
252 
get_INFO_values(const string & key) const253 vector<string> entry::get_INFO_values(const string &key) const
254 {
255 	vector<string> out;
256 	string tmp;
257 
258 	tmp = get_INFO_value(key);
259 	if (tmp != "?")	header::tokenize(tmp, ',', out);
260 
261 	return out;
262 }
263 
get_FORMAT() const264 string entry::get_FORMAT() const
265 {
266 	assert(parsed_FORMAT == true);
267 
268 	string out;
269 	bool first = true;
270 	for (unsigned int ui=0; ui<FORMAT.size(); ui++)
271 	{
272 		if (first == false)
273 			out += ":";
274 		out += FORMAT[ui];
275 		first = false;
276 	}
277 	return out;
278 }
279 
get_FORMAT_binary(vector<char> & out) const280 void entry::get_FORMAT_binary(vector<char> &out) const
281 {
282 	assert(parsed_FORMAT_binary == true);
283 	out = FORMAT_binary;
284 }
285 
286 // Return the alleles of a genotype as a pair of strings.
get_indv_GENOTYPE_strings(unsigned int indv,pair<string,string> & out) const287 void entry::get_indv_GENOTYPE_strings(unsigned int indv, pair<string, string> &out) const
288 {
289 	assert(parsed_GT[indv] == true);
290 
291 	static string out_allele1, out_allele2;
292 
293 	get_allele(GENOTYPE[indv].first, out_allele1);
294 	get_allele(GENOTYPE[indv].second, out_allele2);
295 	out = make_pair(out_allele1, out_allele2);
296 }
297 
get_indv_GENOTYPE_ids(unsigned int indv,pair<int,int> & out) const298 void entry::get_indv_GENOTYPE_ids(unsigned int indv, pair<int, int> &out) const
299 {
300 	assert(parsed_GT[indv] == true);
301 	out = GENOTYPE[indv];
302 }
303 
get_indv_PHASE(unsigned int indv) const304 char entry::get_indv_PHASE(unsigned int indv) const
305 {
306 	assert(parsed_GT[indv] == true);
307 	return PHASE[indv];
308 }
309 
get_indv_DEPTH(unsigned int indv) const310 int entry::get_indv_DEPTH(unsigned int indv) const
311 {
312 	assert(parsed_DP[indv] == true);
313 	if (DEPTH.empty())
314 		return -1;
315 	return DEPTH[indv];
316 }
317 
get_indv_GQUALITY(unsigned int indv) const318 double entry::get_indv_GQUALITY(unsigned int indv) const
319 {
320 	assert(parsed_GQ[indv] == true);
321 	if (GQUALITY.empty())
322 		return -1;
323 	return GQUALITY[indv];
324 }
325 
get_indv_GFILTER_vector(unsigned int indv,vector<string> & out) const326 void entry::get_indv_GFILTER_vector(unsigned int indv, vector<string> &out) const
327 {
328 	assert(parsed_FT[indv] == true);
329 	if (!GFILTER.empty())
330 		out = GFILTER[indv];
331 	else
332 		out.resize(0);
333 }
334 
get_indv_GFILTER(unsigned int indv,string & out) const335 void entry::get_indv_GFILTER(unsigned int indv, string &out) const
336 {
337 	assert(parsed_FT[indv] == true);
338 
339 	if ((!GFILTER.empty()) && (GFILTER[indv].size()>0))
340 	{
341 		out="";
342 		for (unsigned int ui=0; ui<GFILTER[indv].size(); ui++)
343 		{
344 			if (ui!=0)
345 				out += ";";
346 			out += GFILTER[indv][ui];
347 		}
348 	}
349 	else
350 		out = ".";
351 }
352 
get_indv_ploidy(unsigned int indv) const353 int entry::get_indv_ploidy(unsigned int indv) const
354 {
355 	assert (parsed_GT[indv]==true);
356 	return ploidy[indv];
357 }
358 
FORMAT_id_exists(const string & FORMAT_id)359 bool entry::FORMAT_id_exists(const string &FORMAT_id)
360 {
361 	assert(parsed_FORMAT == true);
362 	if (FORMAT_to_idx.find(FORMAT_id) != FORMAT_to_idx.end())
363 		return true;
364 	return false;
365 }
366 
get_N_alleles() const367 unsigned int entry::get_N_alleles() const
368 {
369 	assert(parsed_ALT == true);
370 	return (ALT.size()+1);
371 }
372 
get_N_chr() const373 unsigned int entry::get_N_chr() const
374 {
375 	unsigned int out=0;
376 
377 	for (unsigned int ui=0; ui<N_indv; ui++)
378 	{
379 		if (include_indv[ui] == true)
380 		{
381 			assert(parsed_GT[ui] == true);
382 			out += ploidy[ui];
383 		}
384 	}
385 	return out;
386 }
387 
388 // Return the frequency (counts) of each allele.
get_allele_counts(vector<int> & out,unsigned int & N_non_missing_chr_out) const389 void entry::get_allele_counts(vector<int> &out, unsigned int &N_non_missing_chr_out) const
390 {
391 	get_allele_counts(out, N_non_missing_chr_out, include_indv, include_genotype);
392 }
393 
394 // Return the frequency (counts) of each allele.
get_allele_counts(vector<int> & out,unsigned int & N_non_missing_chr_out,const vector<bool> & include_indv,const vector<bool> & include_genotype) const395 void entry::get_allele_counts(vector<int> &out, unsigned int &N_non_missing_chr_out, const vector<bool> &include_indv, const vector<bool> &include_genotype) const
396 {
397 	pair<int,int> genotype;
398 	vector<int> allele_counts(get_N_alleles(), 0);
399 	N_non_missing_chr_out = 0;
400 
401 	for (unsigned int ui=0; ui<N_indv; ui++)
402 	{
403 		//FILTERING BY INDIVIDUAL
404 		if ((include_indv[ui] == true) && (include_genotype[ui] == true))
405 		{
406 			assert(parsed_GT[ui] == true);
407 			get_indv_GENOTYPE_ids(ui, genotype);
408 
409 			if (genotype.first > -1)
410 			{
411 				allele_counts[genotype.first]++;
412 				N_non_missing_chr_out++;
413 			}
414 			if (genotype.second > -1)
415 			{
416 				allele_counts[genotype.second]++;
417 				N_non_missing_chr_out++;
418 			}
419 		}
420 	}
421 	out = allele_counts;
422 }
423 
get_genotype_counts(const vector<bool> & include_indv,const vector<bool> & include_genotype,unsigned int & out_N_hom1,unsigned int & out_N_het,unsigned int & out_N_hom2) const424 void entry::get_genotype_counts(const vector<bool> &include_indv, const vector<bool> &include_genotype, unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const
425 {
426 	out_N_hom1 = 0; out_N_hom2 = 0; out_N_het = 0;
427 	pair<int, int> genotype;
428 	if (ALT.size() > 1)
429 		LOG.error("Tried to return the genotype counts of a non-biallelic SNP", 99);
430 
431 	for (unsigned int ui=0; ui<N_indv; ui++)
432 	{
433 		if ((include_indv[ui] == true) && (include_genotype[ui] == true))
434 		{
435 			assert(parsed_GT[ui] == true);
436 			get_indv_GENOTYPE_ids(ui, genotype);
437 			if ((genotype.first > -1) && (genotype.second > -1))
438 			{
439 				if (genotype.first != genotype.second)
440 					out_N_het++;
441 				else if (genotype.first == 0)
442 					out_N_hom1++;
443 				else if (genotype.first == 1)
444 					out_N_hom2++;
445 				else
446 					LOG.error("Unknown allele in genotype", 98);
447 			}
448 		}
449 	}
450 }
451 
get_multiple_genotype_counts(const vector<bool> & include_indv,const vector<bool> & include_genotype,vector<unsigned int> & out_N_hom,vector<unsigned int> & out_N_het) const452 void entry::get_multiple_genotype_counts(const vector<bool> &include_indv, const vector<bool> &include_genotype, vector<unsigned int> &out_N_hom, vector<unsigned int> &out_N_het) const
453 {
454 	out_N_hom.assign(ALT.size()+1, 0);
455 	out_N_het.assign(ALT.size()+1, 0);
456 	pair<int, int> genotype;
457 
458 	for (unsigned int ui=0; ui<N_indv; ui++)
459 	{
460 		if ((include_indv[ui] == true) && (include_genotype[ui] == true))
461 		{
462 			assert(parsed_GT[ui] == true);
463 			get_indv_GENOTYPE_ids(ui, genotype);
464 
465 			for (int uj=0; uj<=(int)ALT.size(); uj++)
466 			{
467 				if ((genotype.first == uj) && (genotype.second == uj))
468 					out_N_hom[uj]++;
469 				else if (((genotype.first == uj) || (genotype.second == uj)) && (genotype.first != -1) && (genotype.second != -1))
470 					out_N_het[uj]++;
471 			}
472 		}
473 	}
474 }
475 
476 // Return the counts of homozygote1, heterozygotes, and homozygote2
get_genotype_counts(unsigned int & out_N_hom1,unsigned int & out_N_het,unsigned int & out_N_hom2) const477 void entry::get_genotype_counts(unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const
478 {
479 	get_genotype_counts(include_indv, include_genotype, out_N_hom1, out_N_het, out_N_hom2);
480 }
481 
get_POS_binary(vector<char> & out) const482 void entry::get_POS_binary(vector<char> &out) const
483 {
484 	out.resize(sizeof(uint32_t));
485 	uint32_t pos = POS - 1;
486 	memcpy(&out[0], &pos, sizeof(pos));
487 }
488 
get_rlen(vector<char> & out) const489 void entry::get_rlen(vector<char> &out) const
490 {
491 	out.resize(sizeof(int32_t));
492 	int32_t rlen;
493 	if (REF != "" and REF != "." and REF != " ")
494 		rlen = (int32_t)REF.length();
495 	else
496 		rlen = (int32_t)0;
497 	memcpy(&out[0], &rlen, sizeof(rlen));
498 }
499 
get_QUAL_binary(vector<char> & out) const500 void entry::get_QUAL_binary(vector<char> &out) const
501 {
502 	out.resize(sizeof(float));
503 	float qual = (float)QUAL;
504 	memcpy(&out[0], &qual, sizeof(qual));
505 }
506 
get_n_allele_info(vector<char> & out) const507 void entry::get_n_allele_info(vector<char> &out) const
508 {
509 	out.resize(sizeof(uint32_t));
510 	uint32_t n_allele_info = (uint32_t)ALT.size() + 1;
511 	uint32_t n_info = (uint32_t)(INFO.size()-N_INFO_removed);
512 
513 	n_allele_info = n_allele_info << 16;
514 	n_allele_info = n_allele_info | n_info;
515 
516 	memcpy(&out[0], &n_allele_info, sizeof(n_allele_info));
517 }
518 
get_n_fmt_sample(vector<char> & out) const519 void entry::get_n_fmt_sample(vector<char> &out) const
520 {
521 	out.resize(sizeof(uint32_t));
522 	uint32_t n_fmt_sample = (uint32_t)(FORMAT.size()-N_FORMAT_removed);
523 	uint32_t n_sample = (uint32_t)N_indv;
524 
525 	n_fmt_sample = n_fmt_sample << 24;
526 	n_fmt_sample = n_fmt_sample | n_sample;
527 
528 	memcpy(&out[0], &n_fmt_sample, sizeof(n_fmt_sample));
529 }
530 
get_ID_binary(vector<char> & out)531 void entry::get_ID_binary(vector<char> &out)
532 {
533 	make_typed_string(out, ID, true );
534 }
535 
get_ALLELES_binary(vector<char> & out)536 void entry::get_ALLELES_binary(vector<char> &out)
537 {
538 	vector<char> tmp;
539 	out.resize(0);
540 
541 	make_typed_string(tmp, REF, true );
542 	out.insert(out.end(), tmp.begin(), tmp.end());
543 
544 	for (unsigned int ui=0; ui<ALT.size(); ui++)
545 	{
546 		tmp.resize(0);
547 		make_typed_string(tmp, ALT[ui], true );
548 		out.insert(out.end(), tmp.begin(), tmp.end());
549 	}
550 }
551