1 /*
2 entry_getters.cpp
3 *
4 * Created on: Nov 11, 2009
5 * Author: Adam Auton
6 * ($Revision: 230 $)
7 */
8
9 #include "entry.h"
10
11 // Return the CHROMosome name
get_CHROM() const12 string entry::get_CHROM() const
13 {
14 return CHROM;
15 }
16
17 // Return the CHROMosome name
get_CHROM(string & out) const18 void entry::get_CHROM(string &out) const
19 {
20 out = CHROM;
21 }
22
get_POS() const23 int entry::get_POS() const
24 {
25 return POS;
26 }
27
get_ID() const28 string entry::get_ID() const
29 {
30 if (ID.size() == 0)
31 return ".";
32 return ID;
33 }
34
get_REF() const35 string entry::get_REF() const
36 {
37 if (REF == "")
38 return ".";
39 else
40 return REF;
41 }
42
get_ALT() const43 string entry::get_ALT() const
44 {
45 assert(parsed_ALT == true);
46
47 string out;
48 if (ALT.empty())
49 out = ".";
50 else if (ALT.size() == 1 && ALT[0] == "")
51 out = ".";
52 else
53 {
54 out = ALT[0];
55 for (unsigned int ui=1; ui<ALT.size(); ui++)
56 out += "," + ALT[ui];
57 }
58 return out;
59 }
60
is_SNP() const61 bool entry::is_SNP() const
62 {
63 assert(parsed_ALT == true);
64
65 if (REF.size() != 1)
66 return false; // Reference isn't a single base
67
68 if (ALT.empty())
69 return false; // No alternative allele
70
71 for (unsigned int ui=0; ui<ALT.size(); ui++)
72 if (ALT[ui].size() != 1)
73 return false; // Alternative allele isn't a single base
74
75 return true;
76 }
77
is_biallelic_SNP() const78 bool entry::is_biallelic_SNP() const
79 {
80 assert(parsed_ALT == true);
81
82 if (REF.size() != 1)
83 return false; // Reference isn't a single base
84
85 if (ALT.size() != 1)
86 return false; // Not biallelic
87
88 if (ALT[0].size() != 1)
89 return false; // Alternative allele isn't a single base
90
91 return true;
92 }
93
is_diploid() const94 bool entry::is_diploid() const
95 {
96 for (unsigned int ui=0; ui<N_indv; ui++)
97 {
98 if ((include_indv[ui] == true) && (include_genotype[ui] == true))
99 {
100 assert(parsed_GT[ui] == true);
101 if (ploidy[ui] != 2)
102 return false;
103 }
104 }
105 return true;
106 }
107
get_allele(int allele_num,string & out) const108 void entry::get_allele(int allele_num, string &out) const
109 {
110 assert(parsed_ALT == true);
111
112 if (allele_num == -2)
113 out = "";
114 else if (allele_num == 0)
115 out = REF;
116 else if ((allele_num == -1) || (unsigned(allele_num - 1) >= ALT.size()))
117 out = ".";
118 else
119 out = ALT[allele_num-1];
120 }
121
get_allele(int allele_num) const122 string entry::get_allele(int allele_num) const
123 {
124 assert(parsed_ALT == true);
125
126 if (allele_num == -2)
127 return "";
128 else if (allele_num == 0)
129 return REF;
130 else if ((allele_num < 0) || (unsigned(allele_num - 1) >= ALT.size()))
131 return ".";
132 else
133 return ALT[allele_num-1];
134 }
135
get_ALT_allele(int allele_num) const136 string entry::get_ALT_allele(int allele_num) const
137 {
138 assert(parsed_ALT == true);
139
140 if (allele_num == -2)
141 return "";
142 else if ((allele_num == -1) || (unsigned(allele_num) >= ALT.size()))
143 return ".";
144 return ALT[allele_num];
145 }
146
get_alleles_vector(vector<string> & out) const147 void entry::get_alleles_vector(vector<string> &out) const
148 {
149 assert(parsed_ALT == true);
150 out.resize(ALT.size()+1);
151 out[0] = REF;
152 copy(ALT.begin(), ALT.end(), out.begin()+1);
153 }
154
get_QUAL() const155 double entry::get_QUAL() const
156 {
157 return QUAL;
158 }
159
get_FILTER() const160 string entry::get_FILTER() const
161 {
162 assert(parsed_FILTER == true);
163
164 ostringstream out;
165 if (FILTER.empty())
166 out << ".";
167 else
168 {
169 out << FILTER[0];
170 for (unsigned int ui=1; ui<FILTER.size(); ui++)
171 out << ";" << FILTER[ui];
172 }
173 return out.str();
174 }
175
get_FILTER_vector(vector<string> & out) const176 void entry::get_FILTER_vector(vector<string> &out) const
177 {
178 assert(parsed_FILTER == true);
179 out = FILTER;
180 }
181
get_INFO(const set<string> & INFO_to_keep,bool keep_all_INFO) const182 string entry::get_INFO(const set<string> &INFO_to_keep, bool keep_all_INFO) const
183 {
184 assert(parsed_INFO == true);
185
186 ostringstream sout;
187 sout.str("");
188 sout.clear();
189
190 bool first=true;
191 if ( ( (!INFO.empty()) && (!INFO_to_keep.empty()) ) || keep_all_INFO )
192 {
193 string key;
194 for (unsigned int ui=0; ui<INFO.size();ui++)
195 {
196 key = INFO[ui].first;
197 if ( keep_all_INFO or (INFO_to_keep.find(key) != INFO_to_keep.end() ) )
198 {
199 if (first != true)
200 sout << ";";
201
202 sout << key;
203 if (INFO[ui].second != "")
204 sout << "=" << INFO[ui].second;
205 first = false;
206 }
207 }
208 }
209
210 if (first == true)
211 { // Didn't find any INFO fields to keep
212 sout.str(".");
213 }
214 return sout.str();
215 }
216
get_INFO_vector(const set<string> & INFO_to_keep,bool keep_all_INFO)217 vector<pair<string, string> > entry::get_INFO_vector(const set<string> &INFO_to_keep, bool keep_all_INFO)
218 {
219 assert(parsed_INFO == true);
220
221 vector<pair<string, string> > out_vector;
222 if (keep_all_INFO == true)
223 return INFO;
224
225 if ( (!INFO.empty()) && (!INFO_to_keep.empty()) )
226 {
227 string key;
228 for (unsigned int ui=0; ui<INFO.size();ui++)
229 {
230 key = INFO[ui].first;
231 if ( keep_all_INFO or (INFO_to_keep.find(key) != INFO_to_keep.end() ) )
232 out_vector.push_back( INFO[ui] );
233 else
234 N_INFO_removed++;
235 }
236 }
237
238 return out_vector;
239 }
240
get_INFO_value(const string & key) const241 string entry::get_INFO_value(const string &key) const
242 {
243 assert(parsed_INFO == true);
244
245 for (unsigned int ui=0; ui<INFO.size(); ui++)
246 {
247 if (INFO[ui].first == key)
248 return INFO[ui].second;
249 }
250 return "?";
251 }
252
get_INFO_values(const string & key) const253 vector<string> entry::get_INFO_values(const string &key) const
254 {
255 vector<string> out;
256 string tmp;
257
258 tmp = get_INFO_value(key);
259 if (tmp != "?") header::tokenize(tmp, ',', out);
260
261 return out;
262 }
263
get_FORMAT() const264 string entry::get_FORMAT() const
265 {
266 assert(parsed_FORMAT == true);
267
268 string out;
269 bool first = true;
270 for (unsigned int ui=0; ui<FORMAT.size(); ui++)
271 {
272 if (first == false)
273 out += ":";
274 out += FORMAT[ui];
275 first = false;
276 }
277 return out;
278 }
279
get_FORMAT_binary(vector<char> & out) const280 void entry::get_FORMAT_binary(vector<char> &out) const
281 {
282 assert(parsed_FORMAT_binary == true);
283 out = FORMAT_binary;
284 }
285
286 // Return the alleles of a genotype as a pair of strings.
get_indv_GENOTYPE_strings(unsigned int indv,pair<string,string> & out) const287 void entry::get_indv_GENOTYPE_strings(unsigned int indv, pair<string, string> &out) const
288 {
289 assert(parsed_GT[indv] == true);
290
291 static string out_allele1, out_allele2;
292
293 get_allele(GENOTYPE[indv].first, out_allele1);
294 get_allele(GENOTYPE[indv].second, out_allele2);
295 out = make_pair(out_allele1, out_allele2);
296 }
297
get_indv_GENOTYPE_ids(unsigned int indv,pair<int,int> & out) const298 void entry::get_indv_GENOTYPE_ids(unsigned int indv, pair<int, int> &out) const
299 {
300 assert(parsed_GT[indv] == true);
301 out = GENOTYPE[indv];
302 }
303
get_indv_PHASE(unsigned int indv) const304 char entry::get_indv_PHASE(unsigned int indv) const
305 {
306 assert(parsed_GT[indv] == true);
307 return PHASE[indv];
308 }
309
get_indv_DEPTH(unsigned int indv) const310 int entry::get_indv_DEPTH(unsigned int indv) const
311 {
312 assert(parsed_DP[indv] == true);
313 if (DEPTH.empty())
314 return -1;
315 return DEPTH[indv];
316 }
317
get_indv_GQUALITY(unsigned int indv) const318 double entry::get_indv_GQUALITY(unsigned int indv) const
319 {
320 assert(parsed_GQ[indv] == true);
321 if (GQUALITY.empty())
322 return -1;
323 return GQUALITY[indv];
324 }
325
get_indv_GFILTER_vector(unsigned int indv,vector<string> & out) const326 void entry::get_indv_GFILTER_vector(unsigned int indv, vector<string> &out) const
327 {
328 assert(parsed_FT[indv] == true);
329 if (!GFILTER.empty())
330 out = GFILTER[indv];
331 else
332 out.resize(0);
333 }
334
get_indv_GFILTER(unsigned int indv,string & out) const335 void entry::get_indv_GFILTER(unsigned int indv, string &out) const
336 {
337 assert(parsed_FT[indv] == true);
338
339 if ((!GFILTER.empty()) && (GFILTER[indv].size()>0))
340 {
341 out="";
342 for (unsigned int ui=0; ui<GFILTER[indv].size(); ui++)
343 {
344 if (ui!=0)
345 out += ";";
346 out += GFILTER[indv][ui];
347 }
348 }
349 else
350 out = ".";
351 }
352
get_indv_ploidy(unsigned int indv) const353 int entry::get_indv_ploidy(unsigned int indv) const
354 {
355 assert (parsed_GT[indv]==true);
356 return ploidy[indv];
357 }
358
FORMAT_id_exists(const string & FORMAT_id)359 bool entry::FORMAT_id_exists(const string &FORMAT_id)
360 {
361 assert(parsed_FORMAT == true);
362 if (FORMAT_to_idx.find(FORMAT_id) != FORMAT_to_idx.end())
363 return true;
364 return false;
365 }
366
get_N_alleles() const367 unsigned int entry::get_N_alleles() const
368 {
369 assert(parsed_ALT == true);
370 return (ALT.size()+1);
371 }
372
get_N_chr() const373 unsigned int entry::get_N_chr() const
374 {
375 unsigned int out=0;
376
377 for (unsigned int ui=0; ui<N_indv; ui++)
378 {
379 if (include_indv[ui] == true)
380 {
381 assert(parsed_GT[ui] == true);
382 out += ploidy[ui];
383 }
384 }
385 return out;
386 }
387
388 // Return the frequency (counts) of each allele.
get_allele_counts(vector<int> & out,unsigned int & N_non_missing_chr_out) const389 void entry::get_allele_counts(vector<int> &out, unsigned int &N_non_missing_chr_out) const
390 {
391 get_allele_counts(out, N_non_missing_chr_out, include_indv, include_genotype);
392 }
393
394 // Return the frequency (counts) of each allele.
get_allele_counts(vector<int> & out,unsigned int & N_non_missing_chr_out,const vector<bool> & include_indv,const vector<bool> & include_genotype) const395 void entry::get_allele_counts(vector<int> &out, unsigned int &N_non_missing_chr_out, const vector<bool> &include_indv, const vector<bool> &include_genotype) const
396 {
397 pair<int,int> genotype;
398 vector<int> allele_counts(get_N_alleles(), 0);
399 N_non_missing_chr_out = 0;
400
401 for (unsigned int ui=0; ui<N_indv; ui++)
402 {
403 //FILTERING BY INDIVIDUAL
404 if ((include_indv[ui] == true) && (include_genotype[ui] == true))
405 {
406 assert(parsed_GT[ui] == true);
407 get_indv_GENOTYPE_ids(ui, genotype);
408
409 if (genotype.first > -1)
410 {
411 allele_counts[genotype.first]++;
412 N_non_missing_chr_out++;
413 }
414 if (genotype.second > -1)
415 {
416 allele_counts[genotype.second]++;
417 N_non_missing_chr_out++;
418 }
419 }
420 }
421 out = allele_counts;
422 }
423
get_genotype_counts(const vector<bool> & include_indv,const vector<bool> & include_genotype,unsigned int & out_N_hom1,unsigned int & out_N_het,unsigned int & out_N_hom2) const424 void entry::get_genotype_counts(const vector<bool> &include_indv, const vector<bool> &include_genotype, unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const
425 {
426 out_N_hom1 = 0; out_N_hom2 = 0; out_N_het = 0;
427 pair<int, int> genotype;
428 if (ALT.size() > 1)
429 LOG.error("Tried to return the genotype counts of a non-biallelic SNP", 99);
430
431 for (unsigned int ui=0; ui<N_indv; ui++)
432 {
433 if ((include_indv[ui] == true) && (include_genotype[ui] == true))
434 {
435 assert(parsed_GT[ui] == true);
436 get_indv_GENOTYPE_ids(ui, genotype);
437 if ((genotype.first > -1) && (genotype.second > -1))
438 {
439 if (genotype.first != genotype.second)
440 out_N_het++;
441 else if (genotype.first == 0)
442 out_N_hom1++;
443 else if (genotype.first == 1)
444 out_N_hom2++;
445 else
446 LOG.error("Unknown allele in genotype", 98);
447 }
448 }
449 }
450 }
451
get_multiple_genotype_counts(const vector<bool> & include_indv,const vector<bool> & include_genotype,vector<unsigned int> & out_N_hom,vector<unsigned int> & out_N_het) const452 void entry::get_multiple_genotype_counts(const vector<bool> &include_indv, const vector<bool> &include_genotype, vector<unsigned int> &out_N_hom, vector<unsigned int> &out_N_het) const
453 {
454 out_N_hom.assign(ALT.size()+1, 0);
455 out_N_het.assign(ALT.size()+1, 0);
456 pair<int, int> genotype;
457
458 for (unsigned int ui=0; ui<N_indv; ui++)
459 {
460 if ((include_indv[ui] == true) && (include_genotype[ui] == true))
461 {
462 assert(parsed_GT[ui] == true);
463 get_indv_GENOTYPE_ids(ui, genotype);
464
465 for (int uj=0; uj<=(int)ALT.size(); uj++)
466 {
467 if ((genotype.first == uj) && (genotype.second == uj))
468 out_N_hom[uj]++;
469 else if (((genotype.first == uj) || (genotype.second == uj)) && (genotype.first != -1) && (genotype.second != -1))
470 out_N_het[uj]++;
471 }
472 }
473 }
474 }
475
476 // Return the counts of homozygote1, heterozygotes, and homozygote2
get_genotype_counts(unsigned int & out_N_hom1,unsigned int & out_N_het,unsigned int & out_N_hom2) const477 void entry::get_genotype_counts(unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const
478 {
479 get_genotype_counts(include_indv, include_genotype, out_N_hom1, out_N_het, out_N_hom2);
480 }
481
get_POS_binary(vector<char> & out) const482 void entry::get_POS_binary(vector<char> &out) const
483 {
484 out.resize(sizeof(uint32_t));
485 uint32_t pos = POS - 1;
486 memcpy(&out[0], &pos, sizeof(pos));
487 }
488
get_rlen(vector<char> & out) const489 void entry::get_rlen(vector<char> &out) const
490 {
491 out.resize(sizeof(int32_t));
492 int32_t rlen;
493 if (REF != "" and REF != "." and REF != " ")
494 rlen = (int32_t)REF.length();
495 else
496 rlen = (int32_t)0;
497 memcpy(&out[0], &rlen, sizeof(rlen));
498 }
499
get_QUAL_binary(vector<char> & out) const500 void entry::get_QUAL_binary(vector<char> &out) const
501 {
502 out.resize(sizeof(float));
503 float qual = (float)QUAL;
504 memcpy(&out[0], &qual, sizeof(qual));
505 }
506
get_n_allele_info(vector<char> & out) const507 void entry::get_n_allele_info(vector<char> &out) const
508 {
509 out.resize(sizeof(uint32_t));
510 uint32_t n_allele_info = (uint32_t)ALT.size() + 1;
511 uint32_t n_info = (uint32_t)(INFO.size()-N_INFO_removed);
512
513 n_allele_info = n_allele_info << 16;
514 n_allele_info = n_allele_info | n_info;
515
516 memcpy(&out[0], &n_allele_info, sizeof(n_allele_info));
517 }
518
get_n_fmt_sample(vector<char> & out) const519 void entry::get_n_fmt_sample(vector<char> &out) const
520 {
521 out.resize(sizeof(uint32_t));
522 uint32_t n_fmt_sample = (uint32_t)(FORMAT.size()-N_FORMAT_removed);
523 uint32_t n_sample = (uint32_t)N_indv;
524
525 n_fmt_sample = n_fmt_sample << 24;
526 n_fmt_sample = n_fmt_sample | n_sample;
527
528 memcpy(&out[0], &n_fmt_sample, sizeof(n_fmt_sample));
529 }
530
get_ID_binary(vector<char> & out)531 void entry::get_ID_binary(vector<char> &out)
532 {
533 make_typed_string(out, ID, true );
534 }
535
get_ALLELES_binary(vector<char> & out)536 void entry::get_ALLELES_binary(vector<char> &out)
537 {
538 vector<char> tmp;
539 out.resize(0);
540
541 make_typed_string(tmp, REF, true );
542 out.insert(out.end(), tmp.begin(), tmp.end());
543
544 for (unsigned int ui=0; ui<ALT.size(); ui++)
545 {
546 tmp.resize(0);
547 make_typed_string(tmp, ALT[ui], true );
548 out.insert(out.end(), tmp.begin(), tmp.end());
549 }
550 }
551