1 /*
2 * bcf_entry_setters.cpp
3 *
4 * Created on: Sep 20, 2012
5 * Author: Anthony Marcketta
6 * ($Revision: 1 $)
7 */
8
9 #include "bcf_entry.h"
10
set_ALT(const int n_allele)11 void bcf_entry::set_ALT(const int n_allele)
12 {
13 ALT.resize(n_allele-1);
14 unsigned int pos = ALT_pos;
15 string allele;
16 for (int ui=0; ui<(n_allele-1); ui++)
17 {
18 allele = get_typed_string( &pos, line );
19 std::transform(allele.begin(), allele.end(), allele.begin(), ::toupper);
20 ALT[ui] = allele;
21 }
22
23 parsed_ALT = true;
24 }
25
set_ALT(const string & in)26 void bcf_entry::set_ALT(const string &in)
27 {
28 istringstream ss(in);
29 string tmpstr;
30 ALT.resize(0);
31 while(!ss.eof())
32 {
33 getline(ss, tmpstr, ',');
34 add_ALT_allele(tmpstr);
35 }
36 parsed_ALT = true;
37 }
38
set_INFO()39 void bcf_entry::set_INFO()
40 {
41 int key;
42 unsigned int size, type, i = INFO_pos;
43 string data_type;
44 INFO.resize(N_info);
45 bool miss = true;
46
47 for (unsigned int ui=0; ui<N_info; ui++)
48 {
49 key = get_typed_int(&i, line, type, size);
50 get_type(&i, line, type, size);
51
52 pair<string, string> INFO_entry(entry_header.INFO_map[key].ID, ".");
53 data_type = entry_header.INFO_map[key].Type_str;
54 ostringstream ss(ostringstream::out);
55
56 for (unsigned int uj=0; uj<size; uj++)
57 {
58 if (uj !=0 && type != 7)
59 ss << ",";
60
61 if ( check_missing(i, type, line) )
62 ss << ".";
63
64 else if(type==1)
65 {
66 int8_t tmp;
67 memcpy(&tmp, &line[i], sizeof(tmp));
68 i += sizeof(tmp);
69 ss << (int)tmp;
70 miss = false;
71 }
72 else if(type==2)
73 {
74 int16_t tmp;
75 memcpy(&tmp, &line[i], sizeof(tmp));
76 i += sizeof(tmp);
77 ss << (int)tmp;
78 miss = false;
79 }
80 else if(type==3)
81 {
82 int32_t tmp;
83 memcpy(&tmp, &line[i], sizeof(tmp));
84 i += sizeof(tmp);
85 ss << (int)tmp;
86 miss = false;
87 }
88 else if(type==5)
89 {
90 float tmp;
91 memcpy(&tmp, &line[i], sizeof(tmp));
92 i += sizeof(tmp);
93 ss << tmp;
94 miss = false;
95 }
96 else if(type==7)
97 {
98 char tmp;
99 memcpy(&tmp, &line[i], sizeof(tmp));
100 i += sizeof(tmp);
101 ss << tmp;
102 miss = false;
103 }
104 else
105 {
106 LOG.printLOG("Error: Unknown type: " + header::int2str(type) + "\n");
107 parsed_INFO = false;
108 return;
109 }
110 }
111
112 string value = ss.str();
113 if (miss)
114 INFO_entry.second = ".";
115 else if (data_type == "Flag")
116 INFO_entry.second = "1";
117 else if (value != "")
118 INFO_entry.second = ss.str();
119 else
120 INFO_entry.second = ".";
121 INFO[ui] = INFO_entry;
122 }
123 parsed_INFO = true;
124 }
125
set_FORMAT()126 void bcf_entry::set_FORMAT()
127 {
128 FORMAT.resize(0);
129 FORMAT_to_idx.clear();
130 unsigned int l_pos = L_shared + 2*sizeof(uint32_t);
131 unsigned int fmt_key, type, size, skip;
132
133 for (unsigned int ui=0; ui<N_format; ui++)
134 {
135 fmt_key = get_typed_int(&l_pos, line, type, size);
136 get_type(&l_pos, line, type, size);
137
138 if ( (type == 1) or (type == 7) )
139 skip = sizeof(int8_t)*size*N_indv;
140 else if (type == 2)
141 skip = sizeof(int16_t)*size*N_indv;
142 else if ( (type == 3) or (type == 5) )
143 skip = sizeof(int32_t)*size*N_indv;
144 else
145 {
146 LOG.printLOG("Error: Unknown type: " + header::int2str(type) + "\n");
147 parsed_FORMAT = false;
148 return;
149 }
150 add_FORMAT_entry( entry_header.FORMAT_map[fmt_key].ID, fmt_key, ui, l_pos, type, size);
151 l_pos += skip;
152 }
153
154 GT_idx = -1;
155 GQ_idx = -1;
156 DP_idx = -1;
157 FT_idx = -1;
158
159 if (FORMAT_to_idx.find("GT") != FORMAT_to_idx.end())
160 GT_idx = FORMAT_to_idx["GT"];
161 if (FORMAT_to_idx.find("GQ") != FORMAT_to_idx.end())
162 GQ_idx = FORMAT_to_idx["GQ"];
163 if (FORMAT_to_idx.find("DP") != FORMAT_to_idx.end())
164 DP_idx = FORMAT_to_idx["DP"];
165 if (FORMAT_to_idx.find("FT") != FORMAT_to_idx.end())
166 FT_idx = FORMAT_to_idx["FT"];
167 parsed_FORMAT = true;
168 }
169
add_FORMAT_entry(const string & in,const unsigned int & fmt_key,const unsigned int & pos,const unsigned int & line_pos,const unsigned int & type,const unsigned int & size)170 void bcf_entry::add_FORMAT_entry(const string &in, const unsigned int &fmt_key, const unsigned int &pos, const unsigned int &line_pos, const unsigned int &type, const unsigned int &size)
171 {
172 FORMAT.push_back(in);
173 FORMAT_to_idx[in] = pos;
174 FORMAT_positions[pos] = line_pos;
175 FORMAT_types[pos] = type;
176 FORMAT_sizes[pos] = size;
177 FORMAT_keys[pos] = fmt_key;
178
179 if ( (type==1) or (type==7) )
180 FORMAT_skip[pos] = ( sizeof(int8_t)*size );
181 else if ( type==2 )
182 FORMAT_skip[pos] = ( sizeof(int16_t)*size );
183 else if ( (type==3) or (type==5) )
184 FORMAT_skip[pos] = ( sizeof(int32_t)*size );
185 }
186
set_indv_GENOTYPE_and_PHASE(unsigned int indv,const unsigned int & pos,const unsigned int & size)187 void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const unsigned int &pos, const unsigned int &size)
188 {
189 int8_t tmp, tmp2;
190 unsigned int cur_pos = pos;
191 char phased[2] = {'/', '|'};
192 ploidy.resize(N_indv);
193 ploidy[indv] = 0;
194
195 for (unsigned int ui=0; ui<size; ui++)
196 {
197 tmp = *reinterpret_cast<const int8_t*>(&line[cur_pos]);
198 if ( tmp == (int8_t)0x81 )
199 break;
200 ploidy[indv]++;
201 cur_pos += sizeof(int8_t);
202 }
203
204 if (ploidy[indv] == 0)
205 {
206 set_indv_GENOTYPE_alleles(indv, make_pair(-2, -2));
207 }
208 else if (ploidy[indv] == 1)
209 {
210 set_indv_PHASE(indv, '|');
211 tmp = *reinterpret_cast<const int8_t*>(&line[pos]);
212 if (tmp == (int8_t)0x80)
213 tmp = -1;
214 else
215 tmp = (tmp >> 1) - 1;
216 set_indv_GENOTYPE_alleles(indv, make_pair(tmp, -2));
217 }
218 else if (ploidy[indv] == 2)
219 {
220 tmp = *reinterpret_cast<const int8_t*>(&line[pos]);
221 tmp2 = *reinterpret_cast<const int8_t*>(&line[pos+sizeof(int8_t)]);
222
223 if (tmp == (int8_t)0x80)
224 tmp = -1;
225 else
226 tmp = (tmp >> 1) - 1;
227
228 if (tmp2 == (int8_t)0x80)
229 {
230 tmp2 = -1;
231 set_indv_PHASE(indv, '/');
232 }
233 else
234 {
235 char phase = phased[ tmp2 & (int8_t)1 ];
236 tmp2 = (tmp2 >> 1) - 1;
237 set_indv_PHASE(indv, phase);
238 }
239 set_indv_GENOTYPE_alleles(indv, make_pair((int)tmp, (int)tmp2));
240 }
241 else if (ploidy[indv] > 2)
242 LOG.error("Polyploidy found, and is not supported by vcftools: " + CHROM + ":" + header::int2str(POS));
243 parsed_GT[indv] = true;
244 }
245
set_indv_GENOTYPE_and_PHASE(unsigned int indv,const pair<int,int> & genotype,char phase)246 void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair<int, int> &genotype, char phase)
247 {
248 set_indv_GENOTYPE_ids(indv, genotype);
249 set_indv_PHASE(indv, phase);
250 parsed_GT[indv] = true;
251 }
252
set_indv_GENOTYPE_and_PHASE(unsigned int indv,const pair<string,string> & genotype,char phase)253 void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair<string, string> &genotype, char phase)
254 {
255 pair<int, int> a(-1,-1);
256 if (genotype.first != ".")
257 a.first = header::str2int(genotype.first);
258 if (genotype.second != ".")
259 a.second = header::str2int(genotype.second);
260
261 set_indv_GENOTYPE_alleles(indv, a);
262 set_indv_PHASE(indv, phase);
263 parsed_GT[indv] = true;
264 }
265
set_indv_GENOTYPE_alleles(unsigned int indv,const pair<int,int> & in)266 void bcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, const pair<int, int> &in)
267 {
268 if (GENOTYPE.size() == 0)
269 GENOTYPE.resize(N_indv, make_pair(-1,-1));
270
271 pair<int, int> a(-1,-1);
272
273 if (in.first == 0x81)
274 a.first = -2;
275 else if (in.first != 0x80)
276 a.first = in.first;
277
278 if (in.second == 0x81)
279 a.second = -2;
280 else if (in.second != 0x80)
281 a.second = in.second;
282
283 GENOTYPE[indv] = in;
284 parsed_GT[indv] = true;
285 }
286
set_indv_GENOTYPE_ids(unsigned int indv,const pair<int,int> & in)287 void bcf_entry::set_indv_GENOTYPE_ids(unsigned int indv, const pair<int, int> &in)
288 {
289 if (GENOTYPE.size() == 0)
290 GENOTYPE.resize(N_indv, make_pair(-2,-2));
291 GENOTYPE[indv] = in;
292 }
293
set_indv_PHASE(unsigned int indv,char in)294 void bcf_entry::set_indv_PHASE(unsigned int indv, char in)
295 {
296 if (PHASE.size() == 0)
297 PHASE.resize(N_indv, '/');
298
299 PHASE[indv] = in;
300 parsed_GT[indv] = true;
301 }
302
set_indv_GQUALITY(unsigned int indv,const vector<char> & in)303 void bcf_entry::set_indv_GQUALITY(unsigned int indv, const vector<char> &in)
304 {
305 float tmp;
306 memcpy(&tmp, &in[0], sizeof(tmp));
307
308 parsed_GQ[indv] = true;
309 if (tmp == 0x7F800001)
310 {
311 if (GQUALITY.size() > 0)
312 GQUALITY[indv] = -1;
313 return;
314 }
315 if (GQUALITY.size() == 0)
316 GQUALITY.resize(N_indv, -1);
317
318 if (tmp > 99.0)
319 tmp = 99;
320 GQUALITY[indv] = tmp;
321 }
322
set_indv_GQUALITY(unsigned int indv,const float & in)323 void bcf_entry::set_indv_GQUALITY(unsigned int indv, const float &in)
324 {
325 parsed_GQ[indv] = true;
326 if ( (in == -1) or (in == 0x7F800001) )
327 {
328 if (GQUALITY.size() > 0)
329 GQUALITY[indv] = -1;
330 return;
331 }
332 if (GQUALITY.size() == 0)
333 GQUALITY.resize(N_indv, -1);
334
335 if (in > 99)
336 GQUALITY[indv] = 99;
337 else
338 GQUALITY[indv] = in;
339 }
340
set_indv_GFILTER(unsigned int indv,const vector<char> & in)341 void bcf_entry::set_indv_GFILTER(unsigned int indv, const vector<char> &in)
342 {
343 parsed_FT[indv] = true;
344
345 if (GFILTER.size() == 0)
346 GFILTER.resize(N_indv);
347
348 GFILTER[indv].resize(0);
349 if (in.empty())
350 return;
351 else if ((in.size() == 1) and (in[0] == '\0') )
352 return;
353
354 ostringstream ss;
355 string ith_FILTER;
356 ss.clear();
357 for (unsigned int ui=0; ui<in.size(); ui++)
358 {
359 if (in[ui] == ';')
360 {
361 ith_FILTER = ss.str();
362 ss.clear();
363
364 if ((ith_FILTER.size()==0) || (ith_FILTER == "."))
365 continue; // Don't bother storing "unfiltered" state.
366
367 GFILTER[indv].push_back(ith_FILTER);
368 }
369 else
370 ss << in[ui];
371 }
372 ith_FILTER = ss.str();
373 ss.clear();
374
375 if ((ith_FILTER.size()!=0) || (ith_FILTER != "."))
376 GFILTER[indv].push_back(ith_FILTER);
377 }
378
set_indv_GFILTER(unsigned int indv,const string & in)379 void bcf_entry::set_indv_GFILTER(unsigned int indv, const string &in)
380 {
381 parsed_FT[indv] = true;
382
383 if (GFILTER.size() == 0)
384 GFILTER.resize(N_indv);
385
386 GFILTER[indv].resize(0);
387 if ((in.size() == 0) || (in == "."))
388 return;
389
390 static istringstream ss;
391 static string ith_FILTER;
392 ss.clear();
393 ss.str(in);
394 while (!ss.eof())
395 {
396 getline(ss, ith_FILTER, ';');
397
398 if ((ith_FILTER.size()==0) || (ith_FILTER == "."))
399 continue; // Don't bother storing "unfiltered" state.
400
401 GFILTER[indv].push_back(ith_FILTER);
402 }
403 }
404
set_FILTER()405 void bcf_entry::set_FILTER()
406 {
407 FILTER_str = get_int_vector( &FILTER_pos, line );
408 FILTER.resize(0);
409
410 if ( (FILTER_str.size() == 1) and entry_header.FILTER_map[ FILTER_str[0] ].ID == "")
411 {
412 FILTER.push_back("PASS");
413 return;
414 }
415
416 for (unsigned int ui=0; ui<FILTER_str.size(); ui++)
417 {
418 if ((int8_t)FILTER_str[ui] != (int8_t)0x80)
419 FILTER.push_back( entry_header.FILTER_map[ FILTER_str[ui] ].ID );
420 else
421 FILTER.push_back( "." );
422 }
423 sort(FILTER.begin(), FILTER.end());
424 parsed_FILTER = true;
425 }
426