1 /*
2 * header.cpp
3 *
4 * Created on: Apr 29, 2013
5 * Author: amarcketta
6 */
7
8 #include "header.h"
9
10
header()11 header::header()
12 {
13 has_contigs = false; has_file_format = false; has_genotypes = false;
14 has_header = false; has_idx = false;
15 contig_index = 0; N_indv = 0;
16 }
17
parse_meta(const string & line,unsigned int & line_index)18 void header::parse_meta(const string &line, unsigned int &line_index)
19 {
20 lines.push_back(line);
21 if (line.compare(0,13,"##fileformat=")==0)
22 {
23 has_file_format = true;
24 string version = line.substr(13);
25 if ((version != "VCFv4.0") && (version != "VCFv4.1") && (version != "VCFv4.2"))
26 LOG.error("VCF version must be v4.0, v4.1 or v4.2:\nYou are using version " + version);
27 }
28 else if (line.compare(0,7,"##INFO=")==0)
29 { // Found an INFO descriptor
30 line_index += add_INFO_descriptor(line.substr(8, line.size()-8), line_index);
31 }
32 else if (line.compare(0,9,"##FILTER=")==0)
33 { // Found a FILTER descriptor
34 line_index += add_FILTER_descriptor(line.substr(10, line.size()-8), line_index);
35 }
36 else if (line.compare(0,9,"##FORMAT=")==0)
37 { // Found a genotype filter descriptor
38 line_index += add_FORMAT_descriptor(line.substr(10, line.size()-8), line_index);
39 }
40 else if (line.compare(0,9,"##contig=")==0)
41 { // Found a contig descriptor
42 add_CONTIG_descriptor(line.substr(10, line.size()-8), contig_index);
43 contig_index++;
44 has_contigs = true;
45 }
46 else
47 {
48 Field_description I;
49 size_t found = line.find_first_of("=");
50 I.Field = line.substr(0,found);
51 I.Other = line.substr(found+1);
52 parsed_lines.push_back(I);
53 }
54 }
55
parse_header(const string & line)56 void header::parse_header(const string &line)
57 {
58 // #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... )
59 if (has_header == true)
60 LOG.warning("Multiple Header lines.");
61
62 has_header = true;
63 istringstream header(line);
64 int count = 0;
65 string tmp_str;
66 unsigned int N_header_indv = 0;
67 has_genotypes = false;
68 while (!header.eof())
69 {
70 getline(header, tmp_str, '\t');
71 switch (count)
72 {
73 case 0: if (tmp_str != "#CHROM") LOG.warning("First Header entry should be #CHROM: " + tmp_str); break;
74 case 1: if (tmp_str != "POS") LOG.warning("Second Header entry should be POS: " + tmp_str); break;
75 case 2: if (tmp_str != "ID") LOG.warning("Third Header entry should be ID: " + tmp_str); break;
76 case 3: if (tmp_str != "REF") LOG.warning("Fourth Header entry should be REF: " + tmp_str); break;
77 case 4: if (tmp_str != "ALT") LOG.warning("Fifth Header entry should be ALT: " + tmp_str); break;
78 case 5: if (tmp_str != "QUAL") LOG.warning("Sixth Header entry should be QUAL: " + tmp_str); break;
79 case 6: if (tmp_str != "FILTER") LOG.warning("Seventh Header entry should be FILTER: " + tmp_str); break;
80 case 7: if (tmp_str != "INFO") LOG.warning("Eighth Header entry should be INFO: " + tmp_str); break;
81 case 8:
82 if (tmp_str != "FORMAT")
83 LOG.warning("Ninth Header entry should be FORMAT: " + tmp_str);
84 else
85 has_genotypes = true;
86 break;
87 default:
88 {
89 if (count <= 8)
90 LOG.error("Incorrectly formatted header.");
91 indv.push_back(tmp_str);
92 N_header_indv++;
93 }
94 break;
95 }
96 count++;
97 }
98 N_indv = N_header_indv;
99
100 if ((has_genotypes == true ) && (N_indv == 0))
101 LOG.warning("FORMAT field without genotypes?");
102 }
103
add_INFO_descriptor(const string & in,int index)104 int header::add_INFO_descriptor(const string &in, int index)
105 {
106 size_t found_end=in.find_last_of(">");
107 string details = in.substr(0, found_end);
108
109 Field_description I;
110 I.Field = "INFO";
111 vector<string> tokens;
112 tokenize(details, ',', tokens);
113 if (tokens.size() < 4)
114 LOG.error("Expected at least 4 parts in INFO definition: " + in);
115
116 vector<string> entry;
117 for (unsigned int ui=0; ui<tokens.size(); ui++)
118 {
119 tokenize(tokens[ui], '=', entry);
120 if (entry.size() < 2)
121 {
122 LOG.warning("Warning: Expected at least 2 parts in INFO entry: " + in);
123 continue;
124 }
125 if (entry[0] == "ID") I.ID = entry[1];
126 else if (entry[0] == "Number")
127 {
128 if ((entry[1] == "A") || (entry[1] == "G"))
129 I.N_entries = -1;
130 else
131 I.N_entries = str2int(entry[1]);
132 I.N_entries_str = entry[1];
133 }
134 else if (entry[0] == "Type")
135 {
136 if (entry[1] == "Integer") { I.Type_str = "Integer"; I.Type = Integer; }
137 else if ((entry[1] == "Float") || (entry[1] == "Numeric")) {I.Type_str = "Float"; I.Type = Float;}
138 else if (entry[1] == "Character") {I.Type_str = "Character"; I.Type = Character;}
139 else if (entry[1] == "String") {I.Type_str = "String"; I.Type = String;}
140 else if (entry[1] == "Flag")
141 {
142 I.Type = Flag;
143 I.Type_str = "Flag";
144 if (I.N_entries != 0) LOG.error("Flag Type must have 0 entries: " + in);
145 }
146 else LOG.error("Unknown Type in INFO meta-information: " + in);
147 }
148 else if (entry[0] == "Description") I.Description = entry[1];
149 else if (entry[0] == "Source") I.Source = entry[1];
150 else if (entry[0] == "Version") I.Version = entry[1];
151 else if (entry[0] == "IDX")
152 {
153 has_idx = true;
154 I.idx = str2int(entry[1]);
155 }
156 else
157 {
158 if (I.Other != "")
159 I.Other += ",";
160 I.Other += tokens[ui];
161 }
162 }
163
164 if (I.ID == "") LOG.error("ID required in INFO field description: " + in);
165 if (I.N_entries_str == "") LOG.error("Number required in INFO field description: " + in);
166 if (I.Type_str == "") LOG.error("Type required in INFO field description: " + in);
167 if (I.Description == "") LOG.error("Description required in INFO field description: " + in);
168 if ((I.idx == -1) && (has_idx)) LOG.error("Missing index in INFO field description: " + in);
169 parsed_lines.push_back(I);
170
171 if (I.idx != -1)
172 {
173 INFO_map[I.idx] = I;
174 INFO_reverse_map[I.ID] = I.idx;
175 return 0;
176 }
177 else
178 {
179 if ( FORMAT_reverse_map.find( I.ID ) != FORMAT_reverse_map.end() )
180 {
181 INFO_map[ FORMAT_reverse_map[ I.ID ] ] = I;
182 INFO_reverse_map[I.ID] = FORMAT_reverse_map[I.ID];
183 return 0;
184 }
185 else if ( FILTER_reverse_map.find( I.ID ) != FILTER_reverse_map.end() )
186 {
187 INFO_map[ FILTER_reverse_map[ I.ID ] ] = I;
188 INFO_reverse_map[I.ID] = FILTER_reverse_map[ I.ID ];
189 return 0;
190 }
191 else if ( INFO_reverse_map.find( I.ID ) != INFO_reverse_map.end() )
192 return 0;
193 else
194 {
195 INFO_map[index] = I;
196 INFO_reverse_map[I.ID] = index;
197 return 1;
198 }
199 }
200 }
201
add_FORMAT_descriptor(const string & in,int index)202 int header::add_FORMAT_descriptor(const string &in, int index)
203 {
204 size_t found_end=in.find_last_of(">");
205 string details = in.substr(0, found_end);
206
207 vector<string> tokens;
208 tokenize(details, ',', tokens);
209 Field_description I;
210 I.Field = "FORMAT";
211 if (tokens.size() < 4)
212 LOG.error("Expected at least 4 parts in FORMAT definition: " + in);
213
214 vector<string> entry;
215 for (unsigned int ui=0; ui<tokens.size(); ui++)
216 {
217 tokenize(tokens[ui], '=', entry);
218 if (entry.size() < 2)
219 {
220 LOG.warning("Warning: Expected at least 2 parts in FORMAT entry: " + in);
221 continue;
222 }
223 if (entry[0] == "ID") I.ID = entry[1];
224 else if (entry[0] == "Number")
225 {
226 if ((entry[1] == "A") || (entry[1] == "G"))
227 I.N_entries = -1;
228 else
229 I.N_entries = str2int(entry[1]);
230 I.N_entries_str = entry[1];
231 }
232 else if (entry[0] == "Type")
233 {
234 if (entry[1] == "Integer") {I.Type_str = "Integer"; I.Type = Integer;}
235 else if ((entry[1] == "Float") || (entry[1] == "Numeric")) {I.Type_str = "Float"; I.Type = Float;}
236 else if (entry[1] == "Character") {I.Type_str = "Character"; I.Type = Character;}
237 else if (entry[1] == "String") {I.Type_str = "String"; I.Type = String;}
238 else if (entry[1] == "Flag")
239 {
240 I.Type = Flag;
241 I.Type_str = "Flag";
242 if (I.N_entries != 0) LOG.error("Flag Type must have 0 entries: " + in);
243 }
244 else LOG.error("Unknown Type in FORMAT meta-information: " + in);
245 }
246 else if (entry[0] == "Description") I.Description = entry[1];
247 else if (entry[0] == "IDX")
248 {
249 has_idx = true;
250 I.idx = str2int(entry[1]);
251 }
252 else
253 {
254 if (I.Other != "")
255 I.Other += ",";
256 I.Other += tokens[ui];
257 }
258 }
259
260 if (I.ID == "") LOG.error("ID required in FORMAT field description: " + in);
261 if (I.N_entries_str == "") LOG.error("Number required in FORMAT field description: " + in);
262 if (I.Type_str == "") LOG.error("Type required in FORMAT field description: " + in);
263 if (I.Description == "") LOG.error("Description required in FORMAT field description: " + in);
264 if ((I.idx == -1) && (has_idx)) LOG.error("Missing index in FORMAT field description: " + in);
265 parsed_lines.push_back(I);
266
267 if (I.idx != -1)
268 {
269 FORMAT_map[I.idx] = I;
270 FORMAT_reverse_map[I.ID] = I.idx;
271 return 0;
272 }
273 else
274 {
275 if ( FILTER_reverse_map.find( I.ID ) != FILTER_reverse_map.end() )
276 {
277 FORMAT_map[ FILTER_reverse_map[ I.ID ] ] = I;
278 FORMAT_reverse_map[I.ID] = FILTER_reverse_map[ I.ID ];
279 return 0;
280 }
281 else if ( INFO_reverse_map.find( I.ID ) != INFO_reverse_map.end() )
282 {
283 FORMAT_map[ INFO_reverse_map[ I.ID ] ] = I;
284 FORMAT_reverse_map[I.ID] = INFO_reverse_map[ I.ID ];
285 return 0;
286 }
287 else if ( FORMAT_reverse_map.find( I.ID ) != FORMAT_reverse_map.end() )
288 return 0;
289 else
290 {
291 FORMAT_map[ index ] = I;
292 FORMAT_reverse_map[I.ID] = index;
293 return 1;
294 }
295 }
296 }
297
add_CONTIG_descriptor(const string & in,int index)298 void header::add_CONTIG_descriptor(const string &in, int index)
299 {
300 size_t found_end=in.find_last_of(">");
301 string details = in.substr(0, found_end);
302
303 vector<string> tokens;
304 tokenize(details, ',', tokens);
305 Field_description I;
306 I.Field = "contig";
307 bool id_found = false;
308 vector<string> entry;
309
310 for (unsigned int ui=0; ui<tokens.size(); ui++)
311 {
312 tokenize(tokens[ui], '=', entry);
313 if (entry.size() < 2)
314 {
315 LOG.warning("Warning: Expected at least 2 parts in CONTIG entry: " + in);
316 continue;
317 }
318 if (entry[0] == "ID")
319 {
320 I.ID = entry[1];
321 id_found = true;
322 }
323 else if (entry[0] == "length") I.Length = entry[1];
324 else if (entry[0] == "assembly") I.Assembly = entry[1];
325 else
326 {
327 if (I.Other != "")
328 I.Other += ",";
329 I.Other += tokens[ui];
330 }
331 }
332 if (id_found == false)
333 LOG.warning("CONTIG declaration found without ID: "+ in + "\n");
334
335 parsed_lines.push_back(I);
336 CONTIG_map[index] = I;
337 CONTIG_reverse_map[I.ID] = index;
338 }
339
add_FILTER_descriptor(const string & in,int index)340 int header::add_FILTER_descriptor(const string &in, int index)
341 {
342 size_t found_end=in.find_last_of(">");
343 string details = in.substr(0, found_end);
344 vector<string> tokens;
345 tokenize(details, ',', tokens);
346 if (tokens.size() < 2)
347 LOG.error("Expected at least 2 parts in FILTER definition: " + in);
348
349 string Description;
350 Field_description I;
351 I.Field = "FILTER";
352 vector<string> entry;
353
354 for (unsigned int ui=0; ui<tokens.size(); ui++)
355 {
356 tokenize(tokens[ui], '=', entry);
357 if (entry.size() < 2)
358 {
359 LOG.warning("Warning: Expected at least 2 parts in FORMAT entry: " + in);
360 continue;
361 }
362 if (entry[0] == "ID") I.ID = entry[1];
363 else if (entry[0] == "Description")
364 {
365 I.Description = entry[1];
366 }
367 else if (entry[0] == "IDX")
368 {
369 has_idx = true;
370 I.idx = str2int(entry[1]);
371 }
372 else
373 {
374 if (I.Other != "")
375 I.Other += ",";
376 I.Other += tokens[ui];
377 }
378 }
379
380 if (I.ID == "") LOG.error("ID required in FILTER field description: " + in);
381 if (I.Description == "") LOG.error("Description required in FILTER field description: " + in);
382 if ((I.idx == -1) && (has_idx)) LOG.error("Missing index in FILTER field description: " + in);
383 if (I.ID != "PASS") parsed_lines.push_back(I);
384
385 if (I.idx != -1)
386 {
387 FILTER_map[I.idx] = I;
388 FILTER_reverse_map[I.ID] = I.idx;
389 return 0;
390 }
391 else
392 {
393 if ( INFO_reverse_map.find( I.ID ) != INFO_reverse_map.end() )
394 {
395 FILTER_map[ INFO_reverse_map[ I.ID ] ] = I;
396 FILTER_reverse_map[I.ID] = INFO_reverse_map[ I.ID ];
397 return 0;
398 }
399 else if ( FORMAT_reverse_map.find( I.ID ) != FORMAT_reverse_map.end() )
400 {
401 FILTER_map[ FORMAT_reverse_map[ I.ID ] ] = I;
402 FILTER_reverse_map[I.ID] = FORMAT_reverse_map[ I.ID ];
403 return 0;
404 }
405 else if ( FILTER_reverse_map.find( I.ID ) != FILTER_reverse_map.end() )
406 return 0;
407 else
408 {
409 FILTER_map[index] = I;
410 FILTER_reverse_map[I.ID] = index;
411 return 1;
412 }
413 }
414 }
415
reprint()416 void header::reprint()
417 {
418 lines.resize(0);
419 Field_description I;
420 for (unsigned int ui=0; ui<parsed_lines.size(); ui++)
421 {
422 I = parsed_lines[ui];
423 stringstream new_line;
424
425 if (I.Field[0] == '#')
426 new_line << I.Field << "=" << I.Other;
427 else
428 {
429 new_line << "##" << I.Field << "=<";
430 if (I.ID != "") new_line << "ID=" << I.ID;
431 if (I.N_entries_str != "") new_line << ",Number=" << I.N_entries_str;
432 if (I.Type_str != "") new_line << ",Type=" << I.Type_str;
433 if (I.Description != "") new_line << ",Description=" << I.Description;
434 if (I.Source != "") new_line << ",Source=" << I.Source;
435 if (I.Version != "") new_line << ",Version=" << I.Version;
436 if (I.Length != "") new_line << ",Length=" << I.Length;
437 if (I.Assembly != "") new_line << ",Assembly=" << I.Assembly;
438 if (I.Other != "") new_line << "," << I.Other;
439 new_line << ">";
440 }
441 lines.push_back(new_line.str());
442 }
443 }
444
reparse()445 void header::reparse()
446 {
447 unsigned int index = 0;
448 has_idx = false; contig_index = 0;
449 vector<string> old_lines(lines.size(),"");
450 copy(lines.begin(), lines.end(), old_lines.begin());
451 lines.resize(0);
452
453 INFO_map.clear(); INFO_reverse_map.clear();
454 FILTER_map.clear(); FILTER_reverse_map.clear();
455 FORMAT_map.clear(); FORMAT_reverse_map.clear();
456 CONTIG_map.clear(); CONTIG_reverse_map.clear();
457
458 index += add_FILTER_descriptor("ID=PASS,Description=PASS", index);
459 for (unsigned int ui=0; ui<old_lines.size(); ui++)
460 parse_meta(old_lines[ui],index);
461 }
462
tokenize(const string & in,char token,vector<string> & out)463 void header::tokenize(const string &in, char token, vector<string> &out)
464 {
465 out.resize(0);
466 istringstream ss(in);
467 string tmp;
468 while( getline(ss, tmp, token) )
469 {
470 out.push_back(tmp);
471 }
472 }
473
split(const string & text,char sep,vector<string> & tokens)474 void header::split(const string &text, char sep, vector<string> &tokens)
475 {
476 int start = 0, end = 0, idx = 0, max = tokens.size();
477 while ((end = text.find(sep, start)) != string::npos)
478 {
479 if (idx < max)
480 tokens[idx] = text.substr(start, end - start);
481 else
482 tokens.push_back(text.substr(start, end - start));
483 start = end + 1;
484 idx++;
485 }
486 if (idx < max)
487 tokens[idx] = text.substr(start);
488 else
489 tokens.push_back(text.substr(start));
490 }
491
int2str(const int in,const int missing_value)492 string header::int2str(const int in, const int missing_value)
493 {
494 if (in == missing_value)
495 return ".";
496 else
497 {
498 static ostringstream out;
499 out.str(""); out.clear();
500 out << in;
501 return out.str();
502 }
503 }
504
str2int(const string & in,const int missing_value)505 int header::str2int(const string &in, const int missing_value)
506 {
507 if ((in.size() == 0) || (in == "."))
508 return missing_value;
509 else
510 return atoi(in.c_str());
511 }
512
str2double(const string & in,const double missing_value)513 double header::str2double(const string &in, const double missing_value)
514 {
515 if ((in.size() == 0) || (in == "."))
516 return missing_value;
517 else
518 return atof(in.c_str());
519 }
520
double2str(const double in,const double missing_value)521 string header::double2str(const double in, const double missing_value)
522 {
523 if (in == missing_value)
524 return ".";
525 else
526 {
527 static ostringstream out;
528 out.str(""); out.clear();
529 out << in;
530 return out.str();
531 }
532 }
533