1 /*
2  * header.cpp
3  *
4  *  Created on: Apr 29, 2013
5  *      Author: amarcketta
6  */
7 
8 #include "header.h"
9 
10 
header()11 header::header()
12 {
13 	has_contigs = false; has_file_format = false; has_genotypes = false;
14 	has_header = false; has_idx = false;
15 	contig_index = 0; N_indv = 0;
16 }
17 
parse_meta(const string & line,unsigned int & line_index)18 void header::parse_meta(const string &line, unsigned int &line_index)
19 {
20 	lines.push_back(line);
21 	if (line.compare(0,13,"##fileformat=")==0)
22 	{
23 		has_file_format = true;
24 		string version = line.substr(13);
25 		if ((version != "VCFv4.0") && (version != "VCFv4.1") && (version != "VCFv4.2"))
26 			LOG.error("VCF version must be v4.0, v4.1 or v4.2:\nYou are using version " + version);
27 	}
28 	else if (line.compare(0,7,"##INFO=")==0)
29 	{	// Found an INFO descriptor
30 		line_index += add_INFO_descriptor(line.substr(8, line.size()-8), line_index);
31 	}
32 	else if (line.compare(0,9,"##FILTER=")==0)
33 	{	// Found a FILTER descriptor
34 		line_index += add_FILTER_descriptor(line.substr(10, line.size()-8), line_index);
35 	}
36 	else if (line.compare(0,9,"##FORMAT=")==0)
37 	{	// Found a genotype filter descriptor
38 		line_index += add_FORMAT_descriptor(line.substr(10, line.size()-8), line_index);
39 	}
40 	else if (line.compare(0,9,"##contig=")==0)
41 	{	// Found a contig descriptor
42 		add_CONTIG_descriptor(line.substr(10, line.size()-8), contig_index);
43 		contig_index++;
44 		has_contigs = true;
45 	}
46 	else
47 	{
48 		Field_description I;
49 		size_t found = line.find_first_of("=");
50 		I.Field = line.substr(0,found);
51 		I.Other = line.substr(found+1);
52 		parsed_lines.push_back(I);
53 	}
54 }
55 
parse_header(const string & line)56 void header::parse_header(const string &line)
57 {
58 	// #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	(FORMAT	NA00001 NA00002 ... )
59 	if (has_header == true)
60 		LOG.warning("Multiple Header lines.");
61 
62 	has_header = true;
63 	istringstream header(line);
64 	int count = 0;
65 	string tmp_str;
66 	unsigned int N_header_indv = 0;
67 	has_genotypes = false;
68 	while (!header.eof())
69 	{
70 		getline(header, tmp_str, '\t');
71 		switch (count)
72 		{
73 			case 0: if (tmp_str != "#CHROM") LOG.warning("First Header entry should be #CHROM: " + tmp_str); break;
74 			case 1: if (tmp_str != "POS") LOG.warning("Second Header entry should be POS: " + tmp_str); break;
75 			case 2: if (tmp_str != "ID") LOG.warning("Third Header entry should be ID: " + tmp_str); break;
76 			case 3: if (tmp_str != "REF") LOG.warning("Fourth Header entry should be REF: " + tmp_str); break;
77 			case 4: if (tmp_str != "ALT") LOG.warning("Fifth Header entry should be ALT: " + tmp_str); break;
78 			case 5: if (tmp_str != "QUAL") LOG.warning("Sixth Header entry should be QUAL: " + tmp_str); break;
79 			case 6: if (tmp_str != "FILTER") LOG.warning("Seventh Header entry should be FILTER: " + tmp_str); break;
80 			case 7: if (tmp_str != "INFO") LOG.warning("Eighth Header entry should be INFO: " + tmp_str); break;
81 			case 8:
82 				if (tmp_str != "FORMAT")
83 					LOG.warning("Ninth Header entry should be FORMAT: " + tmp_str);
84 				else
85 					has_genotypes = true;
86 				break;
87 			default:
88 			{
89 				if (count <= 8)
90 					LOG.error("Incorrectly formatted header.");
91 				indv.push_back(tmp_str);
92 				N_header_indv++;
93 			}
94 			break;
95 		}
96 		count++;
97 	}
98 	N_indv = N_header_indv;
99 
100 	if ((has_genotypes == true ) && (N_indv == 0))
101 		LOG.warning("FORMAT field without genotypes?");
102 }
103 
add_INFO_descriptor(const string & in,int index)104 int header::add_INFO_descriptor(const string &in, int index)
105 {
106 	size_t found_end=in.find_last_of(">");
107 	string details = in.substr(0, found_end);
108 
109 	Field_description I;
110 	I.Field = "INFO";
111 	vector<string> tokens;
112 	tokenize(details, ',', tokens);
113 	if (tokens.size() < 4)
114 		LOG.error("Expected at least 4 parts in INFO definition: " + in);
115 
116 	vector<string> entry;
117 	for (unsigned int ui=0; ui<tokens.size(); ui++)
118 	{
119 		tokenize(tokens[ui], '=', entry);
120 		if (entry.size() < 2)
121 		{
122 			LOG.warning("Warning: Expected at least 2 parts in INFO entry: " + in);
123 			continue;
124 		}
125 		if (entry[0] == "ID") I.ID = entry[1];
126 		else if (entry[0] == "Number")
127 		{
128 			if ((entry[1] == "A") || (entry[1] == "G"))
129 				I.N_entries = -1;
130 			else
131 				I.N_entries =  str2int(entry[1]);
132 			I.N_entries_str = entry[1];
133 		}
134 		else if (entry[0] == "Type")
135 		{
136 			if (entry[1] == "Integer") { I.Type_str = "Integer"; I.Type = Integer; }
137 			else if ((entry[1] == "Float") || (entry[1] == "Numeric")) {I.Type_str = "Float"; I.Type = Float;}
138 			else if (entry[1] == "Character") {I.Type_str = "Character"; I.Type = Character;}
139 			else if (entry[1] == "String") {I.Type_str = "String"; I.Type = String;}
140 			else if (entry[1] == "Flag")
141 			{
142 				I.Type = Flag;
143 				I.Type_str = "Flag";
144 				if (I.N_entries != 0) LOG.error("Flag Type must have 0 entries: " + in);
145 			}
146 			else LOG.error("Unknown Type in INFO meta-information: " + in);
147 		}
148 		else if (entry[0] == "Description") I.Description = entry[1];
149 		else if (entry[0] == "Source") I.Source = entry[1];
150 		else if (entry[0] == "Version") I.Version = entry[1];
151 		else if (entry[0] == "IDX")
152 		{
153 			has_idx = true;
154 			I.idx = str2int(entry[1]);
155 		}
156 		else
157 		{
158 			if (I.Other != "")
159 				I.Other += ",";
160 			I.Other += tokens[ui];
161 		}
162 	}
163 
164 	if (I.ID == "") LOG.error("ID required in INFO field description: " + in);
165 	if (I.N_entries_str == "") LOG.error("Number required in INFO field description: " + in);
166 	if (I.Type_str == "") LOG.error("Type required in INFO field description: " + in);
167 	if (I.Description == "") LOG.error("Description required in INFO field description: " + in);
168 	if ((I.idx == -1) && (has_idx)) LOG.error("Missing index in INFO field description: " + in);
169 	parsed_lines.push_back(I);
170 
171 	if (I.idx != -1)
172 	{
173 		INFO_map[I.idx] = I;
174 		INFO_reverse_map[I.ID] = I.idx;
175 		return 0;
176 	}
177 	else
178 	{
179 		if ( FORMAT_reverse_map.find( I.ID ) != FORMAT_reverse_map.end() )
180 		{
181 			INFO_map[ FORMAT_reverse_map[ I.ID ] ] = I;
182 			INFO_reverse_map[I.ID] = FORMAT_reverse_map[I.ID];
183 			return 0;
184 		}
185 		else if ( FILTER_reverse_map.find( I.ID ) != FILTER_reverse_map.end() )
186 		{
187 			INFO_map[ FILTER_reverse_map[ I.ID ] ] = I;
188 			INFO_reverse_map[I.ID] = FILTER_reverse_map[ I.ID ];
189 			return 0;
190 		}
191 		else if ( INFO_reverse_map.find( I.ID ) != INFO_reverse_map.end() )
192 			return 0;
193 		else
194 		{
195 			INFO_map[index] = I;
196 			INFO_reverse_map[I.ID] = index;
197 			return 1;
198 		}
199 	}
200 }
201 
add_FORMAT_descriptor(const string & in,int index)202 int header::add_FORMAT_descriptor(const string &in, int index)
203 {
204 	size_t found_end=in.find_last_of(">");
205 	string details = in.substr(0, found_end);
206 
207 	vector<string> tokens;
208 	tokenize(details, ',', tokens);
209 	Field_description I;
210 	I.Field = "FORMAT";
211 	if (tokens.size() < 4)
212 		LOG.error("Expected at least 4 parts in FORMAT definition: " + in);
213 
214 	vector<string> entry;
215 	for (unsigned int ui=0; ui<tokens.size(); ui++)
216 	{
217 		tokenize(tokens[ui], '=', entry);
218 		if (entry.size() < 2)
219 		{
220 			LOG.warning("Warning: Expected at least 2 parts in FORMAT entry: " + in);
221 			continue;
222 		}
223 		if (entry[0] == "ID") I.ID = entry[1];
224 		else if (entry[0] == "Number")
225 		{
226 			if ((entry[1] == "A") || (entry[1] == "G"))
227 				I.N_entries = -1;
228 			else
229 				I.N_entries = str2int(entry[1]);
230 			I.N_entries_str = entry[1];
231 		}
232 		else if (entry[0] == "Type")
233 		{
234 			if (entry[1] == "Integer") {I.Type_str = "Integer"; I.Type = Integer;}
235 			else if ((entry[1] == "Float") || (entry[1] == "Numeric")) {I.Type_str = "Float"; I.Type = Float;}
236 			else if (entry[1] == "Character") {I.Type_str = "Character"; I.Type = Character;}
237 			else if (entry[1] == "String") {I.Type_str = "String"; I.Type = String;}
238 			else if (entry[1] == "Flag")
239 			{
240 				I.Type = Flag;
241 				I.Type_str = "Flag";
242 				if (I.N_entries != 0) LOG.error("Flag Type must have 0 entries: " + in);
243 			}
244 			else LOG.error("Unknown Type in FORMAT meta-information: " + in);
245 		}
246 		else if (entry[0] == "Description")	I.Description = entry[1];
247 		else if (entry[0] == "IDX")
248 		{
249 			has_idx = true;
250 			I.idx = str2int(entry[1]);
251 		}
252 		else
253 		{
254 			if (I.Other != "")
255 				I.Other += ",";
256 			I.Other += tokens[ui];
257 		}
258 	}
259 
260 	if (I.ID == "") LOG.error("ID required in FORMAT field description: " + in);
261 	if (I.N_entries_str == "") LOG.error("Number required in FORMAT field description: " + in);
262 	if (I.Type_str == "") LOG.error("Type required in FORMAT field description: " + in);
263 	if (I.Description == "") LOG.error("Description required in FORMAT field description: " + in);
264 	if ((I.idx == -1) && (has_idx)) LOG.error("Missing index in FORMAT field description: " + in);
265 	parsed_lines.push_back(I);
266 
267 	if (I.idx != -1)
268 	{
269 		FORMAT_map[I.idx] = I;
270 		FORMAT_reverse_map[I.ID] = I.idx;
271 		return 0;
272 	}
273 	else
274 	{
275 		if ( FILTER_reverse_map.find( I.ID ) != FILTER_reverse_map.end() )
276 		{
277 			FORMAT_map[ FILTER_reverse_map[ I.ID ] ] = I;
278 			FORMAT_reverse_map[I.ID] = FILTER_reverse_map[ I.ID ];
279 			return 0;
280 		}
281 		else if ( INFO_reverse_map.find( I.ID ) != INFO_reverse_map.end() )
282 		{
283 			FORMAT_map[ INFO_reverse_map[ I.ID ] ] = I;
284 			FORMAT_reverse_map[I.ID] = INFO_reverse_map[ I.ID ];
285 			return 0;
286 		}
287 		else if ( FORMAT_reverse_map.find( I.ID ) != FORMAT_reverse_map.end() )
288 			return 0;
289 		else
290 		{
291 			FORMAT_map[ index ] = I;
292 			FORMAT_reverse_map[I.ID] = index;
293 			return 1;
294 		}
295 	}
296 }
297 
add_CONTIG_descriptor(const string & in,int index)298 void header::add_CONTIG_descriptor(const string &in, int index)
299 {
300 	size_t found_end=in.find_last_of(">");
301 	string details = in.substr(0, found_end);
302 
303 	vector<string> tokens;
304 	tokenize(details, ',', tokens);
305 	Field_description I;
306 	I.Field = "contig";
307 	bool id_found = false;
308 	vector<string> entry;
309 
310 	for (unsigned int ui=0; ui<tokens.size(); ui++)
311 	{
312 		tokenize(tokens[ui], '=', entry);
313 		if (entry.size() < 2)
314 		{
315 			LOG.warning("Warning: Expected at least 2 parts in CONTIG entry: " + in);
316 			continue;
317 		}
318 		if (entry[0] == "ID")
319 		{
320 			I.ID = entry[1];
321 			id_found = true;
322 		}
323 		else if (entry[0] == "length") I.Length = entry[1];
324 		else if (entry[0] == "assembly") I.Assembly = entry[1];
325 		else
326 		{
327 			if (I.Other != "")
328 				I.Other += ",";
329 			I.Other += tokens[ui];
330 		}
331 	}
332 	if (id_found == false)
333 		LOG.warning("CONTIG declaration found without ID: "+ in + "\n");
334 
335 	parsed_lines.push_back(I);
336 	CONTIG_map[index] = I;
337 	CONTIG_reverse_map[I.ID] = index;
338 }
339 
add_FILTER_descriptor(const string & in,int index)340 int header::add_FILTER_descriptor(const string &in, int index)
341 {
342 	size_t found_end=in.find_last_of(">");
343 	string details = in.substr(0, found_end);
344 	vector<string> tokens;
345 	tokenize(details, ',', tokens);
346 	if (tokens.size() < 2)
347 		LOG.error("Expected at least 2 parts in FILTER definition: " + in);
348 
349 	string Description;
350 	Field_description I;
351 	I.Field = "FILTER";
352 	vector<string> entry;
353 
354 	for (unsigned int ui=0; ui<tokens.size(); ui++)
355 	{
356 		tokenize(tokens[ui], '=', entry);
357 		if (entry.size() < 2)
358 		{
359 			LOG.warning("Warning: Expected at least 2 parts in FORMAT entry: " + in);
360 			continue;
361 		}
362 		if (entry[0] == "ID") I.ID = entry[1];
363 		else if (entry[0] == "Description")
364 		{
365 			I.Description = entry[1];
366 		}
367 		else if (entry[0] == "IDX")
368 		{
369 			has_idx = true;
370 			I.idx = str2int(entry[1]);
371 		}
372 		else
373 		{
374 			if (I.Other != "")
375 				I.Other += ",";
376 			I.Other += tokens[ui];
377 		}
378 	}
379 
380 	if (I.ID == "") LOG.error("ID required in FILTER field description: " + in);
381 	if (I.Description == "") LOG.error("Description required in FILTER field description: " + in);
382 	if ((I.idx == -1) && (has_idx)) LOG.error("Missing index in FILTER field description: " + in);
383 	if (I.ID != "PASS") parsed_lines.push_back(I);
384 
385 	if (I.idx != -1)
386 	{
387 		FILTER_map[I.idx] = I;
388 		FILTER_reverse_map[I.ID] = I.idx;
389 		return 0;
390 	}
391 	else
392 	{
393 		if ( INFO_reverse_map.find( I.ID ) != INFO_reverse_map.end() )
394 		{
395 			FILTER_map[ INFO_reverse_map[ I.ID ] ] = I;
396 			FILTER_reverse_map[I.ID] = INFO_reverse_map[ I.ID ];
397 			return 0;
398 		}
399 		else if ( FORMAT_reverse_map.find( I.ID ) != FORMAT_reverse_map.end() )
400 		{
401 			FILTER_map[ FORMAT_reverse_map[ I.ID ] ] = I;
402 			FILTER_reverse_map[I.ID] = FORMAT_reverse_map[ I.ID ];
403 			return 0;
404 		}
405 		else if ( FILTER_reverse_map.find( I.ID ) != FILTER_reverse_map.end() )
406 			return 0;
407 		else
408 		{
409 			FILTER_map[index] = I;
410 			FILTER_reverse_map[I.ID] = index;
411 			return 1;
412 		}
413 	}
414 }
415 
reprint()416 void header::reprint()
417 {
418 	lines.resize(0);
419 	Field_description I;
420 	for (unsigned int ui=0; ui<parsed_lines.size(); ui++)
421 	{
422 		I = parsed_lines[ui];
423 		stringstream new_line;
424 
425 		if (I.Field[0] == '#')
426 			new_line << I.Field << "=" << I.Other;
427 		else
428 		{
429 			new_line << "##" << I.Field << "=<";
430 			if (I.ID != "") new_line << "ID=" << I.ID;
431 			if (I.N_entries_str != "") new_line << ",Number=" << I.N_entries_str;
432 			if (I.Type_str != "") new_line << ",Type=" << I.Type_str;
433 			if (I.Description != "") new_line << ",Description=" << I.Description;
434 			if (I.Source != "") new_line << ",Source=" << I.Source;
435 			if (I.Version != "") new_line << ",Version=" << I.Version;
436 			if (I.Length != "") new_line << ",Length=" << I.Length;
437 			if (I.Assembly != "") new_line << ",Assembly=" << I.Assembly;
438 			if (I.Other != "") new_line << "," << I.Other;
439 			new_line << ">";
440 		}
441 		lines.push_back(new_line.str());
442 	}
443 }
444 
reparse()445 void header::reparse()
446 {
447 	unsigned int index = 0;
448 	has_idx = false; contig_index = 0;
449 	vector<string> old_lines(lines.size(),"");
450 	copy(lines.begin(), lines.end(), old_lines.begin());
451 	lines.resize(0);
452 
453 	INFO_map.clear(); INFO_reverse_map.clear();
454 	FILTER_map.clear(); FILTER_reverse_map.clear();
455 	FORMAT_map.clear(); FORMAT_reverse_map.clear();
456 	CONTIG_map.clear(); CONTIG_reverse_map.clear();
457 
458 	index += add_FILTER_descriptor("ID=PASS,Description=PASS", index);
459 	for (unsigned int ui=0; ui<old_lines.size(); ui++)
460 		parse_meta(old_lines[ui],index);
461 }
462 
tokenize(const string & in,char token,vector<string> & out)463 void header::tokenize(const string &in, char token, vector<string> &out)
464 {
465 	out.resize(0);
466 	istringstream ss(in);
467 	string tmp;
468 	while( getline(ss, tmp, token) )
469 	{
470 		out.push_back(tmp);
471 	}
472 }
473 
split(const string & text,char sep,vector<string> & tokens)474 void header::split(const string &text, char sep, vector<string> &tokens)
475 {
476 	int start = 0, end = 0, idx = 0, max = tokens.size();
477 	while ((end = text.find(sep, start)) != string::npos)
478 	{
479 		if (idx < max)
480 			tokens[idx] = text.substr(start, end - start);
481 		else
482 			tokens.push_back(text.substr(start, end - start));
483 		start = end + 1;
484 		idx++;
485 	}
486 	if (idx < max)
487 		tokens[idx] = text.substr(start);
488 	else
489 		tokens.push_back(text.substr(start));
490 }
491 
int2str(const int in,const int missing_value)492 string header::int2str(const int in, const int missing_value)
493 {
494 	if (in == missing_value)
495 		return ".";
496 	else
497 	{
498 		static ostringstream out;
499 		out.str(""); out.clear();
500 		out << in;
501 		return out.str();
502 	}
503 }
504 
str2int(const string & in,const int missing_value)505 int header::str2int(const string &in, const int missing_value)
506 {
507 	if ((in.size() == 0) || (in == "."))
508 		return missing_value;
509 	else
510 		return atoi(in.c_str());
511 }
512 
str2double(const string & in,const double missing_value)513 double header::str2double(const string &in, const double missing_value)
514 {
515 	if ((in.size() == 0) || (in == "."))
516 		return missing_value;
517 	else
518 		return atof(in.c_str());
519 }
520 
double2str(const double in,const double missing_value)521 string header::double2str(const double in, const double missing_value)
522 {
523 	if (in == missing_value)
524 		return ".";
525 	else
526 	{
527 		static ostringstream out;
528 		out.str(""); out.clear();
529 		out << in;
530 		return out.str();
531 	}
532 }
533