1 /*
2  * variant_file_diff.cpp
3  *
4  *  Created on: Oct 30, 2009
5  *      Author: Adam Auton
6  *      ($Revision: 230 $)
7  */
8 
9 #include "variant_file.h"
10 
return_indv_union(variant_file & file2,map<string,pair<int,int>> & combined_individuals,const string & indv_ID_map_file)11 void variant_file::return_indv_union(variant_file &file2, map<string, pair< int, int> > &combined_individuals, const string &indv_ID_map_file)
12 {
13 	map<string, string> indv_map;
14 	bool use_map = false;
15 	if (indv_ID_map_file != "")
16 	{
17 		LOG.printLOG("Reading individual mapping file. ");
18 		ifstream map(indv_ID_map_file.c_str());
19 		if (!map.is_open())
20 			LOG.error("Could not open map file: " + indv_ID_map_file);
21 		while (!map.eof())
22 		{
23 			string indv1, indv2;
24 			map >> indv1 >> indv2;
25 			map.ignore(numeric_limits<streamsize>::max(), '\n');
26 			if ((indv1 != "") && (indv1.substr(0,1) != "#"))
27 			{
28 				indv_map[indv1] = indv2;
29 			}
30 		}
31 		map.close();
32 		use_map = true;
33 		LOG.printLOG("Read " + LOG.int2str(indv_map.size()) + " entries.\n");
34 	}
35 
36 	for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
37 		if (include_indv[ui] == true)
38 		{
39 			combined_individuals[meta_data.indv[ui]] = make_pair<int,int>((int)ui, -1);
40 		}
41 
42 	for (unsigned int ui=0; ui<file2.meta_data.N_indv; ui++)
43 		if (file2.include_indv[ui] == true)
44 		{
45 			string indv_id = file2.meta_data.indv[ui];
46 			if (use_map == true)
47 			{
48 				if (indv_map.find(indv_id) != indv_map.end())
49 					indv_id = indv_map[indv_id];
50 			}
51 			if (combined_individuals.find(indv_id) != combined_individuals.end())
52 			{
53 				combined_individuals[indv_id].second = ui;
54 			}
55 			else
56 				combined_individuals[indv_id] = make_pair<int,int>(-1, (int)ui);
57 		}
58 }
59 
output_sites_in_files(const parameters & params,variant_file & diff_variant_file)60 void variant_file::output_sites_in_files(const parameters &params, variant_file &diff_variant_file)
61 {
62 	string CHROM;
63 	vector<char> variant_line;
64 	entry *e1 = get_entry_object();
65 	entry *e2 = diff_variant_file.get_entry_object();
66 
67 	bool new_e1 = true;
68 	bool new_e2 = true;
69 	string CHROM1 = "";
70 	string CHROM2 = "";
71 	string curr_CHROM = "";
72 	vector<string> all_CHROM;
73 	int POS1 = -1;
74 	int POS2 = -1;
75 	string REF1 = "";
76 	string REF2 = "";
77 	string ALT1 = "";
78 	string ALT2 = "";
79 	int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0, N_overlap_SNPs = 0;
80 	string output_file = params.output_prefix + ".diff.sites_in_files";
81 
82 	streambuf * buf;
83 	ofstream temp_out;
84 	if (!params.stream_out)
85 	{
86 		temp_out.open(output_file.c_str(), ios::out);
87 		if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
88 		buf = temp_out.rdbuf();
89 	}
90 	else
91 		buf = cout.rdbuf();
92 
93 	ostream sites_in_files(buf);
94 
95 	sites_in_files << "CHROM\tPOS1\tPOS2\tIN_FILE\tREF1\tREF2\tALT1\tALT2" << endl;
96 
97 	LOG.printLOG("Comparing sites in VCF files...\n");
98 	while(true)
99 	{
100 		if(new_e1)
101 		{
102 			while(!eof())
103 			{
104 				get_entry(variant_line);
105 				e1->reset(variant_line);
106 				N_entries += e1->apply_filters(params);
107 
108 				if(!e1->passed_filters)
109 					continue;
110 				N_kept_entries++;
111 				e1->parse_basic_entry(true);
112 				CHROM1 = e1->get_CHROM();
113 				POS1 = e1->get_POS();
114 				REF1 = e1->get_REF();
115 				ALT1 = e1->get_ALT();
116 				break;
117 			}
118 			new_e1 = false;
119 		}
120 		if(new_e2)
121 		{
122 			while(!diff_variant_file.eof())
123 			{
124 				diff_variant_file.get_entry(variant_line);
125 				e2->reset(variant_line);
126 				diff_variant_file.N_entries += e2->apply_filters(params);
127 
128 				if(!e2->passed_filters)
129 					continue;
130 				diff_variant_file.N_kept_entries++;
131 				e2->parse_basic_entry(true);
132 				CHROM2 = e2->get_CHROM();
133 				POS2 = e2->get_POS();
134 				REF2 = e2->get_REF();
135 				ALT2 = e2->get_ALT();
136 				break;
137 			}
138 			new_e2 = false;
139 		}
140 
141 		if(eof() && diff_variant_file.eof())
142 			break;
143 		else if(diff_variant_file.eof())
144 		{
145 			if(CHROM1 == curr_CHROM)
146 			{
147 				sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl;
148 				N_SNPs_file1_only++;
149 				new_e1 = true;
150 
151 			}
152 			else
153 			{
154 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
155 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
156 				else
157 				{
158 					curr_CHROM = CHROM1;
159 					all_CHROM.push_back(CHROM1);
160 					sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl;
161 					N_SNPs_file1_only++;
162 					new_e1 = true;
163 				}
164 			}
165 		}
166 		else if(eof())
167 		{
168 			if(CHROM2 == curr_CHROM)
169 			{
170 				sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl;
171 				N_SNPs_file2_only++;
172 				new_e2 = true;
173 
174 			}
175 			else
176 			{
177 				if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
178 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
179 				else
180 				{
181 					curr_CHROM = CHROM2;
182 					all_CHROM.push_back(CHROM2);
183 					sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl;
184 					N_SNPs_file2_only++;
185 					new_e2 = true;
186 				}
187 			}
188 		}
189 		else if(CHROM1 == CHROM2)
190 		{
191 			if (CHROM1 != curr_CHROM)
192 			{
193 				curr_CHROM = CHROM1;
194 				all_CHROM.push_back(curr_CHROM);
195 			}
196 
197 			if(POS1 == POS2)
198 			{
199 				if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") )
200 					REF1 = REF2;
201 				if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") )
202 					REF2 = REF1;
203 
204 				new_e1 = true;
205 				new_e2 = true;
206 				if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != ""))
207 				{
208 					sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 << "\t" << ALT1 << "\t" << ALT2 << endl;
209 					N_overlap_SNPs++;
210 				}
211 				else
212 				{
213 					sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tB\t" << REF1 << "\t" << REF2 << "\t" << ALT1 << "\t" << ALT2 << endl;
214 					N_common_SNPs++;
215 				}
216 			}
217 			else if(POS1 < POS2)
218 			{
219 				if (POS2 < (POS1+REF1.size()))
220 				{
221 					sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 <<"\t" << ALT1 << "\t" << ALT2 << endl;
222 					N_overlap_SNPs++;
223 					new_e1 = true;
224 					new_e2 = true;
225 				}
226 				else
227 				{
228 					sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl;
229 					N_SNPs_file1_only++;
230 					new_e1 = true;
231 				}
232 			}
233 			else
234 			{
235 				if (POS1 < (POS2+REF2.size()))
236 				{
237 					sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 <<"\t" << ALT1 << "\t" << ALT2 << endl;
238 					N_overlap_SNPs++;
239 					new_e1 = true;
240 					new_e2 = true;
241 				}
242 				else
243 				{
244 					sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl;
245 					N_SNPs_file2_only++;
246 					new_e2 = true;
247 				}
248 			}
249 		}
250 		else
251 		{
252 			if (CHROM1 == curr_CHROM)
253 			{
254 				sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl;
255 				N_SNPs_file1_only++;
256 				new_e1 = true;
257 			}
258 			else if (CHROM2 == curr_CHROM)
259 			{
260 				sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl;
261 				N_SNPs_file2_only++;
262 				new_e2 = true;
263 			}
264 			else
265 			{
266 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
267 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
268 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
269 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
270 
271 				LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");
272 			}
273 		}
274 	}
275 
276 	LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
277 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
278 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
279 	LOG.printLOG("Found " + output_log::int2str(N_overlap_SNPs) + " non-matching overlapping sites.\n");
280 	delete e1;
281 	delete e2;
282 }
283 
output_indv_in_files(const parameters & params,variant_file & diff_variant_file)284 void variant_file::output_indv_in_files(const parameters &params, variant_file &diff_variant_file)
285 {
286 	LOG.printLOG("Comparing individuals in VCF files...\n");
287 
288 	string output_file = params.output_prefix + ".diff.indv_in_files";
289 
290 	streambuf * buf;
291 	ofstream temp_out;
292 	if (!params.stream_out)
293 	{
294 		temp_out.open(output_file.c_str(), ios::out);
295 		if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
296 		buf = temp_out.rdbuf();
297 	}
298 	else
299 		buf = cout.rdbuf();
300 
301 	ostream out(buf);
302 	out << "INDV\tFILES" << endl;
303 
304 	// Build a list of individuals contained in each file
305 	map<string, pair< int, int> > combined_individuals;
306 	map<string, pair< int, int> >::iterator combined_individuals_it;
307 	return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
308 
309 	unsigned int N_combined_indv = combined_individuals.size();
310 	unsigned int N[3]={0,0,0};
311 	for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
312 	{
313 		if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1))
314 		{
315 			N[0]++;
316 			out << combined_individuals_it->first << "\tB" << endl;
317 		}
318 		else if (combined_individuals_it->second.first != -1)
319 		{
320 			N[1]++;
321 			out << combined_individuals_it->first << "\t1" << endl;
322 		}
323 		else if (combined_individuals_it->second.second != -1)
324 		{
325 			N[2]++;
326 			out << combined_individuals_it->first << "\t2" << endl;
327 		}
328 		else
329 			LOG.error("Unhandled case");
330 	}
331 
332 	LOG.printLOG("N_combined_individuals:\t" + output_log::int2str(N_combined_indv) + "\n");
333 	LOG.printLOG("N_individuals_common_to_both_files:\t" + output_log::int2str(N[0]) + "\n");
334 	LOG.printLOG("N_individuals_unique_to_file1:\t" + output_log::int2str(N[1]) + "\n");
335 	LOG.printLOG("N_individuals_unique_to_file2:\t" + output_log::int2str(N[2]) + "\n");
336 }
337 
output_discordance_by_indv(const parameters & params,variant_file & diff_variant_file)338 void variant_file::output_discordance_by_indv(const parameters &params, variant_file &diff_variant_file)
339 {
340 	map<string, pair< int, int> > combined_individuals;
341 	map<string, pair< int, int> >::iterator combined_individuals_it;
342 	return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
343 
344 	LOG.printLOG("Outputting Discordance By Individual...\n");
345 	map<string, pair<int, int> > indv_sums;
346 
347 	vector<char> variant_line;
348 	int indv1, indv2;
349 	entry * e1 = get_entry_object();
350 	entry * e2 = diff_variant_file.get_entry_object();
351 	string CHROM;
352 	bool new_e1 = true;
353 	bool new_e2 = true;
354 	string CHROM1 = "";
355 	string CHROM2 = "";
356 	string curr_CHROM = "";
357 	vector<string> all_CHROM;
358 	int POS1 = -1;
359 	int POS2 = -1;
360 	string REF1 = "";
361 	string REF2 = "";
362 	string ALT1 = "";
363 	string ALT2 = "";
364 	bool alleles_match = false;
365 	pair<string, string> genotype1, genotype2;
366 	pair<int,int> geno_ids1, geno_ids2;
367 	pair<string, string> missing_genotype(".",".");
368 	pair<int, int> missing_id(-1,-1);
369 	int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
370 
371 	while(true)
372 	{
373 		if(new_e1)
374 		{
375 			while(!eof())
376 			{
377 				get_entry(variant_line);
378 				e1->reset(variant_line);
379 				N_entries += e1->apply_filters(params);
380 
381 				if(!e1->passed_filters)
382 					continue;
383 				N_kept_entries++;
384 				e1->parse_basic_entry(true);
385 				CHROM1 = e1->get_CHROM();
386 				POS1 = e1->get_POS();
387 				REF1 = e1->get_REF();
388 				ALT1 = e1->get_ALT();
389 				break;
390 			}
391 			new_e1 = false;
392 		}
393 		if(new_e2)
394 		{
395 			while(!diff_variant_file.eof())
396 			{
397 				diff_variant_file.get_entry(variant_line);
398 				e2->reset(variant_line);
399 				diff_variant_file.N_entries += e2->apply_filters(params);
400 
401 				if(!e2->passed_filters)
402 					continue;
403 				diff_variant_file.N_kept_entries++;
404 				e2->parse_basic_entry(true);
405 				CHROM2 = e2->get_CHROM();
406 				POS2 = e2->get_POS();
407 				REF2 = e2->get_REF();
408 				ALT2 = e2->get_ALT();
409 				break;
410 			}
411 			new_e2 = false;
412 		}
413 
414 		if(eof() && diff_variant_file.eof())
415 			break;
416 		else if(diff_variant_file.eof())
417 		{
418 			if(CHROM1 == curr_CHROM)
419 			{
420 				N_SNPs_file1_only++;
421 				new_e1 = true;
422 
423 			}
424 			else
425 			{
426 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
427 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
428 				else
429 				{
430 					curr_CHROM = CHROM1;
431 					all_CHROM.push_back(CHROM1);
432 					N_SNPs_file1_only++;
433 					new_e1 = true;
434 				}
435 			}
436 		}
437 		else if(eof())
438 		{
439 			if(CHROM2 == curr_CHROM)
440 			{
441 				N_SNPs_file2_only++;
442 				new_e2 = true;
443 
444 			}
445 			else
446 			{
447 				if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
448 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
449 				else
450 				{
451 					curr_CHROM = CHROM2;
452 					all_CHROM.push_back(CHROM2);
453 					N_SNPs_file2_only++;
454 					new_e2 = true;
455 				}
456 			}
457 		}
458 		else if(CHROM1 == CHROM2)
459 		{
460 			if (CHROM1 != curr_CHROM)
461 			{
462 				curr_CHROM = CHROM1;
463 				all_CHROM.push_back(curr_CHROM);
464 			}
465 
466 			if(POS1 == POS2)
467 			{
468 				new_e1 = true;
469 				new_e2 = true;
470 				N_common_SNPs++;
471 			}
472 			else if(POS1 < POS2)
473 			{
474 				new_e1 = true;
475 				N_SNPs_file1_only++;
476 			}
477 			else
478 			{
479 				new_e2 = true;
480 				N_SNPs_file2_only++;
481 			}
482 		}
483 		else
484 		{
485 			if (CHROM1 == curr_CHROM)
486 			{
487 				new_e1 = true;
488 				N_SNPs_file1_only++;
489 			}
490 			else if (CHROM2 == curr_CHROM)
491 			{
492 				new_e2 = true;
493 				N_SNPs_file2_only++;
494 			}
495 			else
496 			{
497 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
498 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
499 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
500 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
501 
502 				LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");
503 			}
504 		}
505 
506 		if(new_e1 && new_e2)
507 		{
508 			if (REF1 == "N")
509 				REF1 = REF2;
510 			if (REF2 == "N")
511 				REF2 = REF1;
512 
513 			if ((REF1.size() != REF2.size()) || ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N")))
514 			{
515 				LOG.one_off_warning("Non-matching REF. Skipping all such sites.");
516 				continue;
517 			}
518 			alleles_match = (ALT1 == ALT2) && (REF1 == REF2);
519 			e1->parse_full_entry(true);
520 			e1->parse_genotype_entries(true);
521 			e2->parse_full_entry(true);
522 			e2->parse_genotype_entries(true);
523 
524 			for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
525 			{
526 				indv1 = combined_individuals_it->second.first;
527 				indv2 = combined_individuals_it->second.second;
528 
529 				if ((indv1 == -1) || (indv2 == -1))
530 					continue;	// Individual not found in one of the files
531 
532 				if (alleles_match)
533 				{	// Alleles match, so can compare ids instead of strings
534 					e1->get_indv_GENOTYPE_ids(indv1, geno_ids1);
535 					e2->get_indv_GENOTYPE_ids(indv2, geno_ids2);
536 
537 					if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id))
538 					{
539 						indv_sums[combined_individuals_it->first].first++;
540 						if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) ||
541 								((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) )
542 						{	// Match
543 							// Don't do anything
544 						}
545 						else
546 						{	// Mismatch
547 							indv_sums[combined_individuals_it->first].second++;
548 						}
549 					}
550 					else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id))
551 					{	// Both missing
552 						// Don't do anything.
553 					}
554 					else if (geno_ids1 != missing_id)
555 					{	// Genotype 1 is not missing, genotype 2 is.
556 						// Don't do anything.
557 					}
558 					else if (geno_ids2 != missing_id)
559 					{	// Genotype 2 is not missing, genotype 1 is.
560 						// Don't do anything.
561 					}
562 					else
563 						LOG.error("Unknown condition");
564 				}
565 				else
566 				{	// Alleles don't match, so need to be more careful and compare strings
567 					e1->get_indv_GENOTYPE_strings(indv1, genotype1);
568 					e2->get_indv_GENOTYPE_strings(indv2, genotype2);
569 
570 					if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
571 					{	// No missing data
572 						indv_sums[combined_individuals_it->first].first++;
573 						if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
574 								((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
575 						{	// Match
576 							// Don't do anything
577 						}
578 						else
579 						{	// Mismatch
580 							indv_sums[combined_individuals_it->first].second++;
581 						}
582 					}
583 					else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
584 					{	// Both missing
585 						// Don't do anything
586 					}
587 					else if (genotype1 != missing_genotype)
588 					{	// Genotype 1 is not missing, genotype 2 is.
589 						// Don't do anything
590 					}
591 					else if (genotype2 != missing_genotype)
592 					{	// Genotype 2 is not missing, genotype 1 is.
593 						// Don't do anything
594 					}
595 					else
596 						LOG.error("Unknown condition");
597 				}
598 			}
599 		}
600 	}
601 
602 	string output_file = params.output_prefix + ".diff.indv";
603 	streambuf * buf;
604 	ofstream temp_out;
605 	if (!params.stream_out)
606 	{
607 		temp_out.open(output_file.c_str(), ios::out);
608 		if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
609 		buf = temp_out.rdbuf();
610 	}
611 	else
612 		buf = cout.rdbuf();
613 
614 	ostream out(buf);
615 	out << "INDV\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl;
616 
617 	int N, N_discord;
618 	double discordance;
619 	for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
620 	{
621 		out << combined_individuals_it->first;
622 		N = indv_sums[combined_individuals_it->first].first;
623 		N_discord = indv_sums[combined_individuals_it->first].second;
624 		discordance = N_discord / double(N);
625 		out << "\t" << N << "\t" << N_discord << "\t" << discordance << endl;
626 	}
627 	LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
628 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
629 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
630 
631 	delete e1;
632 	delete e2;
633 }
634 
output_discordance_by_site(const parameters & params,variant_file & diff_variant_file)635 void variant_file::output_discordance_by_site(const parameters &params, variant_file &diff_variant_file)
636 {
637 	map<string, pair< int, int> > combined_individuals;
638 	map<string, pair< int, int> >::iterator combined_individuals_it;
639 	return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
640 
641 	LOG.printLOG("Outputting Discordance By Site...\n");
642 
643 	string CHROM;
644 	vector<char> variant_line;
645 	int indv1, indv2;
646 	entry *e1 = get_entry_object();
647 	entry *e2 = diff_variant_file.get_entry_object();
648 
649 	bool new_e1 = true;
650 	bool new_e2 = true;
651 	string CHROM1 = "";
652 	string CHROM2 = "";
653 	string curr_CHROM = "";
654 	vector<string> all_CHROM;
655 	int POS1 = -1;
656 	int POS2 = -1;
657 	string REF1 = "";
658 	string REF2 = "";
659 	string ALT1 = "";
660 	string ALT2 = "";
661 	bool alleles_match = false;
662 	int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
663 
664 	string output_file = params.output_prefix + ".diff.sites";
665 	streambuf * buf;
666 	ofstream temp_out;
667 	if (!params.stream_out)
668 	{
669 		temp_out.open(output_file.c_str(), ios::out);
670 		if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
671 		buf = temp_out.rdbuf();
672 	}
673 	else
674 		buf = cout.rdbuf();
675 
676 	ostream diffsites(buf);
677 	diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALLELES\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl;
678 
679 	while(true)
680 	{
681 		if(new_e1)
682 		{
683 			while(!eof())
684 			{
685 				get_entry(variant_line);
686 				e1->reset(variant_line);
687 				N_entries += e1->apply_filters(params);
688 
689 				if(!e1->passed_filters)
690 					continue;
691 				N_kept_entries++;
692 				e1->parse_basic_entry(true);
693 				CHROM1 = e1->get_CHROM();
694 				POS1 = e1->get_POS();
695 				REF1 = e1->get_REF();
696 				ALT1 = e1->get_ALT();
697 				break;
698 			}
699 			new_e1 = false;
700 		}
701 		if(new_e2)
702 		{
703 			while(!diff_variant_file.eof())
704 			{
705 				diff_variant_file.get_entry(variant_line);
706 				e2->reset(variant_line);
707 				diff_variant_file.N_entries += e2->apply_filters(params);
708 
709 				if(!e2->passed_filters)
710 					continue;
711 				diff_variant_file.N_kept_entries++;
712 				e2->parse_basic_entry(true);
713 				CHROM2 = e2->get_CHROM();
714 				POS2 = e2->get_POS();
715 				REF2 = e2->get_REF();
716 				ALT2 = e2->get_ALT();
717 				break;
718 			}
719 			new_e2 = false;
720 		}
721 
722 		if(eof() && diff_variant_file.eof())
723 			break;
724 		else if(diff_variant_file.eof())
725 		{
726 			if(CHROM1 == curr_CHROM)
727 			{
728 				diffsites << CHROM1 << "\t" << POS1 << "\t1\t";
729 				N_SNPs_file1_only++;
730 				new_e1 = true;
731 
732 			}
733 			else
734 			{
735 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
736 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
737 				else
738 				{
739 					curr_CHROM = CHROM1;
740 					all_CHROM.push_back(CHROM1);
741 					diffsites << CHROM1 << "\t" << POS1 << "\t1\t";
742 					N_SNPs_file1_only++;
743 					new_e1 = true;
744 				}
745 			}
746 		}
747 		else if(eof())
748 		{
749 			if(CHROM2 == curr_CHROM)
750 			{
751 				diffsites << CHROM2 << "\t" << POS2 << "\t2\t";
752 				N_SNPs_file2_only++;
753 				new_e2 = true;
754 
755 			}
756 			else
757 			{
758 				if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
759 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
760 				else
761 				{
762 					curr_CHROM = CHROM2;
763 					all_CHROM.push_back(CHROM2);
764 					diffsites << CHROM2 << "\t" << POS2 << "\t2\t";
765 					N_SNPs_file2_only++;
766 					new_e2 = true;
767 				}
768 			}
769 		}
770 		else if(CHROM1 == CHROM2)
771 		{
772 			if (CHROM1 != curr_CHROM)
773 			{
774 				curr_CHROM = CHROM1;
775 				all_CHROM.push_back(curr_CHROM);
776 			}
777 
778 			if(POS1 == POS2)
779 			{
780 				if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") )
781 					REF1 = REF2;
782 				if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") )
783 					REF2 = REF1;
784 
785 				new_e1 = true;
786 				new_e2 = true;
787 				if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != ""))
788 				{
789 					LOG.one_off_warning("Non-matching REF. Skipping all such sites.");
790 					continue;
791 				}
792 				diffsites << CHROM1 << "\t" << POS1 << "\tB\t";
793 				N_common_SNPs++;
794 			}
795 			else if(POS1 < POS2)
796 			{
797 				diffsites << CHROM1 << "\t" << POS1 << "\t1\t";
798 				N_SNPs_file1_only++;
799 				new_e1 = true;
800 			}
801 			else
802 			{
803 				diffsites << CHROM2 << "\t" << POS2 << "\t2\t";
804 				N_SNPs_file2_only++;
805 				new_e2 = true;
806 			}
807 		}
808 		else
809 		{
810 			if (CHROM1 == curr_CHROM)
811 			{
812 				diffsites << CHROM1 << "\t" << POS1 << "\t1\t";
813 				N_SNPs_file1_only++;
814 				new_e1 = true;
815 			}
816 			else if (CHROM2 == curr_CHROM)
817 			{
818 				diffsites << CHROM2 << "\t" << POS2 << "\t2\t";
819 				N_SNPs_file2_only++;
820 				new_e2 = true;
821 			}
822 			else
823 			{
824 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
825 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
826 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
827 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
828 
829 				LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");
830 			}
831 		}
832 
833 		pair<string, string> genotype1, genotype2;
834 		pair<int,int> geno_ids1, geno_ids2;
835 		pair<string, string> missing_genotype(".",".");
836 		pair<int, int> missing_id(-1,-1);
837 
838 		unsigned int N_common_called=0;	// Number of genotypes called in both files
839 		unsigned int N_missing_1=0, N_missing_2=0;
840 		unsigned int N_discord=0;
841 		unsigned int N_concord_non_missing=0;
842 
843 		if(new_e1 && new_e2)
844 		{
845 			alleles_match = (ALT1 == ALT2) && (REF1 == REF2);
846 			diffsites << alleles_match;
847 
848 			e1->parse_full_entry(true);
849 			e1->parse_genotype_entries(true);
850 			e2->parse_full_entry(true);
851 			e2->parse_genotype_entries(true);
852 
853 			for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
854 			{
855 				indv1 = combined_individuals_it->second.first;
856 				indv2 = combined_individuals_it->second.second;
857 
858 				if ((indv1 == -1) || (indv2 == -1))
859 					continue;	// Individual not found in one of the files
860 
861 				if (alleles_match)
862 				{	// Alleles match, so can compare ids instead of strings
863 					e1->get_indv_GENOTYPE_ids(indv1, geno_ids1);
864 					e2->get_indv_GENOTYPE_ids(indv2, geno_ids2);
865 
866 					if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id))
867 					{
868 						N_common_called++;
869 						if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) ||
870 								((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) )
871 						{	// Match
872 							N_concord_non_missing++;
873 						}
874 						else
875 						{	// Mismatch
876 							N_discord++;
877 						}
878 					}
879 					else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id))
880 					{	// Both missing
881 						N_missing_1++; N_missing_2++;
882 					}
883 					else if (geno_ids1 != missing_id)
884 					{	// Genotype 1 is not missing, genotype 2 is.
885 						N_missing_2++;
886 					}
887 					else if (geno_ids2 != missing_id)
888 					{	// Genotype 2 is not missing, genotype 1 is.
889 						N_missing_1++;
890 					}
891 					else
892 						LOG.error("Unknown condition");
893 				}
894 				else
895 				{	// Alleles don't match, so need to be more careful and compare strings
896 					e1->get_indv_GENOTYPE_strings(indv1, genotype1);
897 					e2->get_indv_GENOTYPE_strings(indv2, genotype2);
898 
899 					if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
900 					{	// No missing data
901 						N_common_called++;
902 						if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
903 								((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
904 						{	// Match
905 							N_concord_non_missing++;
906 						}
907 						else
908 						{	// Mismatch
909 							N_discord++;
910 						}
911 					}
912 					else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
913 					{	// Both missing
914 						N_missing_1++; N_missing_2++;
915 					}
916 					else if (genotype1 != missing_genotype)
917 					{	// Genotype 1 is not missing, genotype 2 is.
918 						N_missing_2++;
919 					}
920 					else if (genotype2 != missing_genotype)
921 					{	// Genotype 2 is not missing, genotype 1 is.
922 						N_missing_1++;
923 					}
924 					else
925 						LOG.error("Unknown condition");
926 				}
927 			}
928 		}
929 		else
930 			diffsites << "0";
931 
932 		double discordance = N_discord / double(N_common_called);
933 		diffsites << "\t" << N_common_called << "\t" << N_discord << "\t" << discordance;
934 		diffsites << endl;
935 	}
936 
937 	LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
938 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
939 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
940 	delete e1;
941 	delete e2;
942 }
943 
output_discordance_matrix(const parameters & params,variant_file & diff_variant_file)944 void variant_file::output_discordance_matrix(const parameters &params, variant_file &diff_variant_file)
945 {
946 	map<string, pair< int, int> > combined_individuals;
947 	map<string, pair< int, int> >::iterator combined_individuals_it;
948 	return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
949 
950 	LOG.printLOG("Outputting Discordance Matrix\n\tFor bi-allelic loci, called in both files, with matching alleles only...\n");
951 	string CHROM;
952 	vector<char> variant_line;
953 	int indv1, indv2;
954 	entry *e1 = get_entry_object();
955 	entry *e2 = diff_variant_file.get_entry_object();
956 
957 	bool new_e1 = true;
958 	bool new_e2 = true;
959 	string CHROM1 = "";
960 	string CHROM2 = "";
961 	string curr_CHROM = "";
962 	vector<string> all_CHROM;
963 	int POS1 = -1;
964 	int POS2 = -1;
965 	string REF1 = "";
966 	string REF2 = "";
967 	string ALT1 = "";
968 	string ALT2 = "";
969 	int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
970 	vector<vector<int> > discordance_matrix(4, vector<int>(4, 0));
971 
972 	if (combined_individuals.size() <= 0)
973 		LOG.error("No overlapping individuals can be found.");
974 
975 	while(true)
976 	{
977 		if(new_e1)
978 		{
979 			while(!eof())
980 			{
981 				get_entry(variant_line);
982 				e1->reset(variant_line);
983 				N_entries += e1->apply_filters(params);
984 
985 				if(!e1->passed_filters)
986 					continue;
987 				N_kept_entries++;
988 				e1->parse_basic_entry(true);
989 				CHROM1 = e1->get_CHROM();
990 				POS1 = e1->get_POS();
991 				REF1 = e1->get_REF();
992 				ALT1 = e1->get_ALT();
993 				break;
994 			}
995 			new_e1 = false;
996 		}
997 		if(new_e2)
998 		{
999 			while(!diff_variant_file.eof())
1000 			{
1001 				diff_variant_file.get_entry(variant_line);
1002 				e2->reset(variant_line);
1003 				diff_variant_file.N_entries += e2->apply_filters(params);
1004 
1005 				if(!e2->passed_filters)
1006 					continue;
1007 				diff_variant_file.N_kept_entries++;
1008 				e2->parse_basic_entry(true);
1009 				CHROM2 = e2->get_CHROM();
1010 				POS2 = e2->get_POS();
1011 				REF2 = e2->get_REF();
1012 				ALT2 = e2->get_ALT();
1013 				break;
1014 			}
1015 			new_e2 = false;
1016 		}
1017 
1018 		if(eof() && diff_variant_file.eof())
1019 			break;
1020 		else if(diff_variant_file.eof())
1021 		{
1022 			if(CHROM1 == curr_CHROM)
1023 			{
1024 				N_SNPs_file1_only++;
1025 				new_e1 = true;
1026 
1027 			}
1028 			else
1029 			{
1030 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
1031 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
1032 				else
1033 				{
1034 					curr_CHROM = CHROM1;
1035 					all_CHROM.push_back(CHROM1);
1036 					N_SNPs_file1_only++;
1037 					new_e1 = true;
1038 				}
1039 			}
1040 		}
1041 		else if(eof())
1042 		{
1043 			if(CHROM2 == curr_CHROM)
1044 			{
1045 				N_SNPs_file2_only++;
1046 				new_e2 = true;
1047 
1048 			}
1049 			else
1050 			{
1051 				if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
1052 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
1053 				else
1054 				{
1055 					curr_CHROM = CHROM2;
1056 					all_CHROM.push_back(CHROM2);
1057 					N_SNPs_file2_only++;
1058 					new_e2 = true;
1059 				}
1060 			}
1061 		}
1062 		else if(CHROM1 == CHROM2)
1063 		{
1064 			if (CHROM1 != curr_CHROM)
1065 			{
1066 				curr_CHROM = CHROM1;
1067 				all_CHROM.push_back(curr_CHROM);
1068 			}
1069 
1070 			if(POS1 == POS2)
1071 			{
1072 				if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") )
1073 					REF1 = REF2;
1074 				if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") )
1075 					REF2 = REF1;
1076 
1077 				new_e1 = true;
1078 				new_e2 = true;
1079 				if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != ""))
1080 				{
1081 					LOG.one_off_warning("Non-matching REF. Skipping all such sites.");
1082 					continue;
1083 				}
1084 				N_common_SNPs++;
1085 			}
1086 			else if(POS1 < POS2)
1087 			{
1088 				N_SNPs_file1_only++;
1089 				new_e1 = true;
1090 			}
1091 			else
1092 			{
1093 				N_SNPs_file2_only++;
1094 				new_e2 = true;
1095 			}
1096 		}
1097 		else
1098 		{
1099 			if (CHROM1 == curr_CHROM)
1100 			{
1101 				N_SNPs_file1_only++;
1102 				new_e1 = true;
1103 			}
1104 			else if (CHROM2 == curr_CHROM)
1105 			{
1106 				N_SNPs_file2_only++;
1107 				new_e2 = true;
1108 			}
1109 			else
1110 			{
1111 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
1112 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
1113 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
1114 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
1115 
1116 				LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");
1117 			}
1118 		}
1119 
1120 		if(new_e1 && new_e2)
1121 		{
1122 			if (e1->get_N_alleles() != 2 || e2->get_N_alleles() != 2)
1123 				continue;
1124 
1125 			if (ALT1 != ALT2)
1126 			{
1127 				LOG.one_off_warning("Non-matching ALT. Skipping all such sites.");
1128 				continue;
1129 			}
1130 			e1->parse_full_entry(true);
1131 			e1->parse_genotype_entries(true);
1132 
1133 			e2->parse_full_entry(true);
1134 			e2->parse_genotype_entries(true);
1135 
1136 			pair<int,int> geno_ids1, geno_ids2;
1137 			int N1, N2;
1138 
1139 			for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
1140 			{
1141 				indv1 = combined_individuals_it->second.first;
1142 				indv2 = combined_individuals_it->second.second;
1143 
1144 				if ((indv1 == -1) || (indv2 == -1))
1145 				{
1146 					LOG.one_off_warning("Non-matching individual found. Skipping all such combinations.");
1147 					continue;	// Individual not found in one of the files
1148 				}
1149 
1150 				// Alleles match, so can compare ids instead of strings
1151 				e1->get_indv_GENOTYPE_ids(indv1, geno_ids1);
1152 				e2->get_indv_GENOTYPE_ids(indv2, geno_ids2);
1153 
1154 				if (((geno_ids1.first != -1) && (geno_ids1.second == -1)) ||
1155 						((geno_ids2.first != -1) && (geno_ids2.second == -1)))
1156 				{	// Haploid
1157 					LOG.one_off_warning("***Warning: Haploid chromosomes not counted!***");
1158 					continue;
1159 				}
1160 
1161 				N1 = geno_ids1.first + geno_ids1.second;
1162 				N2 = geno_ids2.first + geno_ids2.second;
1163 
1164 				if ((N1 == -1) || (N1 < -2) || (N1 > 2))
1165 					LOG.error("Unhandled case");
1166 				if ((N2 == -1) || (N2 < -2) || (N2 > 2))
1167 					LOG.error("Unhandled case");
1168 
1169 				if (N1 == -2)
1170 					N1 = 3;
1171 
1172 				if (N2 == -2)
1173 					N2 = 3;
1174 
1175 				discordance_matrix[N1][N2]++;
1176 			}
1177 		}
1178 	}
1179 
1180 	string output_file = params.output_prefix + ".diff.discordance_matrix";
1181 	streambuf * buf;
1182 	ofstream temp_out;
1183 	if (!params.stream_out)
1184 	{
1185 		temp_out.open(output_file.c_str(), ios::out);
1186 		if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
1187 		buf = temp_out.rdbuf();
1188 	}
1189 	else
1190 		buf = cout.rdbuf();
1191 
1192 	ostream out(buf);
1193 
1194 	out << "-\tN_0/0_file1\tN_0/1_file1\tN_1/1_file1\tN_./._file1" << endl;
1195 	out << "N_0/0_file2\t" << discordance_matrix[0][0] << "\t" << discordance_matrix[1][0] << "\t" << discordance_matrix[2][0] << "\t" << discordance_matrix[3][0] << endl;
1196 	out << "N_0/1_file2\t" << discordance_matrix[0][1] << "\t" << discordance_matrix[1][1] << "\t" << discordance_matrix[2][1] << "\t" << discordance_matrix[3][1] << endl;
1197 	out << "N_1/1_file2\t" << discordance_matrix[0][2] << "\t" << discordance_matrix[1][2] << "\t" << discordance_matrix[2][2] << "\t" << discordance_matrix[3][2] << endl;
1198 	out << "N_./._file2\t" << discordance_matrix[0][3] << "\t" << discordance_matrix[1][3] << "\t" << discordance_matrix[2][3] << "\t" << discordance_matrix[3][3] << endl;
1199 
1200 	LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
1201 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
1202 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
1203 	delete e1;
1204 	delete e2;
1205 }
1206 
output_switch_error(const parameters & params,variant_file & diff_variant_file)1207 void variant_file::output_switch_error(const parameters &params, variant_file &diff_variant_file)
1208 {
1209 	map<string, pair< int, int> > combined_individuals;
1210 	map<string, pair< int, int> >::iterator combined_individuals_it;
1211 	return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
1212 
1213 	LOG.printLOG("Outputting Phase Switch Errors...\n");
1214 
1215 	vector<char> variant_line;
1216 	int indv1, indv2;
1217 	entry *e1 = get_entry_object();
1218 	entry *e2 = diff_variant_file.get_entry_object();
1219 
1220 	bool new_e1 = true;
1221 	bool new_e2 = true;
1222 	string CHROM1 = "";
1223 	string CHROM2 = "";
1224 	string curr_CHROM = "";
1225 	vector<string> all_CHROM;
1226 	int POS1 = -1;
1227 	int POS2 = -1;
1228 	string REF1 = "";
1229 	string REF2 = "";
1230 	string ALT1 = "";
1231 	string ALT2 = "";
1232 	int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
1233 
1234 	string output_file = params.output_prefix + ".diff.switch";
1235 	streambuf * buf;
1236 	ofstream temp_out;
1237 	if (!params.stream_out)
1238 	{
1239 		temp_out.open(output_file.c_str(), ios::out);
1240 		if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
1241 		buf = temp_out.rdbuf();
1242 	}
1243 	else
1244 		buf = cout.rdbuf();
1245 
1246 	ostream switcherror(buf);
1247 
1248 	switcherror << "CHROM\tPOS_START\tPOS_END\tINDV" << endl;
1249 
1250 	unsigned int N_combined_indv = combined_individuals.size();
1251 	vector<int> N_phased_het_sites(N_combined_indv, 0);
1252 	vector<int> N_switch_errors(N_combined_indv, 0);
1253 
1254 	pair<string, string> missing_genotype(".",".");
1255 	pair<string, int> missing_loc(".",-1);
1256 	vector<pair<string, string> > prev_geno_file1(N_combined_indv, missing_genotype);
1257 	vector<pair<string, string> > prev_geno_file2(N_combined_indv, missing_genotype);
1258 	vector<pair<string, int> > prev_pos_file1(N_combined_indv, missing_loc);
1259 	vector<pair<string, int> > prev_pos_file2(N_combined_indv, missing_loc);
1260 	pair<string, string> file1_hap1, file1_hap2, file2_hap1;
1261 
1262 	if (N_combined_indv <= 0)
1263 		LOG.error("No overlapping individuals can be found.");
1264 
1265 	while(true)
1266 	{
1267 		if(new_e1)
1268 		{
1269 			while(!eof())
1270 			{
1271 				get_entry(variant_line);
1272 				e1->reset(variant_line);
1273 				N_entries += e1->apply_filters(params);
1274 
1275 				if(!e1->passed_filters)
1276 					continue;
1277 				N_kept_entries++;
1278 				e1->parse_basic_entry(true);
1279 				CHROM1 = e1->get_CHROM();
1280 				POS1 = e1->get_POS();
1281 				REF1 = e1->get_REF();
1282 				ALT1 = e1->get_ALT();
1283 				break;
1284 			}
1285 			new_e1 = false;
1286 		}
1287 		if(new_e2)
1288 		{
1289 			while(!diff_variant_file.eof())
1290 			{
1291 				diff_variant_file.get_entry(variant_line);
1292 				e2->reset(variant_line);
1293 				diff_variant_file.N_entries += e2->apply_filters(params);
1294 
1295 				if(!e2->passed_filters)
1296 					continue;
1297 				diff_variant_file.N_kept_entries++;
1298 				e2->parse_basic_entry(true);
1299 				CHROM2 = e2->get_CHROM();
1300 				POS2 = e2->get_POS();
1301 				REF2 = e2->get_REF();
1302 				ALT2 = e2->get_ALT();
1303 				break;
1304 			}
1305 			new_e2 = false;
1306 		}
1307 
1308 		if(eof() && diff_variant_file.eof())
1309 			break;
1310 		else if(diff_variant_file.eof())
1311 		{
1312 			if(CHROM1 == curr_CHROM)
1313 			{
1314 				N_SNPs_file1_only++;
1315 				new_e1 = true;
1316 			}
1317 			else
1318 			{
1319 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
1320 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
1321 				else
1322 				{
1323 					curr_CHROM = CHROM1;
1324 					all_CHROM.push_back(CHROM1);
1325 					N_SNPs_file1_only++;
1326 					new_e1 = true;
1327 				}
1328 			}
1329 		}
1330 		else if(eof())
1331 		{
1332 			if(CHROM2 == curr_CHROM)
1333 			{
1334 				N_SNPs_file2_only++;
1335 				new_e2 = true;
1336 			}
1337 			else
1338 			{
1339 				if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
1340 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
1341 				else
1342 				{
1343 					curr_CHROM = CHROM2;
1344 					all_CHROM.push_back(CHROM2);
1345 					N_SNPs_file2_only++;
1346 					new_e2 = true;
1347 				}
1348 			}
1349 		}
1350 		else if(CHROM1 == CHROM2)
1351 		{
1352 			if (CHROM1 != curr_CHROM)
1353 			{
1354 				curr_CHROM = CHROM1;
1355 				all_CHROM.push_back(curr_CHROM);
1356 			}
1357 
1358 			if(POS1 == POS2)
1359 			{
1360 				N_common_SNPs++;
1361 				new_e1 = true;
1362 				new_e2 = true;
1363 			}
1364 			else if(POS1 < POS2)
1365 			{
1366 				N_SNPs_file1_only++;
1367 				new_e1 = true;
1368 			}
1369 			else
1370 			{
1371 				N_SNPs_file2_only++;
1372 				new_e2 = true;
1373 			}
1374 		}
1375 		else
1376 		{
1377 			if (CHROM1 == curr_CHROM)
1378 			{
1379 				N_SNPs_file1_only++;
1380 				new_e1 = true;
1381 			}
1382 			else if (CHROM2 == curr_CHROM)
1383 			{
1384 				N_SNPs_file2_only++;
1385 				new_e2 = true;
1386 			}
1387 			else
1388 			{
1389 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
1390 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
1391 				if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
1392 					LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
1393 
1394 				LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");			}
1395 		}
1396 		if(new_e1 && new_e2)
1397 		{
1398 			e1->parse_full_entry(true);
1399 			e1->parse_genotype_entries(true);
1400 
1401 			e2->parse_full_entry(true);
1402 			e2->parse_genotype_entries(true);
1403 
1404 			pair<string, string> genotype1, genotype2;
1405 			pair<string, string> missing_genotype(".",".");
1406 
1407 			unsigned int N_common_called=0;	// Number of genotypes called in both files
1408 			unsigned int indv_count=0;
1409 
1410 			for (combined_individuals_it=combined_individuals.begin();
1411 					combined_individuals_it!=combined_individuals.end();
1412 					++combined_individuals_it, indv_count++)
1413 			{
1414 				indv1 = combined_individuals_it->second.first;
1415 				indv2 = combined_individuals_it->second.second;
1416 
1417 				if ((indv1 == -1) || (indv2 == -1))
1418 				{
1419 					LOG.one_off_warning("Non-matching individual found. Skipping all such combinations.");
1420 					continue;	// Individual not found in one of the files
1421 				}
1422 
1423 				e1->get_indv_GENOTYPE_strings(indv1, genotype1);
1424 				e2->get_indv_GENOTYPE_strings(indv2, genotype2);
1425 
1426 				if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
1427 				{	// No missing data
1428 					N_common_called++;
1429 					if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
1430 							((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
1431 					{	// Have a matching genotypes in files 1 and 2
1432 						if (genotype1.first != genotype1.second)
1433 						{	// It's a heterozgote
1434 							char phase1, phase2;
1435 							phase1 = e1->get_indv_PHASE(indv1);
1436 							phase2 = e2->get_indv_PHASE(indv2);
1437 							if ((phase1 == '|') && (phase2 == '|'))
1438 							{	// Calculate Phasing error (switch error)
1439 								N_phased_het_sites[indv_count]++;
1440 								file1_hap1 = make_pair<string,string>((string)prev_geno_file1[indv_count].first, (string)genotype1.first);
1441 								file1_hap2 = make_pair<string,string>((string)prev_geno_file1[indv_count].second, (string)genotype1.second);
1442 								file2_hap1 = make_pair<string,string>((string)prev_geno_file2[indv_count].first, (string)genotype2.first);
1443 
1444 								if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2))
1445 								{	// Must be a switch error
1446 									string indv_id;
1447 									N_switch_errors[indv_count]++;
1448 									if (indv1 != -1)
1449 										indv_id = meta_data.indv[indv1];
1450 									else
1451 										indv_id = diff_variant_file.meta_data.indv[indv2];
1452 
1453 									if (prev_pos_file1[indv_count].first == prev_pos_file2[indv_count].first)
1454 									{
1455 										if (prev_pos_file1[indv_count].second <= prev_pos_file2[indv_count].second)
1456 											switcherror << prev_pos_file1[indv_count].first << "\t" << prev_pos_file1[indv_count].second << "\t" << POS1 << "\t" << indv_id << endl;
1457 										else
1458 											switcherror << prev_pos_file1[indv_count].first << "\t" << prev_pos_file2[indv_count].second << "\t" << POS1 << "\t" << indv_id << endl;
1459 									}
1460 								}
1461 								prev_geno_file1[indv_count] = genotype1;
1462 								prev_geno_file2[indv_count] = genotype2;
1463 								prev_pos_file1[indv_count] = std::pair<string,int>(CHROM1,POS1);
1464 								prev_pos_file2[indv_count] = std::pair<string,int>(CHROM2,POS2);
1465 							}
1466 						}
1467 					}
1468 				}
1469 			}
1470 		}
1471 	}
1472 	delete e1;
1473 	delete e2;
1474 
1475 	output_file = params.output_prefix + ".diff.indv.switch";
1476 	ofstream idiscord(output_file.c_str());
1477 	if (!idiscord.is_open())
1478 		LOG.error("Could not open Individual Discordance File: " + output_file, 3);
1479 
1480 	idiscord << "INDV\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl;
1481 	unsigned int indv_count=0;
1482 	double switch_error;
1483 	string indv_id;
1484 	for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
1485 	{
1486 		indv1 = combined_individuals_it->second.first;
1487 		indv2 = combined_individuals_it->second.second;
1488 
1489 		if (indv1 != -1)
1490 			indv_id = meta_data.indv[indv1];
1491 		else
1492 			indv_id = diff_variant_file.meta_data.indv[indv2];
1493 
1494 		if (N_phased_het_sites[indv_count] > 0)
1495 			switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count];
1496 		else
1497 			switch_error = 0;
1498 		idiscord << indv_id << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl;
1499 
1500 		indv_count++;
1501 	}
1502 	idiscord.close();
1503 
1504 	LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
1505 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
1506 	LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
1507 }
1508