1 /*
2 * variant_file_diff.cpp
3 *
4 * Created on: Oct 30, 2009
5 * Author: Adam Auton
6 * ($Revision: 230 $)
7 */
8
9 #include "variant_file.h"
10
return_indv_union(variant_file & file2,map<string,pair<int,int>> & combined_individuals,const string & indv_ID_map_file)11 void variant_file::return_indv_union(variant_file &file2, map<string, pair< int, int> > &combined_individuals, const string &indv_ID_map_file)
12 {
13 map<string, string> indv_map;
14 bool use_map = false;
15 if (indv_ID_map_file != "")
16 {
17 LOG.printLOG("Reading individual mapping file. ");
18 ifstream map(indv_ID_map_file.c_str());
19 if (!map.is_open())
20 LOG.error("Could not open map file: " + indv_ID_map_file);
21 while (!map.eof())
22 {
23 string indv1, indv2;
24 map >> indv1 >> indv2;
25 map.ignore(numeric_limits<streamsize>::max(), '\n');
26 if ((indv1 != "") && (indv1.substr(0,1) != "#"))
27 {
28 indv_map[indv1] = indv2;
29 }
30 }
31 map.close();
32 use_map = true;
33 LOG.printLOG("Read " + LOG.int2str(indv_map.size()) + " entries.\n");
34 }
35
36 for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
37 if (include_indv[ui] == true)
38 {
39 combined_individuals[meta_data.indv[ui]] = make_pair<int,int>((int)ui, -1);
40 }
41
42 for (unsigned int ui=0; ui<file2.meta_data.N_indv; ui++)
43 if (file2.include_indv[ui] == true)
44 {
45 string indv_id = file2.meta_data.indv[ui];
46 if (use_map == true)
47 {
48 if (indv_map.find(indv_id) != indv_map.end())
49 indv_id = indv_map[indv_id];
50 }
51 if (combined_individuals.find(indv_id) != combined_individuals.end())
52 {
53 combined_individuals[indv_id].second = ui;
54 }
55 else
56 combined_individuals[indv_id] = make_pair<int,int>(-1, (int)ui);
57 }
58 }
59
output_sites_in_files(const parameters & params,variant_file & diff_variant_file)60 void variant_file::output_sites_in_files(const parameters ¶ms, variant_file &diff_variant_file)
61 {
62 string CHROM;
63 vector<char> variant_line;
64 entry *e1 = get_entry_object();
65 entry *e2 = diff_variant_file.get_entry_object();
66
67 bool new_e1 = true;
68 bool new_e2 = true;
69 string CHROM1 = "";
70 string CHROM2 = "";
71 string curr_CHROM = "";
72 vector<string> all_CHROM;
73 int POS1 = -1;
74 int POS2 = -1;
75 string REF1 = "";
76 string REF2 = "";
77 string ALT1 = "";
78 string ALT2 = "";
79 int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0, N_overlap_SNPs = 0;
80 string output_file = params.output_prefix + ".diff.sites_in_files";
81
82 streambuf * buf;
83 ofstream temp_out;
84 if (!params.stream_out)
85 {
86 temp_out.open(output_file.c_str(), ios::out);
87 if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
88 buf = temp_out.rdbuf();
89 }
90 else
91 buf = cout.rdbuf();
92
93 ostream sites_in_files(buf);
94
95 sites_in_files << "CHROM\tPOS1\tPOS2\tIN_FILE\tREF1\tREF2\tALT1\tALT2" << endl;
96
97 LOG.printLOG("Comparing sites in VCF files...\n");
98 while(true)
99 {
100 if(new_e1)
101 {
102 while(!eof())
103 {
104 get_entry(variant_line);
105 e1->reset(variant_line);
106 N_entries += e1->apply_filters(params);
107
108 if(!e1->passed_filters)
109 continue;
110 N_kept_entries++;
111 e1->parse_basic_entry(true);
112 CHROM1 = e1->get_CHROM();
113 POS1 = e1->get_POS();
114 REF1 = e1->get_REF();
115 ALT1 = e1->get_ALT();
116 break;
117 }
118 new_e1 = false;
119 }
120 if(new_e2)
121 {
122 while(!diff_variant_file.eof())
123 {
124 diff_variant_file.get_entry(variant_line);
125 e2->reset(variant_line);
126 diff_variant_file.N_entries += e2->apply_filters(params);
127
128 if(!e2->passed_filters)
129 continue;
130 diff_variant_file.N_kept_entries++;
131 e2->parse_basic_entry(true);
132 CHROM2 = e2->get_CHROM();
133 POS2 = e2->get_POS();
134 REF2 = e2->get_REF();
135 ALT2 = e2->get_ALT();
136 break;
137 }
138 new_e2 = false;
139 }
140
141 if(eof() && diff_variant_file.eof())
142 break;
143 else if(diff_variant_file.eof())
144 {
145 if(CHROM1 == curr_CHROM)
146 {
147 sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl;
148 N_SNPs_file1_only++;
149 new_e1 = true;
150
151 }
152 else
153 {
154 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
155 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
156 else
157 {
158 curr_CHROM = CHROM1;
159 all_CHROM.push_back(CHROM1);
160 sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl;
161 N_SNPs_file1_only++;
162 new_e1 = true;
163 }
164 }
165 }
166 else if(eof())
167 {
168 if(CHROM2 == curr_CHROM)
169 {
170 sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl;
171 N_SNPs_file2_only++;
172 new_e2 = true;
173
174 }
175 else
176 {
177 if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
178 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
179 else
180 {
181 curr_CHROM = CHROM2;
182 all_CHROM.push_back(CHROM2);
183 sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl;
184 N_SNPs_file2_only++;
185 new_e2 = true;
186 }
187 }
188 }
189 else if(CHROM1 == CHROM2)
190 {
191 if (CHROM1 != curr_CHROM)
192 {
193 curr_CHROM = CHROM1;
194 all_CHROM.push_back(curr_CHROM);
195 }
196
197 if(POS1 == POS2)
198 {
199 if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") )
200 REF1 = REF2;
201 if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") )
202 REF2 = REF1;
203
204 new_e1 = true;
205 new_e2 = true;
206 if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != ""))
207 {
208 sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 << "\t" << ALT1 << "\t" << ALT2 << endl;
209 N_overlap_SNPs++;
210 }
211 else
212 {
213 sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tB\t" << REF1 << "\t" << REF2 << "\t" << ALT1 << "\t" << ALT2 << endl;
214 N_common_SNPs++;
215 }
216 }
217 else if(POS1 < POS2)
218 {
219 if (POS2 < (POS1+REF1.size()))
220 {
221 sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 <<"\t" << ALT1 << "\t" << ALT2 << endl;
222 N_overlap_SNPs++;
223 new_e1 = true;
224 new_e2 = true;
225 }
226 else
227 {
228 sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl;
229 N_SNPs_file1_only++;
230 new_e1 = true;
231 }
232 }
233 else
234 {
235 if (POS1 < (POS2+REF2.size()))
236 {
237 sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 <<"\t" << ALT1 << "\t" << ALT2 << endl;
238 N_overlap_SNPs++;
239 new_e1 = true;
240 new_e2 = true;
241 }
242 else
243 {
244 sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl;
245 N_SNPs_file2_only++;
246 new_e2 = true;
247 }
248 }
249 }
250 else
251 {
252 if (CHROM1 == curr_CHROM)
253 {
254 sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl;
255 N_SNPs_file1_only++;
256 new_e1 = true;
257 }
258 else if (CHROM2 == curr_CHROM)
259 {
260 sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl;
261 N_SNPs_file2_only++;
262 new_e2 = true;
263 }
264 else
265 {
266 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
267 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
268 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
269 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
270
271 LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");
272 }
273 }
274 }
275
276 LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
277 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
278 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
279 LOG.printLOG("Found " + output_log::int2str(N_overlap_SNPs) + " non-matching overlapping sites.\n");
280 delete e1;
281 delete e2;
282 }
283
output_indv_in_files(const parameters & params,variant_file & diff_variant_file)284 void variant_file::output_indv_in_files(const parameters ¶ms, variant_file &diff_variant_file)
285 {
286 LOG.printLOG("Comparing individuals in VCF files...\n");
287
288 string output_file = params.output_prefix + ".diff.indv_in_files";
289
290 streambuf * buf;
291 ofstream temp_out;
292 if (!params.stream_out)
293 {
294 temp_out.open(output_file.c_str(), ios::out);
295 if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
296 buf = temp_out.rdbuf();
297 }
298 else
299 buf = cout.rdbuf();
300
301 ostream out(buf);
302 out << "INDV\tFILES" << endl;
303
304 // Build a list of individuals contained in each file
305 map<string, pair< int, int> > combined_individuals;
306 map<string, pair< int, int> >::iterator combined_individuals_it;
307 return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
308
309 unsigned int N_combined_indv = combined_individuals.size();
310 unsigned int N[3]={0,0,0};
311 for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
312 {
313 if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1))
314 {
315 N[0]++;
316 out << combined_individuals_it->first << "\tB" << endl;
317 }
318 else if (combined_individuals_it->second.first != -1)
319 {
320 N[1]++;
321 out << combined_individuals_it->first << "\t1" << endl;
322 }
323 else if (combined_individuals_it->second.second != -1)
324 {
325 N[2]++;
326 out << combined_individuals_it->first << "\t2" << endl;
327 }
328 else
329 LOG.error("Unhandled case");
330 }
331
332 LOG.printLOG("N_combined_individuals:\t" + output_log::int2str(N_combined_indv) + "\n");
333 LOG.printLOG("N_individuals_common_to_both_files:\t" + output_log::int2str(N[0]) + "\n");
334 LOG.printLOG("N_individuals_unique_to_file1:\t" + output_log::int2str(N[1]) + "\n");
335 LOG.printLOG("N_individuals_unique_to_file2:\t" + output_log::int2str(N[2]) + "\n");
336 }
337
output_discordance_by_indv(const parameters & params,variant_file & diff_variant_file)338 void variant_file::output_discordance_by_indv(const parameters ¶ms, variant_file &diff_variant_file)
339 {
340 map<string, pair< int, int> > combined_individuals;
341 map<string, pair< int, int> >::iterator combined_individuals_it;
342 return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
343
344 LOG.printLOG("Outputting Discordance By Individual...\n");
345 map<string, pair<int, int> > indv_sums;
346
347 vector<char> variant_line;
348 int indv1, indv2;
349 entry * e1 = get_entry_object();
350 entry * e2 = diff_variant_file.get_entry_object();
351 string CHROM;
352 bool new_e1 = true;
353 bool new_e2 = true;
354 string CHROM1 = "";
355 string CHROM2 = "";
356 string curr_CHROM = "";
357 vector<string> all_CHROM;
358 int POS1 = -1;
359 int POS2 = -1;
360 string REF1 = "";
361 string REF2 = "";
362 string ALT1 = "";
363 string ALT2 = "";
364 bool alleles_match = false;
365 pair<string, string> genotype1, genotype2;
366 pair<int,int> geno_ids1, geno_ids2;
367 pair<string, string> missing_genotype(".",".");
368 pair<int, int> missing_id(-1,-1);
369 int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
370
371 while(true)
372 {
373 if(new_e1)
374 {
375 while(!eof())
376 {
377 get_entry(variant_line);
378 e1->reset(variant_line);
379 N_entries += e1->apply_filters(params);
380
381 if(!e1->passed_filters)
382 continue;
383 N_kept_entries++;
384 e1->parse_basic_entry(true);
385 CHROM1 = e1->get_CHROM();
386 POS1 = e1->get_POS();
387 REF1 = e1->get_REF();
388 ALT1 = e1->get_ALT();
389 break;
390 }
391 new_e1 = false;
392 }
393 if(new_e2)
394 {
395 while(!diff_variant_file.eof())
396 {
397 diff_variant_file.get_entry(variant_line);
398 e2->reset(variant_line);
399 diff_variant_file.N_entries += e2->apply_filters(params);
400
401 if(!e2->passed_filters)
402 continue;
403 diff_variant_file.N_kept_entries++;
404 e2->parse_basic_entry(true);
405 CHROM2 = e2->get_CHROM();
406 POS2 = e2->get_POS();
407 REF2 = e2->get_REF();
408 ALT2 = e2->get_ALT();
409 break;
410 }
411 new_e2 = false;
412 }
413
414 if(eof() && diff_variant_file.eof())
415 break;
416 else if(diff_variant_file.eof())
417 {
418 if(CHROM1 == curr_CHROM)
419 {
420 N_SNPs_file1_only++;
421 new_e1 = true;
422
423 }
424 else
425 {
426 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
427 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
428 else
429 {
430 curr_CHROM = CHROM1;
431 all_CHROM.push_back(CHROM1);
432 N_SNPs_file1_only++;
433 new_e1 = true;
434 }
435 }
436 }
437 else if(eof())
438 {
439 if(CHROM2 == curr_CHROM)
440 {
441 N_SNPs_file2_only++;
442 new_e2 = true;
443
444 }
445 else
446 {
447 if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
448 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
449 else
450 {
451 curr_CHROM = CHROM2;
452 all_CHROM.push_back(CHROM2);
453 N_SNPs_file2_only++;
454 new_e2 = true;
455 }
456 }
457 }
458 else if(CHROM1 == CHROM2)
459 {
460 if (CHROM1 != curr_CHROM)
461 {
462 curr_CHROM = CHROM1;
463 all_CHROM.push_back(curr_CHROM);
464 }
465
466 if(POS1 == POS2)
467 {
468 new_e1 = true;
469 new_e2 = true;
470 N_common_SNPs++;
471 }
472 else if(POS1 < POS2)
473 {
474 new_e1 = true;
475 N_SNPs_file1_only++;
476 }
477 else
478 {
479 new_e2 = true;
480 N_SNPs_file2_only++;
481 }
482 }
483 else
484 {
485 if (CHROM1 == curr_CHROM)
486 {
487 new_e1 = true;
488 N_SNPs_file1_only++;
489 }
490 else if (CHROM2 == curr_CHROM)
491 {
492 new_e2 = true;
493 N_SNPs_file2_only++;
494 }
495 else
496 {
497 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
498 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
499 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
500 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
501
502 LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");
503 }
504 }
505
506 if(new_e1 && new_e2)
507 {
508 if (REF1 == "N")
509 REF1 = REF2;
510 if (REF2 == "N")
511 REF2 = REF1;
512
513 if ((REF1.size() != REF2.size()) || ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N")))
514 {
515 LOG.one_off_warning("Non-matching REF. Skipping all such sites.");
516 continue;
517 }
518 alleles_match = (ALT1 == ALT2) && (REF1 == REF2);
519 e1->parse_full_entry(true);
520 e1->parse_genotype_entries(true);
521 e2->parse_full_entry(true);
522 e2->parse_genotype_entries(true);
523
524 for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
525 {
526 indv1 = combined_individuals_it->second.first;
527 indv2 = combined_individuals_it->second.second;
528
529 if ((indv1 == -1) || (indv2 == -1))
530 continue; // Individual not found in one of the files
531
532 if (alleles_match)
533 { // Alleles match, so can compare ids instead of strings
534 e1->get_indv_GENOTYPE_ids(indv1, geno_ids1);
535 e2->get_indv_GENOTYPE_ids(indv2, geno_ids2);
536
537 if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id))
538 {
539 indv_sums[combined_individuals_it->first].first++;
540 if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) ||
541 ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) )
542 { // Match
543 // Don't do anything
544 }
545 else
546 { // Mismatch
547 indv_sums[combined_individuals_it->first].second++;
548 }
549 }
550 else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id))
551 { // Both missing
552 // Don't do anything.
553 }
554 else if (geno_ids1 != missing_id)
555 { // Genotype 1 is not missing, genotype 2 is.
556 // Don't do anything.
557 }
558 else if (geno_ids2 != missing_id)
559 { // Genotype 2 is not missing, genotype 1 is.
560 // Don't do anything.
561 }
562 else
563 LOG.error("Unknown condition");
564 }
565 else
566 { // Alleles don't match, so need to be more careful and compare strings
567 e1->get_indv_GENOTYPE_strings(indv1, genotype1);
568 e2->get_indv_GENOTYPE_strings(indv2, genotype2);
569
570 if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
571 { // No missing data
572 indv_sums[combined_individuals_it->first].first++;
573 if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
574 ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
575 { // Match
576 // Don't do anything
577 }
578 else
579 { // Mismatch
580 indv_sums[combined_individuals_it->first].second++;
581 }
582 }
583 else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
584 { // Both missing
585 // Don't do anything
586 }
587 else if (genotype1 != missing_genotype)
588 { // Genotype 1 is not missing, genotype 2 is.
589 // Don't do anything
590 }
591 else if (genotype2 != missing_genotype)
592 { // Genotype 2 is not missing, genotype 1 is.
593 // Don't do anything
594 }
595 else
596 LOG.error("Unknown condition");
597 }
598 }
599 }
600 }
601
602 string output_file = params.output_prefix + ".diff.indv";
603 streambuf * buf;
604 ofstream temp_out;
605 if (!params.stream_out)
606 {
607 temp_out.open(output_file.c_str(), ios::out);
608 if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
609 buf = temp_out.rdbuf();
610 }
611 else
612 buf = cout.rdbuf();
613
614 ostream out(buf);
615 out << "INDV\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl;
616
617 int N, N_discord;
618 double discordance;
619 for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
620 {
621 out << combined_individuals_it->first;
622 N = indv_sums[combined_individuals_it->first].first;
623 N_discord = indv_sums[combined_individuals_it->first].second;
624 discordance = N_discord / double(N);
625 out << "\t" << N << "\t" << N_discord << "\t" << discordance << endl;
626 }
627 LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
628 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
629 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
630
631 delete e1;
632 delete e2;
633 }
634
output_discordance_by_site(const parameters & params,variant_file & diff_variant_file)635 void variant_file::output_discordance_by_site(const parameters ¶ms, variant_file &diff_variant_file)
636 {
637 map<string, pair< int, int> > combined_individuals;
638 map<string, pair< int, int> >::iterator combined_individuals_it;
639 return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
640
641 LOG.printLOG("Outputting Discordance By Site...\n");
642
643 string CHROM;
644 vector<char> variant_line;
645 int indv1, indv2;
646 entry *e1 = get_entry_object();
647 entry *e2 = diff_variant_file.get_entry_object();
648
649 bool new_e1 = true;
650 bool new_e2 = true;
651 string CHROM1 = "";
652 string CHROM2 = "";
653 string curr_CHROM = "";
654 vector<string> all_CHROM;
655 int POS1 = -1;
656 int POS2 = -1;
657 string REF1 = "";
658 string REF2 = "";
659 string ALT1 = "";
660 string ALT2 = "";
661 bool alleles_match = false;
662 int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
663
664 string output_file = params.output_prefix + ".diff.sites";
665 streambuf * buf;
666 ofstream temp_out;
667 if (!params.stream_out)
668 {
669 temp_out.open(output_file.c_str(), ios::out);
670 if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
671 buf = temp_out.rdbuf();
672 }
673 else
674 buf = cout.rdbuf();
675
676 ostream diffsites(buf);
677 diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALLELES\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl;
678
679 while(true)
680 {
681 if(new_e1)
682 {
683 while(!eof())
684 {
685 get_entry(variant_line);
686 e1->reset(variant_line);
687 N_entries += e1->apply_filters(params);
688
689 if(!e1->passed_filters)
690 continue;
691 N_kept_entries++;
692 e1->parse_basic_entry(true);
693 CHROM1 = e1->get_CHROM();
694 POS1 = e1->get_POS();
695 REF1 = e1->get_REF();
696 ALT1 = e1->get_ALT();
697 break;
698 }
699 new_e1 = false;
700 }
701 if(new_e2)
702 {
703 while(!diff_variant_file.eof())
704 {
705 diff_variant_file.get_entry(variant_line);
706 e2->reset(variant_line);
707 diff_variant_file.N_entries += e2->apply_filters(params);
708
709 if(!e2->passed_filters)
710 continue;
711 diff_variant_file.N_kept_entries++;
712 e2->parse_basic_entry(true);
713 CHROM2 = e2->get_CHROM();
714 POS2 = e2->get_POS();
715 REF2 = e2->get_REF();
716 ALT2 = e2->get_ALT();
717 break;
718 }
719 new_e2 = false;
720 }
721
722 if(eof() && diff_variant_file.eof())
723 break;
724 else if(diff_variant_file.eof())
725 {
726 if(CHROM1 == curr_CHROM)
727 {
728 diffsites << CHROM1 << "\t" << POS1 << "\t1\t";
729 N_SNPs_file1_only++;
730 new_e1 = true;
731
732 }
733 else
734 {
735 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
736 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
737 else
738 {
739 curr_CHROM = CHROM1;
740 all_CHROM.push_back(CHROM1);
741 diffsites << CHROM1 << "\t" << POS1 << "\t1\t";
742 N_SNPs_file1_only++;
743 new_e1 = true;
744 }
745 }
746 }
747 else if(eof())
748 {
749 if(CHROM2 == curr_CHROM)
750 {
751 diffsites << CHROM2 << "\t" << POS2 << "\t2\t";
752 N_SNPs_file2_only++;
753 new_e2 = true;
754
755 }
756 else
757 {
758 if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
759 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
760 else
761 {
762 curr_CHROM = CHROM2;
763 all_CHROM.push_back(CHROM2);
764 diffsites << CHROM2 << "\t" << POS2 << "\t2\t";
765 N_SNPs_file2_only++;
766 new_e2 = true;
767 }
768 }
769 }
770 else if(CHROM1 == CHROM2)
771 {
772 if (CHROM1 != curr_CHROM)
773 {
774 curr_CHROM = CHROM1;
775 all_CHROM.push_back(curr_CHROM);
776 }
777
778 if(POS1 == POS2)
779 {
780 if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") )
781 REF1 = REF2;
782 if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") )
783 REF2 = REF1;
784
785 new_e1 = true;
786 new_e2 = true;
787 if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != ""))
788 {
789 LOG.one_off_warning("Non-matching REF. Skipping all such sites.");
790 continue;
791 }
792 diffsites << CHROM1 << "\t" << POS1 << "\tB\t";
793 N_common_SNPs++;
794 }
795 else if(POS1 < POS2)
796 {
797 diffsites << CHROM1 << "\t" << POS1 << "\t1\t";
798 N_SNPs_file1_only++;
799 new_e1 = true;
800 }
801 else
802 {
803 diffsites << CHROM2 << "\t" << POS2 << "\t2\t";
804 N_SNPs_file2_only++;
805 new_e2 = true;
806 }
807 }
808 else
809 {
810 if (CHROM1 == curr_CHROM)
811 {
812 diffsites << CHROM1 << "\t" << POS1 << "\t1\t";
813 N_SNPs_file1_only++;
814 new_e1 = true;
815 }
816 else if (CHROM2 == curr_CHROM)
817 {
818 diffsites << CHROM2 << "\t" << POS2 << "\t2\t";
819 N_SNPs_file2_only++;
820 new_e2 = true;
821 }
822 else
823 {
824 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
825 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
826 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
827 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
828
829 LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");
830 }
831 }
832
833 pair<string, string> genotype1, genotype2;
834 pair<int,int> geno_ids1, geno_ids2;
835 pair<string, string> missing_genotype(".",".");
836 pair<int, int> missing_id(-1,-1);
837
838 unsigned int N_common_called=0; // Number of genotypes called in both files
839 unsigned int N_missing_1=0, N_missing_2=0;
840 unsigned int N_discord=0;
841 unsigned int N_concord_non_missing=0;
842
843 if(new_e1 && new_e2)
844 {
845 alleles_match = (ALT1 == ALT2) && (REF1 == REF2);
846 diffsites << alleles_match;
847
848 e1->parse_full_entry(true);
849 e1->parse_genotype_entries(true);
850 e2->parse_full_entry(true);
851 e2->parse_genotype_entries(true);
852
853 for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
854 {
855 indv1 = combined_individuals_it->second.first;
856 indv2 = combined_individuals_it->second.second;
857
858 if ((indv1 == -1) || (indv2 == -1))
859 continue; // Individual not found in one of the files
860
861 if (alleles_match)
862 { // Alleles match, so can compare ids instead of strings
863 e1->get_indv_GENOTYPE_ids(indv1, geno_ids1);
864 e2->get_indv_GENOTYPE_ids(indv2, geno_ids2);
865
866 if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id))
867 {
868 N_common_called++;
869 if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) ||
870 ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) )
871 { // Match
872 N_concord_non_missing++;
873 }
874 else
875 { // Mismatch
876 N_discord++;
877 }
878 }
879 else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id))
880 { // Both missing
881 N_missing_1++; N_missing_2++;
882 }
883 else if (geno_ids1 != missing_id)
884 { // Genotype 1 is not missing, genotype 2 is.
885 N_missing_2++;
886 }
887 else if (geno_ids2 != missing_id)
888 { // Genotype 2 is not missing, genotype 1 is.
889 N_missing_1++;
890 }
891 else
892 LOG.error("Unknown condition");
893 }
894 else
895 { // Alleles don't match, so need to be more careful and compare strings
896 e1->get_indv_GENOTYPE_strings(indv1, genotype1);
897 e2->get_indv_GENOTYPE_strings(indv2, genotype2);
898
899 if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
900 { // No missing data
901 N_common_called++;
902 if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
903 ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
904 { // Match
905 N_concord_non_missing++;
906 }
907 else
908 { // Mismatch
909 N_discord++;
910 }
911 }
912 else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype))
913 { // Both missing
914 N_missing_1++; N_missing_2++;
915 }
916 else if (genotype1 != missing_genotype)
917 { // Genotype 1 is not missing, genotype 2 is.
918 N_missing_2++;
919 }
920 else if (genotype2 != missing_genotype)
921 { // Genotype 2 is not missing, genotype 1 is.
922 N_missing_1++;
923 }
924 else
925 LOG.error("Unknown condition");
926 }
927 }
928 }
929 else
930 diffsites << "0";
931
932 double discordance = N_discord / double(N_common_called);
933 diffsites << "\t" << N_common_called << "\t" << N_discord << "\t" << discordance;
934 diffsites << endl;
935 }
936
937 LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
938 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
939 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
940 delete e1;
941 delete e2;
942 }
943
output_discordance_matrix(const parameters & params,variant_file & diff_variant_file)944 void variant_file::output_discordance_matrix(const parameters ¶ms, variant_file &diff_variant_file)
945 {
946 map<string, pair< int, int> > combined_individuals;
947 map<string, pair< int, int> >::iterator combined_individuals_it;
948 return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
949
950 LOG.printLOG("Outputting Discordance Matrix\n\tFor bi-allelic loci, called in both files, with matching alleles only...\n");
951 string CHROM;
952 vector<char> variant_line;
953 int indv1, indv2;
954 entry *e1 = get_entry_object();
955 entry *e2 = diff_variant_file.get_entry_object();
956
957 bool new_e1 = true;
958 bool new_e2 = true;
959 string CHROM1 = "";
960 string CHROM2 = "";
961 string curr_CHROM = "";
962 vector<string> all_CHROM;
963 int POS1 = -1;
964 int POS2 = -1;
965 string REF1 = "";
966 string REF2 = "";
967 string ALT1 = "";
968 string ALT2 = "";
969 int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
970 vector<vector<int> > discordance_matrix(4, vector<int>(4, 0));
971
972 if (combined_individuals.size() <= 0)
973 LOG.error("No overlapping individuals can be found.");
974
975 while(true)
976 {
977 if(new_e1)
978 {
979 while(!eof())
980 {
981 get_entry(variant_line);
982 e1->reset(variant_line);
983 N_entries += e1->apply_filters(params);
984
985 if(!e1->passed_filters)
986 continue;
987 N_kept_entries++;
988 e1->parse_basic_entry(true);
989 CHROM1 = e1->get_CHROM();
990 POS1 = e1->get_POS();
991 REF1 = e1->get_REF();
992 ALT1 = e1->get_ALT();
993 break;
994 }
995 new_e1 = false;
996 }
997 if(new_e2)
998 {
999 while(!diff_variant_file.eof())
1000 {
1001 diff_variant_file.get_entry(variant_line);
1002 e2->reset(variant_line);
1003 diff_variant_file.N_entries += e2->apply_filters(params);
1004
1005 if(!e2->passed_filters)
1006 continue;
1007 diff_variant_file.N_kept_entries++;
1008 e2->parse_basic_entry(true);
1009 CHROM2 = e2->get_CHROM();
1010 POS2 = e2->get_POS();
1011 REF2 = e2->get_REF();
1012 ALT2 = e2->get_ALT();
1013 break;
1014 }
1015 new_e2 = false;
1016 }
1017
1018 if(eof() && diff_variant_file.eof())
1019 break;
1020 else if(diff_variant_file.eof())
1021 {
1022 if(CHROM1 == curr_CHROM)
1023 {
1024 N_SNPs_file1_only++;
1025 new_e1 = true;
1026
1027 }
1028 else
1029 {
1030 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
1031 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
1032 else
1033 {
1034 curr_CHROM = CHROM1;
1035 all_CHROM.push_back(CHROM1);
1036 N_SNPs_file1_only++;
1037 new_e1 = true;
1038 }
1039 }
1040 }
1041 else if(eof())
1042 {
1043 if(CHROM2 == curr_CHROM)
1044 {
1045 N_SNPs_file2_only++;
1046 new_e2 = true;
1047
1048 }
1049 else
1050 {
1051 if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
1052 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
1053 else
1054 {
1055 curr_CHROM = CHROM2;
1056 all_CHROM.push_back(CHROM2);
1057 N_SNPs_file2_only++;
1058 new_e2 = true;
1059 }
1060 }
1061 }
1062 else if(CHROM1 == CHROM2)
1063 {
1064 if (CHROM1 != curr_CHROM)
1065 {
1066 curr_CHROM = CHROM1;
1067 all_CHROM.push_back(curr_CHROM);
1068 }
1069
1070 if(POS1 == POS2)
1071 {
1072 if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") )
1073 REF1 = REF2;
1074 if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") )
1075 REF2 = REF1;
1076
1077 new_e1 = true;
1078 new_e2 = true;
1079 if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != ""))
1080 {
1081 LOG.one_off_warning("Non-matching REF. Skipping all such sites.");
1082 continue;
1083 }
1084 N_common_SNPs++;
1085 }
1086 else if(POS1 < POS2)
1087 {
1088 N_SNPs_file1_only++;
1089 new_e1 = true;
1090 }
1091 else
1092 {
1093 N_SNPs_file2_only++;
1094 new_e2 = true;
1095 }
1096 }
1097 else
1098 {
1099 if (CHROM1 == curr_CHROM)
1100 {
1101 N_SNPs_file1_only++;
1102 new_e1 = true;
1103 }
1104 else if (CHROM2 == curr_CHROM)
1105 {
1106 N_SNPs_file2_only++;
1107 new_e2 = true;
1108 }
1109 else
1110 {
1111 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
1112 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
1113 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
1114 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
1115
1116 LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file.");
1117 }
1118 }
1119
1120 if(new_e1 && new_e2)
1121 {
1122 if (e1->get_N_alleles() != 2 || e2->get_N_alleles() != 2)
1123 continue;
1124
1125 if (ALT1 != ALT2)
1126 {
1127 LOG.one_off_warning("Non-matching ALT. Skipping all such sites.");
1128 continue;
1129 }
1130 e1->parse_full_entry(true);
1131 e1->parse_genotype_entries(true);
1132
1133 e2->parse_full_entry(true);
1134 e2->parse_genotype_entries(true);
1135
1136 pair<int,int> geno_ids1, geno_ids2;
1137 int N1, N2;
1138
1139 for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
1140 {
1141 indv1 = combined_individuals_it->second.first;
1142 indv2 = combined_individuals_it->second.second;
1143
1144 if ((indv1 == -1) || (indv2 == -1))
1145 {
1146 LOG.one_off_warning("Non-matching individual found. Skipping all such combinations.");
1147 continue; // Individual not found in one of the files
1148 }
1149
1150 // Alleles match, so can compare ids instead of strings
1151 e1->get_indv_GENOTYPE_ids(indv1, geno_ids1);
1152 e2->get_indv_GENOTYPE_ids(indv2, geno_ids2);
1153
1154 if (((geno_ids1.first != -1) && (geno_ids1.second == -1)) ||
1155 ((geno_ids2.first != -1) && (geno_ids2.second == -1)))
1156 { // Haploid
1157 LOG.one_off_warning("***Warning: Haploid chromosomes not counted!***");
1158 continue;
1159 }
1160
1161 N1 = geno_ids1.first + geno_ids1.second;
1162 N2 = geno_ids2.first + geno_ids2.second;
1163
1164 if ((N1 == -1) || (N1 < -2) || (N1 > 2))
1165 LOG.error("Unhandled case");
1166 if ((N2 == -1) || (N2 < -2) || (N2 > 2))
1167 LOG.error("Unhandled case");
1168
1169 if (N1 == -2)
1170 N1 = 3;
1171
1172 if (N2 == -2)
1173 N2 = 3;
1174
1175 discordance_matrix[N1][N2]++;
1176 }
1177 }
1178 }
1179
1180 string output_file = params.output_prefix + ".diff.discordance_matrix";
1181 streambuf * buf;
1182 ofstream temp_out;
1183 if (!params.stream_out)
1184 {
1185 temp_out.open(output_file.c_str(), ios::out);
1186 if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
1187 buf = temp_out.rdbuf();
1188 }
1189 else
1190 buf = cout.rdbuf();
1191
1192 ostream out(buf);
1193
1194 out << "-\tN_0/0_file1\tN_0/1_file1\tN_1/1_file1\tN_./._file1" << endl;
1195 out << "N_0/0_file2\t" << discordance_matrix[0][0] << "\t" << discordance_matrix[1][0] << "\t" << discordance_matrix[2][0] << "\t" << discordance_matrix[3][0] << endl;
1196 out << "N_0/1_file2\t" << discordance_matrix[0][1] << "\t" << discordance_matrix[1][1] << "\t" << discordance_matrix[2][1] << "\t" << discordance_matrix[3][1] << endl;
1197 out << "N_1/1_file2\t" << discordance_matrix[0][2] << "\t" << discordance_matrix[1][2] << "\t" << discordance_matrix[2][2] << "\t" << discordance_matrix[3][2] << endl;
1198 out << "N_./._file2\t" << discordance_matrix[0][3] << "\t" << discordance_matrix[1][3] << "\t" << discordance_matrix[2][3] << "\t" << discordance_matrix[3][3] << endl;
1199
1200 LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
1201 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
1202 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
1203 delete e1;
1204 delete e2;
1205 }
1206
output_switch_error(const parameters & params,variant_file & diff_variant_file)1207 void variant_file::output_switch_error(const parameters ¶ms, variant_file &diff_variant_file)
1208 {
1209 map<string, pair< int, int> > combined_individuals;
1210 map<string, pair< int, int> >::iterator combined_individuals_it;
1211 return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file);
1212
1213 LOG.printLOG("Outputting Phase Switch Errors...\n");
1214
1215 vector<char> variant_line;
1216 int indv1, indv2;
1217 entry *e1 = get_entry_object();
1218 entry *e2 = diff_variant_file.get_entry_object();
1219
1220 bool new_e1 = true;
1221 bool new_e2 = true;
1222 string CHROM1 = "";
1223 string CHROM2 = "";
1224 string curr_CHROM = "";
1225 vector<string> all_CHROM;
1226 int POS1 = -1;
1227 int POS2 = -1;
1228 string REF1 = "";
1229 string REF2 = "";
1230 string ALT1 = "";
1231 string ALT2 = "";
1232 int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0;
1233
1234 string output_file = params.output_prefix + ".diff.switch";
1235 streambuf * buf;
1236 ofstream temp_out;
1237 if (!params.stream_out)
1238 {
1239 temp_out.open(output_file.c_str(), ios::out);
1240 if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12);
1241 buf = temp_out.rdbuf();
1242 }
1243 else
1244 buf = cout.rdbuf();
1245
1246 ostream switcherror(buf);
1247
1248 switcherror << "CHROM\tPOS_START\tPOS_END\tINDV" << endl;
1249
1250 unsigned int N_combined_indv = combined_individuals.size();
1251 vector<int> N_phased_het_sites(N_combined_indv, 0);
1252 vector<int> N_switch_errors(N_combined_indv, 0);
1253
1254 pair<string, string> missing_genotype(".",".");
1255 pair<string, int> missing_loc(".",-1);
1256 vector<pair<string, string> > prev_geno_file1(N_combined_indv, missing_genotype);
1257 vector<pair<string, string> > prev_geno_file2(N_combined_indv, missing_genotype);
1258 vector<pair<string, int> > prev_pos_file1(N_combined_indv, missing_loc);
1259 vector<pair<string, int> > prev_pos_file2(N_combined_indv, missing_loc);
1260 pair<string, string> file1_hap1, file1_hap2, file2_hap1;
1261
1262 if (N_combined_indv <= 0)
1263 LOG.error("No overlapping individuals can be found.");
1264
1265 while(true)
1266 {
1267 if(new_e1)
1268 {
1269 while(!eof())
1270 {
1271 get_entry(variant_line);
1272 e1->reset(variant_line);
1273 N_entries += e1->apply_filters(params);
1274
1275 if(!e1->passed_filters)
1276 continue;
1277 N_kept_entries++;
1278 e1->parse_basic_entry(true);
1279 CHROM1 = e1->get_CHROM();
1280 POS1 = e1->get_POS();
1281 REF1 = e1->get_REF();
1282 ALT1 = e1->get_ALT();
1283 break;
1284 }
1285 new_e1 = false;
1286 }
1287 if(new_e2)
1288 {
1289 while(!diff_variant_file.eof())
1290 {
1291 diff_variant_file.get_entry(variant_line);
1292 e2->reset(variant_line);
1293 diff_variant_file.N_entries += e2->apply_filters(params);
1294
1295 if(!e2->passed_filters)
1296 continue;
1297 diff_variant_file.N_kept_entries++;
1298 e2->parse_basic_entry(true);
1299 CHROM2 = e2->get_CHROM();
1300 POS2 = e2->get_POS();
1301 REF2 = e2->get_REF();
1302 ALT2 = e2->get_ALT();
1303 break;
1304 }
1305 new_e2 = false;
1306 }
1307
1308 if(eof() && diff_variant_file.eof())
1309 break;
1310 else if(diff_variant_file.eof())
1311 {
1312 if(CHROM1 == curr_CHROM)
1313 {
1314 N_SNPs_file1_only++;
1315 new_e1 = true;
1316 }
1317 else
1318 {
1319 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
1320 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
1321 else
1322 {
1323 curr_CHROM = CHROM1;
1324 all_CHROM.push_back(CHROM1);
1325 N_SNPs_file1_only++;
1326 new_e1 = true;
1327 }
1328 }
1329 }
1330 else if(eof())
1331 {
1332 if(CHROM2 == curr_CHROM)
1333 {
1334 N_SNPs_file2_only++;
1335 new_e2 = true;
1336 }
1337 else
1338 {
1339 if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
1340 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
1341 else
1342 {
1343 curr_CHROM = CHROM2;
1344 all_CHROM.push_back(CHROM2);
1345 N_SNPs_file2_only++;
1346 new_e2 = true;
1347 }
1348 }
1349 }
1350 else if(CHROM1 == CHROM2)
1351 {
1352 if (CHROM1 != curr_CHROM)
1353 {
1354 curr_CHROM = CHROM1;
1355 all_CHROM.push_back(curr_CHROM);
1356 }
1357
1358 if(POS1 == POS2)
1359 {
1360 N_common_SNPs++;
1361 new_e1 = true;
1362 new_e2 = true;
1363 }
1364 else if(POS1 < POS2)
1365 {
1366 N_SNPs_file1_only++;
1367 new_e1 = true;
1368 }
1369 else
1370 {
1371 N_SNPs_file2_only++;
1372 new_e2 = true;
1373 }
1374 }
1375 else
1376 {
1377 if (CHROM1 == curr_CHROM)
1378 {
1379 N_SNPs_file1_only++;
1380 new_e1 = true;
1381 }
1382 else if (CHROM2 == curr_CHROM)
1383 {
1384 N_SNPs_file2_only++;
1385 new_e2 = true;
1386 }
1387 else
1388 {
1389 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end())
1390 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order.");
1391 if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end())
1392 LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order.");
1393
1394 LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file."); }
1395 }
1396 if(new_e1 && new_e2)
1397 {
1398 e1->parse_full_entry(true);
1399 e1->parse_genotype_entries(true);
1400
1401 e2->parse_full_entry(true);
1402 e2->parse_genotype_entries(true);
1403
1404 pair<string, string> genotype1, genotype2;
1405 pair<string, string> missing_genotype(".",".");
1406
1407 unsigned int N_common_called=0; // Number of genotypes called in both files
1408 unsigned int indv_count=0;
1409
1410 for (combined_individuals_it=combined_individuals.begin();
1411 combined_individuals_it!=combined_individuals.end();
1412 ++combined_individuals_it, indv_count++)
1413 {
1414 indv1 = combined_individuals_it->second.first;
1415 indv2 = combined_individuals_it->second.second;
1416
1417 if ((indv1 == -1) || (indv2 == -1))
1418 {
1419 LOG.one_off_warning("Non-matching individual found. Skipping all such combinations.");
1420 continue; // Individual not found in one of the files
1421 }
1422
1423 e1->get_indv_GENOTYPE_strings(indv1, genotype1);
1424 e2->get_indv_GENOTYPE_strings(indv2, genotype2);
1425
1426 if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype))
1427 { // No missing data
1428 N_common_called++;
1429 if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) ||
1430 ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) )
1431 { // Have a matching genotypes in files 1 and 2
1432 if (genotype1.first != genotype1.second)
1433 { // It's a heterozgote
1434 char phase1, phase2;
1435 phase1 = e1->get_indv_PHASE(indv1);
1436 phase2 = e2->get_indv_PHASE(indv2);
1437 if ((phase1 == '|') && (phase2 == '|'))
1438 { // Calculate Phasing error (switch error)
1439 N_phased_het_sites[indv_count]++;
1440 file1_hap1 = make_pair<string,string>((string)prev_geno_file1[indv_count].first, (string)genotype1.first);
1441 file1_hap2 = make_pair<string,string>((string)prev_geno_file1[indv_count].second, (string)genotype1.second);
1442 file2_hap1 = make_pair<string,string>((string)prev_geno_file2[indv_count].first, (string)genotype2.first);
1443
1444 if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2))
1445 { // Must be a switch error
1446 string indv_id;
1447 N_switch_errors[indv_count]++;
1448 if (indv1 != -1)
1449 indv_id = meta_data.indv[indv1];
1450 else
1451 indv_id = diff_variant_file.meta_data.indv[indv2];
1452
1453 if (prev_pos_file1[indv_count].first == prev_pos_file2[indv_count].first)
1454 {
1455 if (prev_pos_file1[indv_count].second <= prev_pos_file2[indv_count].second)
1456 switcherror << prev_pos_file1[indv_count].first << "\t" << prev_pos_file1[indv_count].second << "\t" << POS1 << "\t" << indv_id << endl;
1457 else
1458 switcherror << prev_pos_file1[indv_count].first << "\t" << prev_pos_file2[indv_count].second << "\t" << POS1 << "\t" << indv_id << endl;
1459 }
1460 }
1461 prev_geno_file1[indv_count] = genotype1;
1462 prev_geno_file2[indv_count] = genotype2;
1463 prev_pos_file1[indv_count] = std::pair<string,int>(CHROM1,POS1);
1464 prev_pos_file2[indv_count] = std::pair<string,int>(CHROM2,POS2);
1465 }
1466 }
1467 }
1468 }
1469 }
1470 }
1471 }
1472 delete e1;
1473 delete e2;
1474
1475 output_file = params.output_prefix + ".diff.indv.switch";
1476 ofstream idiscord(output_file.c_str());
1477 if (!idiscord.is_open())
1478 LOG.error("Could not open Individual Discordance File: " + output_file, 3);
1479
1480 idiscord << "INDV\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl;
1481 unsigned int indv_count=0;
1482 double switch_error;
1483 string indv_id;
1484 for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it)
1485 {
1486 indv1 = combined_individuals_it->second.first;
1487 indv2 = combined_individuals_it->second.second;
1488
1489 if (indv1 != -1)
1490 indv_id = meta_data.indv[indv1];
1491 else
1492 indv_id = diff_variant_file.meta_data.indv[indv2];
1493
1494 if (N_phased_het_sites[indv_count] > 0)
1495 switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count];
1496 else
1497 switch_error = 0;
1498 idiscord << indv_id << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl;
1499
1500 indv_count++;
1501 }
1502 idiscord.close();
1503
1504 LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n");
1505 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n");
1506 LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n");
1507 }
1508