1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2013-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // phasedstacks -- analyse phased data, descended from a Stacks analysis.
23 //
24
25 #include "phasedstacks.h"
26
27 // Global variables to hold command-line options.
28 FileT in_file_type = FileT::unknown;
29 int num_threads = 1;
30 int batch_id = 0;
31 string cat_path;
32 string in_path;
33 string out_path;
34 string out_file;
35 string pmap_path;
36 bool haplotypes = false;
37 bool write_zeros = true;
38 double p_value_cutoff = 0.05;
39 double chi_sq_limit = 3.84;
40 double minor_freq_lim = 0.1;
41 double min_inform_pairs = 0.90;
42 uint max_pair_dist = 1000000;
43 uint bucket_dist = 5000;
44 double dprime_threshold = false;
45 double dprime_threshold_level = 0.0;
46
47 set<int> whitelist, blacklist;
48
49 map<string, int> pop_map;
50 map<int, int> pop_cnts;
51
main(int argc,char * argv[])52 int main (int argc, char* argv[]) {
53 IF_NDEBUG_TRY
54
55 parse_command_line(argc, argv);
56
57 if (p_value_cutoff == 0.1) {
58 chi_sq_limit = 2.71;
59 } else if (p_value_cutoff == 0.05) {
60 chi_sq_limit = 3.84;
61 } else if (p_value_cutoff == 0.01) {
62 chi_sq_limit = 6.64;
63 } else if (p_value_cutoff == 0.001) {
64 chi_sq_limit = 10.83;
65 }
66
67 cerr << "Minor allele frequency cutoff: " << minor_freq_lim << "\n"
68 << "Looking for ";
69 switch(in_file_type) {
70 case FileT::beagle:
71 cerr << "Beagle";
72 break;
73 case FileT::phase:
74 cerr << "PHASE";
75 break;
76 case FileT::fastphase:
77 default:
78 cerr << "fastPhase";
79 break;
80 }
81 cerr << " input files.\n"
82 << "Size of buckets for binning D' values at a particular distance: " << bucket_dist / 1000 << "kb.\n";
83 if (dprime_threshold)
84 cerr << "D' Threshold set at " << dprime_threshold_level << ". D' values above this limit will be set to 1.0, values below will be set to 0.0.\n";
85
86 //
87 // Parse the population map.
88 //
89 parse_population_map(pmap_path, pop_map, pop_cnts);
90
91 //
92 // Set the number of OpenMP parallel threads to execute.
93 //
94 #ifdef _OPENMP
95 omp_set_num_threads(num_threads);
96 #endif
97
98 vector<pair<int, string> > files;
99 if (!build_file_list(files))
100 exit(1);
101
102 cerr << "Identified " << files.size() << " files.\n";
103
104 //
105 // Open the log file.
106 //
107 stringstream log;
108 log << "phasedstacks.log";
109 string log_path = in_path + log.str();
110 ofstream log_fh(log_path.c_str(), ofstream::out);
111
112 if (log_fh.fail()) {
113 cerr << "Error opening log file '" << log_path << "'\n";
114 exit(1);
115 }
116
117 init_log(log_fh, argc, argv);
118
119 //
120 // Load the catalog
121 //
122 cerr << "Parsing the catalog...\n";
123 stringstream catalog_file;
124 map<int, CSLocus *> catalog;
125 bool compressed = false;
126 catalog_file << cat_path << "batch_" << batch_id << ".catalog";
127 int res = load_loci(catalog_file.str(), catalog, 0, false, compressed);
128 if (res == 0) {
129 cerr << "Error: Unable to load the catalog '" << catalog_file.str() << "'\n";
130 throw exception();
131 }
132 cerr << "done.\n";
133
134 //
135 // Implement the black/white list
136 //
137 reduce_catalog(catalog, whitelist, blacklist);
138
139 map<int, int> fgt_block_lens, fgt_snp_cnts;
140 map<int, int> dp_block_lens, dp_snp_cnts;
141
142 //
143 // Vectors to store D' measures of SNPs at bucketed distances.
144 //
145 vector<double> dprime_buckets, dprime_bucket_cnts;
146
147 for (uint i = 0; i < files.size(); i++) {
148
149 // if (files[i].second != "batch_1.groupV.phase") continue;
150
151 PhasedSummary *psum = NULL;
152
153 if (in_file_type == FileT::fastphase) {
154 if ((psum = parse_fastphase(in_path + files[i].second)) == NULL) {
155 cerr << "Unable to parse fastPhase input files.\n";
156 exit(1);
157 }
158 } else if (in_file_type == FileT::beagle && haplotypes) {
159 if ((psum = parse_beagle_haplotypes(catalog, in_path + files[i].second)) == NULL) {
160 cerr << "Unable to parse Beagle input files.\n";
161 exit(1);
162 }
163 } else if (in_file_type == FileT::beagle) {
164 if ((psum = parse_beagle(catalog, in_path + files[i].second)) == NULL) {
165 cerr << "Unable to parse Beagle input files.\n";
166 exit(1);
167 }
168 }
169
170 //
171 // Summarize the genotypes in the populations.
172 //
173 summarize_phased_genotypes(psum);
174
175 // for (uint j = 0; j < psum->size; j++) {
176 // cerr << "BP: " << psum->nucs[j].bp << "\t"
177 // << "A: " << std::setw(3) << psum->nucs[j].nuc[0] << " "
178 // << "C: " << std::setw(3) << psum->nucs[j].nuc[1] << " "
179 // << "G: " << std::setw(3) << psum->nucs[j].nuc[2] << " "
180 // << "T: " << std::setw(3) << psum->nucs[j].nuc[3] << "\n";
181 // }
182
183 //
184 // Calculate D'
185 //
186 cerr << "Calculating D'...";
187 calc_dprime(psum);
188 cerr << "done.\n";
189
190 write_dprime(in_path + files[i].second, psum);
191
192 //
193 // Generate haplotype blocks based on D'.
194 //
195 dprime_blocks(in_path + files[i].second, pop_map, psum, dp_block_lens, dp_snp_cnts);
196
197 //
198 // Generate haplotype blocks using the four gamete test.
199 //
200 four_gamete_test(in_path + files[i].second, pop_map, psum, fgt_block_lens, fgt_snp_cnts);
201
202 //
203 // Bucket the D' measures by distance between SNPs.
204 //
205 bucket_dprime(dprime_buckets, dprime_bucket_cnts, psum);
206
207 //
208 // Free the Samples objects
209 //
210 delete psum;
211 }
212
213 //
214 // Write average D' values bucketed according to their distance in the genome.
215 //
216 write_buckets(in_path, dprime_buckets, dprime_bucket_cnts);
217
218 //
219 // Write the FGT bucketed distances.
220 //
221 log_fh << "# Distribution of FGT haplotype block lengths.\n";
222 map<int, int>::iterator buck_it;
223 for (buck_it = fgt_block_lens.begin(); buck_it != fgt_block_lens.end(); buck_it++)
224 log_fh << buck_it->first << "\t" << buck_it->second << "\n";
225
226 //
227 // Write the FGT bucketed SNP counts.
228 //
229 log_fh << "\n\n"
230 << "# Distribution of FGT SNP counts per haplotype block.\n";
231 for (buck_it = fgt_snp_cnts.begin(); buck_it != fgt_snp_cnts.end(); buck_it++)
232 log_fh << buck_it->first << "\t" << buck_it->second << "\n";
233
234 //
235 // Write the D' haplotype block bucketed distances.
236 //
237 log_fh << "\n\n"
238 << "# Distribution of D' haplotype block lengths.\n";
239 for (buck_it = dp_block_lens.begin(); buck_it != dp_block_lens.end(); buck_it++)
240 log_fh << buck_it->first << "\t" << buck_it->second << "\n";
241
242 //
243 // Write the D' bucketed SNP counts.
244 //
245 log_fh << "\n\n"
246 << "# Distribution of D' SNP counts per haplotype block.\n";
247 for (buck_it = dp_snp_cnts.begin(); buck_it != dp_snp_cnts.end(); buck_it++)
248 log_fh << buck_it->first << "\t" << buck_it->second << "\n";
249
250 log_fh.close();
251
252 return 0;
253 IF_NDEBUG_CATCH_ALL_EXCEPTIONS
254 }
255
256 int
bucket_dprime(vector<double> & dprime_buckets,vector<double> & dprime_bucket_cnts,PhasedSummary * psum)257 bucket_dprime(vector<double> &dprime_buckets, vector<double> &dprime_bucket_cnts, PhasedSummary *psum)
258 {
259 uint bucket, dist, max_bucket;
260 uint max_dist = 0;
261
262 //
263 // Check that we have enough buckets in our vectors. Find the maximum distance between
264 // SNPs on this chromosome and add buckets as necessary.
265 //
266 for (uint i = 0; i < psum->size; i++) {
267 for (uint j = i+1; j < psum->size; j++) {
268
269 if (psum->nucs[i].freq < minor_freq_lim ||
270 psum->nucs[j].freq < minor_freq_lim)
271 continue;
272
273 if (write_zeros == false && psum->dprime[i][j].chisq_p == false)
274 continue;
275
276 dist = psum->nucs[j].bp - psum->nucs[i].bp;
277 max_dist = dist > max_dist ? dist : max_dist;
278 }
279 }
280 max_bucket = max_dist / bucket_dist;
281 if (dprime_buckets.size() < max_bucket) {
282 uint cnt = max_bucket + 1 - dprime_buckets.size();
283 for (uint i = 0; i < cnt; i++) {
284 dprime_buckets.push_back(0.0);
285 dprime_bucket_cnts.push_back(0.0);
286 }
287 }
288
289 //
290 // Populate buckets
291 //
292 for (uint i = 0; i < psum->size; i++) {
293 for (uint j = i+1; j < psum->size; j++) {
294
295 if (psum->nucs[i].freq < minor_freq_lim ||
296 psum->nucs[j].freq < minor_freq_lim)
297 continue;
298
299 if (write_zeros == false && psum->dprime[i][j].chisq_p == false)
300 continue;
301
302 bucket = ((psum->nucs[j].bp - psum->nucs[i].bp) / bucket_dist);
303
304 dprime_buckets[bucket] += (psum->dprime[i][j].chisq_p ? psum->dprime[i][j].dprime : 0.0);
305 dprime_bucket_cnts[bucket]++;
306 }
307 }
308
309 return 0;
310 }
311
312 int
write_buckets(string path,vector<double> & dprime_buckets,vector<double> & dprime_bucket_cnts)313 write_buckets(string path, vector<double> &dprime_buckets, vector<double> &dprime_bucket_cnts)
314 {
315 //
316 // Write the bucketed D' data for plotting.
317 //
318 stringstream file;
319 file << path << "Dprime_dist_buckets" << bucket_dist/1000 << "kb.tsv";
320
321 cerr << "Writing bucketed D' data to '" << file.str() << "'...";
322
323 ofstream fh(file.str().c_str(), ofstream::out);
324
325 if (fh.fail()) {
326 cerr << "Error opening D' file '" << file.str() << "'\n";
327 exit(1);
328 }
329
330 fh << "# Distance (Kb)\tD' Average\n";
331
332 for (uint i = 0; i < dprime_buckets.size(); i++)
333 fh << (i * bucket_dist) << "\t"
334 << std::setprecision(3) << (dprime_buckets[i] / dprime_bucket_cnts[i]) << "\n";
335
336 fh.close();
337
338 cerr << "done\n";
339
340 return 0;
341 }
342
343 int
four_gamete_test(string path,map<string,int> & pop_map,PhasedSummary * psum,map<int,int> & len_buckets,map<int,int> & snp_buckets)344 four_gamete_test(string path, map<string, int> &pop_map, PhasedSummary *psum, map<int, int> &len_buckets, map<int, int> &snp_buckets)
345 {
346 //
347 // Write haplotypes as found by the four gamete test:
348 // Wang, et al., Am. J. Hum. Genet. 71:1227–1234, 2002
349 //
350 string file = path + ".fgt.tsv";
351
352 cerr << "Determining four gamete test haplotypes blocks, writing to:\n '" << file << "'...\n";
353
354 ofstream fh(file.c_str(), ofstream::out);
355
356 if (fh.fail()) {
357 cerr << "Error opening FGT file '" << file << "'\n";
358 exit(1);
359 }
360
361 fh << "# ID\tStart\tEnd\tLen\tSNP Count\tHaplotype Count\tHaplotype\tPopulations\tHapPopCnt\n";
362
363 uint id = 1;
364 uint start, end=-1, cnt, dist;
365 bool bound;
366 map<int, int> buckets, snps;
367
368 for (uint i = 0; i < psum->size; i++) {
369 if (psum->nucs[i].freq < minor_freq_lim)
370 continue;
371
372 //
373 // Start a new block.
374 //
375 start = i;
376 bound = false;
377 cnt = 0;
378 uint j = i;
379
380 do {
381 if (psum->nucs[j].freq < minor_freq_lim) {
382 j++;
383 continue;
384 }
385
386 for (int k = j; k >= (int) start; k--) {
387
388 if (psum->nucs[k].freq < minor_freq_lim)
389 continue;
390
391 if (psum->recomb[k][j] == true) {
392 bound = true;
393 end = j;
394 }
395 }
396
397 j++;
398 cnt++;
399 } while (bound == false && j < psum->size);
400
401 if (j == psum->size)
402 end = j - 1;
403
404 fh << id << "\t"
405 << psum->nucs[start].bp << "\t"
406 << psum->nucs[end].bp << "\t"
407 << psum->nucs[end].bp - psum->nucs[start].bp + 1 << "\t"
408 << cnt << "\t";
409 //
410 // Bucket the SNP counts for plotting.
411 //
412 snps[cnt]++;
413
414 //
415 // Bucket the haplotype block lengths for plotting.
416 //
417 dist = (psum->nucs[end].bp - psum->nucs[start].bp + 1) / 10000 * 10000;
418 buckets[dist]++;
419
420 enumerate_haplotypes(fh, pop_map, psum, start, end);
421
422 i = end;
423 id++;
424 }
425
426 //
427 // Write the bucketed distances.
428 //
429 fh << "\n\n"
430 << "# Distribution of FGT haplotype block lengths.\n";
431 map<int, int>::iterator it;
432 for (it = buckets.begin(); it != buckets.end(); it++) {
433 fh << it->first << "\t" << it->second << "\n";
434
435 len_buckets[it->first] += it->second;
436 }
437
438 //
439 // Write the bucketed SNP counts.
440 //
441 fh << "\n\n"
442 << "# Distribution of SNP counts per FGT haplotype block.\n";
443 for (it = snps.begin(); it != snps.end(); it++) {
444 fh << it->first << "\t" << it->second << "\n";
445
446 snp_buckets[it->first] += it->second;
447 }
448
449 fh.close();
450
451 cerr << "done.\n";
452
453 return 0;
454 }
455
456 int
dprime_blocks(string path,map<string,int> & pop_map,PhasedSummary * psum,map<int,int> & len_buckets,map<int,int> & snp_buckets)457 dprime_blocks(string path, map<string, int> &pop_map, PhasedSummary *psum, map<int, int> &len_buckets, map<int, int> &snp_buckets)
458 {
459 //
460 // Generate haplotype blocks according to strength of linkage disequilibrium measured using D'.
461 // Stacey B. Gabriel et al. (2002). The Structure of Haplotype Blocks in the Human Genome. Science 296:2225-2229
462 //
463 string file = path + ".dpblocks.tsv";
464
465 cerr << "Determining D' haplotypes blocks, writing to:\n '" << file << "'...\n";
466
467 ofstream fh(file.c_str(), ofstream::out);
468
469 if (fh.fail()) {
470 cerr << "Error opening D' blocks file '" << file << "'\n";
471 exit(1);
472 }
473
474 fh << "# ID\tStart\tEnd\tLen\tSNP Count\tHaplotype Count\tHaplotype\tPopulations\tHapPopCnt\n";
475
476 uint dist;
477 set<int> loci;
478 vector<pair<int, int> > ld_pairs;
479 map<int, vector<int> > ld_map;
480 map<int, int> buckets, snps;
481
482 uint tot_pairs = 0;
483 uint recomb_pairs = 0;
484
485 for (uint i = 0; i < psum->size; i++) {
486 if (psum->nucs[i].freq < minor_freq_lim)
487 continue;
488
489 for (uint j = i+1; j < psum->size; j++) {
490 if (psum->nucs[j].freq < minor_freq_lim)
491 continue;
492
493 tot_pairs++;
494 dist = psum->nucs[j].bp - psum->nucs[i].bp + 1;
495
496 //
497 // Does this pair of markers show a strong measure of LD?
498 //
499 if (psum->dprime[i][j].ci_high > 0.98 &&
500 psum->dprime[i][j].ci_low > 0.7 &&
501 dist <= max_pair_dist) {
502 psum->dprime[i][j].type = strong_ld;
503
504 ld_pairs.push_back(make_pair(i, j));
505 ld_map[i].push_back(j);
506
507 loci.insert(i);
508 loci.insert(j);
509 }
510
511 //
512 // Does this pair of markers show a strong measure of historical recombination?
513 //
514 if (psum->dprime[i][j].ci_high < 0.9) {
515 psum->dprime[i][j].type = recomb;
516 recomb_pairs++;
517 }
518 }
519 }
520
521 // map<int, vector<int> >::iterator it;
522 // for (it = ld_map.begin(); it != ld_map.end(); it++) {
523 // cerr << " " << it->first << " ->\n";
524 // for (uint i = 0; i < it->second.size(); i++)
525 // cerr << " " << it->second[i] << " dist: " << (psum->nucs[it->second[i]].bp - psum->nucs[it->first].bp + 1) << "bp\n";
526 // }
527
528 cerr << " Total pairs examined: " << tot_pairs
529 << "; Strong LD pairs: " << ld_pairs.size()
530 << "; Recombination pairs: " << recomb_pairs
531 << "; Informative markers: " << std::setprecision(3)
532 << ((double) (ld_pairs.size() + recomb_pairs) / (double) tot_pairs) * 100 << "%\n";
533
534 //
535 // Convert our list of loci into an ordered, linked list, where each node
536 // represents a haplotype block.
537 //
538 dPrimeBlocks blocks;
539 blocks.initialize(loci);
540
541 //
542 // Merge nodes together where D' is strong enough to maintain the block.
543 //
544 HBlock *cur;
545
546 cur = blocks.head();
547
548 do {
549 //
550 // Can we merge these two nodes together?
551 //
552 if (check_adjacent_blocks(psum, cur)) {
553 // cerr << " Merging blocks: ";
554 // for (uint i = 0; i < cur->loci.size(); i++)
555 // cerr << cur->loci[i] << ", ";
556 // cerr << " and ";
557 // for (uint i = 0; i < cur->next->loci.size(); i++)
558 // cerr << cur->next->loci[i] << ", ";
559 // cerr << "\n";
560 blocks.merge_adjacent(cur);
561 } else {
562 cur = cur->next;
563 }
564 } while (cur->next != NULL);
565
566 // blocks.print();
567
568 //
569 // Write the blocks.
570 //
571 uint id = 1;
572 uint start, end;
573
574 cur = blocks.head();
575
576 do {
577 start = *(cur->loci.begin());
578 end = *(cur->loci.rbegin());
579
580 fh << id << "\t"
581 << psum->nucs[start].bp << "\t"
582 << psum->nucs[end].bp << "\t"
583 << psum->nucs[end].bp - psum->nucs[start].bp + 1 << "\t"
584 << cur->loci.size() << "\t";
585
586 //
587 // Bucket the SNP counts for plotting.
588 //
589 snps[cur->loci.size()]++;
590
591 //
592 // Bucket the haplotype block lengths for plotting.
593 //
594 dist = (psum->nucs[end].bp - psum->nucs[start].bp + 1) / 10000 * 10000;
595 buckets[dist]++;
596
597 enumerate_haplotypes(fh, pop_map, psum, start, end);
598
599 id++;
600 cur = cur->next;
601 } while (cur != NULL);
602
603 //
604 // Write the bucketed distances.
605 //
606 fh << "\n\n"
607 << "# Distribution of D' haplotype block lengths.\n";
608 map<int, int>::iterator it;
609 for (it = buckets.begin(); it != buckets.end(); it++) {
610 fh << it->first << "\t" << it->second << "\n";
611
612 len_buckets[it->first] += it->second;
613 }
614
615 //
616 // Write the bucketed SNP counts.
617 //
618 fh << "\n\n"
619 << "# Distribution of SNP counts per D' haplotype block.\n";
620 for (it = snps.begin(); it != snps.end(); it++) {
621 fh << it->first << "\t" << it->second << "\n";
622
623 snp_buckets[it->first] += it->second;
624 }
625
626 fh.close();
627
628 cerr << "done.\n";
629
630 return 0;
631 }
632
633 bool
check_adjacent_blocks(PhasedSummary * psum,HBlock * block)634 check_adjacent_blocks(PhasedSummary *psum, HBlock *block)
635 {
636 //
637 // Create a list of all loci contained in the two blocks.
638 //
639 uint start = *(block->loci.begin());
640 uint end = *(block->next->loci.rbegin());
641
642 //
643 // Check the D' measure between each pair in the proposed combined block.
644 //
645 double tot = 0.0;
646 double strong_ld = 0.0;
647
648 for (uint i = start; i <= end; i++) {
649 if (psum->nucs[i].freq < minor_freq_lim)
650 continue;
651
652 for (uint j = i + 1; j <= end; j++) {
653 if (psum->dprime[i][j].type == uninformative ||
654 psum->nucs[j].freq < minor_freq_lim)
655 continue;
656
657 tot++;
658 if (psum->dprime[i][j].type == strong_ld)
659 strong_ld++;
660 }
661 }
662
663 // cerr << "Comparing range " << start << " to " << end
664 // << "; total pairs: " << tot << "; strong LD: " << strong_ld
665 // << "; proportion: " << std::setprecision(3) << strong_ld / tot << "\n";
666
667 if (strong_ld / tot >= min_inform_pairs)
668 return true;
669
670 return false;
671 }
672
673 HBlock *
merge_adjacent(HBlock * a)674 dPrimeBlocks::merge_adjacent(HBlock *a)
675 {
676 //
677 // Merge two adjacent nodes.
678 //
679 HBlock *b = a->next;
680
681 for (uint i = 0; i < b->loci.size(); i++)
682 a->loci.push_back(b->loci[i]);
683 a->next = b->next;
684 delete b;
685 return a;
686 }
687
688 HBlock *
initialize(set<int> & loci)689 dPrimeBlocks::initialize(set<int> &loci)
690 {
691 set<int>::iterator it, prev_it;
692 HBlock *cur, *next;
693
694 this->_head = new HBlock;
695 it = loci.begin();
696 this->_head->loci.push_back(*it);
697 it++;
698
699 // //
700 // // Create a node from each locus and add to it all immediately adjacent loci.
701 // //
702 // do {
703 // this->_head->loci.push_back(*it);
704 // prev_it = it;
705 // it++;
706 // } while (it != loci.end() && (*prev_it) + 1 == *it);
707
708 next = this->_head;
709
710 // if (it == loci.end()) return this->_head;
711
712 do {
713 cur = new HBlock;
714 cur->loci.push_back(*it);
715 it++;
716
717 // do {
718 // cur->loci.push_back(*it);
719 // prev_it = it;
720 // it++;
721 // } while (it != loci.end() &&
722 // (*prev_it) + 1 == *it);
723
724 next->next = cur;
725 next = next->next;
726
727 } while (it != loci.end());
728
729 return this->_head;
730 }
731
732 int
print()733 dPrimeBlocks::print()
734 {
735 HBlock *cur = this->_head;
736 while (cur != NULL) {
737 for (uint i = 0; i < cur->loci.size(); i++) {
738 if (i > 0)
739 cerr << ", ";
740 cerr << cur->loci[i];
741 }
742 cerr << "\n";
743
744 cur = cur->next;
745 }
746
747 return 0;
748 }
749
750 int
enumerate_haplotypes(ofstream & fh,map<string,int> & pop_map,PhasedSummary * psum,uint start,uint end)751 enumerate_haplotypes(ofstream &fh, map<string, int> &pop_map, PhasedSummary *psum, uint start, uint end)
752 {
753 map<string, map<int, int> >::iterator it;
754 map<string, map<int, int> > haplotypes;
755 map<int, int>::iterator sit;
756 string haplotype;
757 set<int> pops;
758
759 //
760 // Enumerate all haplotypes occurring in this block.
761 //
762 for (uint k = 0; k < psum->sample_cnt; k++) {
763
764 for (uint n = start; n <= end; n++)
765 if (psum->nucs[n].freq >= minor_freq_lim)
766 haplotype += psum->samples[k].nucs_1[n];
767
768 pops.insert(pop_map[psum->samples[k].name]);
769
770 if (haplotypes.count(haplotype) == 0)
771 haplotypes[haplotype][pop_map[psum->samples[k].name]] = 1;
772 else
773 haplotypes[haplotype][pop_map[psum->samples[k].name]]++;
774 haplotype.clear();
775 }
776
777 for (uint k = 0; k < psum->sample_cnt; k++) {
778
779 for (uint n = start; n <= end; n++)
780 if (psum->nucs[n].freq >= minor_freq_lim)
781 haplotype += psum->samples[k].nucs_2[n];
782
783 pops.insert(pop_map[psum->samples[k].name]);
784
785 if (haplotypes.count(haplotype) == 0)
786 haplotypes[haplotype][pop_map[psum->samples[k].name]] = 1;
787 else
788 haplotypes[haplotype][pop_map[psum->samples[k].name]]++;
789 haplotype.clear();
790 }
791
792 //
793 // Write the haplotypes.
794 //
795 float tot = 0.0;
796
797 fh << haplotypes.size() << "\t";
798 for (it = haplotypes.begin(); it != haplotypes.end(); it++) {
799 //
800 // Haplotypes are stored per population; sum them up here.
801 //
802 for (sit = it->second.begin(); sit != it->second.end(); sit++)
803 tot += sit->second;
804
805 if (it != haplotypes.begin())
806 fh << ",";
807 fh << it->first << "|"
808 << std::setprecision(3) << tot / ((float) psum->sample_cnt * 2.0);
809 }
810 fh << "\t";
811
812 set<int>::iterator pit;
813 stringstream pops_str;
814
815 //
816 // Write which populations this haplotype block occurs in.
817 //
818 if (pops.size() == 0)
819 fh << "-1\t";
820 else
821 for (pit = pops.begin(); pit != pops.end(); pit++)
822 pops_str << *pit << ",";
823 fh << pops_str.str().substr(0, pops_str.str().length()-1);
824 pops_str.str("");
825
826 //
827 // Write the frequency of occurence of each haplotype in each population.
828 //
829 for (it = haplotypes.begin(); it != haplotypes.end(); it++) {
830 pops_str << "\t";
831 for (pit = pops.begin(); pit != pops.end(); pit++)
832 pops_str << (it->second)[*pit] << "|"
833 << std::setprecision(3)
834 << (float) (it->second)[*pit] / (float) (pop_cnts[*pit] * 2.0) << ",";
835 fh << pops_str.str().substr(0, pops_str.str().length()-1);
836 pops_str.str("");
837 }
838 fh << "\n";
839
840 return 0;
841 }
842
843 int
calc_dprime(PhasedSummary * psum)844 calc_dprime(PhasedSummary *psum)
845 {
846 #pragma omp parallel
847 {
848 char allele_A, allele_a, allele_B, allele_b;
849 double freq_A, freq_a, freq_B, freq_b;
850 double freq_AB, freq_Ab, freq_aB, freq_ab;
851 double D, min, var, chisq;
852 double tot = psum->sample_cnt * 2.0;
853 uint hap_cnt;
854
855 #pragma omp for schedule(dynamic, 1)
856 for (uint i = 0; i < psum->size; i++) {
857 //
858 // Assign nucleotides to allele A, and a.
859 //
860 assign_alleles(psum->nucs[i], allele_A, allele_a, freq_A, freq_a);
861
862 for (uint j = i+1; j < psum->size; j++) {
863 //
864 // Assign nucleotides to allele B, and b.
865 //
866 assign_alleles(psum->nucs[j], allele_B, allele_b, freq_B, freq_b);
867
868 freq_AB = 0.0;
869 freq_Ab = 0.0;
870 freq_aB = 0.0;
871 freq_ab = 0.0;
872 hap_cnt = 0;
873 D = 0.0;
874
875 //
876 // Tally up haplotype frequencies.
877 //
878 for (uint k = 0; k < psum->sample_cnt; k++) {
879
880 if (psum->samples[k].nucs_1[i] == allele_A &&
881 psum->samples[k].nucs_1[j] == allele_B)
882 freq_AB++;
883 else if (psum->samples[k].nucs_1[i] == allele_A &&
884 psum->samples[k].nucs_1[j] == allele_b)
885 freq_Ab++;
886 else if (psum->samples[k].nucs_1[i] == allele_a &&
887 psum->samples[k].nucs_1[j] == allele_B)
888 freq_aB++;
889 else if (psum->samples[k].nucs_1[i] == allele_a &&
890 psum->samples[k].nucs_1[j] == allele_b)
891 freq_ab++;
892
893 if (psum->samples[k].nucs_2[i] == allele_A &&
894 psum->samples[k].nucs_2[j] == allele_B)
895 freq_AB++;
896 else if (psum->samples[k].nucs_2[i] == allele_A &&
897 psum->samples[k].nucs_2[j] == allele_b)
898 freq_Ab++;
899 else if (psum->samples[k].nucs_2[i] == allele_a &&
900 psum->samples[k].nucs_2[j] == allele_B)
901 freq_aB++;
902 else if (psum->samples[k].nucs_2[i] == allele_a &&
903 psum->samples[k].nucs_2[j] == allele_b)
904 freq_ab++;
905 }
906
907 freq_AB = freq_AB / tot;
908 freq_Ab = freq_Ab / tot;
909 freq_aB = freq_aB / tot;
910 freq_ab = freq_ab / tot;
911
912 //
913 // Using the four-gamete test, check whether recombination has occurred
914 // between these two loci.
915 // Four-gamete test: if no recombination has occurred between any two loci (SNPs) there will
916 // be three haplotypes present, if recombination has occurred there will be four haplotypes.
917 //
918 hap_cnt += freq_AB > 0 ? 1 : 0;
919 hap_cnt += freq_Ab > 0 ? 1 : 0;
920 hap_cnt += freq_aB > 0 ? 1 : 0;
921 hap_cnt += freq_ab > 0 ? 1 : 0;
922
923 if (hap_cnt == 3)
924 psum->recomb[i][j] = false;
925 else
926 psum->recomb[i][j] = true;
927
928
929 D = freq_AB - (freq_A * freq_B);
930 // cerr << "D_AB: " << D << "; ";
931 // D = freq_Ab - (freq_A * freq_b);
932 // cerr << "D_Ab: " << D << "; ";
933 // D = freq_aB - (freq_a * freq_B);
934 // cerr << "D_aB: " << D << "; ";
935 // D = freq_ab - (freq_a * freq_b);
936 // cerr << "D_ab: " << D << "\n";
937 // cerr << " freq_AB: " << freq_AB << "; freq_Ab: " << freq_Ab << "; freq_aB: " << freq_aB << "; freq_ab: " << freq_ab << "\n";
938
939 if (D > 0) {
940 min = (freq_A * freq_b) < (freq_a * freq_B) ? (freq_A * freq_b) : (freq_a * freq_B);
941 psum->dprime[i][j].dprime = min == 0 ? 0.0 : D / min;
942 } else {
943 min = (freq_A * freq_B) < (freq_a * freq_b) ? (freq_A * freq_B) : (freq_a * freq_b);
944 psum->dprime[i][j].dprime = min == 0 ? 0.0 :(-1 * D) / min;
945 }
946
947 //
948 // Test D against a chi square distribution with 1 degree of freedom to show
949 // whether these two loci have a D that is statistically significantly different from 0.
950 //
951 chisq = (tot * (D * D)) / (freq_A * freq_a * freq_B * freq_b);
952 if (chisq >= chi_sq_limit)
953 psum->dprime[i][j].chisq_p = true;
954
955 //
956 // Calculate variance and confidence limits.
957 //
958 if (psum->dprime[i][j].chisq_p) {
959 var = (1.0 / tot) * ((freq_A * freq_a * freq_B * freq_b) + ((1 - (2 * freq_A)) * (1 - (2 * freq_B)) * D) - (D * D));
960 psum->dprime[i][j].var = var;
961 psum->dprime[i][j].ci_high = psum->dprime[i][j].dprime + (1.96 * sqrt(var));
962 psum->dprime[i][j].ci_low = psum->dprime[i][j].dprime - (1.96 * sqrt(var));
963 } else {
964 psum->dprime[i][j].ci_high = 0.0;
965 psum->dprime[i][j].ci_low = 0.0;
966 }
967 }
968 }
969 }
970
971 return 0;
972 }
973
974 int
assign_alleles(NucSum nsum,char & p_allele,char & q_allele,double & p_freq,double & q_freq)975 assign_alleles(NucSum nsum, char &p_allele, char &q_allele, double &p_freq, double &q_freq)
976 {
977 p_allele = 0;
978 q_allele = 0;
979
980 uint i = 0;
981 float tot = 0;
982
983 while (p_allele == 0 && i < 4) {
984 if (nsum.nuc[i] > 0) {
985 tot += nsum.nuc[i];
986 switch(i) {
987 case 0:
988 p_allele = 'A';
989 p_freq = nsum.nuc[0];
990 break;
991 case 1:
992 p_allele = 'C';
993 p_freq = nsum.nuc[1];
994 break;
995 case 2:
996 p_allele = 'G';
997 p_freq = nsum.nuc[2];
998 break;
999 case 3:
1000 p_allele = 'T';
1001 p_freq = nsum.nuc[3];
1002 break;
1003 }
1004 }
1005 i++;
1006 }
1007 while (q_allele == 0 && i < 4) {
1008 if (nsum.nuc[i] > 0) {
1009 tot += nsum.nuc[i];
1010 switch(i) {
1011 case 1:
1012 q_allele = 'C';
1013 q_freq = nsum.nuc[1];
1014 break;
1015 case 2:
1016 q_allele = 'G';
1017 q_freq = nsum.nuc[2];
1018 break;
1019 case 3:
1020 q_allele = 'T';
1021 q_freq = nsum.nuc[3];
1022 break;
1023 }
1024 }
1025 i++;
1026 }
1027
1028 p_freq = p_freq / tot;
1029 q_freq = 1 - p_freq;
1030
1031 return 0;
1032 }
1033
1034 int
write_dprime(string path,PhasedSummary * psum)1035 write_dprime(string path, PhasedSummary *psum)
1036 {
1037 //
1038 // Write the D' data for plotting as a heatmap.
1039 //
1040 string file = path + ".dprime.tsv";
1041
1042 cerr << "Writing D' data to '" << file << "'...";
1043
1044 ofstream fh(file.c_str(), ofstream::out);
1045
1046 if (fh.fail()) {
1047 cerr << "Error opening D' file '" << file << "'\n";
1048 exit(1);
1049 }
1050
1051 fh << "# Basepair 1\tBasepair 2\tD'\tCorrected D'\tVariance\tCI Low\tCI High\n";
1052
1053 double dprime = 0.0;
1054
1055 for (uint i = 0; i < psum->size; i++) {
1056 for (uint j = i+1; j < psum->size; j++) {
1057
1058 if (psum->nucs[i].freq < minor_freq_lim ||
1059 psum->nucs[j].freq < minor_freq_lim)
1060 continue;
1061
1062 dprime = psum->dprime[i][j].dprime;
1063
1064 if (dprime_threshold)
1065 dprime = dprime >= dprime_threshold_level ? 1.0 : 0.0;
1066
1067 if (write_zeros == false && (dprime == 0.0 || psum->dprime[i][j].chisq_p == false))
1068 continue;
1069
1070 fh << psum->nucs[i].bp << "\t"
1071 << psum->nucs[j].bp << "\t"
1072 << std::setprecision(3) << dprime << "\t"
1073 << std::setprecision(3) << (psum->dprime[i][j].chisq_p ? dprime : 0.0) << "\t"
1074 << psum->dprime[i][j].var << "\t"
1075 << psum->dprime[i][j].ci_low << "\t"
1076 << psum->dprime[i][j].ci_high << "\n";
1077 }
1078 }
1079
1080 fh.close();
1081
1082 cerr << "done.\n";
1083
1084 return 0;
1085 }
1086
1087 int
summarize_phased_genotypes(PhasedSummary * psum)1088 summarize_phased_genotypes(PhasedSummary *psum)
1089 {
1090 //
1091 // Construct a two dimensional array out of all the nucleotide arrays in the samples.
1092 //
1093 char **gtypes = new char *[psum->sample_cnt];
1094
1095 for (uint i = 0; i < psum->sample_cnt; i++) {
1096 gtypes[i] = psum->samples[i].nucs_1;
1097 }
1098
1099 //
1100 // Sum up the occurences of each nucleotide.
1101 //
1102 for (uint i = 0; i < psum->size; i++) {
1103 for (uint j = 0; j < psum->sample_cnt; j++) {
1104 switch(gtypes[j][i]) {
1105 case 'A':
1106 psum->nucs[i].nuc[0]++;
1107 break;
1108 case 'C':
1109 psum->nucs[i].nuc[1]++;
1110 break;
1111 case 'G':
1112 psum->nucs[i].nuc[2]++;
1113 break;
1114 case 'T':
1115 psum->nucs[i].nuc[3]++;
1116 break;
1117 case 'N':
1118 default:
1119 break;
1120 }
1121 }
1122 }
1123
1124 //
1125 // Repeat for the second set of phased genotypes.
1126 //
1127 for (uint i = 0; i < psum->sample_cnt; i++) {
1128 gtypes[i] = psum->samples[i].nucs_2;
1129 }
1130
1131 //
1132 // Sum up the occurences of each nucleotide.
1133 //
1134 for (uint i = 0; i < psum->size; i++) {
1135 for (uint j = 0; j < psum->sample_cnt; j++) {
1136 switch(gtypes[j][i]) {
1137 case 'A':
1138 psum->nucs[i].nuc[0]++;
1139 break;
1140 case 'C':
1141 psum->nucs[i].nuc[1]++;
1142 break;
1143 case 'G':
1144 psum->nucs[i].nuc[2]++;
1145 break;
1146 case 'T':
1147 psum->nucs[i].nuc[3]++;
1148 break;
1149 case 'N':
1150 default:
1151 break;
1152 }
1153 }
1154
1155 //
1156 // Calculate minor allele frequency.
1157 //
1158 float tot = (float) psum->sample_cnt * 2.0;
1159 float freq = 0.0;
1160 for (uint j = 0; j < 4; j++) {
1161 if (psum->nucs[i].nuc[j] > 0) {
1162 freq = (float) psum->nucs[i].nuc[j] / tot;
1163 psum->nucs[i].freq = freq < psum->nucs[i].freq ? freq : psum->nucs[i].freq;
1164 }
1165 }
1166 }
1167
1168 delete [] gtypes;
1169
1170 return 0;
1171 }
1172
1173 //
1174 // Code to parse fastPhase format.
1175 //
1176 PhasedSummary *
parse_fastphase(string path)1177 parse_fastphase(string path)
1178 {
1179 ifstream fh;
1180 char line[max_len];
1181 string buf, filepath;
1182 const char *p, *q, *end;
1183 int i, sindex, pos;
1184
1185 memset(line, '\0', max_len);
1186
1187 //
1188 // Read in the original fastPhase export from Stacks to obtain the original base pair positions.
1189 //
1190 //
1191 // Open the file for reading
1192 //
1193 filepath = path + ".inp";
1194 fh.open(filepath.c_str(), ifstream::in);
1195
1196 if (fh.fail()) {
1197 cerr << "Error opening input file '" << path << "'\n";
1198 return NULL;
1199 }
1200
1201 cerr << "Parsing " << filepath << "...\n";
1202
1203 int num_samples, num_genotypes;
1204 char bp[id_len];
1205
1206 //
1207 // Get the number of samples in the dataset.
1208 //
1209 fh.getline(line, max_len);
1210 num_samples = is_integer(line);
1211
1212 if (num_samples < 0) {
1213 cerr << "Unable to find the number of samples, should be the first line.\n";
1214 return NULL;
1215 }
1216
1217 //
1218 // Get the number of genotypes in the dataset.
1219 //
1220 fh.getline(line, max_len);
1221 num_genotypes = is_integer(line);
1222
1223 if (num_genotypes < 0) {
1224 cerr << "Unable to find the number of genotypes, should be the second line.\n";
1225 return NULL;
1226 }
1227
1228 PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
1229
1230 //
1231 // Get the set of base pair positions.
1232 //
1233 buf.clear();
1234 do {
1235 fh.clear();
1236 fh.getline(line, max_len);
1237 buf += line;
1238 } while (fh.fail() && !fh.bad() && !fh.eof());
1239
1240 i = 0;
1241 p = buf.c_str();
1242 end = p + buf.length();
1243
1244 if (*p != 'P') {
1245 cerr << "Unable to locate line of basepair positions, should be the third line.\n";
1246 delete psum;
1247 return NULL;
1248 }
1249 for (p += 2, q = p; p < end; p++, q++) {
1250 while (*q != ' ' && q < end) {
1251 q++;
1252 }
1253 strncpy(bp, p, q - p);
1254 bp[q - p] = '\0';
1255 pos = is_integer(bp);
1256
1257 if (pos < 0) {
1258 cerr << "Unable to parse base pair positions.\n";
1259 delete psum;
1260 return NULL;
1261 } else {
1262 psum->nucs[i].bp = (uint) pos;
1263 }
1264
1265 i++;
1266 p = q;
1267 }
1268
1269 fh.close();
1270
1271 //
1272 // Open the file for reading
1273 //
1274 filepath = path + "_hapguess_switch.out";
1275 fh.open(filepath.c_str(), ifstream::in);
1276
1277 if (fh.fail()) {
1278 cerr << "Error opening input file '" << path << "'\n";
1279 return NULL;
1280 }
1281
1282 cerr << "Parsing " << filepath << "...\n";
1283
1284 //
1285 // Read from the "*_hapguess_switch.out" file until we hit the genotypes section
1286 // marked by the string "BEGIN GENOTYPES".
1287 //
1288 do {
1289 fh.getline(line, max_len);
1290
1291 if (!fh.good()) {
1292 cerr << "Unable to find file section entitled 'BEGIN GENOTYPES'\n";
1293 delete psum;
1294 return NULL;
1295 }
1296
1297 } while (strcmp(line, "BEGIN GENOTYPES") != 0);
1298
1299 //
1300 // Now read lines from the file in groups of three:
1301 // 1. Sample label
1302 // 2. Phased genotypes from chromosome 1
1303 // 3. Phased genotypes from chromosome 2
1304 // Stop reading individuals when we encounter the string, "END GENOTYPES".
1305 //
1306 fh.getline(line, max_len);
1307
1308 do {
1309 //
1310 // Create a new Sample object and store the sample label.
1311 //
1312 sindex = psum->add_sample(line);
1313
1314 //
1315 // Get the first set of phased genotypes.
1316 //
1317 buf.clear();
1318 do {
1319 fh.clear();
1320 fh.getline(line, max_len);
1321 buf += line;
1322 } while (fh.fail() && !fh.bad() && !fh.eof());
1323
1324 //
1325 // Count the number of genotypes on this line (they should be space deliniated).
1326 //
1327 i = 0;
1328 for (p = buf.c_str(); *p != '\0'; p++)
1329 if (*p != ' ') psum->samples[sindex].size++;
1330 //
1331 // Store the genotypes into our internal buffer.
1332 //
1333 psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
1334 for (p = buf.c_str(); *p != '\0'; p++) {
1335 if (*p == ' ') continue;
1336 psum->samples[sindex].nucs_1[i] = *p;
1337 i++;
1338 }
1339
1340 // len = strlen(line);
1341 // if (line[len - 1] == '\r') line[len - 1] = '\0';
1342
1343 //
1344 // Get the second set of phased genotypes.
1345 //
1346 buf.clear();
1347 do {
1348 fh.clear();
1349 fh.getline(line, max_len);
1350 buf += line;
1351 } while (fh.fail() && !fh.bad() && !fh.eof());
1352
1353 i = 0;
1354 psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
1355 for (p = buf.c_str(); *p != '\0'; p++) {
1356 if (*p == ' ') continue;
1357 psum->samples[sindex].nucs_2[i] = *p;
1358 i++;
1359 }
1360
1361 //
1362 // Get the sample label of the next record.
1363 //
1364 fh.getline(line, max_len);
1365
1366 } while (strcmp(line, "END GENOTYPES") != 0 && fh.good());
1367
1368 fh.close();
1369
1370 return psum;
1371 }
1372
1373 //
1374 // Code to parse Beagle format.
1375 //
1376 PhasedSummary *
parse_beagle(map<int,CSLocus * > & catalog,string path)1377 parse_beagle(map<int, CSLocus *> &catalog, string path)
1378 {
1379 gzFile gz_fh;
1380 char *line;
1381 string buf, filepath;
1382 const char *p, *q;
1383 uint len, line_len, i, sindex;
1384 bool eol;
1385
1386 line_len = max_len;
1387 line = new char[line_len];
1388 memset(line, '\0', line_len);
1389
1390 //
1391 // Open the Beagle file for reading
1392 //
1393 filepath = path + ".phased.gz";
1394 gz_fh = gzopen(filepath.c_str(), "rb");
1395 if (!gz_fh) {
1396 cerr << "Failed to open gzipped file '" << filepath << "': " << strerror(errno) << ".\n";
1397 return NULL;
1398 }
1399
1400 cerr << "Parsing " << filepath << "...\n";
1401
1402 vector<string> parts;
1403 uint num_samples = 0;
1404 uint num_genotypes = 0;
1405 char cat_loc_str[id_len], col_str[id_len];
1406
1407 //
1408 // Parse the file twice. On the first round:
1409 // 1. Determine the number of samples in the dataset (column count)
1410 // 2. Determine the number of markers (row count).
1411 // On the second round, parse the SNP genotypes.
1412 //
1413
1414 //
1415 // Read each line in the file. If it starts with:
1416 // '#' it is a comment, skip the line.
1417 // 'I' it is the list of samples, parse them.
1418 // 'S' is the population ID for each SNP, skip this line.
1419 // 'M' is a marker, count the number of markers.
1420 //
1421 do {
1422 eol = false;
1423 buf.clear();
1424 do {
1425 gzgets(gz_fh, line, line_len);
1426 buf += line;
1427
1428 len = strlen(line);
1429 if (len > 0 && line[len - 1] == '\n') {
1430 eol = true;
1431 line[len - 1] = '\0';
1432 }
1433 } while (!gzeof(gz_fh) && !eol);
1434
1435 if (line_len < buf.length()) {
1436 // cerr << "Resizing line buffer from " << line_len << " to " << buf.length() << "\n";
1437 delete [] line;
1438 line = new char[buf.length() + 1];
1439 line_len = buf.length() + 1;
1440 memset(line, '\0', line_len);
1441 }
1442
1443 if (buf[0] == 'M') {
1444 num_genotypes++;
1445 } else if (buf[0] == 'I') {
1446 //
1447 // Count the number of samples.
1448 //
1449 parse_ssv(buf.c_str(), parts);
1450 num_samples = (parts.size() - 2) / 2;
1451 }
1452
1453 } while (!gzeof(gz_fh));
1454
1455 PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
1456
1457 for (uint j = 2; j < parts.size(); j++) {
1458 if (j % 2 == 0) {
1459 sindex = psum->add_sample(parts[j]);
1460 psum->samples[sindex].size = num_genotypes;
1461 psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
1462 psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
1463 }
1464 }
1465
1466 cerr << " Found " << num_samples << " samples; " << num_genotypes << " genotypes.\n";
1467
1468 gzrewind(gz_fh);
1469
1470 uint marker_num = 0;
1471 memset(line, '\0', line_len);
1472
1473 do {
1474 do {
1475 gzgets(gz_fh, line, line_len);
1476 } while (!gzeof(gz_fh) && line[0] != 'M');
1477
1478 len = strlen(line);
1479
1480 if (len == 0) break;
1481 if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
1482
1483 parse_ssv(line, parts);
1484
1485 //
1486 // Parse the catalog locus ID and the column number of the SNP:
1487 // e.g. LocId_column or 10329_37
1488 //
1489 p = parts[1].c_str();
1490 for (q = p + 1; *q != '_' && *q != '\0'; q++);
1491 strncpy(cat_loc_str, p, q - p);
1492 cat_loc_str[q-p] = '\0';
1493 q++;
1494 strcpy(col_str, q);
1495
1496 psum->nucs[marker_num].clocus = is_integer(cat_loc_str);
1497 psum->nucs[marker_num].col = is_integer(col_str);
1498
1499 //
1500 // Store the genotypes into our internal buffer.
1501 //
1502 sindex = 0;
1503 i = 2;
1504 while (i < parts.size()) {
1505 p = parts[i].c_str();
1506 psum->samples[sindex].nucs_1[marker_num] = *p;
1507 i++;
1508 p = parts[i].c_str();
1509 psum->samples[sindex].nucs_2[marker_num] = *p;
1510 i++;
1511 sindex++;
1512 }
1513
1514 marker_num++;
1515
1516 } while (!gzeof(gz_fh));
1517
1518 gzclose(gz_fh);
1519
1520 //
1521 // Use the catalog to look up the basepair positions for each catalog locus.
1522 //
1523 CSLocus *loc;
1524 for (i = 0; i < psum->size; i++) {
1525 loc = catalog[psum->nucs[i].clocus];
1526 psum->nucs[i].bp = loc->sort_bp(psum->nucs[i].col);
1527 }
1528
1529 return psum;
1530 }
1531
1532 //
1533 // Code to parse Beagle format.
1534 //
1535 PhasedSummary *
parse_beagle_haplotypes(map<int,CSLocus * > & catalog,string path)1536 parse_beagle_haplotypes(map<int, CSLocus *> &catalog, string path)
1537 {
1538 gzFile gz_fh;
1539 char *line;
1540 string buf, filepath;
1541 const char *p;
1542 uint len, line_len, i, j, sindex;
1543 bool eol;
1544
1545 line_len = max_len;
1546 line = new char[line_len];
1547 memset(line, '\0', line_len);
1548
1549 //
1550 // Open the Beagle file for reading
1551 //
1552 filepath = path + ".phased.gz";
1553 gz_fh = gzopen(filepath.c_str(), "rb");
1554 if (!gz_fh) {
1555 cerr << "Failed to open gzipped file '" << filepath << "': " << strerror(errno) << ".\n";
1556 return NULL;
1557 }
1558
1559 cerr << "Parsing " << filepath << "...\n";
1560
1561 vector<string> parts, samples;
1562 uint num_samples = 0;
1563 uint num_genotypes = 0;
1564 uint cat_loc;
1565
1566 //
1567 // Parse the file twice. On the first round:
1568 // 1. Determine the number of samples in the dataset (column count)
1569 // 2. Determine the number of markers (row count).
1570 // On the second round, parse the SNP genotypes.
1571 //
1572
1573 //
1574 // Read each line in the file. If it starts with:
1575 // '#' it is a comment, skip the line.
1576 // 'I' it is the list of samples, parse them.
1577 // 'S' is the population ID for each SNP, skip this line.
1578 // 'M' is a marker, count the number of markers.
1579 //
1580 do {
1581 eol = false;
1582 buf.clear();
1583 do {
1584 gzgets(gz_fh, line, line_len);
1585 buf += line;
1586
1587 len = strlen(line);
1588 if (len > 0 && line[len - 1] == '\n') {
1589 eol = true;
1590 line[len - 1] = '\0';
1591 }
1592 } while (!gzeof(gz_fh) && !eol);
1593
1594 if (line_len < buf.length()) {
1595 // cerr << "Resizing line buffer from " << line_len << " to " << buf.length() << "\n";
1596 delete [] line;
1597 line = new char[buf.length() + 1];
1598 line_len = buf.length() + 1;
1599 memset(line, '\0', line_len);
1600 }
1601
1602 if (buf[0] == 'M') {
1603 //
1604 // Count the number of genotypes by counting the number or nucleotides in each
1605 // haplotype for each marker.
1606 //
1607 parse_ssv(buf.c_str(), parts);
1608 num_genotypes += parts[2].length();
1609
1610 } else if (buf[0] == 'I') {
1611 //
1612 // Count the number of samples.
1613 //
1614 parse_ssv(buf.c_str(), samples);
1615 num_samples = (samples.size() - 2) / 2;
1616 }
1617
1618 } while (!gzeof(gz_fh));
1619
1620 PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
1621
1622 for (uint j = 2; j < samples.size(); j++) {
1623 if (j % 2 == 0) {
1624 sindex = psum->add_sample(samples[j]);
1625 psum->samples[sindex].size = num_genotypes;
1626 psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
1627 psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
1628 }
1629 }
1630
1631 cerr << " Found " << num_samples << " samples; " << num_genotypes << " genotypes.\n";
1632
1633 gzrewind(gz_fh);
1634
1635 CSLocus *loc;
1636 uint hap_len = 0;
1637 uint marker_num = 0;
1638 memset(line, '\0', line_len);
1639
1640 do {
1641 do {
1642 gzgets(gz_fh, line, line_len);
1643 } while (!gzeof(gz_fh) && line[0] != 'M');
1644
1645 len = strlen(line);
1646
1647 if (len == 0) break;
1648 if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
1649
1650 parse_ssv(line, parts);
1651
1652 //
1653 // Use the catalog to look up the basepair positions for each catalog locus.
1654 //
1655 cat_loc = is_integer(parts[1].c_str());
1656 loc = catalog[cat_loc];
1657 hap_len = parts[2].length();
1658
1659 if (hap_len != loc->snps.size())
1660 cerr << "Haplotypes don't match between catalog and beagle; Locus ID: " << loc->id << "; beagle hap len: " << hap_len << "; catalog hap len: " << loc->snps.size() << "\n";
1661
1662 for (j = 0, i = marker_num; i < marker_num + hap_len; i++, j++) {
1663 psum->nucs[i].clocus = cat_loc;
1664 psum->nucs[i].col = loc->snps[j]->col;
1665 psum->nucs[i].bp = loc->sort_bp(psum->nucs[i].col);
1666 }
1667
1668 //
1669 // Store the genotypes into our internal buffer.
1670 //
1671 sindex = 0;
1672 i = 2;
1673 while (i < parts.size()) {
1674 p = parts[i].c_str();
1675 for (j = marker_num; j < marker_num + hap_len; j++) {
1676 psum->samples[sindex].nucs_1[j] = *p;
1677 p++;
1678 }
1679 i++;
1680 p = parts[i].c_str();
1681 for (j = marker_num; j < marker_num + hap_len; j++) {
1682 psum->samples[sindex].nucs_2[j] = *p;
1683 p++;
1684 }
1685 i++;
1686 sindex++;
1687 }
1688
1689 marker_num += hap_len;
1690
1691 } while (!gzeof(gz_fh));
1692
1693 gzclose(gz_fh);
1694
1695 return psum;
1696 }
1697
1698 int
parse_population_map(string popmap_path,map<string,int> & pop_map,map<int,int> & pop_cnts)1699 parse_population_map(string popmap_path, map<string, int> &pop_map, map<int, int> &pop_cnts)
1700 {
1701 char line[max_len];
1702 char pop_id_str[id_len];
1703 vector<string> parts;
1704 uint len;
1705
1706 if (pmap_path.length() == 0)
1707 return 0;
1708
1709 cerr << "Parsing population map.\n";
1710
1711 ifstream fh(popmap_path.c_str(), ifstream::in);
1712
1713 if (fh.fail()) {
1714 cerr << "Error opening population map '" << popmap_path << "'\n";
1715 return 0;
1716 }
1717
1718 while (fh.good()) {
1719 fh.getline(line, max_len);
1720
1721 len = strlen(line);
1722 if (len == 0) continue;
1723
1724 //
1725 // Check that there is no carraige return in the buffer.
1726 //
1727 if (line[len - 1] == '\r') line[len - 1] = '\0';
1728
1729 //
1730 // Ignore comments
1731 //
1732 if (line[0] == '#') continue;
1733
1734 //
1735 // Parse the population map, we expect:
1736 // <file name> <tab> <population ID>
1737 //
1738 parse_tsv(line, parts);
1739
1740 if (parts.size() != 2) {
1741 cerr << "Population map is not formated correctly: expecting two, tab separated columns, found " << parts.size() << ".\n";
1742 return 0;
1743 }
1744
1745 strncpy(pop_id_str, parts[1].c_str(), id_len);
1746 for (int i = 0; i < id_len && pop_id_str[i] != '\0'; i++)
1747 if (!isdigit(pop_id_str[i])) {
1748 cerr << "Population map is not formated correctly: expecting numerical ID in second column, found '" << parts[1] << "'.\n";
1749 return 0;
1750 }
1751
1752 //
1753 // Add the sample name to population number mapping.
1754 //
1755 pop_map[parts[0]] = atoi(parts[1].c_str());
1756 if (pop_cnts.count(atoi(parts[1].c_str())) == 0)
1757 pop_cnts[atoi(parts[1].c_str())] = 1;
1758 else
1759 pop_cnts[atoi(parts[1].c_str())]++;
1760 }
1761
1762 fh.close();
1763
1764 return 0;
1765 }
1766
1767 int
build_file_list(vector<pair<int,string>> & files)1768 build_file_list(vector<pair<int, string> > &files)
1769 {
1770 vector<string> parts;
1771 string pattern;
1772
1773 //
1774 // Read all the files from the Stacks directory.
1775 //
1776 uint pos;
1777 string file;
1778 struct dirent *direntry;
1779
1780 DIR *dir = opendir(in_path.c_str());
1781
1782 if (dir == NULL) {
1783 cerr << "Unable to open directory '" << in_path << "' for reading.\n";
1784 exit(1);
1785 }
1786
1787 switch(in_file_type) {
1788 case FileT::beagle:
1789 pattern = ".phased.gz";
1790 break;
1791 case FileT::fastphase:
1792 default:
1793 pattern = "_hapguess_switch.out";
1794 break;
1795 }
1796
1797 while ((direntry = readdir(dir)) != NULL) {
1798 file = direntry->d_name;
1799
1800 if (file == "." || file == "..")
1801 continue;
1802
1803 pos = file.rfind(pattern);
1804 if (pos < file.length())
1805 files.push_back(make_pair(1, file.substr(0, pos)));
1806 }
1807
1808 closedir(dir);
1809
1810 if (files.size() == 0) {
1811 cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
1812 return 0;
1813 }
1814
1815 return 1;
1816 }
1817
parse_command_line(int argc,char * argv[])1818 int parse_command_line(int argc, char* argv[]) {
1819 int c;
1820
1821 while (1) {
1822 static struct option long_options[] = {
1823 {"help", no_argument, NULL, 'h'},
1824 {"version", no_argument, NULL, 'v'},
1825 {"haplotypes", no_argument, NULL, 'H'},
1826 {"skip-zeros", no_argument, NULL, 'Z'}, {"skip_zeros", no_argument, NULL, 'Z'},
1827 {"infile-type", required_argument, NULL, 't'}, {"infile_type", required_argument, NULL, 't'},
1828 {"num-threads", required_argument, NULL, 'p'}, {"num_threads", required_argument, NULL, 'p'},
1829 {"in-path", required_argument, NULL, 'P'}, {"in_path", required_argument, NULL, 'P'},
1830 {"cat-path", required_argument, NULL, 'S'}, {"cat_path", required_argument, NULL, 'S'},
1831 {"pop-map", required_argument, NULL, 'M'}, {"pop_map", required_argument, NULL, 'M'},
1832 {"batch-id", required_argument, NULL, 'b'}, {"batch_id", required_argument, NULL, 'b'},
1833 {"dprime-bin-size", required_argument, NULL, 'B'}, {"dprime_bin_size", required_argument, NULL, 'B'},
1834 {"minor-allele-freq", required_argument, NULL, 'a'}, {"minor_allele_freq", required_argument, NULL, 'a'},
1835 {"min-inform-pairs", required_argument, NULL, 'm'}, {"min_inform_pairs", required_argument, NULL, 'm'},
1836 {"dprime-threshold", required_argument, NULL, 'T'}, {"dprime_threshold", required_argument, NULL, 'T'},
1837 {0, 0, 0, 0}
1838 };
1839
1840 // getopt_long stores the option index here.
1841 int option_index = 0;
1842
1843 c = getopt_long(argc, argv, "hvZHAb:M:t:P:S:p:a:B:T:", long_options, &option_index);
1844
1845 // Detect the end of the options.
1846 if (c == -1)
1847 break;
1848
1849 switch (c) {
1850 case 'h':
1851 help();
1852 break;
1853 case 'b':
1854 batch_id = is_integer(optarg);
1855 if (batch_id < 0) {
1856 cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
1857 help();
1858 }
1859 break;
1860 case 'p':
1861 num_threads = atoi(optarg);
1862 break;
1863 case 'a':
1864 minor_freq_lim = atof(optarg);
1865 break;
1866 case 'm':
1867 min_inform_pairs = atof(optarg);
1868 break;
1869 case 'P':
1870 in_path = optarg;
1871 break;
1872 case 'S':
1873 cat_path = optarg;
1874 break;
1875 case 't':
1876 if (strcasecmp(optarg, "phase") == 0)
1877 in_file_type = FileT::phase;
1878 else if (strcasecmp(optarg, "fastphase") == 0)
1879 in_file_type = FileT::fastphase;
1880 else if (strcasecmp(optarg, "beagle") == 0)
1881 in_file_type = FileT::beagle;
1882 else
1883 in_file_type = FileT::unknown;
1884 break;
1885 case 'M':
1886 pmap_path = optarg;
1887 break;
1888 case 'H':
1889 haplotypes = true;
1890 break;
1891 case 'Z':
1892 write_zeros = false;
1893 break;
1894 case 'B':
1895 bucket_dist = atoi(optarg);
1896 break;
1897 case 'T':
1898 dprime_threshold = true;
1899 dprime_threshold_level = atof(optarg);
1900 break;
1901 case 'v':
1902 version();
1903 break;
1904 case '?':
1905 // getopt_long already printed an error message.
1906 help();
1907 break;
1908 default:
1909 help();
1910 exit(1);
1911 }
1912 }
1913
1914 if (in_path.length() == 0) {
1915 cerr << "You must specify a path to the directory containing Stacks output files.\n";
1916 help();
1917 }
1918
1919 if (in_path.at(in_path.length() - 1) != '/')
1920 in_path += "/";
1921
1922 if (minor_freq_lim > 0) {
1923 if (minor_freq_lim > 1)
1924 minor_freq_lim = minor_freq_lim / 100;
1925
1926 if (minor_freq_lim > 0.5) {
1927 cerr << "Unable to parse the minor allele frequency\n";
1928 help();
1929 }
1930 }
1931
1932 if (min_inform_pairs > 0) {
1933 if (min_inform_pairs > 1)
1934 min_inform_pairs = min_inform_pairs / 100;
1935 }
1936
1937 return 0;
1938 }
1939
version()1940 void version() {
1941 cerr << "phasedstacks " << VERSION << "\n\n";
1942
1943 exit(1);
1944 }
1945
help()1946 void help() {
1947 cerr << "phasedstacks " << VERSION << "\n"
1948 << "phasedstacks -b id -S path -P path -t file_type [-p threads] [-M popmap] [-v] [-h]" << "\n"
1949 << " b: Stacks batch ID.\n"
1950 << " P: path to the phased output files.\n"
1951 << " S: path to the Stacks output files.\n"
1952 << " t: input file type. Supported types: fastphase, and beagle.\n"
1953 << " p: number of processes to run in parallel sections of code.\n"
1954 << " M: path to the population map, a tab separated file describing which individuals belong in which population.\n"
1955 << " v: print program version." << "\n"
1956 << " h: display this help messsage." << "\n"
1957 << " --haplotypes: data were phased as RAD locus haplotypes.\n"
1958 << " --dprime-bin-size: size of buckets for binning SNPs at a particular distance to calculate the mean D' value.\n"
1959 << " --dprime-threshold <val>: if D' values fall above <val>, set the D' to 1, otherwise set D' to 0.\n\n"
1960 << " Filtering options:\n"
1961 << " --skip-zeros: do not include D' values of zero in the D' output.\n"
1962 << " --minor-allele-freq: specify a minimum minor allele frequency required to process a nucleotide site (0 < a < 0.5).\n"
1963 << " --min-inform-pairs: when building D' haplotype blocks, the minimum number of informative D' measures to combine two blocks (default 0.9).\n\n";
1964
1965
1966 exit(1);
1967 }
1968