1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2013-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // phasedstacks -- analyse phased data, descended from a Stacks analysis.
23 //
24 
25 #include "phasedstacks.h"
26 
27 // Global variables to hold command-line options.
28 FileT  in_file_type = FileT::unknown;
29 int    num_threads  = 1;
30 int    batch_id     = 0;
31 string cat_path;
32 string in_path;
33 string out_path;
34 string out_file;
35 string pmap_path;
36 bool   haplotypes       = false;
37 bool   write_zeros      = true;
38 double p_value_cutoff   = 0.05;
39 double chi_sq_limit     = 3.84;
40 double minor_freq_lim   = 0.1;
41 double min_inform_pairs = 0.90;
42 uint   max_pair_dist    = 1000000;
43 uint   bucket_dist      = 5000;
44 double dprime_threshold = false;
45 double dprime_threshold_level = 0.0;
46 
47 set<int> whitelist, blacklist;
48 
49 map<string, int> pop_map;
50 map<int, int>    pop_cnts;
51 
main(int argc,char * argv[])52 int main (int argc, char* argv[]) {
53     IF_NDEBUG_TRY
54 
55     parse_command_line(argc, argv);
56 
57     if (p_value_cutoff == 0.1) {
58         chi_sq_limit =  2.71;
59     } else if (p_value_cutoff == 0.05) {
60         chi_sq_limit =  3.84;
61     } else if (p_value_cutoff == 0.01) {
62         chi_sq_limit =  6.64;
63     } else if (p_value_cutoff == 0.001) {
64         chi_sq_limit = 10.83;
65     }
66 
67     cerr << "Minor allele frequency cutoff: " << minor_freq_lim << "\n"
68          << "Looking for ";
69     switch(in_file_type) {
70     case FileT::beagle:
71         cerr << "Beagle";
72         break;
73     case FileT::phase:
74         cerr << "PHASE";
75         break;
76     case FileT::fastphase:
77     default:
78         cerr << "fastPhase";
79         break;
80     }
81     cerr << " input files.\n"
82          << "Size of buckets for binning D' values at a particular distance: " << bucket_dist / 1000 << "kb.\n";
83     if (dprime_threshold)
84         cerr << "D' Threshold set at " << dprime_threshold_level << ". D' values above this limit will be set to 1.0, values below will be set to 0.0.\n";
85 
86     //
87     // Parse the population map.
88     //
89     parse_population_map(pmap_path, pop_map, pop_cnts);
90 
91     //
92     // Set the number of OpenMP parallel threads to execute.
93     //
94     #ifdef _OPENMP
95     omp_set_num_threads(num_threads);
96     #endif
97 
98     vector<pair<int, string> > files;
99     if (!build_file_list(files))
100         exit(1);
101 
102     cerr << "Identified " << files.size() << " files.\n";
103 
104     //
105     // Open the log file.
106     //
107     stringstream log;
108     log << "phasedstacks.log";
109     string log_path = in_path + log.str();
110     ofstream log_fh(log_path.c_str(), ofstream::out);
111 
112     if (log_fh.fail()) {
113         cerr << "Error opening log file '" << log_path << "'\n";
114         exit(1);
115     }
116 
117     init_log(log_fh, argc, argv);
118 
119     //
120     // Load the catalog
121     //
122     cerr << "Parsing the catalog...\n";
123     stringstream catalog_file;
124     map<int, CSLocus *> catalog;
125     bool compressed = false;
126     catalog_file << cat_path << "batch_" << batch_id << ".catalog";
127     int res = load_loci(catalog_file.str(), catalog, 0, false, compressed);
128     if (res == 0) {
129         cerr << "Error: Unable to load the catalog '" << catalog_file.str() << "'\n";
130         throw exception();
131     }
132     cerr << "done.\n";
133 
134     //
135     // Implement the black/white list
136     //
137     reduce_catalog(catalog, whitelist, blacklist);
138 
139     map<int, int> fgt_block_lens, fgt_snp_cnts;
140     map<int, int> dp_block_lens, dp_snp_cnts;
141 
142     //
143     // Vectors to store D' measures of SNPs at bucketed distances.
144     //
145     vector<double> dprime_buckets, dprime_bucket_cnts;
146 
147     for (uint i = 0; i < files.size(); i++) {
148 
149         // if (files[i].second != "batch_1.groupV.phase") continue;
150 
151         PhasedSummary *psum = NULL;
152 
153         if (in_file_type == FileT::fastphase) {
154             if ((psum = parse_fastphase(in_path + files[i].second)) == NULL) {
155                 cerr << "Unable to parse fastPhase input files.\n";
156                 exit(1);
157             }
158         } else if (in_file_type == FileT::beagle && haplotypes) {
159             if ((psum = parse_beagle_haplotypes(catalog, in_path + files[i].second)) == NULL) {
160                 cerr << "Unable to parse Beagle input files.\n";
161                 exit(1);
162             }
163         } else if (in_file_type == FileT::beagle) {
164             if ((psum = parse_beagle(catalog, in_path + files[i].second)) == NULL) {
165                 cerr << "Unable to parse Beagle input files.\n";
166                 exit(1);
167             }
168         }
169 
170         //
171         // Summarize the genotypes in the populations.
172         //
173         summarize_phased_genotypes(psum);
174 
175         // for (uint j = 0; j < psum->size; j++) {
176         //     cerr << "BP: " << psum->nucs[j].bp << "\t"
177         //       << "A: "  << std::setw(3) << psum->nucs[j].nuc[0] << " "
178         //       << "C: "  << std::setw(3) << psum->nucs[j].nuc[1] << " "
179         //       << "G: "  << std::setw(3) << psum->nucs[j].nuc[2] << " "
180         //       << "T: "  << std::setw(3) << psum->nucs[j].nuc[3] << "\n";
181         // }
182 
183         //
184         // Calculate D'
185         //
186         cerr << "Calculating D'...";
187         calc_dprime(psum);
188         cerr << "done.\n";
189 
190         write_dprime(in_path + files[i].second, psum);
191 
192         //
193         // Generate haplotype blocks based on D'.
194         //
195         dprime_blocks(in_path + files[i].second, pop_map, psum, dp_block_lens, dp_snp_cnts);
196 
197         //
198         // Generate haplotype blocks using the four gamete test.
199         //
200         four_gamete_test(in_path + files[i].second, pop_map, psum, fgt_block_lens, fgt_snp_cnts);
201 
202         //
203         // Bucket the D' measures by distance between SNPs.
204         //
205         bucket_dprime(dprime_buckets, dprime_bucket_cnts, psum);
206 
207         //
208         // Free the Samples objects
209         //
210         delete psum;
211     }
212 
213     //
214     // Write average D' values bucketed according to their distance in the genome.
215     //
216     write_buckets(in_path, dprime_buckets, dprime_bucket_cnts);
217 
218     //
219     // Write the FGT bucketed distances.
220     //
221     log_fh << "# Distribution of FGT haplotype block lengths.\n";
222     map<int, int>::iterator buck_it;
223     for (buck_it = fgt_block_lens.begin(); buck_it != fgt_block_lens.end(); buck_it++)
224         log_fh << buck_it->first << "\t" << buck_it->second << "\n";
225 
226     //
227     // Write the FGT bucketed SNP counts.
228     //
229     log_fh << "\n\n"
230            << "# Distribution of FGT SNP counts per haplotype block.\n";
231     for (buck_it = fgt_snp_cnts.begin(); buck_it != fgt_snp_cnts.end(); buck_it++)
232         log_fh << buck_it->first << "\t" << buck_it->second << "\n";
233 
234     //
235     // Write the D' haplotype block bucketed distances.
236     //
237     log_fh << "\n\n"
238            << "# Distribution of D' haplotype block lengths.\n";
239     for (buck_it = dp_block_lens.begin(); buck_it != dp_block_lens.end(); buck_it++)
240         log_fh << buck_it->first << "\t" << buck_it->second << "\n";
241 
242     //
243     // Write the D' bucketed SNP counts.
244     //
245     log_fh << "\n\n"
246            << "# Distribution of D' SNP counts per haplotype block.\n";
247     for (buck_it = dp_snp_cnts.begin(); buck_it != dp_snp_cnts.end(); buck_it++)
248         log_fh << buck_it->first << "\t" << buck_it->second << "\n";
249 
250     log_fh.close();
251 
252     return 0;
253     IF_NDEBUG_CATCH_ALL_EXCEPTIONS
254 }
255 
256 int
bucket_dprime(vector<double> & dprime_buckets,vector<double> & dprime_bucket_cnts,PhasedSummary * psum)257 bucket_dprime(vector<double> &dprime_buckets, vector<double> &dprime_bucket_cnts, PhasedSummary *psum)
258 {
259     uint bucket, dist, max_bucket;
260     uint max_dist = 0;
261 
262     //
263     // Check that we have enough buckets in our vectors. Find the maximum distance between
264     // SNPs on this chromosome and add buckets as necessary.
265     //
266     for (uint i = 0; i < psum->size; i++) {
267         for (uint j = i+1; j < psum->size; j++) {
268 
269             if (psum->nucs[i].freq < minor_freq_lim ||
270                 psum->nucs[j].freq < minor_freq_lim)
271                 continue;
272 
273             if (write_zeros == false && psum->dprime[i][j].chisq_p == false)
274                 continue;
275 
276             dist     = psum->nucs[j].bp - psum->nucs[i].bp;
277             max_dist = dist > max_dist ? dist : max_dist;
278         }
279     }
280     max_bucket = max_dist / bucket_dist;
281     if (dprime_buckets.size() < max_bucket) {
282         uint cnt = max_bucket + 1 - dprime_buckets.size();
283         for (uint i = 0; i < cnt; i++) {
284             dprime_buckets.push_back(0.0);
285             dprime_bucket_cnts.push_back(0.0);
286         }
287     }
288 
289     //
290     // Populate buckets
291     //
292     for (uint i = 0; i < psum->size; i++) {
293         for (uint j = i+1; j < psum->size; j++) {
294 
295             if (psum->nucs[i].freq < minor_freq_lim ||
296                 psum->nucs[j].freq < minor_freq_lim)
297                 continue;
298 
299             if (write_zeros == false && psum->dprime[i][j].chisq_p == false)
300                 continue;
301 
302             bucket = ((psum->nucs[j].bp - psum->nucs[i].bp) / bucket_dist);
303 
304             dprime_buckets[bucket] += (psum->dprime[i][j].chisq_p ? psum->dprime[i][j].dprime : 0.0);
305             dprime_bucket_cnts[bucket]++;
306         }
307     }
308 
309     return 0;
310 }
311 
312 int
write_buckets(string path,vector<double> & dprime_buckets,vector<double> & dprime_bucket_cnts)313 write_buckets(string path, vector<double> &dprime_buckets, vector<double> &dprime_bucket_cnts)
314 {
315     //
316     // Write the bucketed D' data for plotting.
317     //
318     stringstream file;
319     file << path << "Dprime_dist_buckets" << bucket_dist/1000 << "kb.tsv";
320 
321     cerr << "Writing bucketed D' data to '" << file.str() << "'...";
322 
323     ofstream fh(file.str().c_str(), ofstream::out);
324 
325     if (fh.fail()) {
326         cerr << "Error opening D' file '" << file.str() << "'\n";
327         exit(1);
328     }
329 
330     fh << "# Distance (Kb)\tD' Average\n";
331 
332     for (uint i = 0; i < dprime_buckets.size(); i++)
333         fh << (i * bucket_dist) << "\t"
334            << std::setprecision(3) << (dprime_buckets[i] / dprime_bucket_cnts[i]) << "\n";
335 
336     fh.close();
337 
338     cerr << "done\n";
339 
340     return 0;
341 }
342 
343 int
four_gamete_test(string path,map<string,int> & pop_map,PhasedSummary * psum,map<int,int> & len_buckets,map<int,int> & snp_buckets)344 four_gamete_test(string path, map<string, int> &pop_map, PhasedSummary *psum, map<int, int> &len_buckets, map<int, int> &snp_buckets)
345 {
346     //
347     // Write haplotypes as found by the four gamete test:
348     //   Wang, et al., Am. J. Hum. Genet. 71:1227–1234, 2002
349     //
350     string file = path + ".fgt.tsv";
351 
352     cerr << "Determining four gamete test haplotypes blocks, writing to:\n    '" << file << "'...\n";
353 
354     ofstream fh(file.c_str(), ofstream::out);
355 
356     if (fh.fail()) {
357         cerr << "Error opening FGT file '" << file << "'\n";
358         exit(1);
359     }
360 
361     fh << "# ID\tStart\tEnd\tLen\tSNP Count\tHaplotype Count\tHaplotype\tPopulations\tHapPopCnt\n";
362 
363     uint id = 1;
364     uint start, end=-1, cnt, dist;
365     bool bound;
366     map<int, int> buckets, snps;
367 
368     for (uint i = 0; i < psum->size; i++) {
369         if (psum->nucs[i].freq < minor_freq_lim)
370             continue;
371 
372         //
373         // Start a new block.
374         //
375         start  = i;
376         bound  = false;
377         cnt    = 0;
378         uint j = i;
379 
380         do {
381             if (psum->nucs[j].freq < minor_freq_lim) {
382                 j++;
383                 continue;
384             }
385 
386             for (int k = j; k >= (int) start; k--) {
387 
388                 if (psum->nucs[k].freq < minor_freq_lim)
389                     continue;
390 
391                 if (psum->recomb[k][j] == true) {
392                     bound = true;
393                     end   = j;
394                 }
395             }
396 
397             j++;
398             cnt++;
399         } while (bound == false && j < psum->size);
400 
401         if (j == psum->size)
402             end = j - 1;
403 
404         fh << id << "\t"
405            << psum->nucs[start].bp << "\t"
406            << psum->nucs[end].bp << "\t"
407            << psum->nucs[end].bp - psum->nucs[start].bp + 1 << "\t"
408            << cnt << "\t";
409         //
410         // Bucket the SNP counts for plotting.
411         //
412         snps[cnt]++;
413 
414         //
415         // Bucket the haplotype block lengths for plotting.
416         //
417         dist = (psum->nucs[end].bp - psum->nucs[start].bp + 1) / 10000 * 10000;
418         buckets[dist]++;
419 
420         enumerate_haplotypes(fh, pop_map, psum, start, end);
421 
422         i = end;
423         id++;
424     }
425 
426     //
427     // Write the bucketed distances.
428     //
429     fh << "\n\n"
430        << "# Distribution of FGT haplotype block lengths.\n";
431     map<int, int>::iterator it;
432     for (it = buckets.begin(); it != buckets.end(); it++) {
433         fh << it->first << "\t" << it->second << "\n";
434 
435         len_buckets[it->first] += it->second;
436     }
437 
438     //
439     // Write the bucketed SNP counts.
440     //
441     fh << "\n\n"
442        << "# Distribution of SNP counts per FGT haplotype block.\n";
443     for (it = snps.begin(); it != snps.end(); it++) {
444         fh << it->first << "\t" << it->second << "\n";
445 
446         snp_buckets[it->first] += it->second;
447     }
448 
449     fh.close();
450 
451     cerr << "done.\n";
452 
453     return 0;
454 }
455 
456 int
dprime_blocks(string path,map<string,int> & pop_map,PhasedSummary * psum,map<int,int> & len_buckets,map<int,int> & snp_buckets)457 dprime_blocks(string path, map<string, int> &pop_map, PhasedSummary *psum, map<int, int> &len_buckets, map<int, int> &snp_buckets)
458 {
459     //
460     // Generate haplotype blocks according to strength of linkage disequilibrium measured using D'.
461     //   Stacey B. Gabriel et al. (2002). The Structure of Haplotype Blocks in the Human Genome. Science 296:2225-2229
462     //
463     string file = path + ".dpblocks.tsv";
464 
465     cerr << "Determining D' haplotypes blocks, writing to:\n    '" << file << "'...\n";
466 
467     ofstream fh(file.c_str(), ofstream::out);
468 
469     if (fh.fail()) {
470         cerr << "Error opening D' blocks file '" << file << "'\n";
471         exit(1);
472     }
473 
474     fh << "# ID\tStart\tEnd\tLen\tSNP Count\tHaplotype Count\tHaplotype\tPopulations\tHapPopCnt\n";
475 
476     uint dist;
477     set<int> loci;
478     vector<pair<int, int> > ld_pairs;
479     map<int, vector<int> > ld_map;
480     map<int, int> buckets, snps;
481 
482     uint tot_pairs    = 0;
483     uint recomb_pairs = 0;
484 
485     for (uint i = 0; i < psum->size; i++) {
486         if (psum->nucs[i].freq < minor_freq_lim)
487             continue;
488 
489         for (uint j = i+1; j < psum->size; j++) {
490             if (psum->nucs[j].freq < minor_freq_lim)
491                 continue;
492 
493             tot_pairs++;
494             dist = psum->nucs[j].bp - psum->nucs[i].bp + 1;
495 
496             //
497             // Does this pair of markers show a strong measure of LD?
498             //
499             if (psum->dprime[i][j].ci_high > 0.98 &&
500                 psum->dprime[i][j].ci_low  > 0.7 &&
501                 dist <= max_pair_dist) {
502                 psum->dprime[i][j].type = strong_ld;
503 
504                 ld_pairs.push_back(make_pair(i, j));
505                 ld_map[i].push_back(j);
506 
507                 loci.insert(i);
508                 loci.insert(j);
509             }
510 
511             //
512             // Does this pair of markers show a strong measure of historical recombination?
513             //
514             if (psum->dprime[i][j].ci_high < 0.9) {
515                 psum->dprime[i][j].type = recomb;
516                 recomb_pairs++;
517             }
518         }
519     }
520 
521     // map<int, vector<int> >::iterator it;
522     // for (it = ld_map.begin(); it != ld_map.end(); it++) {
523     //  cerr << "      " << it->first << " ->\n";
524     //  for (uint i = 0; i < it->second.size(); i++)
525     //      cerr << "         " << it->second[i] << " dist: " << (psum->nucs[it->second[i]].bp - psum->nucs[it->first].bp + 1) << "bp\n";
526     // }
527 
528     cerr << "    Total pairs examined: " << tot_pairs
529          << "; Strong LD pairs: " << ld_pairs.size()
530          << "; Recombination pairs: " << recomb_pairs
531          << "; Informative markers: " << std::setprecision(3)
532          << ((double) (ld_pairs.size() + recomb_pairs) / (double) tot_pairs) * 100 << "%\n";
533 
534     //
535     // Convert our list of loci into an ordered, linked list, where each node
536     // represents a haplotype block.
537     //
538     dPrimeBlocks blocks;
539     blocks.initialize(loci);
540 
541     //
542     // Merge nodes together where D' is strong enough to maintain the block.
543     //
544     HBlock *cur;
545 
546     cur = blocks.head();
547 
548     do {
549         //
550         // Can we merge these two nodes together?
551         //
552         if (check_adjacent_blocks(psum, cur)) {
553             // cerr << "    Merging blocks: ";
554             // for (uint i = 0; i < cur->loci.size(); i++)
555             //  cerr << cur->loci[i] << ", ";
556             // cerr << " and ";
557             // for (uint i = 0; i < cur->next->loci.size(); i++)
558             //  cerr << cur->next->loci[i] << ", ";
559             // cerr << "\n";
560             blocks.merge_adjacent(cur);
561         } else {
562             cur = cur->next;
563         }
564     } while (cur->next != NULL);
565 
566     // blocks.print();
567 
568     //
569     // Write the blocks.
570     //
571     uint id = 1;
572     uint start, end;
573 
574     cur = blocks.head();
575 
576     do {
577         start = *(cur->loci.begin());
578         end   = *(cur->loci.rbegin());
579 
580         fh << id << "\t"
581            << psum->nucs[start].bp << "\t"
582            << psum->nucs[end].bp << "\t"
583            << psum->nucs[end].bp - psum->nucs[start].bp + 1 << "\t"
584            << cur->loci.size() << "\t";
585 
586         //
587         // Bucket the SNP counts for plotting.
588         //
589         snps[cur->loci.size()]++;
590 
591         //
592         // Bucket the haplotype block lengths for plotting.
593         //
594         dist = (psum->nucs[end].bp - psum->nucs[start].bp + 1) / 10000 * 10000;
595         buckets[dist]++;
596 
597         enumerate_haplotypes(fh, pop_map, psum, start, end);
598 
599         id++;
600         cur = cur->next;
601     } while (cur != NULL);
602 
603     //
604     // Write the bucketed distances.
605     //
606     fh << "\n\n"
607        << "# Distribution of D' haplotype block lengths.\n";
608     map<int, int>::iterator it;
609     for (it = buckets.begin(); it != buckets.end(); it++) {
610         fh << it->first << "\t" << it->second << "\n";
611 
612         len_buckets[it->first] += it->second;
613     }
614 
615     //
616     // Write the bucketed SNP counts.
617     //
618     fh << "\n\n"
619        << "# Distribution of SNP counts per D' haplotype block.\n";
620     for (it = snps.begin(); it != snps.end(); it++) {
621         fh << it->first << "\t" << it->second << "\n";
622 
623         snp_buckets[it->first] += it->second;
624     }
625 
626     fh.close();
627 
628     cerr << "done.\n";
629 
630     return 0;
631 }
632 
633 bool
check_adjacent_blocks(PhasedSummary * psum,HBlock * block)634 check_adjacent_blocks(PhasedSummary *psum, HBlock *block)
635 {
636     //
637     // Create a list of all loci contained in the two blocks.
638     //
639     uint start = *(block->loci.begin());
640     uint end   = *(block->next->loci.rbegin());
641 
642     //
643     // Check the D' measure between each pair in the proposed combined block.
644     //
645     double tot       = 0.0;
646     double strong_ld = 0.0;
647 
648     for (uint i = start; i <= end; i++) {
649         if (psum->nucs[i].freq < minor_freq_lim)
650             continue;
651 
652         for (uint j = i + 1; j <= end; j++) {
653             if (psum->dprime[i][j].type == uninformative ||
654                 psum->nucs[j].freq < minor_freq_lim)
655                 continue;
656 
657             tot++;
658             if (psum->dprime[i][j].type == strong_ld)
659                 strong_ld++;
660         }
661     }
662 
663     // cerr << "Comparing range " << start << " to " << end
664     //   << "; total pairs: " << tot << "; strong LD: " << strong_ld
665     //   << "; proportion: " << std::setprecision(3) << strong_ld / tot << "\n";
666 
667     if (strong_ld / tot >= min_inform_pairs)
668         return true;
669 
670     return false;
671 }
672 
673 HBlock *
merge_adjacent(HBlock * a)674 dPrimeBlocks::merge_adjacent(HBlock *a)
675 {
676     //
677     // Merge two adjacent nodes.
678     //
679     HBlock *b = a->next;
680 
681     for (uint i = 0; i < b->loci.size(); i++)
682         a->loci.push_back(b->loci[i]);
683     a->next = b->next;
684     delete b;
685     return a;
686 }
687 
688 HBlock *
initialize(set<int> & loci)689 dPrimeBlocks::initialize(set<int> &loci)
690 {
691     set<int>::iterator it, prev_it;
692     HBlock *cur, *next;
693 
694     this->_head = new HBlock;
695     it = loci.begin();
696     this->_head->loci.push_back(*it);
697     it++;
698 
699     // //
700     // // Create a node from each locus and add to it all immediately adjacent loci.
701     // //
702     // do {
703     //  this->_head->loci.push_back(*it);
704     //  prev_it = it;
705     //  it++;
706     // } while (it != loci.end() && (*prev_it) + 1 == *it);
707 
708     next = this->_head;
709 
710     // if (it == loci.end()) return this->_head;
711 
712     do {
713         cur = new HBlock;
714         cur->loci.push_back(*it);
715         it++;
716 
717         // do {
718         //     cur->loci.push_back(*it);
719         //     prev_it = it;
720         //     it++;
721         // } while (it != loci.end() &&
722         //       (*prev_it) + 1 == *it);
723 
724         next->next = cur;
725         next = next->next;
726 
727     } while (it != loci.end());
728 
729     return this->_head;
730 }
731 
732 int
print()733 dPrimeBlocks::print()
734 {
735     HBlock *cur = this->_head;
736     while (cur != NULL) {
737         for (uint i = 0; i < cur->loci.size(); i++) {
738             if (i > 0)
739                 cerr << ", ";
740             cerr << cur->loci[i];
741         }
742         cerr << "\n";
743 
744         cur = cur->next;
745     }
746 
747     return 0;
748 }
749 
750 int
enumerate_haplotypes(ofstream & fh,map<string,int> & pop_map,PhasedSummary * psum,uint start,uint end)751 enumerate_haplotypes(ofstream &fh, map<string, int> &pop_map, PhasedSummary *psum, uint start, uint end)
752 {
753     map<string, map<int, int> >::iterator it;
754     map<string, map<int, int> > haplotypes;
755     map<int, int>::iterator sit;
756     string haplotype;
757     set<int> pops;
758 
759     //
760     // Enumerate all haplotypes occurring in this block.
761     //
762     for (uint k = 0; k < psum->sample_cnt; k++) {
763 
764         for (uint n = start; n <= end; n++)
765             if (psum->nucs[n].freq >= minor_freq_lim)
766                 haplotype += psum->samples[k].nucs_1[n];
767 
768         pops.insert(pop_map[psum->samples[k].name]);
769 
770         if (haplotypes.count(haplotype) == 0)
771             haplotypes[haplotype][pop_map[psum->samples[k].name]] = 1;
772         else
773             haplotypes[haplotype][pop_map[psum->samples[k].name]]++;
774         haplotype.clear();
775     }
776 
777     for (uint k = 0; k < psum->sample_cnt; k++) {
778 
779         for (uint n = start; n <= end; n++)
780             if (psum->nucs[n].freq >= minor_freq_lim)
781                 haplotype += psum->samples[k].nucs_2[n];
782 
783         pops.insert(pop_map[psum->samples[k].name]);
784 
785         if (haplotypes.count(haplotype) == 0)
786             haplotypes[haplotype][pop_map[psum->samples[k].name]] = 1;
787         else
788             haplotypes[haplotype][pop_map[psum->samples[k].name]]++;
789         haplotype.clear();
790     }
791 
792     //
793     // Write the haplotypes.
794     //
795     float tot = 0.0;
796 
797     fh << haplotypes.size() << "\t";
798     for (it = haplotypes.begin(); it != haplotypes.end(); it++) {
799         //
800         // Haplotypes are stored per population; sum them up here.
801         //
802         for (sit = it->second.begin(); sit != it->second.end(); sit++)
803             tot += sit->second;
804 
805         if (it != haplotypes.begin())
806             fh << ",";
807         fh << it->first << "|"
808            << std::setprecision(3) << tot / ((float) psum->sample_cnt * 2.0);
809     }
810     fh << "\t";
811 
812     set<int>::iterator pit;
813     stringstream       pops_str;
814 
815     //
816     // Write which populations this haplotype block occurs in.
817     //
818     if (pops.size() == 0)
819         fh << "-1\t";
820     else
821         for (pit = pops.begin(); pit != pops.end(); pit++)
822             pops_str << *pit << ",";
823     fh << pops_str.str().substr(0, pops_str.str().length()-1);
824     pops_str.str("");
825 
826     //
827     // Write the frequency of occurence of each haplotype in each population.
828     //
829     for (it = haplotypes.begin(); it != haplotypes.end(); it++) {
830         pops_str << "\t";
831         for (pit = pops.begin(); pit != pops.end(); pit++)
832             pops_str << (it->second)[*pit] << "|"
833                      << std::setprecision(3)
834                      << (float) (it->second)[*pit] / (float) (pop_cnts[*pit] * 2.0) << ",";
835         fh << pops_str.str().substr(0, pops_str.str().length()-1);
836         pops_str.str("");
837     }
838     fh << "\n";
839 
840     return 0;
841 }
842 
843 int
calc_dprime(PhasedSummary * psum)844 calc_dprime(PhasedSummary *psum)
845 {
846     #pragma omp parallel
847     {
848         char   allele_A, allele_a, allele_B, allele_b;
849         double freq_A,     freq_a,   freq_B,   freq_b;
850         double freq_AB,   freq_Ab,  freq_aB,  freq_ab;
851         double D, min, var, chisq;
852         double tot = psum->sample_cnt  * 2.0;
853         uint   hap_cnt;
854 
855         #pragma omp for schedule(dynamic, 1)
856         for (uint i = 0; i < psum->size; i++) {
857             //
858             // Assign nucleotides to allele A, and a.
859             //
860             assign_alleles(psum->nucs[i], allele_A, allele_a, freq_A, freq_a);
861 
862             for (uint j = i+1; j < psum->size; j++) {
863                 //
864                 // Assign nucleotides to allele B, and b.
865                 //
866                 assign_alleles(psum->nucs[j], allele_B, allele_b, freq_B, freq_b);
867 
868                 freq_AB = 0.0;
869                 freq_Ab = 0.0;
870                 freq_aB = 0.0;
871                 freq_ab = 0.0;
872                 hap_cnt = 0;
873                 D       = 0.0;
874 
875                 //
876                 // Tally up haplotype frequencies.
877                 //
878                 for (uint k = 0; k < psum->sample_cnt; k++) {
879 
880                     if (psum->samples[k].nucs_1[i] == allele_A &&
881                         psum->samples[k].nucs_1[j] == allele_B)
882                         freq_AB++;
883                     else if (psum->samples[k].nucs_1[i] == allele_A &&
884                              psum->samples[k].nucs_1[j] == allele_b)
885                         freq_Ab++;
886                     else if (psum->samples[k].nucs_1[i] == allele_a &&
887                              psum->samples[k].nucs_1[j] == allele_B)
888                         freq_aB++;
889                     else if (psum->samples[k].nucs_1[i] == allele_a &&
890                              psum->samples[k].nucs_1[j] == allele_b)
891                         freq_ab++;
892 
893                     if (psum->samples[k].nucs_2[i] == allele_A &&
894                         psum->samples[k].nucs_2[j] == allele_B)
895                         freq_AB++;
896                     else if (psum->samples[k].nucs_2[i] == allele_A &&
897                              psum->samples[k].nucs_2[j] == allele_b)
898                         freq_Ab++;
899                     else if (psum->samples[k].nucs_2[i] == allele_a &&
900                              psum->samples[k].nucs_2[j] == allele_B)
901                         freq_aB++;
902                     else if (psum->samples[k].nucs_2[i] == allele_a &&
903                              psum->samples[k].nucs_2[j] == allele_b)
904                         freq_ab++;
905                 }
906 
907                 freq_AB = freq_AB / tot;
908                 freq_Ab = freq_Ab / tot;
909                 freq_aB = freq_aB / tot;
910                 freq_ab = freq_ab / tot;
911 
912                 //
913                 // Using the four-gamete test, check whether recombination has occurred
914                 // between these two loci.
915                 //  Four-gamete test: if no recombination has occurred between any two loci (SNPs) there will
916                 //  be three haplotypes present, if recombination has occurred there will be four haplotypes.
917                 //
918                 hap_cnt += freq_AB > 0 ? 1 : 0;
919                 hap_cnt += freq_Ab > 0 ? 1 : 0;
920                 hap_cnt += freq_aB > 0 ? 1 : 0;
921                 hap_cnt += freq_ab > 0 ? 1 : 0;
922 
923                 if (hap_cnt == 3)
924                     psum->recomb[i][j] = false;
925                 else
926                     psum->recomb[i][j] = true;
927 
928 
929                 D = freq_AB - (freq_A * freq_B);
930                 // cerr << "D_AB: " << D << "; ";
931                 // D = freq_Ab - (freq_A * freq_b);
932                 // cerr << "D_Ab: " << D << "; ";
933                 // D = freq_aB - (freq_a * freq_B);
934                 // cerr << "D_aB: " << D << "; ";
935                 // D = freq_ab - (freq_a * freq_b);
936                 // cerr << "D_ab: " << D << "\n";
937                 // cerr << "    freq_AB: " << freq_AB << "; freq_Ab: " << freq_Ab << "; freq_aB: " << freq_aB << "; freq_ab: " << freq_ab << "\n";
938 
939                 if (D > 0) {
940                     min = (freq_A * freq_b) < (freq_a * freq_B) ? (freq_A * freq_b) : (freq_a * freq_B);
941                     psum->dprime[i][j].dprime = min == 0 ? 0.0 : D / min;
942                 } else {
943                     min = (freq_A * freq_B) < (freq_a * freq_b) ? (freq_A * freq_B) : (freq_a * freq_b);
944                     psum->dprime[i][j].dprime = min == 0 ? 0.0 :(-1 * D) / min;
945                 }
946 
947                 //
948                 // Test D against a chi square distribution with 1 degree of freedom to show
949                 // whether these two loci have a D that is statistically significantly different from 0.
950                 //
951                 chisq = (tot * (D * D)) / (freq_A * freq_a * freq_B * freq_b);
952                 if (chisq >= chi_sq_limit)
953                     psum->dprime[i][j].chisq_p = true;
954 
955                 //
956                 // Calculate variance and confidence limits.
957                 //
958                 if (psum->dprime[i][j].chisq_p) {
959                     var   = (1.0 / tot) * ((freq_A * freq_a * freq_B * freq_b) + ((1 - (2 * freq_A)) * (1 - (2 * freq_B)) * D) - (D * D));
960                     psum->dprime[i][j].var = var;
961                     psum->dprime[i][j].ci_high = psum->dprime[i][j].dprime + (1.96 * sqrt(var));
962                     psum->dprime[i][j].ci_low  = psum->dprime[i][j].dprime - (1.96 * sqrt(var));
963                 } else {
964                     psum->dprime[i][j].ci_high = 0.0;
965                     psum->dprime[i][j].ci_low  = 0.0;
966                 }
967             }
968         }
969     }
970 
971     return 0;
972 }
973 
974 int
assign_alleles(NucSum nsum,char & p_allele,char & q_allele,double & p_freq,double & q_freq)975 assign_alleles(NucSum nsum, char &p_allele, char &q_allele, double &p_freq, double &q_freq)
976 {
977     p_allele = 0;
978     q_allele = 0;
979 
980     uint  i   = 0;
981     float tot = 0;
982 
983     while (p_allele == 0 && i < 4) {
984         if (nsum.nuc[i] > 0) {
985             tot += nsum.nuc[i];
986             switch(i) {
987             case 0:
988                 p_allele = 'A';
989                 p_freq   = nsum.nuc[0];
990                 break;
991             case 1:
992                 p_allele = 'C';
993                 p_freq   = nsum.nuc[1];
994                 break;
995             case 2:
996                 p_allele = 'G';
997                 p_freq   = nsum.nuc[2];
998                 break;
999             case 3:
1000                 p_allele = 'T';
1001                 p_freq   = nsum.nuc[3];
1002                 break;
1003             }
1004         }
1005         i++;
1006     }
1007     while (q_allele == 0 && i < 4) {
1008         if (nsum.nuc[i] > 0) {
1009             tot += nsum.nuc[i];
1010             switch(i) {
1011             case 1:
1012                 q_allele = 'C';
1013                 q_freq   = nsum.nuc[1];
1014                 break;
1015             case 2:
1016                 q_allele = 'G';
1017                 q_freq   = nsum.nuc[2];
1018                 break;
1019             case 3:
1020                 q_allele = 'T';
1021                 q_freq   = nsum.nuc[3];
1022                 break;
1023             }
1024         }
1025         i++;
1026     }
1027 
1028     p_freq = p_freq / tot;
1029     q_freq = 1 - p_freq;
1030 
1031     return 0;
1032 }
1033 
1034 int
write_dprime(string path,PhasedSummary * psum)1035 write_dprime(string path, PhasedSummary *psum)
1036 {
1037     //
1038     // Write the D' data for plotting as a heatmap.
1039     //
1040     string file = path + ".dprime.tsv";
1041 
1042     cerr << "Writing D' data to '" << file << "'...";
1043 
1044     ofstream fh(file.c_str(), ofstream::out);
1045 
1046     if (fh.fail()) {
1047         cerr << "Error opening D' file '" << file << "'\n";
1048         exit(1);
1049     }
1050 
1051     fh << "# Basepair 1\tBasepair 2\tD'\tCorrected D'\tVariance\tCI Low\tCI High\n";
1052 
1053     double dprime = 0.0;
1054 
1055     for (uint i = 0; i < psum->size; i++) {
1056         for (uint j = i+1; j < psum->size; j++) {
1057 
1058             if (psum->nucs[i].freq < minor_freq_lim ||
1059                 psum->nucs[j].freq < minor_freq_lim)
1060                 continue;
1061 
1062             dprime = psum->dprime[i][j].dprime;
1063 
1064             if (dprime_threshold)
1065                 dprime = dprime >= dprime_threshold_level ? 1.0 : 0.0;
1066 
1067             if (write_zeros == false && (dprime == 0.0 || psum->dprime[i][j].chisq_p == false))
1068                 continue;
1069 
1070             fh << psum->nucs[i].bp << "\t"
1071                << psum->nucs[j].bp << "\t"
1072                << std::setprecision(3) << dprime << "\t"
1073                << std::setprecision(3) << (psum->dprime[i][j].chisq_p ? dprime : 0.0) << "\t"
1074                << psum->dprime[i][j].var << "\t"
1075                << psum->dprime[i][j].ci_low  << "\t"
1076                << psum->dprime[i][j].ci_high << "\n";
1077         }
1078     }
1079 
1080     fh.close();
1081 
1082     cerr << "done.\n";
1083 
1084     return 0;
1085 }
1086 
1087 int
summarize_phased_genotypes(PhasedSummary * psum)1088 summarize_phased_genotypes(PhasedSummary *psum)
1089 {
1090     //
1091     // Construct a two dimensional array out of all the nucleotide arrays in the samples.
1092     //
1093     char **gtypes = new char *[psum->sample_cnt];
1094 
1095     for (uint i = 0; i < psum->sample_cnt; i++) {
1096         gtypes[i] = psum->samples[i].nucs_1;
1097     }
1098 
1099     //
1100     // Sum up the occurences of each nucleotide.
1101     //
1102     for (uint i = 0; i < psum->size; i++) {
1103         for (uint j = 0; j < psum->sample_cnt; j++) {
1104             switch(gtypes[j][i]) {
1105             case 'A':
1106                 psum->nucs[i].nuc[0]++;
1107                 break;
1108             case 'C':
1109                 psum->nucs[i].nuc[1]++;
1110                 break;
1111             case 'G':
1112                 psum->nucs[i].nuc[2]++;
1113                 break;
1114             case 'T':
1115                 psum->nucs[i].nuc[3]++;
1116                 break;
1117             case 'N':
1118             default:
1119                 break;
1120             }
1121         }
1122     }
1123 
1124     //
1125     // Repeat for the second set of phased genotypes.
1126     //
1127     for (uint i = 0; i < psum->sample_cnt; i++) {
1128         gtypes[i] = psum->samples[i].nucs_2;
1129     }
1130 
1131     //
1132     // Sum up the occurences of each nucleotide.
1133     //
1134     for (uint i = 0; i < psum->size; i++) {
1135         for (uint j = 0; j < psum->sample_cnt; j++) {
1136             switch(gtypes[j][i]) {
1137             case 'A':
1138                 psum->nucs[i].nuc[0]++;
1139                 break;
1140             case 'C':
1141                 psum->nucs[i].nuc[1]++;
1142                 break;
1143             case 'G':
1144                 psum->nucs[i].nuc[2]++;
1145                 break;
1146             case 'T':
1147                 psum->nucs[i].nuc[3]++;
1148                 break;
1149             case 'N':
1150             default:
1151                 break;
1152             }
1153         }
1154 
1155         //
1156         // Calculate minor allele frequency.
1157         //
1158         float tot  = (float) psum->sample_cnt * 2.0;
1159         float freq = 0.0;
1160         for (uint j = 0; j < 4; j++) {
1161             if (psum->nucs[i].nuc[j] > 0) {
1162                 freq = (float) psum->nucs[i].nuc[j] / tot;
1163                 psum->nucs[i].freq = freq < psum->nucs[i].freq ? freq : psum->nucs[i].freq;
1164             }
1165         }
1166     }
1167 
1168     delete [] gtypes;
1169 
1170     return 0;
1171 }
1172 
1173 //
1174 // Code to parse fastPhase format.
1175 //
1176 PhasedSummary *
parse_fastphase(string path)1177 parse_fastphase(string path)
1178 {
1179     ifstream    fh;
1180     char        line[max_len];
1181     string      buf, filepath;
1182     const char *p, *q, *end;
1183     int         i, sindex, pos;
1184 
1185     memset(line, '\0', max_len);
1186 
1187     //
1188     // Read in the original fastPhase export from Stacks to obtain the original base pair positions.
1189     //
1190     //
1191     // Open the file for reading
1192     //
1193     filepath = path + ".inp";
1194     fh.open(filepath.c_str(), ifstream::in);
1195 
1196     if (fh.fail()) {
1197         cerr << "Error opening input file '" << path << "'\n";
1198         return NULL;
1199     }
1200 
1201     cerr << "Parsing " << filepath << "...\n";
1202 
1203     int  num_samples, num_genotypes;
1204     char bp[id_len];
1205 
1206     //
1207     // Get the number of samples in the dataset.
1208     //
1209     fh.getline(line, max_len);
1210     num_samples = is_integer(line);
1211 
1212     if (num_samples < 0) {
1213         cerr << "Unable to find the number of samples, should be the first line.\n";
1214         return NULL;
1215     }
1216 
1217     //
1218     // Get the number of genotypes in the dataset.
1219     //
1220     fh.getline(line, max_len);
1221     num_genotypes = is_integer(line);
1222 
1223     if (num_genotypes < 0) {
1224         cerr << "Unable to find the number of genotypes, should be the second line.\n";
1225         return NULL;
1226     }
1227 
1228     PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
1229 
1230     //
1231     // Get the set of base pair positions.
1232     //
1233     buf.clear();
1234     do {
1235         fh.clear();
1236         fh.getline(line, max_len);
1237         buf += line;
1238     } while (fh.fail() && !fh.bad() && !fh.eof());
1239 
1240     i   = 0;
1241     p   = buf.c_str();
1242     end = p + buf.length();
1243 
1244     if (*p != 'P') {
1245         cerr << "Unable to locate line of basepair positions, should be the third line.\n";
1246         delete psum;
1247         return NULL;
1248     }
1249     for (p += 2, q = p; p < end; p++, q++) {
1250         while (*q != ' ' && q < end) {
1251             q++;
1252         }
1253         strncpy(bp, p, q - p);
1254         bp[q - p] = '\0';
1255         pos = is_integer(bp);
1256 
1257         if (pos < 0) {
1258             cerr << "Unable to parse base pair positions.\n";
1259             delete psum;
1260             return NULL;
1261         } else {
1262             psum->nucs[i].bp = (uint) pos;
1263         }
1264 
1265         i++;
1266         p = q;
1267     }
1268 
1269     fh.close();
1270 
1271     //
1272     // Open the file for reading
1273     //
1274     filepath = path + "_hapguess_switch.out";
1275     fh.open(filepath.c_str(), ifstream::in);
1276 
1277     if (fh.fail()) {
1278         cerr << "Error opening input file '" << path << "'\n";
1279         return NULL;
1280     }
1281 
1282     cerr << "Parsing " << filepath << "...\n";
1283 
1284     //
1285     // Read from the "*_hapguess_switch.out" file until we hit the genotypes section
1286     // marked by the string "BEGIN GENOTYPES".
1287     //
1288     do {
1289         fh.getline(line, max_len);
1290 
1291         if (!fh.good()) {
1292             cerr << "Unable to find file section entitled 'BEGIN GENOTYPES'\n";
1293             delete psum;
1294             return NULL;
1295         }
1296 
1297     } while (strcmp(line, "BEGIN GENOTYPES") != 0);
1298 
1299     //
1300     // Now read lines from the file in groups of three:
1301     //   1. Sample label
1302     //   2. Phased genotypes from chromosome 1
1303     //   3. Phased genotypes from chromosome 2
1304     // Stop reading individuals when we encounter the string, "END GENOTYPES".
1305     //
1306     fh.getline(line, max_len);
1307 
1308     do {
1309         //
1310         // Create a new Sample object and store the sample label.
1311         //
1312         sindex = psum->add_sample(line);
1313 
1314         //
1315         // Get the first set of phased genotypes.
1316         //
1317         buf.clear();
1318         do {
1319             fh.clear();
1320             fh.getline(line, max_len);
1321             buf += line;
1322         } while (fh.fail() && !fh.bad() && !fh.eof());
1323 
1324         //
1325         // Count the number of genotypes on this line (they should be space deliniated).
1326         //
1327         i = 0;
1328         for (p = buf.c_str(); *p != '\0'; p++)
1329             if (*p != ' ') psum->samples[sindex].size++;
1330         //
1331         // Store the genotypes into our internal buffer.
1332         //
1333         psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
1334         for (p = buf.c_str(); *p != '\0'; p++) {
1335             if (*p == ' ') continue;
1336             psum->samples[sindex].nucs_1[i] = *p;
1337             i++;
1338         }
1339 
1340         // len = strlen(line);
1341         // if (line[len - 1] == '\r') line[len - 1] = '\0';
1342 
1343         //
1344         // Get the second set of phased genotypes.
1345         //
1346         buf.clear();
1347         do {
1348             fh.clear();
1349             fh.getline(line, max_len);
1350             buf += line;
1351         } while (fh.fail() && !fh.bad() && !fh.eof());
1352 
1353         i = 0;
1354         psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
1355         for (p = buf.c_str(); *p != '\0'; p++) {
1356             if (*p == ' ') continue;
1357             psum->samples[sindex].nucs_2[i] = *p;
1358             i++;
1359         }
1360 
1361         //
1362         // Get the sample label of the next record.
1363         //
1364         fh.getline(line, max_len);
1365 
1366     } while (strcmp(line, "END GENOTYPES") != 0 && fh.good());
1367 
1368     fh.close();
1369 
1370     return psum;
1371 }
1372 
1373 //
1374 // Code to parse Beagle format.
1375 //
1376 PhasedSummary *
parse_beagle(map<int,CSLocus * > & catalog,string path)1377 parse_beagle(map<int, CSLocus *> &catalog, string path)
1378 {
1379     gzFile      gz_fh;
1380     char        *line;
1381     string      buf, filepath;
1382     const char *p, *q;
1383     uint        len, line_len, i, sindex;
1384     bool        eol;
1385 
1386     line_len = max_len;
1387     line = new char[line_len];
1388     memset(line, '\0', line_len);
1389 
1390     //
1391     // Open the Beagle file for reading
1392     //
1393     filepath = path + ".phased.gz";
1394     gz_fh = gzopen(filepath.c_str(), "rb");
1395     if (!gz_fh) {
1396         cerr << "Failed to open gzipped file '" << filepath << "': " << strerror(errno) << ".\n";
1397         return NULL;
1398     }
1399 
1400     cerr << "Parsing " << filepath << "...\n";
1401 
1402     vector<string> parts;
1403     uint num_samples   = 0;
1404     uint num_genotypes = 0;
1405     char cat_loc_str[id_len], col_str[id_len];
1406 
1407     //
1408     // Parse the file twice. On the first round:
1409     //   1. Determine the number of samples in the dataset (column count)
1410     //   2. Determine the number of markers (row count).
1411     // On the second round, parse the SNP genotypes.
1412     //
1413 
1414     //
1415     // Read each line in the file. If it starts with:
1416     //   '#' it is a comment, skip the line.
1417     //   'I' it is the list of samples, parse them.
1418     //   'S' is the population ID for each SNP, skip this line.
1419     //   'M' is a marker, count the number of markers.
1420     //
1421     do {
1422         eol = false;
1423         buf.clear();
1424         do {
1425             gzgets(gz_fh, line, line_len);
1426             buf += line;
1427 
1428             len = strlen(line);
1429             if (len > 0 && line[len - 1] == '\n') {
1430                 eol = true;
1431                 line[len - 1] = '\0';
1432             }
1433         } while (!gzeof(gz_fh) && !eol);
1434 
1435         if (line_len < buf.length()) {
1436             // cerr << "Resizing line buffer from " << line_len << " to " << buf.length() << "\n";
1437             delete [] line;
1438             line = new char[buf.length() + 1];
1439             line_len = buf.length() + 1;
1440             memset(line, '\0', line_len);
1441         }
1442 
1443         if (buf[0] == 'M') {
1444             num_genotypes++;
1445         } else if (buf[0] == 'I') {
1446             //
1447             // Count the number of samples.
1448             //
1449             parse_ssv(buf.c_str(), parts);
1450             num_samples = (parts.size() - 2) / 2;
1451         }
1452 
1453     } while (!gzeof(gz_fh));
1454 
1455     PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
1456 
1457     for (uint j = 2; j < parts.size(); j++) {
1458         if (j % 2 == 0) {
1459             sindex = psum->add_sample(parts[j]);
1460             psum->samples[sindex].size   = num_genotypes;
1461             psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
1462             psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
1463         }
1464     }
1465 
1466     cerr << "  Found " << num_samples << " samples; " << num_genotypes << " genotypes.\n";
1467 
1468     gzrewind(gz_fh);
1469 
1470     uint marker_num = 0;
1471     memset(line, '\0', line_len);
1472 
1473     do {
1474         do {
1475             gzgets(gz_fh, line, line_len);
1476         } while (!gzeof(gz_fh) && line[0] != 'M');
1477 
1478         len = strlen(line);
1479 
1480         if (len == 0) break;
1481         if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
1482 
1483         parse_ssv(line, parts);
1484 
1485         //
1486         // Parse the catalog locus ID and the column number of the SNP:
1487         //   e.g. LocId_column or 10329_37
1488         //
1489         p = parts[1].c_str();
1490         for (q = p + 1; *q != '_' && *q != '\0'; q++);
1491         strncpy(cat_loc_str, p, q - p);
1492         cat_loc_str[q-p] = '\0';
1493         q++;
1494         strcpy(col_str, q);
1495 
1496         psum->nucs[marker_num].clocus = is_integer(cat_loc_str);
1497         psum->nucs[marker_num].col    = is_integer(col_str);
1498 
1499         //
1500         // Store the genotypes into our internal buffer.
1501         //
1502         sindex = 0;
1503         i      = 2;
1504         while (i < parts.size()) {
1505             p = parts[i].c_str();
1506             psum->samples[sindex].nucs_1[marker_num] = *p;
1507             i++;
1508             p = parts[i].c_str();
1509             psum->samples[sindex].nucs_2[marker_num] = *p;
1510             i++;
1511             sindex++;
1512         }
1513 
1514         marker_num++;
1515 
1516     } while (!gzeof(gz_fh));
1517 
1518     gzclose(gz_fh);
1519 
1520     //
1521     // Use the catalog to look up the basepair positions for each catalog locus.
1522     //
1523     CSLocus *loc;
1524     for (i = 0; i < psum->size; i++) {
1525         loc = catalog[psum->nucs[i].clocus];
1526         psum->nucs[i].bp = loc->sort_bp(psum->nucs[i].col);
1527     }
1528 
1529     return psum;
1530 }
1531 
1532 //
1533 // Code to parse Beagle format.
1534 //
1535 PhasedSummary *
parse_beagle_haplotypes(map<int,CSLocus * > & catalog,string path)1536 parse_beagle_haplotypes(map<int, CSLocus *> &catalog, string path)
1537 {
1538     gzFile      gz_fh;
1539     char        *line;
1540     string      buf, filepath;
1541     const char *p;
1542     uint        len, line_len, i, j, sindex;
1543     bool        eol;
1544 
1545     line_len = max_len;
1546     line = new char[line_len];
1547     memset(line, '\0', line_len);
1548 
1549     //
1550     // Open the Beagle file for reading
1551     //
1552     filepath = path + ".phased.gz";
1553     gz_fh = gzopen(filepath.c_str(), "rb");
1554     if (!gz_fh) {
1555         cerr << "Failed to open gzipped file '" << filepath << "': " << strerror(errno) << ".\n";
1556         return NULL;
1557     }
1558 
1559     cerr << "Parsing " << filepath << "...\n";
1560 
1561     vector<string> parts, samples;
1562     uint num_samples   = 0;
1563     uint num_genotypes = 0;
1564     uint cat_loc;
1565 
1566     //
1567     // Parse the file twice. On the first round:
1568     //   1. Determine the number of samples in the dataset (column count)
1569     //   2. Determine the number of markers (row count).
1570     // On the second round, parse the SNP genotypes.
1571     //
1572 
1573     //
1574     // Read each line in the file. If it starts with:
1575     //   '#' it is a comment, skip the line.
1576     //   'I' it is the list of samples, parse them.
1577     //   'S' is the population ID for each SNP, skip this line.
1578     //   'M' is a marker, count the number of markers.
1579     //
1580     do {
1581         eol = false;
1582         buf.clear();
1583         do {
1584             gzgets(gz_fh, line, line_len);
1585             buf += line;
1586 
1587             len = strlen(line);
1588             if (len > 0 && line[len - 1] == '\n') {
1589                 eol = true;
1590                 line[len - 1] = '\0';
1591             }
1592         } while (!gzeof(gz_fh) && !eol);
1593 
1594         if (line_len < buf.length()) {
1595             // cerr << "Resizing line buffer from " << line_len << " to " << buf.length() << "\n";
1596             delete [] line;
1597             line = new char[buf.length() + 1];
1598             line_len = buf.length() + 1;
1599             memset(line, '\0', line_len);
1600         }
1601 
1602         if (buf[0] == 'M') {
1603             //
1604             // Count the number of genotypes by counting the number or nucleotides in each
1605             // haplotype for each marker.
1606             //
1607             parse_ssv(buf.c_str(), parts);
1608             num_genotypes += parts[2].length();
1609 
1610         } else if (buf[0] == 'I') {
1611             //
1612             // Count the number of samples.
1613             //
1614             parse_ssv(buf.c_str(), samples);
1615             num_samples = (samples.size() - 2) / 2;
1616         }
1617 
1618     } while (!gzeof(gz_fh));
1619 
1620     PhasedSummary *psum = new PhasedSummary(num_samples, num_genotypes);
1621 
1622     for (uint j = 2; j < samples.size(); j++) {
1623         if (j % 2 == 0) {
1624             sindex = psum->add_sample(samples[j]);
1625             psum->samples[sindex].size   = num_genotypes;
1626             psum->samples[sindex].nucs_1 = new char[psum->samples[sindex].size];
1627             psum->samples[sindex].nucs_2 = new char[psum->samples[sindex].size];
1628         }
1629     }
1630 
1631     cerr << "  Found " << num_samples << " samples; " << num_genotypes << " genotypes.\n";
1632 
1633     gzrewind(gz_fh);
1634 
1635     CSLocus *loc;
1636     uint hap_len    = 0;
1637     uint marker_num = 0;
1638     memset(line, '\0', line_len);
1639 
1640     do {
1641         do {
1642             gzgets(gz_fh, line, line_len);
1643         } while (!gzeof(gz_fh) && line[0] != 'M');
1644 
1645         len = strlen(line);
1646 
1647         if (len == 0) break;
1648         if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
1649 
1650         parse_ssv(line, parts);
1651 
1652         //
1653         // Use the catalog to look up the basepair positions for each catalog locus.
1654         //
1655         cat_loc = is_integer(parts[1].c_str());
1656         loc     = catalog[cat_loc];
1657         hap_len = parts[2].length();
1658 
1659         if (hap_len != loc->snps.size())
1660             cerr << "Haplotypes don't match between catalog and beagle; Locus ID: " << loc->id << "; beagle hap len: " << hap_len << "; catalog hap len: " << loc->snps.size() << "\n";
1661 
1662         for (j = 0, i = marker_num; i < marker_num + hap_len; i++, j++) {
1663             psum->nucs[i].clocus = cat_loc;
1664             psum->nucs[i].col    = loc->snps[j]->col;
1665             psum->nucs[i].bp     = loc->sort_bp(psum->nucs[i].col);
1666         }
1667 
1668         //
1669         // Store the genotypes into our internal buffer.
1670         //
1671         sindex = 0;
1672         i      = 2;
1673         while (i < parts.size()) {
1674             p = parts[i].c_str();
1675             for (j = marker_num; j < marker_num + hap_len; j++) {
1676                 psum->samples[sindex].nucs_1[j] = *p;
1677                 p++;
1678             }
1679             i++;
1680             p = parts[i].c_str();
1681             for (j = marker_num; j < marker_num + hap_len; j++) {
1682                 psum->samples[sindex].nucs_2[j] = *p;
1683                 p++;
1684             }
1685             i++;
1686             sindex++;
1687         }
1688 
1689         marker_num += hap_len;
1690 
1691     } while (!gzeof(gz_fh));
1692 
1693     gzclose(gz_fh);
1694 
1695     return psum;
1696 }
1697 
1698 int
parse_population_map(string popmap_path,map<string,int> & pop_map,map<int,int> & pop_cnts)1699 parse_population_map(string popmap_path, map<string, int> &pop_map, map<int, int> &pop_cnts)
1700 {
1701     char   line[max_len];
1702     char   pop_id_str[id_len];
1703     vector<string> parts;
1704     uint   len;
1705 
1706     if (pmap_path.length() == 0)
1707         return 0;
1708 
1709     cerr << "Parsing population map.\n";
1710 
1711     ifstream fh(popmap_path.c_str(), ifstream::in);
1712 
1713     if (fh.fail()) {
1714         cerr << "Error opening population map '" << popmap_path << "'\n";
1715         return 0;
1716     }
1717 
1718     while (fh.good()) {
1719         fh.getline(line, max_len);
1720 
1721         len = strlen(line);
1722         if (len == 0) continue;
1723 
1724         //
1725         // Check that there is no carraige return in the buffer.
1726         //
1727         if (line[len - 1] == '\r') line[len - 1] = '\0';
1728 
1729         //
1730         // Ignore comments
1731         //
1732         if (line[0] == '#') continue;
1733 
1734         //
1735         // Parse the population map, we expect:
1736         // <file name> <tab> <population ID>
1737         //
1738         parse_tsv(line, parts);
1739 
1740         if (parts.size() != 2) {
1741             cerr << "Population map is not formated correctly: expecting two, tab separated columns, found " << parts.size() << ".\n";
1742             return 0;
1743         }
1744 
1745         strncpy(pop_id_str, parts[1].c_str(), id_len);
1746         for (int i = 0; i < id_len && pop_id_str[i] != '\0'; i++)
1747             if (!isdigit(pop_id_str[i])) {
1748                 cerr << "Population map is not formated correctly: expecting numerical ID in second column, found '" << parts[1] << "'.\n";
1749                 return 0;
1750             }
1751 
1752         //
1753         // Add the sample name to population number mapping.
1754         //
1755         pop_map[parts[0]] = atoi(parts[1].c_str());
1756         if (pop_cnts.count(atoi(parts[1].c_str())) == 0)
1757             pop_cnts[atoi(parts[1].c_str())] = 1;
1758         else
1759             pop_cnts[atoi(parts[1].c_str())]++;
1760     }
1761 
1762     fh.close();
1763 
1764     return 0;
1765 }
1766 
1767 int
build_file_list(vector<pair<int,string>> & files)1768 build_file_list(vector<pair<int, string> > &files)
1769 {
1770     vector<string> parts;
1771     string pattern;
1772 
1773     //
1774     // Read all the files from the Stacks directory.
1775     //
1776     uint   pos;
1777     string file;
1778     struct dirent *direntry;
1779 
1780     DIR *dir = opendir(in_path.c_str());
1781 
1782     if (dir == NULL) {
1783         cerr << "Unable to open directory '" << in_path << "' for reading.\n";
1784         exit(1);
1785     }
1786 
1787     switch(in_file_type) {
1788     case FileT::beagle:
1789         pattern = ".phased.gz";
1790         break;
1791     case FileT::fastphase:
1792     default:
1793         pattern = "_hapguess_switch.out";
1794         break;
1795     }
1796 
1797     while ((direntry = readdir(dir)) != NULL) {
1798         file = direntry->d_name;
1799 
1800         if (file == "." || file == "..")
1801             continue;
1802 
1803         pos = file.rfind(pattern);
1804         if (pos < file.length())
1805             files.push_back(make_pair(1, file.substr(0, pos)));
1806     }
1807 
1808     closedir(dir);
1809 
1810     if (files.size() == 0) {
1811         cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
1812         return 0;
1813     }
1814 
1815     return 1;
1816 }
1817 
parse_command_line(int argc,char * argv[])1818 int parse_command_line(int argc, char* argv[]) {
1819     int c;
1820 
1821     while (1) {
1822         static struct option long_options[] = {
1823             {"help",        no_argument,       NULL, 'h'},
1824             {"version",     no_argument,       NULL, 'v'},
1825             {"haplotypes",  no_argument,       NULL, 'H'},
1826             {"skip-zeros",  no_argument,       NULL, 'Z'}, {"skip_zeros",  no_argument,       NULL, 'Z'},
1827             {"infile-type", required_argument, NULL, 't'}, {"infile_type", required_argument, NULL, 't'},
1828             {"num-threads", required_argument, NULL, 'p'}, {"num_threads", required_argument, NULL, 'p'},
1829             {"in-path",     required_argument, NULL, 'P'}, {"in_path",     required_argument, NULL, 'P'},
1830             {"cat-path",    required_argument, NULL, 'S'}, {"cat_path",    required_argument, NULL, 'S'},
1831             {"pop-map",     required_argument, NULL, 'M'}, {"pop_map",     required_argument, NULL, 'M'},
1832             {"batch-id",    required_argument, NULL, 'b'}, {"batch_id",    required_argument, NULL, 'b'},
1833             {"dprime-bin-size",   required_argument, NULL, 'B'}, {"dprime_bin_size",   required_argument, NULL, 'B'},
1834             {"minor-allele-freq", required_argument, NULL, 'a'}, {"minor_allele_freq", required_argument, NULL, 'a'},
1835             {"min-inform-pairs",  required_argument, NULL, 'm'}, {"min_inform_pairs",  required_argument, NULL, 'm'},
1836             {"dprime-threshold",  required_argument, NULL, 'T'}, {"dprime_threshold",  required_argument, NULL, 'T'},
1837             {0, 0, 0, 0}
1838         };
1839 
1840         // getopt_long stores the option index here.
1841         int option_index = 0;
1842 
1843         c = getopt_long(argc, argv, "hvZHAb:M:t:P:S:p:a:B:T:", long_options, &option_index);
1844 
1845         // Detect the end of the options.
1846         if (c == -1)
1847             break;
1848 
1849         switch (c) {
1850         case 'h':
1851             help();
1852             break;
1853         case 'b':
1854             batch_id = is_integer(optarg);
1855             if (batch_id < 0) {
1856                 cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
1857                 help();
1858             }
1859             break;
1860         case 'p':
1861             num_threads = atoi(optarg);
1862             break;
1863         case 'a':
1864             minor_freq_lim = atof(optarg);
1865             break;
1866         case 'm':
1867             min_inform_pairs = atof(optarg);
1868             break;
1869         case 'P':
1870             in_path = optarg;
1871             break;
1872         case 'S':
1873             cat_path = optarg;
1874             break;
1875         case 't':
1876             if (strcasecmp(optarg, "phase") == 0)
1877                 in_file_type = FileT::phase;
1878             else if (strcasecmp(optarg, "fastphase") == 0)
1879                 in_file_type = FileT::fastphase;
1880             else if (strcasecmp(optarg, "beagle") == 0)
1881                 in_file_type = FileT::beagle;
1882             else
1883                 in_file_type = FileT::unknown;
1884             break;
1885         case 'M':
1886             pmap_path = optarg;
1887             break;
1888         case 'H':
1889             haplotypes = true;
1890             break;
1891         case 'Z':
1892             write_zeros = false;
1893             break;
1894         case 'B':
1895             bucket_dist = atoi(optarg);
1896             break;
1897         case 'T':
1898             dprime_threshold = true;
1899             dprime_threshold_level = atof(optarg);
1900             break;
1901         case 'v':
1902             version();
1903             break;
1904         case '?':
1905             // getopt_long already printed an error message.
1906             help();
1907             break;
1908         default:
1909             help();
1910             exit(1);
1911         }
1912     }
1913 
1914     if (in_path.length() == 0) {
1915         cerr << "You must specify a path to the directory containing Stacks output files.\n";
1916         help();
1917     }
1918 
1919     if (in_path.at(in_path.length() - 1) != '/')
1920         in_path += "/";
1921 
1922     if (minor_freq_lim > 0) {
1923         if (minor_freq_lim > 1)
1924             minor_freq_lim = minor_freq_lim / 100;
1925 
1926         if (minor_freq_lim > 0.5) {
1927             cerr << "Unable to parse the minor allele frequency\n";
1928             help();
1929         }
1930     }
1931 
1932     if (min_inform_pairs > 0) {
1933         if (min_inform_pairs > 1)
1934             min_inform_pairs = min_inform_pairs / 100;
1935     }
1936 
1937     return 0;
1938 }
1939 
version()1940 void version() {
1941     cerr << "phasedstacks " << VERSION << "\n\n";
1942 
1943     exit(1);
1944 }
1945 
help()1946 void help() {
1947     cerr << "phasedstacks " << VERSION << "\n"
1948               << "phasedstacks -b id -S path -P path -t file_type [-p threads] [-M popmap] [-v] [-h]" << "\n"
1949               << "  b: Stacks batch ID.\n"
1950               << "  P: path to the phased output files.\n"
1951               << "  S: path to the Stacks output files.\n"
1952               << "  t: input file type. Supported types: fastphase, and beagle.\n"
1953               << "  p: number of processes to run in parallel sections of code.\n"
1954               << "  M: path to the population map, a tab separated file describing which individuals belong in which population.\n"
1955               << "  v: print program version." << "\n"
1956               << "  h: display this help messsage." << "\n"
1957               << "  --haplotypes: data were phased as RAD locus haplotypes.\n"
1958               << "  --dprime-bin-size: size of buckets for binning SNPs at a particular distance to calculate the mean D' value.\n"
1959               << "  --dprime-threshold <val>: if D' values fall above <val>, set the D' to 1, otherwise set D' to 0.\n\n"
1960               << "  Filtering options:\n"
1961               << "  --skip-zeros: do not include D' values of zero in the D' output.\n"
1962               << "  --minor-allele-freq: specify a minimum minor allele frequency required to process a nucleotide site (0 < a < 0.5).\n"
1963               << "  --min-inform-pairs: when building D' haplotype blocks, the minimum number of informative D' measures to combine two blocks (default 0.9).\n\n";
1964 
1965 
1966     exit(1);
1967 }
1968