1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2012-2019, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // populations -- generate population genetic statistics and output
23 // haplotypes in a population context.
24 //
25
26 #include <cctype>
27 #include <typeinfo>
28
29 #include "populations.h"
30 #include "export_formats.h"
31
32 using namespace std;
33
34 // Global variables to hold command-line options.
35 InputMode input_mode = InputMode::stacks2;
36 int num_threads = 1;
37 bool quiet = false;
38 string in_path;
39 string in_vcf_path;
40 string out_path;
41 string pmap_path;
42 string bl_file;
43 string wl_file;
44 string bs_wl_file;
45 string enz;
46 double sigma = 150000.0;
47 double min_samples_overall = 0.0;
48 double min_samples_per_pop = 0.0;
49 int min_populations = 1;
50 bool filter_haplotype_wise = false;
51 int batch_size = 10000;
52 bool calc_fstats = false;
53 bool calc_hwp = false;
54 bool bootstrap = false;
55 bool bootstrap_fst = false;
56 bool bootstrap_pifis = false;
57 bool bootstrap_phist = false;
58 bool bootstrap_div = false;
59 bs_type bootstrap_type = bs_exact;
60 int bootstrap_reps = 100;
61 bool bootstrap_wl = false;
62 bool write_single_snp = false;
63 bool write_random_snp = false;
64 bool merge_sites = false;
65 bool write_gtypes = false;
66 bool ordered_export = false;
67 bool smooth_fstats = false;
68 bool smooth_popstats = false;
69 bool loci_ordered = false;
70 bool log_fst_comp = false;
71 bool verbose = false;
72 size_t min_gt_depth = 0;
73 double merge_prune_lim = 1.0;
74 double minor_allele_freq = 0.0;
75 long minor_allele_cnt = 0;
76 double max_obs_het = 1.0;
77 double p_value_cutoff = 0.05;
78 corr_type fst_correction = no_correction;
79 set<string> debug_flags;
80
81 string out_prefix;
82 MetaPopInfo mpopi;
83 set<int> bootstraplist;
84
85 //
86 // Hold information about restriction enzymes
87 //
88 map<string, const char **> renz;
89 map<string, int> renz_cnt;
90 map<string, int> renz_len;
91 map<string, int> renz_olap;
92
93 const int max_snp_dist = 500;
94
95 vector<Export *> exports;
96 template<typename E>
find_export()97 vector<Export*>::iterator find_export()
98 {
99 return std::find_if(
100 exports.begin(), exports.end(),
101 [](Export* e){ return typeid(*e) == typeid(E); }
102 );
103 }
104 template<typename E>
add_export()105 void add_export()
106 {
107 // Check that the export isn't already present and add it.
108 if (find_export<E>() == exports.end())
109 exports.push_back(new E());
110 }
111
main(int argc,char * argv[])112 int main (int argc, char* argv[])
113 {
114 unique_ptr<LogAlterator> logger;
115 try {
116
117 #ifndef HAVE_LIBZ
118 cout << "Stacks was compiled without zlib, and will refuse to parse compressed files.\n";
119 #endif
120
121 //
122 // Initialize the globals that need it.
123 //
124 initialize_renz(renz, renz_cnt, renz_len);
125 initialize_renz_olap(renz_olap);
126 srandom(time(NULL));
127
128 //
129 // Parse the command line.
130 //
131 parse_command_line(argc, argv);
132
133 //
134 // Open and initialize the log file.
135 //
136 logger.reset(new LogAlterator(out_path + out_prefix, true, quiet, argc, argv));
137 output_parameters(cout);
138
139 //
140 // Set the number of OpenMP parallel threads to execute.
141 //
142 #ifdef _OPENMP
143 omp_set_num_threads(num_threads);
144 #endif
145
146 //
147 // Read the population map file, if any.
148 //
149 if (!pmap_path.empty()) {
150 cout << "Parsing population map...\n";
151 mpopi.init_popmap(pmap_path);
152 cout << "The population map contained " << mpopi.n_samples() << " samples, "
153 << mpopi.pops().size() << " population(s), " << mpopi.groups().size() << " group(s).\n";
154 } else {
155 cout << "A population map was not specified, all samples will be read from '"
156 << in_path << "' as a single popultaion.\n";
157 }
158
159 //
160 // Locate and open input files, read VCF headers, parse population map, load black/white lists.
161 //
162 BatchLocusProcessor bloc(input_mode, batch_size, &mpopi);
163
164 bloc.init(in_path, pmap_path);
165
166 //
167 // Report information on the structure of the populations specified.
168 //
169 mpopi.status(cout);
170
171 if (size_t(min_populations) > mpopi.pops().size()) {
172 cout << "Notice: Population limit (" << min_populations << ")"
173 << " larger than number of popualtions present, adjusting parameter to "
174 << mpopi.pops().size() << "\n";
175 min_populations = mpopi.pops().size();
176 }
177
178 //
179 // Setup the default data exports.
180 //
181 exports.push_back(new MarkersExport());
182 exports.push_back(new GenotypesExport());
183 exports.push_back(new SumstatsExport());
184 exports.push_back(new HapstatsExport());
185
186 SnpDivergenceExport *sdiv_exp = NULL;
187 HapDivergenceExport *hdiv_exp = NULL;
188 if (calc_fstats) {
189 sdiv_exp = new SnpDivergenceExport(logger->x);
190 exports.push_back(sdiv_exp);
191 hdiv_exp = new HapDivergenceExport();
192 exports.push_back(hdiv_exp);
193 }
194
195 //
196 // Setup the kernel smoothing apparatus.
197 //
198 LocusSmoothing *smooth = NULL;
199 if (smooth_fstats || smooth_popstats)
200 smooth = new LocusSmoothing(&mpopi, logger->x);
201
202 //
203 // Setup the divergence statistics calculator, if requested.
204 //
205 LocusDivergence *ldiv = NULL;
206 if (calc_fstats)
207 ldiv = new LocusDivergence(&mpopi);
208
209 //
210 // Open the export files and write any headers.
211 //
212 for (uint i = 0; i < exports.size(); i++) {
213 exports[i]->open((const MetaPopInfo *) &mpopi);
214 exports[i]->write_header();
215 }
216
217 //
218 // Initialize the summary statistics object which will accumulate the metapopulation summary statistics
219 // as the individual loci are read and processed.
220 //
221 SumStatsSummary sumstats(mpopi.pops().size());
222 size_t n_sites = 0;
223 size_t n_multiloci_sites = 0;
224
225 const LocusFilter &filter = bloc.filter();
226 cout << "\nProcessing data in batches:\n"
227 << " * load a batch of catalog loci and apply filters\n"
228 << " * compute SNP- and haplotype-wise per-population statistics\n";
229 if (calc_hwp)
230 cout << " * compute SNP- and haplotype-wise deviation from HWE\n";
231 if (calc_fstats)
232 cout << " * compute F-statistics\n";
233 if (smooth_popstats)
234 cout << " * smooth per-population statistics\n";
235 if (smooth_fstats)
236 cout << " * smooth F-statistics\n";
237 cout << " * write the above statistics in the output files\n"
238 << " * export the genotypes/haplotypes in specified format(s)\n"
239 << "More details in '" << logger->distribs_path << "'.\n"
240 << "Now processing...\n"
241 << flush;
242
243 Timer timer;
244 logger->x << "\nBEGIN batch_progress\n";
245 while (true) {
246 //
247 // Read the next set of loci to process.
248 // - If data are denovo, load blim._batch_size loci.
249 // - If data are reference aligned, load one chromosome.
250 // - Filter the loci according to command line parameters (-r, -p, --maf, --write-single-snp, etc.)
251 // - Sort the loci by basepair if they are ordered.
252 //
253 size_t loc_cnt = bloc.next_batch(logger->x);
254 if (filter.batch_seen() == 0)
255 break;
256
257 if (loci_ordered) {
258 cout << bloc.chr() << " " << flush;
259 logger->x << bloc.chr();
260 } else {
261 cout << "Batch " << bloc.next_batch_number() - 1 << " " << flush;
262 logger->x << "Batch " << bloc.next_batch_number() - 1;
263 }
264 logger->x << ": analyzed "
265 << filter.batch_total() << " loci; filtered "
266 << filter.batch_filtered() << " loci; "
267 << filter.batch_seen() << " loci seen.\n";
268
269 if (loc_cnt == 0)
270 break;
271 assert(!bloc.loci().empty());
272
273 sumstats.accumulate(bloc.loci());
274
275 //
276 // Calculate haplotype and gene diversity, Hardy-Weinberg proportions per locus per population.
277 //
278 bloc.hapstats(logger->x);
279
280 //
281 // Calculate and report the extent of overlap between different RAD loci.
282 //
283 if (loci_ordered) {
284 size_t chr_n_sites;
285 size_t chr_n_multiloci_sites;
286 bloc.report_locus_overlap(chr_n_sites, chr_n_multiloci_sites, (verbose ? &logger->x : NULL));
287 n_sites += chr_n_sites;
288 n_multiloci_sites += chr_n_multiloci_sites;
289 logger->x << " " << chr_n_sites << " genomic sites, of which "
290 << chr_n_multiloci_sites << " were covered by multiple loci ("
291 << as_percentage(chr_n_multiloci_sites, chr_n_sites) << ").\n";
292 }
293
294 //
295 // Calculate divergence statistics (Fst), if requested.
296 //
297 if (calc_fstats) {
298 logger->x << " Calculating F statistics...";
299 ldiv->snp_divergence(bloc.loci());
300 ldiv->haplotype_divergence_pairwise(bloc.loci());
301 ldiv->haplotype_divergence(bloc.loci());
302 logger->x << "done.\n";
303 }
304
305 //
306 // Smooth population statistics across individual populations, and between populations.
307 //
308 if ( (smooth_fstats || smooth_popstats) && loci_ordered == false) {
309 logger->x << " Notice: Smoothing was requested (-k), but will not be performed as the loci are not ordered.\n";
310 } else if (smooth_fstats || smooth_popstats) {
311 logger->x << " Generating kernel-smoothed population statistics...";
312 if (smooth_popstats) {
313 smooth->snpstats(bloc.loci(), logger->x);
314 smooth->hapstats(bloc.loci(), logger->x);
315 }
316 if (smooth_fstats) {
317 smooth->snp_divergence(bloc.loci(), ldiv->snp_values(), logger->x);
318 smooth->hap_divergence(bloc.loci(), ldiv->haplotype_values(), ldiv->metapop_haplotype_values(), logger->x);
319 }
320 logger->x << "done.\n";
321 }
322
323 //
324 // Export this subset of the loci.
325 //
326 for (uint i = 0; i < exports.size(); i++)
327 exports[i]->write_batch(bloc.loci());
328
329 if (calc_fstats) {
330 sdiv_exp->write_batch_pairwise(bloc.loci(), ldiv->snp_values());
331 hdiv_exp->write_batch_pairwise(bloc.loci(), ldiv->haplotype_values(), ldiv->metapop_haplotype_values());
332 ldiv->clear(bloc.loci());
333 }
334 logger->x << flush;
335 timer.update();
336 #ifdef DEBUG
337 cout << "(" << (size_t) timer.elapsed() << "s)\n" << flush;
338 #else
339 cout << "\n";
340 #endif
341
342 }
343 logger->x << "END batch_progress\n";
344
345 //
346 // Report what we read from the input files.
347 //
348 bloc.summarize(cout);
349
350 cout << "\n"
351 << "Removed " << filter.filtered() << " loci that did not pass sample/population constraints from " << filter.seen() << " loci.\n"
352 << "Kept " << filter.total() << " loci, composed of " << filter.total_sites() << " sites; "
353 << filter.filtered_sites() << " of those sites were filtered, " << filter.variant_sites() << " variant sites remained.\n";
354 if (loci_ordered)
355 cout << " " << n_sites << " genomic sites, of which "
356 << n_multiloci_sites << " were covered by multiple loci ("
357 << as_percentage(n_multiloci_sites, n_sites) << ").\n";
358 if (filter.total_sites() == 0) {
359 cerr << "Error: All data has been filtered out.\n";
360 throw exception();
361 }
362
363 //
364 // Do the final sumstats calculations and write the sumstats summary files.
365 //
366 sumstats.final_calculation();
367 sumstats.write_results();
368
369 if (calc_hwp) {
370 cout << "Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<" << p_value_cutoff << "):\n";
371 for (uint j = 0; j < mpopi.pops().size(); j++)
372 cout << " "
373 << mpopi.pops()[j].name << ": "
374 << bloc._sig_hwe_dev[j] << "\n";
375 cout << "(more detail in populations.sumstats.tsv and populations.hapstats.tsv)\n";
376 }
377
378 if (calc_fstats)
379 ldiv->write_summary(out_path + out_prefix);
380
381 //
382 // Write out the distributions of catalog loci.
383 //
384 bloc.write_distributions(logger->x);
385
386 if (smooth_fstats || smooth_popstats)
387 delete smooth;
388 if (calc_fstats)
389 delete ldiv;
390
391 //
392 // Close the export files and do any required post processing.
393 //
394 for (uint i = 0; i < exports.size(); i++) {
395 exports[i]->post_processing();
396 exports[i]->close();
397 delete exports[i];
398 }
399
400 cout << "Populations is done.\n";
401 return 0;
402
403 } catch (exception& e) {
404 return stacks_handle_exceptions(e);
405 }
406 }
407
408 int
init(string in_path,string pmap_path)409 BatchLocusProcessor::init(string in_path, string pmap_path)
410 {
411 //
412 // Read the blacklist and whitelist to control which loci we load.
413 //
414 int cnt;
415 if (bl_file.length() > 0) {
416 cnt = this->_loc_filter.load_blacklist(bl_file);
417 cout << "Loaded " << cnt << " blacklisted markers.\n";
418 }
419 if (wl_file.length() > 0) {
420 cnt = this->_loc_filter.load_whitelist(wl_file);
421 cout << "Loaded " << cnt << " whitelisted markers.\n";
422 this->_user_supplied_whitelist = true;
423 }
424
425 if (this->_input_mode == InputMode::vcf)
426 this->init_external_loci(in_vcf_path, pmap_path);
427 else
428 this->init_stacks_loci(in_path, pmap_path);
429
430 //
431 // Initialize our per-population haplotype counters.
432 //
433 this->_sig_hwe_dev = new size_t[this->_mpopi->pops().size()];
434 memset(this->_sig_hwe_dev, 0, sizeof(size_t)*this->_mpopi->pops().size());
435
436 return 0;
437 }
438
439 size_t
next_batch(ostream & log_fh)440 BatchLocusProcessor::next_batch(ostream &log_fh)
441 {
442 this->_batch_num++;
443 if (this->_input_mode == InputMode::vcf)
444 return this->next_batch_external_loci(log_fh);
445 else
446 return this->next_batch_stacks_loci(log_fh);
447 }
448
batch_clear()449 void BatchLocusProcessor::batch_clear()
450 {
451 for (LocBin* loc : this->_loci)
452 delete loc;
453 this->_loci.clear();
454 this->_chr.clear();
455 this->_loc_filter.batch_clear();
456 }
457
458 int
init_stacks_loci(string in_path,string pmap_path)459 BatchLocusProcessor::init_stacks_loci(string in_path, string pmap_path)
460 {
461 //
462 // Open the files.
463 //
464 string catalog_fa_path = in_path + "catalog.fa.gz";
465 string catalog_vcf_path = in_path + "catalog.calls";
466
467 this->_fasta_reader.open(catalog_fa_path);
468 this->_cloc_reader.open(catalog_vcf_path);
469
470 // Create the population map or check that all samples have data.
471 if (pmap_path.empty()) {
472 cout << "No population map specified, using all samples...\n";
473 this->_mpopi->init_names(this->cloc_reader().header().samples());
474 } else {
475 size_t n_samples_before = this->_mpopi->n_samples();
476 this->_mpopi->intersect_with(this->cloc_reader().header().samples());
477 size_t n_rm_samples = n_samples_before - this->_mpopi->n_samples();
478
479 if (n_rm_samples > 0) {
480 cerr << "Warning: No genotype data exists for " << n_rm_samples
481 << " of the samples listed in the population map.\n";
482 if (this->_mpopi->samples().empty()) {
483 cerr << "Error: No more samples.\n";
484 throw exception();
485 }
486 }
487 }
488
489 this->cloc_reader().set_sample_ids(*this->_mpopi);
490
491 //
492 // Initialize the locus filter after we have constructed the population map.
493 //
494 this->_loc_filter.init(this->pop_info());
495
496 //
497 // Create the {sample_vcf_index : sample_popmap_index} table.
498 //
499 const vector<string>& vcf_samples = this->_cloc_reader.header().samples();
500 this->_samples_vcf_to_mpopi.assign(vcf_samples.size(), SIZE_MAX);
501 for (size_t vcf_i=0; vcf_i<vcf_samples.size(); ++vcf_i) {
502 size_t mpopi_i = this->_mpopi->get_sample_index(vcf_samples[vcf_i], false);
503 if (mpopi_i != SIZE_MAX)
504 this->_samples_vcf_to_mpopi[vcf_i] = mpopi_i;
505 }
506
507 return 0;
508 }
509
510 size_t
next_batch_stacks_loci(ostream & log_fh)511 BatchLocusProcessor::next_batch_stacks_loci(ostream &log_fh)
512 {
513 this->batch_clear();
514
515 //
516 // Check if we queued a LocBin object from the last round of reading.
517 //
518 if (this->_next_loc != NULL) {
519 this->_loci.push_back(this->_next_loc);
520 this->_next_loc = NULL;
521 this->_loc_filter.locus_seen();
522 this->_loc_filter.keep_locus(this->_loci.back());
523 }
524
525 Seq seq;
526 seq.id = new char[id_len];
527 vector<VcfRecord> records;
528 while(this->_cloc_reader.read_one_locus(records)) {
529 this->_loc_filter.locus_seen();
530
531 //
532 // Get the current locus ID and find the corresponding fasta record.
533 // (Note: c-loci with very low coverage might be entirely missing from
534 // the VCF; in this case ignore them.)
535 //
536 assert(!records.empty());
537 int cloc_id = is_integer(records[0].chrom());
538 assert(cloc_id >= 0);
539 do {
540 int rv = this->_fasta_reader.next_seq(seq);
541 if (rv == 0) {
542 cerr << "Error: catalog VCF and FASTA files are discordant, maybe trucated. rv: "
543 << rv << "; cloc_id: " << cloc_id << "\n";
544 throw exception();
545 }
546 } while (is_integer(seq.id) != cloc_id);
547
548 //
549 // Check if this locus is white/blacklisted.
550 //
551 if (this->_loc_filter.whitelist_filter(cloc_id) ||
552 this->_loc_filter.blacklist_filter(cloc_id))
553 continue;
554
555 //
556 // Create and populate a new catalog locus & the associated genotypes.
557 //
558 LocBin* loc = new LocBin(this->_mpopi->n_samples());
559 loc->cloc = new CSLocus();
560 loc->d = new Datum *[this->_mpopi->n_samples()];
561 for (size_t i=0; i<this->_mpopi->n_samples(); ++i)
562 loc->d[i] = NULL;
563
564 PopMap<CSLocus>::populate_internal(
565 loc->cloc, loc->d,
566 seq, records,
567 this->_cloc_reader.header(), this->_mpopi, this->_samples_vcf_to_mpopi);
568
569 //
570 // Apply locus & SNP filters.
571 //
572 this->_dists.accumulate_pre_filtering(loc->sample_cnt, loc->cloc);
573 this->_loc_filter.whitelist_snp_filter(*loc);
574 if (this->_loc_filter.apply_filters_stacks(*loc, log_fh, *this->_mpopi)) {
575 delete loc;
576 continue;
577 }
578 assert(loc->s != NULL);
579
580 //
581 // Tabulate haplotypes present and in what combinations.
582 //
583 tabulate_locus_haplotypes(loc->cloc, loc->d, this->_mpopi->n_samples());
584
585 //
586 // Detect the end of batch.
587 //
588 loci_ordered = !loc->cloc->loc.empty();
589 if (loci_ordered) {
590 //
591 // Ref-based.
592 //
593 if (this->_chr.empty()) {
594 this->_chr = loc->cloc->loc.chr();
595 } else if (this->_chr.compare(loc->cloc->loc.chr()) != 0) {
596 if (!this->_loci.empty()) {
597 this->_next_loc = loc;
598 this->_loc_filter.locus_unsee();
599 break;
600 } else {
601 //
602 // No loci were analyzed on the previous chromosome, keep processing
603 // the current chromosome without returning.
604 //
605 this->_chr = loc->cloc->loc.chr();
606 }
607 }
608 this->_loci.push_back(loc);
609 this->_loc_filter.keep_locus(loc);
610 } else {
611 //
612 // De novo.
613 //
614 if (ordered_export) {
615 cerr << "Error: Options --ordered-export/--smooth/--bootstrap"
616 << " are for reference-aligned data only.\n";
617 throw exception();
618 }
619 loc->cloc->loc.set("un", this->_unordered_bp, strand_plus);
620 this->_unordered_bp += loc->cloc->len;
621 this->_loci.push_back(loc);
622 this->_loc_filter.keep_locus(loc);
623 if (this->_loci.size() == this->_batch_size)
624 break;
625 }
626 }
627
628 //
629 // Record the post-filtering distribution of catalog loci for this batch.
630 //
631 this->_dists.accumulate(this->_loci);
632
633 //
634 // Sort the catalog loci, if possible.
635 //
636 if (loci_ordered)
637 sort(this->_loci.begin(), this->_loci.end(),
638 [] (const LocBin *a, const LocBin *b) -> bool {
639 return a->cloc->loc.bp < b->cloc->loc.bp;
640 });
641
642 return this->_loci.size();
643 }
644
645 int
init_external_loci(string in_path,string pmap_path)646 BatchLocusProcessor::init_external_loci(string in_path, string pmap_path)
647 {
648 //
649 // Open the VCF file
650 //
651 cout << "Opening the VCF file...\n";
652 this->_vcf_parser.open(in_path);
653
654 if (this->_vcf_parser.header().samples().empty()) {
655 cerr << "Error: No samples in VCF file '" << in_path << "'.\n";
656 throw exception();
657 }
658
659 // Reconsider the MetaPopInfo in light of the VCF header.
660 if (pmap_path.empty()) {
661 cout << "No population map specified, creating one from the VCF header...\n";
662 this->_mpopi->init_names(this->_vcf_parser.header().samples());
663
664 } else {
665 // Intersect the samples present in the population map and the VCF.
666 size_t n_samples_before = this->_mpopi->n_samples();
667
668 this->_mpopi->intersect_with(this->_vcf_parser.header().samples());
669
670 size_t n_rm_samples = n_samples_before - this->_mpopi->n_samples();
671 if (n_rm_samples > 0) {
672 cerr << "Warning: Of the samples listed in the population map, "
673 << n_rm_samples << " could not be found in the VCF :";
674 if (this->_mpopi->samples().empty()) {
675 cerr << "Error: No more samples.\n";
676 throw exception();
677 }
678 }
679 }
680
681 // Create arbitrary sample IDs.
682 for (size_t i = 0; i < this->_mpopi->n_samples(); ++i)
683 this->_mpopi->set_sample_id(i, i+1); //id=i+1
684
685 //
686 // Initialize the locus filter after we have constructed the population map.
687 //
688 this->_loc_filter.init(this->pop_info());
689
690 //
691 // Create the {sample_vcf_index : sample_popmap_index} table.
692 //
693 const vector<string>& vcf_samples = this->_vcf_parser.header().samples();
694 this->_samples_vcf_to_mpopi.assign(vcf_samples.size(), SIZE_MAX);
695 for (size_t vcf_i=0; vcf_i<vcf_samples.size(); ++vcf_i) {
696 size_t mpopi_i = this->_mpopi->get_sample_index(vcf_samples[vcf_i], false);
697 if (mpopi_i != SIZE_MAX)
698 this->_samples_vcf_to_mpopi[vcf_i] = mpopi_i;
699 }
700
701 this->_total_ext_vcf = 0;
702
703 return 0;
704 }
705
706 size_t
next_batch_external_loci(ostream & log_fh)707 BatchLocusProcessor::next_batch_external_loci(ostream &log_fh)
708 {
709 //
710 // VCF mode
711 //
712 this->batch_clear();
713 loci_ordered = true;
714
715 //
716 // Check if we queued a LocBin object from the last round of reading.
717 //
718 if (this->_next_loc != NULL) {
719 this->_loci.push_back(this->_next_loc);
720 this->_next_loc = NULL;
721 this->_loc_filter.locus_seen();
722 this->_loc_filter.keep_locus(this->_loci.back());
723 }
724
725 int cloc_id = (this->_loci.empty() ? 1 : this->_loci.back()->cloc->id + 1);
726 VcfRecord rec;
727 while (this->_vcf_parser.next_record(rec)) {
728 this->_loc_filter.locus_seen();
729 this->_total_ext_vcf++;
730
731 // Check for a SNP.
732 if (not rec.is_snp()) {
733 this->_skipped_notsnp.push_back(this->_vcf_parser.line_number());
734 continue;
735 }
736
737 // Check for a filtered-out SNP
738 if (strncmp(rec.filters(), ".", 2) != 0 && strncmp(rec.filters(), "PASS", 5) != 0) {
739 this->_skipped_filter.push_back(this->_vcf_parser.line_number());
740 continue;
741 }
742
743 //
744 // Create and populate a new catalog locus.
745 //
746 LocBin* loc = new LocBin(this->_mpopi->n_samples());
747 loc->cloc = new CSLocus();
748 loc->d = new Datum *[this->_mpopi->n_samples()];
749 for (size_t i = 0; i < this->_mpopi->n_samples(); i++)
750 loc->d[i] = NULL;
751 if (!PopMap<CSLocus>::populate_external(
752 loc->cloc, loc->d,
753 cloc_id++, rec,
754 this->_vcf_parser.header(), this->_mpopi, this->_samples_vcf_to_mpopi)
755 ) {
756 // Bad record; a warning has been printed.
757 delete loc;
758 continue;
759 }
760 assert(!loc->cloc->loc.empty());
761 assert(loc->cloc->len == 1);
762 assert(loc->cloc->snps.size() == 1);
763
764 //
765 // Apply filters.
766 //
767 this->_dists.accumulate_pre_filtering(loc->sample_cnt, loc->cloc);
768 if (this->_loc_filter.apply_filters_external(*loc, log_fh, *this->_mpopi)) {
769 delete loc;
770 continue;
771 }
772 assert(loc->s != NULL);
773
774 //
775 // Tabulate haplotypes present and in what combinations.
776 //
777 tabulate_locus_haplotypes(loc->cloc, loc->d, this->_mpopi->n_samples());
778
779 //
780 // Detect the end of batch.
781 //
782 if (this->_chr.empty()) {
783 this->_chr = loc->cloc->loc.chr();
784 } else if (this->_chr.compare(loc->cloc->loc.chr()) != 0) {
785 this->_next_loc = loc;
786 this->_loc_filter.locus_unsee();
787 break;
788 }
789 this->_loci.push_back(loc);
790 this->_loc_filter.keep_locus(loc);
791 }
792
793 //
794 // Record the post-filtering distribution of catalog loci for this batch.
795 //
796 this->_dists.accumulate(this->_loci);
797
798 //
799 // Sort the catalog loci, if possible.
800 //
801 sort(this->_loci.begin(), this->_loci.end(),
802 [] (const LocBin *a, const LocBin *b) -> bool {
803 return a->cloc->loc.bp < b->cloc->loc.bp;
804 });
805
806 return this->_loci.size();
807 }
808
809 int
summarize(ostream & log_fh)810 BatchLocusProcessor::summarize(ostream &log_fh)
811 {
812 if (this->_input_mode == InputMode::vcf) {
813 log_fh << "Found " << this->_total_ext_vcf << " SNP records in file '" << in_vcf_path
814 << "'. (Skipped " << this->_skipped_filter.size() << " already filtered-out SNPs and "
815 << this->_skipped_notsnp.size() << " non-SNP records ; more with --verbose.)\n";
816 if (verbose && not this->_skipped_notsnp.empty()) {
817 log_fh << "The following VCF record lines were determined not to be SNPs and skipped :";
818 for (vector<size_t>::const_iterator l = this->_skipped_notsnp.begin(); l != this->_skipped_notsnp.end(); ++l)
819 log_fh << " " << *l;
820 log_fh << "\n";
821 }
822 } else {
823 }
824
825 return 0;
826 }
827
828 int
hapstats(ostream & log_fh)829 BatchLocusProcessor::hapstats(ostream &log_fh)
830 {
831 #pragma omp parallel
832 {
833 #pragma omp for schedule(dynamic, 1)
834 for (uint i = 0; i < this->_loci.size(); i++) {
835 LocBin *loc = this->_loci[i];
836
837 loc->s->calc_hapstats(loc->cloc, (const Datum **) loc->d, *this->_mpopi);
838 }
839 }
840
841 if (calc_hwp) {
842 const LocStat *l;
843 const vector<Pop> &pops = this->_mpopi->pops();
844
845 for (uint i = 0; i < this->_loci.size(); i++) {
846 for (uint j = 0; j < pops.size(); j++) {
847 l = this->_loci[i]->s->hapstats_per_pop(j);
848
849 if (l == NULL)
850 continue;
851
852 this->_sig_hwe_dev[j] += l->stat[2] < p_value_cutoff ? 1 : 0;
853 }
854 }
855 }
856
857 return 0;
858 }
859
860 void
report_locus_overlap(size_t & n_sites,size_t & n_multiple_sites,ofstream * log_fh)861 BatchLocusProcessor::report_locus_overlap(size_t& n_sites, size_t& n_multiple_sites, ofstream *log_fh)
862 {
863 n_sites = 0;
864 n_multiple_sites = 0;
865
866 //
867 // Create a vector of maps, to record for each population, which sites overlap between different loci.
868 //
869 map<uint, set<uint>> final_map;
870 //vector<map<uint, set<uint>>> sites (this->_mpopi->pops().size());
871 for (uint pop = 0; pop < this->_mpopi->pops().size(); pop++) {
872 for (const LocBin* locbin : this->_loci) {
873 const CSLocus* loc = locbin->cloc;
874 const LocSum* lsum = locbin->s->per_pop(pop);
875 for (uint k = 0; k < loc->len; k++)
876 if (lsum->nucs[k].num_indv > 0) {
877 final_map[lsum->nucs[k].bp].insert(loc->id);
878 //sites[pop][lsum->nucs[k].bp].insert(loc->id);
879 }
880 }
881 }
882
883 n_sites = final_map.size();
884
885 for (auto bp : final_map) {
886 if (bp.second.size() > 1) {
887 n_multiple_sites++;
888 if (log_fh != NULL) {
889 *log_fh << "multilocus_pos"
890 << '\t' << this->_loci[0]->cloc->loc.chr() << ':' << bp.first + 1
891 << "\tloci=";
892 join(bp.second, ',', *log_fh);
893 *log_fh << '\n';
894 }
895 }
896 }
897 }
898
899 void
erase_snp(CSLocus * cloc,Datum ** d,size_t n_samples,size_t snp_index)900 LocusFilter::erase_snp(CSLocus *cloc, Datum **d, size_t n_samples, size_t snp_index)
901 {
902 //
903 // N.B. For this to work as expected we must be iterating over cloc->snps
904 // in reverse, e.g. for(size_t i=cloc->snps.size(); i!=0;) {--i; ...}
905 // because we're making vector::erase() calls.
906 //
907 // Prune out the given SNP. We need to update:
908 // * `Locus::snps`
909 // * `Locus::alleles`
910 // * `Datum::model`
911 // * `Datum::snpdata`
912 // * `Datum::obshap`
913 //
914
915 //
916 // Update the Datums.
917 //
918 uint col = cloc->snps[snp_index]->col;
919 for (size_t s=0; s<n_samples; ++s) {
920 if (d[s] == NULL)
921 continue;
922
923 assert(d[s]->model);
924 assert(!d[s]->obshap.empty() && strlen(d[s]->obshap[0]) == cloc->snps.size());
925 assert(d[s]->snpdata.size() == cloc->snps.size());
926
927 //
928 // Correct the model calls.
929 //
930 char& m = d[s]->model[col];
931 assert(m == 'O' || m == 'E' || m == 'U');
932 if (m == 'E' || (m == 'O' && d[s]->obshap[0][snp_index] != cloc->con[col]))
933 m = 'U';
934 //
935 // Erase the nucleotide in the haplotypes.
936 //
937 for (char* hap : d[s]->obshap) {
938 hap += snp_index;
939 while(*hap != '\0') {
940 *hap = *(hap+1);
941 ++hap;
942 }
943 }
944 //
945 // Splice the depths.
946 //
947 d[s]->snpdata.erase(d[s]->snpdata.begin() + snp_index);
948 }
949
950 //
951 // Update the CSLocus.
952 //
953 delete cloc->snps[snp_index];
954 cloc->snps.erase(cloc->snps.begin() + snp_index);
955 // Splice the haplotypes. (Why does this have to be a map?!)
956 vector<pair<string,int>> alleles (cloc->alleles.begin(), cloc->alleles.end());
957 cloc->alleles.clear();
958 for (pair<string,int>& allele : alleles)
959 allele.first.erase(snp_index, 1);
960 cloc->alleles.insert(std::make_move_iterator(alleles.begin()), std::make_move_iterator(alleles.end()));
961 }
962
963 void
batch_clear()964 LocusFilter::batch_clear()
965 {
966 this->_batch_total_loci = 0;
967 this->_batch_filtered_loci = 0;
968 this->_batch_seen_loci = 0;
969 }
970
971 void
locus_seen()972 LocusFilter::locus_seen()
973 {
974 this->_seen_loci++;
975 this->_batch_seen_loci++;
976 }
977
978 void
locus_unsee()979 LocusFilter::locus_unsee()
980 {
981 this->_seen_loci--;
982 this->_batch_seen_loci--;
983 }
984
985 void
keep_locus(LocBin * loc)986 LocusFilter::keep_locus(LocBin *loc)
987 {
988 this->_total_loci++;
989 this->_batch_total_loci++;
990
991 //
992 // Count up the number of sites and variable sites.
993 //
994 const LocTally *t = loc->s->meta_pop();
995
996 for (uint i = 0; i < loc->cloc->len; i++) {
997 if (t->nucs[i].fixed == false)
998 this->_variant_sites++;
999 this->_total_sites++;
1000 }
1001 }
1002
1003 bool
whitelist_filter(size_t locus_id)1004 LocusFilter::whitelist_filter(size_t locus_id)
1005 {
1006 if (this->_whitelist.empty() || this->_whitelist.count(locus_id)) {
1007 return false;
1008 } else {
1009 this->_filtered_loci++;
1010 this->_batch_filtered_loci++;
1011 return true;
1012 }
1013 }
1014
1015 bool
blacklist_filter(size_t locus_id)1016 LocusFilter::blacklist_filter(size_t locus_id)
1017 {
1018 if (this->_blacklist.count(locus_id)) {
1019 this->_filtered_loci++;
1020 this->_batch_filtered_loci++;
1021 return true;
1022 } else {
1023 return false;
1024 }
1025 }
1026
1027 void
whitelist_snp_filter(LocBin & loc) const1028 LocusFilter::whitelist_snp_filter(LocBin& loc) const
1029 {
1030 if (this->_whitelist.empty())
1031 return;
1032 CSLocus* cloc = loc.cloc;
1033 assert(this->_whitelist.count(cloc->id));
1034 const set<int>& snp_wl = this->_whitelist.at(cloc->id);
1035 if (snp_wl.empty())
1036 // Accept any SNP.
1037 return;
1038 for(size_t i=cloc->snps.size(); i!=0;) {
1039 --i;
1040 if (!snp_wl.count(cloc->snps[i]->col))
1041 erase_snp(cloc, loc.d, loc.sample_cnt, i);
1042 }
1043 }
1044
1045 bool
apply_filters_stacks(LocBin & loc,ostream & log_fh,const MetaPopInfo & mpopi)1046 LocusFilter::apply_filters_stacks(LocBin& loc, ostream& log_fh, const MetaPopInfo& mpopi)
1047 {
1048 //
1049 // Apply the -r/-p thresholds at the locus level.
1050 //
1051 if (this->filter(&mpopi, loc.d, loc.cloc))
1052 return true;
1053
1054 //
1055 // Filter genotypes for depth.
1056 //
1057 this->gt_depth_filter(loc.d, loc.cloc);
1058
1059 //
1060 // Identify individual SNPs that are below the -r threshold or the minor allele
1061 // frequency threshold (-a). In these cases we will remove the SNP, but keep the locus.
1062 //
1063 loc.s = new LocPopSum(strlen(loc.cloc->con), mpopi);
1064 this->filter_snps(loc, mpopi, log_fh);
1065
1066 //
1067 // If write_single_snp, write_random_snp or filter_haplotype_wise has been specified,
1068 // mark sites to be pruned using the whitelist.
1069 //
1070 assert(write_single_snp + write_random_snp + filter_haplotype_wise <= 1);
1071 if (write_single_snp)
1072 this->keep_single_snp(loc.cloc, loc.d, mpopi.n_samples(), loc.s->meta_pop());
1073 else if (write_random_snp)
1074 this->keep_random_snp(loc.cloc, loc.d, mpopi.n_samples(), loc.s->meta_pop());
1075 else if (filter_haplotype_wise)
1076 this->filter_haps(loc, mpopi, log_fh);
1077
1078 //
1079 // Regenerate summary statistics after pruning SNPs.
1080 //
1081 loc.s->sum_pops(loc.cloc, loc.d, mpopi, verbose, cout);
1082 loc.s->tally_metapop(loc.cloc);
1083 return false;
1084 }
1085
1086 bool
apply_filters_external(LocBin & loc,ostream & log_fh,const MetaPopInfo & mpopi)1087 LocusFilter::apply_filters_external(LocBin& loc, ostream& log_fh, const MetaPopInfo& mpopi)
1088 {
1089 //
1090 // Apply the -r/-p thresholds at the locus level.
1091 //
1092 if (this->filter(&mpopi, loc.d, loc.cloc))
1093 return true;
1094
1095 //
1096 // Identify individual SNPs that are below the -r threshold or the minor allele
1097 // frequency threshold (-a). In these cases we will remove the SNP, but keep the locus.
1098 //
1099 loc.s = new LocPopSum(strlen(loc.cloc->con), mpopi);
1100 this->filter_snps(loc, mpopi, log_fh);
1101
1102 loc.s->sum_pops(loc.cloc, loc.d, mpopi, verbose, cout);
1103 loc.s->tally_metapop(loc.cloc);
1104 return false;
1105 }
1106
1107 bool
filter(const MetaPopInfo * mpopi,Datum ** d,CSLocus * cloc)1108 LocusFilter::filter(const MetaPopInfo *mpopi, Datum **d, CSLocus *cloc)
1109 {
1110 // Filter out populations that don't have enough samples.
1111 size_t n_pops_present = 0;
1112 for (const Pop& pop : mpopi->pops()) {
1113 size_t n_samples_present = 0;
1114 for (size_t s=pop.first_sample; s<=pop.last_sample; ++s)
1115 if (d[s] != NULL)
1116 ++n_samples_present;
1117 if ((double) n_samples_present / pop.n_samples() >= min_samples_per_pop) {
1118 ++n_pops_present;
1119 } else {
1120 // Remove all samples of that population.
1121 for (size_t s=pop.first_sample; s<=pop.last_sample; ++s) {
1122 if (d[s] != NULL) {
1123 delete d[s];
1124 d[s] = NULL;
1125 cloc->cnt--;
1126 }
1127 }
1128 }
1129 }
1130 // Check the number of (remaining) samples.
1131 size_t n_samples_present = 0;
1132 for (size_t s=0; s<mpopi->n_samples(); ++s)
1133 if (d[s] != NULL)
1134 ++n_samples_present;
1135 // Determine if the locus is to be kept.
1136 if (n_pops_present < size_t(min_populations)
1137 || (double) n_samples_present / mpopi->n_samples() < min_samples_overall)
1138 {
1139 this->_filtered_loci++;
1140 this->_batch_filtered_loci++;
1141 return true;
1142 }
1143 return false;
1144 }
1145
1146 void
gt_depth_filter(Datum ** data,const CSLocus * cloc)1147 LocusFilter::gt_depth_filter(Datum** data, const CSLocus* cloc)
1148 {
1149 if (min_gt_depth == 0)
1150 return;
1151 else if (cloc->snps.empty())
1152 return;
1153 for (size_t spl=0; spl<this->_sample_cnt; ++spl) {
1154 Datum* d = data[spl];
1155 assert(d->obshap.size() == 2);
1156 assert(strlen(d->obshap[0]) == cloc->snps.size()
1157 && strlen(d->obshap[1]) == cloc->snps.size());
1158 for (size_t snp=0; snp<cloc->snps.size(); ++snp) {
1159 size_t col = cloc->snps[snp]->col;
1160 if (d->model[col] == 'U')
1161 continue;
1162 // Check this genotype's depth.
1163 if (d->snpdata[snp].tot_depth < min_gt_depth) {
1164 d->model[col] = 'U';
1165 for(size_t i=0; i<2; ++i)
1166 d->obshap[i][snp] = 'N';
1167 }
1168 }
1169 }
1170 }
1171
1172 void
init(MetaPopInfo * mpopi)1173 LocusFilter::init(MetaPopInfo *mpopi)
1174 {
1175 this->_pop_cnt = mpopi->pops().size();
1176 this->_sample_cnt = mpopi->n_samples();
1177
1178 assert(this->_pop_cnt > 0);
1179 assert(this->_sample_cnt > 0);
1180
1181 if (this->_pop_order != NULL)
1182 delete [] this->_pop_order;
1183 if (this->_samples != NULL)
1184 delete [] this->_samples;
1185 if (this->_pop_tot != NULL)
1186 delete [] this->_pop_tot;
1187
1188 this->_pop_order = new size_t [this->_pop_cnt];
1189 this->_samples = new size_t [this->_sample_cnt];
1190 this->_pop_tot = new size_t [this->_pop_cnt];
1191
1192 this->_filtered_loci = 0;
1193 this->_total_loci = 0;
1194 this->_filtered_sites = 0;
1195 this->_total_sites = 0;
1196
1197 size_t pop_sthg = 0;
1198
1199 for (size_t i_pop = 0; i_pop < mpopi->pops().size(); ++i_pop) {
1200 const Pop& pop = mpopi->pops()[i_pop];
1201 this->_pop_tot[pop_sthg] = 0;
1202
1203 for (uint i = pop.first_sample; i <= pop.last_sample; i++) {
1204 this->_samples[i] = pop_sthg;
1205 this->_pop_tot[pop_sthg]++;
1206 }
1207 this->_pop_order[pop_sthg] = i_pop;
1208 pop_sthg++;
1209 }
1210 }
1211
1212 void
keep_single_snp(CSLocus * cloc,Datum ** d,size_t n_samples,const LocTally * t) const1213 LocusFilter::keep_single_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const
1214 {
1215 //
1216 // Check that we have at least one variable site within this population for this locus.
1217 //
1218 size_t n_actual_snps = 0;
1219 for (const SNP* snp : cloc->snps)
1220 if (!t->nucs[snp->col].fixed)
1221 ++n_actual_snps;
1222 if (n_actual_snps == 0)
1223 return;
1224
1225 //
1226 // Find the first SNP that is not fixed in this subpopulation.
1227 //
1228 size_t kept_snp = 0;
1229 while (kept_snp != cloc->snps.size()) {
1230 if (t->nucs[cloc->snps[kept_snp]->col].fixed == false)
1231 break;
1232 kept_snp++;
1233 }
1234
1235 //
1236 // Remove all the SNPs except for the one marked previously.
1237 //
1238 for (size_t snp=cloc->snps.size(); snp!=0;) {
1239 --snp;
1240 if (snp != kept_snp)
1241 erase_snp(cloc, d, n_samples, snp);
1242 }
1243 }
1244
1245 void
keep_random_snp(CSLocus * cloc,Datum ** d,size_t n_samples,const LocTally * t) const1246 LocusFilter::keep_random_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const
1247 {
1248 //
1249 // Check that we have at least one variable site within this population for this locus.
1250 //
1251 size_t n_actual_snps = 0;
1252 for (const SNP* snp : cloc->snps)
1253 if (!t->nucs[snp->col].fixed)
1254 ++n_actual_snps;
1255 if (n_actual_snps == 0)
1256 return;
1257
1258 //
1259 // Identify a random SNP that isn't fixed in this subset of populations.
1260 //
1261 size_t kept_snp_i;
1262 do {
1263 kept_snp_i = rand() % cloc->snps.size();
1264 } while (t->nucs[cloc->snps[kept_snp_i]->col].fixed);
1265
1266 //
1267 // Remove all the SNPs except for the one marked previously.
1268 //
1269 for (auto snp_i=cloc->snps.size(); snp_i!=0; ) {
1270 --snp_i;
1271 if (snp_i != kept_snp_i)
1272 erase_snp(cloc, d, n_samples, snp_i);
1273 }
1274 }
1275
1276 void
filter_snps(LocBin & loc,const MetaPopInfo & mpopi,ostream & log_fh)1277 LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
1278 {
1279 assert(loc.s != NULL);
1280 CSLocus* cloc = loc.cloc;
1281 Datum** d = loc.d;
1282 LocPopSum* s = loc.s;
1283
1284 //
1285 // If this locus is fixed, ignore it.
1286 //
1287 if (cloc->snps.size() == 0)
1288 return;
1289
1290 loc.s->sum_pops(loc.cloc, loc.d, mpopi, verbose, cout);
1291 loc.s->tally_metapop(loc.cloc);
1292 const LocTally *t = s->meta_pop();
1293
1294 for (uint snp_index = cloc->snps.size(); snp_index > 0;) { // Must be reverse because we `erase()` things.
1295 --snp_index;
1296 uint col = cloc->snps[snp_index]->col;
1297 bool sample_prune = false;
1298 bool overall_sample_prune = false;
1299 bool maf_prune = false;
1300 bool het_prune = false;
1301
1302 //
1303 // If the site is fixed, ignore it.
1304 //
1305 NucTally& nuct = t->nucs[col];
1306 if (t->nucs[col].fixed == true)
1307 continue;
1308
1309 size_t n_pruned_pops = 0;
1310 size_t n_samples_left = 0;
1311 for (size_t p = 0; p < s->pop_cnt(); ++p) {
1312 const LocSum* sum = s->per_pop(p);
1313 if (sum->nucs[col].incompatible_site) {
1314 DOES_NOT_HAPPEN;
1315 }
1316 if ((double) sum->nucs[col].num_indv / this->_pop_tot[p] >= min_samples_per_pop) {
1317 n_samples_left += sum->nucs[col].num_indv;
1318 } else {
1319 ++n_pruned_pops;
1320 const Pop& pop = mpopi.pops()[p];
1321 for (uint k = pop.first_sample; k <= pop.last_sample; k++) {
1322 if (d[k] == NULL || col >= (uint) d[k]->len)
1323 continue;
1324
1325 if (d[k]->model != NULL)
1326 d[k]->model[col] = 'U';
1327 for(size_t j=0; j < 2; ++j)
1328 d[k]->obshap[j][snp_index] = 'N';
1329 }
1330 }
1331 }
1332 if (mpopi.pops().size() - n_pruned_pops < (uint) min_populations)
1333 sample_prune = true;
1334 else if ((double) n_samples_left / mpopi.n_samples() < min_samples_overall)
1335 overall_sample_prune = true;
1336
1337 if (t->nucs[col].allele_cnt > 1) {
1338 //
1339 // Test for minor allele frequency.
1340 //
1341 if ((1 - nuct.p_freq) < minor_allele_freq
1342 || long(std::round((1 - nuct.p_freq) * 2.0 * nuct.num_indv)) < minor_allele_cnt)
1343 maf_prune = true;
1344 //
1345 // Test for observed heterozygosity.
1346 //
1347 if (t->nucs[col].obs_het > max_obs_het)
1348 het_prune = true;
1349 }
1350
1351 if (maf_prune || het_prune || sample_prune || overall_sample_prune) {
1352 this->_filtered_sites++;
1353 if (verbose) {
1354 log_fh << "pruned_polymorphic_site\t"
1355 << cloc->id << "\t"
1356 << cloc->loc.chr() << "\t"
1357 << cloc->sort_bp(col) +1 << "\t"
1358 << col << "\t";
1359 if (sample_prune)
1360 log_fh << "min_samples_per_pop\n";
1361 else if (overall_sample_prune)
1362 log_fh << "min_samples_overall\n";
1363 else if (maf_prune)
1364 log_fh << "maf_limit\n";
1365 else if (het_prune)
1366 log_fh << "obshet_limit\n";
1367 else
1368 DOES_NOT_HAPPEN;
1369 }
1370 this->erase_snp(cloc, d, mpopi.n_samples(), snp_index);
1371 }
1372 }
1373 }
1374
1375 void
filter_haps(LocBin & loc,const MetaPopInfo & mpopi,ostream & log_fh)1376 LocusFilter::filter_haps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
1377 {
1378 CSLocus* cloc = loc.cloc;
1379 Datum** d = loc.d;
1380
1381 // Tally SNP abundancies.
1382 vector<pair<size_t,size_t>> snps_n_samples;
1383 for (size_t i=0; i<cloc->snps.size(); ++i) {
1384 snps_n_samples.push_back({0, i});
1385 uint col = cloc->snps[i]->col;
1386 for (size_t s=0; s<mpopi.n_samples(); ++s)
1387 if (d[s] != NULL
1388 && size_t(d[s]->len) > col && (d[s]->model[col] == 'O' || d[s]->model[col] == 'E'))
1389 ++snps_n_samples.back().first;
1390 }
1391 std::sort(snps_n_samples.rbegin(), snps_n_samples.rend());
1392
1393 // Prune SNPs until enough samples have fully resolved haplotypes.
1394 // (N.B. This is applied after snp-wise -r/-R as any individual SNP failing
1395 // these filters would cause them to fail at the haplotype level as well anyway.)
1396 while (!cloc->snps.empty()) {
1397 // Tally haplotypes.
1398 size_t n_pops_present_hapwise = 0;
1399 size_t n_samples_w_haps = 0;
1400 for (const Pop& pop : mpopi.pops()) {
1401 size_t n_pop_samples = 0;
1402 for (size_t s=pop.first_sample; s<=pop.last_sample; ++s) {
1403 if (d[s] == NULL)
1404 continue;
1405 assert(d[s]->obshap.size() == 2);
1406 if (strchr(d[s]->obshap[0], 'N') == NULL)
1407 ++n_pop_samples;
1408 }
1409 if ((double) n_pop_samples / pop.n_samples() >= min_samples_per_pop) {
1410 ++n_pops_present_hapwise;
1411 n_samples_w_haps += n_pop_samples;
1412 }
1413 }
1414 // Check the number of haplotypes.
1415 if (n_pops_present_hapwise >= size_t(min_populations)
1416 && (double) n_samples_w_haps / mpopi.n_samples() >= min_samples_overall)
1417 // Filters are satisfied.
1418 break;
1419 // Prune the SNP that is present in the fewest samples.
1420 ++this->_filtered_sites;
1421 assert(!snps_n_samples.empty());
1422 size_t snp_i = snps_n_samples.back().second;
1423 this->erase_snp(cloc, d, mpopi.n_samples(), snp_i);
1424 snps_n_samples.pop_back();
1425 assert(snps_n_samples.size() == cloc->snps.size());
1426 for (pair<size_t,size_t>& snp : snps_n_samples)
1427 if (snp.second > snp_i)
1428 --snp.second;
1429 }
1430 }
1431
1432 int
accumulate_pre_filtering(const size_t sample_cnt,const CSLocus * loc)1433 CatalogDists::accumulate_pre_filtering(const size_t sample_cnt, const CSLocus *loc)
1434 {
1435 size_t missing;
1436
1437 if (this->_pre_valid.count(loc->cnt) == 0)
1438 this->_pre_valid[loc->cnt] = 1;
1439 else
1440 this->_pre_valid[loc->cnt]++;
1441
1442 missing = sample_cnt - loc->cnt;
1443
1444 if (this->_pre_absent.count(missing) == 0)
1445 this->_pre_absent[missing] = 1;
1446 else
1447 this->_pre_absent[missing]++;
1448
1449 if (this->_pre_snps_per_loc.count(loc->snps.size()) == 0)
1450 this->_pre_snps_per_loc[loc->snps.size()] = 1;
1451 else
1452 this->_pre_snps_per_loc[loc->snps.size()]++;
1453
1454 return 0;
1455 }
1456
1457 int
accumulate(const vector<LocBin * > & loci)1458 CatalogDists::accumulate(const vector<LocBin *> &loci)
1459 {
1460 const CSLocus *loc;
1461 const LocTally *t;
1462 size_t missing;
1463
1464 for (uint i = 0; i < loci.size(); i++) {
1465 loc = loci[i]->cloc;
1466
1467 if (this->_post_valid.count(loc->cnt) == 0)
1468 this->_post_valid[loc->cnt] = 1;
1469 else
1470 this->_post_valid[loc->cnt]++;
1471
1472 missing = loci[i]->sample_cnt - loc->cnt;
1473
1474 if (this->_post_absent.count(missing) == 0)
1475 this->_post_absent[missing] = 1;
1476 else
1477 this->_post_absent[missing]++;
1478
1479 //
1480 // Don't count SNPs that are fixed in the metapopulation.
1481 //
1482 t = loci[i]->s->meta_pop();
1483 size_t n_actual_snps = 0;
1484 for (const SNP* snp : loc->snps)
1485 if (!t->nucs[snp->col].fixed)
1486 ++n_actual_snps;
1487
1488 if (this->_post_snps_per_loc.count(n_actual_snps) == 0)
1489 this->_post_snps_per_loc[n_actual_snps] = 1;
1490 else
1491 this->_post_snps_per_loc[n_actual_snps]++;
1492 }
1493
1494 return 0;
1495 }
1496
1497 int
write_results(ostream & log_fh)1498 CatalogDists::write_results(ostream &log_fh)
1499 {
1500 map<size_t, size_t>::iterator cnt_it;
1501 string section;
1502 auto begin_section = [&](const string& s){
1503 section = s;
1504 log_fh << "\n" << "BEGIN " << section << "\n";
1505 };
1506 auto end_section = [&](){
1507 log_fh << "END " << section << "\n";
1508 };
1509
1510 begin_section("samples_per_loc_prefilters");
1511 log_fh << "# Distribution of valid samples matched to a catalog locus prior to filtering.\n"
1512 << "n_samples\tn_loci\n";
1513 for (cnt_it = this->_pre_valid.begin(); cnt_it != this->_pre_valid.end(); cnt_it++)
1514 log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1515 end_section();
1516
1517 begin_section("missing_samples_per_loc_prefilters");
1518 log_fh << "# Distribution of missing samples for each catalog locus prior to filtering.\n"
1519 << "# Absent samples at locus\tCount\n";
1520 for (cnt_it = this->_pre_absent.begin(); cnt_it != this->_pre_absent.end(); cnt_it++)
1521 log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1522 end_section();
1523
1524 begin_section("snps_per_loc_prefilters");
1525 log_fh << "# Distribution of the number of SNPs per catalog locus prior to filtering.\n"
1526 << "n_snps\tn_loci\n";
1527 for (cnt_it = this->_pre_snps_per_loc.begin(); cnt_it != this->_pre_snps_per_loc.end(); cnt_it++)
1528 log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1529 end_section();
1530
1531 begin_section("samples_per_loc_postfilters");
1532 log_fh << "# Distribution of valid samples matched to a catalog locus after filtering.\n"
1533 << "n_samples\tn_loci\n";
1534 for (cnt_it = this->_post_valid.begin(); cnt_it != this->_post_valid.end(); cnt_it++)
1535 log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1536 end_section();
1537
1538 begin_section("missing_samples_per_loc_postfilters");
1539 log_fh << "# Distribution of missing samples for each catalog locus after filtering.\n"
1540 << "# Absent samples at locus\tCount\n";
1541 for (cnt_it = this->_post_absent.begin(); cnt_it != this->_post_absent.end(); cnt_it++)
1542 log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1543 end_section();
1544
1545 begin_section("snps_per_loc_postfilters");
1546 log_fh << "# Distribution of the number of SNPs per catalog locus (after filtering).\n"
1547 << "n_snps\tn_loci\n";
1548 for (cnt_it = this->_post_snps_per_loc.begin(); cnt_it != this->_post_snps_per_loc.end(); cnt_it++)
1549 log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1550 end_section();
1551
1552 return 0;
1553 }
1554
1555 int
tabulate_locus_haplotypes(CSLocus * cloc,Datum ** d,int sample_cnt)1556 tabulate_locus_haplotypes(CSLocus *cloc, Datum **d, int sample_cnt)
1557 {
1558 double cnt = 0.0;
1559
1560 for (int i = 0; i < sample_cnt; i++) {
1561 if (d[i] == NULL)
1562 continue;
1563
1564 if (d[i]->obshap.size() > 1)
1565 cloc->marker = "heterozygous";
1566
1567 cnt++;
1568 }
1569
1570 if (cloc->marker.length() > 0) {
1571 create_genotype_map(cloc, d, sample_cnt);
1572 call_population_genotypes(cloc, d, sample_cnt);
1573 }
1574
1575 return 0;
1576 }
1577
1578 /*int
1579 tabulate_haplotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
1580 {
1581 map<int, CSLocus *>::iterator it;
1582 vector<char *>::iterator hit;
1583 Datum **d;
1584 CSLocus *loc;
1585 double mean, cnt;
1586
1587 for (it = catalog.begin(); it != catalog.end(); it++) {
1588 loc = it->second;
1589 d = pmap->locus(loc->id);
1590
1591 mean = 0.0;
1592 cnt = 0.0;
1593
1594 for (int i = 0; i < pmap->sample_cnt(); i++) {
1595 if (d[i] == NULL)
1596 continue;
1597
1598 if (d[i]->obshap.size() > 1)
1599 loc->marker = "heterozygous";
1600
1601 mean += d[i]->lnl;
1602 cnt++;
1603 }
1604
1605 if (loc->marker.length() > 0) {
1606 create_genotype_map(loc, d, pmap->sample_cnt());
1607 call_population_genotypes(loc, d, pmap->sample_cnt());
1608 }
1609
1610 loc->lnl = mean / cnt;
1611 }
1612
1613 return 0;
1614 }*/
1615
1616 /*
1617 int
1618 merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
1619 PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
1620 map<int, pair<merget, int> > &merge_map,
1621 ofstream &log_fh)
1622 {
1623 map<string, vector<CSLocus *> >::iterator it;
1624 CSLocus *cur, *next;
1625 Datum **d_1, **d_2;
1626 double prune_pct;
1627 uint unmergable, tot_loci, tot_samp;
1628 uint success = 0;
1629 uint failure = 0;
1630 uint overlap = 0;
1631 uint simple_merge_cnt = 0;
1632 uint complex_merge_cnt = 0;
1633 uint missing_samps_cnt = 0;
1634 uint phase_fail_cnt = 0;
1635 uint nomapping_cnt = 0;
1636 uint multimapping_cnt = 0;
1637 uint multifails_cnt = 0;
1638
1639 tot_loci = pmap->loci_cnt();
1640
1641 set<int> loci_to_destroy;
1642 map<int, int> missing_samps_dist;
1643
1644 cout << "To merge adjacent loci at least " << merge_prune_lim * 100 << "% of samples must have both adjacent loci;"
1645 << " the remaining " << 100 - (merge_prune_lim * 100) << "% of individuals will be pruned.\n"
1646 << "Attempting to merge adjacent loci that share a cutsite...";
1647
1648 if (verbose)
1649 log_fh << "\n#\n# List of locus pairs that share a cutsite that failed to merge because they could not be phased.\n#\n";
1650
1651 //
1652 // Iterate over each chromosome.
1653 //
1654 for (it = pmap->ordered_loci_nconst().begin(); it != pmap->ordered_loci_nconst().end(); it++) {
1655 //
1656 // Iterate over each ordered locus on this chromosome.
1657 //
1658 next = it->second[0];
1659 for (uint pos = 1; pos < it->second.size(); pos++) {
1660 cur = next;
1661 next = it->second[pos];
1662
1663 //
1664 // Do these two loci overlap?
1665 // +Must occur on opposite strands
1666 // +Must overlap according to the length of the cutsite.
1667 //
1668 if (((cur->loc.strand == strand_minus && next->loc.strand == strand_plus) &&
1669 ((int) (cur->loc.bp - next->loc.bp + 1) == renz_olap[enz])) ||
1670 ((cur->loc.strand == strand_plus && next->loc.strand == strand_minus) &&
1671 ((int) (next->loc.bp - cur->loc.bp + 1) == renz_olap[enz]))) {
1672 overlap++;
1673
1674 d_1 = pmap->locus(cur->id);
1675 d_2 = pmap->locus(next->id);
1676 unmergable = 0;
1677 tot_samp = 0;
1678
1679 //
1680 // Check if all members of the population contain these two loci (or are missing both).
1681 //
1682 for (int i = 0; i < pmap->sample_cnt(); i++) {
1683 if (d_1[i] != NULL || d_2[i] != NULL)
1684 tot_samp++;
1685 if ((d_1[i] != NULL && d_2[i] == NULL) ||
1686 (d_1[i] == NULL && d_2[i] != NULL))
1687 unmergable++;
1688 }
1689
1690 prune_pct = (double) (tot_samp - unmergable) / (double) tot_samp;
1691
1692 //
1693 // If some of the individuals only have one locus and not the other, prune them out.
1694 //
1695 if (prune_pct < 1.0 && prune_pct >= merge_prune_lim) {
1696 for (int i = 0; i < pmap->sample_cnt(); i++)
1697 if (d_1[i] != NULL && d_2[i] == NULL) {
1698 delete d_1[i];
1699 d_1[i] = NULL;
1700 } else if (d_1[i] == NULL && d_2[i] != NULL) {
1701 delete d_2[i];
1702 d_2[i] = NULL;
1703 }
1704 }
1705
1706 //
1707 // If possible, merge the two loci together.
1708 //
1709 if (prune_pct < merge_prune_lim) {
1710 int pct = (int) (prune_pct * 100);
1711 missing_samps_dist[pct]++;
1712 if (verbose) log_fh << "Missing samples, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1713 << pct << "% present (" << 100 - pct << "% missing)\n";
1714 missing_samps_cnt++;
1715 failure++;
1716 continue;
1717 }
1718
1719 phaset res = merge_and_phase_loci(pmap, cur, next, loci_to_destroy, log_fh);
1720 switch(res) {
1721 case multiple_fails:
1722 if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1723 << "multiple failures\n";
1724 multifails_cnt++;
1725 phase_fail_cnt++;
1726 failure++;
1727 break;
1728 case multimapping_fail:
1729 if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1730 << "multimapping in one or more individuals\n";
1731 multimapping_cnt++;
1732 phase_fail_cnt++;
1733 failure++;
1734 break;
1735 case nomapping_fail:
1736 if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1737 << "no mapping in one or more individuals\n";
1738 nomapping_cnt++;
1739 phase_fail_cnt++;
1740 failure++;
1741 break;
1742 case complex_phase:
1743 if (verbose) log_fh << "Phased Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1744 << "a complex phasing operation.\n";
1745 complex_merge_cnt++;
1746 success++;
1747 merge_map[cur->id] = make_pair(merge_sink, next->id);
1748 merge_map[next->id] = make_pair(merge_src, cur->id);
1749 break;
1750 case simple_merge:
1751 if (verbose) log_fh << "Phased Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1752 << "a simple merge operation.\n";
1753 simple_merge_cnt++;
1754 success++;
1755 merge_map[cur->id] = make_pair(merge_sink, next->id);
1756 merge_map[next->id] = make_pair(merge_src, cur->id);
1757 break;
1758 default:
1759 cerr << "Warning: Merge failure.\n";
1760 break;
1761 }
1762 }
1763 }
1764 }
1765
1766 //
1767 // Remove those loci that have been merged from both the popualtion map and catalog.
1768 //
1769 set<int> emptyset;
1770 pmap->prune(loci_to_destroy);
1771 reduce_catalog(catalog, emptyset, loci_to_destroy);
1772
1773 cout << "done.\n"
1774 << "Of " << tot_loci << " loci, "
1775 << overlap << " pairs share a cutsite; "
1776 << success << " pairs were merged; "
1777 << failure << " pairs failed to merge; "
1778 << pmap->loci_cnt() << " remaining loci.\n"
1779 << " Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
1780 << "while " << complex_merge_cnt << " required phasing.\n"
1781 << " Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
1782 << "while " << phase_fail_cnt << " failed to be phased.\n"
1783 << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
1784 << multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
1785
1786 log_fh << "\n#\n# Merging adjacent loci with a shared restriction enzyme cutsite\n#\n"
1787 << "Of " << tot_loci << " loci, "
1788 << overlap << " pairs share a cutsite; "
1789 << success << " pairs were merged; "
1790 << failure << " pairs failed to merge; "
1791 << pmap->loci_cnt() << " remaining loci.\n"
1792 << " Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
1793 << "while " << complex_merge_cnt << " required phasing.\n"
1794 << " Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
1795 << "while " << phase_fail_cnt << " failed to be phased.\n"
1796 << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
1797 << multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
1798 log_fh << "#\n# Distribution of loci with samples missing one of two loci to be merged\n"
1799 << "# Percent samples with both loci present\tNumber of cases\n";
1800 map<int, int>::iterator mit;
1801 for (mit = missing_samps_dist.begin(); mit != missing_samps_dist.end(); mit++)
1802 log_fh << mit->first << "\t" << mit->second << "\n";
1803 log_fh << "\n";
1804
1805 return 0;
1806 }
1807 */
1808 /*phaset
1809 merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
1810 set<int> &loci_to_destroy,
1811 ofstream &log_fh)
1812 {
1813 Datum **d_1 = pmap->locus(cur->id);
1814 Datum **d_2 = pmap->locus(next->id);
1815
1816 set<int> phased_results;
1817 set<string> phased_haplotypes;
1818 string merged_hap;
1819 char *h_1, *h_2;
1820 int merge_type;
1821
1822 if (verbose) log_fh << "Attempting to phase source locus " << cur->id << " with sink locus " << next->id << "\n";
1823
1824 int sample_cnt = 0;
1825 int phased_sample_cnt = 0;
1826 //
1827 // Take a census of the already phased haplotypes. We have phased haplotypes
1828 // if for individual i:
1829 // 1. d_1 has a single haplotype and d_2 has a single haplotype
1830 // 2. d_1 has a single haplotpye and d_2 has multiple haplotypes
1831 // 3. d_1 has multiple haplotpyes and d_2 has a single haplotype
1832 //
1833 // If one or both of the loci have no SNPs, then the haplotype is
1834 // recorded as "consensus." Check that condition before we start merging.
1835 //
1836 if (cur->snps.size() > 0 && next->snps.size() > 0)
1837 merge_type = 0;
1838 else if (cur->snps.size() == 0)
1839 merge_type = 1;
1840 else if (next->snps.size() == 0)
1841 merge_type = 2;
1842 else
1843 merge_type = 3;
1844
1845 for (int i = 0; i < pmap->sample_cnt(); i++) {
1846 if (d_1[i] == NULL || d_2[i] == NULL)
1847 continue;
1848 else if (d_1[i]->obshap.size() > 1 && d_2[i]->obshap.size() > 1)
1849 continue;
1850 else {
1851 for (uint j = 0; j < d_1[i]->obshap.size(); j++) {
1852 for (uint k = 0; k < d_2[i]->obshap.size(); k++) {
1853 switch (merge_type) {
1854 case 0:
1855 merged_hap = string(d_1[i]->obshap[j]) + string(d_2[i]->obshap[k]);
1856 break;
1857 case 1:
1858 merged_hap = string(d_2[i]->obshap[k]);
1859 break;
1860 case 2:
1861 merged_hap = string(d_1[i]->obshap[j]);
1862 break;
1863 case 3:
1864 default:
1865 merged_hap = "consensus";
1866 break;
1867 }
1868 phased_haplotypes.insert(merged_hap);
1869 // cout << "Phasing: '" << d_1[i]->obshap[j] << "' + '" << d_2[i]->obshap[k] << "' => '" << merged_hap << "'\n";
1870 }
1871 }
1872 phased_sample_cnt++;
1873 sample_cnt++;
1874 }
1875 }
1876
1877 //
1878 // Indicate that these two loci had a simple merge, with no phasing necessary.
1879 //
1880 phased_results.insert(simple_merge);
1881
1882 //
1883 // Now we need to check if we can phase the remaining haplotypes.
1884 //
1885 for (int i = 0; i < pmap->sample_cnt(); i++) {
1886 if (d_1[i] == NULL || d_2[i] == NULL)
1887 continue;
1888 else if (d_1[i]->obshap.size() > 1 && d_2[i]->obshap.size() > 1) {
1889 // cout << "Attempting to phase individual " << i << ": " << d_1[i]->id << " / " << d_2[i]->id << "\n";
1890
1891 sample_cnt++;
1892 //
1893 // We should be able to find a sinlge phasing mapping for each haplotype from d_1 to d_2
1894 // that includes all the haplotypes in these two loci.
1895 //
1896 vector<pair<char *, char *> > seen_phased;
1897 uint tot_obshap = d_1[i]->obshap.size() + d_2[i]->obshap.size();
1898 uint phased_cnt = 0;
1899 for (uint j = 0; j < d_1[i]->obshap.size(); j++) {
1900 for (uint k = 0; k < d_2[i]->obshap.size(); k++) {
1901 // cout << " " << d_1[i]->obshap[j] << " + " << d_2[i]->obshap[k];
1902 //
1903 // Record each pair of haplotypes that has been seen phased previously.
1904 //
1905 if (phased_haplotypes.count(string(d_1[i]->obshap[j]) + string(d_2[i]->obshap[k]))) {
1906 seen_phased.push_back(make_pair(d_1[i]->obshap[j], d_2[i]->obshap[k]));
1907 // cout << " => " << d_1[i]->obshap[j] << d_2[i]->obshap[k];
1908 }
1909 // cout << "\n";
1910 }
1911 }
1912 //
1913 // Now, we will iterate over all sets of phased haplotypes and look
1914 // for combinations that use all four individual haplotypes.
1915 //
1916 for (uint j = 0; j < seen_phased.size(); j++) {
1917 for (uint k = j; k < seen_phased.size(); k++) {
1918 set<char *> incorporated_haplotypes;
1919 //
1920 // Count the number of distinct char pointers. If this combination
1921 // of haplotypes includes all unphased haplotypes, count it.
1922 //
1923 incorporated_haplotypes.insert(seen_phased[j].first);
1924 incorporated_haplotypes.insert(seen_phased[j].second);
1925 incorporated_haplotypes.insert(seen_phased[k].first);
1926 incorporated_haplotypes.insert(seen_phased[k].second);
1927 if (incorporated_haplotypes.size() == tot_obshap)
1928 phased_cnt++;
1929 }
1930 }
1931
1932 //
1933 // If one pair of haplotypes is mapped, but the other is not, assume the second pair or
1934 // haplotypes must be phased by process of elimination.
1935 //
1936 if (phased_cnt == 0 && seen_phased.size() == 1) {
1937 h_1 = seen_phased[0].first == d_1[i]->obshap[1] ?
1938 d_1[i]->obshap[0] : d_1[i]->obshap[1];
1939 h_2 = seen_phased[0].second == d_2[i]->obshap[1] ?
1940 d_2[i]->obshap[0] : d_2[i]->obshap[1];
1941 phased_haplotypes.insert(string(h_1) + string(h_2));
1942 phased_cnt++;
1943 // cout << " Phasing: '" << hap_1 << "' + '" << hap_2 << "' => '" << string(hap_1) + string(hap_2) << "'\n";
1944 }
1945
1946 if (phased_cnt == 0) {
1947 phased_results.insert(nomapping_fail);
1948 if (verbose) log_fh << " Locus NOT phased in individual " << i << "; loci " << d_1[i]->id << " / " << d_2[i]->id << " no mapping found.\n";
1949 } else if (phased_cnt == 1) {
1950 phased_sample_cnt++;
1951 phased_results.insert(complex_phase);
1952 } else {
1953 phased_results.insert(multimapping_fail);
1954 if (verbose) log_fh << " Locus NOT phased in individual " << i << "; loci " << d_1[i]->id << " / " << d_2[i]->id << " multiple mappings found.\n";
1955 }
1956 }
1957 }
1958
1959 if (phased_sample_cnt != sample_cnt) {
1960 if (phased_results.count(nomapping_fail) > 0 &&
1961 phased_results.count(multimapping_fail) > 0)
1962 return multiple_fails;
1963 else if (phased_results.count(nomapping_fail) > 0)
1964 return nomapping_fail;
1965 else if (phased_results.count(multimapping_fail) > 0)
1966 return multimapping_fail;
1967 else {
1968 cout << "WE SHOULD NOT GET HERE\n";
1969 return merge_failure;
1970 }
1971 }
1972
1973 //
1974 // Okay, merge these two loci together.
1975 //
1976 if (!merge_datums(pmap->sample_cnt(), cur->len, d_1, d_2, phased_haplotypes, merge_type))
1977 return merge_failure;
1978
1979 //
1980 // Merge the catalog entries together.
1981 //
1982 if (!merge_csloci(cur, next, phased_haplotypes))
1983 return merge_failure;
1984
1985 //
1986 // Mark the merged locus for destruction.
1987 //
1988 loci_to_destroy.insert(next->id);
1989
1990 if (phased_results.count(complex_phase) > 0)
1991 return complex_phase;
1992 return simple_merge;
1993 }*/
1994
1995 /*int
1996 merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
1997 {
1998 //
1999 // We assume that we are merging two loci: one on the negative strand, one on the
2000 // positive. We will keep the sink cslocus and delete the src cslocus.
2001 // -> The sink cslocus is assumed to be on the negative strand.
2002 //
2003
2004 //
2005 // 1. Reverse complement the SNP coordinates in the sink locus so that they are
2006 // enumerated on the positive strand. Complement the alleles as well.
2007 //
2008 for (uint j = 0; j < sink->snps.size(); j++) {
2009 sink->snps[j]->col = sink->len - sink->snps[j]->col - 1;
2010 sink->snps[j]->rank_1 = reverse(sink->snps[j]->rank_1);
2011 sink->snps[j]->rank_2 = reverse(sink->snps[j]->rank_2);
2012 sink->snps[j]->rank_3 = reverse(sink->snps[j]->rank_3);
2013 sink->snps[j]->rank_4 = reverse(sink->snps[j]->rank_4);
2014 }
2015
2016 //
2017 // 2. Adjust the SNP coordinates in the src locus to account for the now, longer length.
2018 //
2019 for (uint j = 0; j < src->snps.size(); j++)
2020 src->snps[j]->col = sink->len + src->snps[j]->col - renz_olap[enz];
2021
2022 //
2023 // 3. Combine SNPs between the two catalog loci: add the SNPs from the sink (formerly on the
2024 // negative strand) in reverse order, followed by the SNPs from the src.
2025 //
2026 vector<SNP *> tmpsnp;
2027 for (int j = (int) sink->snps.size() - 1; j >= 0; j--)
2028 tmpsnp.push_back(sink->snps[j]);
2029 for (uint j = 0; j < src->snps.size(); j++)
2030 tmpsnp.push_back(src->snps[j]);
2031 sink->snps.clear();
2032 for (uint j = 0; j < tmpsnp.size(); j++)
2033 sink->snps.push_back(tmpsnp[j]);
2034
2035 //
2036 // 4. Adjust the genomic location of the sink locus.
2037 //
2038 uint bp = sink->sort_bp();
2039 sink->loc.bp = bp;
2040 sink->loc.strand = strand_plus;
2041
2042 //
2043 // 5. Adjust the length of the sequence.
2044 //
2045 sink->len += src->len - renz_olap[enz];
2046
2047 //
2048 // 6. Merge the consensus sequence together.
2049 //
2050 char *new_con = rev_comp(sink->con);
2051 delete [] sink->con;
2052 sink->con = new_con;
2053 new_con = new char[sink->len + 1];
2054 strcpy(new_con, sink->con);
2055 delete [] sink->con;
2056 sink->con = new_con;
2057 new_con += src->len - renz_olap[enz];
2058 strcpy(new_con, src->con);
2059
2060 //
2061 // 7. Record the now phased haplotypes.
2062 //
2063 sink->alleles.clear();
2064 set<string>::iterator it;
2065 for (it = phased_haplotypes.begin(); it != phased_haplotypes.end(); it++)
2066 sink->alleles[*it] = 0;
2067
2068 // cout << "CSLocus " << sink->id << ":\n"
2069 // << "Length: " << sink->len << "; Chr: " << sink->loc.chr << "; BP: " << sink->sort_bp() << "; strand: " << (sink->loc.strand == strand_plus ? "+" : "-") << "\n"
2070 // << " SNPs:\n";
2071 // for (uint j = 0; j < sink->snps.size(); j++)
2072 // cout << " Col: " << sink->snps[j]->col
2073 // << " Rank 1: " << sink->snps[j]->rank_1
2074 // << " Rank 2: " << sink->snps[j]->rank_2 << "\n";
2075 // cout << " Alleles:\n";
2076 // map<string, int>::iterator ait;
2077 // for (ait = sink->alleles.begin(); ait != sink->alleles.end(); ait++)
2078 // cout << " " << ait->first << "\n";
2079
2080 return 1;
2081 }*/
2082
2083 /*int
2084 merge_datums(int sample_cnt,
2085 int sink_locus_len,
2086 Datum **sink, Datum **src,
2087 set<string> &phased_haplotypes,
2088 int merge_type)
2089 {
2090 char tmphap[id_len], *new_hap;
2091 uint haplen, model_len, offset;
2092 vector<SNP *> tmpsnp;
2093 vector<string> tmpobshap;
2094 vector<int> tmpobsdep;
2095
2096 //
2097 // We assume that we are merging two loci: one on the negative strand, one on the
2098 // positive. We will keep the sink datum and delete the src datum.
2099 // -The sink datum is assumed to be on the negative strand.
2100 //
2101 for (int i = 0; i < sample_cnt; i++) {
2102 if (sink[i] == NULL && src[i] == NULL)
2103 continue;
2104 else if (sink[i] == NULL || src[i] == NULL)
2105 cout << "Unexpected condition in merging datums: one datum is NULL while the other is not.\n";
2106
2107 //
2108 // 1. Reverse complement the observed haplotypes in the sink locus.
2109 //
2110 haplen = strlen(sink[i]->obshap[0]);
2111 for (uint j = 0; j < sink[i]->obshap.size(); j++) {
2112 for (uint k = 0; k < haplen; k++)
2113 tmphap[k] = reverse(sink[i]->obshap[j][haplen - k - 1]);
2114 tmphap[haplen] = '\0';
2115 strcpy(sink[i]->obshap[j], tmphap);
2116 }
2117 }
2118
2119 //
2120 // 2. Combine observed haplotypes between the two datums while phasing them.
2121 // 2.1 First combine the haplotypes from samples that are already in phase.
2122 //
2123 string merged_hap;
2124 vector<int> to_be_phased;
2125 phased_haplotypes.clear();
2126 for (int i = 0; i < sample_cnt; i++) {
2127 if (sink[i] == NULL && src[i] == NULL)
2128 continue;
2129
2130 if (sink[i]->obshap.size() > 1 && src[i]->obshap.size() > 1) {
2131 to_be_phased.push_back(i);
2132 continue;
2133 } else {
2134 tmpobshap.clear();
2135 tmpobsdep.clear();
2136 for (uint j = 0; j < sink[i]->obshap.size(); j++) {
2137 for (uint k = 0; k < src[i]->obshap.size(); k++) {
2138 switch (merge_type) {
2139 case 0:
2140 merged_hap = string(sink[i]->obshap[j]) + string(src[i]->obshap[k]);
2141 break;
2142 case 1:
2143 merged_hap = string(src[i]->obshap[j]);
2144 break;
2145 case 2:
2146 merged_hap = string(sink[i]->obshap[j]);
2147 break;
2148 case 3:
2149 default:
2150 merged_hap = "consensus";
2151 break;
2152 }
2153 phased_haplotypes.insert(merged_hap);
2154 tmpobshap.push_back(merged_hap);
2155 tmpobsdep.push_back((sink[i]->depth[j] + src[i]->depth[k]) / 2);
2156 }
2157 }
2158 sink[i]->depth.clear();
2159 for (uint j = 0; j < sink[i]->obshap.size(); j++)
2160 delete [] sink[i]->obshap[j];
2161 sink[i]->obshap.clear();
2162 for (uint j = 0; j < tmpobshap.size(); j++) {
2163 new_hap = new char[tmpobshap[j].length() + 1];
2164 strcpy(new_hap, tmpobshap[j].c_str());
2165 sink[i]->obshap.push_back(new_hap);
2166 sink[i]->depth.push_back(tmpobsdep[j]);
2167 }
2168 }
2169 }
2170 //
2171 // 2.2 Phase and combine the haplotypes from the remaining samples.
2172 //
2173 int index;
2174 for (uint i = 0; i < to_be_phased.size(); i++) {
2175 index = to_be_phased[i];
2176 tmpobshap.clear();
2177 tmpobsdep.clear();
2178
2179 vector<pair<char *, char *> > seen_phased;
2180 uint tot_obshap = sink[index]->obshap.size() + src[index]->obshap.size();
2181
2182 for (uint j = 0; j < sink[index]->obshap.size(); j++) {
2183 for (uint k = 0; k < src[index]->obshap.size(); k++) {
2184 if (phased_haplotypes.count(string(sink[index]->obshap[j]) + string(src[index]->obshap[k])))
2185 seen_phased.push_back(make_pair(sink[index]->obshap[j], src[index]->obshap[k]));
2186 }
2187 }
2188
2189 for (uint j = 0; j < seen_phased.size(); j++) {
2190 for (uint k = j; k < seen_phased.size(); k++) {
2191 set<char *> incorporated_haplotypes;
2192 incorporated_haplotypes.insert(seen_phased[j].first);
2193 incorporated_haplotypes.insert(seen_phased[j].second);
2194 incorporated_haplotypes.insert(seen_phased[k].first);
2195 incorporated_haplotypes.insert(seen_phased[k].second);
2196 if (incorporated_haplotypes.size() == tot_obshap) {
2197 tmpobshap.push_back(string(seen_phased[j].first) + string(seen_phased[j].second));
2198 tmpobshap.push_back(string(seen_phased[k].first) + string(seen_phased[k].second));
2199 //tmpobsdep.push_back((sink[index]->depth[j] + src[index]->depth[k]) / 2);
2200 }
2201 }
2202 }
2203
2204 sink[index]->depth.clear();
2205 for (uint j = 0; j < sink[index]->obshap.size(); j++)
2206 delete [] sink[index]->obshap[j];
2207 sink[index]->obshap.clear();
2208 for (uint j = 0; j < tmpobshap.size(); j++) {
2209 new_hap = new char[tmpobshap[j].length() + 1];
2210 strcpy(new_hap, tmpobshap[j].c_str());
2211 sink[index]->obshap.push_back(new_hap);
2212 // sink[index]->depth.push_back(tmpobsdep[j]);
2213 }
2214 }
2215
2216 //
2217 // 3. Merge model calls; Set the length; combine the two depth and lnl measures together.
2218 //
2219 string model_calls;
2220 char *p;
2221
2222 for (int i = 0; i < sample_cnt; i++) {
2223 if (sink[i] == NULL && src[i] == NULL)
2224 continue;
2225
2226 //
2227 // Merge the two strings of model calls together.
2228 // We need to check if the locus for this individual is shorter than the catalog
2229 // locus. If so, we need to expand out the model call array to be the proper length.
2230 //
2231 reverse_string(sink[i]->model);
2232 offset = 0;
2233 model_calls.clear();
2234 if (sink_locus_len > sink[i]->len) {
2235 offset = sink_locus_len - sink[i]->len;
2236 model_calls.assign(offset, 'N');
2237 }
2238 model_len = offset + sink[i]->len + src[i]->len - renz_olap[enz];
2239 model_calls.append(sink[i]->model);
2240 delete [] sink[i]->model;
2241 sink[i]->model = new char[model_len + 1];
2242 strcpy(sink[i]->model, model_calls.c_str());
2243 p = sink[i]->model;
2244 p += offset + sink[i]->len - renz_olap[enz];
2245 strcpy(p, src[i]->model);
2246
2247 sink[i]->len = model_len;
2248 sink[i]->tot_depth = (sink[i]->tot_depth + src[i]->tot_depth) / 2;
2249 sink[i]->lnl = (sink[i]->lnl + src[i]->lnl) / 2.0;
2250
2251 //
2252 // Record which datum was merged into this one.
2253 //
2254 sink[i]->merge_partner = src[i]->id;
2255 }
2256
2257 return 1;
2258 }*/
2259
2260 int
create_genotype_map(CSLocus * locus,Datum ** d,int sample_cnt)2261 create_genotype_map(CSLocus *locus, Datum **d, int sample_cnt)
2262 {
2263 //
2264 // Create a genotype map. For any set of haplotypes, this routine will
2265 // assign each haplotype to a genotype, e.g. given the haplotypes
2266 // 'AC' and 'GT' in the population, this routine will assign 'AC' == 'a'
2267 // and 'GT' == 'b'. If an individual is homozygous for 'AC', they will be
2268 // assigned an 'aa' genotype.
2269 //
2270 //cout << "Creating genotype map for catalog ID " << locus->id << ", marker: " << locus->marker << ".\n";
2271
2272 char gtypes[26] ={'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
2273 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
2274 'u', 'v', 'w', 'x', 'y', 'z'};
2275
2276 map<string, int> haplotypes;
2277 map<string, int>::iterator k;
2278 vector<pair<string, int> > sorted_haplotypes;
2279
2280 for (int i = 0; i < sample_cnt; i++) {
2281
2282 if (d[i] != NULL)
2283 for (uint n = 0; n < d[i]->obshap.size(); n++)
2284 haplotypes[d[i]->obshap[n]]++;
2285 }
2286
2287 //
2288 // Check that there are not more haplotypes than we have encodings.
2289 //
2290 if (haplotypes.size() > 26) return 0;
2291
2292 //
2293 // Sort the haplotypes map by value
2294 //
2295 for (k = haplotypes.begin(); k != haplotypes.end(); k++)
2296 sorted_haplotypes.push_back(*k);
2297 sort(sorted_haplotypes.begin(), sorted_haplotypes.end(), hap_compare);
2298
2299 for (uint n = 0, index = 0; n < sorted_haplotypes.size() && index <= 26; n++, index++) {
2300 locus->gmap[sorted_haplotypes[n].first] = gtypes[index];
2301 //cout << "GMAP: " << sorted_haplotypes[n].first << " == " << gtypes[index] << "\n";
2302 }
2303
2304 return 0;
2305 }
2306
2307 int
call_population_genotypes(CSLocus * locus,Datum ** d,int sample_cnt)2308 call_population_genotypes(CSLocus *locus, Datum **d, int sample_cnt)
2309 {
2310 //
2311 // Fetch the array of observed haplotypes from the population
2312 //
2313 for (int i = 0; i < sample_cnt; i++) {
2314 if (d[i] == NULL)
2315 continue;
2316
2317 vector<string> gtypes;
2318 string gtype;
2319
2320 //cout << "Sample Id: " << pmap->rev_sample_index(i) << "\n";
2321
2322 for (uint j = 0; j < d[i]->obshap.size(); j++) {
2323 //
2324 // Impossible allele encountered.
2325 //
2326 if (locus->gmap.count(d[i]->obshap[j]) == 0) {
2327 gtypes.clear();
2328 gtypes.push_back("-");
2329 goto impossible;
2330 }
2331
2332 gtypes.push_back(locus->gmap[d[i]->obshap[j]]);
2333 //cout << " Observed Haplotype: " << d[i]->obshap[j] << ", Genotype: " << locus->gmap[d[i]->obshap[j]] << "\n";
2334 }
2335
2336 impossible:
2337 sort(gtypes.begin(), gtypes.end());
2338 for (uint j = 0; j < gtypes.size(); j++) {
2339 gtype += gtypes[j];
2340 //cout << " Adding genotype to string: " << gtypes[j] << "; " << gtype << "\n";
2341 }
2342
2343 string m = gtype.length() == 1 ?
2344 gtype + gtype : gtype;
2345
2346 d[i]->gtype = new char[m.length() + 1];
2347 strcpy(d[i]->gtype, m.c_str());
2348
2349 if (m != "-")
2350 locus->gcnt++;
2351
2352 //cout << "Assigning datum, marker: " << locus->marker << ", string: " << m << ", haplotype: " << d[i]->obshap[0] << ", gtype: " << gtype << "\n";
2353 }
2354
2355 return 0;
2356 }
2357
SumStatsSummary(size_t pop_cnt)2358 SumStatsSummary::SumStatsSummary(size_t pop_cnt)
2359 {
2360 this->_pop_cnt = pop_cnt;
2361 this->_private_cnt = new size_t[this->_pop_cnt];
2362 this->_sig_hwe_dev = new size_t[this->_pop_cnt];
2363 this->_n = new double[this->_pop_cnt];
2364 this->_var_sites = new double[this->_pop_cnt];
2365
2366 this->_num_indv_mean = new double[this->_pop_cnt];
2367 this->_num_indv_acc_mean = new double[this->_pop_cnt];
2368 this->_num_indv_var = new double[this->_pop_cnt];
2369 this->_p_mean = new double[this->_pop_cnt];
2370 this->_p_acc_mean = new double[this->_pop_cnt];
2371 this->_p_var = new double[this->_pop_cnt];
2372 this->_obs_het_mean = new double[this->_pop_cnt];
2373 this->_obs_het_acc_mean = new double[this->_pop_cnt];
2374 this->_obs_het_var = new double[this->_pop_cnt];
2375 this->_obs_hom_mean = new double[this->_pop_cnt];
2376 this->_obs_hom_acc_mean = new double[this->_pop_cnt];
2377 this->_obs_hom_var = new double[this->_pop_cnt];
2378 this->_exp_het_mean = new double[this->_pop_cnt];
2379 this->_exp_het_acc_mean = new double[this->_pop_cnt];
2380 this->_exp_het_var = new double[this->_pop_cnt];
2381 this->_exp_hom_mean = new double[this->_pop_cnt];
2382 this->_exp_hom_acc_mean = new double[this->_pop_cnt];
2383 this->_exp_hom_var = new double[this->_pop_cnt];
2384 this->_pi_mean = new double[this->_pop_cnt];
2385 this->_pi_acc_mean = new double[this->_pop_cnt];
2386 this->_pi_var = new double[this->_pop_cnt];
2387 this->_fis_mean = new double[this->_pop_cnt];
2388 this->_fis_acc_mean = new double[this->_pop_cnt];
2389 this->_fis_var = new double[this->_pop_cnt];
2390
2391 this->_n_all = new double[this->_pop_cnt];
2392 this->_num_indv_mean_all = new double[this->_pop_cnt];
2393 this->_num_indv_acc_mean_all = new double[this->_pop_cnt];
2394 this->_num_indv_var_all = new double[this->_pop_cnt];
2395 this->_p_mean_all = new double[this->_pop_cnt];
2396 this->_p_acc_mean_all = new double[this->_pop_cnt];
2397 this->_p_var_all = new double[this->_pop_cnt];
2398 this->_obs_het_mean_all = new double[this->_pop_cnt];
2399 this->_obs_het_acc_mean_all = new double[this->_pop_cnt];
2400 this->_obs_het_var_all = new double[this->_pop_cnt];
2401 this->_obs_hom_mean_all = new double[this->_pop_cnt];
2402 this->_obs_hom_acc_mean_all = new double[this->_pop_cnt];
2403 this->_obs_hom_var_all = new double[this->_pop_cnt];
2404 this->_exp_het_mean_all = new double[this->_pop_cnt];
2405 this->_exp_het_acc_mean_all = new double[this->_pop_cnt];
2406 this->_exp_het_var_all = new double[this->_pop_cnt];
2407 this->_exp_hom_mean_all = new double[this->_pop_cnt];
2408 this->_exp_hom_acc_mean_all = new double[this->_pop_cnt];
2409 this->_exp_hom_var_all = new double[this->_pop_cnt];
2410 this->_pi_mean_all = new double[this->_pop_cnt];
2411 this->_pi_acc_mean_all = new double[this->_pop_cnt];
2412 this->_pi_var_all = new double[this->_pop_cnt];
2413 this->_fis_mean_all = new double[this->_pop_cnt];
2414 this->_fis_acc_mean_all = new double[this->_pop_cnt];
2415 this->_fis_var_all = new double[this->_pop_cnt];
2416
2417 this->_sq_n = new double[this->_pop_cnt];
2418 this->_sq_n_all = new double[this->_pop_cnt];
2419
2420 for (uint j = 0; j < this->_pop_cnt; j++) {
2421 this->_private_cnt[j] = 0;
2422 this->_sig_hwe_dev[j] = 0;
2423 this->_n[j] = 0.0;
2424 this->_var_sites[j] = 0.0;
2425 this->_num_indv_mean[j] = 0.0;
2426 this->_num_indv_acc_mean[j] = 0.0;
2427 this->_num_indv_var[j] = 0.0;
2428 this->_p_mean[j] = 0.0;
2429 this->_p_acc_mean[j] = 0.0;
2430 this->_p_var[j] = 0.0;
2431 this->_obs_het_mean[j] = 0.0;
2432 this->_obs_het_acc_mean[j] = 0.0;
2433 this->_obs_het_var[j] = 0.0;
2434 this->_obs_hom_mean[j] = 0.0;
2435 this->_obs_hom_acc_mean[j] = 0.0;
2436 this->_obs_hom_var[j] = 0.0;
2437 this->_exp_het_mean[j] = 0.0;
2438 this->_exp_het_acc_mean[j] = 0.0;
2439 this->_exp_het_var[j] = 0.0;
2440 this->_exp_hom_mean[j] = 0.0;
2441 this->_exp_hom_acc_mean[j] = 0.0;
2442 this->_exp_hom_var[j] = 0.0;
2443 this->_pi_mean[j] = 0.0;
2444 this->_pi_acc_mean[j] = 0.0;
2445 this->_pi_var[j] = 0.0;
2446 this->_fis_mean[j] = 0.0;
2447 this->_fis_acc_mean[j] = 0.0;
2448 this->_fis_var[j] = 0.0;
2449
2450 this->_n_all[j] = 0.0;
2451 this->_num_indv_mean_all[j] = 0.0;
2452 this->_num_indv_acc_mean_all[j] = 0.0;
2453 this->_num_indv_var_all[j] = 0.0;
2454 this->_p_mean_all[j] = 0.0;
2455 this->_p_acc_mean_all[j] = 0.0;
2456 this->_p_var_all[j] = 0.0;
2457 this->_obs_het_mean_all[j] = 0.0;
2458 this->_obs_het_acc_mean_all[j] = 0.0;
2459 this->_obs_het_var_all[j] = 0.0;
2460 this->_obs_hom_mean_all[j] = 0.0;
2461 this->_obs_hom_acc_mean_all[j] = 0.0;
2462 this->_obs_hom_var_all[j] = 0.0;
2463 this->_exp_het_mean_all[j] = 0.0;
2464 this->_exp_het_acc_mean_all[j] = 0.0;
2465 this->_exp_het_var_all[j] = 0.0;
2466 this->_exp_hom_mean_all[j] = 0.0;
2467 this->_exp_hom_acc_mean_all[j] = 0.0;
2468 this->_exp_hom_var_all[j] = 0.0;
2469 this->_pi_mean_all[j] = 0.0;
2470 this->_pi_acc_mean_all[j] = 0.0;
2471 this->_pi_var_all[j] = 0.0;
2472 this->_fis_mean_all[j] = 0.0;
2473 this->_fis_acc_mean_all[j] = 0.0;
2474 this->_fis_var_all[j] = 0.0;
2475
2476 this->_sq_n[j] = 0.0;
2477 this->_sq_n_all[j] = 0.0;
2478 }
2479
2480 this->_locus_n = 0.0;
2481 this->_locus_overlap_n = 0.0;
2482 this->_locus_pe_ctg_n = 0.0;
2483 this->_locus_len_mean = 0.0;
2484 this->_locus_len_acc_mean = 0.0;
2485 this->_locus_len_var = 0.0;
2486 this->_overlap_mean = 0.0;
2487 this->_overlap_acc_mean = 0.0;
2488 this->_overlap_var = 0.0;
2489 this->_locus_gt_sites_mean = 0.0;
2490 this->_locus_gt_sites_acc_mean = 0.0;
2491 this->_locus_gt_sites_var = 0.0;
2492 }
2493
2494 int
accumulate(const vector<LocBin * > & loci)2495 SumStatsSummary::accumulate(const vector<LocBin *> &loci)
2496 {
2497 //
2498 // We are calculating the mean, variance, and standard deviation for several variables.
2499 // We will calculate them partially, for each set of loci input to the program using
2500 // the algorithm described in:
2501 // B. P. Welford. (1962) Note on a Method for Calculating Corrected Sums of Squares and
2502 // Products. Technometrics: 4(3), pp. 419-420.
2503 //
2504 CSLocus *cloc;
2505 const LocSum *s;
2506 const LocTally *t;
2507
2508 for (uint i = 0; i < loci.size(); i++) {
2509 cloc = loci[i]->cloc;
2510 t = loci[i]->s->meta_pop();
2511
2512 size_t site_cnt = 0;
2513
2514 for (uint pos = 0; pos < cloc->len; pos++) {
2515 //
2516 // Compile private alleles
2517 //
2518 if (t->nucs[pos].priv_allele >= 0)
2519 _private_cnt[t->nucs[pos].priv_allele]++;
2520
2521 if (t->nucs[pos].allele_cnt == 2) {
2522 site_cnt++;
2523
2524 for (uint pop = 0; pop < this->_pop_cnt; pop++) {
2525
2526 s = loci[i]->s->per_pop(pop);
2527
2528 if (s->nucs[pos].num_indv == 0) continue;
2529
2530 _n[pop]++;
2531
2532 if (s->nucs[pos].pi > 0) _var_sites[pop]++;
2533
2534 //
2535 // Record if site deviates from HWE.
2536 //
2537 if (calc_hwp && s->nucs[pos].stat[2] < p_value_cutoff) _sig_hwe_dev[pop]++;
2538
2539 //
2540 // Accumulate sums for each variable to calculate the means.
2541 //
2542 _num_indv_mean[pop] += s->nucs[pos].num_indv;
2543 _p_mean[pop] += s->nucs[pos].p;
2544 _obs_het_mean[pop] += s->nucs[pos].obs_het;
2545 _obs_hom_mean[pop] += s->nucs[pos].obs_hom;
2546 _exp_het_mean[pop] += s->nucs[pos].exp_het;
2547 _exp_hom_mean[pop] += s->nucs[pos].exp_hom;
2548 _pi_mean[pop] += s->nucs[pos].stat[0];
2549 _fis_mean[pop] += s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0;
2550
2551 _n_all[pop]++;
2552 _num_indv_mean_all[pop] += s->nucs[pos].num_indv;
2553 _p_mean_all[pop] += s->nucs[pos].p;
2554 _obs_het_mean_all[pop] += s->nucs[pos].obs_het;
2555 _obs_hom_mean_all[pop] += s->nucs[pos].obs_hom;
2556 _exp_het_mean_all[pop] += s->nucs[pos].exp_het;
2557 _exp_hom_mean_all[pop] += s->nucs[pos].exp_hom;
2558 _pi_mean_all[pop] += s->nucs[pos].stat[0];
2559 _fis_mean_all[pop] += s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0;
2560
2561 //
2562 // Accumulate a partial sum of squares to calculate the variance.
2563 //
2564 _num_indv_var[pop] += this->online_variance(s->nucs[pos].num_indv, _num_indv_acc_mean[pop], _n[pop]);
2565 _p_var[pop] += this->online_variance(s->nucs[pos].p, _p_acc_mean[pop], _n[pop]);
2566 _obs_het_var[pop] += this->online_variance(s->nucs[pos].obs_het, _obs_het_acc_mean[pop], _n[pop]);
2567 _obs_hom_var[pop] += this->online_variance(s->nucs[pos].obs_hom, _obs_hom_acc_mean[pop], _n[pop]);
2568 _exp_het_var[pop] += this->online_variance(s->nucs[pos].exp_het, _exp_het_acc_mean[pop], _n[pop]);
2569 _exp_hom_var[pop] += this->online_variance(s->nucs[pos].exp_hom, _exp_hom_acc_mean[pop], _n[pop]);
2570 _pi_var[pop] += this->online_variance(s->nucs[pos].stat[0], _pi_acc_mean[pop], _n[pop]);
2571 _fis_var[pop] += this->online_variance(s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0, _fis_acc_mean[pop], _n[pop]);
2572
2573 _num_indv_var_all[pop] += this->online_variance(s->nucs[pos].num_indv, _num_indv_acc_mean_all[pop], _n_all[pop]);
2574 _p_var_all[pop] += this->online_variance(s->nucs[pos].p, _p_acc_mean_all[pop], _n_all[pop]);
2575 _obs_het_var_all[pop] += this->online_variance(s->nucs[pos].obs_het, _obs_het_acc_mean_all[pop], _n_all[pop]);
2576 _obs_hom_var_all[pop] += this->online_variance(s->nucs[pos].obs_hom, _obs_hom_acc_mean_all[pop], _n_all[pop]);
2577 _exp_het_var_all[pop] += this->online_variance(s->nucs[pos].exp_het, _exp_het_acc_mean_all[pop], _n_all[pop]);
2578 _exp_hom_var_all[pop] += this->online_variance(s->nucs[pos].exp_hom, _exp_hom_acc_mean_all[pop], _n_all[pop]);
2579 _pi_var_all[pop] += this->online_variance(s->nucs[pos].stat[0], _pi_acc_mean_all[pop], _n_all[pop]);
2580 _fis_var_all[pop] += this->online_variance(s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0, _fis_acc_mean_all[pop], _n_all[pop]);
2581 }
2582
2583 } else if (t->nucs[pos].allele_cnt == 1) {
2584 site_cnt++;
2585
2586 for (uint pop = 0; pop < this->_pop_cnt; pop++) {
2587 s = loci[i]->s->per_pop(pop);
2588
2589 if (s->nucs[pos].num_indv == 0) continue;
2590
2591 _n_all[pop]++;
2592 _num_indv_mean_all[pop] += s->nucs[pos].num_indv;
2593 _p_mean_all[pop] += s->nucs[pos].p;
2594 _obs_het_mean_all[pop] += s->nucs[pos].obs_het;
2595 _obs_hom_mean_all[pop] += s->nucs[pos].obs_hom;
2596 _exp_het_mean_all[pop] += s->nucs[pos].exp_het;
2597 _exp_hom_mean_all[pop] += s->nucs[pos].exp_hom;
2598 _pi_mean_all[pop] += s->nucs[pos].stat[0];
2599 _fis_mean_all[pop] += s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0;
2600
2601 _num_indv_var_all[pop] += this->online_variance(s->nucs[pos].num_indv, _num_indv_acc_mean_all[pop], _n_all[pop]);
2602 _p_var_all[pop] += this->online_variance(s->nucs[pos].p, _p_acc_mean_all[pop], _n_all[pop]);
2603 _obs_het_var_all[pop] += this->online_variance(s->nucs[pos].obs_het, _obs_het_acc_mean_all[pop], _n_all[pop]);
2604 _obs_hom_var_all[pop] += this->online_variance(s->nucs[pos].obs_hom, _obs_hom_acc_mean_all[pop], _n_all[pop]);
2605 _exp_het_var_all[pop] += this->online_variance(s->nucs[pos].exp_het, _exp_het_acc_mean_all[pop], _n_all[pop]);
2606 _exp_hom_var_all[pop] += this->online_variance(s->nucs[pos].exp_hom, _exp_hom_acc_mean_all[pop], _n_all[pop]);
2607 _pi_var_all[pop] += this->online_variance(s->nucs[pos].stat[0], _pi_acc_mean_all[pop], _n_all[pop]);
2608 _fis_var_all[pop] += this->online_variance(s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0, _fis_acc_mean_all[pop], _n_all[pop]);
2609 }
2610 }
2611 }
2612
2613 //
2614 // Accumulate the locus length and overlap.
2615 //
2616 _locus_n++;
2617 _locus_gt_sites_mean += site_cnt;
2618 _locus_gt_sites_var += this->online_variance(site_cnt, _locus_gt_sites_acc_mean, _locus_n);
2619
2620 if (cloc->pe_ctg) {
2621 _locus_pe_ctg_n++;
2622 _locus_len_mean_all += cloc->len - 10;
2623 _locus_len_var_all += this->online_variance(cloc->len - 10, _locus_len_acc_mean_all, _locus_pe_ctg_n);
2624
2625 if (cloc->overlap > 0) {
2626 _locus_overlap_n++;
2627 _locus_len_mean += cloc->len;
2628 _locus_len_var += this->online_variance(cloc->len, _locus_len_acc_mean, _locus_overlap_n);
2629 _overlap_mean += cloc->overlap;
2630 _overlap_var += this->online_variance(cloc->overlap, _overlap_acc_mean, _locus_overlap_n);
2631 }
2632 }
2633 }
2634
2635 return 0;
2636 }
2637
2638 inline double
online_variance(double x,double & acc_mean,double n)2639 SumStatsSummary::online_variance(double x, double &acc_mean, double n)
2640 {
2641 double delta1, delta2;
2642
2643 delta1 = x - acc_mean;
2644 acc_mean += delta1 / n;
2645 delta2 = x - acc_mean;
2646 return delta1 * delta2;
2647 }
2648
2649 int
final_calculation()2650 SumStatsSummary::final_calculation()
2651 {
2652 //
2653 // Finish the mean calculation.
2654 //
2655 for (uint j = 0; j < this->_pop_cnt; j++) {
2656 _num_indv_mean[j] = _num_indv_mean[j] / _n[j];
2657 _p_mean[j] = _p_mean[j] / _n[j];
2658 _obs_het_mean[j] = _obs_het_mean[j] / _n[j];
2659 _obs_hom_mean[j] = _obs_hom_mean[j] / _n[j];
2660 _exp_het_mean[j] = _exp_het_mean[j] / _n[j];
2661 _exp_hom_mean[j] = _exp_hom_mean[j] / _n[j];
2662 _pi_mean[j] = _pi_mean[j] / _n[j];
2663 _fis_mean[j] = _fis_mean[j] / _n[j];
2664
2665 _num_indv_mean_all[j] = _num_indv_mean_all[j] / _n_all[j];
2666 _p_mean_all[j] = _p_mean_all[j] / _n_all[j];
2667 _obs_het_mean_all[j] = _obs_het_mean_all[j] / _n_all[j];
2668 _obs_hom_mean_all[j] = _obs_hom_mean_all[j] / _n_all[j];
2669 _exp_het_mean_all[j] = _exp_het_mean_all[j] / _n_all[j];
2670 _exp_hom_mean_all[j] = _exp_hom_mean_all[j] / _n_all[j];
2671 _pi_mean_all[j] = _pi_mean_all[j] / _n_all[j];
2672 _fis_mean_all[j] = _fis_mean_all[j] / _n_all[j];
2673 }
2674
2675 _locus_len_mean = _locus_len_mean / _locus_overlap_n;
2676 _locus_len_mean_all = _locus_len_mean_all / _locus_pe_ctg_n;
2677 _overlap_mean = _overlap_mean / _locus_overlap_n;
2678 _locus_gt_sites_mean = _locus_gt_sites_mean / _locus_n;
2679
2680 //
2681 // Finish the online variance calculation.
2682 //
2683 for (uint j = 0; j < this->_pop_cnt; j++) {
2684 _num_indv_var[j] = _num_indv_var[j] / (_n[j] - 1);
2685 _p_var[j] = _p_var[j] / (_n[j] - 1);
2686 _obs_het_var[j] = _obs_het_var[j] / (_n[j] - 1);
2687 _obs_hom_var[j] = _obs_hom_var[j] / (_n[j] - 1);
2688 _exp_het_var[j] = _exp_het_var[j] / (_n[j] - 1);
2689 _exp_hom_var[j] = _exp_hom_var[j] / (_n[j] - 1);
2690 _pi_var[j] = _pi_var[j] / (_n[j] - 1);
2691 _fis_var[j] = _fis_var[j] / (_n[j] - 1);
2692
2693 _num_indv_var_all[j] = _num_indv_var_all[j] / (_n_all[j] - 1);
2694 _p_var_all[j] = _p_var_all[j] / (_n_all[j] - 1);
2695 _obs_het_var_all[j] = _obs_het_var_all[j] / (_n_all[j] - 1);
2696 _obs_hom_var_all[j] = _obs_hom_var_all[j] / (_n_all[j] - 1);
2697 _exp_het_var_all[j] = _exp_het_var_all[j] / (_n_all[j] - 1);
2698 _exp_hom_var_all[j] = _exp_hom_var_all[j] / (_n_all[j] - 1);
2699 _pi_var_all[j] = _pi_var_all[j] / (_n_all[j] - 1);
2700 _fis_var_all[j] = _fis_var_all[j] / (_n_all[j] - 1);
2701 }
2702
2703 _locus_len_var = _locus_len_var / (_locus_overlap_n - 1);
2704 _locus_len_var_all = _locus_len_var_all / (_locus_pe_ctg_n - 1);
2705 _overlap_var = _overlap_var / (_locus_overlap_n - 1);
2706
2707 _locus_gt_sites_var = _locus_gt_sites_var / (_locus_n - 1);
2708
2709 //
2710 // Calculate the first half of the standard deviation.
2711 //
2712 for (uint j = 0; j < this->_pop_cnt; j++) {
2713 _sq_n[j] = sqrt(_n[j]);
2714 _sq_n_all[j] = sqrt(_n_all[j]);
2715 }
2716
2717 return 0;
2718 }
2719
2720 int
write_results()2721 SumStatsSummary::write_results()
2722 {
2723 string path = out_path + out_prefix + ".sumstats_summary.tsv";
2724 ofstream fh(path.c_str(), ofstream::out);
2725 if (fh.fail()) {
2726 cerr << "Error opening sumstats summary file '" << path << "'\n";
2727 exit(1);
2728 }
2729 fh.precision(fieldw);
2730 fh.setf(std::ios::fixed);
2731
2732 //
2733 // Write out locus length and overlap statistics for de novo data.
2734 //
2735 ostream os (cout.rdbuf());
2736 os << std::fixed << std::setprecision(2);
2737 if (!loci_ordered && this->_locus_pe_ctg_n > 0) {
2738 os << "Number of loci with PE contig: " << this->_locus_pe_ctg_n << " ("
2739 << as_percentage(this->_locus_pe_ctg_n, this->_locus_n) << ");\n"
2740 << " Mean length of loci: " << this->_locus_len_mean_all << "bp "
2741 << "(stderr " << sqrt(this->_locus_len_var_all) / sqrt(this->_locus_n) << ");\n"
2742 << "Number of loci with SE/PE overlap: " << this->_locus_overlap_n << " ("
2743 << as_percentage(this->_locus_overlap_n, this->_locus_n) << ");\n"
2744 << " Mean length of overlapping loci: " << this->_locus_len_mean << "bp "
2745 << "(stderr " << sqrt(this->_locus_len_var) / sqrt(this->_locus_n) << "); "
2746 << "mean overlap: " << this->_overlap_mean
2747 << "bp (stderr " << sqrt(this->_overlap_var) / sqrt(this->_locus_n) << ");\n";
2748 }
2749 os << "Mean genotyped sites per locus: " << this->_locus_gt_sites_mean << "bp "
2750 << "(stderr " << sqrt(this->_locus_gt_sites_var) / sqrt(this->_locus_n) << ").\n";
2751
2752 //
2753 // Write out summary statistics of the summary statistics.
2754 //
2755 fh << "# Variant positions\n"
2756 << "# Pop ID\t"
2757 << "Private\t"
2758 << "Num_Indv\t"
2759 << "Var\t"
2760 << "StdErr\t"
2761 << "P\t"
2762 << "Var\t"
2763 << "StdErr\t"
2764 << "Obs_Het\t"
2765 << "Var\t"
2766 << "StdErr\t"
2767 << "Obs_Hom\t"
2768 << "Var\t"
2769 << "StdErr\t"
2770 << "Exp_Het\t"
2771 << "Var\t"
2772 << "StdErr\t"
2773 << "Exp_Hom\t"
2774 << "Var\t"
2775 << "StdErr\t"
2776 << "Pi\t"
2777 << "Var\t"
2778 << "StdErr\t"
2779 << "Fis\t"
2780 << "Var\t"
2781 << "StdErr\n";
2782
2783 for (uint j = 0; j < this->_pop_cnt; j++)
2784 fh << mpopi.pops()[j].name << "\t"
2785 << _private_cnt[j] << "\t"
2786 << _num_indv_mean[j] << "\t"
2787 << _num_indv_var[j] << "\t"
2788 << sqrt(_num_indv_var[j]) / _sq_n[j] << "\t"
2789 << _p_mean[j] << "\t"
2790 << _p_var[j] << "\t"
2791 << sqrt(_p_var[j]) / _sq_n[j] << "\t"
2792 << _obs_het_mean[j] << "\t"
2793 << _obs_het_var[j] << "\t"
2794 << sqrt(_obs_het_var[j]) / _sq_n[j] << "\t"
2795 << _obs_hom_mean[j] << "\t"
2796 << _obs_hom_var[j] << "\t"
2797 << sqrt(_obs_hom_var[j]) / _sq_n[j] << "\t"
2798 << _exp_het_mean[j] << "\t"
2799 << _exp_het_var[j] << "\t"
2800 << sqrt(_exp_het_var[j]) / _sq_n[j] << "\t"
2801 << _exp_hom_mean[j] << "\t"
2802 << _exp_hom_var[j] << "\t"
2803 << sqrt(_exp_hom_var[j]) / _sq_n[j] << "\t"
2804 << _pi_mean[j] << "\t"
2805 << _pi_var[j] << "\t"
2806 << sqrt(_pi_var[j]) / _sq_n[j] << "\t"
2807 << _fis_mean[j] << "\t"
2808 << _fis_var[j] << "\t"
2809 << sqrt(_num_indv_var[j]) / _sq_n[j] << "\n";
2810
2811 cout << "\nPopulation summary statistics (more detail in populations.sumstats_summary.tsv):\n";
2812
2813 for (uint j = 0; j < this->_pop_cnt; j++)
2814 cout << " " << mpopi.pops()[j].name << ": "
2815 << setprecision(fieldw) << _num_indv_mean[j] << " samples per locus; "
2816 << "pi: " << _pi_mean[j] << "; "
2817 << setprecision(10) << "all/variant/polymorphic sites: " << _n_all[j] << "/" << _n[j] << "/" << _var_sites[j] << "; "
2818 << setprecision(fieldw) << "private alleles: " << _private_cnt[j] << "\n";
2819
2820 fh << "# All positions (variant and fixed)\n"
2821 << "# Pop ID\t"
2822 << "Private\t"
2823 << "Sites\t"
2824 << "Variant_Sites\t"
2825 << "Polymorphic_Sites\t"
2826 << "%Polymorphic_Loci\t"
2827 << "Num_Indv\t"
2828 << "Var\t"
2829 << "StdErr\t"
2830 << "P\t"
2831 << "Var\t"
2832 << "StdErr\t"
2833 << "Obs_Het\t"
2834 << "Var\t"
2835 << "StdErr\t"
2836 << "Obs_Hom\t"
2837 << "Var\t"
2838 << "StdErr\t"
2839 << "Exp_Het\t"
2840 << "Var\t"
2841 << "StdErr\t"
2842 << "Exp_Hom\t"
2843 << "Var\t"
2844 << "StdErr\t"
2845 << "Pi\t"
2846 << "Var\t"
2847 << "StdErr\t"
2848 << "Fis\t"
2849 << "Var\t"
2850 << "StdErr\n";
2851
2852 for (uint j = 0; j < this->_pop_cnt; j++) {
2853 fh << mpopi.pops()[j].name << "\t"
2854 << _private_cnt[j] << "\t" << setprecision(0)
2855 << _n_all[j] << "\t"
2856 << _n[j] << "\t"
2857 << _var_sites[j] << "\t" << setprecision(fieldw)
2858 << _var_sites[j] / _n_all[j] * 100 << "\t"
2859 << _num_indv_mean_all[j] << "\t"
2860 << _num_indv_var_all[j] << "\t"
2861 << sqrt(_num_indv_var_all[j]) / _sq_n_all[j] << "\t"
2862 << _p_mean_all[j] << "\t"
2863 << _p_var_all[j] << "\t"
2864 << sqrt(_p_var_all[j]) / _sq_n_all[j] << "\t"
2865 << _obs_het_mean_all[j] << "\t"
2866 << _obs_het_var_all[j] << "\t"
2867 << sqrt(_obs_het_var_all[j]) / _sq_n_all[j] << "\t"
2868 << _obs_hom_mean_all[j] << "\t"
2869 << _obs_hom_var_all[j] << "\t"
2870 << sqrt(_obs_hom_var_all[j]) / _sq_n_all[j] << "\t"
2871 << _exp_het_mean_all[j] << "\t"
2872 << _exp_het_var_all[j] << "\t"
2873 << sqrt(_exp_het_var_all[j]) / _sq_n_all[j] << "\t"
2874 << _exp_hom_mean_all[j] << "\t"
2875 << _exp_hom_var_all[j] << "\t"
2876 << sqrt(_exp_hom_var_all[j]) / _sq_n_all[j] << "\t"
2877 << _pi_mean_all[j] << "\t"
2878 << _pi_var_all[j] << "\t"
2879 << sqrt(_pi_var_all[j]) / _sq_n_all[j] << "\t"
2880 << _fis_mean_all[j] << "\t"
2881 << _fis_var_all[j] << "\t"
2882 << sqrt(_num_indv_var_all[j]) / _sq_n_all[j] << "\n";
2883 }
2884
2885 fh.close();
2886
2887 if (calc_hwp) {
2888 os << "\n"
2889 << "Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<" << p_value_cutoff << "):\n";
2890 for (uint j = 0; j < this->_pop_cnt; j++)
2891 os << " "
2892 << mpopi.pops()[j].name << ": "
2893 << _sig_hwe_dev[j] << "\n";
2894 }
2895
2896 return 0;
2897 }
2898
~SumStatsSummary()2899 SumStatsSummary::~SumStatsSummary()
2900 {
2901 delete [] this->_private_cnt;
2902 delete [] this->_n;
2903 delete [] this->_var_sites;
2904 delete [] this->_sq_n;
2905 delete [] this->_num_indv_mean;
2906 delete [] this->_num_indv_acc_mean;
2907 delete [] this->_num_indv_var;
2908 delete [] this->_p_mean;
2909 delete [] this->_p_acc_mean;
2910 delete [] this->_p_var;
2911 delete [] this->_obs_het_mean;
2912 delete [] this->_obs_het_acc_mean;
2913 delete [] this->_obs_het_var;
2914 delete [] this->_obs_hom_mean;
2915 delete [] this->_obs_hom_acc_mean;
2916 delete [] this->_obs_hom_var;
2917 delete [] this->_exp_het_mean;
2918 delete [] this->_exp_het_acc_mean;
2919 delete [] this->_exp_het_var;
2920 delete [] this->_exp_hom_mean;
2921 delete [] this->_exp_hom_acc_mean;
2922 delete [] this->_exp_hom_var;
2923 delete [] this->_pi_mean;
2924 delete [] this->_pi_acc_mean;
2925 delete [] this->_pi_var;
2926 delete [] this->_fis_mean;
2927 delete [] this->_fis_acc_mean;
2928 delete [] this->_fis_var;
2929
2930 delete [] this->_n_all;
2931 delete [] this->_sq_n_all;
2932 delete [] this->_num_indv_mean_all;
2933 delete [] this->_num_indv_acc_mean_all;
2934 delete [] this->_num_indv_var_all;
2935 delete [] this->_p_mean_all;
2936 delete [] this->_p_acc_mean_all;
2937 delete [] this->_p_var_all;
2938 delete [] this->_obs_het_mean_all;
2939 delete [] this->_obs_het_acc_mean_all;
2940 delete [] this->_obs_het_var_all;
2941 delete [] this->_obs_hom_mean_all;
2942 delete [] this->_obs_hom_acc_mean_all;
2943 delete [] this->_obs_hom_var_all;
2944 delete [] this->_exp_het_mean_all;
2945 delete [] this->_exp_het_acc_mean_all;
2946 delete [] this->_exp_het_var_all;
2947 delete [] this->_exp_hom_mean_all;
2948 delete [] this->_exp_hom_acc_mean_all;
2949 delete [] this->_exp_hom_var_all;
2950 delete [] this->_pi_mean_all;
2951 delete [] this->_pi_acc_mean_all;
2952 delete [] this->_pi_var_all;
2953 delete [] this->_fis_mean_all;
2954 delete [] this->_fis_acc_mean_all;
2955 delete [] this->_fis_var_all;
2956 }
2957
2958 int
correct_fst_bonferroni_win(vector<PopPair * > & pairs)2959 correct_fst_bonferroni_win(vector<PopPair *> &pairs)
2960 {
2961 int limit = 3 * sigma;
2962 int limit_l, limit_u;
2963 double correction;
2964 uint cnt, pos_l, pos_u;
2965
2966 pos_l = 0;
2967 pos_u = 0;
2968
2969 for (uint pos_c = 0; pos_c < pairs.size(); pos_c++) {
2970 if (pairs[pos_c] == NULL) continue;
2971
2972 limit_l = pairs[pos_c]->bp - limit > 0 ? pairs[pos_c]->bp - limit : 0;
2973 limit_u = pairs[pos_c]->bp + limit;
2974
2975 while (pos_l < pairs.size()) {
2976 if (pairs[pos_l] == NULL) {
2977 pos_l++;
2978 } else {
2979 if (pairs[pos_l]->bp < limit_l)
2980 pos_l++;
2981 else
2982 break;
2983 }
2984 }
2985 while (pos_u < pairs.size()) {
2986 if (pairs[pos_u] == NULL) {
2987 pos_u++;
2988 } else {
2989 if (pairs[pos_u]->bp < limit_u)
2990 pos_u++;
2991 else
2992 break;
2993 }
2994 }
2995
2996 cnt = 0;
2997 for (uint i = pos_l; i < pos_u; i++) {
2998 if (pairs[i] == NULL) continue;
2999 cnt++;
3000 }
3001
3002 correction = p_value_cutoff / cnt;
3003 pairs[pos_c]->stat[0] = pairs[pos_c]->fet_p < correction ? pairs[pos_c]->fst : 0;
3004 }
3005
3006 return 0;
3007 }
3008
3009 int
snpstats(const vector<LocBin * > & loci,ofstream & log_fh)3010 LocusSmoothing::snpstats(const vector<LocBin *> &loci, ofstream &log_fh)
3011 {
3012 for (uint i = 0; i < this->_mpopi->pops().size(); i++) {
3013
3014 vector<const SumStat *> sites;
3015
3016 this->_ord_ss->order(sites, loci, i);
3017 this->_ks_ss->smooth(sites);
3018 }
3019
3020 return 0;
3021 }
3022
3023 int
hapstats(const vector<LocBin * > & loci,ofstream & log_fh)3024 LocusSmoothing::hapstats(const vector<LocBin *> &loci, ofstream &log_fh)
3025 {
3026 vector<const LocStat *> sites;
3027
3028 for (uint i = 0; i < this->_mpopi->pops().size(); i++) {
3029 sites.clear();
3030
3031 this->_ord_ls->order(sites, loci);
3032 this->_ks_ls->smooth(sites);
3033 }
3034
3035 return 0;
3036 }
3037
3038 int
snp_divergence(const vector<LocBin * > & loci,const vector<vector<PopPair ** >> & div,ofstream & log_fh)3039 LocusSmoothing::snp_divergence(const vector<LocBin *> &loci, const vector<vector<PopPair **>> &div, ofstream &log_fh)
3040 {
3041 vector<const PopPair *> sites;
3042
3043 for (uint i = 0; i < div.size(); i++) {
3044 assert(div[i].size() == loci.size());
3045
3046 sites.clear();
3047
3048 if (this->_ord_pp->order(sites, loci, div[i]))
3049 this->_ks_pp->smooth(sites);
3050 }
3051
3052 return 0;
3053 }
3054
3055 int
hap_divergence(const vector<LocBin * > & loci,const vector<vector<HapStat * >> & div,const vector<HapStat * > & metadiv,ofstream & log_fh)3056 LocusSmoothing::hap_divergence(const vector<LocBin *> &loci,
3057 const vector<vector<HapStat *>> &div,
3058 const vector<HapStat *> &metadiv,
3059 ofstream &log_fh)
3060 {
3061 vector<const HapStat *> sites;
3062
3063 for (uint i = 0; i < div.size(); i++) {
3064 assert(div[i].size() == loci.size());
3065
3066 sites.clear();
3067
3068 this->_ord_hs->order(sites, loci, div[i]);
3069 this->_ks_hs->smooth(sites);
3070 }
3071
3072 //
3073 // Kernel-smooth the haplotype divergence statistics for the metapopulation.
3074 //
3075 sites.clear();
3076
3077 this->_ord_hs->order(sites, loci, metadiv);
3078 this->_ks_hs->smooth(sites);
3079
3080 return 0;
3081 }
3082
3083 int
bootstrap_popstats_approximate_dist(vector<double> & fis_samples,vector<double> & pi_samples,vector<int> & allele_samples,double * weights,int * snp_dist,int sites_per_snp,map<int,vector<double>> & approx_fis_dist,map<int,vector<double>> & approx_pi_dist)3084 bootstrap_popstats_approximate_dist(vector<double> &fis_samples,
3085 vector<double> &pi_samples,
3086 vector<int> &allele_samples,
3087 double *weights, int *snp_dist, int sites_per_snp,
3088 map<int, vector<double> > &approx_fis_dist,
3089 map<int, vector<double> > &approx_pi_dist)
3090 {
3091 //
3092 // Allocate an array of bootstrap resampling objects.
3093 //
3094 int win_size = 6 * sigma + 1;
3095 int win_cntr = win_size / 2;
3096
3097 //
3098 // Initialize the Fst distribution map.
3099 //
3100 for (int i = 0; i < max_snp_dist; i++) {
3101 if (snp_dist[i] == 0.0) continue;
3102
3103 // cout << "SNP Dist: " << i << " snps occurred " << snp_dist[i] << "\n";
3104 approx_fis_dist[i] = vector<double> ();
3105 approx_fis_dist[i].reserve(bootstrap_reps);
3106
3107 approx_pi_dist[i] = vector<double> ();
3108 approx_pi_dist[i].reserve(bootstrap_reps);
3109 }
3110
3111 vector<int> poss;
3112 poss.reserve(max_snp_dist);
3113 double weighted_fis, weighted_pi, sum_fis, sum_pi, final_weight_fis, final_weight_pi;
3114 // int index_1, index_2;
3115 int pos, index_3, dist, start, end;
3116 int half = sites_per_snp / 2;
3117
3118 for (int i = 0; i < max_snp_dist; i++) {
3119 if (snp_dist[i] == 0.0) continue;
3120
3121 cout << " Generating NULL distribution for " << i << " SNPs...\n";
3122
3123 // #pragma omp parallel private(poss, pos, index_1, index_2, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
3124 #pragma omp parallel private(poss, pos, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
3125 {
3126 BSample *bs = new BSample[win_size];
3127
3128 //
3129 // Populate the BSample objects.
3130 //
3131 for (int n = 0; n < win_size; n++)
3132 bs[n].bp = n + 1;
3133
3134 vector<double> fiss, pis;
3135
3136 //
3137 // Bootstrap this bitch.
3138 //
3139 #pragma omp for schedule(dynamic, 1)
3140 for (int j = 0; j < bootstrap_reps; j++) {
3141 // cout << " Bootsrap rep " << j << "\n";
3142
3143 //
3144 // First SNP is always placed at the center of the window.
3145 //
3146 pos = win_cntr;
3147 // index_1 = (int) (fis_samples.size() * (random() / (RAND_MAX + 1.0)));
3148 // index_2 = (int) (pi_samples.size() * (random() / (RAND_MAX + 1.0)));
3149 index_3 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
3150 //
3151 // Fill in the area around the SNP with fixed sites.
3152 //
3153 start = pos - half > 0 ? pos - half : 0;
3154 end = pos + half < win_size ? pos + half : win_size;
3155 for (int n = start; n < end; n++) {
3156 // bs[n].f = 0;
3157 // bs[n].pi = 0;
3158 bs[n].alleles = bs[pos].alleles;
3159 poss.push_back(n);
3160 }
3161 // bs[pos].f = fis_samples[index_1];
3162 // bs[pos].pi = pi_samples[index_2];
3163 bs[pos].alleles = allele_samples[index_3];
3164 // cout << " Placing SNP at position: " << pos << "; with data from " << index_1 << " filling area from " << start << " to " << end << "\n";
3165
3166 //
3167 // Randomly select the positions and values for each SNP to populate the window
3168 //
3169 for (int k = 0; k < i - 1; k++) {
3170 pos = (int) (win_size * (random() / (RAND_MAX + 1.0)));
3171 // index_1 = (int) (fis_samples.size() * (random() / (RAND_MAX + 1.0)));
3172 // index_2 = (int) (pi_samples.size() * (random() / (RAND_MAX + 1.0)));
3173 index_3 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
3174
3175 poss.push_back(pos);
3176 //
3177 // Fill in the area around the SNP with fixed sites.
3178 //
3179 start = pos - half > 0 ? pos - half : 0;
3180 end = pos + half < win_size ? pos + half : win_size;
3181 for (int n = start; n < end; n++) {
3182 // bs[n].f = 0;
3183 // bs[n].pi = 0;
3184 bs[n].alleles = bs[pos].alleles;
3185 poss.push_back(n);
3186 }
3187 // bs[pos].f = fis_samples[index_1];
3188 // bs[pos].pi = pi_samples[index_2];
3189 bs[pos].alleles = allele_samples[index_3];
3190 // cout << " Placing SNP at position: " << pos << "; with data from " << index_1 << " filling area from " << start << " to " << end << "\n";
3191 }
3192
3193 weighted_fis = 0.0;
3194 sum_fis = 0.0;
3195 weighted_pi = 0.0;
3196 sum_pi = 0.0;
3197
3198 for (int n = 0; n < win_size; n++) {
3199 // if (bs[n].pi < 0.0)
3200 // continue;
3201 //
3202 // Calculate weighted Fst at this position.
3203 //
3204 dist = bs[n].bp > bs[win_cntr].bp ? bs[n].bp - bs[win_cntr].bp : bs[win_cntr].bp - bs[n].bp;
3205
3206 final_weight_fis = (bs[n].alleles - 1) * weights[dist];
3207 // weighted_fis += bs[n].f * final_weight_fis;
3208 sum_fis += final_weight_fis;
3209
3210 final_weight_pi = (bs[n].alleles - 1) * weights[dist];
3211 // weighted_pi += bs[n].pi * final_weight_pi;
3212 sum_pi += final_weight_pi;
3213 }
3214
3215 fiss.push_back(weighted_fis / sum_fis);
3216 pis.push_back(weighted_pi / sum_pi);
3217 // cout << " New weighted fis value: " << weighted_fis / sum_fis << "; size: " << fiss.size() << "\n";
3218
3219 for (uint n = 0; n < poss.size(); n++) {
3220 // bs[poss[n]].f = 0.0;
3221 // bs[poss[n]].pi = -1.0;
3222 }
3223 poss.clear();
3224 }
3225
3226 // #pragma omp critical
3227 // {
3228 // vector<double> &f = approx_fis_dist[i];
3229 // for (uint n = 0; n < fiss.size(); n++)
3230 // f.push_back(fiss[n]);
3231 // vector<double> &p = approx_pi_dist[i];
3232 // for (uint n = 0; n < pis.size(); n++)
3233 // p.push_back(pis[n]);
3234 // }
3235
3236 delete [] bs;
3237 }
3238
3239 sort(approx_fis_dist[i].begin(), approx_fis_dist[i].end());
3240 sort(approx_pi_dist[i].begin(), approx_pi_dist[i].end());
3241 }
3242
3243 return 0;
3244 }
3245
3246 int
bootstrap_fst_approximate_dist(vector<double> & fst_samples,vector<int> & allele_samples,double * weights,int * snp_dist,map<int,vector<double>> & approx_fst_dist)3247 bootstrap_fst_approximate_dist(vector<double> &fst_samples,
3248 vector<int> &allele_samples,
3249 double *weights, int *snp_dist,
3250 map<int, vector<double> > &approx_fst_dist)
3251 {
3252 //
3253 // Allocate an array of bootstrap resampling objects.
3254 //
3255 int win_size = 6 * sigma + 1;
3256 int win_cntr = win_size / 2;
3257
3258 //
3259 // Initialize the Fst distribution map.
3260 //
3261 for (int i = 0; i < max_snp_dist; i++) {
3262 if (snp_dist[i] == 0.0) continue;
3263
3264 // cout << "SNP Dist: " << i << " snps occurred " << snp_dist[i] << "\n";
3265 approx_fst_dist[i] = vector<double> ();
3266 approx_fst_dist[i].reserve(bootstrap_reps);
3267 }
3268
3269 vector<int> poss;
3270 poss.reserve(max_snp_dist);
3271 double weighted_fst, sum, final_weight;
3272 //int index_1;
3273 int pos, index_2, dist;
3274
3275 for (int i = 0; i < max_snp_dist; i++) {
3276 if (snp_dist[i] == 0.0) continue;
3277
3278 cout << " Generating NULL distribution for " << i << " SNPs...\n";
3279
3280 // #pragma omp parallel private(poss, pos, index_1, index_2, dist, sum, weighted_fst, final_weight)
3281 #pragma omp parallel private(poss, pos, index_2, dist, sum, weighted_fst, final_weight)
3282 {
3283 BSample *bs = new BSample[win_size];
3284
3285 //
3286 // Populate the BSample objects.
3287 //
3288 for (int n = 0; n < win_size; n++)
3289 bs[n].bp = n + 1;
3290
3291 vector<double> fsts;
3292
3293 //
3294 // Bootstrap this bitch.
3295 //
3296 #pragma omp for schedule(dynamic, 1)
3297 for (int j = 0; j < bootstrap_reps; j++) {
3298 // cout << "Bootsrap rep " << j << "\n";
3299
3300 //
3301 // First SNP is always placed at the center of the window.
3302 //
3303 pos = win_cntr;
3304 // index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
3305 index_2 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
3306 // bs[pos].f = fst_samples[index_1];
3307 bs[pos].alleles = allele_samples[index_2];
3308
3309 //
3310 // Randomly select the positions and values for each SNP to populate the window
3311 //
3312 for (int k = 0; k < i - 1; k++) {
3313 pos = (int) (win_size * (random() / (RAND_MAX + 1.0)));
3314 // index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
3315 index_2 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
3316 // bs[pos].f = fst_samples[index_1];
3317 // bs[pos].alleles = allele_samples[index_2];
3318 // cout << " " << j << ": Placing SNP at position: " << pos << " with data from index " << index_1 << "\n";
3319
3320 poss.push_back(pos);
3321 }
3322
3323 weighted_fst = 0.0;
3324 sum = 0.0;
3325
3326 for (int n = 0; n < win_size; n++) {
3327 // if (bs[n].f == 0.0)
3328 // continue;
3329 //
3330 // Calculate weighted Fst at this position.
3331 //
3332 dist = bs[n].bp > bs[win_cntr].bp ? bs[n].bp - bs[win_cntr].bp : bs[win_cntr].bp - bs[n].bp;
3333
3334 final_weight = (bs[n].alleles - 1) * weights[dist];
3335 // weighted_fst += bs[n].f * final_weight;
3336 sum += final_weight;
3337 }
3338
3339 fsts.push_back(weighted_fst / sum);
3340 // cout << " New weighted Fst value: " << weighted_fst / sum << "; size: " << fsts.size() << "\n";
3341
3342 // for (uint n = 0; n < poss.size(); n++)
3343 // bs[poss[n]].f = 0.0;
3344 poss.clear();
3345 }
3346
3347 // #pragma omp critical
3348 // {
3349 // vector<double> &f = approx_fst_dist[i];
3350 // for (uint n = 0; n < fsts.size(); n++)
3351 // f.push_back(fsts[n]);
3352 // }
3353
3354 delete [] bs;
3355 }
3356
3357 sort(approx_fst_dist[i].begin(), approx_fst_dist[i].end());
3358 }
3359
3360 return 0;
3361 }
3362
3363 double
bootstrap_approximate_pval(int snp_cnt,double stat,map<int,vector<double>> & approx_dist)3364 bootstrap_approximate_pval(int snp_cnt, double stat, map<int, vector<double> > &approx_dist)
3365 {
3366 if (approx_dist.count(snp_cnt) == 0)
3367 return 1.0;
3368
3369 vector<double>::iterator up;
3370 vector<double> &dist = approx_dist[snp_cnt];
3371 double pos;
3372
3373 up = upper_bound(dist.begin(), dist.end(), stat);
3374
3375 if (up == dist.begin())
3376 pos = 1;
3377 else if (up == dist.end())
3378 pos = dist.size();
3379 else
3380 pos = up - dist.begin() + 1;
3381
3382 double res = 1.0 - (pos / (double) dist.size());
3383
3384 // cout << "Generated Approx Smoothed Fst Distribution:\n";
3385 // for (uint n = 0; n < dist.size(); n++)
3386 // cout << " n: " << n << "; Fst: " << dist[n] << "\n";
3387
3388 // cout << "Comparing Fst value: " << stat
3389 // << " at position " << (up - dist.begin()) << " out of "
3390 // << dist.size() << " positions (converted position: " << pos << "); pvalue: " << res << ".\n";
3391
3392 return res;
3393 }
3394
3395 int
load_blacklist(string path)3396 LocusFilter::load_blacklist(string path)
3397 {
3398 char line[id_len];
3399 ifstream fh(path.c_str(), ifstream::in);
3400
3401 if (fh.fail()) {
3402 cerr << "Error opening white/black list file '" << path << "'\n";
3403 exit(1);
3404 }
3405
3406 size_t line_num = 0;
3407 while (fh.getline(line, id_len)) {
3408 ++line_num;
3409
3410 //
3411 // Skip blank & commented lines ; correct windows-style line ends.
3412 //
3413 size_t len = strlen(line);
3414 if (len == 0) {
3415 continue;
3416 } else if (line[len-1] == '\r') {
3417 line[len-1] = '\0';
3418 --len;
3419 if (len == 0)
3420 continue;
3421 }
3422 char* p = line;
3423 while (isspace(*p) && *p != '\0')
3424 ++p;
3425 if (*p == '#')
3426 continue;
3427
3428 //
3429 // Parse the blacklist
3430 //
3431 char* e;
3432 int marker = (int) strtol(line, &e, 10);
3433 if (*e == '\0') {
3434 this->_blacklist.insert(marker);
3435 } else {
3436 cerr << "Error: Unable to parse blacklist '" << path << "' at line " << line_num << ".\n";
3437 throw exception();
3438 }
3439 }
3440
3441 fh.close();
3442
3443 if (this->_blacklist.size() == 0) {
3444 cerr << "Error: Unable to load any markers from '" << path << "'\n";
3445 exit(1);
3446 }
3447
3448 return (int) this->_blacklist.size();
3449 }
3450
load_marker_list(string path,set<int> & list)3451 int load_marker_list(string path, set<int> &list) {
3452 char line[id_len];
3453 ifstream fh(path.c_str(), ifstream::in);
3454
3455 if (fh.fail()) {
3456 cerr << "Error opening white/black list file '" << path << "'\n";
3457 exit(1);
3458 }
3459
3460 size_t line_num = 0;
3461 while (fh.getline(line, id_len)) {
3462 ++line_num;
3463
3464 //
3465 // Skip blank & commented lines ; correct windows-style line ends.
3466 //
3467 size_t len = strlen(line);
3468 if (len == 0) {
3469 continue;
3470 } else if (line[len-1] == '\r') {
3471 line[len-1] = '\0';
3472 --len;
3473 if (len == 0)
3474 continue;
3475 }
3476 char* p = line;
3477 while (isspace(*p) && *p != '\0')
3478 ++p;
3479 if (*p == '#')
3480 continue;
3481
3482 //
3483 // Parse the blacklist
3484 //
3485 char* e;
3486 int marker = (int) strtol(line, &e, 10);
3487 if (*e == '\0') {
3488 list.insert(marker);
3489 } else {
3490 cerr << "Error: Unable to parse blacklist '" << path << "' at line " << line_num << ".\n";
3491 throw exception();
3492 }
3493 }
3494
3495 fh.close();
3496
3497 if (list.size() == 0) {
3498 cerr << "Error: Unable to load any markers from '" << path << "'\n";
3499 exit(1);
3500 }
3501
3502 return 0;
3503 }
3504
3505 int
load_whitelist(string path)3506 LocusFilter::load_whitelist(string path)
3507 {
3508 char line[id_len];
3509 ifstream fh(path.c_str(), ifstream::in);
3510
3511 if (fh.fail()) {
3512 cerr << "Error opening white/black list file '" << path << "'\n";
3513 exit(1);
3514 }
3515
3516 vector<string> parts;
3517 uint col;
3518 char *e;
3519
3520 uint line_num = 0;
3521 while (fh.getline(line, id_len)) {
3522 ++line_num;
3523
3524 //
3525 // Skip blank & commented lines ; correct windows-style line ends.
3526 //
3527 size_t len = strlen(line);
3528 if (len == 0) {
3529 continue;
3530 } else if (line[len-1] == '\r') {
3531 line[len-1] = '\0';
3532 --len;
3533 if (len == 0)
3534 continue;
3535 }
3536 char* p = line;
3537 while (isspace(*p) && *p != '\0')
3538 ++p;
3539 if (*p == '#')
3540 continue;
3541
3542 //
3543 // Parse the whitelist, we expect:
3544 // <marker>[<tab><snp column>]
3545 //
3546 parse_tsv(line, parts);
3547
3548 if (parts.size() > 2) {
3549 cerr << "Error: Too many columns in whitelist " << path << "' at line " << line_num << "\n";
3550 exit(1);
3551 }
3552 int marker = (int) strtol(parts[0].c_str(), &e, 10);
3553 if (*e != '\0') {
3554 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3555 exit(1);
3556 }
3557 this->_whitelist[marker];
3558 if (parts.size() == 2) {
3559 col = (int) strtol(parts[1].c_str(), &e, 10);
3560 if (*e != '\0') {
3561 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3562 exit(1);
3563 }
3564 this->_whitelist[marker].insert(col);
3565 }
3566 }
3567 if (this->_whitelist.size() == 0) {
3568 cerr << "Error: Unable to load any markers from '" << path << "'\n";
3569 help();
3570 }
3571 return (int) this->_whitelist.size();
3572 }
3573
load_marker_column_list(string path,map<int,set<int>> & list)3574 int load_marker_column_list(string path, map<int, set<int> > &list) {
3575 char line[id_len];
3576 ifstream fh(path.c_str(), ifstream::in);
3577
3578 if (fh.fail()) {
3579 cerr << "Error opening white/black list file '" << path << "'\n";
3580 exit(1);
3581 }
3582
3583 vector<string> parts;
3584 uint col;
3585 char *e;
3586
3587 uint line_num = 1;
3588 while (fh.getline(line, id_len)) {
3589
3590 //
3591 // Skip blank & commented lines ; correct windows-style line ends.
3592 //
3593 size_t len = strlen(line);
3594 if (len == 0) {
3595 continue;
3596 } else if (line[len-1] == '\r') {
3597 line[len-1] = '\0';
3598 --len;
3599 if (len == 0)
3600 continue;
3601 }
3602 char* p = line;
3603 while (isspace(*p) && *p != '\0')
3604 ++p;
3605 if (*p == '#')
3606 continue;
3607
3608 //
3609 // Parse the whitelist, we expect:
3610 // <marker>[<tab><snp column>]
3611 //
3612 parse_tsv(line, parts);
3613
3614 if (parts.size() > 2) {
3615 cerr << "Error: Too many columns in whitelist " << path << "' at line " << line_num << "\n";
3616 exit(1);
3617
3618 } else if (parts.size() == 2) {
3619 int marker = (int) strtol(parts[0].c_str(), &e, 10);
3620 if (*e != '\0') {
3621 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3622 exit(1);
3623 }
3624 col = (int) strtol(parts[1].c_str(), &e, 10);
3625 if (*e != '\0') {
3626 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3627 exit(1);
3628 }
3629 list[marker].insert(col);
3630
3631 } else {
3632 int marker = (int) strtol(parts[0].c_str(), &e, 10);
3633 if (*e != '\0') {
3634 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3635 exit(1);
3636 }
3637 list.insert(make_pair(marker, set<int>()));
3638 }
3639
3640 line_num++;
3641 }
3642
3643 fh.close();
3644
3645 if (list.size() == 0) {
3646 cerr << "Error: Unable to load any markers from '" << path << "'\n";
3647 help();
3648 }
3649
3650 return 0;
3651 }
3652
3653 bool
hap_compare(const pair<string,int> & a,const pair<string,int> & b)3654 hap_compare(const pair<string, int>& a, const pair<string, int>& b)
3655 {
3656 return (a.second > b.second);
3657 }
3658
3659 void
output_parameters(ostream & fh)3660 output_parameters(ostream &fh)
3661 {
3662 fh << "populations parameters selected:\n";
3663 if (input_mode == InputMode::vcf)
3664 fh << " Input mode: VCF\n";
3665 fh
3666 << " Percent samples limit per population: " << min_samples_per_pop << "\n"
3667 << " Locus Population limit: " << min_populations << "\n"
3668 << " Percent samples overall: " << min_samples_overall << "\n"
3669 << " Minor allele frequency cutoff: " << minor_allele_freq << "\n"
3670 << " Maximum observed heterozygosity cutoff: " << max_obs_het << "\n"
3671 << " Applying Fst correction: ";
3672 switch(fst_correction) {
3673 case p_value:
3674 fh << "P-value correction.\n";
3675 break;
3676 case bonferroni_win:
3677 fh << "Bonferroni correction within sliding window.\n";
3678 break;
3679 case bonferroni_gen:
3680 fh << "Bonferroni correction across genome wide sites.\n";
3681 break;
3682 case no_correction:
3683 fh << "none.\n";
3684 break;
3685 }
3686 fh << " Pi/Fis kernel smoothing: " << (smooth_popstats == true ? "on" : "off") << "\n"
3687 << " Fstats kernel smoothing: " << (smooth_fstats == true ? "on" : "off") << "\n"
3688 << " Bootstrap resampling: ";
3689 if (bootstrap)
3690 fh << "on, " << (bootstrap_type == bs_exact ? "exact; " : "approximate; ") << bootstrap_reps << " reptitions\n";
3691 else
3692 fh << "off\n";
3693
3694 fh << "\n";
3695 }
3696
3697 int
3698
parse_command_line(int argc,char * argv[])3699 parse_command_line(int argc, char* argv[])
3700 {
3701 bool no_hap_exports = false;
3702 char* tmp_str;
3703 while (1) {
3704 static struct option long_options[] = {
3705 {"help", no_argument, NULL, 'h'},
3706 {"version", no_argument, NULL, 'v'},
3707 {"quiet", no_argument, NULL, 'q'},
3708 {"verbose", no_argument, NULL, 'd'},
3709 {"vcf", no_argument, NULL, 1004}, {"vcf-haps", no_argument, NULL, 1004}, {"vcf-haplotypes", no_argument, NULL, 1004}, {"vcf_haps", no_argument, NULL, 1004}, {"vcf_haplotypes", no_argument, NULL, 1004},
3710 {"fasta-loci", no_argument, NULL, 1006}, {"fasta_loci", no_argument, NULL, 1006},
3711 {"fasta-samples", no_argument, NULL, 'J'}, {"fasta-strict", no_argument, NULL, 'J'}, {"fasta_samples", no_argument, NULL, 'J'}, {"fasta_strict", no_argument, NULL, 'J'},
3712 {"fasta-samples-raw", no_argument, NULL, 'F'}, {"fasta", no_argument, NULL, 'F'}, {"fasta_samples_raw", no_argument, NULL, 'F'},
3713 {"structure", no_argument, NULL, 'S'},
3714 {"radpainter", no_argument, NULL, 1015}, {"fineRADstructure", no_argument, NULL, 1015},
3715 {"fastphase", no_argument, NULL, 'A'},
3716 {"phase", no_argument, NULL, 'C'},
3717 {"plink", no_argument, NULL, 'K'},
3718 {"genepop", no_argument, NULL, 1010},
3719 {"genepop-haps-3digits", no_argument, NULL, 1011},
3720 {"phylip", no_argument, NULL, 'Y'},
3721 {"phylip-var", no_argument, NULL, 'L'}, {"phylip_var", no_argument, NULL, 'L'},
3722 {"phylip-var-all", no_argument, NULL, 'T'}, {"phylip_var_all", no_argument, NULL, 'T'},
3723 {"hzar", no_argument, NULL, 'Z'},
3724 {"treemix", no_argument, NULL, 'U'},
3725 {"merge-sites", no_argument, NULL, 'D'}, {"merge_sites", no_argument, NULL, 'D'},
3726 {"sigma", required_argument, NULL, 1005},
3727 {"threads", required_argument, NULL, 't'},
3728 {"in-path", required_argument, NULL, 'P'}, {"in_path", required_argument, NULL, 'P'},
3729 {"v1", no_argument, NULL, 2000},
3730 {"out-path", required_argument, NULL, 'O'}, {"out_path", required_argument, NULL, 'O'},
3731 {"in-vcf", required_argument, NULL, 'V'}, {"in_vcf", required_argument, NULL, 'V'},
3732 {"min-samples-overall", required_argument, NULL, 'R'},
3733 {"min-samples-per-pop", required_argument, NULL, 'r'}, {"progeny", required_argument, NULL, 'r'},
3734 {"min-populations", required_argument, NULL, 'p'}, {"min_populations", required_argument, NULL, 'p'},
3735 {"filter-haplotype-wise", no_argument, NULL, 'H'},
3736 {"renz", required_argument, NULL, 'e'},
3737 {"popmap", required_argument, NULL, 'M'},
3738 {"no-popmap", no_argument, NULL, 1017}, // Negates a previous -M/--popmap
3739 {"whitelist", required_argument, NULL, 'W'},
3740 {"blacklist", required_argument, NULL, 'B'},
3741 {"batch-size", required_argument, NULL, 1999}, {"batch_size", required_argument, NULL, 1999},
3742 {"dbg-min-gt-depth", required_argument, NULL, 2001},
3743 {"write-single-snp", no_argument, NULL, 'I'}, {"write_single_snp", no_argument, NULL, 'I'},
3744 {"write-random-snp", no_argument, NULL, 'j'}, {"write_random_snp", no_argument, NULL, 'j'},
3745 {"no-hap-exports", no_argument, NULL, 1012}, {"no_hap_exports", no_argument, NULL, 1012},
3746 {"ordered-export", no_argument, NULL, 1002}, {"ordered_export", no_argument, NULL, 1002},
3747 {"smooth", no_argument, NULL, 'k'},
3748 {"smooth-fstats", no_argument, NULL, 1007}, {"smooth_fstats", no_argument, NULL, 1007},
3749 {"smooth-popstats", no_argument, NULL, 1008}, {"smooth_popstats", no_argument, NULL, 1008},
3750 {"fstats", no_argument, NULL, '6'},
3751 {"hwe", no_argument, NULL, 1014},
3752 {"log-fst-comp", no_argument, NULL, 'l'}, {"log_fst_comp", no_argument, NULL, 'l'},
3753 {"bootstrap-type", required_argument, NULL, 1001}, {"bootstrap_type", required_argument, NULL, 1001},
3754 {"bootstrap-reps", required_argument, NULL, 1003}, {"bootstrap_reps", required_argument, NULL, 1003},
3755 {"bootstrap-wl", required_argument, NULL, 'Q'}, {"bootstrap_wl", required_argument, NULL, 'Q'},
3756 {"bootstrap", no_argument, NULL, '1'},
3757 {"bootstrap-fst", no_argument, NULL, '2'}, {"bootstrap_fst", no_argument, NULL, '2'},
3758 {"bootstrap-phist", no_argument, NULL, '3'}, {"bootstrap_phist", no_argument, NULL, '3'},
3759 {"bootstrap-div", no_argument, NULL, '4'}, {"bootstrap_div", no_argument, NULL, '4'},
3760 {"bootstrap-pifis", no_argument, NULL, '5'}, {"bootstrap_pifis", no_argument, NULL, '5'},
3761 {"min-maf", required_argument, NULL, 'a'}, {"min_maf", required_argument, NULL, 'a'},
3762 {"min-mac", required_argument, NULL, 1016}, {"min_mac", required_argument, NULL, 1016},
3763 {"max-obs-het", required_argument, NULL, 1013}, {"max_obs_het", required_argument, NULL, 1013},
3764 {"merge-prune-lim", required_argument, NULL, 'i'}, {"merge_prune_lim", required_argument, NULL, 'i'},
3765 {"fst-correction", required_argument, NULL, 'f'}, {"fst_correction", required_argument, NULL, 'f'},
3766 {"p-value-cutoff", required_argument, NULL, 'u'}, {"p_value_cutoff", required_argument, NULL, 'u'},
3767 {"debug-flags", required_argument, NULL, 1000}, {"debug_flags", required_argument, NULL, 1000},
3768 {"lnl-lim", required_argument, NULL, 7000}, {"lnl_lim", required_argument, NULL, 7000}, // (deprecated)
3769 {0, 0, 0, 0}
3770 };
3771
3772 // getopt_long stores the option index here.
3773 int c = getopt_long(argc, argv, "ACDFHJKLNSTUV:YZ123456dhjklnqva:c:e:f:i:o:p:r:t:u:w:B:I:M:O:P:R:Q:W:", long_options, NULL);
3774
3775 // Detect the end of the options.
3776 if (c == -1)
3777 break;
3778
3779 switch (c) {
3780 case 'v':
3781 version();
3782 exit(1);
3783 break;
3784 case 'h':
3785 help();
3786 break;
3787 case 'q':
3788 quiet = true;
3789 break;
3790 case 'd':
3791 verbose = true;
3792 break;
3793 case 't':
3794 num_threads = atoi(optarg);
3795 break;
3796 case 'P':
3797 in_path = optarg;
3798 if (!in_path.empty() && in_path.back() != '/')
3799 in_path += "/";
3800 break;
3801 case 2000: //v1
3802 input_mode = InputMode::stacks;
3803 break;
3804 case 1999:
3805 batch_size = is_integer(optarg);
3806 break;
3807 case 2001:
3808 min_gt_depth = stol(optarg);
3809 break;
3810 case 'O':
3811 out_path = optarg;
3812 if (!out_path.empty() && out_path.back() != '/')
3813 out_path += "/";
3814 break;
3815 case 'V':
3816 in_vcf_path = optarg;
3817 break;
3818 case 'M': // popmap
3819 pmap_path = optarg;
3820 break;
3821 case 1017: //no-popmap
3822 pmap_path.clear();
3823 break;
3824 case 'D':
3825 merge_sites = true;
3826 break;
3827 case 'i':
3828 merge_prune_lim = is_double(optarg);
3829 if (merge_prune_lim > 1.0)
3830 merge_prune_lim = merge_prune_lim / 100;
3831
3832 if (merge_prune_lim < 0 || merge_prune_lim > 1.0) {
3833 cerr << "Error: Unable to parse the merge sites pruning limit.\n";
3834 help();
3835 }
3836 break;
3837 case 1013:
3838 max_obs_het = is_double(optarg);
3839 if (max_obs_het > 1)
3840 max_obs_het = max_obs_het / 100;
3841
3842 if (max_obs_het < 0 || max_obs_het > 1.0) {
3843 cerr << "Error: Unable to parse the maximum observed heterozygosity.\n";
3844 help();
3845 }
3846 break;
3847 case 'r':
3848 min_samples_per_pop = stod(optarg);
3849 if (!std::isfinite(min_samples_per_pop)) {
3850 cerr << "Error: Invalid value for --min-samples-per-pop, \""
3851 << optarg << "\"\n";
3852 help();
3853 }
3854 if (min_samples_per_pop > 1.0)
3855 min_samples_per_pop /= 100.0;
3856 if (min_samples_per_pop < 0.0 || min_samples_per_pop > 1.0) {
3857 cerr << "Error: Invalid value for --min-samples-per-pop, \""
3858 << optarg << "\"\n";
3859 help();
3860 }
3861 break;
3862 case 'R':
3863 min_samples_overall = stod(optarg);
3864 if (!std::isfinite(min_samples_overall)) {
3865 cerr << "Error: Invalid value for --min-samples-overall, \""
3866 << optarg << "\"\n";
3867 help();
3868 }
3869 if (min_samples_overall > 1.0)
3870 min_samples_overall /= 100.0;
3871 if (min_samples_overall < 0.0 || min_samples_overall > 1.0) {
3872 cerr << "Error: Invalid value for --min-samples-overall, \""
3873 << optarg << "\"\n";
3874 help();
3875 }
3876 break;
3877 case 'p':
3878 min_populations = stoi(optarg);
3879 break;
3880 case 'H':
3881 filter_haplotype_wise = true;
3882 break;
3883 case 'k':
3884 smooth_popstats = true;
3885 smooth_fstats = true;
3886 calc_fstats = true;
3887 ordered_export = true;
3888 break;
3889 case 1007:
3890 smooth_fstats = true;
3891 calc_fstats = true;
3892 ordered_export = true;
3893 break;
3894 case 1008:
3895 smooth_popstats = true;
3896 ordered_export = true;
3897 break;
3898 case '6':
3899 calc_fstats = true;
3900 break;
3901 case 'l':
3902 log_fst_comp = true;
3903 break;
3904 case '1':
3905 ordered_export = true;
3906 bootstrap = true;
3907 bootstrap_fst = true;
3908 bootstrap_phist = true;
3909 bootstrap_pifis = true;
3910 bootstrap_div = true;
3911 break;
3912 case '2':
3913 ordered_export = true;
3914 bootstrap_fst = true;
3915 break;
3916 case '3':
3917 ordered_export = true;
3918 bootstrap_phist = true;
3919 break;
3920 case '4':
3921 ordered_export = true;
3922 bootstrap_div = true;
3923 break;
3924 case '5':
3925 ordered_export = true;
3926 bootstrap_pifis = true;
3927 break;
3928 case 1001:
3929 if (strcasecmp(optarg, "exact") == 0)
3930 bootstrap_type = bs_exact;
3931 else if (strcasecmp(optarg, "approx") == 0)
3932 bootstrap_type = bs_approx;
3933 else {
3934 cerr << "Error: Unknown bootstrap type specified '" << optarg << "'\n";
3935 help();
3936 }
3937 break;
3938 case 1003:
3939 ordered_export = true;
3940 bootstrap = true;
3941 bootstrap_reps = atoi(optarg);
3942 break;
3943 case 'Q':
3944 bs_wl_file = optarg;
3945 bootstrap_wl = true;
3946 break;
3947 case 7000:
3948 cerr << "WARNING: --lnl-lim is deprecated and has no effect.\n";
3949 break;
3950 case 'I':
3951 write_single_snp = true;
3952 no_hap_exports = true;
3953 break;
3954 case 'j':
3955 write_random_snp = true;
3956 no_hap_exports = true;
3957 break;
3958 case 1012: // --no-haps
3959 no_hap_exports = true;
3960 break;
3961 case 1002:
3962 ordered_export = true;
3963 break;
3964 case 1004: // --vcf
3965 add_export<VcfExport>();
3966 add_export<VcfHapsExport>();
3967 break;
3968 case 1006: // --fasta-loci
3969 add_export<FastaLociExport>();
3970 break;
3971 case 'F':
3972 add_export<FastaRawExport>();
3973 break;
3974 case 'J':
3975 add_export<FastaSamplesExport>();
3976 break;
3977 case 1010: // --genepop
3978 add_export<GenePopExport>();
3979 add_export<GenePopHapsExport>();
3980 break;
3981 case 1011: //genepop-haps-3digits
3982 add_export<GenePopHapsExport>();
3983 dynamic_cast<GenePopHapsExport&>(**find_export<GenePopHapsExport>()).set_digits(3);
3984 break;
3985 case 1015: // --radpainter,--fineradstructure
3986 add_export<FineRADStructureExport>();
3987 break;
3988 case 1014: // --hwe
3989 calc_hwp = true;
3990 break;
3991 case 'S':
3992 add_export<StructureExport>();
3993 break;
3994 case 'A':
3995 cerr << "BETA: Ignoring --fastphase output request, which is not currently implemented.\n";
3996 break;
3997 case 'C':
3998 cerr << "BETA: Ignoring --phase output request, which is not currently implemented.\n";
3999 break;
4000 case 'K': // --plink
4001 add_export<PlinkExport>();
4002 break;
4003 case 'Z':
4004 add_export<HzarExport>();
4005 break;
4006 case 'Y':
4007 add_export<PhylipFixedExport>();
4008 break;
4009 case 'L':
4010 add_export<PhylipVarExport>();
4011 break;
4012 case 'T':
4013 cerr << "BETA: Ignoring --phylip-var-all output request, which is not currently implemented.\n";
4014 break;
4015 case 'U':
4016 add_export<TreemixExport>();
4017 break;
4018 case 'W':
4019 wl_file = optarg;
4020 break;
4021 case 'B':
4022 bl_file = optarg;
4023 break;
4024 case 'a':
4025 minor_allele_freq = atof(optarg);
4026 if (minor_allele_freq > 1)
4027 minor_allele_freq = minor_allele_freq / 100;
4028
4029 if (minor_allele_freq < 0 || minor_allele_freq > 0.5) {
4030 cerr << "Error: Unable to parse the minor allele frequency.\n";
4031 help();
4032 }
4033 break;
4034 case 1016:
4035 minor_allele_cnt = strtol(optarg, &tmp_str, 10);
4036 if (tmp_str == optarg || *tmp_str != '\0' || minor_allele_cnt < 0) {
4037 cerr << "Error: Unable to parse the minor allele count from '" << optarg << "'.\n";
4038 help();
4039 }
4040 break;
4041 case 'f':
4042 if (strcasecmp(optarg, "p_value") == 0)
4043 fst_correction = p_value;
4044 else if (strcasecmp(optarg, "bonferroni_win") == 0)
4045 fst_correction = bonferroni_win;
4046 else if (strcasecmp(optarg, "bonferroni_gen") == 0)
4047 fst_correction = bonferroni_gen;
4048 else {
4049 cerr << "Error: Unknown Fst correction specified '" << optarg << "'\n";
4050 help();
4051 }
4052 break;
4053 case 'u':
4054 p_value_cutoff = atof(optarg);
4055 break;
4056 case 'e':
4057 enz = optarg;
4058 enz.at(0) = tolower(enz.at(0));
4059 if (renz.count(enz) == 0) {
4060 cerr << "Error: Unrecognized restriction enzyme specified: '" << enz.c_str() << "'.\n";
4061 help();
4062 }
4063 break;
4064 case 1005: //sigma
4065 sigma = atof(optarg);
4066 break;
4067 case '?':
4068 // getopt_long already printed an error message.
4069 help();
4070 break;
4071 case 1000:
4072 {
4073 static const set<string> known_debug_flags = {"VCFCOMP"};
4074 stringstream ss (optarg);
4075 string s;
4076 while (getline(ss, s, ',')) {
4077 if (known_debug_flags.count(s)) {
4078 debug_flags.insert(s);
4079 } else {
4080 cerr << "DEBUG: Error: Unknown error flag '" << s << "'.\n";
4081 return -1;
4082 }
4083 }
4084 cout << "DEBUG: Debug flag(s) : '" << optarg << "'.\n";
4085
4086 if (debug_flags.count("VCFCOMP") && not write_random_snp) {
4087 write_single_snp = true;
4088 cout << "DEBUG: Added --write-single-snp.\n";
4089 }
4090
4091 break;
4092 }
4093 default:
4094 cerr << "Error: Unknown command line option: '" << (char) c << "'\n";
4095 help();
4096 exit(1);
4097 }
4098 }
4099
4100 if (optind < argc) {
4101 cerr << "Error: Failed to parse command line: '" << argv[optind] << "' is seen as a positional argument. Expected no positional arguments.\n";
4102 help();
4103 }
4104
4105 //
4106 // Check argument constrains.
4107 //
4108 if (input_mode == InputMode::stacks && in_path.empty()) {
4109 cerr << "Error: Option --v1 requires -P to be given.\n";
4110 help();
4111 }
4112
4113 if (!in_path.empty() && !in_vcf_path.empty()) {
4114 cerr << "Error: Please specify either '-P/--in-path' or '-V/--in-vcf', not both.\n";
4115 help();
4116 } else if (in_path.empty() && in_vcf_path.empty()) {
4117 cerr << "Error: One of '-P/--in-path' or '-V/--in-vcf' is required.\n";
4118 help();
4119 } else if (not in_vcf_path.empty()) {
4120 input_mode = InputMode::vcf;
4121 }
4122
4123 if (input_mode == InputMode::stacks || input_mode == InputMode::stacks2) {
4124 if (out_path.empty())
4125 out_path = in_path;
4126
4127 out_prefix = "populations";
4128
4129 } else if (input_mode == InputMode::vcf) {
4130
4131 if (out_path.empty()) {
4132 cerr << "Error: Malformed arguments: input mode 'vcf' requires an output directory (--out-path).\n";
4133 help();
4134 }
4135
4136 // Determine out_prefix
4137 string fname = in_vcf_path;
4138 if (in_vcf_path.find_last_of('/') != string::npos && in_vcf_path.back() != '/')
4139 fname = in_vcf_path.substr(in_vcf_path.find_last_of('/')+1);
4140 size_t trim = 0;
4141 if (fname.length() > 4 && fname.substr(fname.length()-4) == ".vcf")
4142 trim = 4;
4143 else if (fname.length() > 7 && fname.substr(fname.length()-7) == ".vcf.gz")
4144 trim = 7;
4145 out_prefix = fname.substr(0, fname.length()-trim);
4146 out_prefix += ".p";
4147 }
4148
4149 // Other
4150 if (write_single_snp + write_random_snp + filter_haplotype_wise > 1) {
4151 cerr << "Error: At most one of --write-single-snp, --write-random-snp"
4152 << " or --filter-haplotype-wise may be specified.\n";
4153 help();
4154 }
4155
4156 if (merge_sites == true && enz.length() == 0) {
4157 cerr << "Error: You must specify the restriction enzyme associated with this data set to merge overlaping cutsites.\n";
4158 help();
4159 }
4160
4161 if (no_hap_exports) {
4162 for (Export*& e : exports) {
4163 if (e->is_hap_export()) {
4164 delete e;
4165 e = NULL;
4166 }
4167 }
4168 stacks_erase_if(exports, [](const Export* e){return e == NULL;});
4169 }
4170
4171 return 0;
4172 }
4173
version()4174 void version() {
4175 cout << "populations " << VERSION << "\n\n";
4176 }
4177
help()4178 void help() {
4179 cout << "populations " << VERSION << "\n"
4180 << "Usage:\n"
4181 << "populations -P dir [-O dir] [-M popmap] (filters) [--fstats] [-k [--sigma=150000] [--bootstrap [-N 100]]] (output formats)\n"
4182 << "populations -V vcf -O dir [-M popmap] (filters) [--fstats] [-k [--sigma=150000] [--bootstrap [-N 100]]] (output formats)\n"
4183 << "\n"
4184 << " -P,--in-path: path to a directory containing GStacks ouput files.\n"
4185 << " -V,--in-vcf: path to a standalone input VCF file.\n"
4186 << " -O,--out-path: path to a directory where to write the output files. (Required by -V; otherwise defaults to value of -P.)\n"
4187 << " -M,--popmap: path to a population map. (Format is 'SAMPLE1 \\t POP1 \\n SAMPLE2 ...'.)\n"
4188 << " -t,--threads: number of threads to run in parallel sections of code.\n"
4189 << " --batch-size [int]: the number of loci to process in a batch (default: 10,000 in de novo mode; in reference mode, one chromosome\n"
4190 << " per batch). Increase to speed analysis, uses more memory, decrease to save memory).\n"
4191 << "\n"
4192 << "Data Filtering:\n"
4193 << " -p,--min-populations [int]: minimum number of populations a locus must be present in to process a locus.\n"
4194 << " -r,--min-samples-per-pop [float]: minimum percentage of individuals in a population required to process a locus for that population.\n"
4195 << " -R,--min-samples-overall [float]: minimum percentage of individuals across populations required to process a locus.\n"
4196 << " -H,--filter-haplotype-wise: apply the above filters haplotype wise\n"
4197 << " (unshared SNPs will be pruned to reduce haplotype-wise missing data).\n"
4198 << " --min-maf [float]: specify a minimum minor allele frequency required to process a SNP (0 < min_maf < 0.5).\n"
4199 << " --min-mac [int]: specify a minimum minor allele count required to process a SNP.\n"
4200 << " --max-obs-het [float]: specify a maximum observed heterozygosity required to process a SNP.\n"
4201 << "\n"
4202 << " --write-single-snp: restrict data analysis to only the first SNP per locus (implies --no-haps).\n"
4203 << " --write-random-snp: restrict data analysis to one random SNP per locus (implies --no-haps).\n"
4204 << " -B,--blacklist: path to a file containing Blacklisted markers to be excluded from the export.\n"
4205 << " -W,--whitelist: path to a file containing Whitelisted markers to include in the export.\n"
4206 << "\n"
4207 << "Merging and Phasing:\n"
4208 << " -e,--renz: restriction enzyme name.\n"
4209 << " --merge-sites: merge loci that were produced from the same restriction enzyme cutsite (requires reference-aligned data).\n"
4210 << " --merge-prune-lim: when merging adjacent loci, if at least X% samples posses both loci prune the remaining samples out of the analysis.\n"
4211 << "\n"
4212 << "Locus stats:\n"
4213 << " --hwe: calculate divergence from Hardy-Weinberg equilibrium for each locus.\n"
4214 << "\n"
4215 << "Fstats:\n"
4216 << " --fstats: enable SNP and haplotype-based F statistics.\n"
4217 << " --fst-correction: specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'. Default: off.\n"
4218 << " --p-value-cutoff [float]: maximum p-value to keep an Fst measurement. Default: 0.05. (Also used as base for Bonferroni correction.)\n"
4219 << "\n"
4220 << "Kernel-smoothing algorithm:\n"
4221 << " -k,--smooth: enable kernel-smoothed Pi, Fis, Fst, Fst', and Phi_st calculations.\n"
4222 << " --smooth-fstats: enable kernel-smoothed Fst, Fst', and Phi_st calculations.\n"
4223 << " --smooth-popstats: enable kernel-smoothed Pi and Fis calculations.\n"
4224 << " (Note: turning on smoothing implies --ordered-export.)\n"
4225 << " --sigma [int]: standard deviation of the kernel smoothing weight distribution. Default 150kb.\n"
4226 << " --bootstrap: turn on boostrap resampling for all smoothed statistics.\n"
4227 << " -N,--bootstrap-reps [int]: number of bootstrap resamplings to calculate (default 100).\n"
4228 << " --bootstrap-pifis: turn on boostrap resampling for smoothed SNP-based Pi and Fis calculations.\n"
4229 << " --bootstrap-fst: turn on boostrap resampling for smoothed Fst calculations based on pairwise population comparison of SNPs.\n"
4230 << " --bootstrap-div: turn on boostrap resampling for smoothed haplotype diveristy and gene diversity calculations based on haplotypes.\n"
4231 << " --bootstrap-phist: turn on boostrap resampling for smoothed Phi_st calculations based on haplotypes.\n"
4232 << " --bootstrap-wl [path]: only bootstrap loci contained in this whitelist.\n"
4233 << "\n"
4234 << "File output options:\n"
4235 << " --ordered-export: if data is reference aligned, exports will be ordered; only a single representative of each overlapping site.\n"
4236 << " --fasta-loci: output locus consensus sequences in FASTA format.\n"
4237 << " --fasta-samples: output the sequences of the two haplotypes of each (diploid) sample, for each locus, in FASTA format.\n"
4238 << " --vcf: output SNPs and haplotypes in Variant Call Format (VCF).\n"
4239 << " --genepop: output SNPs and haplotypes in GenePop format.\n"
4240 << " --structure: output results in Structure format.\n"
4241 << " --radpainter: output results in fineRADstructure/RADpainter format.\n"
4242 << " --phase*: output genotypes in PHASE format.\n"
4243 << " --fastphase*: output genotypes in fastPHASE format.\n"
4244 << " --plink: output genotypes in PLINK format.\n"
4245 << " --hzar: output genotypes in Hybrid Zone Analysis using R (HZAR) format.\n"
4246 << " --phylip: output nucleotides that are fixed-within, and variant among populations in Phylip format for phylogenetic tree construction.\n"
4247 << " --phylip-var: include variable sites in the phylip output encoded using IUPAC notation.\n"
4248 << " --phylip-var-all*: include all sequence as well as variable sites in the phylip output encoded using IUPAC notation.\n"
4249 << " --treemix: output SNPs in a format useable for the TreeMix program (Pickrell and Pritchard).\n"
4250 << " --no-hap-exports: omit haplotype outputs.\n"
4251 << " --fasta-samples-raw: output all haplotypes observed in each sample, for each locus, in FASTA format.\n"
4252 << " (*not implemented as of v2.4)\n"
4253 << "\n"
4254 << "Additional options:\n"
4255 << " -h,--help: display this help messsage.\n"
4256 << " -v,--version: print program version.\n"
4257 << " --verbose: turn on additional logging.\n"
4258 << (" --log-fst-comp: log components of Fst/Phi_st calculations to a file.\n")
4259 #ifdef DEBUG
4260 << "\n"
4261 << "DEBUG:\n"
4262 << " --dbg-min-gt-depth\n"
4263 << " --genepop-haps-3digits: Use 3-digit alleles in the genepop haps output (default is 2-digit).\n"
4264 #endif
4265 ;
4266
4267 // << " --bootstrap-type [exact|approx]: enable bootstrap resampling for population statistics (reference genome required).\n"
4268
4269 exit(1);
4270 }
4271