1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2012-2019, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // populations -- generate population genetic statistics and output
23 // haplotypes in a population context.
24 //
25 
26 #include <cctype>
27 #include <typeinfo>
28 
29 #include "populations.h"
30 #include "export_formats.h"
31 
32 using namespace std;
33 
34 // Global variables to hold command-line options.
35 InputMode input_mode  = InputMode::stacks2;
36 int       num_threads =  1;
37 bool      quiet       = false;
38 string    in_path;
39 string    in_vcf_path;
40 string    out_path;
41 string    pmap_path;
42 string    bl_file;
43 string    wl_file;
44 string    bs_wl_file;
45 string    enz;
46 double    sigma             = 150000.0;
47 double    min_samples_overall   = 0.0;
48 double    min_samples_per_pop   = 0.0;
49 int       min_populations       = 1;
50 bool      filter_haplotype_wise = false;
51 int       batch_size        = 10000;
52 bool      calc_fstats       = false;
53 bool      calc_hwp          = false;
54 bool      bootstrap         = false;
55 bool      bootstrap_fst     = false;
56 bool      bootstrap_pifis   = false;
57 bool      bootstrap_phist   = false;
58 bool      bootstrap_div     = false;
59 bs_type   bootstrap_type    = bs_exact;
60 int       bootstrap_reps    = 100;
61 bool      bootstrap_wl      = false;
62 bool      write_single_snp  = false;
63 bool      write_random_snp  = false;
64 bool      merge_sites       = false;
65 bool      write_gtypes      = false;
66 bool      ordered_export    = false;
67 bool      smooth_fstats     = false;
68 bool      smooth_popstats   = false;
69 bool      loci_ordered      = false;
70 bool      log_fst_comp      = false;
71 bool      verbose           = false;
72 size_t    min_gt_depth      = 0;
73 double    merge_prune_lim   = 1.0;
74 double    minor_allele_freq = 0.0;
75 long      minor_allele_cnt  = 0;
76 double    max_obs_het       = 1.0;
77 double    p_value_cutoff    = 0.05;
78 corr_type fst_correction    = no_correction;
79 set<string> debug_flags;
80 
81 string           out_prefix;
82 MetaPopInfo      mpopi;
83 set<int>         bootstraplist;
84 
85 //
86 // Hold information about restriction enzymes
87 //
88 map<string, const char **> renz;
89 map<string, int>           renz_cnt;
90 map<string, int>           renz_len;
91 map<string, int>           renz_olap;
92 
93 const int max_snp_dist = 500;
94 
95 vector<Export *> exports;
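
//
// Helpers for the global list of data exports. find_export<E>() returns an
// iterator to the registered export whose dynamic type is E (or exports.end()
// if none is registered); add_export<E>() registers a new E only if one is not
// already present, e.g. add_export<SumstatsExport>().
//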
96 template<typename E>
vector<Export*>::iterator find_export()
98 {
99     return std::find_if(
100             exports.begin(), exports.end(),
101             [](Export* e){ return typeid(*e) == typeid(E); }
102         );
103 }
104 template<typename E>
void add_export()
106 {
107     // Check that the export isn't already present and add it.
108     if (find_export<E>() == exports.end())
109         exports.push_back(new E());
110 }
111 
int main (int argc, char* argv[])
113 {
114     unique_ptr<LogAlterator> logger;
115 try {
116 
117 #ifndef HAVE_LIBZ
118     cout << "Stacks was compiled without zlib, and will refuse to parse compressed files.\n";
119 #endif
120 
121     //
122     // Initialize the globals that need it.
123     //
124     initialize_renz(renz, renz_cnt, renz_len);
125     initialize_renz_olap(renz_olap);
126     srandom(time(NULL));
127 
128     //
129     // Parse the command line.
130     //
131     parse_command_line(argc, argv);
132 
133     //
134     // Open and initialize the log file.
135     //
136     logger.reset(new LogAlterator(out_path + out_prefix, true, quiet, argc, argv));
137     output_parameters(cout);
138 
139     //
140     // Set the number of OpenMP parallel threads to execute.
141     //
142     #ifdef _OPENMP
143     omp_set_num_threads(num_threads);
144     #endif
145 
146     //
147     // Read the population map file, if any.
148     //
149     if (!pmap_path.empty()) {
150         cout << "Parsing population map...\n";
151         mpopi.init_popmap(pmap_path);
152         cout << "The population map contained " << mpopi.n_samples() << " samples, "
153              << mpopi.pops().size() << " population(s), " << mpopi.groups().size() << " group(s).\n";
    } else {
        cout << "A population map was not specified, all samples will be read from '"
             << in_path << "' as a single population.\n";
    }
158 
159     //
160     // Locate and open input files, read VCF headers, parse population map, load black/white lists.
161     //
162     BatchLocusProcessor bloc(input_mode, batch_size, &mpopi);
163 
164     bloc.init(in_path, pmap_path);
165 
166     //
167     // Report information on the structure of the populations specified.
168     //
169     mpopi.status(cout);
170 
    if (size_t(min_populations) > mpopi.pops().size()) {
        cout << "Notice: Population limit (" << min_populations << ")"
             << " larger than the number of populations present; adjusting parameter to "
             << mpopi.pops().size() << "\n";
        min_populations = mpopi.pops().size();
    }
177 
    //
    // Set up the default data exports.
    //
181     exports.push_back(new MarkersExport());
182     exports.push_back(new GenotypesExport());
183     exports.push_back(new SumstatsExport());
184     exports.push_back(new HapstatsExport());
185 
186     SnpDivergenceExport *sdiv_exp = NULL;
187     HapDivergenceExport *hdiv_exp = NULL;
188     if (calc_fstats) {
189         sdiv_exp = new SnpDivergenceExport(logger->x);
190         exports.push_back(sdiv_exp);
191         hdiv_exp = new HapDivergenceExport();
192         exports.push_back(hdiv_exp);
193     }
194 
    //
    // Set up the kernel-smoothing apparatus.
    //
198     LocusSmoothing *smooth = NULL;
199     if (smooth_fstats || smooth_popstats)
200         smooth = new LocusSmoothing(&mpopi, logger->x);
201 
    //
    // Set up the divergence statistics calculator, if requested.
    //
205     LocusDivergence *ldiv = NULL;
206     if (calc_fstats)
207         ldiv = new LocusDivergence(&mpopi);
208 
209     //
210     // Open the export files and write any headers.
211     //
212     for (uint i = 0; i < exports.size(); i++) {
213         exports[i]->open((const MetaPopInfo *) &mpopi);
214         exports[i]->write_header();
215     }
216 
217     //
218     // Initialize the summary statistics object which will accumulate the metapopulation summary statistics
219     // as the individual loci are read and processed.
220     //
221     SumStatsSummary sumstats(mpopi.pops().size());
222     size_t n_sites = 0;
223     size_t n_multiloci_sites = 0;
224 
225     const LocusFilter &filter = bloc.filter();
226     cout << "\nProcessing data in batches:\n"
227          << "  * load a batch of catalog loci and apply filters\n"
228          << "  * compute SNP- and haplotype-wise per-population statistics\n";
229     if (calc_hwp)
230         cout << "  * compute SNP- and haplotype-wise deviation from HWE\n";
231     if (calc_fstats)
232         cout << "  * compute F-statistics\n";
233     if (smooth_popstats)
234         cout << "  * smooth per-population statistics\n";
235     if (smooth_fstats)
236         cout << "  * smooth F-statistics\n";
237     cout << "  * write the above statistics in the output files\n"
238          << "  * export the genotypes/haplotypes in specified format(s)\n"
239          << "More details in '" << logger->distribs_path << "'.\n"
240          << "Now processing...\n"
241          << flush;
242 
243     Timer timer;
244     logger->x << "\nBEGIN batch_progress\n";
245     while (true) {
246         //
247         // Read the next set of loci to process.
248         // - If data are denovo, load blim._batch_size loci.
249         // - If data are reference aligned, load one chromosome.
250         // - Filter the loci according to command line parameters (-r, -p, --maf, --write-single-snp, etc.)
251         // - Sort the loci by basepair if they are ordered.
252         //
253         size_t loc_cnt = bloc.next_batch(logger->x);
254         if (filter.batch_seen() == 0)
255             break;
256 
257         if (loci_ordered) {
258             cout << bloc.chr() << " " << flush;
259             logger->x << bloc.chr();
260         } else {
261             cout << "Batch " << bloc.next_batch_number() - 1 << " " << flush;
262             logger->x << "Batch " << bloc.next_batch_number() - 1;
263         }
264         logger->x << ": analyzed "
265                   << filter.batch_total() << " loci; filtered "
266                   << filter.batch_filtered() << " loci; "
267                   << filter.batch_seen() << " loci seen.\n";
268 
269         if (loc_cnt == 0)
270             break;
271         assert(!bloc.loci().empty());
272 
273         sumstats.accumulate(bloc.loci());
274 
275         //
276         // Calculate haplotype and gene diversity, Hardy-Weinberg proportions per locus per population.
277         //
278         bloc.hapstats(logger->x);
279 
280         //
281         // Calculate and report the extent of overlap between different RAD loci.
282         //
283         if (loci_ordered) {
284             size_t chr_n_sites;
285             size_t chr_n_multiloci_sites;
286             bloc.report_locus_overlap(chr_n_sites, chr_n_multiloci_sites, (verbose ? &logger->x : NULL));
287             n_sites += chr_n_sites;
288             n_multiloci_sites += chr_n_multiloci_sites;
289             logger->x << "    " << chr_n_sites << " genomic sites, of which "
290                       << chr_n_multiloci_sites <<  " were covered by multiple loci ("
291                       << as_percentage(chr_n_multiloci_sites, chr_n_sites) << ").\n";
292         }
293 
294         //
295         // Calculate divergence statistics (Fst), if requested.
296         //
297         if (calc_fstats) {
298             logger->x << "    Calculating F statistics...";
299             ldiv->snp_divergence(bloc.loci());
300             ldiv->haplotype_divergence_pairwise(bloc.loci());
301             ldiv->haplotype_divergence(bloc.loci());
302             logger->x << "done.\n";
303         }
304 
305         //
306         // Smooth population statistics across individual populations, and between populations.
307         //
308         if ( (smooth_fstats || smooth_popstats) && loci_ordered == false) {
309             logger->x << "    Notice: Smoothing was requested (-k), but will not be performed as the loci are not ordered.\n";
310         } else if (smooth_fstats || smooth_popstats) {
311             logger->x << "    Generating kernel-smoothed population statistics...";
312             if (smooth_popstats) {
313                 smooth->snpstats(bloc.loci(), logger->x);
314                 smooth->hapstats(bloc.loci(), logger->x);
315             }
316             if (smooth_fstats) {
317                 smooth->snp_divergence(bloc.loci(), ldiv->snp_values(), logger->x);
318                 smooth->hap_divergence(bloc.loci(), ldiv->haplotype_values(), ldiv->metapop_haplotype_values(), logger->x);
319             }
320             logger->x << "done.\n";
321         }
322 
323         //
324         // Export this subset of the loci.
325         //
326         for (uint i = 0; i < exports.size(); i++)
327             exports[i]->write_batch(bloc.loci());
328 
329         if (calc_fstats) {
330             sdiv_exp->write_batch_pairwise(bloc.loci(), ldiv->snp_values());
331             hdiv_exp->write_batch_pairwise(bloc.loci(), ldiv->haplotype_values(), ldiv->metapop_haplotype_values());
332             ldiv->clear(bloc.loci());
333         }
334         logger->x << flush;
335         timer.update();
336         #ifdef DEBUG
337         cout << "(" << (size_t) timer.elapsed() << "s)\n" << flush;
338         #else
339         cout << "\n";
340         #endif
341 
342     }
343     logger->x << "END batch_progress\n";
344 
345     //
346     // Report what we read from the input files.
347     //
348     bloc.summarize(cout);
349 
350     cout << "\n"
351          << "Removed " << filter.filtered() << " loci that did not pass sample/population constraints from " << filter.seen() << " loci.\n"
352          << "Kept " << filter.total() << " loci, composed of " << filter.total_sites() << " sites; "
353          << filter.filtered_sites() << " of those sites were filtered, " << filter.variant_sites() << " variant sites remained.\n";
354     if (loci_ordered)
355         cout << "    " << n_sites << " genomic sites, of which "
356              << n_multiloci_sites <<  " were covered by multiple loci ("
357              << as_percentage(n_multiloci_sites, n_sites) << ").\n";
358     if (filter.total_sites() == 0) {
359         cerr << "Error: All data has been filtered out.\n";
360         throw exception();
361     }
362 
363     //
364     // Do the final sumstats calculations and write the sumstats summary files.
365     //
366     sumstats.final_calculation();
367     sumstats.write_results();
368 
369     if (calc_hwp) {
370         cout << "Number of loci found to be significantly out of Hardy-Weinberg equilibrium (<" << p_value_cutoff << "):\n";
371         for (uint j = 0; j < mpopi.pops().size(); j++)
372             cout << "  "
373                  << mpopi.pops()[j].name << ": "
374                  << bloc._sig_hwe_dev[j] << "\n";
375         cout << "(more detail in populations.sumstats.tsv and populations.hapstats.tsv)\n";
376     }
377 
378     if (calc_fstats)
379         ldiv->write_summary(out_path + out_prefix);
380 
381     //
382     // Write out the distributions of catalog loci.
383     //
384     bloc.write_distributions(logger->x);
385 
386     if (smooth_fstats || smooth_popstats)
387         delete smooth;
388     if (calc_fstats)
389         delete ldiv;
390 
391     //
392     // Close the export files and do any required post processing.
393     //
394     for (uint i = 0; i < exports.size(); i++) {
395         exports[i]->post_processing();
396         exports[i]->close();
397         delete exports[i];
398     }
399 
400     cout << "Populations is done.\n";
401     return 0;
402 
403 } catch (exception& e) {
404     return stacks_handle_exceptions(e);
405 }
406 }
407 
408 int
BatchLocusProcessor::init(string in_path, string pmap_path)
410 {
411     //
412     // Read the blacklist and whitelist to control which loci we load.
413     //
414     int cnt;
415     if (bl_file.length() > 0) {
416         cnt = this->_loc_filter.load_blacklist(bl_file);
417         cout << "Loaded " << cnt << " blacklisted markers.\n";
418     }
419     if (wl_file.length() > 0) {
420         cnt = this->_loc_filter.load_whitelist(wl_file);
421         cout << "Loaded " << cnt << " whitelisted markers.\n";
422         this->_user_supplied_whitelist = true;
423     }
424 
425     if (this->_input_mode == InputMode::vcf)
426         this->init_external_loci(in_vcf_path, pmap_path);
427     else
428         this->init_stacks_loci(in_path, pmap_path);
429 
430     //
431     // Initialize our per-population haplotype counters.
432     //
433     this->_sig_hwe_dev = new size_t[this->_mpopi->pops().size()];
434     memset(this->_sig_hwe_dev, 0, sizeof(size_t)*this->_mpopi->pops().size());
435 
436     return 0;
437 }
438 
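//
// Load the next batch of catalog loci, dispatching on the input mode
// (Stacks catalog files vs. an external VCF). Returns the number of loci
// kept in the batch.
//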
439 size_t
BatchLocusProcessor::next_batch(ostream &log_fh)
441 {
442     this->_batch_num++;
443     if (this->_input_mode == InputMode::vcf)
444         return this->next_batch_external_loci(log_fh);
445     else
446         return this->next_batch_stacks_loci(log_fh);
447 }
448 
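//
// Discard the loci from the previous batch and reset the per-batch state of
// the reader (current chromosome) and of the locus filter.
//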
void BatchLocusProcessor::batch_clear()
450 {
451     for (LocBin* loc : this->_loci)
452         delete loc;
453     this->_loci.clear();
454     this->_chr.clear();
455     this->_loc_filter.batch_clear();
456 }
457 
458 int
BatchLocusProcessor::init_stacks_loci(string in_path, string pmap_path)
460 {
461     //
462     // Open the files.
463     //
464     string catalog_fa_path  = in_path + "catalog.fa.gz";
465     string catalog_vcf_path = in_path + "catalog.calls";
466 
467     this->_fasta_reader.open(catalog_fa_path);
468     this->_cloc_reader.open(catalog_vcf_path);
469 
470     // Create the population map or check that all samples have data.
471     if (pmap_path.empty()) {
472         cout << "No population map specified, using all samples...\n";
473         this->_mpopi->init_names(this->cloc_reader().header().samples());
474     } else {
475         size_t n_samples_before = this->_mpopi->n_samples();
476         this->_mpopi->intersect_with(this->cloc_reader().header().samples());
477         size_t n_rm_samples = n_samples_before - this->_mpopi->n_samples();
478 
479         if (n_rm_samples > 0) {
480             cerr << "Warning: No genotype data exists for " << n_rm_samples
481                  << " of the samples listed in the population map.\n";
482             if (this->_mpopi->samples().empty()) {
483                 cerr << "Error: No more samples.\n";
484                 throw exception();
485             }
486         }
487     }
488 
489     this->cloc_reader().set_sample_ids(*this->_mpopi);
490 
491     //
492     // Initialize the locus filter after we have constructed the population map.
493     //
494     this->_loc_filter.init(this->pop_info());
495 
496     //
497     // Create the {sample_vcf_index : sample_popmap_index} table.
498     //
499     const vector<string>& vcf_samples = this->_cloc_reader.header().samples();
500     this->_samples_vcf_to_mpopi.assign(vcf_samples.size(), SIZE_MAX);
501     for (size_t vcf_i=0; vcf_i<vcf_samples.size(); ++vcf_i) {
502         size_t mpopi_i = this->_mpopi->get_sample_index(vcf_samples[vcf_i], false);
503         if (mpopi_i != SIZE_MAX)
504             this->_samples_vcf_to_mpopi[vcf_i] = mpopi_i;
505     }
506 
507     return 0;
508 }
509 
510 size_t
BatchLocusProcessor::next_batch_stacks_loci(ostream &log_fh)
512 {
513     this->batch_clear();
514 
515     //
516     // Check if we queued a LocBin object from the last round of reading.
517     //
518     if (this->_next_loc != NULL) {
519         this->_loci.push_back(this->_next_loc);
520         this->_next_loc = NULL;
521         this->_loc_filter.locus_seen();
522         this->_loc_filter.keep_locus(this->_loci.back());
523     }
524 
525     Seq seq;
526     seq.id = new char[id_len];
527     vector<VcfRecord> records;
528     while(this->_cloc_reader.read_one_locus(records)) {
529         this->_loc_filter.locus_seen();
530 
531         //
532         // Get the current locus ID and find the corresponding fasta record.
533         // (Note: c-loci with very low coverage might be entirely missing from
534         // the VCF; in this case ignore them.)
535         //
536         assert(!records.empty());
537         int cloc_id = is_integer(records[0].chrom());
538         assert(cloc_id >= 0);
539         do {
540             int rv = this->_fasta_reader.next_seq(seq);
541             if (rv == 0) {
                cerr << "Error: catalog VCF and FASTA files are discordant, maybe truncated. rv: "
543                      << rv << "; cloc_id: " << cloc_id << "\n";
544                 throw exception();
545             }
546         } while (is_integer(seq.id) != cloc_id);
547 
548         //
549         // Check if this locus is white/blacklisted.
550         //
551         if (this->_loc_filter.whitelist_filter(cloc_id) ||
552             this->_loc_filter.blacklist_filter(cloc_id))
553             continue;
554 
555         //
556         // Create and populate a new catalog locus & the associated genotypes.
557         //
558         LocBin* loc = new LocBin(this->_mpopi->n_samples());
559         loc->cloc = new CSLocus();
560         loc->d = new Datum *[this->_mpopi->n_samples()];
561         for (size_t i=0; i<this->_mpopi->n_samples(); ++i)
562             loc->d[i] = NULL;
563 
564         PopMap<CSLocus>::populate_internal(
565             loc->cloc, loc->d,
566             seq, records,
567             this->_cloc_reader.header(), this->_mpopi, this->_samples_vcf_to_mpopi);
568 
569         //
570         // Apply locus & SNP filters.
571         //
572         this->_dists.accumulate_pre_filtering(loc->sample_cnt, loc->cloc);
573         this->_loc_filter.whitelist_snp_filter(*loc);
574         if (this->_loc_filter.apply_filters_stacks(*loc, log_fh, *this->_mpopi)) {
575             delete loc;
576             continue;
577         }
578         assert(loc->s != NULL);
579 
580         //
581         // Tabulate haplotypes present and in what combinations.
582         //
583         tabulate_locus_haplotypes(loc->cloc, loc->d, this->_mpopi->n_samples());
584 
585         //
586         // Detect the end of batch.
587         //
588         loci_ordered = !loc->cloc->loc.empty();
589         if (loci_ordered) {
590             //
591             // Ref-based.
592             //
593             if (this->_chr.empty()) {
594                 this->_chr = loc->cloc->loc.chr();
595             } else if (this->_chr.compare(loc->cloc->loc.chr()) != 0) {
596                 if (!this->_loci.empty()) {
597                     this->_next_loc = loc;
598                     this->_loc_filter.locus_unsee();
599                     break;
600                 } else {
601                     //
602                     // No loci were analyzed on the previous chromosome, keep processing
603                     // the current chromosome without returning.
604                     //
605                     this->_chr = loc->cloc->loc.chr();
606                 }
607             }
608             this->_loci.push_back(loc);
609             this->_loc_filter.keep_locus(loc);
610         } else {
611             //
612             // De novo.
613             //
614             if (ordered_export) {
615                 cerr << "Error: Options --ordered-export/--smooth/--bootstrap"
616                     << " are for reference-aligned data only.\n";
617                 throw exception();
618             }
619             loc->cloc->loc.set("un", this->_unordered_bp, strand_plus);
620             this->_unordered_bp += loc->cloc->len;
621             this->_loci.push_back(loc);
622             this->_loc_filter.keep_locus(loc);
623             if (this->_loci.size() == this->_batch_size)
624                 break;
625         }
626     }
627 
628     //
629     // Record the post-filtering distribution of catalog loci for this batch.
630     //
631     this->_dists.accumulate(this->_loci);
632 
633     //
634     // Sort the catalog loci, if possible.
635     //
636     if (loci_ordered)
637         sort(this->_loci.begin(), this->_loci.end(),
638              [] (const LocBin *a, const LocBin *b) -> bool {
639                  return a->cloc->loc.bp < b->cloc->loc.bp;
640              });
641 
642     return this->_loci.size();
643 }
644 
645 int
BatchLocusProcessor::init_external_loci(string in_path, string pmap_path)
647 {
648     //
649     // Open the VCF file
650     //
651     cout << "Opening the VCF file...\n";
652     this->_vcf_parser.open(in_path);
653 
654     if (this->_vcf_parser.header().samples().empty()) {
655         cerr << "Error: No samples in VCF file '" << in_path << "'.\n";
656         throw exception();
657     }
658 
659     // Reconsider the MetaPopInfo in light of the VCF header.
660     if (pmap_path.empty()) {
661         cout << "No population map specified, creating one from the VCF header...\n";
662         this->_mpopi->init_names(this->_vcf_parser.header().samples());
663 
664     } else {
665         // Intersect the samples present in the population map and the VCF.
666         size_t n_samples_before = this->_mpopi->n_samples();
667 
668         this->_mpopi->intersect_with(this->_vcf_parser.header().samples());
669 
670         size_t n_rm_samples = n_samples_before - this->_mpopi->n_samples();
671         if (n_rm_samples > 0) {
            cerr << "Warning: Of the samples listed in the population map, "
                 << n_rm_samples << " could not be found in the VCF.\n";
674             if (this->_mpopi->samples().empty()) {
675                 cerr << "Error: No more samples.\n";
676                 throw exception();
677             }
678         }
679     }
680 
681     // Create arbitrary sample IDs.
682     for (size_t i = 0; i < this->_mpopi->n_samples(); ++i)
683         this->_mpopi->set_sample_id(i, i+1); //id=i+1
684 
685     //
686     // Initialize the locus filter after we have constructed the population map.
687     //
688     this->_loc_filter.init(this->pop_info());
689 
690     //
691     // Create the {sample_vcf_index : sample_popmap_index} table.
692     //
693     const vector<string>& vcf_samples = this->_vcf_parser.header().samples();
694     this->_samples_vcf_to_mpopi.assign(vcf_samples.size(), SIZE_MAX);
695     for (size_t vcf_i=0; vcf_i<vcf_samples.size(); ++vcf_i) {
696         size_t mpopi_i = this->_mpopi->get_sample_index(vcf_samples[vcf_i], false);
697         if (mpopi_i != SIZE_MAX)
698             this->_samples_vcf_to_mpopi[vcf_i] = mpopi_i;
699     }
700 
701     this->_total_ext_vcf = 0;
702 
703     return 0;
704 }
705 
706 size_t
BatchLocusProcessor::next_batch_external_loci(ostream &log_fh)
708 {
709     //
710     // VCF mode
711     //
712     this->batch_clear();
713     loci_ordered = true;
714 
715     //
716     // Check if we queued a LocBin object from the last round of reading.
717     //
718     if (this->_next_loc != NULL) {
719         this->_loci.push_back(this->_next_loc);
720         this->_next_loc = NULL;
721         this->_loc_filter.locus_seen();
722         this->_loc_filter.keep_locus(this->_loci.back());
723     }
724 
725     int cloc_id = (this->_loci.empty() ? 1 : this->_loci.back()->cloc->id + 1);
726     VcfRecord rec;
727     while (this->_vcf_parser.next_record(rec)) {
728         this->_loc_filter.locus_seen();
729         this->_total_ext_vcf++;
730 
731         // Check for a SNP.
732         if (not rec.is_snp()) {
733             this->_skipped_notsnp.push_back(this->_vcf_parser.line_number());
734             continue;
735         }
736 
737         // Check for a filtered-out SNP
738         if (strncmp(rec.filters(), ".", 2) != 0 && strncmp(rec.filters(), "PASS", 5) != 0) {
739             this->_skipped_filter.push_back(this->_vcf_parser.line_number());
740             continue;
741         }
742 
743         //
744         // Create and populate a new catalog locus.
745         //
746         LocBin* loc = new LocBin(this->_mpopi->n_samples());
747         loc->cloc = new CSLocus();
748         loc->d = new Datum *[this->_mpopi->n_samples()];
749         for (size_t i = 0; i < this->_mpopi->n_samples(); i++)
750             loc->d[i] = NULL;
751         if (!PopMap<CSLocus>::populate_external(
752                 loc->cloc, loc->d,
753                 cloc_id++, rec,
754                 this->_vcf_parser.header(), this->_mpopi, this->_samples_vcf_to_mpopi)
755         ) {
756             // Bad record; a warning has been printed.
757             delete loc;
758             continue;
759         }
760         assert(!loc->cloc->loc.empty());
761         assert(loc->cloc->len == 1);
762         assert(loc->cloc->snps.size() == 1);
763 
764         //
765         // Apply filters.
766         //
767         this->_dists.accumulate_pre_filtering(loc->sample_cnt, loc->cloc);
768         if (this->_loc_filter.apply_filters_external(*loc, log_fh, *this->_mpopi)) {
769             delete loc;
770             continue;
771         }
772         assert(loc->s != NULL);
773 
774         //
775         // Tabulate haplotypes present and in what combinations.
776         //
777         tabulate_locus_haplotypes(loc->cloc, loc->d, this->_mpopi->n_samples());
778 
779         //
780         // Detect the end of batch.
781         //
782         if (this->_chr.empty()) {
783             this->_chr = loc->cloc->loc.chr();
784         } else if (this->_chr.compare(loc->cloc->loc.chr()) != 0) {
785             this->_next_loc = loc;
786             this->_loc_filter.locus_unsee();
787             break;
788         }
789         this->_loci.push_back(loc);
790         this->_loc_filter.keep_locus(loc);
791     }
792 
793     //
794     // Record the post-filtering distribution of catalog loci for this batch.
795     //
796     this->_dists.accumulate(this->_loci);
797 
798     //
799     // Sort the catalog loci, if possible.
800     //
801     sort(this->_loci.begin(), this->_loci.end(),
802         [] (const LocBin *a, const LocBin *b) -> bool {
803             return a->cloc->loc.bp < b->cloc->loc.bp;
804         });
805 
806     return this->_loci.size();
807 }
808 
809 int
BatchLocusProcessor::summarize(ostream &log_fh)
811 {
812     if (this->_input_mode == InputMode::vcf) {
813         log_fh << "Found " << this->_total_ext_vcf << " SNP records in file '" << in_vcf_path
814              << "'. (Skipped " << this->_skipped_filter.size() << " already filtered-out SNPs and "
             << this->_skipped_notsnp.size() << " non-SNP records; more with --verbose.)\n";
816         if (verbose && not this->_skipped_notsnp.empty()) {
            log_fh << "The following VCF record lines were determined not to be SNPs and were skipped:";
818             for (vector<size_t>::const_iterator l = this->_skipped_notsnp.begin(); l != this->_skipped_notsnp.end(); ++l)
819                 log_fh << " " << *l;
820             log_fh << "\n";
821         }
822     } else {
823     }
824 
825     return 0;
826 }
827 
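//
// Compute the per-population haplotype statistics for every locus in the
// current batch (in parallel) and, when calc_hwp is set, tally, per
// population, the loci whose haplotype HWE statistic falls below
// p_value_cutoff.
//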
828 int
BatchLocusProcessor::hapstats(ostream &log_fh)
830 {
831     #pragma omp parallel
832     {
833         #pragma omp for schedule(dynamic, 1)
834         for (uint i = 0; i < this->_loci.size(); i++) {
835             LocBin *loc = this->_loci[i];
836 
837             loc->s->calc_hapstats(loc->cloc, (const Datum **) loc->d, *this->_mpopi);
838         }
839     }
840 
841     if (calc_hwp) {
842         const LocStat *l;
843         const vector<Pop> &pops = this->_mpopi->pops();
844 
845         for (uint i = 0; i < this->_loci.size(); i++) {
846             for (uint j = 0; j < pops.size(); j++) {
847                 l = this->_loci[i]->s->hapstats_per_pop(j);
848 
849                 if (l == NULL)
850                     continue;
851 
852                 this->_sig_hwe_dev[j] += l->stat[2] < p_value_cutoff ? 1 : 0;
853             }
854         }
855     }
856 
857     return 0;
858 }
859 
860 void
BatchLocusProcessor::report_locus_overlap(size_t& n_sites, size_t& n_multiple_sites, ofstream *log_fh)
862 {
863     n_sites = 0;
864     n_multiple_sites = 0;
865 
866     //
867     // Create a vector of maps, to record for each population, which sites overlap between different loci.
868     //
869     map<uint, set<uint>> final_map;
870     //vector<map<uint, set<uint>>> sites (this->_mpopi->pops().size());
871     for (uint pop = 0; pop < this->_mpopi->pops().size(); pop++) {
872         for (const LocBin* locbin : this->_loci) {
873             const CSLocus* loc = locbin->cloc;
874             const LocSum* lsum = locbin->s->per_pop(pop);
875             for (uint k = 0; k < loc->len; k++)
876                 if (lsum->nucs[k].num_indv > 0) {
877                     final_map[lsum->nucs[k].bp].insert(loc->id);
878                     //sites[pop][lsum->nucs[k].bp].insert(loc->id);
879                 }
880         }
881     }
882 
883     n_sites = final_map.size();
884 
885     for (auto bp : final_map) {
886         if (bp.second.size() > 1) {
887             n_multiple_sites++;
888             if (log_fh != NULL) {
889                 *log_fh << "multilocus_pos"
890                         << '\t' << this->_loci[0]->cloc->loc.chr() << ':' << bp.first + 1
891                         << "\tloci=";
892                 join(bp.second, ',', *log_fh);
893                 *log_fh << '\n';
894             }
895         }
896     }
897 }
898 
899 void
LocusFilter::erase_snp(CSLocus *cloc, Datum **d, size_t n_samples, size_t snp_index)
901 {
902     //
903     // N.B. For this to work as expected we must be iterating over cloc->snps
904     // in reverse, e.g. for(size_t i=cloc->snps.size(); i!=0;) {--i; ...}
905     // because we're making vector::erase() calls.
906     //
907     // Prune out the given SNP. We need to update:
908     // * `Locus::snps`
909     // * `Locus::alleles`
910     // * `Datum::model`
911     // * `Datum::snpdata`
912     // * `Datum::obshap`
913     //
914 
915     //
916     // Update the Datums.
917     //
918     uint col = cloc->snps[snp_index]->col;
919     for (size_t s=0; s<n_samples; ++s) {
920         if (d[s] == NULL)
921             continue;
922 
923         assert(d[s]->model);
924         assert(!d[s]->obshap.empty() && strlen(d[s]->obshap[0]) == cloc->snps.size());
925         assert(d[s]->snpdata.size() == cloc->snps.size());
926 
927         //
928         // Correct the model calls.
929         //
930         char& m = d[s]->model[col];
931         assert(m == 'O' || m == 'E' || m == 'U');
932         if (m == 'E' || (m == 'O' && d[s]->obshap[0][snp_index] != cloc->con[col]))
933             m = 'U';
934         //
935         // Erase the nucleotide in the haplotypes.
936         //
937         for (char* hap : d[s]->obshap) {
938             hap += snp_index;
939             while(*hap != '\0') {
940                 *hap = *(hap+1);
941                 ++hap;
942             }
943         }
944         //
945         // Splice the depths.
946         //
947         d[s]->snpdata.erase(d[s]->snpdata.begin() + snp_index);
948     }
949 
950     //
951     // Update the CSLocus.
952     //
953     delete cloc->snps[snp_index];
954     cloc->snps.erase(cloc->snps.begin() + snp_index);
955     // Splice the haplotypes. (Why does this have to be a map?!)
956     vector<pair<string,int>> alleles (cloc->alleles.begin(), cloc->alleles.end());
957     cloc->alleles.clear();
958     for (pair<string,int>& allele : alleles)
959         allele.first.erase(snp_index, 1);
960     cloc->alleles.insert(std::make_move_iterator(alleles.begin()), std::make_move_iterator(alleles.end()));
961 }
962 
963 void
LocusFilter::batch_clear()
965 {
966     this->_batch_total_loci    = 0;
967     this->_batch_filtered_loci = 0;
968     this->_batch_seen_loci     = 0;
969 }
970 
971 void
LocusFilter::locus_seen()
973 {
974     this->_seen_loci++;
975     this->_batch_seen_loci++;
976 }
977 
978 void
LocusFilter::locus_unsee()
980 {
981     this->_seen_loci--;
982     this->_batch_seen_loci--;
983 }
984 
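//
// Record a locus that passed the filters: update the running totals of kept
// loci and of total/variant sites reported in the final summary.
//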
985 void
LocusFilter::keep_locus(LocBin *loc)
987 {
988     this->_total_loci++;
989     this->_batch_total_loci++;
990 
991     //
992     // Count up the number of sites and variable sites.
993     //
994     const LocTally *t = loc->s->meta_pop();
995 
996     for (uint i = 0; i < loc->cloc->len; i++) {
997         if (t->nucs[i].fixed == false)
998             this->_variant_sites++;
999         this->_total_sites++;
1000     }
1001 }
1002 
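//
// Return true (and count the locus as filtered) if a whitelist was supplied
// and this locus is not on it; otherwise return false.
//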
1003 bool
LocusFilter::whitelist_filter(size_t locus_id)
1005 {
1006     if (this->_whitelist.empty() || this->_whitelist.count(locus_id)) {
1007         return false;
1008     } else {
1009         this->_filtered_loci++;
1010         this->_batch_filtered_loci++;
1011         return true;
1012     }
1013 }
1014 
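//
// Return true (and count the locus as filtered) if this locus appears on the
// blacklist; otherwise return false.
//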
1015 bool
LocusFilter::blacklist_filter(size_t locus_id)
1017 {
1018     if (this->_blacklist.count(locus_id)) {
1019         this->_filtered_loci++;
1020         this->_batch_filtered_loci++;
1021         return true;
1022     } else {
1023         return false;
1024     }
1025 }
1026 
1027 void
LocusFilter::whitelist_snp_filter(LocBin& loc) const
1029 {
1030     if (this->_whitelist.empty())
1031         return;
1032     CSLocus* cloc = loc.cloc;
1033     assert(this->_whitelist.count(cloc->id));
1034     const set<int>& snp_wl = this->_whitelist.at(cloc->id);
1035     if (snp_wl.empty())
1036         // Accept any SNP.
1037         return;
1038     for(size_t i=cloc->snps.size(); i!=0;) {
1039         --i;
1040         if (!snp_wl.count(cloc->snps[i]->col))
1041             erase_snp(cloc, loc.d, loc.sample_cnt, i);
1042     }
1043 }
1044 
1045 bool
LocusFilter::apply_filters_stacks(LocBin& loc, ostream& log_fh, const MetaPopInfo& mpopi)
1047 {
1048     //
1049     // Apply the -r/-p thresholds at the locus level.
1050     //
1051     if (this->filter(&mpopi, loc.d, loc.cloc))
1052         return true;
1053 
1054     //
1055     // Filter genotypes for depth.
1056     //
1057     this->gt_depth_filter(loc.d, loc.cloc);
1058 
1059     //
1060     // Identify individual SNPs that are below the -r threshold or the minor allele
1061     // frequency threshold (-a). In these cases we will remove the SNP, but keep the locus.
1062     //
1063     loc.s = new LocPopSum(strlen(loc.cloc->con), mpopi);
1064     this->filter_snps(loc, mpopi, log_fh);
1065 
1066     //
1067     // If write_single_snp, write_random_snp or filter_haplotype_wise has been specified,
1068     // mark sites to be pruned using the whitelist.
1069     //
1070     assert(write_single_snp + write_random_snp + filter_haplotype_wise <= 1);
1071     if (write_single_snp)
1072         this->keep_single_snp(loc.cloc, loc.d, mpopi.n_samples(), loc.s->meta_pop());
1073     else if (write_random_snp)
1074         this->keep_random_snp(loc.cloc, loc.d, mpopi.n_samples(), loc.s->meta_pop());
1075     else if (filter_haplotype_wise)
1076         this->filter_haps(loc, mpopi, log_fh);
1077 
1078     //
1079     // Regenerate summary statistics after pruning SNPs.
1080     //
1081     loc.s->sum_pops(loc.cloc, loc.d, mpopi, verbose, cout);
1082     loc.s->tally_metapop(loc.cloc);
1083     return false;
1084 }
1085 
1086 bool
LocusFilter::apply_filters_external(LocBin& loc, ostream& log_fh, const MetaPopInfo& mpopi)
1088 {
1089     //
1090     // Apply the -r/-p thresholds at the locus level.
1091     //
1092     if (this->filter(&mpopi, loc.d, loc.cloc))
1093         return true;
1094 
1095     //
1096     // Identify individual SNPs that are below the -r threshold or the minor allele
1097     // frequency threshold (-a). In these cases we will remove the SNP, but keep the locus.
1098     //
1099     loc.s = new LocPopSum(strlen(loc.cloc->con), mpopi);
1100     this->filter_snps(loc, mpopi, log_fh);
1101 
1102     loc.s->sum_pops(loc.cloc, loc.d, mpopi, verbose, cout);
1103     loc.s->tally_metapop(loc.cloc);
1104     return false;
1105 }
1106 
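//
// Locus-level -r/-p filter: discard the genotypes of any population in which
// too few samples are present (min_samples_per_pop), then reject the locus if
// fewer than min_populations populations remain or if the overall fraction of
// samples falls below min_samples_overall. Returns true if the locus is removed.
//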
1107 bool
LocusFilter::filter(const MetaPopInfo *mpopi, Datum **d, CSLocus *cloc)
1109 {
1110     // Filter out populations that don't have enough samples.
1111     size_t n_pops_present = 0;
1112     for (const Pop& pop : mpopi->pops()) {
1113         size_t n_samples_present = 0;
1114         for (size_t s=pop.first_sample; s<=pop.last_sample; ++s)
1115             if (d[s] != NULL)
1116                 ++n_samples_present;
1117         if ((double) n_samples_present / pop.n_samples() >= min_samples_per_pop) {
1118             ++n_pops_present;
1119         } else {
1120             // Remove all samples of that population.
1121             for (size_t s=pop.first_sample; s<=pop.last_sample; ++s) {
1122                 if (d[s] != NULL) {
1123                     delete d[s];
1124                     d[s] = NULL;
1125                     cloc->cnt--;
1126                 }
1127             }
1128         }
1129     }
1130     // Check the number of (remaining) samples.
1131     size_t n_samples_present = 0;
1132     for (size_t s=0; s<mpopi->n_samples(); ++s)
1133         if (d[s] != NULL)
1134             ++n_samples_present;
1135     // Determine if the locus is to be kept.
1136     if (n_pops_present < size_t(min_populations)
1137         || (double) n_samples_present / mpopi->n_samples() < min_samples_overall)
1138     {
1139         this->_filtered_loci++;
1140         this->_batch_filtered_loci++;
1141         return true;
1142     }
1143     return false;
1144 }
1145 
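//
// Enforce the minimum genotype depth (min_gt_depth): genotypes whose total
// depth at a SNP is below the threshold are reset to unknown ('U') and the
// corresponding haplotype positions are masked with 'N'.
//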
1146 void
LocusFilter::gt_depth_filter(Datum** data, const CSLocus* cloc)
1148 {
1149     if (min_gt_depth == 0)
1150         return;
1151     else if (cloc->snps.empty())
1152         return;
1153     for (size_t spl=0; spl<this->_sample_cnt; ++spl) {
        Datum* d = data[spl];
        // Skip samples with no data at this locus.
        if (d == NULL)
            continue;
        assert(d->obshap.size() == 2);
1156         assert(strlen(d->obshap[0]) == cloc->snps.size()
1157             && strlen(d->obshap[1]) == cloc->snps.size());
1158         for (size_t snp=0; snp<cloc->snps.size(); ++snp) {
1159             size_t col = cloc->snps[snp]->col;
1160             if (d->model[col] == 'U')
1161                 continue;
1162             // Check this genotype's depth.
1163             if (d->snpdata[snp].tot_depth < min_gt_depth) {
1164                 d->model[col] = 'U';
1165                 for(size_t i=0; i<2; ++i)
1166                     d->obshap[i][snp] = 'N';
1167             }
1168         }
1169     }
1170 }
1171 
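//
// (Re)initialize the filter for the given metapopulation: allocate the
// per-population bookkeeping arrays, map each sample to its population, and
// reset the locus/site counters.
//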
1172 void
LocusFilter::init(MetaPopInfo *mpopi)
1174 {
1175     this->_pop_cnt    = mpopi->pops().size();
1176     this->_sample_cnt = mpopi->n_samples();
1177 
1178     assert(this->_pop_cnt > 0);
1179     assert(this->_sample_cnt > 0);
1180 
1181     if (this->_pop_order != NULL)
1182         delete [] this->_pop_order;
1183     if (this->_samples != NULL)
1184         delete [] this->_samples;
1185     if (this->_pop_tot != NULL)
1186         delete [] this->_pop_tot;
1187 
1188     this->_pop_order  = new size_t [this->_pop_cnt];
1189     this->_samples    = new size_t [this->_sample_cnt];
1190     this->_pop_tot    = new size_t [this->_pop_cnt];
1191 
1192     this->_filtered_loci  = 0;
1193     this->_total_loci     = 0;
1194     this->_filtered_sites = 0;
1195     this->_total_sites    = 0;
1196 
1197     size_t pop_sthg = 0;
1198 
1199     for (size_t i_pop = 0; i_pop < mpopi->pops().size(); ++i_pop) {
1200         const Pop& pop = mpopi->pops()[i_pop];
1201         this->_pop_tot[pop_sthg]  = 0;
1202 
1203         for (uint i = pop.first_sample; i <= pop.last_sample; i++) {
1204             this->_samples[i] = pop_sthg;
1205             this->_pop_tot[pop_sthg]++;
1206         }
1207         this->_pop_order[pop_sthg] = i_pop;
1208         pop_sthg++;
1209     }
1210 }
1211 
1212 void
LocusFilter::keep_single_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const
1214 {
1215     //
1216     // Check that we have at least one variable site within this population for this locus.
1217     //
1218     size_t n_actual_snps = 0;
1219     for (const SNP* snp : cloc->snps)
1220         if (!t->nucs[snp->col].fixed)
1221             ++n_actual_snps;
1222     if (n_actual_snps == 0)
1223         return;
1224 
1225     //
1226     // Find the first SNP that is not fixed in this subpopulation.
1227     //
1228     size_t kept_snp = 0;
1229     while (kept_snp != cloc->snps.size()) {
1230         if (t->nucs[cloc->snps[kept_snp]->col].fixed == false)
1231             break;
1232         kept_snp++;
1233     }
1234 
1235     //
1236     // Remove all the SNPs except for the one marked previously.
1237     //
1238     for (size_t snp=cloc->snps.size(); snp!=0;) {
1239         --snp;
1240         if (snp != kept_snp)
1241             erase_snp(cloc, d, n_samples, snp);
1242     }
1243 }
1244 
1245 void
LocusFilter::keep_random_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const
1247 {
1248     //
1249     // Check that we have at least one variable site within this population for this locus.
1250     //
1251     size_t n_actual_snps = 0;
1252     for (const SNP* snp : cloc->snps)
1253         if (!t->nucs[snp->col].fixed)
1254             ++n_actual_snps;
1255     if (n_actual_snps == 0)
1256         return;
1257 
1258     //
1259     // Identify a random SNP that isn't fixed in this subset of populations.
1260     //
1261     size_t kept_snp_i;
1262     do {
1263         kept_snp_i = rand() % cloc->snps.size();
1264     } while (t->nucs[cloc->snps[kept_snp_i]->col].fixed);
1265 
1266     //
1267     // Remove all the SNPs except for the one marked previously.
1268     //
1269     for (auto snp_i=cloc->snps.size(); snp_i!=0; ) {
1270         --snp_i;
1271         if (snp_i != kept_snp_i)
1272             erase_snp(cloc, d, n_samples, snp_i);
1273     }
1274 }
1275 
1276 void
LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
1278 {
1279     assert(loc.s != NULL);
1280     CSLocus* cloc = loc.cloc;
1281     Datum** d = loc.d;
1282     LocPopSum* s = loc.s;
1283 
1284     //
1285     // If this locus is fixed, ignore it.
1286     //
1287     if (cloc->snps.size() == 0)
1288         return;
1289 
1290     loc.s->sum_pops(loc.cloc, loc.d, mpopi, verbose, cout);
1291     loc.s->tally_metapop(loc.cloc);
1292     const LocTally *t = s->meta_pop();
1293 
1294     for (uint snp_index = cloc->snps.size(); snp_index > 0;) { // Must be reverse because we `erase()` things.
1295         --snp_index;
1296         uint col = cloc->snps[snp_index]->col;
1297         bool sample_prune = false;
1298         bool overall_sample_prune = false;
1299         bool maf_prune    = false;
1300         bool het_prune    = false;
1301 
1302         //
1303         // If the site is fixed, ignore it.
1304         //
1305         NucTally& nuct = t->nucs[col];
1306         if (t->nucs[col].fixed == true)
1307             continue;
1308 
1309         size_t n_pruned_pops = 0;
1310         size_t n_samples_left = 0;
1311         for (size_t p = 0; p < s->pop_cnt(); ++p) {
1312             const LocSum* sum = s->per_pop(p);
1313             if (sum->nucs[col].incompatible_site) {
1314                 DOES_NOT_HAPPEN;
1315             }
1316             if ((double) sum->nucs[col].num_indv / this->_pop_tot[p] >= min_samples_per_pop) {
1317                 n_samples_left += sum->nucs[col].num_indv;
1318             } else {
1319                 ++n_pruned_pops;
1320                 const Pop& pop = mpopi.pops()[p];
1321                 for (uint k = pop.first_sample; k <= pop.last_sample; k++) {
1322                     if (d[k] == NULL || col >= (uint) d[k]->len)
1323                         continue;
1324 
1325                     if (d[k]->model != NULL)
1326                         d[k]->model[col] = 'U';
1327                     for(size_t j=0; j < 2; ++j)
1328                         d[k]->obshap[j][snp_index] = 'N';
1329                 }
1330             }
1331         }
1332         if (mpopi.pops().size() - n_pruned_pops < (uint) min_populations)
1333             sample_prune = true;
1334         else if ((double) n_samples_left / mpopi.n_samples() < min_samples_overall)
1335             overall_sample_prune = true;
1336 
1337         if (t->nucs[col].allele_cnt > 1) {
1338             //
1339             // Test for minor allele frequency.
1340             //
1341             if ((1 - nuct.p_freq) < minor_allele_freq
1342                     || long(std::round((1 - nuct.p_freq) * 2.0 * nuct.num_indv)) < minor_allele_cnt)
1343                 maf_prune = true;
1344             //
1345             // Test for observed heterozygosity.
1346             //
1347             if (t->nucs[col].obs_het > max_obs_het)
1348                 het_prune = true;
1349         }
1350 
1351         if (maf_prune || het_prune || sample_prune || overall_sample_prune) {
1352             this->_filtered_sites++;
1353             if (verbose) {
1354                 log_fh << "pruned_polymorphic_site\t"
1355                        << cloc->id << "\t"
1356                        << cloc->loc.chr() << "\t"
1357                        << cloc->sort_bp(col) +1 << "\t"
1358                        << col << "\t";
1359                 if (sample_prune)
1360                     log_fh << "min_samples_per_pop\n";
1361                 else if (overall_sample_prune)
1362                     log_fh << "min_samples_overall\n";
1363                 else if (maf_prune)
1364                     log_fh << "maf_limit\n";
1365                 else if (het_prune)
1366                     log_fh << "obshet_limit\n";
1367                 else
1368                     DOES_NOT_HAPPEN;
1369             }
1370             this->erase_snp(cloc, d, mpopi.n_samples(), snp_index);
1371         }
1372     }
1373 }
1374 
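//
// Haplotype-wise filtering: starting from the SNP called in the fewest
// samples, prune SNPs one at a time until enough samples carry fully resolved
// haplotypes to satisfy min_samples_per_pop, min_populations and
// min_samples_overall.
//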
1375 void
LocusFilter::filter_haps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
1377 {
1378     CSLocus* cloc = loc.cloc;
1379     Datum** d = loc.d;
1380 
    // Tally, for each SNP, the number of samples with a genotype call.
1382     vector<pair<size_t,size_t>> snps_n_samples;
1383     for (size_t i=0; i<cloc->snps.size(); ++i) {
1384         snps_n_samples.push_back({0, i});
1385         uint col = cloc->snps[i]->col;
1386         for (size_t s=0; s<mpopi.n_samples(); ++s)
1387             if (d[s] != NULL
1388                     && size_t(d[s]->len) > col && (d[s]->model[col] == 'O' || d[s]->model[col] == 'E'))
1389                 ++snps_n_samples.back().first;
1390     }
1391     std::sort(snps_n_samples.rbegin(), snps_n_samples.rend());
1392 
1393     // Prune SNPs until enough samples have fully resolved haplotypes.
1394     // (N.B. This is applied after snp-wise -r/-R as any individual SNP failing
1395     // these filters would cause them to fail at the haplotype level as well anyway.)
1396     while (!cloc->snps.empty()) {
1397         // Tally haplotypes.
1398         size_t n_pops_present_hapwise = 0;
1399         size_t n_samples_w_haps = 0;
1400         for (const Pop& pop : mpopi.pops()) {
1401             size_t n_pop_samples = 0;
1402             for (size_t s=pop.first_sample; s<=pop.last_sample; ++s) {
1403                 if (d[s] == NULL)
1404                     continue;
1405                 assert(d[s]->obshap.size() == 2);
1406                 if (strchr(d[s]->obshap[0], 'N') == NULL)
1407                     ++n_pop_samples;
1408             }
1409             if ((double) n_pop_samples / pop.n_samples() >= min_samples_per_pop) {
1410                 ++n_pops_present_hapwise;
1411                 n_samples_w_haps += n_pop_samples;
1412             }
1413         }
1414         // Check the number of haplotypes.
1415         if (n_pops_present_hapwise >= size_t(min_populations)
1416                 && (double) n_samples_w_haps / mpopi.n_samples() >= min_samples_overall)
1417             // Filters are satisfied.
1418             break;
1419         // Prune the SNP that is present in the fewest samples.
1420         ++this->_filtered_sites;
1421         assert(!snps_n_samples.empty());
1422         size_t snp_i = snps_n_samples.back().second;
1423         this->erase_snp(cloc, d, mpopi.n_samples(), snp_i);
1424         snps_n_samples.pop_back();
1425         assert(snps_n_samples.size() == cloc->snps.size());
1426         for (pair<size_t,size_t>& snp : snps_n_samples)
1427             if (snp.second > snp_i)
1428                 --snp.second;
1429     }
1430 }
1431 
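//
// Record the pre-filtering distributions for one catalog locus: samples
// present, samples missing, and number of SNPs.
//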
1432 int
CatalogDists::accumulate_pre_filtering(const size_t sample_cnt, const CSLocus *loc)
1434 {
1435     size_t missing;
1436 
1437     if (this->_pre_valid.count(loc->cnt) == 0)
1438         this->_pre_valid[loc->cnt] = 1;
1439     else
1440         this->_pre_valid[loc->cnt]++;
1441 
1442     missing = sample_cnt - loc->cnt;
1443 
1444     if (this->_pre_absent.count(missing) == 0)
1445         this->_pre_absent[missing] = 1;
1446     else
1447         this->_pre_absent[missing]++;
1448 
1449     if (this->_pre_snps_per_loc.count(loc->snps.size()) == 0)
1450         this->_pre_snps_per_loc[loc->snps.size()] = 1;
1451     else
1452         this->_pre_snps_per_loc[loc->snps.size()]++;
1453 
1454     return 0;
1455 }
1456 
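//
// Record the post-filtering distributions for a batch of kept loci; SNPs that
// are fixed in the metapopulation are excluded from the SNPs-per-locus counts.
//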
1457 int
CatalogDists::accumulate(const vector<LocBin *> &loci)
1459 {
1460     const CSLocus *loc;
1461     const LocTally *t;
1462     size_t missing;
1463 
1464     for (uint i = 0; i < loci.size(); i++) {
1465         loc = loci[i]->cloc;
1466 
1467         if (this->_post_valid.count(loc->cnt) == 0)
1468             this->_post_valid[loc->cnt] = 1;
1469         else
1470             this->_post_valid[loc->cnt]++;
1471 
1472         missing = loci[i]->sample_cnt - loc->cnt;
1473 
1474         if (this->_post_absent.count(missing) == 0)
1475             this->_post_absent[missing] = 1;
1476         else
1477             this->_post_absent[missing]++;
1478 
1479         //
1480         // Don't count SNPs that are fixed in the metapopulation.
1481         //
1482         t = loci[i]->s->meta_pop();
1483         size_t n_actual_snps = 0;
1484         for (const SNP* snp : loc->snps)
1485             if (!t->nucs[snp->col].fixed)
1486                 ++n_actual_snps;
1487 
1488         if (this->_post_snps_per_loc.count(n_actual_snps) == 0)
1489             this->_post_snps_per_loc[n_actual_snps] = 1;
1490         else
1491             this->_post_snps_per_loc[n_actual_snps]++;
1492     }
1493 
1494     return 0;
1495 }
1496 
1497 int
CatalogDists::write_results(ostream &log_fh)
1499 {
1500     map<size_t, size_t>::iterator cnt_it;
1501     string section;
1502     auto begin_section = [&](const string& s){
1503         section = s;
1504         log_fh << "\n" << "BEGIN " << section << "\n";
1505     };
1506     auto end_section = [&](){
1507         log_fh << "END " << section << "\n";
1508     };
1509 
1510     begin_section("samples_per_loc_prefilters");
1511     log_fh << "# Distribution of valid samples matched to a catalog locus prior to filtering.\n"
1512            << "n_samples\tn_loci\n";
1513     for (cnt_it = this->_pre_valid.begin(); cnt_it != this->_pre_valid.end(); cnt_it++)
1514         log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1515     end_section();
1516 
1517     begin_section("missing_samples_per_loc_prefilters");
1518     log_fh << "# Distribution of missing samples for each catalog locus prior to filtering.\n"
1519            << "# Absent samples at locus\tCount\n";
1520     for (cnt_it = this->_pre_absent.begin(); cnt_it != this->_pre_absent.end(); cnt_it++)
1521         log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1522     end_section();
1523 
1524     begin_section("snps_per_loc_prefilters");
1525     log_fh << "# Distribution of the number of SNPs per catalog locus prior to filtering.\n"
1526            << "n_snps\tn_loci\n";
1527     for (cnt_it = this->_pre_snps_per_loc.begin(); cnt_it != this->_pre_snps_per_loc.end(); cnt_it++)
1528         log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1529     end_section();
1530 
1531     begin_section("samples_per_loc_postfilters");
1532     log_fh << "# Distribution of valid samples matched to a catalog locus after filtering.\n"
1533            << "n_samples\tn_loci\n";
1534     for (cnt_it = this->_post_valid.begin(); cnt_it != this->_post_valid.end(); cnt_it++)
1535         log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1536     end_section();
1537 
1538     begin_section("missing_samples_per_loc_postfilters");
1539     log_fh << "# Distribution of missing samples for each catalog locus after filtering.\n"
1540            << "# Absent samples at locus\tCount\n";
1541     for (cnt_it = this->_post_absent.begin(); cnt_it != this->_post_absent.end(); cnt_it++)
1542         log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1543     end_section();
1544 
1545     begin_section("snps_per_loc_postfilters");
1546     log_fh << "# Distribution of the number of SNPs per catalog locus after filtering.\n"
1547            << "n_snps\tn_loci\n";
1548     for (cnt_it = this->_post_snps_per_loc.begin(); cnt_it != this->_post_snps_per_loc.end(); cnt_it++)
1549         log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
1550     end_section();
1551 
1552     return 0;
1553 }
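
//
// The sections written above share a simple, greppable layout: a "BEGIN <name>" line,
// a commented description, a tab-separated header, one row per histogram bin, and an
// "END <name>" line. An illustrative excerpt (the counts are hypothetical):
//
//     BEGIN samples_per_loc_prefilters
//     # Distribution of valid samples matched to a catalog locus prior to filtering.
//     n_samples	n_loci
//     8	1250
//     9	3407
//     END samples_per_loc_prefilters
//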
1554 
1555 int
1556 tabulate_locus_haplotypes(CSLocus *cloc, Datum **d, int sample_cnt)
1557 {
1558     double cnt  = 0.0;
1559 
1560     for (int i = 0; i < sample_cnt; i++) {
1561         if (d[i] == NULL)
1562             continue;
1563 
1564         if (d[i]->obshap.size() > 1)
1565             cloc->marker = "heterozygous";
1566 
1567         cnt++;
1568     }
1569 
1570     if (cloc->marker.length() > 0) {
1571         create_genotype_map(cloc, d, sample_cnt);
1572         call_population_genotypes(cloc, d, sample_cnt);
1573     }
1574 
1575     return 0;
1576 }
1577 
1578 /*int
1579 tabulate_haplotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
1580 {
1581     map<int, CSLocus *>::iterator it;
1582     vector<char *>::iterator hit;
1583     Datum  **d;
1584     CSLocus *loc;
1585     double   mean, cnt;
1586 
1587     for (it = catalog.begin(); it != catalog.end(); it++) {
1588         loc = it->second;
1589         d   = pmap->locus(loc->id);
1590 
1591         mean = 0.0;
1592         cnt  = 0.0;
1593 
1594         for (int i = 0; i < pmap->sample_cnt(); i++) {
1595             if (d[i] == NULL)
1596                 continue;
1597 
1598             if (d[i]->obshap.size() > 1)
1599                 loc->marker = "heterozygous";
1600 
1601             mean += d[i]->lnl;
1602             cnt++;
1603         }
1604 
1605         if (loc->marker.length() > 0) {
1606             create_genotype_map(loc, d, pmap->sample_cnt());
1607             call_population_genotypes(loc, d, pmap->sample_cnt());
1608         }
1609 
1610         loc->lnl = mean / cnt;
1611     }
1612 
1613     return 0;
1614 }*/
1615 
1616 /*
1617 int
1618 merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
1619                           PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
1620                           map<int, pair<merget, int> > &merge_map,
1621                           ofstream &log_fh)
1622 {
1623     map<string, vector<CSLocus *> >::iterator it;
1624     CSLocus *cur, *next;
1625     Datum  **d_1, **d_2;
1626     double   prune_pct;
1627     uint unmergable, tot_loci, tot_samp;
1628     uint success           = 0;
1629     uint failure           = 0;
1630     uint overlap           = 0;
1631     uint simple_merge_cnt  = 0;
1632     uint complex_merge_cnt = 0;
1633     uint missing_samps_cnt = 0;
1634     uint phase_fail_cnt    = 0;
1635     uint nomapping_cnt     = 0;
1636     uint multimapping_cnt  = 0;
1637     uint multifails_cnt    = 0;
1638 
1639     tot_loci = pmap->loci_cnt();
1640 
1641     set<int> loci_to_destroy;
1642     map<int, int> missing_samps_dist;
1643 
1644     cout << "To merge adjacent loci, at least " << merge_prune_lim * 100 << "% of samples must have both adjacent loci;"
1645          << " the remaining " << 100 - (merge_prune_lim * 100) << "% of individuals will be pruned.\n"
1646          << "Attempting to merge adjacent loci that share a cutsite...";
1647 
1648     if (verbose)
1649         log_fh << "\n#\n# List of locus pairs that share a cutsite that failed to merge because they could not be phased.\n#\n";
1650 
1651     //
1652     // Iterate over each chromosome.
1653     //
1654     for (it = pmap->ordered_loci_nconst().begin(); it != pmap->ordered_loci_nconst().end(); it++) {
1655         //
1656         // Iterate over each ordered locus on this chromosome.
1657         //
1658         next = it->second[0];
1659         for (uint pos = 1; pos < it->second.size(); pos++) {
1660             cur  = next;
1661             next = it->second[pos];
1662 
1663             //
1664             // Do these two loci overlap?
1665             //   +Must occur on opposite strands
1666             //   +Must overlap according to the length of the cutsite.
1667             //
1668             if (((cur->loc.strand == strand_minus && next->loc.strand == strand_plus) &&
1669                  ((int) (cur->loc.bp  - next->loc.bp + 1) == renz_olap[enz])) ||
1670                 ((cur->loc.strand == strand_plus  && next->loc.strand == strand_minus) &&
1671                  ((int) (next->loc.bp - cur->loc.bp  + 1) == renz_olap[enz]))) {
1672                 overlap++;
1673 
1674                 d_1        = pmap->locus(cur->id);
1675                 d_2        = pmap->locus(next->id);
1676                 unmergable = 0;
1677                 tot_samp   = 0;
1678 
1679                 //
1680                 // Check if all members of the population contain these two loci (or are missing both).
1681                 //
1682                 for (int i = 0; i < pmap->sample_cnt(); i++) {
1683                     if (d_1[i] != NULL || d_2[i] != NULL)
1684                         tot_samp++;
1685                     if ((d_1[i] != NULL && d_2[i] == NULL) ||
1686                         (d_1[i] == NULL && d_2[i] != NULL))
1687                         unmergable++;
1688                 }
1689 
1690                 prune_pct = (double) (tot_samp - unmergable) / (double) tot_samp;
1691 
1692                 //
1693                 // If some of the individuals only have one locus and not the other, prune them out.
1694                 //
1695                 if (prune_pct < 1.0 && prune_pct >= merge_prune_lim) {
1696                     for (int i = 0; i < pmap->sample_cnt(); i++)
1697                         if (d_1[i] != NULL && d_2[i] == NULL) {
1698                             delete d_1[i];
1699                             d_1[i] = NULL;
1700                         } else if (d_1[i] == NULL && d_2[i] != NULL) {
1701                             delete d_2[i];
1702                             d_2[i] = NULL;
1703                         }
1704                 }
1705 
1706                 //
1707                 // If possible, merge the two loci together.
1708                 //
1709                 if (prune_pct < merge_prune_lim) {
1710                     int pct = (int) (prune_pct * 100);
1711                     missing_samps_dist[pct]++;
1712                     if (verbose) log_fh << "Missing samples, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1713                                         << pct << "% present (" << 100 - pct << "% missing)\n";
1714                     missing_samps_cnt++;
1715                     failure++;
1716                     continue;
1717                 }
1718 
1719                 phaset res = merge_and_phase_loci(pmap, cur, next, loci_to_destroy, log_fh);
1720                 switch(res) {
1721                 case multiple_fails:
1722                     if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1723                                         << "multiple failures\n";
1724                     multifails_cnt++;
1725                     phase_fail_cnt++;
1726                     failure++;
1727                     break;
1728                 case multimapping_fail:
1729                     if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1730                                         << "multimapping in one or more individuals\n";
1731                     multimapping_cnt++;
1732                     phase_fail_cnt++;
1733                     failure++;
1734                     break;
1735                 case nomapping_fail:
1736                     if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1737                                         << "no mapping in one or more individuals\n";
1738                     nomapping_cnt++;
1739                     phase_fail_cnt++;
1740                     failure++;
1741                     break;
1742                 case complex_phase:
1743                     if (verbose) log_fh << "Phased Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1744                                         << "a complex phasing operation.\n";
1745                     complex_merge_cnt++;
1746                     success++;
1747                     merge_map[cur->id]  = make_pair(merge_sink, next->id);
1748                     merge_map[next->id] = make_pair(merge_src, cur->id);
1749                     break;
1750                 case simple_merge:
1751                     if (verbose) log_fh << "Phased Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
1752                                         << "a simple merge operation.\n";
1753                     simple_merge_cnt++;
1754                     success++;
1755                     merge_map[cur->id]  = make_pair(merge_sink, next->id);
1756                     merge_map[next->id] = make_pair(merge_src, cur->id);
1757                     break;
1758                 default:
1759                     cerr << "Warning: Merge failure.\n";
1760                     break;
1761                 }
1762             }
1763         }
1764     }
1765 
1766     //
1767     // Remove those loci that have been merged from both the population map and catalog.
1768     //
1769     set<int> emptyset;
1770     pmap->prune(loci_to_destroy);
1771     reduce_catalog(catalog, emptyset, loci_to_destroy);
1772 
1773     cout << "done.\n"
1774          << "Of " << tot_loci << " loci, "
1775          << overlap << " pairs share a cutsite; "
1776          << success << " pairs were merged; "
1777          << failure << " pairs failed to merge; "
1778          << pmap->loci_cnt() << " remaining loci.\n"
1779          << "  Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
1780          << "while " << complex_merge_cnt << " required phasing.\n"
1781          << "  Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
1782          << "while " << phase_fail_cnt << " failed to be phased.\n"
1783          << "    Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
1784          << multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
1785 
1786     log_fh << "\n#\n# Merging adjacent loci with a shared restriction enzyme cutsite\n#\n"
1787            << "Of " << tot_loci << " loci, "
1788            << overlap << " pairs share a cutsite; "
1789            << success << " pairs were merged; "
1790            << failure << " pairs failed to merge; "
1791            << pmap->loci_cnt() << " remaining loci.\n"
1792            << "  Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
1793            << "while " << complex_merge_cnt << " required phasing.\n"
1794            << "  Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
1795            << "while " << phase_fail_cnt << " failed to be phased.\n"
1796            << "    Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
1797            << multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
1798     log_fh << "#\n# Distribution of loci with samples missing one of two loci to be merged\n"
1799            << "# Percent samples with both loci present\tNumber of cases\n";
1800     map<int, int>::iterator mit;
1801     for (mit = missing_samps_dist.begin(); mit != missing_samps_dist.end(); mit++)
1802         log_fh << mit->first << "\t" << mit->second << "\n";
1803     log_fh << "\n";
1804 
1805     return 0;
1806 }
1807 */
1808 /*phaset
1809 merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
1810                      set<int> &loci_to_destroy,
1811                      ofstream &log_fh)
1812 {
1813     Datum **d_1 = pmap->locus(cur->id);
1814     Datum **d_2 = pmap->locus(next->id);
1815 
1816     set<int>    phased_results;
1817     set<string> phased_haplotypes;
1818     string      merged_hap;
1819     char       *h_1, *h_2;
1820     int         merge_type;
1821 
1822     if (verbose) log_fh << "Attempting to phase source locus " << cur->id << " with sink locus " << next->id << "\n";
1823 
1824     int sample_cnt        = 0;
1825     int phased_sample_cnt = 0;
1826     //
1827     // Take a census of the already phased haplotypes. We have phased haplotypes
1828     // if for individual i:
1829     //   1. d_1 has a single haplotype and d_2 has a single haplotype
1830     //   2. d_1 has a single haplotype and d_2 has multiple haplotypes
1831     //   3. d_1 has multiple haplotypes and d_2 has a single haplotype
1832     //
1833     // If one or both of the loci have no SNPs, then the haplotype is
1834     // recorded as "consensus." Check that condition before we start merging.
1835     //
1836     if (cur->snps.size() > 0 && next->snps.size() > 0)
1837         merge_type = 0;
1838     else if (cur->snps.size() == 0)
1839         merge_type = 1;
1840     else if (next->snps.size() == 0)
1841         merge_type = 2;
1842     else
1843         merge_type = 3;
1844 
1845     for (int i = 0; i < pmap->sample_cnt(); i++) {
1846         if (d_1[i] == NULL || d_2[i] == NULL)
1847             continue;
1848         else if (d_1[i]->obshap.size() > 1 && d_2[i]->obshap.size() > 1)
1849             continue;
1850         else {
1851             for (uint j = 0; j < d_1[i]->obshap.size(); j++) {
1852                 for (uint k = 0; k < d_2[i]->obshap.size(); k++) {
1853                     switch (merge_type) {
1854                     case 0:
1855                         merged_hap = string(d_1[i]->obshap[j]) + string(d_2[i]->obshap[k]);
1856                         break;
1857                     case 1:
1858                         merged_hap = string(d_2[i]->obshap[k]);
1859                         break;
1860                     case 2:
1861                         merged_hap = string(d_1[i]->obshap[j]);
1862                         break;
1863                     case 3:
1864                     default:
1865                         merged_hap = "consensus";
1866                         break;
1867                     }
1868                     phased_haplotypes.insert(merged_hap);
1869                     // cout << "Phasing: '" << d_1[i]->obshap[j] << "' + '" << d_2[i]->obshap[k] << "' => '" << merged_hap << "'\n";
1870                 }
1871             }
1872             phased_sample_cnt++;
1873             sample_cnt++;
1874         }
1875     }
1876 
1877     //
1878     // Indicate that these two loci had a simple merge, with no phasing necessary.
1879     //
1880     phased_results.insert(simple_merge);
1881 
1882     //
1883     // Now we need to check if we can phase the remaining haplotypes.
1884     //
1885     for (int i = 0; i < pmap->sample_cnt(); i++) {
1886         if (d_1[i] == NULL || d_2[i] == NULL)
1887             continue;
1888         else if (d_1[i]->obshap.size() > 1 && d_2[i]->obshap.size() > 1) {
1889             // cout << "Attempting to phase individual " << i << ": " << d_1[i]->id << " / " << d_2[i]->id << "\n";
1890 
1891             sample_cnt++;
1892             //
1893             // We should be able to find a single phasing mapping for each haplotype from d_1 to d_2
1894             // that includes all the haplotypes in these two loci.
1895             //
1896             vector<pair<char *, char *> > seen_phased;
1897             uint tot_obshap = d_1[i]->obshap.size() + d_2[i]->obshap.size();
1898             uint phased_cnt = 0;
1899             for (uint j = 0; j < d_1[i]->obshap.size(); j++) {
1900                 for (uint k = 0; k < d_2[i]->obshap.size(); k++) {
1901                     // cout << "  " << d_1[i]->obshap[j] << " + " << d_2[i]->obshap[k];
1902                     //
1903                     // Record each pair of haplotypes that has been seen phased previously.
1904                     //
1905                     if (phased_haplotypes.count(string(d_1[i]->obshap[j]) + string(d_2[i]->obshap[k]))) {
1906                         seen_phased.push_back(make_pair(d_1[i]->obshap[j], d_2[i]->obshap[k]));
1907                         // cout << " => " << d_1[i]->obshap[j] << d_2[i]->obshap[k];
1908                     }
1909                     // cout << "\n";
1910                 }
1911             }
1912             //
1913             // Now, we will iterate over all sets of phased haplotypes and look
1914             // for combinations that use all four individual haplotypes.
1915             //
1916             for (uint j = 0; j < seen_phased.size(); j++) {
1917                 for (uint k = j; k < seen_phased.size(); k++) {
1918                     set<char *> incorporated_haplotypes;
1919                     //
1920                     // Count the number of distinct char pointers. If this combination
1921                     // of haplotypes includes all unphased haplotypes, count it.
1922                     //
1923                     incorporated_haplotypes.insert(seen_phased[j].first);
1924                     incorporated_haplotypes.insert(seen_phased[j].second);
1925                     incorporated_haplotypes.insert(seen_phased[k].first);
1926                     incorporated_haplotypes.insert(seen_phased[k].second);
1927                     if (incorporated_haplotypes.size() == tot_obshap)
1928                         phased_cnt++;
1929                 }
1930             }
1931 
1932             //
1933             // If one pair of haplotypes is mapped, but the other is not, assume the second pair of
1934             // haplotypes must be phased by process of elimination.
1935             //
1936             if (phased_cnt == 0 && seen_phased.size() == 1) {
1937                 h_1 = seen_phased[0].first  == d_1[i]->obshap[1] ?
1938                     d_1[i]->obshap[0] : d_1[i]->obshap[1];
1939                 h_2 = seen_phased[0].second == d_2[i]->obshap[1] ?
1940                     d_2[i]->obshap[0] : d_2[i]->obshap[1];
1941                 phased_haplotypes.insert(string(h_1) + string(h_2));
1942                 phased_cnt++;
1943                 // cout << "  Phasing: '" << hap_1 << "' + '" << hap_2 << "' => '" << string(hap_1) + string(hap_2) << "'\n";
1944             }
1945 
1946             if (phased_cnt == 0) {
1947                 phased_results.insert(nomapping_fail);
1948                 if (verbose) log_fh << "    Locus NOT phased in individual " << i << "; loci " << d_1[i]->id << " / " << d_2[i]->id << " no mapping found.\n";
1949             } else if (phased_cnt == 1) {
1950                 phased_sample_cnt++;
1951                 phased_results.insert(complex_phase);
1952             } else {
1953                 phased_results.insert(multimapping_fail);
1954                 if (verbose) log_fh << "    Locus NOT phased in individual " << i << "; loci " << d_1[i]->id << " / " << d_2[i]->id << " multiple mappings found.\n";
1955             }
1956         }
1957     }
1958 
1959     if (phased_sample_cnt != sample_cnt) {
1960         if (phased_results.count(nomapping_fail) > 0 &&
1961             phased_results.count(multimapping_fail) > 0)
1962             return multiple_fails;
1963         else if (phased_results.count(nomapping_fail) > 0)
1964             return nomapping_fail;
1965         else if (phased_results.count(multimapping_fail) > 0)
1966             return multimapping_fail;
1967         else {
1968             cout << "WE SHOULD NOT GET HERE\n";
1969             return merge_failure;
1970         }
1971     }
1972 
1973     //
1974     // Okay, merge these two loci together.
1975     //
1976     if (!merge_datums(pmap->sample_cnt(), cur->len, d_1, d_2, phased_haplotypes, merge_type))
1977         return merge_failure;
1978 
1979     //
1980     // Merge the catalog entries together.
1981     //
1982     if (!merge_csloci(cur, next, phased_haplotypes))
1983         return merge_failure;
1984 
1985     //
1986     // Mark the merged locus for destruction.
1987     //
1988     loci_to_destroy.insert(next->id);
1989 
1990     if (phased_results.count(complex_phase) > 0)
1991         return complex_phase;
1992     return simple_merge;
1993 }*/
1994 
1995 /*int
1996 merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
1997 {
1998     //
1999     // We assume that we are merging two loci: one on the negative strand, one on the
2000     // positive. We will keep the sink cslocus and delete the src cslocus.
2001     //   -> The sink cslocus is assumed to be on the negative strand.
2002     //
2003 
2004     //
2005     // 1. Reverse complement the SNP coordinates in the sink locus so that they are
2006     //    enumerated on the positive strand. Complement the alleles as well.
2007     //
2008     for (uint j = 0; j < sink->snps.size(); j++) {
2009         sink->snps[j]->col    = sink->len - sink->snps[j]->col - 1;
2010         sink->snps[j]->rank_1 = reverse(sink->snps[j]->rank_1);
2011         sink->snps[j]->rank_2 = reverse(sink->snps[j]->rank_2);
2012         sink->snps[j]->rank_3 = reverse(sink->snps[j]->rank_3);
2013         sink->snps[j]->rank_4 = reverse(sink->snps[j]->rank_4);
2014     }
2015 
2016     //
2017     // 2. Adjust the SNP coordinates in the src locus to account for the new, longer merged length.
2018     //
2019     for (uint j = 0; j < src->snps.size(); j++)
2020         src->snps[j]->col = sink->len + src->snps[j]->col - renz_olap[enz];
2021 
2022     //
2023     // 3. Combine SNPs between the two catalog loci: add the SNPs from the sink (formerly on the
2024     //    negative strand) in reverse order, followed by the SNPs from the src.
2025     //
2026     vector<SNP *> tmpsnp;
2027     for (int j = (int) sink->snps.size() - 1; j >= 0; j--)
2028         tmpsnp.push_back(sink->snps[j]);
2029     for (uint j = 0; j < src->snps.size(); j++)
2030         tmpsnp.push_back(src->snps[j]);
2031     sink->snps.clear();
2032     for (uint j = 0; j < tmpsnp.size(); j++)
2033         sink->snps.push_back(tmpsnp[j]);
2034 
2035     //
2036     // 4. Adjust the genomic location of the sink locus.
2037     //
2038     uint bp = sink->sort_bp();
2039     sink->loc.bp     = bp;
2040     sink->loc.strand = strand_plus;
2041 
2042     //
2043     // 5. Adjust the length of the sequence.
2044     //
2045     sink->len += src->len - renz_olap[enz];
2046 
2047     //
2048     // 6. Merge the consensus sequence together.
2049     //
2050     char *new_con = rev_comp(sink->con);
2051     delete [] sink->con;
2052     sink->con = new_con;
2053     new_con   = new char[sink->len + 1];
2054     strcpy(new_con, sink->con);
2055     delete [] sink->con;
2056     sink->con = new_con;
2057     new_con  += src->len - renz_olap[enz];
2058     strcpy(new_con, src->con);
2059 
2060     //
2061     // 7. Record the now phased haplotypes.
2062     //
2063     sink->alleles.clear();
2064     set<string>::iterator it;
2065     for (it = phased_haplotypes.begin(); it != phased_haplotypes.end(); it++)
2066         sink->alleles[*it] = 0;
2067 
2068     // cout << "CSLocus " << sink->id << ":\n"
2069     //   << "Length: " << sink->len << "; Chr: " << sink->loc.chr << "; BP: " << sink->sort_bp() << "; strand: " << (sink->loc.strand == strand_plus ? "+" : "-") << "\n"
2070     //   << "  SNPs:\n";
2071     // for (uint j = 0; j < sink->snps.size(); j++)
2072     //  cout << "    Col: " << sink->snps[j]->col
2073     //       << "    Rank 1: " << sink->snps[j]->rank_1
2074     //       << "    Rank 2: " << sink->snps[j]->rank_2 << "\n";
2075     // cout << "  Alleles:\n";
2076     // map<string, int>::iterator ait;
2077     // for (ait = sink->alleles.begin(); ait != sink->alleles.end(); ait++)
2078     //  cout << "    " << ait->first << "\n";
2079 
2080     return 1;
2081 }*/
2082 
2083 /*int
2084 merge_datums(int sample_cnt,
2085              int sink_locus_len,
2086              Datum **sink, Datum **src,
2087              set<string> &phased_haplotypes,
2088              int merge_type)
2089 {
2090     char           tmphap[id_len], *new_hap;
2091     uint           haplen, model_len, offset;
2092     vector<SNP *>  tmpsnp;
2093     vector<string> tmpobshap;
2094     vector<int>    tmpobsdep;
2095 
2096     //
2097     // We assume that we are merging two loci: one on the negative strand, one on the
2098     // positive. We will keep the sink datum and delete the src datum.
2099     //   -The sink datum is assumed to be on the negative strand.
2100     //
2101     for (int i = 0; i < sample_cnt; i++) {
2102         if (sink[i] == NULL && src[i] == NULL)
2103             continue;
2104         else if (sink[i] == NULL || src[i] == NULL)
2105             cout << "Unexpected condition in merging datums: one datum is NULL while the other is not.\n";
2106 
2107         //
2108         // 1. Reverse complement the observed haplotypes in the sink locus.
2109         //
2110         haplen = strlen(sink[i]->obshap[0]);
2111         for (uint j = 0; j < sink[i]->obshap.size(); j++) {
2112             for (uint k = 0; k < haplen; k++)
2113                 tmphap[k] = reverse(sink[i]->obshap[j][haplen - k - 1]);
2114             tmphap[haplen] = '\0';
2115             strcpy(sink[i]->obshap[j], tmphap);
2116         }
2117     }
2118 
2119     //
2120     // 2. Combine observed haplotypes between the two datums while phasing them.
2121     //    2.1 First combine the haplotypes from samples that are already in phase.
2122     //
2123     string      merged_hap;
2124     vector<int> to_be_phased;
2125     phased_haplotypes.clear();
2126     for (int i = 0; i < sample_cnt; i++) {
2127         if (sink[i] == NULL && src[i] == NULL)
2128             continue;
2129 
2130         if (sink[i]->obshap.size() > 1 && src[i]->obshap.size() > 1) {
2131             to_be_phased.push_back(i);
2132             continue;
2133         } else {
2134             tmpobshap.clear();
2135             tmpobsdep.clear();
2136             for (uint j = 0; j < sink[i]->obshap.size(); j++) {
2137                 for (uint k = 0; k < src[i]->obshap.size(); k++) {
2138                     switch (merge_type) {
2139                     case 0:
2140                         merged_hap = string(sink[i]->obshap[j]) + string(src[i]->obshap[k]);
2141                         break;
2142                     case 1:
2143                         merged_hap = string(src[i]->obshap[j]);
2144                         break;
2145                     case 2:
2146                         merged_hap = string(sink[i]->obshap[j]);
2147                         break;
2148                     case 3:
2149                     default:
2150                         merged_hap = "consensus";
2151                         break;
2152                     }
2153                     phased_haplotypes.insert(merged_hap);
2154                     tmpobshap.push_back(merged_hap);
2155                     tmpobsdep.push_back((sink[i]->depth[j] + src[i]->depth[k]) / 2);
2156                 }
2157             }
2158             sink[i]->depth.clear();
2159             for (uint j = 0; j < sink[i]->obshap.size(); j++)
2160                 delete [] sink[i]->obshap[j];
2161             sink[i]->obshap.clear();
2162             for (uint j = 0; j < tmpobshap.size(); j++) {
2163                 new_hap = new char[tmpobshap[j].length() + 1];
2164                 strcpy(new_hap, tmpobshap[j].c_str());
2165                 sink[i]->obshap.push_back(new_hap);
2166                 sink[i]->depth.push_back(tmpobsdep[j]);
2167             }
2168         }
2169     }
2170     //
2171     //    2.2 Phase and combine the haplotypes from the remaining samples.
2172     //
2173     int index;
2174     for (uint i = 0; i < to_be_phased.size(); i++) {
2175         index = to_be_phased[i];
2176         tmpobshap.clear();
2177         tmpobsdep.clear();
2178 
2179         vector<pair<char *, char *> > seen_phased;
2180         uint tot_obshap = sink[index]->obshap.size() + src[index]->obshap.size();
2181 
2182         for (uint j = 0; j < sink[index]->obshap.size(); j++) {
2183             for (uint k = 0; k < src[index]->obshap.size(); k++) {
2184                 if (phased_haplotypes.count(string(sink[index]->obshap[j]) + string(src[index]->obshap[k])))
2185                     seen_phased.push_back(make_pair(sink[index]->obshap[j], src[index]->obshap[k]));
2186             }
2187         }
2188 
2189         for (uint j = 0; j < seen_phased.size(); j++) {
2190             for (uint k = j; k < seen_phased.size(); k++) {
2191                 set<char *> incorporated_haplotypes;
2192                 incorporated_haplotypes.insert(seen_phased[j].first);
2193                 incorporated_haplotypes.insert(seen_phased[j].second);
2194                 incorporated_haplotypes.insert(seen_phased[k].first);
2195                 incorporated_haplotypes.insert(seen_phased[k].second);
2196                 if (incorporated_haplotypes.size() == tot_obshap) {
2197                     tmpobshap.push_back(string(seen_phased[j].first) + string(seen_phased[j].second));
2198                     tmpobshap.push_back(string(seen_phased[k].first) + string(seen_phased[k].second));
2199                     //tmpobsdep.push_back((sink[index]->depth[j] + src[index]->depth[k]) / 2);
2200                 }
2201             }
2202         }
2203 
2204         sink[index]->depth.clear();
2205         for (uint j = 0; j < sink[index]->obshap.size(); j++)
2206             delete [] sink[index]->obshap[j];
2207         sink[index]->obshap.clear();
2208         for (uint j = 0; j < tmpobshap.size(); j++) {
2209             new_hap = new char[tmpobshap[j].length() + 1];
2210             strcpy(new_hap, tmpobshap[j].c_str());
2211             sink[index]->obshap.push_back(new_hap);
2212             // sink[index]->depth.push_back(tmpobsdep[j]);
2213         }
2214     }
2215 
2216     //
2217     // 3. Merge model calls; Set the length; combine the two depth and lnl measures together.
2218     //
2219     string model_calls;
2220     char  *p;
2221 
2222     for (int i = 0; i < sample_cnt; i++) {
2223         if (sink[i] == NULL && src[i] == NULL)
2224             continue;
2225 
2226         //
2227         // Merge the two strings of model calls together.
2228         // We need to check if the locus for this individual is shorter than the catalog
2229         // locus. If so, we need to expand out the model call array to be the proper length.
2230         //
2231         reverse_string(sink[i]->model);
2232         offset = 0;
2233         model_calls.clear();
2234         if (sink_locus_len > sink[i]->len) {
2235             offset = sink_locus_len - sink[i]->len;
2236             model_calls.assign(offset, 'N');
2237         }
2238         model_len = offset + sink[i]->len + src[i]->len - renz_olap[enz];
2239         model_calls.append(sink[i]->model);
2240         delete [] sink[i]->model;
2241         sink[i]->model = new char[model_len + 1];
2242         strcpy(sink[i]->model, model_calls.c_str());
2243         p  = sink[i]->model;
2244         p += offset + sink[i]->len - renz_olap[enz];
2245         strcpy(p, src[i]->model);
2246 
2247         sink[i]->len       = model_len;
2248         sink[i]->tot_depth = (sink[i]->tot_depth + src[i]->tot_depth) / 2;
2249         sink[i]->lnl       = (sink[i]->lnl + src[i]->lnl) / 2.0;
2250 
2251         //
2252         // Record which datum was merged into this one.
2253         //
2254         sink[i]->merge_partner = src[i]->id;
2255     }
2256 
2257     return 1;
2258 }*/
2259 
2260 int
2261 create_genotype_map(CSLocus *locus, Datum **d, int sample_cnt)
2262 {
2263     //
2264     // Create a genotype map. For any set of haplotypes, this routine will
2265     // assign each haplotype to a genotype, e.g. given the haplotypes
2266     // 'AC' and 'GT' in the population, this routine will assign 'AC' == 'a'
2267     // and 'GT' == 'b'. If an individual is homozygous for 'AC', they will be
2268     // assigned an 'aa' genotype.
2269     //
2270     //cout << "Creating genotype map for catalog ID " << locus->id  << ", marker: " << locus->marker << ".\n";
2271 
2272     char gtypes[26] ={'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
2273                       'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
2274                       'u', 'v', 'w', 'x', 'y', 'z'};
2275 
2276     map<string, int> haplotypes;
2277     map<string, int>::iterator k;
2278     vector<pair<string, int> > sorted_haplotypes;
2279 
2280     for (int i = 0; i < sample_cnt; i++) {
2281 
2282         if (d[i] != NULL)
2283             for (uint n = 0; n < d[i]->obshap.size(); n++)
2284                 haplotypes[d[i]->obshap[n]]++;
2285     }
2286 
2287     //
2288     // Check that there are not more haplotypes than we have encodings.
2289     //
2290     if (haplotypes.size() > 26) return 0;
2291 
2292     //
2293     // Sort the haplotypes map by value
2294     //
2295     for (k = haplotypes.begin(); k != haplotypes.end(); k++)
2296         sorted_haplotypes.push_back(*k);
2297     sort(sorted_haplotypes.begin(), sorted_haplotypes.end(), hap_compare);
2298 
2299     for (uint n = 0, index = 0; n < sorted_haplotypes.size() && index < 26; n++, index++) {
2300         locus->gmap[sorted_haplotypes[n].first] = gtypes[index];
2301         //cout << "GMAP: " << sorted_haplotypes[n].first << " == " << gtypes[index] << "\n";
2302     }
2303 
2304     return 0;
2305 }
2306 
2307 int
2308 call_population_genotypes(CSLocus *locus, Datum **d, int sample_cnt)
2309 {
2310     //
2311     // Fetch the array of observed haplotypes from the population
2312     //
2313     for (int i = 0; i < sample_cnt; i++) {
2314         if (d[i] == NULL)
2315             continue;
2316 
2317         vector<string> gtypes;
2318         string gtype;
2319 
2320         //cout << "Sample Id: " << pmap->rev_sample_index(i) << "\n";
2321 
2322         for (uint j = 0; j < d[i]->obshap.size(); j++) {
2323             //
2324             // Impossible allele encountered.
2325             //
2326             if (locus->gmap.count(d[i]->obshap[j]) == 0) {
2327                 gtypes.clear();
2328                 gtypes.push_back("-");
2329                 goto impossible;
2330             }
2331 
2332             gtypes.push_back(locus->gmap[d[i]->obshap[j]]);
2333             //cout << "  Observed Haplotype: " << d[i]->obshap[j] << ", Genotype: " << locus->gmap[d[i]->obshap[j]] << "\n";
2334         }
2335 
2336     impossible:
2337         sort(gtypes.begin(), gtypes.end());
2338         for (uint j = 0; j < gtypes.size(); j++) {
2339             gtype += gtypes[j];
2340             //cout << "  Adding genotype to string: " << gtypes[j] << "; " << gtype << "\n";
2341         }
2342 
2343         string m = gtype.length() == 1 ?
2344             gtype + gtype : gtype;
2345 
2346         d[i]->gtype = new char[m.length() + 1];
2347         strcpy(d[i]->gtype, m.c_str());
2348 
2349         if (m != "-")
2350             locus->gcnt++;
2351 
2352         //cout << "Assigning datum, marker: " << locus->marker << ", string: " << m << ", haplotype: " << d[i]->obshap[0] << ", gtype: " << gtype << "\n";
2353      }
2354 
2355     return 0;
2356 }
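
//
// Worked example for the two routines above (the haplotypes and counts are
// hypothetical, and hap_compare is assumed to order haplotypes by decreasing
// frequency): given a locus where 'AC' was observed 40 times and 'GT' 12 times
// across the population, create_genotype_map() assigns
//     locus->gmap["AC"] = 'a';    locus->gmap["GT"] = 'b';
// and call_population_genotypes() then encodes each sample:
//     obshap = {"AC", "GT"}  ->  gtype "ab"             (heterozygote)
//     obshap = {"AC"}        ->  gtype "a", stored "aa" (homozygote; single letters are doubled)
//     obshap containing a haplotype absent from gmap -> reset to the missing call "-"
//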
2357 
2358 SumStatsSummary::SumStatsSummary(size_t pop_cnt)
2359 {
2360     this->_pop_cnt           = pop_cnt;
2361     this->_private_cnt       = new size_t[this->_pop_cnt];
2362     this->_sig_hwe_dev       = new size_t[this->_pop_cnt];
2363     this->_n                 = new double[this->_pop_cnt];
2364     this->_var_sites         = new double[this->_pop_cnt];
2365 
2366     this->_num_indv_mean     = new double[this->_pop_cnt];
2367     this->_num_indv_acc_mean = new double[this->_pop_cnt];
2368     this->_num_indv_var      = new double[this->_pop_cnt];
2369     this->_p_mean            = new double[this->_pop_cnt];
2370     this->_p_acc_mean        = new double[this->_pop_cnt];
2371     this->_p_var             = new double[this->_pop_cnt];
2372     this->_obs_het_mean      = new double[this->_pop_cnt];
2373     this->_obs_het_acc_mean  = new double[this->_pop_cnt];
2374     this->_obs_het_var       = new double[this->_pop_cnt];
2375     this->_obs_hom_mean      = new double[this->_pop_cnt];
2376     this->_obs_hom_acc_mean  = new double[this->_pop_cnt];
2377     this->_obs_hom_var       = new double[this->_pop_cnt];
2378     this->_exp_het_mean      = new double[this->_pop_cnt];
2379     this->_exp_het_acc_mean  = new double[this->_pop_cnt];
2380     this->_exp_het_var       = new double[this->_pop_cnt];
2381     this->_exp_hom_mean      = new double[this->_pop_cnt];
2382     this->_exp_hom_acc_mean  = new double[this->_pop_cnt];
2383     this->_exp_hom_var       = new double[this->_pop_cnt];
2384     this->_pi_mean           = new double[this->_pop_cnt];
2385     this->_pi_acc_mean       = new double[this->_pop_cnt];
2386     this->_pi_var            = new double[this->_pop_cnt];
2387     this->_fis_mean          = new double[this->_pop_cnt];
2388     this->_fis_acc_mean      = new double[this->_pop_cnt];
2389     this->_fis_var           = new double[this->_pop_cnt];
2390 
2391     this->_n_all                 = new double[this->_pop_cnt];
2392     this->_num_indv_mean_all     = new double[this->_pop_cnt];
2393     this->_num_indv_acc_mean_all = new double[this->_pop_cnt];
2394     this->_num_indv_var_all      = new double[this->_pop_cnt];
2395     this->_p_mean_all            = new double[this->_pop_cnt];
2396     this->_p_acc_mean_all        = new double[this->_pop_cnt];
2397     this->_p_var_all             = new double[this->_pop_cnt];
2398     this->_obs_het_mean_all      = new double[this->_pop_cnt];
2399     this->_obs_het_acc_mean_all  = new double[this->_pop_cnt];
2400     this->_obs_het_var_all       = new double[this->_pop_cnt];
2401     this->_obs_hom_mean_all      = new double[this->_pop_cnt];
2402     this->_obs_hom_acc_mean_all  = new double[this->_pop_cnt];
2403     this->_obs_hom_var_all       = new double[this->_pop_cnt];
2404     this->_exp_het_mean_all      = new double[this->_pop_cnt];
2405     this->_exp_het_acc_mean_all  = new double[this->_pop_cnt];
2406     this->_exp_het_var_all       = new double[this->_pop_cnt];
2407     this->_exp_hom_mean_all      = new double[this->_pop_cnt];
2408     this->_exp_hom_acc_mean_all  = new double[this->_pop_cnt];
2409     this->_exp_hom_var_all       = new double[this->_pop_cnt];
2410     this->_pi_mean_all           = new double[this->_pop_cnt];
2411     this->_pi_acc_mean_all       = new double[this->_pop_cnt];
2412     this->_pi_var_all            = new double[this->_pop_cnt];
2413     this->_fis_mean_all          = new double[this->_pop_cnt];
2414     this->_fis_acc_mean_all      = new double[this->_pop_cnt];
2415     this->_fis_var_all           = new double[this->_pop_cnt];
2416 
2417     this->_sq_n     = new double[this->_pop_cnt];
2418     this->_sq_n_all = new double[this->_pop_cnt];
2419 
2420     for (uint j = 0; j < this->_pop_cnt; j++) {
2421         this->_private_cnt[j]       = 0;
2422         this->_sig_hwe_dev[j]       = 0;
2423         this->_n[j]                 = 0.0;
2424         this->_var_sites[j]         = 0.0;
2425         this->_num_indv_mean[j]     = 0.0;
2426         this->_num_indv_acc_mean[j] = 0.0;
2427         this->_num_indv_var[j]      = 0.0;
2428         this->_p_mean[j]            = 0.0;
2429         this->_p_acc_mean[j]        = 0.0;
2430         this->_p_var[j]             = 0.0;
2431         this->_obs_het_mean[j]      = 0.0;
2432         this->_obs_het_acc_mean[j]  = 0.0;
2433         this->_obs_het_var[j]       = 0.0;
2434         this->_obs_hom_mean[j]      = 0.0;
2435         this->_obs_hom_acc_mean[j]  = 0.0;
2436         this->_obs_hom_var[j]       = 0.0;
2437         this->_exp_het_mean[j]      = 0.0;
2438         this->_exp_het_acc_mean[j]  = 0.0;
2439         this->_exp_het_var[j]       = 0.0;
2440         this->_exp_hom_mean[j]      = 0.0;
2441         this->_exp_hom_acc_mean[j]  = 0.0;
2442         this->_exp_hom_var[j]       = 0.0;
2443         this->_pi_mean[j]           = 0.0;
2444         this->_pi_acc_mean[j]       = 0.0;
2445         this->_pi_var[j]            = 0.0;
2446         this->_fis_mean[j]          = 0.0;
2447         this->_fis_acc_mean[j]      = 0.0;
2448         this->_fis_var[j]           = 0.0;
2449 
2450         this->_n_all[j]                 = 0.0;
2451         this->_num_indv_mean_all[j]     = 0.0;
2452         this->_num_indv_acc_mean_all[j] = 0.0;
2453         this->_num_indv_var_all[j]      = 0.0;
2454         this->_p_mean_all[j]            = 0.0;
2455         this->_p_acc_mean_all[j]        = 0.0;
2456         this->_p_var_all[j]             = 0.0;
2457         this->_obs_het_mean_all[j]      = 0.0;
2458         this->_obs_het_acc_mean_all[j]  = 0.0;
2459         this->_obs_het_var_all[j]       = 0.0;
2460         this->_obs_hom_mean_all[j]      = 0.0;
2461         this->_obs_hom_acc_mean_all[j]  = 0.0;
2462         this->_obs_hom_var_all[j]       = 0.0;
2463         this->_exp_het_mean_all[j]      = 0.0;
2464         this->_exp_het_acc_mean_all[j]  = 0.0;
2465         this->_exp_het_var_all[j]       = 0.0;
2466         this->_exp_hom_mean_all[j]      = 0.0;
2467         this->_exp_hom_acc_mean_all[j]  = 0.0;
2468         this->_exp_hom_var_all[j]       = 0.0;
2469         this->_pi_mean_all[j]           = 0.0;
2470         this->_pi_acc_mean_all[j]       = 0.0;
2471         this->_pi_var_all[j]            = 0.0;
2472         this->_fis_mean_all[j]          = 0.0;
2473         this->_fis_acc_mean_all[j]      = 0.0;
2474         this->_fis_var_all[j]           = 0.0;
2475 
2476         this->_sq_n[j]     = 0.0;
2477         this->_sq_n_all[j] = 0.0;
2478     }
2479 
2480     this->_locus_n         = 0.0;
2481     this->_locus_overlap_n = 0.0;
2482     this->_locus_pe_ctg_n  = 0.0;
2483     this->_locus_len_mean     = 0.0;
2484     this->_locus_len_acc_mean = 0.0;
2485     this->_locus_len_var      = 0.0;
    this->_locus_len_mean_all     = 0.0;
    this->_locus_len_acc_mean_all = 0.0;
    this->_locus_len_var_all      = 0.0;
2486     this->_overlap_mean     = 0.0;
2487     this->_overlap_acc_mean = 0.0;
2488     this->_overlap_var      = 0.0;
2489     this->_locus_gt_sites_mean     = 0.0;
2490     this->_locus_gt_sites_acc_mean = 0.0;
2491     this->_locus_gt_sites_var      = 0.0;
2492 }
2493 
2494 int
2495 SumStatsSummary::accumulate(const vector<LocBin *> &loci)
2496 {
2497     //
2498     // We are calculating the mean, variance, and standard deviation for several variables.
2499     //   We will calculate them partially, for each set of loci input to the program using
2500     //   the algorithm described in:
2501     //     B. P. Welford. (1962) Note on a Method for Calculating Corrected Sums of Squares and
2502     //     Products. Technometrics: 4(3), pp. 419-420.
2503     //
2504     CSLocus        *cloc;
2505     const LocSum   *s;
2506     const LocTally *t;
2507 
2508     for (uint i = 0; i < loci.size(); i++) {
2509         cloc = loci[i]->cloc;
2510         t    = loci[i]->s->meta_pop();
2511 
2512         size_t site_cnt = 0;
2513 
2514         for (uint pos = 0; pos < cloc->len; pos++) {
2515             //
2516             // Compile private alleles
2517             //
2518             if (t->nucs[pos].priv_allele >= 0)
2519                 _private_cnt[t->nucs[pos].priv_allele]++;
2520 
2521             if (t->nucs[pos].allele_cnt == 2) {
2522                 site_cnt++;
2523 
2524                 for (uint pop = 0; pop < this->_pop_cnt; pop++) {
2525 
2526                     s = loci[i]->s->per_pop(pop);
2527 
2528                     if (s->nucs[pos].num_indv == 0) continue;
2529 
2530                     _n[pop]++;
2531 
2532                     if (s->nucs[pos].pi > 0) _var_sites[pop]++;
2533 
2534                     //
2535                     // Record if site deviates from HWE.
2536                     //
2537                     if (calc_hwp && s->nucs[pos].stat[2] < p_value_cutoff) _sig_hwe_dev[pop]++;
2538 
2539                     //
2540                     // Accumulate sums for each variable to calculate the means.
2541                     //
2542                     _num_indv_mean[pop] += s->nucs[pos].num_indv;
2543                     _p_mean[pop]        += s->nucs[pos].p;
2544                     _obs_het_mean[pop]  += s->nucs[pos].obs_het;
2545                     _obs_hom_mean[pop]  += s->nucs[pos].obs_hom;
2546                     _exp_het_mean[pop]  += s->nucs[pos].exp_het;
2547                     _exp_hom_mean[pop]  += s->nucs[pos].exp_hom;
2548                     _pi_mean[pop]       += s->nucs[pos].stat[0];
2549                     _fis_mean[pop]      += s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0;
2550 
2551                     _n_all[pop]++;
2552                     _num_indv_mean_all[pop] += s->nucs[pos].num_indv;
2553                     _p_mean_all[pop]        += s->nucs[pos].p;
2554                     _obs_het_mean_all[pop]  += s->nucs[pos].obs_het;
2555                     _obs_hom_mean_all[pop]  += s->nucs[pos].obs_hom;
2556                     _exp_het_mean_all[pop]  += s->nucs[pos].exp_het;
2557                     _exp_hom_mean_all[pop]  += s->nucs[pos].exp_hom;
2558                     _pi_mean_all[pop]       += s->nucs[pos].stat[0];
2559                     _fis_mean_all[pop]      += s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0;
2560 
2561                     //
2562                     // Accumulate a partial sum of squares to calculate the variance.
2563                     //
2564                     _num_indv_var[pop] += this->online_variance(s->nucs[pos].num_indv, _num_indv_acc_mean[pop], _n[pop]);
2565                     _p_var[pop]        += this->online_variance(s->nucs[pos].p,        _p_acc_mean[pop],        _n[pop]);
2566                     _obs_het_var[pop]  += this->online_variance(s->nucs[pos].obs_het,  _obs_het_acc_mean[pop],  _n[pop]);
2567                     _obs_hom_var[pop]  += this->online_variance(s->nucs[pos].obs_hom,  _obs_hom_acc_mean[pop],  _n[pop]);
2568                     _exp_het_var[pop]  += this->online_variance(s->nucs[pos].exp_het,  _exp_het_acc_mean[pop],  _n[pop]);
2569                     _exp_hom_var[pop]  += this->online_variance(s->nucs[pos].exp_hom,  _exp_hom_acc_mean[pop],  _n[pop]);
2570                     _pi_var[pop]       += this->online_variance(s->nucs[pos].stat[0],  _pi_acc_mean[pop],       _n[pop]);
2571                     _fis_var[pop]      += this->online_variance(s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0, _fis_acc_mean[pop], _n[pop]);
2572 
2573                     _num_indv_var_all[pop] += this->online_variance(s->nucs[pos].num_indv, _num_indv_acc_mean_all[pop], _n_all[pop]);
2574                     _p_var_all[pop]        += this->online_variance(s->nucs[pos].p,        _p_acc_mean_all[pop],        _n_all[pop]);
2575                     _obs_het_var_all[pop]  += this->online_variance(s->nucs[pos].obs_het,  _obs_het_acc_mean_all[pop],  _n_all[pop]);
2576                     _obs_hom_var_all[pop]  += this->online_variance(s->nucs[pos].obs_hom,  _obs_hom_acc_mean_all[pop],  _n_all[pop]);
2577                     _exp_het_var_all[pop]  += this->online_variance(s->nucs[pos].exp_het,  _exp_het_acc_mean_all[pop],  _n_all[pop]);
2578                     _exp_hom_var_all[pop]  += this->online_variance(s->nucs[pos].exp_hom,  _exp_hom_acc_mean_all[pop],  _n_all[pop]);
2579                     _pi_var_all[pop]       += this->online_variance(s->nucs[pos].stat[0],  _pi_acc_mean_all[pop],       _n_all[pop]);
2580                     _fis_var_all[pop]      += this->online_variance(s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0, _fis_acc_mean_all[pop], _n_all[pop]);
2581                 }
2582 
2583             } else if (t->nucs[pos].allele_cnt == 1) {
2584                 site_cnt++;
2585 
2586                 for (uint pop = 0; pop < this->_pop_cnt; pop++) {
2587                     s = loci[i]->s->per_pop(pop);
2588 
2589                     if (s->nucs[pos].num_indv == 0) continue;
2590 
2591                     _n_all[pop]++;
2592                     _num_indv_mean_all[pop] += s->nucs[pos].num_indv;
2593                     _p_mean_all[pop]        += s->nucs[pos].p;
2594                     _obs_het_mean_all[pop]  += s->nucs[pos].obs_het;
2595                     _obs_hom_mean_all[pop]  += s->nucs[pos].obs_hom;
2596                     _exp_het_mean_all[pop]  += s->nucs[pos].exp_het;
2597                     _exp_hom_mean_all[pop]  += s->nucs[pos].exp_hom;
2598                     _pi_mean_all[pop]       += s->nucs[pos].stat[0];
2599                     _fis_mean_all[pop]      += s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0;
2600 
2601                     _num_indv_var_all[pop] += this->online_variance(s->nucs[pos].num_indv, _num_indv_acc_mean_all[pop], _n_all[pop]);
2602                     _p_var_all[pop]        += this->online_variance(s->nucs[pos].p,        _p_acc_mean_all[pop],        _n_all[pop]);
2603                     _obs_het_var_all[pop]  += this->online_variance(s->nucs[pos].obs_het,  _obs_het_acc_mean_all[pop],  _n_all[pop]);
2604                     _obs_hom_var_all[pop]  += this->online_variance(s->nucs[pos].obs_hom,  _obs_hom_acc_mean_all[pop],  _n_all[pop]);
2605                     _exp_het_var_all[pop]  += this->online_variance(s->nucs[pos].exp_het,  _exp_het_acc_mean_all[pop],  _n_all[pop]);
2606                     _exp_hom_var_all[pop]  += this->online_variance(s->nucs[pos].exp_hom,  _exp_hom_acc_mean_all[pop],  _n_all[pop]);
2607                     _pi_var_all[pop]       += this->online_variance(s->nucs[pos].stat[0],  _pi_acc_mean_all[pop],       _n_all[pop]);
2608                     _fis_var_all[pop]      += this->online_variance(s->nucs[pos].stat[1] != -7.0 ? s->nucs[pos].stat[1] : 0.0, _fis_acc_mean_all[pop], _n_all[pop]);
2609                 }
2610             }
2611         }
2612 
2613         //
2614         // Accumulate the locus length and overlap.
2615         //
2616         _locus_n++;
2617         _locus_gt_sites_mean += site_cnt;
2618         _locus_gt_sites_var  += this->online_variance(site_cnt, _locus_gt_sites_acc_mean, _locus_n);
2619 
2620         if (cloc->pe_ctg) {
2621             _locus_pe_ctg_n++;
2622             _locus_len_mean_all += cloc->len - 10;
2623             _locus_len_var_all  += this->online_variance(cloc->len - 10, _locus_len_acc_mean_all, _locus_pe_ctg_n);
2624 
2625             if (cloc->overlap > 0) {
2626                 _locus_overlap_n++;
2627                 _locus_len_mean += cloc->len;
2628                 _locus_len_var  += this->online_variance(cloc->len, _locus_len_acc_mean, _locus_overlap_n);
2629                 _overlap_mean   += cloc->overlap;
2630                 _overlap_var    += this->online_variance(cloc->overlap, _overlap_acc_mean, _locus_overlap_n);
2631             }
2632         }
2633     }
2634 
2635     return 0;
2636 }
2637 
2638 inline double
2639 SumStatsSummary::online_variance(double x, double &acc_mean, double n)
2640 {
2641     double delta1, delta2;
2642 
2643     delta1    = x - acc_mean;
2644     acc_mean += delta1 / n;
2645     delta2    = x - acc_mean;
2646     return delta1 * delta2;
2647 }
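
//
// online_variance() is the per-observation step of the Welford single-pass algorithm
// cited in SumStatsSummary::accumulate(): for the k-th value x_k,
//     mean_k = mean_{k-1} + (x_k - mean_{k-1}) / k
//     M2_k   = M2_{k-1}   + (x_k - mean_{k-1}) * (x_k - mean_k)
// and the sample variance is M2_n / (n - 1), which is what final_calculation()
// computes. A minimal usage sketch with hypothetical values (written as if called
// from a member function, since online_variance() is a member):
//
//     double acc_mean = 0.0, m2 = 0.0, n = 0.0;
//     for (double x : {2.0, 4.0, 6.0}) {
//         n  += 1.0;
//         m2 += this->online_variance(x, acc_mean, n);
//     }
//     // acc_mean == 4.0, m2 == 8.0, sample variance == m2 / (n - 1) == 4.0
//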
2648 
2649 int
2650 SumStatsSummary::final_calculation()
2651 {
2652     //
2653     // Finish the mean calculation.
2654     //
2655     for (uint j = 0; j < this->_pop_cnt; j++) {
2656         _num_indv_mean[j] = _num_indv_mean[j] / _n[j];
2657         _p_mean[j]        = _p_mean[j]        / _n[j];
2658         _obs_het_mean[j]  = _obs_het_mean[j]  / _n[j];
2659         _obs_hom_mean[j]  = _obs_hom_mean[j]  / _n[j];
2660         _exp_het_mean[j]  = _exp_het_mean[j]  / _n[j];
2661         _exp_hom_mean[j]  = _exp_hom_mean[j]  / _n[j];
2662         _pi_mean[j]       = _pi_mean[j]       / _n[j];
2663         _fis_mean[j]      = _fis_mean[j]      / _n[j];
2664 
2665         _num_indv_mean_all[j] = _num_indv_mean_all[j] / _n_all[j];
2666         _p_mean_all[j]        = _p_mean_all[j]        / _n_all[j];
2667         _obs_het_mean_all[j]  = _obs_het_mean_all[j]  / _n_all[j];
2668         _obs_hom_mean_all[j]  = _obs_hom_mean_all[j]  / _n_all[j];
2669         _exp_het_mean_all[j]  = _exp_het_mean_all[j]  / _n_all[j];
2670         _exp_hom_mean_all[j]  = _exp_hom_mean_all[j]  / _n_all[j];
2671         _pi_mean_all[j]       = _pi_mean_all[j]       / _n_all[j];
2672         _fis_mean_all[j]      = _fis_mean_all[j]      / _n_all[j];
2673     }
2674 
2675     _locus_len_mean      = _locus_len_mean      / _locus_overlap_n;
2676     _locus_len_mean_all  = _locus_len_mean_all  / _locus_pe_ctg_n;
2677     _overlap_mean        = _overlap_mean        / _locus_overlap_n;
2678     _locus_gt_sites_mean = _locus_gt_sites_mean / _locus_n;
2679 
2680     //
2681     // Finish the online variance calculation.
2682     //
2683     for (uint j = 0; j < this->_pop_cnt; j++) {
2684         _num_indv_var[j] = _num_indv_var[j] / (_n[j] - 1);
2685         _p_var[j]        = _p_var[j]        / (_n[j] - 1);
2686         _obs_het_var[j]  = _obs_het_var[j]  / (_n[j] - 1);
2687         _obs_hom_var[j]  = _obs_hom_var[j]  / (_n[j] - 1);
2688         _exp_het_var[j]  = _exp_het_var[j]  / (_n[j] - 1);
2689         _exp_hom_var[j]  = _exp_hom_var[j]  / (_n[j] - 1);
2690         _pi_var[j]       = _pi_var[j]       / (_n[j] - 1);
2691         _fis_var[j]      = _fis_var[j]      / (_n[j] - 1);
2692 
2693         _num_indv_var_all[j] = _num_indv_var_all[j] / (_n_all[j] - 1);
2694         _p_var_all[j]        = _p_var_all[j]        / (_n_all[j] - 1);
2695         _obs_het_var_all[j]  = _obs_het_var_all[j]  / (_n_all[j] - 1);
2696         _obs_hom_var_all[j]  = _obs_hom_var_all[j]  / (_n_all[j] - 1);
2697         _exp_het_var_all[j]  = _exp_het_var_all[j]  / (_n_all[j] - 1);
2698         _exp_hom_var_all[j]  = _exp_hom_var_all[j]  / (_n_all[j] - 1);
2699         _pi_var_all[j]       = _pi_var_all[j]       / (_n_all[j] - 1);
2700         _fis_var_all[j]      = _fis_var_all[j]      / (_n_all[j] - 1);
2701     }
2702 
2703     _locus_len_var     = _locus_len_var     / (_locus_overlap_n - 1);
2704     _locus_len_var_all = _locus_len_var_all / (_locus_pe_ctg_n  - 1);
2705     _overlap_var       = _overlap_var       / (_locus_overlap_n - 1);
2706 
2707     _locus_gt_sites_var = _locus_gt_sites_var / (_locus_n - 1);
2708 
2709     //
2710     // Calculate the first half of the standard deviation.
2711     //
2712     for (uint j = 0; j < this->_pop_cnt; j++) {
2713         _sq_n[j]     = sqrt(_n[j]);
2714         _sq_n_all[j] = sqrt(_n_all[j]);
2715     }
2716 
2717     return 0;
2718 }
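
//
// The "first half of the standard deviation" computed above is simply sqrt(n):
// write_results() reports each StdErr column as the standard error of the mean,
//     StdErr = sqrt(variance) / sqrt(n)
// e.g., for observed heterozygosity in population j:
//     sqrt(_obs_het_var[j]) / _sq_n[j]
//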
2719 
2720 int
2721 SumStatsSummary::write_results()
2722 {
2723     string   path = out_path + out_prefix + ".sumstats_summary.tsv";
2724     ofstream fh(path.c_str(), ofstream::out);
2725     if (fh.fail()) {
2726         cerr << "Error opening sumstats summary file '" << path << "'\n";
2727         exit(1);
2728     }
2729     fh.precision(fieldw);
2730     fh.setf(std::ios::fixed);
2731 
2732     //
2733     // Write out locus length and overlap statistics for de novo data.
2734     //
2735     ostream os (cout.rdbuf());
2736     os << std::fixed << std::setprecision(2);
2737     if (!loci_ordered && this->_locus_pe_ctg_n > 0) {
2738         os   << "Number of loci with PE contig: " << this->_locus_pe_ctg_n << " ("
2739              << as_percentage(this->_locus_pe_ctg_n, this->_locus_n) << ");\n"
2740              << "  Mean length of loci: " << this->_locus_len_mean_all << "bp "
2741              << "(stderr " << sqrt(this->_locus_len_var_all) / sqrt(this->_locus_n) << ");\n"
2742              << "Number of loci with SE/PE overlap: " << this->_locus_overlap_n << " ("
2743              << as_percentage(this->_locus_overlap_n, this->_locus_n) << ");\n"
2744              << "  Mean length of overlapping loci: " << this->_locus_len_mean << "bp "
2745              << "(stderr " << sqrt(this->_locus_len_var) / sqrt(this->_locus_n) << "); "
2746              << "mean overlap: " << this->_overlap_mean
2747              << "bp (stderr " << sqrt(this->_overlap_var) / sqrt(this->_locus_n) << ");\n";
2748     }
2749     os   << "Mean genotyped sites per locus: " << this->_locus_gt_sites_mean << "bp "
2750          << "(stderr " << sqrt(this->_locus_gt_sites_var) / sqrt(this->_locus_n) << ").\n";
2751 
2752     //
2753     // Write out the per-population means, variances, and standard errors of the summary statistics.
2754     //
2755     fh << "# Variant positions\n"
2756        << "# Pop ID\t"
2757        << "Private\t"
2758        << "Num_Indv\t"
2759        << "Var\t"
2760        << "StdErr\t"
2761        << "P\t"
2762        << "Var\t"
2763        << "StdErr\t"
2764        << "Obs_Het\t"
2765        << "Var\t"
2766        << "StdErr\t"
2767        << "Obs_Hom\t"
2768        << "Var\t"
2769        << "StdErr\t"
2770        << "Exp_Het\t"
2771        << "Var\t"
2772        << "StdErr\t"
2773        << "Exp_Hom\t"
2774        << "Var\t"
2775        << "StdErr\t"
2776        << "Pi\t"
2777        << "Var\t"
2778        << "StdErr\t"
2779        << "Fis\t"
2780        << "Var\t"
2781        << "StdErr\n";
2782 
2783     for (uint j = 0; j < this->_pop_cnt; j++)
2784         fh << mpopi.pops()[j].name << "\t"
2785            << _private_cnt[j]         << "\t"
2786            << _num_indv_mean[j]       << "\t"
2787            << _num_indv_var[j]        << "\t"
2788            << sqrt(_num_indv_var[j]) / _sq_n[j] << "\t"
2789            << _p_mean[j]              << "\t"
2790            << _p_var[j]               << "\t"
2791            << sqrt(_p_var[j])         / _sq_n[j] << "\t"
2792            << _obs_het_mean[j]        << "\t"
2793            << _obs_het_var[j]         << "\t"
2794            << sqrt(_obs_het_var[j])   / _sq_n[j] << "\t"
2795            << _obs_hom_mean[j]        << "\t"
2796            << _obs_hom_var[j]         << "\t"
2797            << sqrt(_obs_hom_var[j])   / _sq_n[j] << "\t"
2798            << _exp_het_mean[j]        << "\t"
2799            << _exp_het_var[j]         << "\t"
2800            << sqrt(_exp_het_var[j])   / _sq_n[j] << "\t"
2801            << _exp_hom_mean[j]        << "\t"
2802            << _exp_hom_var[j]         << "\t"
2803            << sqrt(_exp_hom_var[j])   / _sq_n[j] << "\t"
2804            << _pi_mean[j]             << "\t"
2805            << _pi_var[j]              << "\t"
2806            << sqrt(_pi_var[j])        / _sq_n[j] << "\t"
2807            << _fis_mean[j]            << "\t"
2808            << _fis_var[j]             << "\t"
2809            << sqrt(_fis_var[j])       / _sq_n[j] << "\n";
2810 
2811     cout << "\nPopulation summary statistics (more detail in populations.sumstats_summary.tsv):\n";
2812 
2813     for (uint j = 0; j < this->_pop_cnt; j++)
2814         cout << "  " << mpopi.pops()[j].name << ": "
2815              << setprecision(fieldw) << _num_indv_mean[j] << " samples per locus; "
2816              << "pi: " << _pi_mean[j] << "; "
2817              << setprecision(10) << "all/variant/polymorphic sites: " << _n_all[j] << "/" << _n[j] << "/" << _var_sites[j] << "; "
2818              << setprecision(fieldw) << "private alleles: " << _private_cnt[j] << "\n";
2819 
2820     fh << "# All positions (variant and fixed)\n"
2821        << "# Pop ID\t"
2822        << "Private\t"
2823        << "Sites\t"
2824        << "Variant_Sites\t"
2825        << "Polymorphic_Sites\t"
2826        << "%Polymorphic_Loci\t"
2827        << "Num_Indv\t"
2828        << "Var\t"
2829        << "StdErr\t"
2830        << "P\t"
2831        << "Var\t"
2832        << "StdErr\t"
2833        << "Obs_Het\t"
2834        << "Var\t"
2835        << "StdErr\t"
2836        << "Obs_Hom\t"
2837        << "Var\t"
2838        << "StdErr\t"
2839        << "Exp_Het\t"
2840        << "Var\t"
2841        << "StdErr\t"
2842        << "Exp_Hom\t"
2843        << "Var\t"
2844        << "StdErr\t"
2845        << "Pi\t"
2846        << "Var\t"
2847        << "StdErr\t"
2848        << "Fis\t"
2849        << "Var\t"
2850        << "StdErr\n";
2851 
2852     for (uint j = 0; j < this->_pop_cnt; j++) {
2853         fh << mpopi.pops()[j].name << "\t"
2854            << _private_cnt[j]             << "\t" << setprecision(0)
2855            << _n_all[j]                   << "\t"
2856            << _n[j]                       << "\t"
2857            << _var_sites[j]               << "\t" << setprecision(fieldw)
2858            << _var_sites[j]             / _n_all[j] * 100 << "\t"
2859            << _num_indv_mean_all[j]       << "\t"
2860            << _num_indv_var_all[j]        << "\t"
2861            << sqrt(_num_indv_var_all[j]) / _sq_n_all[j] << "\t"
2862            << _p_mean_all[j]              << "\t"
2863            << _p_var_all[j]               << "\t"
2864            << sqrt(_p_var_all[j])        / _sq_n_all[j] << "\t"
2865            << _obs_het_mean_all[j]        << "\t"
2866            << _obs_het_var_all[j]         << "\t"
2867            << sqrt(_obs_het_var_all[j])  / _sq_n_all[j] << "\t"
2868            << _obs_hom_mean_all[j]        << "\t"
2869            << _obs_hom_var_all[j]         << "\t"
2870            << sqrt(_obs_hom_var_all[j])  / _sq_n_all[j] << "\t"
2871            << _exp_het_mean_all[j]        << "\t"
2872            << _exp_het_var_all[j]         << "\t"
2873            << sqrt(_exp_het_var_all[j])  / _sq_n_all[j] << "\t"
2874            << _exp_hom_mean_all[j]        << "\t"
2875            << _exp_hom_var_all[j]         << "\t"
2876            << sqrt(_exp_hom_var_all[j])  / _sq_n_all[j] << "\t"
2877            << _pi_mean_all[j]             << "\t"
2878            << _pi_var_all[j]              << "\t"
2879            << sqrt(_pi_var_all[j])       / _sq_n_all[j] << "\t"
2880            << _fis_mean_all[j]            << "\t"
2881            << _fis_var_all[j]             << "\t"
2882            << sqrt(_fis_var_all[j])      / _sq_n_all[j] << "\n";
2883     }
2884 
2885     fh.close();
2886 
2887     if (calc_hwp) {
2888         os << "\n"
2889            << "Number of variable sites found to be significantly out of Hardy-Weinberg equilibrium (<" << p_value_cutoff << "):\n";
2890         for (uint j = 0; j < this->_pop_cnt; j++)
2891             os << "  "
2892                << mpopi.pops()[j].name << ": "
2893                << _sig_hwe_dev[j]      << "\n";
2894     }
2895 
2896     return 0;
2897 }
2898 
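//
// Free the per-population accumulator arrays used for the summary calculations.
//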
2899 SumStatsSummary::~SumStatsSummary()
2900 {
2901     delete [] this->_private_cnt;
2902     delete [] this->_n;
2903     delete [] this->_var_sites;
2904     delete [] this->_sq_n;
2905     delete [] this->_num_indv_mean;
2906     delete [] this->_num_indv_acc_mean;
2907     delete [] this->_num_indv_var;
2908     delete [] this->_p_mean;
2909     delete [] this->_p_acc_mean;
2910     delete [] this->_p_var;
2911     delete [] this->_obs_het_mean;
2912     delete [] this->_obs_het_acc_mean;
2913     delete [] this->_obs_het_var;
2914     delete [] this->_obs_hom_mean;
2915     delete [] this->_obs_hom_acc_mean;
2916     delete [] this->_obs_hom_var;
2917     delete [] this->_exp_het_mean;
2918     delete [] this->_exp_het_acc_mean;
2919     delete [] this->_exp_het_var;
2920     delete [] this->_exp_hom_mean;
2921     delete [] this->_exp_hom_acc_mean;
2922     delete [] this->_exp_hom_var;
2923     delete [] this->_pi_mean;
2924     delete [] this->_pi_acc_mean;
2925     delete [] this->_pi_var;
2926     delete [] this->_fis_mean;
2927     delete [] this->_fis_acc_mean;
2928     delete [] this->_fis_var;
2929 
2930     delete [] this->_n_all;
2931     delete [] this->_sq_n_all;
2932     delete [] this->_num_indv_mean_all;
2933     delete [] this->_num_indv_acc_mean_all;
2934     delete [] this->_num_indv_var_all;
2935     delete [] this->_p_mean_all;
2936     delete [] this->_p_acc_mean_all;
2937     delete [] this->_p_var_all;
2938     delete [] this->_obs_het_mean_all;
2939     delete [] this->_obs_het_acc_mean_all;
2940     delete [] this->_obs_het_var_all;
2941     delete [] this->_obs_hom_mean_all;
2942     delete [] this->_obs_hom_acc_mean_all;
2943     delete [] this->_obs_hom_var_all;
2944     delete [] this->_exp_het_mean_all;
2945     delete [] this->_exp_het_acc_mean_all;
2946     delete [] this->_exp_het_var_all;
2947     delete [] this->_exp_hom_mean_all;
2948     delete [] this->_exp_hom_acc_mean_all;
2949     delete [] this->_exp_hom_var_all;
2950     delete [] this->_pi_mean_all;
2951     delete [] this->_pi_acc_mean_all;
2952     delete [] this->_pi_var_all;
2953     delete [] this->_fis_mean_all;
2954     delete [] this->_fis_acc_mean_all;
2955     delete [] this->_fis_var_all;
2956 }
2957 
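//
// Apply a Bonferroni correction to the Fst measures within a sliding window:
// for each variant site, count the variant sites that fall within 3*sigma on
// either side, and keep the site's Fst (in stat[0]) only if its Fisher's exact
// test p-value is below p_value_cutoff divided by that count; otherwise the
// corrected value is set to 0.
//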
2958 int
2959 correct_fst_bonferroni_win(vector<PopPair *> &pairs)
2960 {
2961     int      limit = 3 * sigma;
2962     int      limit_l, limit_u;
2963     double   correction;
2964     uint     cnt, pos_l, pos_u;
2965 
2966     pos_l = 0;
2967     pos_u = 0;
2968 
2969     for (uint pos_c = 0; pos_c < pairs.size(); pos_c++) {
2970         if (pairs[pos_c] == NULL) continue;
2971 
2972         limit_l = pairs[pos_c]->bp - limit > 0 ? pairs[pos_c]->bp - limit : 0;
2973         limit_u = pairs[pos_c]->bp + limit;
2974 
2975         while (pos_l <  pairs.size()) {
2976             if (pairs[pos_l] == NULL) {
2977                 pos_l++;
2978             } else {
2979                 if (pairs[pos_l]->bp < limit_l)
2980                     pos_l++;
2981                 else
2982                     break;
2983             }
2984         }
2985         while (pos_u < pairs.size()) {
2986             if (pairs[pos_u] == NULL) {
2987                 pos_u++;
2988             } else {
2989                 if (pairs[pos_u]->bp < limit_u)
2990                     pos_u++;
2991                 else
2992                     break;
2993             }
2994         }
2995 
2996         cnt = 0;
2997         for (uint i = pos_l; i < pos_u; i++) {
2998             if (pairs[i] == NULL) continue;
2999             cnt++;
3000         }
3001 
3002         correction = p_value_cutoff / cnt;
3003         pairs[pos_c]->stat[0] = pairs[pos_c]->fet_p < correction ? pairs[pos_c]->fst : 0;
3004     }
3005 
3006     return 0;
3007 }
3008 
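//
// Kernel-smooth the nucleotide-level summary statistics for each population
// across the current window of loci.
//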
3009 int
3010 LocusSmoothing::snpstats(const vector<LocBin *> &loci, ofstream &log_fh)
3011 {
3012     for (uint i = 0; i < this->_mpopi->pops().size(); i++) {
3013 
3014         vector<const SumStat *> sites;
3015 
3016         this->_ord_ss->order(sites, loci, i);
3017         this->_ks_ss->smooth(sites);
3018     }
3019 
3020     return 0;
3021 }
3022 
3023 int
3024 LocusSmoothing::hapstats(const vector<LocBin *> &loci, ofstream &log_fh)
3025 {
3026     vector<const LocStat *> sites;
3027 
3028     for (uint i = 0; i < this->_mpopi->pops().size(); i++) {
3029         sites.clear();
3030 
3031         this->_ord_ls->order(sites, loci);
3032         this->_ks_ls->smooth(sites);
3033     }
3034 
3035     return 0;
3036 }
3037 
3038 int
3039 LocusSmoothing::snp_divergence(const vector<LocBin *> &loci, const vector<vector<PopPair **>> &div, ofstream &log_fh)
3040 {
3041     vector<const PopPair *> sites;
3042 
3043     for (uint i = 0; i < div.size(); i++) {
3044         assert(div[i].size() == loci.size());
3045 
3046         sites.clear();
3047 
3048         if (this->_ord_pp->order(sites, loci, div[i]))
3049             this->_ks_pp->smooth(sites);
3050     }
3051 
3052     return 0;
3053 }
3054 
3055 int
3056 LocusSmoothing::hap_divergence(const vector<LocBin *> &loci,
3057                                const vector<vector<HapStat *>> &div,
3058                                const vector<HapStat *> &metadiv,
3059                                ofstream &log_fh)
3060 {
3061     vector<const HapStat *> sites;
3062 
3063     for (uint i = 0; i < div.size(); i++) {
3064         assert(div[i].size() == loci.size());
3065 
3066         sites.clear();
3067 
3068         this->_ord_hs->order(sites, loci, div[i]);
3069         this->_ks_hs->smooth(sites);
3070     }
3071 
3072     //
3073     // Kernel-smooth the haplotype divergence statistics for the metapopulation.
3074     //
3075     sites.clear();
3076 
3077     this->_ord_hs->order(sites, loci, metadiv);
3078     this->_ks_hs->smooth(sites);
3079 
3080     return 0;
3081 }
3082 
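//
// Build approximate null distributions of the kernel-smoothed Fis and pi
// statistics. For every observed SNP density i (number of SNPs per window with
// snp_dist[i] > 0), bootstrap_reps replicates are generated: i SNPs are placed
// at random positions in a simulated window of 6*sigma + 1 bp, allele counts
// are drawn from allele_samples, and the weighted window average is recomputed.
// The resulting distributions are sorted so that p-values can be looked up with
// bootstrap_approximate_pval(). (Note that much of the per-site Fis/pi
// accumulation below is currently commented out.)
//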
3083 int
3084 bootstrap_popstats_approximate_dist(vector<double> &fis_samples,
3085                                     vector<double> &pi_samples,
3086                                     vector<int>  &allele_samples,
3087                                     double *weights, int *snp_dist, int sites_per_snp,
3088                                     map<int, vector<double> > &approx_fis_dist,
3089                                     map<int, vector<double> > &approx_pi_dist)
3090 {
3091     //
3092     // Allocate an array of bootstrap resampling objects.
3093     //
3094     int win_size = 6 * sigma + 1;
3095     int win_cntr = win_size / 2;
3096 
3097     //
3098     // Initialize the Fst distribution map.
3099     //
3100     for (int i = 0; i < max_snp_dist; i++) {
3101         if (snp_dist[i] == 0) continue;
3102 
3103         // cout << "SNP Dist: " << i << " snps occurred " << snp_dist[i] << "\n";
3104         approx_fis_dist[i] = vector<double> ();
3105         approx_fis_dist[i].reserve(bootstrap_reps);
3106 
3107         approx_pi_dist[i] = vector<double> ();
3108         approx_pi_dist[i].reserve(bootstrap_reps);
3109     }
3110 
3111     vector<int> poss;
3112     poss.reserve(max_snp_dist);
3113     double weighted_fis, weighted_pi, sum_fis, sum_pi, final_weight_fis, final_weight_pi;
3114     // int    index_1, index_2;
3115     int    pos, index_3, dist, start, end;
3116     int    half = sites_per_snp / 2;
3117 
3118     for (int i = 0; i < max_snp_dist; i++) {
3119         if (snp_dist[i] == 0) continue;
3120 
3121         cout << "  Generating NULL distribution for " << i << " SNPs...\n";
3122 
3123         // #pragma omp parallel private(poss, pos, index_1, index_2, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
3124         #pragma omp parallel private(poss, pos, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
3125         {
3126             BSample *bs  = new BSample[win_size];
3127 
3128             //
3129             // Populate the BSample objects.
3130             //
3131             for (int n = 0; n < win_size;  n++)
3132                 bs[n].bp = n + 1;
3133 
3134             vector<double> fiss, pis;
3135 
3136             //
3137             // Run the bootstrap replicates.
3138             //
3139             #pragma omp for schedule(dynamic, 1)
3140             for (int j = 0; j < bootstrap_reps; j++) {
3141                 // cout << "    Bootstrap rep " << j << "\n";
3142 
3143                 //
3144                 // First SNP is always placed at the center of the window.
3145                 //
3146                 pos     = win_cntr;
3147                 // index_1 = (int) (fis_samples.size()    * (random() / (RAND_MAX + 1.0)));
3148                 // index_2 = (int) (pi_samples.size()     * (random() / (RAND_MAX + 1.0)));
3149                 index_3 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
3150                 //
3151                 // Fill in the area around the SNP with fixed sites.
3152                 //
3153                 start = pos - half > 0 ? pos - half : 0;
3154                 end   = pos + half < win_size ? pos + half : win_size;
3155                 for (int n = start; n < end; n++) {
3156                     // bs[n].f       = 0;
3157                     // bs[n].pi      = 0;
3158                     bs[n].alleles = bs[pos].alleles;
3159                     poss.push_back(n);
3160                 }
3161                 // bs[pos].f       = fis_samples[index_1];
3162                 // bs[pos].pi      = pi_samples[index_2];
3163                 bs[pos].alleles = allele_samples[index_3];
3164                 // cout << "      Placing SNP at position: " << pos << "; with data from " << index_1 << " filling area from " << start << " to " << end << "\n";
3165 
3166                 //
3167                 // Randomly select the positions and values for each SNP to populate the window
3168                 //
3169                 for (int k = 0; k < i - 1; k++) {
3170                     pos     = (int) (win_size * (random() / (RAND_MAX + 1.0)));
3171                     // index_1 = (int) (fis_samples.size()    * (random() / (RAND_MAX + 1.0)));
3172                     // index_2 = (int) (pi_samples.size()     * (random() / (RAND_MAX + 1.0)));
3173                     index_3 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
3174 
3175                     poss.push_back(pos);
3176                     //
3177                     // Fill in the area around the SNP with fixed sites.
3178                     //
3179                     start = pos - half > 0 ? pos - half : 0;
3180                     end   = pos + half < win_size ? pos + half : win_size;
3181                     for (int n = start; n < end; n++) {
3182                         // bs[n].f       = 0;
3183                         // bs[n].pi      = 0;
3184                         bs[n].alleles = bs[pos].alleles;
3185                         poss.push_back(n);
3186                     }
3187                     // bs[pos].f       = fis_samples[index_1];
3188                     // bs[pos].pi      = pi_samples[index_2];
3189                     bs[pos].alleles = allele_samples[index_3];
3190                     // cout << "      Placing SNP at position: " << pos << "; with data from " << index_1 << " filling area from " << start << " to " << end << "\n";
3191                 }
3192 
3193                 weighted_fis = 0.0;
3194                 sum_fis      = 0.0;
3195                 weighted_pi  = 0.0;
3196                 sum_pi       = 0.0;
3197 
3198                 for (int n = 0; n < win_size; n++) {
3199                     // if (bs[n].pi < 0.0)
3200                     // continue;
3201                     //
3202                     // Calculate weighted Fst at this position.
3203                     //
3204                     dist = bs[n].bp > bs[win_cntr].bp ? bs[n].bp - bs[win_cntr].bp : bs[win_cntr].bp - bs[n].bp;
3205 
3206                     final_weight_fis = (bs[n].alleles - 1) * weights[dist];
3207                     // weighted_fis    += bs[n].f * final_weight_fis;
3208                     sum_fis         += final_weight_fis;
3209 
3210                     final_weight_pi  = (bs[n].alleles - 1) * weights[dist];
3211                     // weighted_pi     += bs[n].pi * final_weight_pi;
3212                     sum_pi          += final_weight_pi;
3213                 }
3214 
3215                 fiss.push_back(weighted_fis / sum_fis);
3216                 pis.push_back(weighted_pi  / sum_pi);
3217                 // cout << "      New weighted fis value: " << weighted_fis / sum_fis << "; size: " << fiss.size() << "\n";
3218 
3219                 for (uint n = 0; n < poss.size(); n++) {
3220                     // bs[poss[n]].f  = 0.0;
3221                     // bs[poss[n]].pi = -1.0;
3222                 }
3223                 poss.clear();
3224             }
3225 
3226 //          #pragma omp critical
3227 //          {
3228 //              vector<double> &f = approx_fis_dist[i];
3229 //              for (uint n = 0; n < fiss.size(); n++)
3230 //                  f.push_back(fiss[n]);
3231 //              vector<double> &p = approx_pi_dist[i];
3232 //              for (uint n = 0; n < pis.size(); n++)
3233 //                  p.push_back(pis[n]);
3234 //          }
3235 
3236             delete [] bs;
3237         }
3238 
3239         sort(approx_fis_dist[i].begin(), approx_fis_dist[i].end());
3240         sort(approx_pi_dist[i].begin(),  approx_pi_dist[i].end());
3241     }
3242 
3243     return 0;
3244 }
3245 
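//
// Same approach as bootstrap_popstats_approximate_dist(), but for the
// kernel-smoothed Fst statistic: generate an approximate null distribution of
// smoothed Fst for each observed SNP count per window.
//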
3246 int
3247 bootstrap_fst_approximate_dist(vector<double> &fst_samples,
3248                                vector<int>  &allele_samples,
3249                                double *weights, int *snp_dist,
3250                                map<int, vector<double> > &approx_fst_dist)
3251 {
3252     //
3253     // Allocate an array of bootstrap resampling objects.
3254     //
3255     int win_size = 6 * sigma + 1;
3256     int win_cntr = win_size / 2;
3257 
3258     //
3259     // Initialize the Fst distribution map.
3260     //
3261     for (int i = 0; i < max_snp_dist; i++) {
3262         if (snp_dist[i] == 0) continue;
3263 
3264         // cout << "SNP Dist: " << i << " snps occurred " << snp_dist[i] << "\n";
3265         approx_fst_dist[i] = vector<double> ();
3266         approx_fst_dist[i].reserve(bootstrap_reps);
3267     }
3268 
3269     vector<int> poss;
3270     poss.reserve(max_snp_dist);
3271     double weighted_fst, sum, final_weight;
3272     //int    index_1;
3273     int    pos, index_2, dist;
3274 
3275     for (int i = 0; i < max_snp_dist; i++) {
3276         if (snp_dist[i] == 0) continue;
3277 
3278         cout << "  Generating NULL distribution for " << i << " SNPs...\n";
3279 
3280         // #pragma omp parallel private(poss, pos, index_1, index_2, dist, sum, weighted_fst, final_weight)
3281         #pragma omp parallel private(poss, pos, index_2, dist, sum, weighted_fst, final_weight)
3282         {
3283             BSample *bs  = new BSample[win_size];
3284 
3285             //
3286             // Populate the BSample objects.
3287             //
3288             for (int n = 0; n < win_size;  n++)
3289                 bs[n].bp = n + 1;
3290 
3291             vector<double> fsts;
3292 
3293             //
3294             // Run the bootstrap replicates.
3295             //
3296             #pragma omp for schedule(dynamic, 1)
3297             for (int j = 0; j < bootstrap_reps; j++) {
3298                 // cout << "Bootstrap rep " << j << "\n";
3299 
3300                 //
3301                 // First SNP is always placed at the center of the window.
3302                 //
3303                 pos     = win_cntr;
3304                 // index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
3305                 index_2 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
3306                 // bs[pos].f       = fst_samples[index_1];
3307                 bs[pos].alleles = allele_samples[index_2];
3308 
3309                 //
3310                 // Randomly select the positions and values for each SNP to populate the window
3311                 //
3312                 for (int k = 0; k < i - 1; k++) {
3313                     pos     = (int) (win_size * (random() / (RAND_MAX + 1.0)));
3314                     // index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
3315                     index_2 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
3316                     // bs[pos].f       = fst_samples[index_1];
3317                     // bs[pos].alleles = allele_samples[index_2];
3318                     // cout << "  " << j << ": Placing SNP at position: " << pos << " with data from index " << index_1 << "\n";
3319 
3320                     poss.push_back(pos);
3321                 }
3322 
3323                 weighted_fst = 0.0;
3324                 sum          = 0.0;
3325 
3326                 for (int n = 0; n < win_size; n++) {
3327                     // if (bs[n].f == 0.0)
3328                     // continue;
3329                     //
3330                     // Calculate weighted Fst at this position.
3331                     //
3332                     dist = bs[n].bp > bs[win_cntr].bp ? bs[n].bp - bs[win_cntr].bp : bs[win_cntr].bp - bs[n].bp;
3333 
3334                     final_weight  = (bs[n].alleles - 1) * weights[dist];
3335                     // weighted_fst += bs[n].f * final_weight;
3336                     sum          += final_weight;
3337                 }
3338 
3339                 fsts.push_back(weighted_fst / sum);
3340                 // cout << "    New weighted Fst value: " << weighted_fst / sum << "; size: " << fsts.size() << "\n";
3341 
3342                 // for (uint n = 0; n < poss.size(); n++)
3343                 // bs[poss[n]].f = 0.0;
3344                 poss.clear();
3345             }
3346 
3347 //          #pragma omp critical
3348 //          {
3349 //              vector<double> &f = approx_fst_dist[i];
3350 //              for (uint n = 0; n < fsts.size(); n++)
3351 //                  f.push_back(fsts[n]);
3352 //          }
3353 
3354             delete [] bs;
3355         }
3356 
3357         sort(approx_fst_dist[i].begin(), approx_fst_dist[i].end());
3358     }
3359 
3360     return 0;
3361 }
3362 
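//
// Look up an approximate p-value: find the rank of the observed statistic in
// the sorted null distribution for this SNP count and return the upper-tail
// fraction, 1 - (rank / N). If no distribution exists for this SNP count,
// return 1.0.
//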
3363 double
3364 bootstrap_approximate_pval(int snp_cnt, double stat, map<int, vector<double> > &approx_dist)
3365 {
3366     if (approx_dist.count(snp_cnt) == 0)
3367         return 1.0;
3368 
3369     vector<double>::iterator up;
3370     vector<double> &dist = approx_dist[snp_cnt];
3371     double pos;
3372 
3373     up  = upper_bound(dist.begin(), dist.end(), stat);
3374 
3375     if (up == dist.begin())
3376         pos = 1;
3377     else if (up == dist.end())
3378         pos = dist.size();
3379     else
3380         pos = up - dist.begin() + 1;
3381 
3382     double res = 1.0 - (pos / (double) dist.size());
3383 
3384     // cout << "Generated Approx Smoothed Fst Distribution:\n";
3385     // for (uint n = 0; n < dist.size(); n++)
3386     //  cout << "  n: " << n << "; Fst: " << dist[n] << "\n";
3387 
3388     // cout << "Comparing Fst value: " << stat
3389     //   << " at position " << (up - dist.begin()) << " out of "
3390     //   << dist.size() << " positions (converted position: " << pos << "); pvalue: " << res << ".\n";
3391 
3392     return res;
3393 }
3394 
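//
// Load a blacklist of catalog locus IDs: one integer per line; blank lines and
// lines starting with '#' are skipped, and Windows-style line endings are
// tolerated.
//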
3395 int
3396 LocusFilter::load_blacklist(string path)
3397 {
3398     char     line[id_len];
3399     ifstream fh(path.c_str(), ifstream::in);
3400 
3401     if (fh.fail()) {
3402         cerr << "Error opening white/black list file '" << path << "'\n";
3403         exit(1);
3404     }
3405 
3406     size_t line_num = 0;
3407     while (fh.getline(line, id_len)) {
3408         ++line_num;
3409 
3410         //
3411         // Skip blank & commented lines; correct Windows-style line endings.
3412         //
3413         size_t len = strlen(line);
3414         if (len == 0) {
3415             continue;
3416         } else if (line[len-1] == '\r') {
3417             line[len-1] = '\0';
3418             --len;
3419             if (len == 0)
3420                 continue;
3421         }
3422         char* p = line;
3423         while (isspace(*p) && *p != '\0')
3424             ++p;
3425         if (*p == '#')
3426             continue;
3427 
3428         //
3429         // Parse the blacklist
3430         //
3431         char* e;
3432         int marker = (int) strtol(line, &e, 10);
3433         if (*e == '\0') {
3434             this->_blacklist.insert(marker);
3435         } else {
3436             cerr << "Error: Unable to parse blacklist '" << path << "' at line " << line_num << ".\n";
3437             throw exception();
3438         }
3439     }
3440 
3441     fh.close();
3442 
3443     if (this->_blacklist.size() == 0) {
3444         cerr << "Error: Unable to load any markers from '" << path << "'\n";
3445         exit(1);
3446     }
3447 
3448     return (int) this->_blacklist.size();
3449 }
3450 
3451 int load_marker_list(string path, set<int> &list) {
3452     char     line[id_len];
3453     ifstream fh(path.c_str(), ifstream::in);
3454 
3455     if (fh.fail()) {
3456         cerr << "Error opening white/black list file '" << path << "'\n";
3457         exit(1);
3458     }
3459 
3460     size_t line_num = 0;
3461     while (fh.getline(line, id_len)) {
3462         ++line_num;
3463 
3464         //
3465         // Skip blank & commented lines; correct Windows-style line endings.
3466         //
3467         size_t len = strlen(line);
3468         if (len == 0) {
3469             continue;
3470         } else if (line[len-1] == '\r') {
3471             line[len-1] = '\0';
3472             --len;
3473             if (len == 0)
3474                 continue;
3475         }
3476         char* p = line;
3477         while (isspace(*p) && *p != '\0')
3478             ++p;
3479         if (*p == '#')
3480             continue;
3481 
3482         //
3483         // Parse the marker list
3484         //
3485         char* e;
3486         int marker = (int) strtol(line, &e, 10);
3487         if (*e == '\0') {
3488             list.insert(marker);
3489         } else {
3490             cerr << "Error: Unable to parse marker list '" << path << "' at line " << line_num << ".\n";
3491             throw exception();
3492         }
3493     }
3494 
3495     fh.close();
3496 
3497     if (list.size() == 0) {
3498         cerr << "Error: Unable to load any markers from '" << path << "'\n";
3499         exit(1);
3500     }
3501 
3502     return 0;
3503 }
3504 
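//
// Load a whitelist of catalog loci and, optionally, specific SNP columns
// within them. Each line is either '<locus ID>' alone or
// '<locus ID><tab><SNP column>', for example (hypothetical IDs):
//   1916
//   2034<tab>12
//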
3505 int
3506 LocusFilter::load_whitelist(string path)
3507 {
3508     char     line[id_len];
3509     ifstream fh(path.c_str(), ifstream::in);
3510 
3511     if (fh.fail()) {
3512         cerr << "Error opening white/black list file '" << path << "'\n";
3513         exit(1);
3514     }
3515 
3516     vector<string> parts;
3517     uint col;
3518     char *e;
3519 
3520     uint line_num = 0;
3521     while (fh.getline(line, id_len)) {
3522         ++line_num;
3523 
3524         //
3525         // Skip blank & commented lines; correct Windows-style line endings.
3526         //
3527         size_t len = strlen(line);
3528         if (len == 0) {
3529             continue;
3530         } else if (line[len-1] == '\r') {
3531             line[len-1] = '\0';
3532             --len;
3533             if (len == 0)
3534                 continue;
3535         }
3536         char* p = line;
3537         while (isspace(*p) && *p != '\0')
3538             ++p;
3539         if (*p == '#')
3540             continue;
3541 
3542         //
3543         // Parse the whitelist; we expect:
3544         // <marker>[<tab><snp column>]
3545         //
3546         parse_tsv(line, parts);
3547 
3548         if (parts.size() > 2) {
3549             cerr << "Error: Too many columns in whitelist '" << path << "' at line " << line_num << "\n";
3550             exit(1);
3551         }
3552         int marker = (int) strtol(parts[0].c_str(), &e, 10);
3553         if (*e != '\0') {
3554             cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3555             exit(1);
3556         }
3557         this->_whitelist[marker];
3558         if (parts.size() == 2) {
3559             col = (int) strtol(parts[1].c_str(), &e, 10);
3560             if (*e != '\0') {
3561                 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3562                 exit(1);
3563             }
3564             this->_whitelist[marker].insert(col);
3565         }
3566     }
3567     if (this->_whitelist.size() == 0) {
3568         cerr << "Error: Unable to load any markers from '" << path << "'\n";
3569         help();
3570     }
3571     return (int) this->_whitelist.size();
3572 }
3573 
3574 int load_marker_column_list(string path, map<int, set<int> > &list) {
3575     char     line[id_len];
3576     ifstream fh(path.c_str(), ifstream::in);
3577 
3578     if (fh.fail()) {
3579         cerr << "Error opening white/black list file '" << path << "'\n";
3580         exit(1);
3581     }
3582 
3583     vector<string> parts;
3584     uint col;
3585     char *e;
3586 
3587     uint line_num = 1;
3588     while (fh.getline(line, id_len)) {
3589 
3590         //
3591         // Skip blank & commented lines; correct Windows-style line endings.
3592         //
3593         size_t len = strlen(line);
3594         if (len == 0) {
3595             continue;
3596         } else if (line[len-1] == '\r') {
3597             line[len-1] = '\0';
3598             --len;
3599             if (len == 0)
3600                 continue;
3601         }
3602         char* p = line;
3603         while (isspace(*p) && *p != '\0')
3604             ++p;
3605         if (*p == '#')
3606             continue;
3607 
3608         //
3609         // Parse the whitelist; we expect:
3610         // <marker>[<tab><snp column>]
3611         //
3612         parse_tsv(line, parts);
3613 
3614         if (parts.size() > 2) {
3615             cerr << "Error: Too many columns in whitelist '" << path << "' at line " << line_num << "\n";
3616             exit(1);
3617 
3618         } else if (parts.size() == 2) {
3619             int marker = (int) strtol(parts[0].c_str(), &e, 10);
3620             if (*e != '\0') {
3621                 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3622                 exit(1);
3623             }
3624             col = (int) strtol(parts[1].c_str(), &e, 10);
3625             if (*e != '\0') {
3626                 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3627                 exit(1);
3628             }
3629             list[marker].insert(col);
3630 
3631         } else {
3632             int marker = (int) strtol(parts[0].c_str(), &e, 10);
3633             if (*e != '\0') {
3634                 cerr << "Error: Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
3635                 exit(1);
3636             }
3637             list.insert(make_pair(marker, set<int>()));
3638         }
3639 
3640         line_num++;
3641     }
3642 
3643     fh.close();
3644 
3645     if (list.size() == 0) {
3646         cerr << "Error: Unable to load any markers from '" << path << "'\n";
3647         help();
3648     }
3649 
3650     return 0;
3651 }
3652 
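//
// Comparator for sorting (haplotype, count) pairs in order of decreasing count.
//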
3653 bool
3654 hap_compare(const pair<string, int>& a, const pair<string, int>& b)
3655 {
3656     return (a.second > b.second);
3657 }
3658 
3659 void
3660 output_parameters(ostream &fh)
3661 {
3662     fh << "populations parameters selected:\n";
3663     if (input_mode == InputMode::vcf)
3664         fh << "  Input mode: VCF\n";
3665     fh
3666         << "  Percent samples limit per population: " << min_samples_per_pop << "\n"
3667         << "  Locus Population limit: " << min_populations << "\n"
3668         << "  Percent samples overall: " << min_samples_overall << "\n"
3669         << "  Minor allele frequency cutoff: " << minor_allele_freq << "\n"
3670         << "  Maximum observed heterozygosity cutoff: " << max_obs_het << "\n"
3671         << "  Applying Fst correction: ";
3672     switch(fst_correction) {
3673     case p_value:
3674         fh << "P-value correction.\n";
3675         break;
3676     case bonferroni_win:
3677         fh << "Bonferroni correction within sliding window.\n";
3678         break;
3679     case bonferroni_gen:
3680         fh << "Bonferroni correction across genome wide sites.\n";
3681         break;
3682     case no_correction:
3683         fh << "none.\n";
3684         break;
3685     }
3686     fh << "  Pi/Fis kernel smoothing: " << (smooth_popstats == true ? "on" : "off") << "\n"
3687        << "  Fstats kernel smoothing: " << (smooth_fstats   == true ? "on" : "off") << "\n"
3688        << "  Bootstrap resampling: ";
3689     if (bootstrap)
3690         fh << "on, " << (bootstrap_type == bs_exact ? "exact; " : "approximate; ") << bootstrap_reps << " repetitions\n";
3691     else
3692         fh << "off\n";
3693 
3694     fh << "\n";
3695 }
3696 
3697 int
3698 
3699 parse_command_line(int argc, char* argv[])
3700 {
3701     bool no_hap_exports = false;
3702     char* tmp_str;
3703     while (1) {
3704         static struct option long_options[] = {
3705             {"help",           no_argument,       NULL, 'h'},
3706             {"version",        no_argument,       NULL, 'v'},
3707             {"quiet",          no_argument,       NULL, 'q'},
3708             {"verbose",        no_argument,       NULL, 'd'},
3709             {"vcf",            no_argument,       NULL, 1004}, {"vcf-haps", no_argument, NULL, 1004}, {"vcf-haplotypes", no_argument, NULL, 1004}, {"vcf_haps", no_argument, NULL, 1004}, {"vcf_haplotypes", no_argument, NULL, 1004},
3710             {"fasta-loci",     no_argument,       NULL, 1006}, {"fasta_loci",     no_argument,       NULL, 1006},
3711             {"fasta-samples",  no_argument,       NULL, 'J'}, {"fasta-strict", no_argument, NULL, 'J'}, {"fasta_samples",  no_argument,       NULL, 'J'}, {"fasta_strict", no_argument, NULL, 'J'},
3712             {"fasta-samples-raw", no_argument,    NULL, 'F'}, {"fasta", no_argument, NULL, 'F'}, {"fasta_samples_raw", no_argument,    NULL, 'F'},
3713             {"structure",      no_argument,       NULL, 'S'},
3714             {"radpainter",     no_argument,       NULL, 1015}, {"fineRADstructure", no_argument, NULL, 1015},
3715             {"fastphase",      no_argument,       NULL, 'A'},
3716             {"phase",          no_argument,       NULL, 'C'},
3717             {"plink",          no_argument,       NULL, 'K'},
3718             {"genepop",        no_argument,       NULL, 1010},
3719             {"genepop-haps-3digits", no_argument, NULL, 1011},
3720             {"phylip",         no_argument,       NULL, 'Y'},
3721             {"phylip-var",     no_argument,       NULL, 'L'}, {"phylip_var",     no_argument,       NULL, 'L'},
3722             {"phylip-var-all", no_argument,       NULL, 'T'}, {"phylip_var_all", no_argument,       NULL, 'T'},
3723             {"hzar",           no_argument,       NULL, 'Z'},
3724             {"treemix",        no_argument,       NULL, 'U'},
3725             {"merge-sites",    no_argument,       NULL, 'D'}, {"merge_sites",    no_argument,       NULL, 'D'},
3726             {"sigma",          required_argument, NULL, 1005},
3727             {"threads",        required_argument, NULL, 't'},
3728             {"in-path",        required_argument, NULL, 'P'}, {"in_path",        required_argument, NULL, 'P'},
3729             {"v1",             no_argument,       NULL, 2000},
3730             {"out-path",       required_argument, NULL, 'O'}, {"out_path",       required_argument, NULL, 'O'},
3731             {"in-vcf",         required_argument, NULL, 'V'}, {"in_vcf",         required_argument, NULL, 'V'},
3732             {"min-samples-overall", required_argument, NULL, 'R'},
3733             {"min-samples-per-pop", required_argument, NULL, 'r'}, {"progeny", required_argument, NULL, 'r'},
3734             {"min-populations",   required_argument, NULL, 'p'}, {"min_populations",   required_argument, NULL, 'p'},
3735             {"filter-haplotype-wise", no_argument, NULL, 'H'},
3736             {"renz",           required_argument, NULL, 'e'},
3737             {"popmap",         required_argument, NULL, 'M'},
3738             {"no-popmap",      no_argument,       NULL, 1017}, // Negates a previous -M/--popmap
3739             {"whitelist",      required_argument, NULL, 'W'},
3740             {"blacklist",      required_argument, NULL, 'B'},
3741             {"batch-size",     required_argument, NULL, 1999}, {"batch_size",     required_argument, NULL, 1999},
3742             {"dbg-min-gt-depth", required_argument, NULL, 2001},
3743             {"write-single-snp",  no_argument,       NULL, 'I'}, {"write_single_snp",  no_argument,       NULL, 'I'},
3744             {"write-random-snp",  no_argument,       NULL, 'j'}, {"write_random_snp",  no_argument,       NULL, 'j'},
3745             {"no-hap-exports",    no_argument,       NULL, 1012}, {"no_hap_exports",    no_argument,       NULL, 1012},
3746             {"ordered-export",    no_argument,       NULL, 1002}, {"ordered_export",    no_argument,       NULL, 1002},
3747             {"smooth",            no_argument,       NULL, 'k'},
3748             {"smooth-fstats",     no_argument,       NULL, 1007}, {"smooth_fstats",     no_argument,       NULL, 1007},
3749             {"smooth-popstats",   no_argument,       NULL, 1008}, {"smooth_popstats",   no_argument,       NULL, 1008},
3750             {"fstats",            no_argument,       NULL, '6'},
3751             {"hwe",               no_argument,       NULL, 1014},
3752             {"log-fst-comp",      no_argument,       NULL, 'l'}, {"log_fst_comp",      no_argument,       NULL, 'l'},
3753             {"bootstrap-type",    required_argument, NULL, 1001}, {"bootstrap_type",    required_argument, NULL, 1001},
3754             {"bootstrap-reps",    required_argument, NULL, 1003}, {"bootstrap_reps",    required_argument, NULL, 1003},
3755             {"bootstrap-wl",      required_argument, NULL, 'Q'}, {"bootstrap_wl",      required_argument, NULL, 'Q'},
3756             {"bootstrap",         no_argument,       NULL, '1'},
3757             {"bootstrap-fst",     no_argument,       NULL, '2'}, {"bootstrap_fst",     no_argument,       NULL, '2'},
3758             {"bootstrap-phist",   no_argument,       NULL, '3'}, {"bootstrap_phist",   no_argument,       NULL, '3'},
3759             {"bootstrap-div",     no_argument,       NULL, '4'}, {"bootstrap_div",     no_argument,       NULL, '4'},
3760             {"bootstrap-pifis",   no_argument,       NULL, '5'}, {"bootstrap_pifis",   no_argument,       NULL, '5'},
3761             {"min-maf",           required_argument, NULL, 'a'}, {"min_maf",           required_argument, NULL, 'a'},
3762             {"min-mac",           required_argument, NULL, 1016}, {"min_mac",           required_argument, NULL, 1016},
3763             {"max-obs-het",       required_argument, NULL, 1013}, {"max_obs_het",       required_argument, NULL, 1013},
3764             {"merge-prune-lim",   required_argument, NULL, 'i'}, {"merge_prune_lim",   required_argument, NULL, 'i'},
3765             {"fst-correction",    required_argument, NULL, 'f'}, {"fst_correction",    required_argument, NULL, 'f'},
3766             {"p-value-cutoff",    required_argument, NULL, 'u'}, {"p_value_cutoff",    required_argument, NULL, 'u'},
3767             {"debug-flags",       required_argument, NULL, 1000}, {"debug_flags",       required_argument, NULL, 1000},
3768             {"lnl-lim",           required_argument, NULL, 7000}, {"lnl_lim",           required_argument, NULL, 7000}, // (deprecated)
3769             {0, 0, 0, 0}
3770         };
3771 
3772         // getopt_long stores the option index here.
3773         int c = getopt_long(argc, argv, "ACDFHJKLNSTUV:YZ123456dhjklnqva:c:e:f:i:o:p:r:t:u:w:B:I:M:O:P:R:Q:W:", long_options, NULL);
3774 
3775         // Detect the end of the options.
3776         if (c == -1)
3777             break;
3778 
3779         switch (c) {
3780         case 'v':
3781             version();
3782             exit(1);
3783             break;
3784         case 'h':
3785             help();
3786             break;
3787         case 'q':
3788             quiet = true;
3789             break;
3790         case 'd':
3791             verbose = true;
3792             break;
3793         case 't':
3794             num_threads = atoi(optarg);
3795             break;
3796         case 'P':
3797             in_path = optarg;
3798             if (!in_path.empty() && in_path.back() != '/')
3799                 in_path += "/";
3800             break;
3801         case 2000: //v1
3802             input_mode = InputMode::stacks;
3803             break;
3804         case 1999:
3805             batch_size = is_integer(optarg);
3806             break;
3807         case 2001:
3808             min_gt_depth = stol(optarg);
3809             break;
3810         case 'O':
3811             out_path = optarg;
3812             if (!out_path.empty() && out_path.back() != '/')
3813                 out_path += "/";
3814             break;
3815         case 'V':
3816             in_vcf_path = optarg;
3817             break;
3818         case 'M': // popmap
3819             pmap_path = optarg;
3820             break;
3821         case 1017: //no-popmap
3822             pmap_path.clear();
3823             break;
3824         case 'D':
3825             merge_sites = true;
3826             break;
3827         case 'i':
3828             merge_prune_lim = is_double(optarg);
3829             if (merge_prune_lim > 1.0)
3830                 merge_prune_lim = merge_prune_lim / 100;
3831 
3832             if (merge_prune_lim < 0 || merge_prune_lim > 1.0) {
3833                 cerr << "Error: Unable to parse the merge sites pruning limit.\n";
3834                 help();
3835             }
3836             break;
3837         case 1013:
3838             max_obs_het = is_double(optarg);
3839             if (max_obs_het > 1)
3840                 max_obs_het = max_obs_het / 100;
3841 
3842             if (max_obs_het < 0 || max_obs_het > 1.0) {
3843                 cerr << "Error: Unable to parse the maximum observed heterozygosity.\n";
3844                 help();
3845             }
3846             break;
3847         case 'r':
3848             min_samples_per_pop = stod(optarg);
3849             if (!std::isfinite(min_samples_per_pop)) {
3850                 cerr << "Error: Invalid value for --min-samples-per-pop, \""
3851                      << optarg << "\"\n";
3852                 help();
3853             }
3854             if (min_samples_per_pop > 1.0)
3855                 min_samples_per_pop /= 100.0;
3856             if (min_samples_per_pop < 0.0 || min_samples_per_pop > 1.0) {
3857                 cerr << "Error: Invalid value for --min-samples-per-pop, \""
3858                      << optarg << "\"\n";
3859                 help();
3860             }
3861             break;
3862         case 'R':
3863             min_samples_overall = stod(optarg);
3864             if (!std::isfinite(min_samples_overall)) {
3865                 cerr << "Error: Invalid value for --min-samples-overall, \""
3866                      << optarg << "\"\n";
3867                 help();
3868             }
3869             if (min_samples_overall > 1.0)
3870                 min_samples_overall /= 100.0;
3871             if (min_samples_overall < 0.0 || min_samples_overall > 1.0) {
3872                 cerr << "Error: Invalid value for --min-samples-overall, \""
3873                      << optarg << "\"\n";
3874                 help();
3875             }
3876             break;
3877         case 'p':
3878             min_populations = stoi(optarg);
3879             break;
3880         case 'H':
3881             filter_haplotype_wise = true;
3882             break;
3883         case 'k':
3884             smooth_popstats = true;
3885             smooth_fstats   = true;
3886             calc_fstats     = true;
3887             ordered_export  = true;
3888             break;
3889         case 1007:
3890             smooth_fstats   = true;
3891             calc_fstats     = true;
3892             ordered_export  = true;
3893             break;
3894         case 1008:
3895             smooth_popstats = true;
3896             ordered_export  = true;
3897             break;
3898         case '6':
3899             calc_fstats = true;
3900             break;
3901         case 'l':
3902             log_fst_comp = true;
3903             break;
3904         case '1':
3905             ordered_export  = true;
3906             bootstrap       = true;
3907             bootstrap_fst   = true;
3908             bootstrap_phist = true;
3909             bootstrap_pifis = true;
3910             bootstrap_div   = true;
3911             break;
3912         case '2':
3913             ordered_export  = true;
3914             bootstrap_fst   = true;
3915             break;
3916         case '3':
3917             ordered_export  = true;
3918             bootstrap_phist = true;
3919             break;
3920         case '4':
3921             ordered_export  = true;
3922             bootstrap_div = true;
3923             break;
3924         case '5':
3925             ordered_export  = true;
3926             bootstrap_pifis = true;
3927             break;
3928         case 1001:
3929             if (strcasecmp(optarg, "exact") == 0)
3930                 bootstrap_type = bs_exact;
3931             else if (strcasecmp(optarg, "approx") == 0)
3932                 bootstrap_type = bs_approx;
3933             else {
3934                 cerr << "Error: Unknown bootstrap type specified '" << optarg << "'\n";
3935                 help();
3936             }
3937             break;
3938         case 1003:
3939             ordered_export  = true;
3940             bootstrap = true;
3941             bootstrap_reps = atoi(optarg);
3942             break;
3943         case 'Q':
3944             bs_wl_file = optarg;
3945             bootstrap_wl = true;
3946             break;
3947         case 7000:
3948             cerr << "WARNING: --lnl-lim is deprecated and has no effect.\n";
3949             break;
3950         case 'I':
3951             write_single_snp = true;
3952             no_hap_exports = true;
3953             break;
3954         case 'j':
3955             write_random_snp = true;
3956             no_hap_exports = true;
3957             break;
3958         case 1012: // --no-hap-exports
3959             no_hap_exports = true;
3960             break;
3961         case 1002:
3962             ordered_export = true;
3963             break;
3964         case 1004: // --vcf
3965             add_export<VcfExport>();
3966             add_export<VcfHapsExport>();
3967             break;
3968         case 1006: // --fasta-loci
3969             add_export<FastaLociExport>();
3970             break;
3971         case 'F':
3972             add_export<FastaRawExport>();
3973             break;
3974         case 'J':
3975             add_export<FastaSamplesExport>();
3976             break;
3977         case 1010: // --genepop
3978             add_export<GenePopExport>();
3979             add_export<GenePopHapsExport>();
3980             break;
3981         case 1011: //genepop-haps-3digits
3982             add_export<GenePopHapsExport>();
3983             dynamic_cast<GenePopHapsExport&>(**find_export<GenePopHapsExport>()).set_digits(3);
3984             break;
3985         case 1015: // --radpainter,--fineradstructure
3986             add_export<FineRADStructureExport>();
3987             break;
3988         case 1014: // --hwe
3989             calc_hwp = true;
3990             break;
3991         case 'S':
3992             add_export<StructureExport>();
3993             break;
3994         case 'A':
3995             cerr << "BETA: Ignoring --fastphase output request, which is not currently implemented.\n";
3996             break;
3997         case 'C':
3998             cerr << "BETA: Ignoring --phase output request, which is not currently implemented.\n";
3999             break;
4000         case 'K': // --plink
4001             add_export<PlinkExport>();
4002             break;
4003         case 'Z':
4004             add_export<HzarExport>();
4005             break;
4006         case 'Y':
4007             add_export<PhylipFixedExport>();
4008             break;
4009         case 'L':
4010             add_export<PhylipVarExport>();
4011             break;
4012         case 'T':
4013             cerr << "BETA: Ignoring --phylip-var-all output request, which is not currently implemented.\n";
4014             break;
4015         case 'U':
4016             add_export<TreemixExport>();
4017             break;
4018         case 'W':
4019             wl_file = optarg;
4020             break;
4021         case 'B':
4022             bl_file = optarg;
4023             break;
4024         case 'a':
4025             minor_allele_freq = atof(optarg);
4026             if (minor_allele_freq > 1)
4027                 minor_allele_freq = minor_allele_freq / 100;
4028 
4029             if (minor_allele_freq < 0 || minor_allele_freq > 0.5) {
4030                 cerr << "Error: Unable to parse the minor allele frequency.\n";
4031                 help();
4032             }
4033             break;
4034         case 1016:
4035             minor_allele_cnt = strtol(optarg, &tmp_str, 10);
4036             if (tmp_str == optarg || *tmp_str != '\0' || minor_allele_cnt < 0) {
4037                 cerr << "Error: Unable to parse the minor allele count from '" << optarg << "'.\n";
4038                 help();
4039             }
4040             break;
4041         case 'f':
4042             if (strcasecmp(optarg, "p_value") == 0)
4043                 fst_correction = p_value;
4044             else if (strcasecmp(optarg, "bonferroni_win") == 0)
4045                 fst_correction = bonferroni_win;
4046             else if (strcasecmp(optarg, "bonferroni_gen") == 0)
4047                 fst_correction = bonferroni_gen;
4048             else {
4049                 cerr << "Error: Unknown Fst correction specified '" << optarg << "'\n";
4050                 help();
4051             }
4052             break;
4053         case 'u':
4054             p_value_cutoff = atof(optarg);
4055             break;
4056         case 'e':
4057             enz = optarg;
4058             enz.at(0) = tolower(enz.at(0));
4059             if (renz.count(enz) == 0) {
4060                 cerr << "Error: Unrecognized restriction enzyme specified: '" << enz.c_str() << "'.\n";
4061                 help();
4062             }
4063             break;
4064         case 1005: //sigma
4065             sigma = atof(optarg);
4066             break;
4067         case '?':
4068             // getopt_long already printed an error message.
4069             help();
4070             break;
4071         case 1000:
4072         {
4073             static const set<string> known_debug_flags = {"VCFCOMP"};
4074             stringstream ss (optarg);
4075             string s;
4076             while (getline(ss, s, ',')) {
4077                 if (known_debug_flags.count(s)) {
4078                     debug_flags.insert(s);
4079                 } else {
4080                     cerr << "DEBUG: Error: Unknown debug flag '" << s << "'.\n";
4081                     return -1;
4082                 }
4083             }
4084             cout << "DEBUG: Debug flag(s) : '" << optarg << "'.\n";
4085 
4086             if (debug_flags.count("VCFCOMP") && not write_random_snp) {
4087                 write_single_snp = true;
4088                 cout << "DEBUG: Added --write-single-snp.\n";
4089             }
4090 
4091             break;
4092         }
4093         default:
4094             cerr << "Error: Unknown command line option: '" << (char) c << "'\n";
4095             help();
4096             exit(1);
4097         }
4098     }
4099 
4100     if (optind < argc) {
4101         cerr << "Error: Failed to parse command line: '" << argv[optind] << "' was interpreted as a positional argument, but no positional arguments are expected.\n";
4102         help();
4103     }
4104 
4105     //
4106     // Check argument constraints.
    //
    if (input_mode == InputMode::stacks && in_path.empty()) {
        cerr << "Error: Option --v1 requires -P to be given.\n";
        help();
    }

    if (!in_path.empty() && !in_vcf_path.empty()) {
        cerr << "Error: Please specify either '-P/--in-path' or '-V/--in-vcf', not both.\n";
        help();
    } else if (in_path.empty() && in_vcf_path.empty()) {
        cerr << "Error: One of '-P/--in-path' or '-V/--in-vcf' is required.\n";
        help();
    } else if (not in_vcf_path.empty()) {
        input_mode = InputMode::vcf;
    }

    if (input_mode == InputMode::stacks || input_mode == InputMode::stacks2) {
        if (out_path.empty())
            out_path = in_path;

        out_prefix = "populations";

    } else if (input_mode == InputMode::vcf) {

        if (out_path.empty()) {
            cerr << "Error: Malformed arguments: input mode 'vcf' requires an output directory (--out-path).\n";
            help();
        }

        // Determine out_prefix
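        // (e.g. an input of '/path/to/batch_1.vcf.gz' yields the prefix 'batch_1.p').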
        string fname = in_vcf_path;
        if (in_vcf_path.find_last_of('/') != string::npos && in_vcf_path.back() != '/')
            fname = in_vcf_path.substr(in_vcf_path.find_last_of('/')+1);
        size_t trim = 0;
        if (fname.length() > 4 && fname.substr(fname.length()-4) == ".vcf")
            trim = 4;
        else if (fname.length() > 7 && fname.substr(fname.length()-7) == ".vcf.gz")
            trim = 7;
        out_prefix = fname.substr(0, fname.length()-trim);
        out_prefix += ".p";
    }

    // Other
    if (write_single_snp + write_random_snp + filter_haplotype_wise > 1) {
        cerr << "Error: At most one of --write-single-snp, --write-random-snp"
             << " or --filter-haplotype-wise may be specified.\n";
        help();
    }

    if (merge_sites == true && enz.length() == 0) {
        cerr << "Error: You must specify the restriction enzyme associated with this data set to merge overlapping cutsites.\n";
        help();
    }

    if (no_hap_exports) {
        for (Export*& e : exports) {
            if (e->is_hap_export()) {
                delete e;
                e = NULL;
            }
        }
        stacks_erase_if(exports, [](const Export* e){return e == NULL;});
    }

    return 0;
}

void version() {
    cout << "populations " << VERSION << "\n\n";
}

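//
// Example invocations (illustrative only; the file names are hypothetical and
// the flags are documented in help() below):
//
//   populations -P ./stacks -M ./popmap.tsv -p 2 -r 0.80 --fstats --vcf
//   populations -V ./samples.vcf.gz -O ./out -M ./popmap.tsv --min-maf 0.05 --genepop
//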
void help() {
    cout << "populations " << VERSION << "\n"
         << "Usage:\n"
         << "populations -P dir [-O dir] [-M popmap] (filters) [--fstats] [-k [--sigma=150000] [--bootstrap [-N 100]]] (output formats)\n"
         << "populations -V vcf -O dir [-M popmap] (filters) [--fstats] [-k [--sigma=150000] [--bootstrap [-N 100]]] (output formats)\n"
         << "\n"
         << "  -P,--in-path: path to a directory containing GStacks output files.\n"
         << "  -V,--in-vcf: path to a standalone input VCF file.\n"
         << "  -O,--out-path: path to a directory in which to write the output files. (Required by -V; otherwise defaults to the value of -P.)\n"
         << "  -M,--popmap: path to a population map. (Format is 'SAMPLE1 \\t POP1 \\n SAMPLE2 ...'.)\n"
         << "  -t,--threads: number of threads to run in parallel sections of code.\n"
         << "  --batch-size [int]: the number of loci to process in a batch (default: 10,000 in de novo mode; in reference mode, one chromosome\n"
         << "                      per batch). Increase to speed up the analysis (uses more memory); decrease to save memory.\n"
         << "\n"
         << "Data Filtering:\n"
         << "  -p,--min-populations [int]: minimum number of populations a locus must be present in to process a locus.\n"
         << "  -r,--min-samples-per-pop [float]: minimum percentage of individuals in a population required to process a locus for that population.\n"
         << "  -R,--min-samples-overall [float]: minimum percentage of individuals across populations required to process a locus.\n"
         << "  -H,--filter-haplotype-wise: apply the above filters haplotype-wise\n"
         << "                              (unshared SNPs will be pruned to reduce haplotype-wise missing data).\n"
         << "  --min-maf [float]: specify a minimum minor allele frequency required to process a SNP (0 < min_maf < 0.5).\n"
         << "  --min-mac [int]: specify a minimum minor allele count required to process a SNP.\n"
         << "  --max-obs-het [float]: specify a maximum observed heterozygosity required to process a SNP.\n"
         << "\n"
         << "  --write-single-snp: restrict data analysis to only the first SNP per locus (implies --no-haps).\n"
         << "  --write-random-snp: restrict data analysis to one random SNP per locus (implies --no-haps).\n"
         << "  -B,--blacklist: path to a file containing blacklisted markers to be excluded from the export.\n"
         << "  -W,--whitelist: path to a file containing whitelisted markers to include in the export.\n"
         << "\n"
         << "Merging and Phasing:\n"
         << "  -e,--renz: restriction enzyme name.\n"
         << "  --merge-sites: merge loci that were produced from the same restriction enzyme cutsite (requires reference-aligned data).\n"
         << "  --merge-prune-lim: when merging adjacent loci, if at least X% of samples possess both loci, prune the remaining samples out of the analysis.\n"
         << "\n"
         << "Locus stats:\n"
         << "  --hwe: calculate divergence from Hardy-Weinberg equilibrium for each locus.\n"
         << "\n"
         << "Fstats:\n"
         << "  --fstats: enable SNP and haplotype-based F statistics.\n"
         << "  --fst-correction: specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'. Default: off.\n"
         << "  --p-value-cutoff [float]: maximum p-value to keep an Fst measurement. Default: 0.05. (Also used as base for Bonferroni correction.)\n"
         << "\n"
         << "Kernel-smoothing algorithm:\n"
         << "  -k,--smooth: enable kernel-smoothed Pi, Fis, Fst, Fst', and Phi_st calculations.\n"
         << "  --smooth-fstats: enable kernel-smoothed Fst, Fst', and Phi_st calculations.\n"
         << "  --smooth-popstats: enable kernel-smoothed Pi and Fis calculations.\n"
         << "    (Note: turning on smoothing implies --ordered-export.)\n"
         << "  --sigma [int]: standard deviation of the kernel smoothing weight distribution. Default 150kb.\n"
         << "  --bootstrap: turn on bootstrap resampling for all smoothed statistics.\n"
         << "  -N,--bootstrap-reps [int]: number of bootstrap resamplings to calculate (default 100).\n"
         << "  --bootstrap-pifis: turn on bootstrap resampling for smoothed SNP-based Pi and Fis calculations.\n"
         << "  --bootstrap-fst: turn on bootstrap resampling for smoothed Fst calculations based on pairwise population comparison of SNPs.\n"
         << "  --bootstrap-div: turn on bootstrap resampling for smoothed haplotype diversity and gene diversity calculations based on haplotypes.\n"
         << "  --bootstrap-phist: turn on bootstrap resampling for smoothed Phi_st calculations based on haplotypes.\n"
         << "  --bootstrap-wl [path]: only bootstrap loci contained in this whitelist.\n"
         << "\n"
         << "File output options:\n"
         << "  --ordered-export: if data is reference aligned, exports will be ordered; only a single representative of each overlapping site.\n"
         << "  --fasta-loci: output locus consensus sequences in FASTA format.\n"
         << "  --fasta-samples: output the sequences of the two haplotypes of each (diploid) sample, for each locus, in FASTA format.\n"
         << "  --vcf: output SNPs and haplotypes in Variant Call Format (VCF).\n"
         << "  --genepop: output SNPs and haplotypes in GenePop format.\n"
         << "  --structure: output results in Structure format.\n"
         << "  --radpainter: output results in fineRADstructure/RADpainter format.\n"
         << "  --phase*: output genotypes in PHASE format.\n"
         << "  --fastphase*: output genotypes in fastPHASE format.\n"
         << "  --plink: output genotypes in PLINK format.\n"
         << "  --hzar: output genotypes in Hybrid Zone Analysis using R (HZAR) format.\n"
         << "  --phylip: output nucleotides that are fixed within, and variant among, populations in Phylip format for phylogenetic tree construction.\n"
         << "  --phylip-var: include variable sites in the phylip output encoded using IUPAC notation.\n"
         << "  --phylip-var-all*: include all sequence as well as variable sites in the phylip output encoded using IUPAC notation.\n"
         << "  --treemix: output SNPs in a format usable by the TreeMix program (Pickrell and Pritchard).\n"
         << "  --no-hap-exports: omit haplotype outputs.\n"
         << "  --fasta-samples-raw: output all haplotypes observed in each sample, for each locus, in FASTA format.\n"
         << "  (*not implemented as of v2.4)\n"
         << "\n"
         << "Additional options:\n"
         << "  -h,--help: display this help message.\n"
         << "  -v,--version: print program version.\n"
         << "  --verbose: turn on additional logging.\n"
         << "  --log-fst-comp: log components of Fst/Phi_st calculations to a file.\n"
         #ifdef DEBUG
         << "\n"
         << "DEBUG:\n"
         << "  --dbg-min-gt-depth\n"
         << "  --genepop-haps-3digits: Use 3-digit alleles in the genepop haps output (default is 2-digit).\n"
         #endif
         ;

    // << "    --bootstrap-type [exact|approx]: enable bootstrap resampling for population statistics (reference genome required).\n"

    exit(1);
}