1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*- 2 // 3 // Copyright 2012-2019, Julian Catchen <jcatchen@illinois.edu> 4 // 5 // This file is part of Stacks. 6 // 7 // Stacks is free software: you can redistribute it and/or modify 8 // it under the terms of the GNU General Public License as published by 9 // the Free Software Foundation, either version 3 of the License, or 10 // (at your option) any later version. 11 // 12 // Stacks is distributed in the hope that it will be useful, 13 // but WITHOUT ANY WARRANTY; without even the implied warranty of 14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 // GNU General Public License for more details. 16 // 17 // You should have received a copy of the GNU General Public License 18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>. 19 // 20 21 #ifndef __POPULATIONS_H__ 22 #define __POPULATIONS_H__ 23 24 #ifdef _OPENMP 25 #include <omp.h> // OpenMP library 26 #endif 27 #include <getopt.h> // Process command-line options 28 #include <dirent.h> // Open/Read contents of a directory 29 #include <cmath> 30 #include <cstdlib> 31 #include <cstring> 32 #include <ctime> 33 #include <utility> 34 #include <string> 35 #include <iostream> 36 #include <fstream> 37 #include <iomanip> 38 #include <sstream> 39 #include <vector> 40 #include <map> 41 #include <set> 42 43 #include "constants.h" 44 #include "stacks.h" 45 #include "locus.h" 46 #include "renz.h" 47 #include "PopMap.h" 48 #include "PopSum.h" 49 #include "utils.h" 50 #include "log_utils.h" 51 #include "catalog_utils.h" 52 #include "sql_utilities.h" 53 #include "genotype_dictionaries.h" 54 #include "ordered.h" 55 #include "smoothing.h" 56 #include "bootstrap.h" 57 #include "MetaPopInfo.h" 58 #include "gzFasta.h" 59 #include "locus_readers.h" 60 61 enum bs_type {bs_exact, bs_approx, bs_none}; 62 enum merget {merge_sink, merge_src}; 63 enum phaset {merge_failure, simple_merge, complex_phase, nomapping_fail, multimapping_fail, multiple_fails}; 64 enum class InputMode {stacks, stacks2, vcf}; 65 66 // 67 // Class for accumulating summary statistics as we read each batch of data. 68 // After all loci are processed, output the summary statistics summary. 69 // 70 class SumStatsSummary { 71 size_t _pop_cnt; 72 size_t *_private_cnt; 73 size_t *_sig_hwe_dev; 74 double *_num_indv_mean, *_p_mean, *_obs_het_mean, *_obs_hom_mean, *_exp_het_mean, *_exp_hom_mean, *_pi_mean, *_fis_mean; 75 double *_num_indv_acc_mean, *_p_acc_mean, *_obs_het_acc_mean, *_obs_hom_acc_mean, *_exp_het_acc_mean, *_exp_hom_acc_mean, *_pi_acc_mean, *_fis_acc_mean; 76 double *_num_indv_var, *_p_var, *_obs_het_var, *_obs_hom_var, *_exp_het_var, *_exp_hom_var, *_pi_var, *_fis_var; 77 double *_num_indv_mean_all, *_p_mean_all, *_obs_het_mean_all, *_obs_hom_mean_all, *_exp_het_mean_all, *_exp_hom_mean_all, *_pi_mean_all, *_fis_mean_all; 78 double *_num_indv_acc_mean_all, *_p_acc_mean_all, *_obs_het_acc_mean_all, *_obs_hom_acc_mean_all, *_exp_het_acc_mean_all, *_exp_hom_acc_mean_all, *_pi_acc_mean_all, *_fis_acc_mean_all; 79 double *_num_indv_var_all, *_p_var_all, *_obs_het_var_all, *_obs_hom_var_all, *_exp_het_var_all, *_exp_hom_var_all, *_pi_var_all, *_fis_var_all; 80 double *_n, *_n_all, *_var_sites; 81 double *_sq_n, *_sq_n_all; 82 double _locus_len_mean, _locus_len_acc_mean, _locus_len_var, _locus_len_mean_all, _locus_len_acc_mean_all, _locus_len_var_all; 83 double _locus_gt_sites_mean, _locus_gt_sites_acc_mean, _locus_gt_sites_var; 84 double _overlap_mean, _overlap_acc_mean, _overlap_var; 85 double _locus_n, _locus_pe_ctg_n, _locus_overlap_n; 86 87 public: 88 SumStatsSummary(size_t pop_cnt); 89 ~SumStatsSummary(); 90 int accumulate(const vector<LocBin *> &); 91 int final_calculation(); 92 int write_results(); 93 94 private: 95 double online_variance(double x, double &acc_mean, double n); 96 }; 97 98 // 99 // Class for storing distributions of data related to catalog loci. 100 // 101 class CatalogDists { 102 map<size_t, size_t> _pre_valid, _pre_absent, _pre_confounded; // Prior to any filtering. 103 map<size_t, size_t> _post_valid, _post_absent, _post_confounded; // After filtering. 104 map<size_t, size_t> _pre_snps_per_loc, _post_snps_per_loc; 105 106 public: 107 int accumulate_pre_filtering(const size_t, const CSLocus *); 108 int accumulate(const vector<LocBin *> &); 109 int write_results(ostream &log_fh); 110 }; 111 112 // 113 // Class for smoothing various calculated population statistics for each batch of loci. 114 // 115 class LocusSmoothing { 116 const MetaPopInfo *_mpopi; // Population Map 117 KSmooth<SumStat> *_ks_ss; 118 OSumStat<SumStat> *_ord_ss; 119 KSmooth<LocStat> *_ks_ls; 120 OHaplotypes<LocStat> *_ord_ls; 121 KSmooth<HapStat> *_ks_hs; 122 OHaplotypes<HapStat> *_ord_hs; 123 KSmooth<PopPair> *_ks_pp; 124 OPopPair<PopPair> *_ord_pp; 125 126 public: LocusSmoothing(const MetaPopInfo * mpopi,ofstream & log_fh)127 LocusSmoothing(const MetaPopInfo *mpopi, ofstream &log_fh) { 128 this->_mpopi = mpopi; 129 this->_ks_ss = new KSmooth<SumStat>(2); 130 this->_ord_ss = new OSumStat<SumStat>(log_fh); 131 this->_ks_ls = new KSmooth<LocStat>(2); 132 this->_ord_ls = new OHaplotypes<LocStat>(); 133 this->_ks_hs = new KSmooth<HapStat>(5); 134 this->_ord_hs = new OHaplotypes<HapStat>(); 135 this->_ks_pp = new KSmooth<PopPair>(2); 136 this->_ord_pp = new OPopPair<PopPair>(log_fh); 137 } ~LocusSmoothing()138 ~LocusSmoothing() { 139 delete this->_ks_ss; 140 delete this->_ord_ss; 141 delete this->_ks_ls; 142 delete this->_ord_ls; 143 delete this->_ks_hs; 144 delete this->_ord_hs; 145 delete this->_ks_pp; 146 delete this->_ord_pp; 147 } 148 149 int snpstats(const vector<LocBin *> &, ofstream &); 150 int hapstats(const vector<LocBin *> &, ofstream &); 151 int snp_divergence(const vector<LocBin *> &, const vector<vector<PopPair **>> &, ofstream &); 152 int hap_divergence(const vector<LocBin *> &, const vector<vector<HapStat *>> &, const vector<HapStat *> &, ofstream &); 153 }; 154 155 // 156 // Class for filtering whole loci based on the sample and population limits (-r, -p). 157 // 158 class LocusFilter { 159 public: LocusFilter()160 LocusFilter() { 161 this->_pop_cnt = 0; 162 this->_sample_cnt = 0; 163 this->_pop_order = NULL; // The array order of each population. 164 this->_samples = NULL; // Which population each sample belongs to. 165 this->_pop_tot = NULL; // The total number of samples in each population. 166 this->_filtered_loci = 0; 167 this->_total_loci = 0; 168 this->_seen_loci = 0; 169 this->_batch_filtered_loci = 0; 170 this->_batch_total_loci = 0; 171 this->_batch_seen_loci = 0; 172 this->_filtered_sites = 0; 173 this->_total_sites = 0; 174 this->_variant_sites = 0; 175 } LocusFilter(MetaPopInfo * mpopi)176 LocusFilter(MetaPopInfo *mpopi) { 177 assert(mpopi != NULL); 178 this->init(mpopi); 179 } ~LocusFilter()180 ~LocusFilter() { 181 if (this->_pop_order != NULL) 182 delete [] this->_pop_order; 183 if (this->_samples != NULL) 184 delete [] this->_samples; 185 if (this->_pop_tot != NULL) 186 delete [] this->_pop_tot; 187 } 188 // Remove the copy and move constructors. 189 LocusFilter(const LocusFilter &) = delete; 190 LocusFilter(LocusFilter &&) = delete; 191 192 int load_whitelist(string); 193 int load_blacklist(string); 194 195 void init(MetaPopInfo *mpopi); 196 bool whitelist_filter(size_t locus_id); 197 bool blacklist_filter(size_t locus_id); 198 void whitelist_snp_filter(LocBin& loc) const; 199 bool apply_filters_stacks(LocBin& loc, ostream& log_fh, const MetaPopInfo& mpopi); 200 bool apply_filters_external(LocBin& loc, ostream& log_fh, const MetaPopInfo& mpopi); 201 bool filter(const MetaPopInfo *mpopi, Datum **d, CSLocus *cloc); 202 void filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh); 203 void filter_haps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh); 204 void gt_depth_filter(Datum **d, const CSLocus *cloc); 205 void keep_single_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const; 206 void keep_random_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const; 207 filtered()208 size_t filtered() const { return this->_filtered_loci; } total()209 size_t total() const { return this->_total_loci; } seen()210 size_t seen() const { return this->_seen_loci; } batch_filtered()211 size_t batch_filtered() const { return this->_batch_filtered_loci; } batch_seen()212 size_t batch_seen() const { return this->_batch_seen_loci; } batch_total()213 size_t batch_total() const { return this->_batch_total_loci; } filtered_sites()214 size_t filtered_sites() const { return this->_filtered_sites; } total_sites()215 size_t total_sites() const { return this->_total_sites; } variant_sites()216 size_t variant_sites() const { return this->_variant_sites; } 217 void locus_seen(); 218 void locus_unsee(); 219 void keep_locus(LocBin *); 220 void batch_clear(); 221 blacklist()222 const set<int>& blacklist() { return this->_blacklist; } whitelist()223 const map<int, set<int>>& whitelist() { return this->_whitelist; } 224 225 private: 226 static void erase_snp(CSLocus *cloc, Datum **d, size_t n_samples, size_t snp_index); 227 228 size_t _pop_cnt; 229 size_t _sample_cnt; 230 size_t *_pop_order; 231 size_t *_samples; 232 size_t *_pop_tot; 233 size_t _filtered_loci; 234 size_t _total_loci; 235 size_t _seen_loci; 236 size_t _batch_filtered_loci; 237 size_t _batch_total_loci; 238 size_t _batch_seen_loci; 239 size_t _filtered_sites; 240 size_t _total_sites; 241 size_t _variant_sites; 242 243 set<int> _blacklist; 244 map<int, set<int>> _whitelist; 245 }; 246 247 // 248 // BatchLocusProcessor 249 // ---------- 250 // Class for processing loci in batches, or per chromosome. 251 // 252 class BatchLocusProcessor { 253 public: BatchLocusProcessor()254 BatchLocusProcessor(): 255 _input_mode(InputMode::stacks2), _user_supplied_whitelist(false), _batch_size(0), _batch_num(0), 256 _mpopi(NULL), _next_loc(NULL), _unordered_bp(0) {} BatchLocusProcessor(InputMode mode,size_t batch_size,MetaPopInfo * popi)257 BatchLocusProcessor(InputMode mode, size_t batch_size, MetaPopInfo *popi): 258 _input_mode(mode), _user_supplied_whitelist(false), _batch_size(batch_size), _batch_num(0), 259 _mpopi(popi), _next_loc(NULL), _unordered_bp(0) {} BatchLocusProcessor(InputMode mode,size_t batch_size)260 BatchLocusProcessor(InputMode mode, size_t batch_size): 261 _input_mode(mode), _user_supplied_whitelist(false), _batch_size(batch_size), _batch_num(0), 262 _mpopi(NULL), _next_loc(NULL), _unordered_bp(0) {} ~BatchLocusProcessor()263 ~BatchLocusProcessor() { 264 for (uint i = 0; i < this->_loci.size(); i++) 265 delete this->_loci[i]; 266 delete [] this->_sig_hwe_dev; 267 }; 268 269 int init(string, string); 270 size_t next_batch(ostream &); next_batch_number()271 size_t next_batch_number() { return this->_batch_num + 1; } 272 int summarize(ostream &); 273 int hapstats(ostream &); write_distributions(ostream & log_fh)274 int write_distributions(ostream &log_fh) { return this->_dists.write_results(log_fh); } 275 pop_info()276 MetaPopInfo* pop_info() { return this->_mpopi; } pop_info(MetaPopInfo * popi)277 int pop_info(MetaPopInfo *popi) { this->_mpopi = popi; return 0; } vcf_reader()278 VcfParser& vcf_reader() { return this->_vcf_parser; } fasta_reader()279 GzFasta& fasta_reader() { return this->_fasta_reader; } cloc_reader()280 VcfCLocReader& cloc_reader() { return this->_cloc_reader; } batch_size()281 size_t batch_size() { return this->_batch_size; } batch_size(size_t bsize)282 size_t batch_size(size_t bsize) { this->_batch_size = bsize; return bsize; } 283 void report_locus_overlap(size_t&, size_t&, ofstream* = NULL); 284 filter()285 const LocusFilter& filter() { return this->_loc_filter; } loci()286 const vector<LocBin *>& loci() { return this->_loci; } chr()287 const string& chr() { return this->_chr; } dists()288 const CatalogDists& dists() { return this->_dists; } 289 290 // Per-population haplotype counters 291 size_t *_sig_hwe_dev; 292 293 private: 294 InputMode _input_mode; 295 bool _user_supplied_whitelist; 296 size_t _batch_size; // Number of loci to process at a time. 297 size_t _batch_num; // Counter for how many batches we have processed. 298 MetaPopInfo *_mpopi; // Population Map 299 300 // Parsers 301 VcfParser _vcf_parser; 302 VcfCLocReader _cloc_reader; 303 GzFasta _fasta_reader; 304 vector<size_t> _samples_vcf_to_mpopi; 305 306 // Data stores 307 vector<LocBin *> _loci; 308 LocBin *_next_loc; 309 string _chr; 310 CatalogDists _dists; 311 312 // Counters for external VCF 313 size_t _total_ext_vcf; 314 vector<size_t> _skipped_notsnp; 315 vector<size_t> _skipped_filter; 316 317 // Controls for which loci are loaded 318 LocusFilter _loc_filter; 319 320 size_t _unordered_bp; 321 322 private: 323 int init_external_loci(string, string); 324 int init_stacks_loci(string, string); 325 void batch_clear(); 326 size_t next_batch_external_loci(ostream &); 327 size_t next_batch_stacks_loci(ostream &); 328 }; 329 330 void help( void ); 331 void version( void ); 332 int parse_command_line(int, char**); 333 void output_parameters(ostream &); 334 void open_log(ofstream &); 335 int build_file_list(); 336 int load_marker_list(string, set<int> &); 337 int load_marker_column_list(string, map<int, set<int> > &); 338 //int merge_shared_cutsite_loci(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<merget, int> > &, ofstream &); 339 phaset merge_and_phase_loci(PopMap<CSLocus> *, CSLocus *, CSLocus *, set<int> &, ofstream &); 340 int merge_datums(int, int, Datum **, Datum **, set<string> &, int); 341 int merge_csloci(CSLocus *, CSLocus *, set<string> &); 342 int tabulate_haplotypes(map<int, CSLocus *> &, PopMap<CSLocus> *); 343 int tabulate_locus_haplotypes(CSLocus *, Datum **, int); 344 int create_genotype_map(CSLocus *, Datum **, int); 345 int call_population_genotypes(CSLocus *, Datum **, int); 346 int translate_genotypes(map<string, string> &, map<string, map<string, string> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, string> &, set<int> &); // This function doesn't exist (March 24, 2016) 347 int correct_fst_bonferroni_win(vector<PopPair *> &); 348 int bootstrap_fst_approximate_dist(vector<double> &, vector<int> &, double *, int *, map<int, vector<double> > &); // not used (March 23, 2016) 349 int bootstrap_popstats_approximate_dist(vector<double> &, vector<double> &, vector<int> &, double *, int *, int, map<int, vector<double> > &, map<int, vector<double> > &); // not used (March 23, 2016) 350 double bootstrap_approximate_pval(int, double, map<int, vector<double> > &); 351 352 bool hap_compare(const pair<string,int>&, const pair<string,int>&); 353 354 void vcfcomp_simplify_pmap (map<int, CSLocus*>& catalog, PopMap<CSLocus>* pmap); 355 356 #endif // __POPULATIONS_H__ 357