1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2012-2019, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __POPULATIONS_H__
22 #define __POPULATIONS_H__
23 
24 #ifdef _OPENMP
25 #include <omp.h>    // OpenMP library
26 #endif
27 #include <getopt.h> // Process command-line options
28 #include <dirent.h> // Open/Read contents of a directory
29 #include <cmath>
30 #include <cstdlib>
31 #include <cstring>
32 #include <ctime>
33 #include <utility>
34 #include <string>
35 #include <iostream>
36 #include <fstream>
37 #include <iomanip>
38 #include <sstream>
39 #include <vector>
40 #include <map>
41 #include <set>
42 
43 #include "constants.h"
44 #include "stacks.h"
45 #include "locus.h"
46 #include "renz.h"
47 #include "PopMap.h"
48 #include "PopSum.h"
49 #include "utils.h"
50 #include "log_utils.h"
51 #include "catalog_utils.h"
52 #include "sql_utilities.h"
53 #include "genotype_dictionaries.h"
54 #include "ordered.h"
55 #include "smoothing.h"
56 #include "bootstrap.h"
57 #include "MetaPopInfo.h"
58 #include "gzFasta.h"
59 #include "locus_readers.h"
60 
61 enum bs_type   {bs_exact, bs_approx, bs_none};
62 enum merget    {merge_sink, merge_src};
63 enum phaset    {merge_failure, simple_merge, complex_phase, nomapping_fail, multimapping_fail, multiple_fails};
64 enum class InputMode {stacks, stacks2, vcf};
65 
66 //
67 // Class for accumulating summary statistics as we read each batch of data.
68 // After all loci are processed, output the summary statistics summary.
69 //
70 class SumStatsSummary {
71     size_t  _pop_cnt;
72     size_t *_private_cnt;
73     size_t *_sig_hwe_dev;
74     double *_num_indv_mean,         *_p_mean,         *_obs_het_mean,         *_obs_hom_mean,         *_exp_het_mean,         *_exp_hom_mean,         *_pi_mean,         *_fis_mean;
75     double *_num_indv_acc_mean,     *_p_acc_mean,     *_obs_het_acc_mean,     *_obs_hom_acc_mean,     *_exp_het_acc_mean,     *_exp_hom_acc_mean,     *_pi_acc_mean,     *_fis_acc_mean;
76     double *_num_indv_var,          *_p_var,          *_obs_het_var,          *_obs_hom_var,          *_exp_het_var,          *_exp_hom_var,          *_pi_var,          *_fis_var;
77     double *_num_indv_mean_all,     *_p_mean_all,     *_obs_het_mean_all,     *_obs_hom_mean_all,     *_exp_het_mean_all,     *_exp_hom_mean_all,     *_pi_mean_all,     *_fis_mean_all;
78     double *_num_indv_acc_mean_all, *_p_acc_mean_all, *_obs_het_acc_mean_all, *_obs_hom_acc_mean_all, *_exp_het_acc_mean_all, *_exp_hom_acc_mean_all, *_pi_acc_mean_all, *_fis_acc_mean_all;
79     double *_num_indv_var_all,      *_p_var_all,      *_obs_het_var_all,      *_obs_hom_var_all,      *_exp_het_var_all,      *_exp_hom_var_all,      *_pi_var_all,      *_fis_var_all;
80     double *_n, *_n_all, *_var_sites;
81     double *_sq_n, *_sq_n_all;
82     double _locus_len_mean, _locus_len_acc_mean, _locus_len_var, _locus_len_mean_all, _locus_len_acc_mean_all, _locus_len_var_all;
83     double _locus_gt_sites_mean, _locus_gt_sites_acc_mean, _locus_gt_sites_var;
84     double _overlap_mean, _overlap_acc_mean, _overlap_var;
85     double _locus_n, _locus_pe_ctg_n, _locus_overlap_n;
86 
87 public:
88     SumStatsSummary(size_t pop_cnt);
89     ~SumStatsSummary();
90     int accumulate(const vector<LocBin *> &);
91     int final_calculation();
92     int write_results();
93 
94 private:
95     double online_variance(double x, double &acc_mean, double n);
96 };
97 
98 //
99 // Class for storing distributions of data related to catalog loci.
100 //
101 class CatalogDists {
102     map<size_t, size_t> _pre_valid,  _pre_absent,  _pre_confounded;  // Prior to any filtering.
103     map<size_t, size_t> _post_valid, _post_absent, _post_confounded; // After filtering.
104     map<size_t, size_t> _pre_snps_per_loc, _post_snps_per_loc;
105 
106 public:
107     int accumulate_pre_filtering(const size_t, const CSLocus *);
108     int accumulate(const vector<LocBin *> &);
109     int write_results(ostream &log_fh);
110 };
111 
112 //
113 // Class for smoothing various calculated population statistics for each batch of loci.
114 //
115 class LocusSmoothing {
116     const MetaPopInfo    *_mpopi; // Population Map
117     KSmooth<SumStat>     *_ks_ss;
118     OSumStat<SumStat>    *_ord_ss;
119     KSmooth<LocStat>     *_ks_ls;
120     OHaplotypes<LocStat> *_ord_ls;
121     KSmooth<HapStat>     *_ks_hs;
122     OHaplotypes<HapStat> *_ord_hs;
123     KSmooth<PopPair>     *_ks_pp;
124     OPopPair<PopPair>    *_ord_pp;
125 
126 public:
LocusSmoothing(const MetaPopInfo * mpopi,ofstream & log_fh)127     LocusSmoothing(const MetaPopInfo *mpopi, ofstream &log_fh) {
128         this->_mpopi  = mpopi;
129         this->_ks_ss  = new KSmooth<SumStat>(2);
130         this->_ord_ss = new OSumStat<SumStat>(log_fh);
131         this->_ks_ls  = new KSmooth<LocStat>(2);
132         this->_ord_ls = new OHaplotypes<LocStat>();
133         this->_ks_hs  = new KSmooth<HapStat>(5);
134         this->_ord_hs = new OHaplotypes<HapStat>();
135         this->_ks_pp  = new KSmooth<PopPair>(2);
136         this->_ord_pp = new OPopPair<PopPair>(log_fh);
137     }
~LocusSmoothing()138     ~LocusSmoothing() {
139         delete this->_ks_ss;
140         delete this->_ord_ss;
141         delete this->_ks_ls;
142         delete this->_ord_ls;
143         delete this->_ks_hs;
144         delete this->_ord_hs;
145         delete this->_ks_pp;
146         delete this->_ord_pp;
147     }
148 
149     int    snpstats(const vector<LocBin *> &, ofstream &);
150     int    hapstats(const vector<LocBin *> &, ofstream &);
151     int    snp_divergence(const vector<LocBin *> &, const vector<vector<PopPair **>> &, ofstream &);
152     int    hap_divergence(const vector<LocBin *> &, const vector<vector<HapStat *>>  &, const vector<HapStat *> &, ofstream &);
153 };
154 
155 //
156 // Class for filtering whole loci based on the sample and population limits (-r, -p).
157 //
158 class LocusFilter {
159 public:
LocusFilter()160     LocusFilter() {
161         this->_pop_cnt    = 0;
162         this->_sample_cnt = 0;
163         this->_pop_order  = NULL; // The array order of each population.
164         this->_samples    = NULL; // Which population each sample belongs to.
165         this->_pop_tot    = NULL; // The total number of samples in each population.
166         this->_filtered_loci       = 0;
167         this->_total_loci          = 0;
168         this->_seen_loci           = 0;
169         this->_batch_filtered_loci = 0;
170         this->_batch_total_loci    = 0;
171         this->_batch_seen_loci     = 0;
172         this->_filtered_sites      = 0;
173         this->_total_sites         = 0;
174         this->_variant_sites       = 0;
175     }
LocusFilter(MetaPopInfo * mpopi)176     LocusFilter(MetaPopInfo *mpopi) {
177         assert(mpopi != NULL);
178         this->init(mpopi);
179     }
~LocusFilter()180     ~LocusFilter() {
181         if (this->_pop_order != NULL)
182             delete [] this->_pop_order;
183         if (this->_samples != NULL)
184             delete [] this->_samples;
185         if (this->_pop_tot != NULL)
186             delete [] this->_pop_tot;
187     }
188     // Remove the copy and move constructors.
189     LocusFilter(const LocusFilter &) = delete;
190     LocusFilter(LocusFilter &&) = delete;
191 
192     int    load_whitelist(string);
193     int    load_blacklist(string);
194 
195     void   init(MetaPopInfo *mpopi);
196     bool   whitelist_filter(size_t locus_id);
197     bool   blacklist_filter(size_t locus_id);
198     void   whitelist_snp_filter(LocBin& loc) const;
199     bool   apply_filters_stacks(LocBin& loc, ostream& log_fh, const MetaPopInfo& mpopi);
200     bool   apply_filters_external(LocBin& loc, ostream& log_fh, const MetaPopInfo& mpopi);
201     bool   filter(const MetaPopInfo *mpopi, Datum **d, CSLocus *cloc);
202     void   filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh);
203     void   filter_haps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh);
204     void   gt_depth_filter(Datum **d, const CSLocus *cloc);
205     void   keep_single_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const;
206     void   keep_random_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const;
207 
filtered()208     size_t filtered()       const { return this->_filtered_loci; }
total()209     size_t total()          const { return this->_total_loci; }
seen()210     size_t seen()           const { return this->_seen_loci; }
batch_filtered()211     size_t batch_filtered() const { return this->_batch_filtered_loci; }
batch_seen()212     size_t batch_seen()     const { return this->_batch_seen_loci; }
batch_total()213     size_t batch_total()    const { return this->_batch_total_loci; }
filtered_sites()214     size_t filtered_sites() const { return this->_filtered_sites; }
total_sites()215     size_t total_sites()    const { return this->_total_sites; }
variant_sites()216     size_t variant_sites()  const { return this->_variant_sites; }
217     void   locus_seen();
218     void   locus_unsee();
219     void   keep_locus(LocBin *);
220     void   batch_clear();
221 
blacklist()222     const set<int>&            blacklist() { return this->_blacklist; }
whitelist()223     const map<int, set<int>>&  whitelist() { return this->_whitelist; }
224 
225 private:
226     static void erase_snp(CSLocus *cloc, Datum **d, size_t n_samples, size_t snp_index);
227 
228     size_t  _pop_cnt;
229     size_t  _sample_cnt;
230     size_t *_pop_order;
231     size_t *_samples;
232     size_t *_pop_tot;
233     size_t  _filtered_loci;
234     size_t  _total_loci;
235     size_t  _seen_loci;
236     size_t  _batch_filtered_loci;
237     size_t  _batch_total_loci;
238     size_t  _batch_seen_loci;
239     size_t  _filtered_sites;
240     size_t  _total_sites;
241     size_t  _variant_sites;
242 
243     set<int>           _blacklist;
244     map<int, set<int>> _whitelist;
245 };
246 
247 //
248 // BatchLocusProcessor
249 // ----------
250 // Class for processing loci in batches, or per chromosome.
251 //
252 class BatchLocusProcessor {
253 public:
BatchLocusProcessor()254     BatchLocusProcessor():
255         _input_mode(InputMode::stacks2), _user_supplied_whitelist(false), _batch_size(0), _batch_num(0),
256         _mpopi(NULL), _next_loc(NULL), _unordered_bp(0) {}
BatchLocusProcessor(InputMode mode,size_t batch_size,MetaPopInfo * popi)257     BatchLocusProcessor(InputMode mode, size_t batch_size, MetaPopInfo *popi):
258         _input_mode(mode), _user_supplied_whitelist(false), _batch_size(batch_size), _batch_num(0),
259         _mpopi(popi), _next_loc(NULL), _unordered_bp(0) {}
BatchLocusProcessor(InputMode mode,size_t batch_size)260     BatchLocusProcessor(InputMode mode, size_t batch_size):
261         _input_mode(mode), _user_supplied_whitelist(false), _batch_size(batch_size), _batch_num(0),
262         _mpopi(NULL), _next_loc(NULL), _unordered_bp(0) {}
~BatchLocusProcessor()263     ~BatchLocusProcessor() {
264         for (uint i = 0; i < this->_loci.size(); i++)
265             delete this->_loci[i];
266         delete [] this->_sig_hwe_dev;
267     };
268 
269     int            init(string, string);
270     size_t         next_batch(ostream &);
next_batch_number()271     size_t         next_batch_number() { return this->_batch_num + 1; }
272     int            summarize(ostream &);
273     int            hapstats(ostream &);
write_distributions(ostream & log_fh)274     int            write_distributions(ostream &log_fh) { return this->_dists.write_results(log_fh); }
275 
pop_info()276     MetaPopInfo*   pop_info()      { return this->_mpopi; }
pop_info(MetaPopInfo * popi)277     int            pop_info(MetaPopInfo *popi) { this->_mpopi = popi; return 0; }
vcf_reader()278     VcfParser&     vcf_reader()    { return this->_vcf_parser; }
fasta_reader()279     GzFasta&       fasta_reader()  { return this->_fasta_reader; }
cloc_reader()280     VcfCLocReader& cloc_reader()   { return this->_cloc_reader; }
batch_size()281     size_t         batch_size()    { return this->_batch_size; }
batch_size(size_t bsize)282     size_t         batch_size(size_t bsize) { this->_batch_size = bsize; return bsize; }
283     void           report_locus_overlap(size_t&, size_t&, ofstream* = NULL);
284 
filter()285     const LocusFilter&      filter() { return this->_loc_filter; }
loci()286     const vector<LocBin *>& loci()   { return this->_loci; }
chr()287     const string&           chr()    { return this->_chr; }
dists()288     const CatalogDists&     dists()  { return this->_dists; }
289 
290     // Per-population haplotype counters
291     size_t           *_sig_hwe_dev;
292 
293 private:
294     InputMode    _input_mode;
295     bool         _user_supplied_whitelist;
296     size_t       _batch_size; // Number of loci to process at a time.
297     size_t       _batch_num;  // Counter for how many batches we have processed.
298     MetaPopInfo *_mpopi;      // Population Map
299 
300     // Parsers
301     VcfParser     _vcf_parser;
302     VcfCLocReader _cloc_reader;
303     GzFasta       _fasta_reader;
304     vector<size_t> _samples_vcf_to_mpopi;
305 
306     // Data stores
307     vector<LocBin *>  _loci;
308     LocBin           *_next_loc;
309     string            _chr;
310     CatalogDists      _dists;
311 
312     // Counters for external VCF
313     size_t            _total_ext_vcf;
314     vector<size_t>    _skipped_notsnp;
315     vector<size_t>    _skipped_filter;
316 
317     // Controls for which loci are loaded
318     LocusFilter _loc_filter;
319 
320     size_t _unordered_bp;
321 
322 private:
323     int    init_external_loci(string, string);
324     int    init_stacks_loci(string, string);
325     void   batch_clear();
326     size_t next_batch_external_loci(ostream &);
327     size_t next_batch_stacks_loci(ostream &);
328 };
329 
330 void    help( void );
331 void    version( void );
332 int     parse_command_line(int, char**);
333 void    output_parameters(ostream &);
334 void    open_log(ofstream &);
335 int     build_file_list();
336 int     load_marker_list(string, set<int> &);
337 int     load_marker_column_list(string, map<int, set<int> > &);
338 //int     merge_shared_cutsite_loci(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<merget, int> > &, ofstream &);
339 phaset  merge_and_phase_loci(PopMap<CSLocus> *, CSLocus *, CSLocus *, set<int> &, ofstream &);
340 int     merge_datums(int, int, Datum **, Datum **, set<string> &, int);
341 int     merge_csloci(CSLocus *, CSLocus *, set<string> &);
342 int     tabulate_haplotypes(map<int, CSLocus *> &, PopMap<CSLocus> *);
343 int     tabulate_locus_haplotypes(CSLocus *, Datum **, int);
344 int     create_genotype_map(CSLocus *, Datum **, int);
345 int     call_population_genotypes(CSLocus *, Datum **, int);
346 int     translate_genotypes(map<string, string> &, map<string, map<string, string> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, string> &, set<int> &); // This function doesn't exist (March 24, 2016)
347 int     correct_fst_bonferroni_win(vector<PopPair *> &);
348 int     bootstrap_fst_approximate_dist(vector<double> &, vector<int>  &, double *, int *, map<int, vector<double> > &); // not used (March 23, 2016)
349 int     bootstrap_popstats_approximate_dist(vector<double> &, vector<double> &, vector<int>  &, double *, int *, int, map<int, vector<double> > &, map<int, vector<double> > &); // not used (March 23, 2016)
350 double  bootstrap_approximate_pval(int, double, map<int, vector<double> > &);
351 
352 bool hap_compare(const pair<string,int>&, const pair<string,int>&);
353 
354 void vcfcomp_simplify_pmap (map<int, CSLocus*>& catalog, PopMap<CSLocus>* pmap);
355 
356 #endif // __POPULATIONS_H__
357