1 /*
2    This file is part of the BOLT-LMM linear mixed model software package
3    developed by Po-Ru Loh.  Copyright (C) 2014-2019 Harvard University.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #ifndef PHENOBUILDER_HPP
20 #define PHENOBUILDER_HPP
21 
22 #include <vector>
23 #include <string>
24 #include <set>
25 #include <boost/random.hpp>
26 #include <boost/random/mersenne_twister.hpp>
27 #include <boost/random/normal_distribution.hpp>
28 #include <boost/random/uniform_01.hpp>
29 
30 #include "Bolt.hpp"
31 
32 namespace LMM {
33 
34   class PhenoBuilder {
35 
36   public:
37 
38     enum SnpRegionType {
39       CAUSAL_REGION, // chrom 1st-halves (pre-buffer): used in LD Score regression in simulations
40       CANDIDATE_SNP, // simulated candidate snp with fixed large effect sizes
41       BUFFER_REGION, // buffer in middle of chrom (not used for anything)
42       NULL_REGION_H2HI_CHROMS, // 2nd-halves (post-buffer) of hi-h2 chroms: used to check avg null
43       NULL_REGION_H2LO_CHROMS, // 2nd-halves (post-buffer) of lo-h2 chroms: used to check avg null
44       BAD_REGION // masked SNPs (not used for anything)
45     };
46     enum EffectDistType {
47       GAUSSIAN,
48       LAPLACE
49     };
50 
51   private:
52 
53     const Bolt &bolt;
54     uint64 Nstride;
55     EffectDistType effectDist;
56 
57     boost::mt19937 rng;
58     boost::variate_generator<boost::mt19937&, boost::normal_distribution<> > randn;
59     boost::variate_generator<boost::mt19937&, boost::uniform_01<> > rand;
60     boost::variate_generator<boost::mt19937&, boost::exponential_distribution<> > exprnd;
61 
62     // build masked, mean-centered, normalized genetic component of phenotype from input betas
63     std::vector <double> buildPhenoGenetic(const std::vector <double> &betas);
64     void maskCenterNormalize(std::vector <double> &pheno);
65 
66   public:
67 
68     static const int DEFAULT_MID_CHROM_HALF_BUFFER_PHYSPOS;
69     static const double DEFAULT_MID_CHROM_HALF_BUFFER_GENPOS;
70 
71     // note: the Bolt instance should apply any additional masking needed (indivs, snps)
72     PhenoBuilder(const Bolt &_bolt, uint seed, EffectDistType _effectDist);
73 
74     static bool isNullRegion(SnpRegionType snpRegion);
75 
76     /**
77      * build masked, mean-centered, normalized genetic component of phenotype
78      *
79      * params:
80      * - pCausal: prob of a 1st-half chrom being causal, BEFORE further reduction from MAF matching
81      * - MAFhistFile: file containing allele freq spectrum to match
82      * - stdPow: MAF-dependence; 0 for equal per-normalized-genotype, 1 for equal per-allele
83     */
84     std::vector <double> genPhenoCausal(std::vector <double> &simBetas,
85 					std::vector <SnpRegionType> &simRegions, int vcNum,
86 					int Mcausal, const std::string &MAFhistFile, double stdPow,
87 					std::set <int> highH2Chroms,
88 					int midChromHalfBufferPhyspos);
89 
90     std::vector <double> genPhenoCausalRegions(std::vector <double> &simBetas,
91 					       std::vector <SnpRegionType> &simRegions,
92 					       double lambdaRegion, double pRegion);
93     /**
94      * build masked, mean-centered, normalized candidate snp genetic component of phenotype
95      * update simRegions accordingly to identify chosen candidate snps
96      */
97     std::vector <double> genPhenoCandidate(std::vector <SnpRegionType> &simRegions,
98 					   int Mcandidate);
99 
100     // build masked, mean-centered, normalized stratification component of phenotype
101     std::vector <double> genPhenoStrat(const std::string &phenoStratFile);
102 
103     /**
104      * build masked, mean-centered, normalized relatedness component of phenotype
105      * - input matrix will be N x N
106      * - also input ibd thresh
107      * - mask (i,j) entries for i!=j, at least one of i, j in masked-out indivs
108      * - cholesky factor
109      * - create random pheno
110      * - mask, mean-center, normalize
111     // input file: IBS mat (unmasked: N x N for N indivs in fam file... need to augment if < Nstride)
112     // compute IBS mat with GCTA; only do this for FHS testing on relatedness
113     vector <double> genPhenoRelated(const string &IBSfile, double IBSthresh) {
114       vector <double> pheno(Nstride);
115       // todo (later)
116       maskCenterNormalize(pheno); return pheno;
117     }
118      */
119 
120     // build masked, mean-centered, normalized environmental (random) component of phenotype
121     std::vector <double> genPhenoEnv(void);
122 
123     std::vector <double> combinePhenoComps
124     (const std::vector <double> &h2causal, double h2candidate, double h2strat,
125      const std::vector < std::vector <double> > &phenoCausal,
126      const std::vector <double> &phenoCandidate, const std::vector <double> &phenoStrat,
127      const std::vector <double> &phenoEnv);
128   };
129 }
130 
131 #endif
132