1 /*
2     Genome-wide Efficient Mixed Model Association (GEMMA)
3     Copyright © 2011-2017, Xiang Zhou
4     Copyright © 2017, Peter Carbonetto
5     Copyright © 2017, Pjotr Prins
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #ifndef __PARAM_H__
22 #define __PARAM_H__
23 
24 #include "debug.h"
25 #include "gsl/gsl_matrix.h"
26 #include <gsl/gsl_rng.h>
27 #include "gsl/gsl_vector.h"
28 #include <map>
29 #include <set>
30 #include <vector>
31 
32 #define K_BATCH_SIZE 20000 // #snps used for batched K
33 #define DEFAULT_PACE 1000  // for display only
34 
35 using namespace std;
36 
37 class SNPINFO {
38 public:
39   string chr;
40   string rs_number;
41   double cM;
42   long int base_position;
43   string a_minor;
44   string a_major;
45   size_t n_miss;
46   double missingness;
47   double maf;
48   size_t n_idv;         // Number of non-missing individuals.
49   size_t n_nb;          // Number of neighbours on the right hand side.
50   size_t file_position; // SNP location in file.
51 };
52 
53 // Results for LMM.
54 class SUMSTAT {
55 public:
56   double beta;         // REML estimator for beta.
57   double se;           // SE for beta.
58   double lambda_remle; // REML estimator for lambda.
59   double lambda_mle;   // MLE estimator for lambda.
60   double p_wald;       // p value from a Wald test.
61   double p_lrt;        // p value from a likelihood ratio test.
62   double p_score;      // p value from a score test.
63   double logl_H1;      // log likelihood under the alternative
64                        // hypothesis as a measure of goodness of fit,
65                        // see https://github.com/genetics-statistics/GEMMA/issues/81
66 };
67 
68 // Results for mvLMM.
69 class MPHSUMSTAT {
70 public:
71   vector<double> v_beta;  // REML estimator for beta.
72   double p_wald;          // p value from a Wald test.
73   double p_lrt;           // p value from a likelihood ratio test.
74   double p_score;         // p value from a score test.
75   vector<double> v_Vg;    // Estimator for Vg, right half.
76   vector<double> v_Ve;    // Estimator for Ve, right half.
77   vector<double> v_Vbeta; // Estimator for Vbeta, right half.
78 };
79 
80 // Hyper-parameters for BSLMM.
81 class HYPBSLMM {
82 public:
83   double h;
84   double pve;
85   double rho;
86   double pge;
87   double logp;
88   size_t n_gamma;
89 };
90 
91 // Header class.
92 class HEADER {
93 public:
94   size_t rs_col;
95   size_t chr_col;
96   size_t pos_col;
97   size_t cm_col;
98   size_t a1_col;
99   size_t a0_col;
100   size_t z_col;
101   size_t beta_col;
102   size_t sebeta_col;
103   size_t chisq_col;
104   size_t p_col;
105   size_t n_col;
106   size_t nmis_col;
107   size_t nobs_col;
108   size_t ncase_col;
109   size_t ncontrol_col;
110   size_t af_col;
111   size_t var_col;
112   size_t ws_col;
113   size_t cor_col;
114   size_t coln; // Number of columns.
115   set<size_t> catc_col;
116   set<size_t> catd_col;
117 };
118 
119 class PARAM {
120 public:
121   // IO-related parameters
122   // bool mode_check = true;   // run data checks (slower)
123   // bool mode_strict = false; // exit on some data checks
124   // bool mode_silence;
125   // bool mode_debug = false;
126   // uint issue; // enable tests for issue on github tracker
127 
128   uint a_mode; // Analysis mode, 1/2/3/4 for Frequentist tests
129   int k_mode; // Kinship read mode: 1: n by n matrix, 2: id/id/k_value;
130   vector<size_t> p_column; // Which phenotype column needs analysis.
131   size_t d_pace = DEFAULT_PACE;   // Display pace (-pace switch)
132 
133   string file_bfile, file_mbfile;
134   string file_geno, file_mgeno;
135   string file_pheno;
136   string file_anno; // Optional.
137   string file_gxe;  // Optional.
138   string file_cvt;  // Optional.
139   string file_cat, file_mcat;
140   string file_catc, file_mcatc;
141   string file_var;
142   string file_beta;
143   string file_cor;
144   string file_kin, file_mk;
145   string file_ku, file_kd;
146   string file_study, file_mstudy;
147   string file_ref, file_mref;
148   string file_weight, file_wsnp, file_wcat;
149   string file_out;
150   string file_bf, file_hyp;
151   string path_out;
152 
153   string file_epm;     // Estimated parameter file.
154   string file_ebv;     // Estimated breeding value file.
155   string file_log;     // Log file containing mean estimate.
156   string file_read;    // File containing total number of reads.
157   string file_gene;    // Gene expression file.
158   string file_snps;    // File containing analyzed SNPs or genes.
159   string file_ksnps;   // File SNPs for computing K
160   string file_gwasnps; // File SNPs for computing GWAS
161 
162   // QC-related parameters.
163   double miss_level;
164   double maf_level;
165   double hwe_level;
166   double r2_level;
167 
168   // LMM-related parameters.
169   string loco;
170   double l_min;
171   double l_max;
172   size_t n_region;
173   double l_mle_null, l_remle_null;
174   double logl_mle_H0, logl_remle_H0;
175   double pve_null, pve_se_null, pve_total, se_pve_total;
176   double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null;
177   vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null;
178   vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null;
179   vector<double> VVe_mle_null;
180   vector<double> beta_remle_null, se_beta_remle_null, beta_mle_null;
181   vector<double> se_beta_mle_null;
182   double p_nr;
183   double em_prec, nr_prec;
184   size_t em_iter, nr_iter;
185   size_t crt;
186   double pheno_mean; // Phenotype mean from BSLMM fitting or prediction.
187 
188   // For fitting multiple variance components.
189   // The first 3 are of size (n_vc), and the next 2 are of size n_vc+1.
190   bool noconstrain;
191   vector<double> v_traceG;
192   vector<double> v_pve;
193   vector<double> v_se_pve;
194 
195   vector<double> v_sigma2;
196   vector<double> v_se_sigma2;
197   vector<double> v_enrich;
198   vector<double> v_se_enrich;
199   vector<double> v_beta;
200   vector<double> v_se_beta;
201 
202   // BSLMM/MCMC-related parameters.
203   double h_min, h_max, h_scale;          // Priors for h.
204   double rho_min, rho_max, rho_scale;    // Priors for rho.
205   double logp_min, logp_max, logp_scale; // Priors for log(pi).
206   size_t h_ngrid, rho_ngrid;
207   size_t s_min, s_max; // Min & max. number of gammas.
208   size_t w_step;       // # warm up/burn in iter.
209   size_t s_step;       // # sampling iterations.
210   size_t r_pace;       // Record pace.
211   size_t w_pace;       // Write pace.
212   size_t n_accept;     // Number of acceptance.
213   size_t n_mh;         // # MH steps in each iter.
214   double geo_mean;     // Mean of geometric dist.
215   long int randseed;   // holds -seed parameter
216   gsl_rng *gsl_r;      // Track the randomizer
217   double trace_G;
218 
219   HYPBSLMM cHyp_initial;
220 
221   // VARCOV-related parameters.
222   double window_cm;
223   size_t window_bp;
224   size_t window_ns;
225 
226   // vc-related parameters.
227   size_t n_block;
228 
229   // Summary statistics.
230   bool error;
231 
232   // Number of individuals.
233   size_t ni_total, ni_test, ni_cvt, ni_study, ni_ref;
234   size_t ni_max = 0; // -nind switch for testing purposes
235 
236   // Number of observed and missing phenotypes.
237   size_t np_obs, np_miss;
238 
239   // Number of SNPs.
240   size_t ns_total, ns_test, ns_study, ns_ref;
241 
242   size_t ng_total, ng_test;   // Number of genes.
243   size_t ni_control, ni_case; // Number of controls and number of cases.
244   size_t ni_subsample;        // Number of subsampled individuals.
245   size_t n_cvt;               // Number of covariates.
246   size_t n_cat;               // Number of continuous categories.
247   size_t n_ph;                // Number of phenotypes.
248   size_t n_vc;                // Number of variance components
249                               // (including the diagonal matrix).
250   double time_total;          // Record total time.
251   double time_G;              // Time spent on reading files the
252                               // second time and calculate K.
253   double time_eigen;          // Time spent on eigen-decomposition.
254   double time_UtX;            // Time spent on calculating UX and Uy.
255   double time_UtZ;            // Time calculating UtZ for probit BSLMM.
256   double time_opt;            // Time on optimization iterations/MCMC.
257   double time_Omega;          // Time spent on calculating Omega.
258   double time_hyp;            // Time sampling hyperparameters in PMM.
259   double time_Proposal;       // Time spent on constructing the
260                               // proposal distribution (i.e. the
261                               // initial LMM or LM analysis).
262 
263   // Data.
264   // Vector recording all phenotypes (NA replaced with -9).
265   vector<vector<double>> pheno;
266 
267   // Vector recording all covariates (NA replaced with -9).
268   vector<vector<double>> cvt;
269 
270   // Vector recording all covariates (NA replaced with -9).
271   vector<double> gxe;
272 
273   // Vector recording weights for the individuals, which is
274   // useful for animal breeding studies.
275   vector<double> weight;
276 
277   // Matrix recording when a phenotype is missing for an
278   // individual; 0 missing, 1 available.
279   vector<vector<int>> indicator_pheno;
280 
281   // Indicator for individuals (phenotypes): 0 missing, 1
282   // available for analysis
283   vector<int> indicator_idv;
284 
285   // Sequence indicator for SNPs: 0 ignored because of (a) maf,
286   // (b) miss, (c) non-poly; 1 available for analysis.
287   vector<int> indicator_snp;
288 
289   // Sequence indicator for SNPs: 0 ignored because of (a) maf,
290   // (b) miss, (c) non-poly; 1 available for analysis.
291   vector<vector<int>> mindicator_snp;
292 
293   // Indicator for covariates: 0 missing, 1 available for
294   // analysis.
295   vector<int> indicator_cvt;
296 
297   // Indicator for gxe: 0 missing, 1 available for analysis.
298   vector<int> indicator_gxe;
299 
300   // Indicator for weight: 0 missing, 1 available for analysis.
301   vector<int> indicator_weight;
302 
303   // Indicator for estimated breeding value file: 0 missing, 1
304   // available for analysis.
305   vector<int> indicator_bv;
306 
307   // Indicator for read file: 0 missing, 1 available for analysis.
308   vector<int> indicator_read;
309   vector<double> vec_read; // Total number of reads.
310   vector<double> vec_bv;   // Breeding values.
311   vector<size_t> est_column;
312 
313   map<string, int> mapID2num;             // Map small ID to number, 0 to n-1.
314   map<string, string> mapRS2chr;          // Map rs# to chromosome location.
315   map<string, long int> mapRS2bp;         // Map rs# to base position.
316   map<string, double> mapRS2cM;           // Map rs# to cM.
317   map<string, double> mapRS2est;          // Map rs# to parameters.
318   map<string, size_t> mapRS2cat;          // Map rs# to category number.
319   map<string, vector<double>> mapRS2catc; // Map rs# to cont. cat's.
320   map<string, double> mapRS2wsnp;         // Map rs# to SNP weights.
321   map<string, vector<double>> mapRS2wcat; // Map rs# to SNP cat weights.
322 
323   vector<SNPINFO> snpInfo;          // Record SNP information.
324   vector<vector<SNPINFO>> msnpInfo; // Record SNP information.
325   set<string> setSnps;              // Set of snps for analysis (-snps).
326   set<string> setKSnps;             // Set of snps for K (-ksnps and LOCO)
327   set<string> setGWASnps;           // Set of snps for GWA (-gwasnps and LOCO)
328 
329   // Constructor and destructor
330   PARAM();
331   ~PARAM();
332 
333   // Functions.
334   void ReadFiles();
335   void CheckParam();
336   void CheckData();
337   void PrintSummary();
338   void ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K);
339   void ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K,
340                      const bool calc_K);
341   void CheckCvt();
342   void CopyCvt(gsl_matrix *W);
343   void CopyA(size_t flag, gsl_matrix *A);
344   void CopyGxe(gsl_vector *gxe);
345   void CopyWeight(gsl_vector *w);
346   void ProcessCvtPhen();
347   void CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag);
348   void CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag);
349   void CalcKin(gsl_matrix *matrix_kin);
350   void CalcS(const map<string, double> &mapRS2wA,
351              const map<string, double> &mapRS2wK, const gsl_matrix *W,
352              gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar,
353              gsl_vector *ns);
354   void WriteVector(const gsl_vector *q, const gsl_vector *s,
355                    const size_t n_total, const string suffix);
356   void WriteVar(const string suffix);
357   void WriteMatrix(const gsl_matrix *matrix_U, const string suffix);
358   void WriteVector(const gsl_vector *vector_D, const string suffix);
359   void CopyRead(gsl_vector *log_N);
360   void ObtainWeight(const set<string> &setSnps_beta,
361                     map<string, double> &mapRS2wK);
362   void UpdateWeight(const size_t pve_flag, const map<string, double> &mapRS2wK,
363                     const size_t ni_test, const gsl_vector *ns,
364                     map<string, double> &mapRS2wA);
365   void UpdateSNPnZ(const map<string, double> &mapRS2wA,
366                    const map<string, string> &mapRS2A1,
367                    const map<string, double> &mapRS2z, gsl_vector *w,
368                    gsl_vector *z, vector<size_t> &vec_cat);
369   void UpdateSNP(const map<string, double> &mapRS2wA);
370 };
371 
372 size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt);
373 
374 #endif
375