1 /* 2 Genome-wide Efficient Mixed Model Association (GEMMA) 3 Copyright © 2011-2017, Xiang Zhou 4 Copyright © 2017, Peter Carbonetto 5 Copyright © 2017, Pjotr Prins 6 7 This program is free software: you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation, either version 3 of the License, or 10 (at your option) any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with this program. If not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 #ifndef __PARAM_H__ 22 #define __PARAM_H__ 23 24 #include "debug.h" 25 #include "gsl/gsl_matrix.h" 26 #include <gsl/gsl_rng.h> 27 #include "gsl/gsl_vector.h" 28 #include <map> 29 #include <set> 30 #include <vector> 31 32 #define K_BATCH_SIZE 20000 // #snps used for batched K 33 #define DEFAULT_PACE 1000 // for display only 34 35 using namespace std; 36 37 class SNPINFO { 38 public: 39 string chr; 40 string rs_number; 41 double cM; 42 long int base_position; 43 string a_minor; 44 string a_major; 45 size_t n_miss; 46 double missingness; 47 double maf; 48 size_t n_idv; // Number of non-missing individuals. 49 size_t n_nb; // Number of neighbours on the right hand side. 50 size_t file_position; // SNP location in file. 51 }; 52 53 // Results for LMM. 54 class SUMSTAT { 55 public: 56 double beta; // REML estimator for beta. 57 double se; // SE for beta. 58 double lambda_remle; // REML estimator for lambda. 59 double lambda_mle; // MLE estimator for lambda. 60 double p_wald; // p value from a Wald test. 61 double p_lrt; // p value from a likelihood ratio test. 62 double p_score; // p value from a score test. 63 double logl_H1; // log likelihood under the alternative 64 // hypothesis as a measure of goodness of fit, 65 // see https://github.com/genetics-statistics/GEMMA/issues/81 66 }; 67 68 // Results for mvLMM. 69 class MPHSUMSTAT { 70 public: 71 vector<double> v_beta; // REML estimator for beta. 72 double p_wald; // p value from a Wald test. 73 double p_lrt; // p value from a likelihood ratio test. 74 double p_score; // p value from a score test. 75 vector<double> v_Vg; // Estimator for Vg, right half. 76 vector<double> v_Ve; // Estimator for Ve, right half. 77 vector<double> v_Vbeta; // Estimator for Vbeta, right half. 78 }; 79 80 // Hyper-parameters for BSLMM. 81 class HYPBSLMM { 82 public: 83 double h; 84 double pve; 85 double rho; 86 double pge; 87 double logp; 88 size_t n_gamma; 89 }; 90 91 // Header class. 92 class HEADER { 93 public: 94 size_t rs_col; 95 size_t chr_col; 96 size_t pos_col; 97 size_t cm_col; 98 size_t a1_col; 99 size_t a0_col; 100 size_t z_col; 101 size_t beta_col; 102 size_t sebeta_col; 103 size_t chisq_col; 104 size_t p_col; 105 size_t n_col; 106 size_t nmis_col; 107 size_t nobs_col; 108 size_t ncase_col; 109 size_t ncontrol_col; 110 size_t af_col; 111 size_t var_col; 112 size_t ws_col; 113 size_t cor_col; 114 size_t coln; // Number of columns. 115 set<size_t> catc_col; 116 set<size_t> catd_col; 117 }; 118 119 class PARAM { 120 public: 121 // IO-related parameters 122 // bool mode_check = true; // run data checks (slower) 123 // bool mode_strict = false; // exit on some data checks 124 // bool mode_silence; 125 // bool mode_debug = false; 126 // uint issue; // enable tests for issue on github tracker 127 128 uint a_mode; // Analysis mode, 1/2/3/4 for Frequentist tests 129 int k_mode; // Kinship read mode: 1: n by n matrix, 2: id/id/k_value; 130 vector<size_t> p_column; // Which phenotype column needs analysis. 131 size_t d_pace = DEFAULT_PACE; // Display pace (-pace switch) 132 133 string file_bfile, file_mbfile; 134 string file_geno, file_mgeno; 135 string file_pheno; 136 string file_anno; // Optional. 137 string file_gxe; // Optional. 138 string file_cvt; // Optional. 139 string file_cat, file_mcat; 140 string file_catc, file_mcatc; 141 string file_var; 142 string file_beta; 143 string file_cor; 144 string file_kin, file_mk; 145 string file_ku, file_kd; 146 string file_study, file_mstudy; 147 string file_ref, file_mref; 148 string file_weight, file_wsnp, file_wcat; 149 string file_out; 150 string file_bf, file_hyp; 151 string path_out; 152 153 string file_epm; // Estimated parameter file. 154 string file_ebv; // Estimated breeding value file. 155 string file_log; // Log file containing mean estimate. 156 string file_read; // File containing total number of reads. 157 string file_gene; // Gene expression file. 158 string file_snps; // File containing analyzed SNPs or genes. 159 string file_ksnps; // File SNPs for computing K 160 string file_gwasnps; // File SNPs for computing GWAS 161 162 // QC-related parameters. 163 double miss_level; 164 double maf_level; 165 double hwe_level; 166 double r2_level; 167 168 // LMM-related parameters. 169 string loco; 170 double l_min; 171 double l_max; 172 size_t n_region; 173 double l_mle_null, l_remle_null; 174 double logl_mle_H0, logl_remle_H0; 175 double pve_null, pve_se_null, pve_total, se_pve_total; 176 double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null; 177 vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; 178 vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null; 179 vector<double> VVe_mle_null; 180 vector<double> beta_remle_null, se_beta_remle_null, beta_mle_null; 181 vector<double> se_beta_mle_null; 182 double p_nr; 183 double em_prec, nr_prec; 184 size_t em_iter, nr_iter; 185 size_t crt; 186 double pheno_mean; // Phenotype mean from BSLMM fitting or prediction. 187 188 // For fitting multiple variance components. 189 // The first 3 are of size (n_vc), and the next 2 are of size n_vc+1. 190 bool noconstrain; 191 vector<double> v_traceG; 192 vector<double> v_pve; 193 vector<double> v_se_pve; 194 195 vector<double> v_sigma2; 196 vector<double> v_se_sigma2; 197 vector<double> v_enrich; 198 vector<double> v_se_enrich; 199 vector<double> v_beta; 200 vector<double> v_se_beta; 201 202 // BSLMM/MCMC-related parameters. 203 double h_min, h_max, h_scale; // Priors for h. 204 double rho_min, rho_max, rho_scale; // Priors for rho. 205 double logp_min, logp_max, logp_scale; // Priors for log(pi). 206 size_t h_ngrid, rho_ngrid; 207 size_t s_min, s_max; // Min & max. number of gammas. 208 size_t w_step; // # warm up/burn in iter. 209 size_t s_step; // # sampling iterations. 210 size_t r_pace; // Record pace. 211 size_t w_pace; // Write pace. 212 size_t n_accept; // Number of acceptance. 213 size_t n_mh; // # MH steps in each iter. 214 double geo_mean; // Mean of geometric dist. 215 long int randseed; // holds -seed parameter 216 gsl_rng *gsl_r; // Track the randomizer 217 double trace_G; 218 219 HYPBSLMM cHyp_initial; 220 221 // VARCOV-related parameters. 222 double window_cm; 223 size_t window_bp; 224 size_t window_ns; 225 226 // vc-related parameters. 227 size_t n_block; 228 229 // Summary statistics. 230 bool error; 231 232 // Number of individuals. 233 size_t ni_total, ni_test, ni_cvt, ni_study, ni_ref; 234 size_t ni_max = 0; // -nind switch for testing purposes 235 236 // Number of observed and missing phenotypes. 237 size_t np_obs, np_miss; 238 239 // Number of SNPs. 240 size_t ns_total, ns_test, ns_study, ns_ref; 241 242 size_t ng_total, ng_test; // Number of genes. 243 size_t ni_control, ni_case; // Number of controls and number of cases. 244 size_t ni_subsample; // Number of subsampled individuals. 245 size_t n_cvt; // Number of covariates. 246 size_t n_cat; // Number of continuous categories. 247 size_t n_ph; // Number of phenotypes. 248 size_t n_vc; // Number of variance components 249 // (including the diagonal matrix). 250 double time_total; // Record total time. 251 double time_G; // Time spent on reading files the 252 // second time and calculate K. 253 double time_eigen; // Time spent on eigen-decomposition. 254 double time_UtX; // Time spent on calculating UX and Uy. 255 double time_UtZ; // Time calculating UtZ for probit BSLMM. 256 double time_opt; // Time on optimization iterations/MCMC. 257 double time_Omega; // Time spent on calculating Omega. 258 double time_hyp; // Time sampling hyperparameters in PMM. 259 double time_Proposal; // Time spent on constructing the 260 // proposal distribution (i.e. the 261 // initial LMM or LM analysis). 262 263 // Data. 264 // Vector recording all phenotypes (NA replaced with -9). 265 vector<vector<double>> pheno; 266 267 // Vector recording all covariates (NA replaced with -9). 268 vector<vector<double>> cvt; 269 270 // Vector recording all covariates (NA replaced with -9). 271 vector<double> gxe; 272 273 // Vector recording weights for the individuals, which is 274 // useful for animal breeding studies. 275 vector<double> weight; 276 277 // Matrix recording when a phenotype is missing for an 278 // individual; 0 missing, 1 available. 279 vector<vector<int>> indicator_pheno; 280 281 // Indicator for individuals (phenotypes): 0 missing, 1 282 // available for analysis 283 vector<int> indicator_idv; 284 285 // Sequence indicator for SNPs: 0 ignored because of (a) maf, 286 // (b) miss, (c) non-poly; 1 available for analysis. 287 vector<int> indicator_snp; 288 289 // Sequence indicator for SNPs: 0 ignored because of (a) maf, 290 // (b) miss, (c) non-poly; 1 available for analysis. 291 vector<vector<int>> mindicator_snp; 292 293 // Indicator for covariates: 0 missing, 1 available for 294 // analysis. 295 vector<int> indicator_cvt; 296 297 // Indicator for gxe: 0 missing, 1 available for analysis. 298 vector<int> indicator_gxe; 299 300 // Indicator for weight: 0 missing, 1 available for analysis. 301 vector<int> indicator_weight; 302 303 // Indicator for estimated breeding value file: 0 missing, 1 304 // available for analysis. 305 vector<int> indicator_bv; 306 307 // Indicator for read file: 0 missing, 1 available for analysis. 308 vector<int> indicator_read; 309 vector<double> vec_read; // Total number of reads. 310 vector<double> vec_bv; // Breeding values. 311 vector<size_t> est_column; 312 313 map<string, int> mapID2num; // Map small ID to number, 0 to n-1. 314 map<string, string> mapRS2chr; // Map rs# to chromosome location. 315 map<string, long int> mapRS2bp; // Map rs# to base position. 316 map<string, double> mapRS2cM; // Map rs# to cM. 317 map<string, double> mapRS2est; // Map rs# to parameters. 318 map<string, size_t> mapRS2cat; // Map rs# to category number. 319 map<string, vector<double>> mapRS2catc; // Map rs# to cont. cat's. 320 map<string, double> mapRS2wsnp; // Map rs# to SNP weights. 321 map<string, vector<double>> mapRS2wcat; // Map rs# to SNP cat weights. 322 323 vector<SNPINFO> snpInfo; // Record SNP information. 324 vector<vector<SNPINFO>> msnpInfo; // Record SNP information. 325 set<string> setSnps; // Set of snps for analysis (-snps). 326 set<string> setKSnps; // Set of snps for K (-ksnps and LOCO) 327 set<string> setGWASnps; // Set of snps for GWA (-gwasnps and LOCO) 328 329 // Constructor and destructor 330 PARAM(); 331 ~PARAM(); 332 333 // Functions. 334 void ReadFiles(); 335 void CheckParam(); 336 void CheckData(); 337 void PrintSummary(); 338 void ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); 339 void ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K, 340 const bool calc_K); 341 void CheckCvt(); 342 void CopyCvt(gsl_matrix *W); 343 void CopyA(size_t flag, gsl_matrix *A); 344 void CopyGxe(gsl_vector *gxe); 345 void CopyWeight(gsl_vector *w); 346 void ProcessCvtPhen(); 347 void CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag); 348 void CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag); 349 void CalcKin(gsl_matrix *matrix_kin); 350 void CalcS(const map<string, double> &mapRS2wA, 351 const map<string, double> &mapRS2wK, const gsl_matrix *W, 352 gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, 353 gsl_vector *ns); 354 void WriteVector(const gsl_vector *q, const gsl_vector *s, 355 const size_t n_total, const string suffix); 356 void WriteVar(const string suffix); 357 void WriteMatrix(const gsl_matrix *matrix_U, const string suffix); 358 void WriteVector(const gsl_vector *vector_D, const string suffix); 359 void CopyRead(gsl_vector *log_N); 360 void ObtainWeight(const set<string> &setSnps_beta, 361 map<string, double> &mapRS2wK); 362 void UpdateWeight(const size_t pve_flag, const map<string, double> &mapRS2wK, 363 const size_t ni_test, const gsl_vector *ns, 364 map<string, double> &mapRS2wA); 365 void UpdateSNPnZ(const map<string, double> &mapRS2wA, 366 const map<string, string> &mapRS2A1, 367 const map<string, double> &mapRS2z, gsl_vector *w, 368 gsl_vector *z, vector<size_t> &vec_cat); 369 void UpdateSNP(const map<string, double> &mapRS2wA); 370 }; 371 372 size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt); 373 374 #endif 375