1 /* 2 * GCTA: a tool for Genome-wide Complex Trait Analysis 3 * 4 * Interface to all the GCTA functions 5 * 6 * 2010 by Jian Yang <jian.yang@qimr.edu.au> 7 * 8 * This file is distributed under the GNU General Public 9 * License, Version 2. Please see the file COPYING for more 10 * details 11 */ 12 13 #ifndef _GCTA_H 14 #define _GCTA_H 15 16 #ifndef EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET 17 #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET 18 #endif 19 20 //#ifndef EIGEN_USE_MKL_ALL 21 //#define EIGEN_USE_MKL_ALL 22 //#endif 23 24 #include "CommFunc.h" 25 #include "StrFunc.h" 26 #include "StatFunc.h" 27 #include "eigen_func.h" 28 #include <fstream> 29 #include <iomanip> 30 #include <bitset> 31 #include <map> 32 //#include <random> 33 #include "zfstream.h" 34 #include <Eigen/Dense> 35 #include <Eigen/Sparse> 36 #include <unsupported/Eigen/SparseExtra> 37 #include <unsupported/Eigen/IterativeSolvers> 38 #include <omp.h> 39 #include <cblas.h> 40 #include <lapacke.h> 41 // Looks like openblas, cblas and lapacke are all that's needed 42 //#include <blaspp.h> 43 //#include <lapackpp.h> 44 45 using namespace Eigen; 46 using namespace std; 47 48 #ifdef SINGLE_PRECISION 49 typedef DiagonalMatrix<float, Dynamic, Dynamic> eigenDiagMat; 50 typedef MatrixXf eigenMatrix; 51 typedef VectorXf eigenVector; 52 typedef SparseMatrix<float> eigenSparseMat; 53 typedef DynamicSparseMatrix<float> eigenDynSparseMat; 54 #else 55 typedef DiagonalMatrix<double, Dynamic, Dynamic> eigenDiagMat; 56 typedef MatrixXd eigenMatrix; 57 typedef VectorXd eigenVector; 58 typedef SparseMatrix<double> eigenSparseMat; 59 typedef DynamicSparseMatrix<double> eigenDynSparseMat; 60 #endif 61 62 class gcta { 63 public: 64 gcta(int autosome_num, double rm_ld_cutoff, string out); 65 gcta(); 66 virtual ~gcta(); 67 68 void read_famfile(string famfile); 69 void read_bimfile(string bimfile); 70 void read_bedfile(string bedfile); 71 void read_imp_info_mach_gz(string zinfofile); 72 void read_imp_info_mach(string infofile); 73 void read_imp_dose_mach_gz(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file); 74 void read_imp_dose_mach(string dosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file); 75 void read_imp_info_beagle(string zinfofile); 76 void read_imp_dose_beagle(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file); 77 void update_ref_A(string ref_A_file); 78 void update_impRsq(string zinfofile); 79 void update_freq(string freq); 80 void save_freq(bool ssq_flag); 81 void extract_snp(string snplistfile); 82 void extract_single_snp(string snpname); 83 void extract_region_snp(string snpname, int wind_size); 84 void extract_region_bp(int chr, int bp, int wind_size); 85 void exclude_snp(string snplistfile); 86 void exclude_single_snp(string snpname); 87 void exclude_region_snp(string snpname, int wind_size); 88 void exclude_region_bp(int chr, int bp, int wind_size); 89 void extract_chr(int chr_start, int chr_end); 90 void filter_snp_maf(double maf); 91 void filter_snp_max_maf(double max_maf); 92 void filter_impRsq(double rsq_cutoff); 93 void keep_indi(string indi_list_file); 94 void remove_indi(string indi_list_file); 95 void update_sex(string sex_file); 96 void read_indi_blup(string blup_indi_file); 97 void save_XMat(bool miss_with_mu, bool std); 98 99 void make_grm(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, bool mlmassoc, bool diag_f3_flag = false, string subpopu_file = ""); 100 //void make_grm_pca(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, double wind_size, bool mlmassoc); 101 void save_grm(string grm_file, string keep_indi_file, string remove_indi_file, string sex_file, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool merge_grm_flag, bool output_grm_bin); 102 void pca(string grm_file, string keep_indi_file, string remove_indi_file, double grm_cutoff, bool merge_grm_flag, int out_pc_num); 103 void snp_pc_loading(string pc_file, int grm_N); 104 105 // bigK + smallK method 106 void grm_bK(string grm_file, string keep_indi_file, string remove_indi_file, double threshold, bool grm_out_bin_flag); 107 108 void enable_grm_bin_flag(); 109 void fit_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string qGE_file, string GE_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, bool no_constrain, bool mlmassoc = false, bool within_family = false, bool reml_bending = false, bool reml_diag_one = false); 110 void HE_reg(string grm_file, string phen_file, string keep_indi_file, string remove_indi_file, int mphen); 111 void blup_snp_geno(); 112 void blup_snp_dosage(); 113 void set_reml_force_inv(); 114 void set_reml_force_converge(); 115 void set_reml_no_converge(); 116 void set_reml_fixed_var(); 117 void set_reml_mtd(int reml_mtd); 118 119 // bivariate REML analysis 120 void fit_bivar_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, int mphen2, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, double prevalence2, bool no_constrain, bool ignore_Ce, vector<double> &fixed_rg_val, bool bivar_no_constrain); 121 122 // LD 123 void read_LD_target_SNPs(string snplistfile); 124 void LD_Blocks(int stp, double wind_size, double alpha, bool IncldQ = true, bool save_ram = false); 125 void calcu_mean_rsq(int wind_size, double rsq_cutoff, bool dominance_flag); 126 void calcu_mean_rsq_multiSet(string snpset_filenames_file, int wind_size, double rsq_cutoff, bool dominance_flag); 127 void calcu_max_ld_rsq(int wind_size, double rsq_cutoff, bool dominance_flag); 128 void ld_seg(string i_ld_file, int seg_size, int wind_size, double rsq_cutoff, bool dominance_flag); 129 void set_ldscore_adj_flag(bool ldscore_adj); 130 131 void genet_dst(string bfile, string hapmap_genet_map); 132 133 void GWAS_simu(string bfile, int simu_num, string qtl_file, int case_num, int control_num, double hsq, double K, int seed, bool output_causal, bool simu_emb_flag, int eff_mod=0); 134 //void simu_geno_unlinked(int N, int M, double maf); 135 136 void run_massoc_slct(string metafile, int wind_size, double p_cutoff, double collinear, int top_SNPs, bool joint_only, bool GC, double GC_val, bool actual_geno, int mld_slct_alg); 137 void run_massoc_cond(string metafile, string snplistfile, int wind_size, double collinear, bool GC, double GC_val, bool actual_geno); 138 void run_massoc_sblup(string metafile, int wind_size, double lambda); 139 140 void save_plink(); 141 void dose2bed(); 142 143 void read_IRG_fnames(string snp_info_file, string fname_file, double GC_cutoff); 144 145 // population genetics 146 void Fst(string filename); 147 void paa(string aa_file); 148 void ibc(bool ibc_all); 149 150 // mkl 151 void make_grm_mkl(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, bool mlmassoc, bool diag_f3_flag = false); 152 void calcu_mean_rsq_mkl(int wind_size, double rsq_cutoff); 153 void LD_pruning_mkl(double rsq_cutoff, int wind_size); 154 //void make_grm_wt_mkl(string i_ld_file, int wind_m, double wt_ld_cut, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd=0, int wt_mtd=0, bool mlmassoc=false, bool impData_flag=false, int ttl_snp_num=-1); 155 156 // mlma 157 void mlma(string grm_file, bool m_grm_flag, string subtract_grm_file, string phen_file, string qcovar_file, string covar_file, int mphen, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, bool no_constrain, bool within_family, bool inbred, bool no_adj_covar); 158 void mlma_loco(string phen_file, string qcovar_file, string covar_file, int mphen, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, bool no_constrain, bool inbred, bool no_adj_covar); 159 160 // gene based association test 161 void sbat_gene(string sAssoc_file, string gAnno_file, int wind, double sbat_ld_cutoff, bool sbat_write_snpset); 162 void sbat(string sAssoc_file, string snpset_file, double sbat_ld_cutoff, bool sbat_write_snpset); 163 void sbat_seg(string sAssoc_file, int seg_size, double sbat_ld_cutoff, bool sbat_write_snpset); 164 165 166 ///////////////////////// 167 // gene expresion data 168 void read_efile(string efile); 169 170 // ecojo 171 void read_eR(string eR_file); 172 void run_ecojo_slct(string e_metafile, double p_cutoff, double collinear); 173 void run_ecojo_blup_efile(string e_metafile, double lambda); 174 void run_ecojo_blup_eR(string e_metafile, double lambda); 175 176 // ERM 177 void make_erm(int erm_mtd, bool output_bin); 178 179 private: 180 void init_keep(); 181 void init_include(); 182 void get_rsnp(vector<int> &rsnp); 183 void get_rindi(vector<int> &rindi); 184 185 void save_famfile(); 186 void save_bimfile(); 187 void save_bedfile(); 188 189 void update_bim(vector<int> &rsnp); 190 void update_fam(vector<int> &rindi); 191 192 void update_id_map_kp(const vector<string> &id_list, map<string, int> &id_map, vector<int> &keep); 193 void update_id_map_rm(const vector<string> &id_list, map<string, int> &id_map, vector<int> &keep); 194 void read_snplist(string snplistfile, vector<string> &snplist, string msg = "SNPs"); 195 void read_indi_list(string indi_list_file, vector<string> &indi_list); 196 197 bool make_XMat(MatrixXf &X); 198 bool make_XMat_d(MatrixXf &X); 199 //void make_XMat_SNPs(vector< vector<float> > &X, bool miss_with_mu); 200 void std_XMat(MatrixXf &X, eigenVector &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std); 201 void std_XMat_subpopu(string subpopu_file, MatrixXf &X, eigenVector &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std); 202 void std_XMat_d(MatrixXf &X, eigenVector &sd_SNP, bool miss_with_mu, bool divid_by_std); 203 //void std_XMat(vector< vector<float> > &X, vector<double> &sd_SNP, bool grm_xchr_flag, bool divid_by_std = true); 204 void makex_eigenVector(int j, eigenVector &x, bool resize = true, bool minus_2p = false); 205 //void make_XMat_eigenMatrix(MatrixXf &X); 206 bool make_XMat_subset(MatrixXf &X, vector<int> &snp_indx, bool divid_by_std); 207 bool make_XMat_d_subset(MatrixXf &X, vector<int> &snp_indx, bool divid_by_std); 208 209 210 void calcu_mu(bool ssq_flag = false); 211 void calcu_maf(); 212 void mu_func(int j, vector<double> &fac); 213 void check_autosome(); 214 void check_chrX(); 215 void check_sex(); 216 217 // grm 218 void calcu_grm_var(double &diag_m, double &diag_v, double &off_m, double &off_v); 219 int read_grm_id(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only); 220 void read_grm(string grm_file, vector<string> &grm_id, bool out_id_log = true, bool read_id_only = false, bool dont_read_N = false); 221 void read_grm_gz(string grm_file, vector<string> &grm_id, bool out_id_log = true, bool read_id_only = false); 222 void read_grm_bin(string grm_file, vector<string> &grm_id, bool out_id_log = true, bool read_id_only = false, bool dont_read_N = false); 223 void read_grm_filenames(string merge_grm_file, vector<string> &grm_files, bool out_log = true); 224 void merge_grm(string merge_grm_file); 225 void rm_cor_indi(double grm_cutoff); 226 void adj_grm(double adj_grm_fac); 227 void dc(int dosage_compen); 228 void manipulate_grm(string grm_file, string keep_indi_file, string remove_indi_file, string sex_file, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool merge_grm_flag, bool dont_read_N = false); 229 void output_grm_vec(vector< vector<float> > &A, vector< vector<int> > &A_N, bool output_grm_bin); 230 void output_grm(bool output_grm_bin); 231 232 // reml 233 void read_phen(string phen_file, vector<string> &phen_ID, vector< vector<string> > &phen_buf, int mphen, int mphen2 = 0); 234 int read_fac(ifstream &ifstrm, vector<string> &ID, vector< vector<string> > &fac); 235 int read_covar(string covar_file, vector<string> &covar_ID, vector< vector<string> > &covar, bool qcovar_flag); 236 int read_GE(string GE_file, vector<string> &GE_ID, vector< vector<string> > &GE, bool qGE_flag = false); 237 bool check_case_control(double &ncase, eigenVector &y); 238 double transform_hsq_L(double P, double K, double hsq); 239 int constrain_varcmp(eigenVector &varcmp); 240 void drop_comp(vector<int> &drop); 241 void construct_X(int n, map<string, int> &uni_id_map, bool qcovar_flag, int qcovar_num, vector<string> &qcovar_ID, vector< vector<string> > &qcovar, bool covar_flag, int covar_num, vector<string> &covar_ID, vector< vector<string> > &covar, vector<eigenMatrix> &E_float, eigenMatrix &qE_float); 242 void coeff_mat(const vector<string> &vec, eigenMatrix &coeff_mat, string errmsg1, string errmsg2); 243 void reml(bool pred_rand_eff, bool est_fix_eff, vector<double> &reml_priors, vector<double> &reml_priors_var, double prevalence, double prevalence2, bool no_constrain, bool no_lrt, bool mlmassoc = false); 244 double reml_iteration(eigenMatrix &Vi_X, eigenMatrix &Xt_Vi_X_i, eigenMatrix &Hi, eigenVector &Py, eigenVector &varcmp, bool prior_var_flag, bool no_constrain, bool reml_bivar_fix_rg = false); 245 void init_varcomp(vector<double> &reml_priors_var, vector<double> &reml_priors, eigenVector &varcmp); 246 bool calcu_Vi(eigenMatrix &Vi, eigenVector &prev_varcmp, double &logdet, int &iter); 247 bool inverse_H(eigenMatrix &H); 248 bool comput_inverse_logdet_LDLT(eigenMatrix &Vi, double &logdet); 249 void bend_V(eigenMatrix &Vi); 250 void bend_A(); 251 bool bending_eigenval(eigenVector &eval); 252 bool comput_inverse_logdet_PLU(eigenMatrix &Vi, double &logdet); 253 bool comput_inverse_logdet_LU(eigenMatrix &Vi, double &logdet); 254 double calcu_P(eigenMatrix &Vi, eigenMatrix &Vi_X, eigenMatrix &Xt_Vi_X_i, eigenMatrix &P); 255 void calcu_Hi(eigenMatrix &P, eigenMatrix &Hi); 256 void reml_equation(eigenMatrix &P, eigenMatrix &Hi, eigenVector &Py, eigenVector &varcmp); 257 double lgL_reduce_mdl(bool no_constrain); 258 void em_reml(eigenMatrix &P, eigenVector &Py, eigenVector &prev_varcmp, eigenVector &varcmp); 259 void ai_reml(eigenMatrix &P, eigenMatrix &Hi, eigenVector &Py, eigenVector &prev_varcmp, eigenVector &varcmp, double dlogL); 260 void calcu_tr_PA(eigenMatrix &P, eigenVector &tr_PA); 261 void calcu_Vp(double &Vp, double &Vp2, double &VarVp, double &VarVp2, eigenVector &varcmp, eigenMatrix &Hi); 262 void calcu_hsq(int i, double Vp, double Vp2, double VarVp, double VarVp2, double &hsq, double &var_hsq, eigenVector &varcmp, eigenMatrix &Hi); 263 void calcu_sum_hsq(double Vp, double VarVp, double &sum_hsq, double &var_sum_hsq, eigenVector &varcmp, eigenMatrix &Hi); 264 void output_blup_snp(eigenMatrix &b_SNP); 265 266 // within-family reml analysis 267 void detect_family(); 268 bool calcu_Vi_within_family(eigenMatrix &Vi, eigenVector &prev_varcmp, double &logdet, int &iter); 269 270 // bivariate REML analysis 271 void calcu_rg(eigenVector &varcmp, eigenMatrix &Hi, eigenVector &rg, eigenVector &rg_var, vector<string> &rg_name); 272 void update_A(eigenVector &prev_varcmp); 273 void constrain_rg(eigenVector &varcmp); 274 double lgL_fix_rg(eigenVector &prev_varcmp, bool no_constrain); 275 bool calcu_Vi_bivar(eigenMatrix &Vi, eigenVector &prev_varcmp, double &logdet, int &iter); 276 277 // GWAS simulation 278 void kosambi(); 279 int read_QTL_file(string qtl_file, vector<string> &qtl_name, vector<int> &qtl_pos, vector<double> &qtl_eff, vector<int> &have_eff); 280 void output_simu_par(vector<string> &qtl_name, vector<int> &qtl_pos, vector<double> &qtl_eff, double Vp); 281 void save_phenfile(vector< vector<double> > &y); 282 // not usefully any more 283 void GenerCases(string bfile, string qtl_file, int case_num, int control_num, double hsq, double K, bool curr_popu = false, double gnrt = 100); 284 285 // LD 286 void EstLD(vector<int> &smpl, double wind_size, vector< vector<string> > &snp, vector< vector<double> > &r, vector<double> &r2, vector<double> &md_r2, vector<double> &max_r2, vector<string> &max_r2_snp, vector<double> &dL, vector<double> &dR, vector<int> &K, vector<string> &L_SNP, vector<string> &R_SNP, double alpha, bool IncldQ); 287 eigenMatrix reg(vector<double> &y, vector<double> &x, vector<double> &rst, bool table = false); 288 void rm_cor_snp(int m, int start, float *rsq, double rsq_cutoff, vector<int> &rm_snp_ID1); 289 void get_ld_blk_pnt(vector<int> &brk_pnt1, vector<int> &brk_pnt2, vector<int> &brk_pnt3, int wind_bp, int wind_snp = 0); 290 void get_ld_blk_pnt_max_limit(vector<int> &brk_pnt1, vector<int> &brk_pnt2, vector<int> &brk_pnt3, int wind_bp, int wind_snp); 291 void get_lds_brkpnt(vector<int> &brk_pnt1, vector<int> &brk_pnt2, int ldwt_seg, int wind_snp_num=0); 292 void calcu_ld_blk(vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff, bool dominance_flag = false); 293 void calcu_ld_blk_split(int size, int size_limit, MatrixXf &X_sub, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second); 294 void calcu_ssx_sqrt_i(eigenVector &ssx_sqrt_i); 295 void calcu_max_ld_rsq_blk(eigenVector &multi_rsq, eigenVector &multi_rsq_adj, eigenVector &max_rsq, vector<int> &max_pos, vector<int> &brk_pnt, double rsq_cutoff, bool dominance_flag); 296 bool bending_eigenval_Xf(VectorXf &eval); 297 void calcu_ld_blk_multiSet(vector<int> &brk_pnt, vector<int> &brk_pnt3, vector< vector<bool> > &set_flag, vector<eigenVector> &mean_rsq, vector<eigenVector> &snp_num, vector<eigenVector> &max_rsq, bool second, double rsq_cutoff, bool dominance_flag); 298 void calcu_ld_blk_split_multiSet(int size, int size_limit, MatrixXf &X_sub, MatrixXf &X_sub2, vector<int> &used_in_this_set, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second); 299 300 // Joint analysis of GWAS MA results 301 void read_metafile(string metafile, bool GC, double GC_val); 302 void init_massoc(string metafile, bool GC, double GC_val); 303 void read_fixed_snp(string snplistfile, string msg, vector<int> &pgiven, vector<int> &remain); 304 void eigenVector2Vector(eigenVector &x, vector<double> &y); 305 //double crossprod(int indx1, int indx2); 306 void get_x_vec(float *x, int indx); 307 void get_x_mat(float *x, vector<int> &indx); 308 void vec_t_mat(float *vec, int nrow, float *mat, int ncol, eigenVector &out); // 1 x n vector multiplied by m x n matrix 309 void stepwise_slct(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC, int mld_slct_alg, int top_SNPs); 310 bool slct_entry(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC); 311 void slct_stay(vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ); 312 double massoc_calcu_Ve(const vector<int> &slct, eigenVector &bJ, eigenVector &b); 313 void massoc_cond(const vector<int> &slct, const vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC); 314 void massoc_joint(const vector<int> &indx, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ); 315 bool init_B(const vector<int> &indx); 316 void init_Z(const vector<int> &indx); 317 bool insert_B_and_Z(const vector<int> &indx, int insert_indx); 318 void erase_B_and_Z(const vector<int> &indx, int erase_indx); 319 void LD_rval(const vector<int> &indx, eigenMatrix &rval); 320 bool massoc_sblup(double lambda, eigenVector &bJ); 321 void massoc_slct_output(bool joint_only, vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ, eigenMatrix &rval); 322 void massoc_cond_output(vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC); 323 324 // read raw genotype data (Illumina) 325 char flip_allele(char a); 326 void read_one_IRG(ofstream &oped, int ind, string IRG_fname, double GC_cutoff); 327 328 // mkl 329 void make_XMat_mkl(float* X, bool grm_d_flag); 330 void std_XMat_mkl(float* X, vector<double> &sd_SNP, bool grm_xchr_flag, bool miss_with_mu = false, bool divid_by_std = true); 331 void std_XMat_d_mkl(float *X, vector<double> &sd_SNP, bool miss_with_mu = false, bool divid_by_std = true); 332 void output_grm_mkl(float* A, bool output_grm_bin); 333 bool comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet); 334 bool comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet); 335 bool comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet); 336 void LD_pruning_blk_mkl(float *X, vector<int> &brk_pnt, double rsq_cutoff, vector<int> &rm_snp_ID1); 337 void calcu_ssx_sqrt_i_mkl(float *X_std, vector<double> &ssx); 338 void calcu_ld_blk_mkl(float *X, vector<double> &ssx, vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff); 339 void calcu_ld_blk_split_mkl(int size, int size_limit, float *X_sub, vector<double> &ssx_sub, double rsq_cutoff, vector<double> &rsq_size, vector<double> &mean_rsq_sub, vector<double> &max_rsq_sub, int s1, int s2, bool second); 340 341 342 // mlma 343 void mlma_calcu_stat(float *y, float *geno_mkl, unsigned long n, unsigned long m, eigenVector &beta, eigenVector &se, eigenVector &pval); 344 void mlma_calcu_stat_covar(float *y, float *geno_mkl, unsigned long n, unsigned long m, eigenVector &beta, eigenVector &se, eigenVector &pval); 345 void grm_minus_grm(float *grm, float *sub_grm); 346 347 // population 348 void read_subpopu(string filename, vector<string> &subpopu, vector<string> &subpopu_name); 349 350 /* 351 // weighting GRM: ldwt_wind = window size for mean LD calculation; ld_seg = block size; 352 void calcu_ld_blk_ldwt(eigenVector &ssx_sqrt_i, vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff, bool adj); 353 void calcu_ld_blk_split_ldwt(int size, int size_limit, int s_pnt, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second, bool adj); 354 void calcu_lds(string i_ld_file, eigenVector &wt, int ldwt_wind, int ldwt_seg, double rsq_cutoff); 355 void calcu_ldak(eigenVector &wt, int ldwt_seg, double rsq_cutoff); 356 void calcu_ldak_blk(eigenVector &wt, eigenVector &sum_rsq, eigenVector &ssx_sqrt_i, vector<int> &brk_pnt, bool second, double rsq_cutoff); 357 void calcu_ldwt(string i_ld_file, eigenVector &wt, int wind_size, double rsq_cutoff); 358 void read_mrsq_mb(string i_ld_file, vector<float> &seq, vector<double> &mrsq_mb, eigenVector &wt, eigenVector &snp_m); 359 void adj_wt_4_maf(eigenVector &wt); 360 void cal_sum_rsq_mb(eigenVector &sum_rsq_mb); 361 void col_std(MatrixXf &X); 362 void assign_snp_2_mb(vector<float> &seq, vector< vector<int> > &maf_bin_pos, int mb_num); 363 void make_grm_pca_blk(vector<int> & maf_bin_pos_i, int ldwt_seg, double &trace); 364 void glpk_simplex_solver(MatrixXf &rsq, eigenVector &mrsq, eigenVector &wt, int maxiter); 365 void calcu_grm_wt_mkl(string i_ld_file, float *X, vector<double> &sd_SNP, eigenVector &wt, int wind_size, double rsq_cutoff, int wt_mtd, int ttl_snp_num); 366 */ 367 368 // gene based association test 369 void sbat_read_snpAssoc(string snpAssoc_file, vector<string> &snp_name, vector<int> &snp_chr, vector<int> &snp_bp, vector<double> &snp_pval); 370 void sbat_read_geneAnno(string gAnno_file, vector<string> &gene_name, vector<int> &gene_chr, vector<int> &gene_bp1, vector<int> &gene_bp2); 371 void sbat_read_snpset(string snpset_file, vector<string> &set_name, vector< vector<string> > &snpset); 372 void sbat_calcu_lambda(vector<int> &snp_indx, VectorXd &eigenval, int &snp_count, double sbat_ld_cutoff, vector<int> &sub_indx); 373 void get_sbat_seg_blk(int seg_size, vector< vector<int> > &snp_set_indx, vector<int> &set_chr, vector<int> &set_start_bp, vector<int> &set_end_bp); 374 void rm_cor_sbat(MatrixXf &R, double R_cutoff, int m, vector<int> &rm_ID1); 375 376 377 ////////////////////// 378 // gene expresion data 379 void init_e_include(); 380 void std_probe(vector< vector<bool> > &X_bool, bool divid_by_std); 381 void std_probe_ind(vector< vector<bool> > &X_bool, bool divid_by_std); 382 383 // ecojo 384 void calcu_eR(); 385 void read_e_metafile(string e_metafile); 386 void ecojo_slct_output(bool joint_only, vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ); 387 void ecojo_slct(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC); 388 bool ecojo_slct_entry(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC); 389 void ecojo_slct_stay(vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ); 390 void ecojo_joint(const vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ); 391 void ecojo_cond(const vector<int> &slct, const vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC); 392 bool ecojo_init_R(const vector<int> &slct); 393 void ecojo_init_RC(const vector<int> &slct, const vector<int> &remain); 394 bool ecojo_insert_R(const vector<int> &slct, int insert_indx); 395 void ecojo_erase_R(const vector<int> &slct); 396 void ecojo_inv_R(); 397 void ecojo_blup(double lambda); 398 399 // inline functions 400 template<typename ElemType> 401 void makex(int j, vector<ElemType> &x, bool minus_2p = false) { 402 int i = 0; 403 x.resize(_keep.size()); 404 for (i = 0; i < _keep.size(); i++) { 405 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) { 406 if (_allele1[_include[j]] == _ref_A[_include[j]]) x[i] = (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]); 407 else x[i] = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]); 408 } else x[i] = _mu[_include[j]]; 409 if (minus_2p) x[i] -= _mu[_include[j]]; 410 } 411 } 412 413 private: 414 // read in plink files 415 // bim file 416 int _autosome_num; 417 vector<int> _chr; 418 vector<string> _snp_name; 419 map<string, int> _snp_name_map; 420 vector<double> _genet_dst; 421 vector<int> _bp; 422 vector<string> _allele1; 423 vector<string> _allele2; 424 vector<string> _ref_A; // reference allele 425 vector<string> _other_A; // the other allele 426 int _snp_num; 427 vector<double> _rc_rate; 428 vector<int> _include; // initialized in the read_bimfile() 429 eigenVector _maf; 430 431 // fam file 432 vector<string> _fid; 433 vector<string> _pid; 434 map<string, int> _id_map; 435 vector<string> _fa_id; 436 vector<string> _mo_id; 437 vector<int> _sex; 438 vector<double> _pheno; 439 int _indi_num; 440 vector<int> _keep; // initialized in the read_famfile() 441 eigenMatrix _varcmp_Py; // BLUP solution to the total genetic effects of individuals 442 443 // bed file 444 vector< vector<bool> > _snp_1; 445 vector< vector<bool> > _snp_2; 446 447 // imputed data 448 bool _dosage_flag; 449 vector< vector<float> > _geno_dose; 450 vector<double> _impRsq; 451 452 // genotypes 453 MatrixXf _geno; 454 455 // QC 456 double _rm_ld_cutoff; 457 458 // grm 459 MatrixXf _grm_N; 460 eigenMatrix _grm; 461 float * _grm_mkl; 462 float * _geno_mkl; 463 bool _grm_bin_flag; 464 465 // reml 466 int _n; 467 int _X_c; 468 vector<int> _r_indx; 469 vector<int> _r_indx_drop; 470 int _reml_max_iter; 471 int _reml_mtd; 472 int _reml_inv_mtd; 473 eigenMatrix _X; 474 vector<eigenMatrix> _A; 475 eigenVector _y; 476 eigenMatrix _Vi; 477 eigenMatrix _P; 478 eigenVector _b; 479 vector<string> _var_name; 480 vector<double> _varcmp; 481 vector<string> _hsq_name; 482 double _y_Ssq; 483 double _ncase; 484 bool _flag_CC; 485 bool _flag_CC2; 486 bool _reml_diag_one; 487 bool _reml_have_bend_A; 488 int _V_inv_mtd; 489 bool _reml_force_inv; 490 bool _reml_AI_not_invertible; 491 bool _reml_force_converge; 492 bool _reml_no_converge; 493 bool _reml_fixed_var; 494 495 // within-family reml analysis 496 bool _within_family; 497 vector<int> _fam_brk_pnt; 498 499 // bivariate reml 500 bool _bivar_reml; 501 bool _ignore_Ce; 502 bool _bivar_no_constrain; 503 double _y2_Ssq; 504 double _ncase2; 505 vector< vector<int> > _bivar_pos; 506 vector< vector<int> > _bivar_pos_prev; 507 vector< eigenSparseMat > _Asp; 508 vector< eigenSparseMat > _Asp_prev; 509 vector<eigenMatrix> _A_prev; 510 vector<double> _fixed_rg_val; 511 512 vector<double> _mu; 513 string _out; 514 bool _save_ram; 515 516 // LD 517 vector<string> _ld_target_snp; 518 bool _ldscore_adj; 519 520 // joint analysis of META 521 bool _jma_actual_geno; 522 int _jma_wind_size; 523 double _jma_p_cutoff; 524 double _jma_collinear; 525 double _jma_Vp; 526 double _jma_Ve; 527 double _GC_val; 528 int _jma_snpnum_backward; 529 int _jma_snpnum_collienar; 530 eigenVector _freq; 531 eigenVector _beta; 532 eigenVector _beta_se; 533 eigenVector _pval; 534 eigenVector _N_o; 535 eigenVector _Nd; 536 eigenVector _MSX; 537 eigenVector _MSX_B; 538 eigenSparseMat _B_N; 539 eigenSparseMat _B; 540 eigenMatrix _B_N_i; 541 eigenMatrix _B_i; 542 eigenVector _D_N; 543 eigenSparseMat _Z_N; 544 eigenSparseMat _Z; 545 546 // gene-trait association 547 map<string, int> _probe_name_map; 548 vector<string> _probe_name; 549 int _probe_num; 550 eigenMatrix _probe_data; 551 vector<int> _e_include; 552 eigenVector _ecojo_z; 553 eigenVector _ecojo_b; 554 eigenVector _ecojo_se; 555 eigenVector _ecojo_n; 556 eigenVector _ecojo_pval; 557 eigenMatrix _ecojo_R; 558 eigenMatrix _ecojo_RC; 559 eigenMatrix _ecojo_wholeR; 560 double _ecojo_p_cutoff; 561 double _ecojo_collinear; 562 }; 563 564 class locus_bp { 565 public: 566 string locus_name; 567 int chr; 568 int bp; 569 locus_bp(string locus_name_buf,int chr_buf,int bp_buf)570 locus_bp(string locus_name_buf, int chr_buf, int bp_buf) { 571 locus_name = locus_name_buf; 572 chr = chr_buf; 573 bp = bp_buf; 574 } 575 operator()576 bool operator()(const locus_bp & other) { 577 return (chr == other.chr && bp <= other.bp); 578 } 579 }; 580 581 #endif 582