1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Interface to all the GCTA functions
5  *
6  * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #ifndef _GCTA_H
14 #define _GCTA_H
15 
16 #ifndef EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
17 #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
18 #endif
19 
20 //#ifndef EIGEN_USE_MKL_ALL
21 //#define EIGEN_USE_MKL_ALL
22 //#endif
23 
24 #include "CommFunc.h"
25 #include "StrFunc.h"
26 #include "StatFunc.h"
27 #include "eigen_func.h"
28 #include <fstream>
29 #include <iomanip>
30 #include <bitset>
31 #include <map>
32 //#include <random>
33 #include "zfstream.h"
34 #include <Eigen/Dense>
35 #include <Eigen/Sparse>
36 #include <unsupported/Eigen/SparseExtra>
37 #include <unsupported/Eigen/IterativeSolvers>
38 #include <omp.h>
39 #include <cblas.h>
40 #include <lapacke.h>
41 // Looks like openblas, cblas and lapacke are all that's needed
42 //#include <blaspp.h>
43 //#include <lapackpp.h>
44 
45 using namespace Eigen;
46 using namespace std;
47 
48 #ifdef SINGLE_PRECISION
49 typedef DiagonalMatrix<float, Dynamic, Dynamic> eigenDiagMat;
50 typedef MatrixXf eigenMatrix;
51 typedef VectorXf eigenVector;
52 typedef SparseMatrix<float> eigenSparseMat;
53 typedef DynamicSparseMatrix<float> eigenDynSparseMat;
54 #else
55 typedef DiagonalMatrix<double, Dynamic, Dynamic> eigenDiagMat;
56 typedef MatrixXd eigenMatrix;
57 typedef VectorXd eigenVector;
58 typedef SparseMatrix<double> eigenSparseMat;
59 typedef DynamicSparseMatrix<double> eigenDynSparseMat;
60 #endif
61 
62 class gcta {
63 public:
64     gcta(int autosome_num, double rm_ld_cutoff, string out);
65     gcta();
66     virtual ~gcta();
67 
68     void read_famfile(string famfile);
69     void read_bimfile(string bimfile);
70     void read_bedfile(string bedfile);
71     void read_imp_info_mach_gz(string zinfofile);
72     void read_imp_info_mach(string infofile);
73     void read_imp_dose_mach_gz(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file);
74     void read_imp_dose_mach(string dosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file);
75     void read_imp_info_beagle(string zinfofile);
76     void read_imp_dose_beagle(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file);
77     void update_ref_A(string ref_A_file);
78     void update_impRsq(string zinfofile);
79     void update_freq(string freq);
80     void save_freq(bool ssq_flag);
81     void extract_snp(string snplistfile);
82     void extract_single_snp(string snpname);
83     void extract_region_snp(string snpname, int wind_size);
84     void extract_region_bp(int chr, int bp, int wind_size);
85     void exclude_snp(string snplistfile);
86     void exclude_single_snp(string snpname);
87     void exclude_region_snp(string snpname, int wind_size);
88     void exclude_region_bp(int chr, int bp, int wind_size);
89     void extract_chr(int chr_start, int chr_end);
90     void filter_snp_maf(double maf);
91     void filter_snp_max_maf(double max_maf);
92     void filter_impRsq(double rsq_cutoff);
93     void keep_indi(string indi_list_file);
94     void remove_indi(string indi_list_file);
95     void update_sex(string sex_file);
96     void read_indi_blup(string blup_indi_file);
97     void save_XMat(bool miss_with_mu, bool std);
98 
99     void make_grm(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, bool mlmassoc, bool diag_f3_flag = false, string subpopu_file = "");
100     //void make_grm_pca(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, double wind_size, bool mlmassoc);
101     void save_grm(string grm_file, string keep_indi_file, string remove_indi_file, string sex_file, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool merge_grm_flag, bool output_grm_bin);
102     void pca(string grm_file, string keep_indi_file, string remove_indi_file, double grm_cutoff, bool merge_grm_flag, int out_pc_num);
103     void snp_pc_loading(string pc_file, int grm_N);
104 
105     // bigK + smallK method
106     void grm_bK(string grm_file, string keep_indi_file, string remove_indi_file, double threshold, bool grm_out_bin_flag);
107 
108     void enable_grm_bin_flag();
109     void fit_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string qGE_file, string GE_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, bool no_constrain, bool mlmassoc = false, bool within_family = false, bool reml_bending = false, bool reml_diag_one = false);
110     void HE_reg(string grm_file, string phen_file, string keep_indi_file, string remove_indi_file, int mphen);
111     void blup_snp_geno();
112     void blup_snp_dosage();
113     void set_reml_force_inv();
114     void set_reml_force_converge();
115     void set_reml_no_converge();
116     void set_reml_fixed_var();
117     void set_reml_mtd(int reml_mtd);
118 
119     // bivariate REML analysis
120     void fit_bivar_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, int mphen2, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, double prevalence2, bool no_constrain, bool ignore_Ce, vector<double> &fixed_rg_val, bool bivar_no_constrain);
121 
122     // LD
123     void read_LD_target_SNPs(string snplistfile);
124     void LD_Blocks(int stp, double wind_size, double alpha, bool IncldQ = true, bool save_ram = false);
125     void calcu_mean_rsq(int wind_size, double rsq_cutoff, bool dominance_flag);
126     void calcu_mean_rsq_multiSet(string snpset_filenames_file, int wind_size, double rsq_cutoff, bool dominance_flag);
127     void calcu_max_ld_rsq(int wind_size, double rsq_cutoff, bool dominance_flag);
128     void ld_seg(string i_ld_file, int seg_size, int wind_size, double rsq_cutoff, bool dominance_flag);
129     void set_ldscore_adj_flag(bool ldscore_adj);
130 
131     void genet_dst(string bfile, string hapmap_genet_map);
132 
133     void GWAS_simu(string bfile, int simu_num, string qtl_file, int case_num, int control_num, double hsq, double K, int seed, bool output_causal, bool simu_emb_flag, int eff_mod=0);
134     //void simu_geno_unlinked(int N, int M, double maf);
135 
136     void run_massoc_slct(string metafile, int wind_size, double p_cutoff, double collinear, int top_SNPs, bool joint_only, bool GC, double GC_val, bool actual_geno, int mld_slct_alg);
137     void run_massoc_cond(string metafile, string snplistfile, int wind_size, double collinear, bool GC, double GC_val, bool actual_geno);
138     void run_massoc_sblup(string metafile, int wind_size, double lambda);
139 
140     void save_plink();
141     void dose2bed();
142 
143     void read_IRG_fnames(string snp_info_file, string fname_file, double GC_cutoff);
144 
145     // population genetics
146     void Fst(string filename);
147     void paa(string aa_file);
148     void ibc(bool ibc_all);
149 
150     // mkl
151     void make_grm_mkl(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, bool mlmassoc, bool diag_f3_flag = false);
152     void calcu_mean_rsq_mkl(int wind_size, double rsq_cutoff);
153     void LD_pruning_mkl(double rsq_cutoff, int wind_size);
154     //void make_grm_wt_mkl(string i_ld_file, int wind_m, double wt_ld_cut, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd=0, int wt_mtd=0, bool mlmassoc=false, bool impData_flag=false, int ttl_snp_num=-1);
155 
156     // mlma
157     void mlma(string grm_file, bool m_grm_flag, string subtract_grm_file, string phen_file, string qcovar_file, string covar_file, int mphen, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, bool no_constrain, bool within_family, bool inbred, bool no_adj_covar);
158     void mlma_loco(string phen_file, string qcovar_file, string covar_file, int mphen, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, bool no_constrain, bool inbred, bool no_adj_covar);
159 
160     // gene based association test
161     void sbat_gene(string sAssoc_file, string gAnno_file, int wind, double sbat_ld_cutoff, bool sbat_write_snpset);
162     void sbat(string sAssoc_file, string snpset_file, double sbat_ld_cutoff, bool sbat_write_snpset);
163     void sbat_seg(string sAssoc_file, int seg_size, double sbat_ld_cutoff, bool sbat_write_snpset);
164 
165 
166     /////////////////////////
167     // gene expresion data
168     void read_efile(string efile);
169 
170     // ecojo
171     void read_eR(string eR_file);
172     void run_ecojo_slct(string e_metafile, double p_cutoff, double collinear);
173     void run_ecojo_blup_efile(string e_metafile, double lambda);
174     void run_ecojo_blup_eR(string e_metafile, double lambda);
175 
176     // ERM
177     void make_erm(int erm_mtd, bool output_bin);
178 
179 private:
180     void init_keep();
181     void init_include();
182     void get_rsnp(vector<int> &rsnp);
183     void get_rindi(vector<int> &rindi);
184 
185     void save_famfile();
186     void save_bimfile();
187     void save_bedfile();
188 
189     void update_bim(vector<int> &rsnp);
190     void update_fam(vector<int> &rindi);
191 
192     void update_id_map_kp(const vector<string> &id_list, map<string, int> &id_map, vector<int> &keep);
193     void update_id_map_rm(const vector<string> &id_list, map<string, int> &id_map, vector<int> &keep);
194     void read_snplist(string snplistfile, vector<string> &snplist, string msg = "SNPs");
195     void read_indi_list(string indi_list_file, vector<string> &indi_list);
196 
197     bool make_XMat(MatrixXf &X);
198     bool make_XMat_d(MatrixXf &X);
199     //void make_XMat_SNPs(vector< vector<float> > &X, bool miss_with_mu);
200     void std_XMat(MatrixXf &X, eigenVector &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std);
201     void std_XMat_subpopu(string subpopu_file, MatrixXf &X, eigenVector &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std);
202     void std_XMat_d(MatrixXf &X, eigenVector &sd_SNP, bool miss_with_mu, bool divid_by_std);
203     //void std_XMat(vector< vector<float> > &X, vector<double> &sd_SNP, bool grm_xchr_flag, bool divid_by_std = true);
204     void makex_eigenVector(int j, eigenVector &x, bool resize = true, bool minus_2p = false);
205     //void make_XMat_eigenMatrix(MatrixXf &X);
206     bool make_XMat_subset(MatrixXf &X, vector<int> &snp_indx, bool divid_by_std);
207     bool make_XMat_d_subset(MatrixXf &X, vector<int> &snp_indx, bool divid_by_std);
208 
209 
210     void calcu_mu(bool ssq_flag = false);
211     void calcu_maf();
212     void mu_func(int j, vector<double> &fac);
213     void check_autosome();
214     void check_chrX();
215     void check_sex();
216 
217     // grm
218     void calcu_grm_var(double &diag_m, double &diag_v, double &off_m, double &off_v);
219     int read_grm_id(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only);
220     void read_grm(string grm_file, vector<string> &grm_id, bool out_id_log = true, bool read_id_only = false, bool dont_read_N = false);
221     void read_grm_gz(string grm_file, vector<string> &grm_id, bool out_id_log = true, bool read_id_only = false);
222     void read_grm_bin(string grm_file, vector<string> &grm_id, bool out_id_log = true, bool read_id_only = false, bool dont_read_N = false);
223     void read_grm_filenames(string merge_grm_file, vector<string> &grm_files, bool out_log = true);
224     void merge_grm(string merge_grm_file);
225     void rm_cor_indi(double grm_cutoff);
226     void adj_grm(double adj_grm_fac);
227     void dc(int dosage_compen);
228     void manipulate_grm(string grm_file, string keep_indi_file, string remove_indi_file, string sex_file, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool merge_grm_flag, bool dont_read_N = false);
229     void output_grm_vec(vector< vector<float> > &A, vector< vector<int> > &A_N, bool output_grm_bin);
230     void output_grm(bool output_grm_bin);
231 
232     // reml
233     void read_phen(string phen_file, vector<string> &phen_ID, vector< vector<string> > &phen_buf, int mphen, int mphen2 = 0);
234     int read_fac(ifstream &ifstrm, vector<string> &ID, vector< vector<string> > &fac);
235     int read_covar(string covar_file, vector<string> &covar_ID, vector< vector<string> > &covar, bool qcovar_flag);
236     int read_GE(string GE_file, vector<string> &GE_ID, vector< vector<string> > &GE, bool qGE_flag = false);
237     bool check_case_control(double &ncase, eigenVector &y);
238     double transform_hsq_L(double P, double K, double hsq);
239     int constrain_varcmp(eigenVector &varcmp);
240     void drop_comp(vector<int> &drop);
241     void construct_X(int n, map<string, int> &uni_id_map, bool qcovar_flag, int qcovar_num, vector<string> &qcovar_ID, vector< vector<string> > &qcovar, bool covar_flag, int covar_num, vector<string> &covar_ID, vector< vector<string> > &covar, vector<eigenMatrix> &E_float, eigenMatrix &qE_float);
242     void coeff_mat(const vector<string> &vec, eigenMatrix &coeff_mat, string errmsg1, string errmsg2);
243     void reml(bool pred_rand_eff, bool est_fix_eff, vector<double> &reml_priors, vector<double> &reml_priors_var, double prevalence, double prevalence2, bool no_constrain, bool no_lrt, bool mlmassoc = false);
244     double reml_iteration(eigenMatrix &Vi_X, eigenMatrix &Xt_Vi_X_i, eigenMatrix &Hi, eigenVector &Py, eigenVector &varcmp, bool prior_var_flag, bool no_constrain, bool reml_bivar_fix_rg = false);
245     void init_varcomp(vector<double> &reml_priors_var, vector<double> &reml_priors, eigenVector &varcmp);
246     bool calcu_Vi(eigenMatrix &Vi, eigenVector &prev_varcmp, double &logdet, int &iter);
247     bool inverse_H(eigenMatrix &H);
248     bool comput_inverse_logdet_LDLT(eigenMatrix &Vi, double &logdet);
249     void bend_V(eigenMatrix &Vi);
250     void bend_A();
251     bool bending_eigenval(eigenVector &eval);
252     bool comput_inverse_logdet_PLU(eigenMatrix &Vi, double &logdet);
253     bool comput_inverse_logdet_LU(eigenMatrix &Vi, double &logdet);
254     double calcu_P(eigenMatrix &Vi, eigenMatrix &Vi_X, eigenMatrix &Xt_Vi_X_i, eigenMatrix &P);
255     void calcu_Hi(eigenMatrix &P, eigenMatrix &Hi);
256     void reml_equation(eigenMatrix &P, eigenMatrix &Hi, eigenVector &Py, eigenVector &varcmp);
257     double lgL_reduce_mdl(bool no_constrain);
258     void em_reml(eigenMatrix &P, eigenVector &Py, eigenVector &prev_varcmp, eigenVector &varcmp);
259     void ai_reml(eigenMatrix &P, eigenMatrix &Hi, eigenVector &Py, eigenVector &prev_varcmp, eigenVector &varcmp, double dlogL);
260     void calcu_tr_PA(eigenMatrix &P, eigenVector &tr_PA);
261     void calcu_Vp(double &Vp, double &Vp2, double &VarVp, double &VarVp2, eigenVector &varcmp, eigenMatrix &Hi);
262     void calcu_hsq(int i, double Vp, double Vp2, double VarVp, double VarVp2, double &hsq, double &var_hsq, eigenVector &varcmp, eigenMatrix &Hi);
263     void calcu_sum_hsq(double Vp, double VarVp, double &sum_hsq, double &var_sum_hsq, eigenVector &varcmp, eigenMatrix &Hi);
264     void output_blup_snp(eigenMatrix &b_SNP);
265 
266     // within-family reml analysis
267     void detect_family();
268     bool calcu_Vi_within_family(eigenMatrix &Vi, eigenVector &prev_varcmp, double &logdet, int &iter);
269 
270     // bivariate REML analysis
271     void calcu_rg(eigenVector &varcmp, eigenMatrix &Hi, eigenVector &rg, eigenVector &rg_var, vector<string> &rg_name);
272     void update_A(eigenVector &prev_varcmp);
273     void constrain_rg(eigenVector &varcmp);
274     double lgL_fix_rg(eigenVector &prev_varcmp, bool no_constrain);
275     bool calcu_Vi_bivar(eigenMatrix &Vi, eigenVector &prev_varcmp, double &logdet, int &iter);
276 
277     // GWAS simulation
278     void kosambi();
279     int read_QTL_file(string qtl_file, vector<string> &qtl_name, vector<int> &qtl_pos, vector<double> &qtl_eff, vector<int> &have_eff);
280     void output_simu_par(vector<string> &qtl_name, vector<int> &qtl_pos, vector<double> &qtl_eff, double Vp);
281     void save_phenfile(vector< vector<double> > &y);
282     // not usefully any more
283     void GenerCases(string bfile, string qtl_file, int case_num, int control_num, double hsq, double K, bool curr_popu = false, double gnrt = 100);
284 
285     // LD
286     void EstLD(vector<int> &smpl, double wind_size, vector< vector<string> > &snp, vector< vector<double> > &r, vector<double> &r2, vector<double> &md_r2, vector<double> &max_r2, vector<string> &max_r2_snp, vector<double> &dL, vector<double> &dR, vector<int> &K, vector<string> &L_SNP, vector<string> &R_SNP, double alpha, bool IncldQ);
287     eigenMatrix reg(vector<double> &y, vector<double> &x, vector<double> &rst, bool table = false);
288     void rm_cor_snp(int m, int start, float *rsq, double rsq_cutoff, vector<int> &rm_snp_ID1);
289     void get_ld_blk_pnt(vector<int> &brk_pnt1, vector<int> &brk_pnt2, vector<int> &brk_pnt3, int wind_bp, int wind_snp = 0);
290     void get_ld_blk_pnt_max_limit(vector<int> &brk_pnt1, vector<int> &brk_pnt2, vector<int> &brk_pnt3, int wind_bp, int wind_snp);
291     void get_lds_brkpnt(vector<int> &brk_pnt1, vector<int> &brk_pnt2, int ldwt_seg, int wind_snp_num=0);
292     void calcu_ld_blk(vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff, bool dominance_flag = false);
293     void calcu_ld_blk_split(int size, int size_limit, MatrixXf &X_sub, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second);
294     void calcu_ssx_sqrt_i(eigenVector &ssx_sqrt_i);
295     void calcu_max_ld_rsq_blk(eigenVector &multi_rsq, eigenVector &multi_rsq_adj, eigenVector &max_rsq, vector<int> &max_pos, vector<int> &brk_pnt, double rsq_cutoff, bool dominance_flag);
296     bool bending_eigenval_Xf(VectorXf &eval);
297     void calcu_ld_blk_multiSet(vector<int> &brk_pnt, vector<int> &brk_pnt3, vector< vector<bool> > &set_flag, vector<eigenVector> &mean_rsq, vector<eigenVector> &snp_num, vector<eigenVector> &max_rsq, bool second, double rsq_cutoff, bool dominance_flag);
298     void calcu_ld_blk_split_multiSet(int size, int size_limit, MatrixXf &X_sub, MatrixXf &X_sub2, vector<int> &used_in_this_set, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second);
299 
300     // Joint analysis of GWAS MA results
301     void read_metafile(string metafile, bool GC, double GC_val);
302     void init_massoc(string metafile, bool GC, double GC_val);
303     void read_fixed_snp(string snplistfile, string msg, vector<int> &pgiven, vector<int> &remain);
304     void eigenVector2Vector(eigenVector &x, vector<double> &y);
305     //double crossprod(int indx1, int indx2);
306     void get_x_vec(float *x, int indx);
307     void get_x_mat(float *x, vector<int> &indx);
308     void vec_t_mat(float *vec, int nrow, float *mat, int ncol, eigenVector &out); // 1 x n vector multiplied by m x n matrix
309     void stepwise_slct(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC, int mld_slct_alg, int top_SNPs);
310     bool slct_entry(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC);
311     void slct_stay(vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ);
312     double massoc_calcu_Ve(const vector<int> &slct, eigenVector &bJ, eigenVector &b);
313     void massoc_cond(const vector<int> &slct, const vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC);
314     void massoc_joint(const vector<int> &indx, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ);
315     bool init_B(const vector<int> &indx);
316     void init_Z(const vector<int> &indx);
317     bool insert_B_and_Z(const vector<int> &indx, int insert_indx);
318     void erase_B_and_Z(const vector<int> &indx, int erase_indx);
319     void LD_rval(const vector<int> &indx, eigenMatrix &rval);
320     bool massoc_sblup(double lambda, eigenVector &bJ);
321     void massoc_slct_output(bool joint_only, vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ, eigenMatrix &rval);
322     void massoc_cond_output(vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC);
323 
324     // read raw genotype data (Illumina)
325     char flip_allele(char a);
326     void read_one_IRG(ofstream &oped, int ind, string IRG_fname, double GC_cutoff);
327 
328     // mkl
329     void make_XMat_mkl(float* X, bool grm_d_flag);
330     void std_XMat_mkl(float* X, vector<double> &sd_SNP, bool grm_xchr_flag, bool miss_with_mu = false, bool divid_by_std = true);
331     void std_XMat_d_mkl(float *X, vector<double> &sd_SNP, bool miss_with_mu = false, bool divid_by_std = true);
332     void output_grm_mkl(float* A, bool output_grm_bin);
333     bool comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet);
334     bool comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet);
335     bool comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet);
336     void LD_pruning_blk_mkl(float *X, vector<int> &brk_pnt, double rsq_cutoff, vector<int> &rm_snp_ID1);
337     void calcu_ssx_sqrt_i_mkl(float *X_std, vector<double> &ssx);
338     void calcu_ld_blk_mkl(float *X, vector<double> &ssx, vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff);
339     void calcu_ld_blk_split_mkl(int size, int size_limit, float *X_sub, vector<double> &ssx_sub, double rsq_cutoff, vector<double> &rsq_size, vector<double> &mean_rsq_sub, vector<double> &max_rsq_sub, int s1, int s2, bool second);
340 
341 
342     // mlma
343     void mlma_calcu_stat(float *y, float *geno_mkl, unsigned long n, unsigned long m, eigenVector &beta, eigenVector &se, eigenVector &pval);
344     void mlma_calcu_stat_covar(float *y, float *geno_mkl, unsigned long n, unsigned long m, eigenVector &beta, eigenVector &se, eigenVector &pval);
345     void grm_minus_grm(float *grm, float *sub_grm);
346 
347     // population
348     void read_subpopu(string filename, vector<string> &subpopu, vector<string> &subpopu_name);
349 
350     /*
351     // weighting GRM: ldwt_wind = window size for mean LD calculation; ld_seg = block size;
352     void calcu_ld_blk_ldwt(eigenVector &ssx_sqrt_i, vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff, bool adj);
353     void calcu_ld_blk_split_ldwt(int size, int size_limit, int s_pnt, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second, bool adj);
354     void calcu_lds(string i_ld_file, eigenVector &wt, int ldwt_wind, int ldwt_seg, double rsq_cutoff);
355     void calcu_ldak(eigenVector &wt, int ldwt_seg, double rsq_cutoff);
356     void calcu_ldak_blk(eigenVector &wt, eigenVector &sum_rsq, eigenVector &ssx_sqrt_i, vector<int> &brk_pnt, bool second, double rsq_cutoff);
357     void calcu_ldwt(string i_ld_file, eigenVector &wt, int wind_size, double rsq_cutoff);
358     void read_mrsq_mb(string i_ld_file, vector<float> &seq, vector<double> &mrsq_mb, eigenVector &wt, eigenVector &snp_m);
359     void adj_wt_4_maf(eigenVector &wt);
360     void cal_sum_rsq_mb(eigenVector &sum_rsq_mb);
361     void col_std(MatrixXf &X);
362     void assign_snp_2_mb(vector<float> &seq, vector< vector<int> > &maf_bin_pos, int mb_num);
363     void make_grm_pca_blk(vector<int> & maf_bin_pos_i, int ldwt_seg, double &trace);
364     void glpk_simplex_solver(MatrixXf &rsq, eigenVector &mrsq, eigenVector &wt, int maxiter);
365     void calcu_grm_wt_mkl(string i_ld_file, float *X, vector<double> &sd_SNP, eigenVector &wt, int wind_size, double rsq_cutoff, int wt_mtd, int ttl_snp_num);
366     */
367 
368     // gene based association test
369     void sbat_read_snpAssoc(string snpAssoc_file, vector<string> &snp_name, vector<int> &snp_chr, vector<int> &snp_bp, vector<double> &snp_pval);
370     void sbat_read_geneAnno(string gAnno_file, vector<string> &gene_name, vector<int> &gene_chr, vector<int> &gene_bp1, vector<int> &gene_bp2);
371     void sbat_read_snpset(string snpset_file, vector<string> &set_name, vector< vector<string> > &snpset);
372     void sbat_calcu_lambda(vector<int> &snp_indx, VectorXd &eigenval, int &snp_count, double sbat_ld_cutoff, vector<int> &sub_indx);
373     void get_sbat_seg_blk(int seg_size, vector< vector<int> > &snp_set_indx, vector<int> &set_chr, vector<int> &set_start_bp, vector<int> &set_end_bp);
374     void rm_cor_sbat(MatrixXf &R, double R_cutoff, int m, vector<int> &rm_ID1);
375 
376 
377     //////////////////////
378     // gene expresion data
379     void init_e_include();
380     void std_probe(vector< vector<bool> > &X_bool, bool divid_by_std);
381     void std_probe_ind(vector< vector<bool> > &X_bool, bool divid_by_std);
382 
383     // ecojo
384     void calcu_eR();
385     void read_e_metafile(string e_metafile);
386     void ecojo_slct_output(bool joint_only, vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ);
387     void ecojo_slct(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC);
388     bool ecojo_slct_entry(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC);
389     void ecojo_slct_stay(vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ);
390     void ecojo_joint(const vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ);
391     void ecojo_cond(const vector<int> &slct, const vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC);
392     bool ecojo_init_R(const vector<int> &slct);
393     void ecojo_init_RC(const vector<int> &slct, const vector<int> &remain);
394     bool ecojo_insert_R(const vector<int> &slct, int insert_indx);
395     void ecojo_erase_R(const vector<int> &slct);
396     void ecojo_inv_R();
397     void ecojo_blup(double lambda);
398 
399     // inline functions
400     template<typename ElemType>
401     void makex(int j, vector<ElemType> &x, bool minus_2p = false) {
402         int i = 0;
403         x.resize(_keep.size());
404         for (i = 0; i < _keep.size(); i++) {
405             if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
406                 if (_allele1[_include[j]] == _ref_A[_include[j]]) x[i] = (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
407                 else x[i] = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
408             } else x[i] = _mu[_include[j]];
409             if (minus_2p) x[i] -= _mu[_include[j]];
410         }
411     }
412 
413 private:
414     // read in plink files
415     // bim file
416     int _autosome_num;
417     vector<int> _chr;
418     vector<string> _snp_name;
419     map<string, int> _snp_name_map;
420     vector<double> _genet_dst;
421     vector<int> _bp;
422     vector<string> _allele1;
423     vector<string> _allele2;
424     vector<string> _ref_A; // reference allele
425     vector<string> _other_A; // the other allele
426     int _snp_num;
427     vector<double> _rc_rate;
428     vector<int> _include; // initialized in the read_bimfile()
429     eigenVector _maf;
430 
431     // fam file
432     vector<string> _fid;
433     vector<string> _pid;
434     map<string, int> _id_map;
435     vector<string> _fa_id;
436     vector<string> _mo_id;
437     vector<int> _sex;
438     vector<double> _pheno;
439     int _indi_num;
440     vector<int> _keep; // initialized in the read_famfile()
441     eigenMatrix _varcmp_Py; // BLUP solution to the total genetic effects of individuals
442 
443     // bed file
444     vector< vector<bool> > _snp_1;
445     vector< vector<bool> > _snp_2;
446 
447     // imputed data
448     bool _dosage_flag;
449     vector< vector<float> > _geno_dose;
450     vector<double> _impRsq;
451 
452     // genotypes
453     MatrixXf _geno;
454 
455     // QC
456     double _rm_ld_cutoff;
457 
458     // grm
459     MatrixXf _grm_N;
460     eigenMatrix _grm;
461     float * _grm_mkl;
462     float * _geno_mkl;
463     bool _grm_bin_flag;
464 
465     // reml
466     int _n;
467     int _X_c;
468     vector<int> _r_indx;
469     vector<int> _r_indx_drop;
470     int _reml_max_iter;
471     int _reml_mtd;
472     int _reml_inv_mtd;
473     eigenMatrix _X;
474     vector<eigenMatrix> _A;
475     eigenVector _y;
476     eigenMatrix _Vi;
477     eigenMatrix _P;
478     eigenVector _b;
479     vector<string> _var_name;
480     vector<double> _varcmp;
481     vector<string> _hsq_name;
482     double _y_Ssq;
483     double _ncase;
484     bool _flag_CC;
485     bool _flag_CC2;
486     bool _reml_diag_one;
487     bool _reml_have_bend_A;
488     int _V_inv_mtd;
489     bool _reml_force_inv;
490     bool _reml_AI_not_invertible;
491     bool _reml_force_converge;
492     bool _reml_no_converge;
493     bool _reml_fixed_var;
494 
495     // within-family reml analysis
496     bool _within_family;
497     vector<int> _fam_brk_pnt;
498 
499     // bivariate reml
500     bool _bivar_reml;
501     bool _ignore_Ce;
502     bool _bivar_no_constrain;
503     double _y2_Ssq;
504     double _ncase2;
505     vector< vector<int> > _bivar_pos;
506     vector< vector<int> > _bivar_pos_prev;
507     vector< eigenSparseMat > _Asp;
508     vector< eigenSparseMat > _Asp_prev;
509     vector<eigenMatrix> _A_prev;
510     vector<double> _fixed_rg_val;
511 
512     vector<double> _mu;
513     string _out;
514     bool _save_ram;
515 
516     // LD
517     vector<string> _ld_target_snp;
518     bool _ldscore_adj;
519 
520     // joint analysis of META
521     bool _jma_actual_geno;
522     int _jma_wind_size;
523     double _jma_p_cutoff;
524     double _jma_collinear;
525     double _jma_Vp;
526     double _jma_Ve;
527     double _GC_val;
528     int _jma_snpnum_backward;
529     int _jma_snpnum_collienar;
530     eigenVector _freq;
531     eigenVector _beta;
532     eigenVector _beta_se;
533     eigenVector _pval;
534     eigenVector _N_o;
535     eigenVector _Nd;
536     eigenVector _MSX;
537     eigenVector _MSX_B;
538     eigenSparseMat _B_N;
539     eigenSparseMat _B;
540     eigenMatrix _B_N_i;
541     eigenMatrix _B_i;
542     eigenVector _D_N;
543     eigenSparseMat _Z_N;
544     eigenSparseMat _Z;
545 
546     // gene-trait association
547     map<string, int> _probe_name_map;
548     vector<string> _probe_name;
549     int _probe_num;
550     eigenMatrix _probe_data;
551     vector<int> _e_include;
552     eigenVector _ecojo_z;
553     eigenVector _ecojo_b;
554     eigenVector _ecojo_se;
555     eigenVector _ecojo_n;
556     eigenVector _ecojo_pval;
557     eigenMatrix _ecojo_R;
558     eigenMatrix _ecojo_RC;
559     eigenMatrix _ecojo_wholeR;
560     double _ecojo_p_cutoff;
561     double _ecojo_collinear;
562 };
563 
564 class locus_bp {
565 public:
566     string locus_name;
567     int chr;
568     int bp;
569 
locus_bp(string locus_name_buf,int chr_buf,int bp_buf)570     locus_bp(string locus_name_buf, int chr_buf, int bp_buf) {
571         locus_name = locus_name_buf;
572         chr = chr_buf;
573         bp = bp_buf;
574     }
575 
operator()576     bool operator()(const locus_bp & other) {
577         return (chr == other.chr && bp <= other.bp);
578     }
579 };
580 
581 #endif
582