1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementations of functions for REML analysis
5  *
6  * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #include "gcta.h"
14 
set_reml_force_inv()15 void gcta::set_reml_force_inv()
16 {
17     _reml_force_inv = true;
18 }
19 
set_reml_force_converge()20 void gcta::set_reml_force_converge()
21 {
22     _reml_force_converge = true;
23 }
24 
set_reml_no_converge()25 void gcta::set_reml_no_converge()
26 {
27     _reml_no_converge = true;
28 }
29 
set_reml_fixed_var()30 void gcta::set_reml_fixed_var()
31 {
32     _reml_fixed_var = true;
33 }
34 
set_reml_mtd(int reml_mtd)35 void gcta::set_reml_mtd(int reml_mtd)
36 {
37     _reml_mtd = reml_mtd;
38 }
39 
read_phen(string phen_file,vector<string> & phen_ID,vector<vector<string>> & phen_buf,int mphen,int mphen2)40 void gcta::read_phen(string phen_file, vector<string> &phen_ID, vector< vector<string> > &phen_buf, int mphen, int mphen2) {
41     // Read phenotype data
42     ifstream in_phen(phen_file.c_str());
43     if (!in_phen) throw ("Error: can not open the file [" + phen_file + "] to read.");
44 
45     int i = 0;
46     vector<string> fid, pid, vs_buf;
47     string str_buf, fid_buf, pid_buf;
48     phen_ID.clear();
49     phen_buf.clear();
50     cout << "Reading phenotypes from [" + phen_file + "]." << endl;
51     getline(in_phen, str_buf);
52     int phen_num = StrFunc::split_string(str_buf, vs_buf) - 2;
53     if (phen_num <= 0) throw ("Error: no phenotype data is found.");
54     if (phen_num > 1) cout << "There are " << phen_num << " traits specified in the file [" + phen_file + "]." << endl;
55     if (mphen > phen_num) {
56         stringstream errmsg;
57         errmsg << "Error: can not find the " << mphen << "th trait in the file [" + phen_file + "].";
58         throw (errmsg.str());
59     }
60     if (_bivar_reml && mphen2 > phen_num) {
61         stringstream errmsg;
62         errmsg << "Error: can not find the " << mphen2 << "th trait in the file [" + phen_file + "].";
63         throw (errmsg.str());
64     }
65     if (_bivar_reml) cout << "Traits " << mphen << " and " << mphen2 << " are included in the bivariate analysis." << endl;
66     else {
67         if (phen_num > 1) cout << "Trait #" << mphen << " is included for analysis." << endl;
68     }
69     in_phen.seekg(ios::beg);
70     mphen--;
71     mphen2--;
72     int line = 1;
73     while (in_phen) {
74         line++;
75         in_phen >> fid_buf;
76         if (in_phen.eof()) break;
77         in_phen >> pid_buf;
78         getline(in_phen, str_buf);
79         if (StrFunc::split_string(str_buf, vs_buf) != phen_num) {
80             stringstream errmsg;
81             errmsg << "Error: " << vs_buf.size() - phen_num << " phenotype values are missing in line #" << line << " in the file [" + phen_file + "]";
82             throw (errmsg.str());
83         }
84         if (_bivar_reml) {
85             if ((vs_buf[mphen] == "-9" || vs_buf[mphen] == "NA") && (vs_buf[mphen2] == "-9" || vs_buf[mphen2] == "NA")) continue;
86         } else {
87             if (vs_buf[mphen] == "-9" || vs_buf[mphen] == "NA") continue;
88         }
89         phen_ID.push_back(fid_buf + ":" + pid_buf);
90         fid.push_back(fid_buf);
91         pid.push_back(pid_buf);
92         phen_buf.push_back(vs_buf);
93     }
94     in_phen.close();
95     cout << "Non-missing phenotypes of " << phen_buf.size() << " individuals are included from [" + phen_file + "]." << endl;
96 
97     if (_id_map.empty()) {
98         _fid = fid;
99         _pid = pid;
100         _indi_num = _fid.size();
101         init_keep();
102     }
103 }
104 
read_covar(string covar_file,vector<string> & covar_ID,vector<vector<string>> & covar,bool qcovar_flag)105 int gcta::read_covar(string covar_file, vector<string> &covar_ID, vector< vector<string> > &covar, bool qcovar_flag) {
106     // Read covariate data
107     ifstream in_covar(covar_file.c_str());
108     if (!in_covar) throw ("Error: can not open the file [" + covar_file + "] to read.");
109 
110     int i = 0, covar_num = 0;
111     string str_buf, id_buf;
112     vector<string> covar_buf, vs_buf;
113     covar_ID.clear();
114     covar.clear();
115     if (qcovar_flag) cout << "Reading quantitative covariates from [" + covar_file + "]." << endl;
116     else cout << "Reading discrete covariate(s) from [" + covar_file + "]." << endl;
117     covar_num = read_fac(in_covar, covar_ID, covar);
118     if (qcovar_flag) cout << covar_num << " quantitative covariate(s) of " << covar_ID.size() << " individuals read from [" + covar_file + "]." << endl;
119     else cout << covar_num << " discrete covariate(s) of " << covar_ID.size() << " individuals are included from [" + covar_file + "]." << endl;
120 
121     return covar_num;
122 }
123 
read_fac(ifstream & ifstrm,vector<string> & ID,vector<vector<string>> & fac)124 int gcta::read_fac(ifstream &ifstrm, vector<string> &ID, vector< vector<string> > &fac) {
125     int i = 0, line = 0, fac_num = 0, prev_fac_num = 0;
126     string str_buf, id_buf;
127     vector<string> vs_buf;
128     while (ifstrm) {
129         ifstrm >> str_buf;
130         if (ifstrm.eof()) break;
131         id_buf = str_buf;
132         ifstrm >> str_buf;
133         id_buf += ":" + str_buf;
134         getline(ifstrm, str_buf);
135         fac_num = StrFunc::split_string(str_buf, vs_buf);
136         if (line > 0 && fac_num != prev_fac_num) throw ("Error: each row should have the same number of columns.\n" + id_buf + "\t" + str_buf);
137         line++;
138         prev_fac_num = fac_num;
139         bool continue_flag = false;
140         for (i = 0; i < fac_num; i++) {
141             if (vs_buf[i] == "-9" || vs_buf[i] == "NA") continue_flag = true;
142         }
143         if (continue_flag) continue;
144         ID.push_back(id_buf);
145         fac.push_back(vs_buf);
146     }
147     ifstrm.close();
148     return fac_num;
149 }
150 
read_GE(string GE_file,vector<string> & GE_ID,vector<vector<string>> & GE,bool qGE_flag)151 int gcta::read_GE(string GE_file, vector<string> &GE_ID, vector< vector<string> > &GE, bool qGE_flag) {
152     // Read phenotype data
153     ifstream in_GE(GE_file.c_str());
154     if (!in_GE) throw ("Error: can not open the file [" + GE_file + "] to read.");
155 
156     string str_buf, id_buf;
157     vector<string> vs_buf;
158     GE_ID.clear();
159     GE.clear();
160     string env = "environmental";
161     if (qGE_flag == true) env = "continuous " + env;
162     else env = "categorical " + env;
163     cout << "Reading " << env << " factor(s) for the analysis of GE interaction from [" + GE_file + "]." << endl;
164     int GE_num = read_fac(in_GE, GE_ID, GE);
165     if (GE_num == 0) throw ("Error: no " + env + " factor is specified. Please check the format of the file: " + GE_file + ".");
166     cout << GE_num << " " << env << " factor(s) for " << GE_ID.size() << " individuals are included from [" + GE_file + "]." << endl;
167 
168     return GE_num;
169 }
170 
fit_reml(string grm_file,string phen_file,string qcovar_file,string covar_file,string qGE_file,string GE_file,string keep_indi_file,string remove_indi_file,string sex_file,int mphen,double grm_cutoff,double adj_grm_fac,int dosage_compen,bool m_grm_flag,bool pred_rand_eff,bool est_fix_eff,int reml_mtd,int MaxIter,vector<double> reml_priors,vector<double> reml_priors_var,vector<int> drop,bool no_lrt,double prevalence,bool no_constrain,bool mlmassoc,bool within_family,bool reml_bending,bool reml_diag_one)171 void gcta::fit_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string qGE_file, string GE_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, bool no_constrain, bool mlmassoc, bool within_family, bool reml_bending, bool reml_diag_one) {
172     _within_family = within_family;
173     _reml_mtd = reml_mtd;
174     _reml_max_iter = MaxIter;
175     _reml_diag_one = reml_diag_one;
176     int i = 0, j = 0, k = 0;
177     bool grm_flag = (!grm_file.empty());
178     bool qcovar_flag = (!qcovar_file.empty());
179     bool covar_flag = (!covar_file.empty());
180     bool GE_flag = (!GE_file.empty());
181     bool qGE_flag = (!qGE_file.empty());
182     if (m_grm_flag) grm_flag = false;
183 
184     // Read data
185     stringstream errmsg;
186     int qcovar_num = 0, covar_num = 0, qE_fac_num = 0, E_fac_num = 0;
187     vector<string> phen_ID, qcovar_ID, covar_ID, qGE_ID, GE_ID, grm_id, grm_files;
188     vector< vector<string> > phen_buf, qcovar, covar, GE, qGE; // save individuals by column
189 
190     if (grm_flag) {
191         read_grm(grm_file, grm_id, true, false, !(adj_grm_fac > -1.0));
192         update_id_map_kp(grm_id, _id_map, _keep);
193         grm_files.push_back(grm_file);
194     }
195     else if (m_grm_flag) {
196         read_grm_filenames(grm_file, grm_files, false);
197         for (i = 0; i < grm_files.size(); i++) {
198             read_grm(grm_files[i], grm_id, false, true, !(adj_grm_fac > -1.0));
199             update_id_map_kp(grm_id, _id_map, _keep);
200         }
201     }
202     read_phen(phen_file, phen_ID, phen_buf, mphen);
203     update_id_map_kp(phen_ID, _id_map, _keep);
204     if (qcovar_flag) {
205         qcovar_num = read_covar(qcovar_file, qcovar_ID, qcovar, true);
206         update_id_map_kp(qcovar_ID, _id_map, _keep);
207     }
208     if (covar_flag) {
209         covar_num = read_covar(covar_file, covar_ID, covar, false);
210         update_id_map_kp(covar_ID, _id_map, _keep);
211     }
212     if (qGE_flag) {
213         qE_fac_num = read_GE(qGE_file, qGE_ID, qGE, true);
214         update_id_map_kp(qGE_ID, _id_map, _keep);
215     }
216     if (GE_flag) {
217         E_fac_num = read_GE(GE_file, GE_ID, GE, false);
218         update_id_map_kp(GE_ID, _id_map, _keep);
219     }
220     if (!mlmassoc) {
221         if (!keep_indi_file.empty()) keep_indi(keep_indi_file);
222         if (!remove_indi_file.empty()) remove_indi(remove_indi_file);
223     }
224     if (grm_flag) {
225         if (grm_cutoff>-1.0) rm_cor_indi(grm_cutoff);
226         if (!sex_file.empty()) update_sex(sex_file);
227         if (adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
228         if (dosage_compen>-1) dc(dosage_compen);
229         _grm_N.resize(0, 0);
230     }
231 
232     vector<string> uni_id;
233     map<string, int> uni_id_map;
234     map<string, int>::iterator iter;
235     for (i = 0; i < _keep.size(); i++) {
236         uni_id.push_back(_fid[_keep[i]] + ":" + _pid[_keep[i]]);
237         uni_id_map.insert(pair<string, int>(_fid[_keep[i]] + ":" + _pid[_keep[i]], i));
238     }
239     _n = _keep.size();
240     if (_n < 1) throw ("Error: no individual is in common in the input files.");
241 
242     // construct model terms
243     _y.setZero(_n);
244     for (i = 0; i < phen_ID.size(); i++) {
245         iter = uni_id_map.find(phen_ID[i]);
246         if (iter == uni_id_map.end()) continue;
247         _y[iter->second] = atof(phen_buf[i][mphen - 1].c_str());
248     }
249 
250     // case-control
251     _ncase = 0.0;
252     _flag_CC = check_case_control(_ncase, _y);
253     cout << endl;
254     if (_flag_CC) {
255         //if(mlmassoc) throw("Error: the option --mlm-assoc is valid for the quantitative trait only.");
256         if (!_bivar_reml) {
257             if (!mlmassoc && prevalence<-1) cout << "Note: you can specify the disease prevalence by the option --prevalence so that GCTA can transform the variance explained to the underlying liability scale." << endl;
258         } else {
259             cout << "Note: you can specify the prevalences of the two diseases by the option --reml-bivar-prevalence so that GCTA can transform the estimates of variance explained to the underlying liability scale." << endl;
260         }
261     }
262 
263     int pos = 0;
264     _r_indx.clear();
265     vector<int> kp;
266     if (grm_flag) {
267         for (i = 0; i < 1 + qE_fac_num + E_fac_num + 1; i++) _r_indx.push_back(i);
268         if (!no_lrt) drop_comp(drop);
269         _A.resize(_r_indx.size());
270         if (mlmassoc) StrFunc::match(uni_id, grm_id, kp);
271         else kp = _keep;
272         (_A[0]) = eigenMatrix::Zero(_n, _n);
273 
274         #pragma omp parallel for private(j)
275         for (i = 0; i < _n; i++) {
276             for (j = 0; j <= i; j++) (_A[0])(j, i) = (_A[0])(i, j) = _grm(kp[i], kp[j]);
277         }
278         if (_reml_diag_one) {
279             double diag_mean = (_A[0]).diagonal().mean();
280             cout << "Mean of diagonal elements of the GRM = " << diag_mean << endl;
281             #pragma omp parallel for private(j)
282             for (i = 0; i < _n; i++) {
283                 for (j = 0; j <= i; j++) {
284                     (_A[0])(i, j) /= (_A[0])(i, i);
285                     (_A[0])(j, i) = (_A[0])(i, j);
286                 }
287             }
288         }
289 
290         pos++;
291         _grm.resize(0, 0);
292     } else if (m_grm_flag) {
293         if (!sex_file.empty()) update_sex(sex_file);
294         for (i = 0; i < (1 + qE_fac_num + E_fac_num) * grm_files.size() + 1; i++) _r_indx.push_back(i);
295         if (!no_lrt) drop_comp(drop);
296         _A.resize(_r_indx.size());
297         string prev_file = grm_files[0];
298         vector<string> prev_grm_id(grm_id);
299         cout << "There are " << grm_files.size() << " GRM file names specified in the file [" + grm_file + "]." << endl;
300         for (i = 0; i < grm_files.size(); i++, pos++) {
301             cout << "Reading the GRM from the " << i + 1 << "th file ..." << endl;
302             read_grm(grm_files[i], grm_id, true, false, !(adj_grm_fac > -1.0));
303             if (adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
304             if (dosage_compen>-1) dc(dosage_compen);
305             StrFunc::match(uni_id, grm_id, kp);
306             (_A[pos]) = eigenMatrix::Zero(_n, _n);
307 
308             #pragma omp parallel for private(j)
309             for (j = 0; j < _n; j++) {
310                 for (k = 0; k <= j; k++) {
311                     if (kp[j] >= kp[k]) (_A[pos])(k, j) = (_A[pos])(j, k) = _grm(kp[j], kp[k]);
312                     else (_A[pos])(k, j) = (_A[pos])(j, k) = _grm(kp[k], kp[j]);
313                 }
314             }
315 
316             if (_reml_diag_one) {
317                 double diag_mean = (_A[pos]).diagonal().mean();
318                 cout << "Mean of diagonal elements of the GRM = " << diag_mean << endl;
319                 #pragma omp parallel for private(j)
320                 for (j = 0; j < _n; j++) {
321                     //(_A[pos])(j,j)=diag_mean;
322                     for (k = 0; k <= j; k++) {
323                         (_A[pos])(j, k) /= (_A[pos])(j, j);
324                         (_A[pos])(k, j) = (_A[pos])(j, k);
325                     }
326                 }
327             }
328 
329             prev_file = grm_files[i];
330             prev_grm_id = grm_id;
331         }
332         _grm_N.resize(0, 0);
333         _grm.resize(0, 0);
334     }
335     else {
336         _r_indx.push_back(0);
337         _A.resize(_r_indx.size());
338     }
339     _A[_r_indx.size() - 1] = eigenMatrix::Identity(_n, _n);
340 
341     // GE interaction
342     vector<eigenMatrix> E_float(E_fac_num);
343     eigenMatrix qE_float, mbuf;
344     if (qGE_flag) {
345         qE_float.resize(_n, qE_fac_num);
346         for (i = 0; i < qGE_ID.size(); i++) {
347             iter = uni_id_map.find(qGE_ID[i]);
348             if (iter == uni_id_map.end()) continue;
349             for (j = 0; j < qE_fac_num; j++) qE_float(iter->second, j) = atof(qGE[i][j].c_str());
350         }
351         for (j = 0; j < qE_fac_num; j++) {
352             mbuf = ((qE_float.block(0, j, _n, 1))*(qE_float.block(0, j, _n, 1)).transpose());
353             for (i = 0; i < grm_files.size(); i++, pos++) (_A[pos]) = (_A[i]).array() * mbuf.array();
354         }
355     }
356     if (GE_flag) {
357         vector< vector<string> > E_str(E_fac_num);
358         for (i = 0; i < E_fac_num; i++) E_str[i].resize(_n);
359         for (i = 0; i < GE_ID.size(); i++) {
360             iter = uni_id_map.find(GE_ID[i]);
361             if (iter != uni_id_map.end()) {
362                 for (j = 0; j < E_fac_num; j++) E_str[j][iter->second] = GE[i][j];
363             }
364         }
365         for (j = 0; j < E_fac_num; j++) {
366             stringstream errmsg;
367             errmsg << "Error: too many classes for the " << j + 1 << "th environmental factor. \nPlease make sure you input a discrete variable as the environmental factor.";
368             string errmsg1 = errmsg.str();
369             errmsg.str("");
370             errmsg << "Error: the " << j + 1 << "th environmental factor has only one class.";
371             string errmsg2 = errmsg.str();
372             coeff_mat(E_str[j], E_float[j], errmsg1, errmsg2);
373             mbuf = ((E_float[j])*(E_float[j]).transpose());
374             for (i = 0; i < grm_files.size(); i++, pos++) (_A[pos]) = (_A[i]).array() * mbuf.array();
375         }
376     }
377 
378     // construct X matrix
379     construct_X(_n, uni_id_map, qcovar_flag, qcovar_num, qcovar_ID, qcovar, covar_flag, covar_num, covar_ID, covar, E_float, qE_float);
380 
381     // names of variance component
382     for (i = 0; i < grm_files.size(); i++) {
383         stringstream strstrm;
384         if (grm_files.size() == 1) strstrm << "";
385         else strstrm << i + 1;
386         _var_name.push_back("V(G" + strstrm.str() + ")");
387         _hsq_name.push_back("V(G" + strstrm.str() + ")/Vp");
388     }
389     for (j = 0; j < qE_fac_num; j++) {
390         for (i = 0; i < grm_files.size(); i++) {
391             stringstream strstrm1, strstrm2;
392             if (grm_files.size() == 1) strstrm1 << "";
393             else strstrm1 << i + 1;
394             if (qE_fac_num == 1) strstrm2 << "";
395             else strstrm2 << j + 1;
396             _var_name.push_back("V(G" + strstrm1.str() + "xqE" + strstrm2.str() + ")");
397             _hsq_name.push_back("V(G" + strstrm1.str() + "xqE" + strstrm2.str() + ")" + "/Vp");
398         }
399     }
400     for (j = 0; j < E_fac_num; j++) {
401         for (i = 0; i < grm_files.size(); i++) {
402             stringstream strstrm1, strstrm2;
403             if (grm_files.size() == 1) strstrm1 << "";
404             else strstrm1 << i + 1;
405             if (E_fac_num == 1) strstrm2 << "";
406             else strstrm2 << j + 1;
407             _var_name.push_back("V(G" + strstrm1.str() + "xE" + strstrm2.str() + ")");
408             _hsq_name.push_back("V(G" + strstrm1.str() + "xE" + strstrm2.str() + ")" + "/Vp");
409         }
410     }
411     _var_name.push_back("V(e)");
412 
413     cout << _n << " individuals are in common in these files." << endl;
414 
415     // within family
416     if (_within_family) detect_family();
417 
418     // bending
419     if (reml_bending) bend_A();
420 
421     // run REML algorithm
422     reml(pred_rand_eff, est_fix_eff, reml_priors, reml_priors_var, prevalence, -2.0, no_constrain, no_lrt, mlmassoc);
423 }
424 
drop_comp(vector<int> & drop)425 void gcta::drop_comp(vector<int> &drop) {
426     int i = 0;
427     stringstream errmsg;
428     _r_indx_drop = _r_indx;
429     stable_sort(drop.begin(), drop.end());
430     drop.erase(unique(drop.begin(), drop.end()), drop.end());
431     for (i = drop.size() - 1; i >= 0; i--) {
432         if (drop[i] < 1 || drop[i] > _r_indx.size() - 1) {
433             errmsg << "Error: there " << (_r_indx.size() > 2 ? "are" : "is") << " only " << _r_indx.size() - 1 << " genetic variance component in the model. You can't drop the " << drop[i] << "-th component.";
434             throw (errmsg.str());
435         }
436         _r_indx_drop.erase(_r_indx_drop.begin() + drop[i] - 1);
437     }
438     if (_r_indx.size() == _r_indx_drop.size()) throw ("Error: no component has been dropped from the model. Please check the --reml-lrt option.");
439 }
440 
construct_X(int n,map<string,int> & uni_id_map,bool qcovar_flag,int qcovar_num,vector<string> & qcovar_ID,vector<vector<string>> & qcovar,bool covar_flag,int covar_num,vector<string> & covar_ID,vector<vector<string>> & covar,vector<eigenMatrix> & E_float,eigenMatrix & qE_float)441 void gcta::construct_X(int n, map<string, int> &uni_id_map, bool qcovar_flag, int qcovar_num, vector<string> &qcovar_ID, vector< vector<string> > &qcovar, bool covar_flag, int covar_num, vector<string> &covar_ID, vector< vector<string> > &covar, vector<eigenMatrix> &E_float, eigenMatrix &qE_float) {
442     int i = 0, j = 0;
443     map<string, int>::iterator iter;
444     stringstream errmsg;
445 
446     _X_c = 1;
447     // quantitative covariates
448     eigenMatrix X_q;
449     if (qcovar_flag) {
450         X_q.resize(n, qcovar_num);
451         for (i = 0; i < qcovar_ID.size(); i++) {
452             iter = uni_id_map.find(qcovar_ID[i]);
453             if (iter == uni_id_map.end()) continue;
454             for (j = 0; j < qcovar_num; j++) X_q(iter->second, j) = atof(qcovar[i][j].c_str());
455         }
456         if (qcovar_num == 0) throw ("Error: no quantitative covariate is found.");
457         cout << qcovar_num << " quantitative variable(s) included as covariate(s)." << endl;
458         _X_c += qcovar_num;
459     }
460     // discrete covariates
461     vector<eigenMatrix> X_d;
462     if (covar_flag) {
463         vector< vector<string> > covar_tmp(covar_num);
464         for (i = 0; i < covar_num; i++) covar_tmp[i].resize(n);
465         for (i = 0; i < covar_ID.size(); i++) {
466             iter = uni_id_map.find(covar_ID[i]);
467             if (iter == uni_id_map.end()) continue;
468             for (j = 0; j < covar_num; j++) covar_tmp[j][iter->second] = covar[i][j];
469         }
470         cout << covar_num << " discrete variable(s) included as covariate(s)." << endl;
471         X_d.resize(covar_num);
472         for (i = 0; i < covar_num; i++) {
473             stringstream errmsg;
474             errmsg << "Error: too many classes for the " << i + 1 << "th discrete variable. \nPlease use the --qcovar if it is a quantitative covariate.";
475             string errmsg1 = errmsg.str();
476             errmsg.str("");
477             errmsg << "Error: the " << i + 1 << "th discrete variable has only one class.";
478             string errmsg2 = errmsg.str();
479             coeff_mat(covar_tmp[i], X_d[i], errmsg1, errmsg2);
480             _X_c += (X_d[i]).cols() - 1;
481         }
482     }
483 
484     // E factor
485     _X_c += qE_float.cols();
486     for (i = 0; i < E_float.size(); i++) _X_c += (E_float[i]).cols() - 1;
487 
488     // Construct _X
489     int col = 0;
490     _X.resize(n, _X_c);
491     _X.block(0, col, n, 1) = eigenMatrix::Ones(n, 1);
492     col++;
493     if (qcovar_flag) {
494         _X.block(0, col, n, X_q.cols()) = X_q;
495         col += X_q.cols();
496     }
497     for (i = 0; i < X_d.size(); i++) {
498         _X.block(0, col, n, (X_d[i]).cols() - 1) = (X_d[i]).block(0, 1, n, (X_d[i]).cols() - 1);
499         col += (X_d[i]).cols() - 1;
500     }
501     if (qE_float.cols() > 0) {
502         _X.block(0, col, n, qE_float.cols()) = qE_float;
503         col += qE_float.cols();
504     }
505     for (i = 0; i < E_float.size(); i++) {
506         _X.block(0, col, n, (E_float[i]).cols() - 1) = (E_float[i]).block(0, 1, n, (E_float[i]).cols() - 1);
507         col += (E_float[i]).cols() - 1;
508     }
509 }
510 
coeff_mat(const vector<string> & vec,eigenMatrix & coeff_mat,string errmsg1,string errmsg2)511 void gcta::coeff_mat(const vector<string> &vec, eigenMatrix &coeff_mat, string errmsg1, string errmsg2) {
512     vector<string> value(vec);
513     stable_sort(value.begin(), value.end());
514     value.erase(unique(value.begin(), value.end()), value.end());
515     if (value.size() > 0.5 * vec.size()) throw (errmsg1); // throw("Error: too many classes for the envronmental factor. \nPlease make sure you input a discrete variable as the environmental factor.");
516     if (value.size() == 1) throw (errmsg2); //throw("Error: the envronmental factor should has more than one classes.");
517 
518     int i = 0, j = 0, row_num = vec.size(), column_num = value.size();
519     map<string, int> val_map;
520     for (i = 0; i < value.size(); i++) val_map.insert(pair<string, int>(value[i], i));
521 
522     coeff_mat.resize(row_num, column_num);
523     coeff_mat.setZero(row_num, column_num);
524     map<string, int>::iterator iter;
525     for (i = 0; i < row_num; i++) {
526         iter = val_map.find(vec[i]);
527         coeff_mat(i, iter->second) = 1.0;
528     }
529 }
530 
check_case_control(double & ncase,eigenVector & y)531 bool gcta::check_case_control(double &ncase, eigenVector &y) {
532     int n = y.size();
533     double case_num = 0.0;
534     vector<double> value(n);
535     for (int i = 0; i < n; i++) value[i] = y(i);
536     stable_sort(value.begin(), value.end());
537     value.erase(unique(value.begin(), value.end()), value.end());
538     if (value.size() == 2) {
539         if (CommFunc::FloatEqual(value[0], 0.0) && CommFunc::FloatEqual(value[1], 1.0)) case_num = y.sum();
540         else if (CommFunc::FloatEqual(value[0], 1.0) && CommFunc::FloatEqual(value[1], 2.0)) case_num = (y.sum() - n);
541         if (!_bivar_reml) cout << "Assuming a disease phenotype for a case-control study: ";
542         cout << (int) case_num << " cases and " << (int) (n - case_num) << " controls ";
543         ncase = case_num / (double) n;
544         return true;
545     } else if (value.size() < 2) throw ("Error: invalid phenotype. Please check the phenotype file.");
546     return false;
547 }
548 
transform_hsq_L(double P,double K,double hsq)549 double gcta::transform_hsq_L(double P, double K, double hsq) {
550     double t = StatFunc::qnorm(1.0 - K);
551     double z = StatFunc::dnorm(t);
552     double C = (K * (1 - K) / (z * z))*(K * (1 - K) / (P * (1 - P)));
553     return (hsq * C);
554 }
555 
constrain_varcmp(eigenVector & varcmp)556 int gcta::constrain_varcmp(eigenVector &varcmp) {
557     int pos = 0;
558     double delta = 0.0, constr_scale = 1e-6;
559     int i = 0, num = 0;
560     vector<int> constrain(_r_indx.size());
561 
562     if (_bivar_reml) {
563         for (i = 0, num = 0; i < _bivar_pos[0].size(); i++) {
564             pos = _bivar_pos[0][i];
565             if (varcmp[pos] < 0) {
566                 delta += _y_Ssq * constr_scale - varcmp[pos];
567                 varcmp[pos] = _y_Ssq * constr_scale;
568                 constrain[i] = 1;
569                 num++;
570             }
571         }
572         delta /= (_bivar_pos[0].size() - num);
573         for (i = 0; i < _bivar_pos[0].size(); i++) {
574             pos = _bivar_pos[0][i];
575             if (constrain[pos] < 1 && varcmp[pos] > delta) varcmp[pos] -= delta;
576         }
577 
578         for (i = 0, num = 0; i < _bivar_pos[1].size(); i++) {
579             pos = _bivar_pos[1][i];
580             if (varcmp[pos] < 0) {
581                 delta += _y_Ssq * constr_scale - varcmp[pos];
582                 varcmp[pos] = _y_Ssq * constr_scale;
583                 constrain[i] = 1;
584                 num++;
585             }
586         }
587         delta /= (_bivar_pos[1].size() - num);
588         for (i = 0; i < _bivar_pos[1].size(); i++) {
589             pos = _bivar_pos[1][i];
590             if (constrain[pos] < 1 && varcmp[pos] > delta) varcmp[pos] -= delta;
591         }
592 
593         for (i = 0, num = 0; i < constrain.size(); i++) {
594             if (constrain[i] == 1) num++;
595 
596         }
597         return num;
598     }
599 
600     for (i = 0; i < _r_indx.size(); i++) {
601         if (varcmp[i] < 0) {
602             delta += _y_Ssq * constr_scale - varcmp[i];
603             varcmp[i] = _y_Ssq * constr_scale;
604             constrain[i] = 1;
605             num++;
606         }
607     }
608     delta /= (_r_indx.size() - num);
609     for (i = 0; i < _r_indx.size(); i++) {
610         if (constrain[i] < 1 && varcmp[i] > delta) varcmp[i] -= delta;
611     }
612 
613     return num;
614 }
615 
reml(bool pred_rand_eff,bool est_fix_eff,vector<double> & reml_priors,vector<double> & reml_priors_var,double prevalence,double prevalence2,bool no_constrain,bool no_lrt,bool mlmassoc)616 void gcta::reml(bool pred_rand_eff, bool est_fix_eff, vector<double> &reml_priors, vector<double> &reml_priors_var, double prevalence, double prevalence2, bool no_constrain, bool no_lrt, bool mlmassoc)
617 {
618     int i = 0, j = 0, k = 0;
619 
620     // Initialize variance component
621     // 0: AI; 1: Fisher-scoring; 2: EM
622     stringstream errmsg;
623     double d_buf = 0.0;
624     eigenVector y_tmp = _y.array() - _y.mean();
625     if (!_bivar_reml) {
626         _y_Ssq = y_tmp.squaredNorm() / (_n - 1.0);
627         if (!(fabs(_y_Ssq) < 1e30)) throw ("Error: the phenotypic variance is infinite. Please check the missing data in your phenotype file. Missing values should be represented by \"NA\" or \"-9\".");
628     }
629     bool reml_priors_flag = !reml_priors.empty(), reml_priors_var_flag = !reml_priors_var.empty();
630     if (reml_priors_flag && reml_priors.size() < _r_indx.size() - 1) {
631         errmsg << "Error: in option --reml-priors. There are " << _r_indx.size() << " variance components. At least " << _r_indx.size() - 1 << " prior values should be specified.";
632         throw (errmsg.str());
633     }
634     if (reml_priors_var_flag && reml_priors_var.size() < _r_indx.size() - 1) {
635         errmsg << "Error: in option --reml-priors-var. There are " << _r_indx.size() << " variance components. At least " << _r_indx.size() - 1 << " prior values should be specified.";
636         throw (errmsg.str());
637     }
638 
639     cout << "\nPerforming " << (_bivar_reml ? "bivariate" : "") << " REML analysis ... (Note: may take hours depending on sample size)." << endl;
640     if (_n < 10) throw ("Error: sample size is too small.");
641     cout << _n << " observations, " << _X_c << " fixed effect(s), and " << _r_indx.size() << " variance component(s)(including residual variance)." << endl;
642     eigenMatrix Vi_X(_n, _X_c), Xt_Vi_X_i(_X_c, _X_c), Hi(_r_indx.size(), _r_indx.size());
643     eigenVector Py(_n), varcmp;
644     init_varcomp(reml_priors_var, reml_priors, varcmp);
645     double lgL = reml_iteration(Vi_X, Xt_Vi_X_i, Hi, Py, varcmp, reml_priors_var_flag | reml_priors_flag, no_constrain);
646     eigenMatrix u;
647     if (pred_rand_eff) {
648         u.resize(_n, _r_indx.size());
649         for (i = 0; i < _r_indx.size(); i++) {
650             if (_bivar_reml || _within_family)(u.col(i)) = (((_Asp[_r_indx[i]]) * Py) * varcmp[i]);
651             else (u.col(i)) = (((_A[_r_indx[i]]) * Py) * varcmp[i]);
652         }
653     }
654     if (est_fix_eff) _b = Xt_Vi_X_i * (Vi_X.transpose() * _y);
655 
656     // calculate Hsq and SE
657     double Vp = 0.0, Vp2 = 0.0, VarVp = 0.0, VarVp2 = 0.0, Vp_f = 0.0, VarVp_f = 0.0;
658     vector<double> Hsq(_r_indx.size() - 1), VarHsq(_r_indx.size() - 1);
659     calcu_Vp(Vp, Vp2, VarVp, VarVp2, varcmp, Hi);
660     for (i = 0; i < Hsq.size(); i++) calcu_hsq(i, Vp, Vp2, VarVp, VarVp2, Hsq[i], VarHsq[i], varcmp, Hi);
661 
662     // calculate the logL for a reduce model
663     double lgL_rdu_mdl = 0.0, LRT = 0.0;
664     if (!no_lrt) {
665         lgL_rdu_mdl = lgL_reduce_mdl(no_constrain);
666         LRT = 2.0 * (lgL - lgL_rdu_mdl);
667         if (LRT < 0.0) LRT = 0.0;
668     }
669 
670     // calcuate the logL given a rG in a bivariate analysis
671     double lgL_fixed_rg = 0.0;
672     if (_bivar_reml && !_fixed_rg_val.empty()) {
673         lgL_fixed_rg = lgL_fix_rg(varcmp, no_constrain);
674         LRT = 2.0 * (lgL - lgL_fixed_rg);
675         if (LRT < 0.0) LRT = 0.0;
676     }
677 
678     if (mlmassoc) {
679         eigenVector2Vector(varcmp, _varcmp);
680         return;
681     }
682 
683     // output results
684     double sum_hsq = 0.0, var_sum_hsq = 0.0;
685     if (!_bivar_reml && _r_indx.size() > 2) calcu_sum_hsq(Vp, VarVp, sum_hsq, var_sum_hsq, varcmp, Hi);
686     cout << "\nSummary result of REML analysis:" << endl;
687     cout << "Source\tVariance\tSE" << setiosflags(ios::fixed) << setprecision(6) << endl;
688     for (i = 0; i < _r_indx.size(); i++) cout << _var_name[i] << "\t" << varcmp[i] << "\t" << sqrt(Hi(i, i)) << endl;
689     if (_bivar_reml) {
690         cout << "Vp_tr1\t" << Vp << "\t" << sqrt(VarVp) << endl;
691         cout << "Vp_tr2\t" << Vp2 << "\t" << sqrt(VarVp2) << endl;
692         for (i = 0, j = 0; i < _bivar_pos[0].size() - 1; i++, j += 2) {
693             cout << _hsq_name[j] << "\t" << Hsq[_bivar_pos[0][i]] << "\t" << sqrt(VarHsq[_bivar_pos[0][i]]) << endl;
694             cout << _hsq_name[j + 1] << "\t" << Hsq[_bivar_pos[1][i]] << "\t" << sqrt(VarHsq[_bivar_pos[1][i]]) << endl;
695         }
696     } else {
697         cout << "Vp\t" << Vp << "\t" << sqrt(VarVp) << endl;
698         for (i = 0; i < Hsq.size(); i++) cout << _hsq_name[i] << "\t" << Hsq[i] << "\t" << sqrt(VarHsq[i]) << endl;
699         if (_r_indx.size() > 2) cout << "\nSum of V(G)/Vp\t" << sum_hsq << "\t" << sqrt(var_sum_hsq) << endl;
700     }
701     if ((_flag_CC && prevalence>-1) || (_flag_CC2 && prevalence2>-1)) {
702         cout << "The estimate of variance explained on the observed scale is transformed to that on the underlying scale:" << endl;
703         if (_bivar_reml) {
704             if (_flag_CC) cout << "Proportion of cases in the sample = " << _ncase << " for trait #1; User-specified disease prevalence = " << prevalence << " for trait #1" << endl;
705             if (_flag_CC2) cout << "Proportion of cases in the sample = " << _ncase2 << " for trait #2; User-specified disease prevalence = " << prevalence2 << " for trait #2" << endl;
706             for (i = 0, j = 0; i < _bivar_pos[0].size() - 1; i++, j += 2) {
707                 if (_flag_CC) cout << _hsq_name[j] << "_L\t" << transform_hsq_L(_ncase, prevalence, Hsq[_bivar_pos[0][i]]) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(VarHsq[_bivar_pos[0][i]])) << endl;
708                 if (_flag_CC2) cout << _hsq_name[j + 1] << "_L\t" << transform_hsq_L(_ncase2, prevalence2, Hsq[_bivar_pos[1][i]]) << "\t" << transform_hsq_L(_ncase2, prevalence2, sqrt(VarHsq[_bivar_pos[1][i]])) << endl;
709             }
710         } else {
711             cout << "(Proportion of cases in the sample = " << _ncase << "; User-specified disease prevalence = " << prevalence << ")" << endl;
712             for (i = 0; i < Hsq.size(); i++) cout << _hsq_name[i] << "_L\t" << transform_hsq_L(_ncase, prevalence, Hsq[i]) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(VarHsq[i])) << endl;
713             if (_r_indx.size() > 2)  cout << "\nSum of V(G)_L/Vp\t" << transform_hsq_L(_ncase, prevalence, sum_hsq) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(var_sum_hsq)) << endl;
714         }
715     }
716     // output genetic correlation
717     eigenVector rg, rg_var;
718     vector<string> rg_name;
719     if (_bivar_reml) {
720         calcu_rg(varcmp, Hi, rg, rg_var, rg_name);
721         for (i = 0; i < rg_name.size(); i++) {
722             cout << rg_name[i] << "\t" << rg[i] << "\t" << sqrt(rg_var[i]) << endl;
723         }
724     }
725     if(!_reml_force_converge || !_reml_AI_not_invertible){
726         cout << "\nSampling variance/covariance of the estimates of variance components:" << endl;
727         for (i = 0; i < _r_indx.size(); i++) {
728             for (j = 0; j < _r_indx.size(); j++) cout << setiosflags(ios::scientific) << Hi(i, j) << "\t";
729             cout << endl;
730         }
731     }
732     if (est_fix_eff) {
733         cout << "Estimate" << (_X_c > 1 ? "s" : "") << "of fixed effect" << (_X_c > 1 ? "s" : "") << ":" << endl;
734         cout << "\nSource\tEstimate\tSE" << endl;
735         for (i = 0; i < _X_c; i++) {
736             if (i == 0) cout << "mean\t";
737             else cout << "X_" << i + 1 << "\t";
738             cout << setiosflags(ios::fixed) << _b[i] << "\t" << sqrt(Xt_Vi_X_i(i, i)) << endl;
739         }
740     }
741 
742     // save summary result into a file
743     string reml_rst_file = _out + ".hsq";
744     ofstream o_reml(reml_rst_file.c_str());
745     if (!o_reml) throw ("Error: can not open the file [" + reml_rst_file + "] to write.");
746     o_reml << "Source\tVariance\tSE" << setiosflags(ios::fixed) << setprecision(6) << endl;
747     for (i = 0; i < _r_indx.size(); i++) o_reml << _var_name[i] << "\t" << varcmp[i] << "\t" << sqrt(Hi(i, i)) << endl;
748     if (_bivar_reml) {
749         o_reml << "Vp_tr1\t" << Vp << "\t" << sqrt(VarVp) << endl;
750         o_reml << "Vp_tr2\t" << Vp2 << "\t" << sqrt(VarVp2) << endl;
751         for (i = 0, j = 0; i < _bivar_pos[0].size() - 1; i++, j += 2) {
752             o_reml << _hsq_name[j] << "\t" << Hsq[_bivar_pos[0][i]] << "\t" << sqrt(VarHsq[_bivar_pos[0][i]]) << endl;
753             o_reml << _hsq_name[j + 1] << "\t" << Hsq[_bivar_pos[1][i]] << "\t" << sqrt(VarHsq[_bivar_pos[1][i]]) << endl;
754         }
755     } else {
756         o_reml << "Vp\t" << Vp << "\t" << sqrt(VarVp) << endl;
757         for (i = 0; i < Hsq.size(); i++) o_reml << _hsq_name[i] << "\t" << Hsq[i] << "\t" << sqrt(VarHsq[i]) << endl;
758         if (_r_indx.size() > 2) o_reml << "\nSum of V(G)/Vp\t" << sum_hsq << "\t" << sqrt(var_sum_hsq) << endl;
759     }
760     if (_flag_CC && prevalence>-1) {
761         o_reml << "The estimate of variance explained on the observed scale is transformed to that on the underlying scale:" << endl;
762         if (_bivar_reml) {
763             o_reml << "(Proportion of cases in the sample = " << _ncase << "; User-specified disease prevalence = " << prevalence << " for disease 1 and = " << prevalence2 << " for disease 2)" << endl;
764             for (i = 0, j = 0; i < _bivar_pos[0].size() - 1; i++, j += 2) {
765                 o_reml << _hsq_name[j] << "_L\t" << transform_hsq_L(_ncase, prevalence, Hsq[_bivar_pos[0][i]]) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(VarHsq[_bivar_pos[0][i]])) << endl;
766                 o_reml << _hsq_name[j + 1] << "_L\t" << transform_hsq_L(_ncase2, prevalence2, Hsq[_bivar_pos[1][i]]) << "\t" << transform_hsq_L(_ncase2, prevalence2, sqrt(VarHsq[_bivar_pos[1][i]])) << endl;
767             }
768         } else {
769             o_reml << "(Proportion of cases in the sample = " << _ncase << "; User-specified disease prevalence = " << prevalence << ")" << endl;
770             for (i = 0; i < Hsq.size(); i++) o_reml << _hsq_name[i] << "_L\t" << transform_hsq_L(_ncase, prevalence, Hsq[i]) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(VarHsq[i])) << endl;
771             if (_r_indx.size() > 2)  o_reml << "\nSum of V(G)_L/Vp\t" << transform_hsq_L(_ncase, prevalence, sum_hsq) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(var_sum_hsq)) << endl;
772         }
773     }
774     if (_bivar_reml) {
775         for (i = 0; i < rg_name.size(); i++) o_reml << rg_name[i] << "\t" << rg[i] << "\t" << sqrt(rg_var[i]) << endl;
776     }
777     o_reml << "logL\t" << setprecision(3) << lgL << endl;
778     if (!no_lrt && _r_indx.size() - 1 > 0) {
779         o_reml << "logL0\t" << setprecision(3) << lgL_rdu_mdl << endl;
780         o_reml << "LRT\t" << setprecision(3) << LRT << endl;
781         o_reml << "df\t" << setprecision(1) << _r_indx.size() - _r_indx_drop.size() << endl;
782         o_reml << "Pval\t" << setprecision(4) << setiosflags(ios::scientific) << 0.5 * StatFunc::chi_prob(_r_indx.size() - _r_indx_drop.size(), LRT) << setiosflags(ios::fixed) << endl;
783     }
784     if (_bivar_reml && !_fixed_rg_val.empty()) {
785         o_reml << "logL0\t" << setprecision(3) << lgL_fixed_rg << " (when rG fixed at ";
786         for (i = 0; i < _fixed_rg_val.size() - 1; i++) o_reml << _fixed_rg_val[i] << "\t";
787         o_reml << _fixed_rg_val[_fixed_rg_val.size() - 1] << ")" << endl;
788         o_reml << "LRT\t" << setprecision(3) << LRT << endl;
789         o_reml << "df\t" << setprecision(1) << _fixed_rg_val.size() << endl;
790         o_reml << "Pval\t" << setprecision(4) << setiosflags(ios::scientific) << 0.5 * StatFunc::chi_prob(_fixed_rg_val.size(), LRT) << setiosflags(ios::fixed) << " (one-tailed test)" << endl;
791     }
792     o_reml << "n\t" << _n << endl;
793     if (est_fix_eff) {
794         o_reml << "\nFix_eff\tSE" << endl;
795         for (i = 0; i < _X_c; i++) o_reml << setprecision(6) << _b[i] << "\t" << sqrt(Xt_Vi_X_i(i, i)) << endl;
796         o_reml.close();
797     }
798     cout << "\nSummary result of REML analysis has been saved in the file [" + reml_rst_file + "]." << endl;
799 
800     // save random effect to a file
801     if (pred_rand_eff) {
802         string rand_eff_file = _out + ".indi.blp";
803         ofstream o_rand_eff(rand_eff_file.c_str());
804         for (i = 0; i < _keep.size(); i++) {
805             o_rand_eff << _fid[_keep[i]] << "\t" << _pid[_keep[i]] << "\t";
806             for (j = 0; j < _r_indx.size(); j++) o_rand_eff << setprecision(6) << Py[i] * varcmp[j] << "\t" << u(i, j) << "\t";
807             o_rand_eff << endl;
808         }
809         cout << "\nBLUP of the genetic effects for " << _keep.size() << " individuals has been saved in the file [" + rand_eff_file + "]." << endl;
810     }
811 }
812 
init_varcomp(vector<double> & reml_priors_var,vector<double> & reml_priors,eigenVector & varcmp)813 void gcta::init_varcomp(vector<double> &reml_priors_var, vector<double> &reml_priors, eigenVector &varcmp) {
814     int i = 0, pos = 0;
815     double d_buf = 0.0;
816 
817     varcmp = eigenVector::Zero(_r_indx.size());
818     if (_bivar_reml) {
819         if (!reml_priors_var.empty()) {
820             for (i = 0; i < _r_indx.size(); i++) varcmp[i] = reml_priors_var[i];
821         }
822         else if (!reml_priors.empty()) {
823             for (i = 0, d_buf = 0; i < _bivar_pos[0].size() - 1; i++) {
824                 pos = _bivar_pos[0][i];
825                 varcmp[pos] = reml_priors[pos] * _y_Ssq;
826                 d_buf += reml_priors[pos];
827             }
828             if (d_buf > 1.0) throw ("\nError: --reml-priors. The sum of all prior values for trait 1 should not exceed 1.0.");
829             varcmp[_bivar_pos[0][_bivar_pos[0].size() - 1]] = (1.0 - d_buf) * _y_Ssq;
830             for (i = 0, d_buf = 0; i < _bivar_pos[1].size() - 1; i++) {
831                 pos = _bivar_pos[1][i];
832                 varcmp[pos] = reml_priors[pos] * _y_Ssq;
833                 d_buf += reml_priors[pos];
834             }
835             if (d_buf > 1.0) throw ("\nError: --reml-priors. The sum of all prior values for trait 2 should not exceed 1.0.");
836             varcmp[_bivar_pos[1][_bivar_pos[1].size() - 1]] = (1.0 - d_buf) * _y2_Ssq;
837             for (i = 0; i < _bivar_pos[2].size(); i++) varcmp[_bivar_pos[2][i]] = reml_priors[_bivar_pos[2][i]] * sqrt(_y_Ssq * _y2_Ssq);
838         }
839         else {
840             for (i = 0; i < _bivar_pos[0].size(); i++) varcmp[_bivar_pos[0][i]] = _y_Ssq / _bivar_pos[0].size();
841             for (i = 0; i < _bivar_pos[1].size(); i++) varcmp[_bivar_pos[1][i]] = _y2_Ssq / _bivar_pos[1].size();
842             for (i = 0; i < _bivar_pos[2].size(); i++) varcmp[_bivar_pos[2][i]] = 0.5 * sqrt(varcmp[_bivar_pos[0][i]] * varcmp[_bivar_pos[1][i]]);
843         }
844 
845         return;
846     }
847 
848     if (!reml_priors_var.empty()) {
849         for (i = 0; i < _r_indx.size() - 1; i++) varcmp[i] = reml_priors_var[i];
850         if (reml_priors_var.size() < _r_indx.size()) varcmp[_r_indx.size() - 1] = _y_Ssq - varcmp.sum();
851         else varcmp[_r_indx.size() - 1] = reml_priors_var[_r_indx.size() - 1];
852     }
853     else if (!reml_priors.empty()) {
854         for (i = 0, d_buf = 0; i < _r_indx.size() - 1; i++) {
855             varcmp[i] = reml_priors[i] * _y_Ssq;
856             d_buf += reml_priors[i];
857         }
858         if (d_buf > 1.0) throw ("\nError: --reml-priors. The sum of all prior values should not exceed 1.0.");
859         varcmp[_r_indx.size() - 1] = (1.0 - d_buf) * _y_Ssq;
860     }
861     else varcmp.setConstant(_y_Ssq / (_r_indx.size()));
862 }
863 
lgL_reduce_mdl(bool no_constrain)864 double gcta::lgL_reduce_mdl(bool no_constrain) {
865     if (_r_indx.size() - 1 == 0) return 0;
866     bool multi_comp = (_r_indx.size() - _r_indx_drop.size() > 1);
867     cout << "\nCalculating the logLikelihood for the reduced model ...\n(variance component" << (multi_comp ? "s " : " ");
868     for (int i = 0; i < _r_indx.size() - 1; i++) {
869         if (find(_r_indx_drop.begin(), _r_indx_drop.end(), _r_indx[i]) == _r_indx_drop.end()) cout << _r_indx[i] + 1 << " ";
870     }
871     cout << (multi_comp ? "are" : "is") << " dropped from the model)" << endl;
872     vector<int> vi_buf(_r_indx);
873     _r_indx = _r_indx_drop;
874     eigenMatrix Vi_X(_n, _X_c), Xt_Vi_X_i(_X_c, _X_c), Hi(_r_indx.size(), _r_indx.size());
875     eigenVector Py(_n);
876     eigenVector varcmp;
877     vector<double> reml_priors_var, reml_priors;
878     init_varcomp(reml_priors_var, reml_priors, varcmp);
879     double lgL = reml_iteration(Vi_X, Xt_Vi_X_i, Hi, Py, varcmp, false, no_constrain);
880     _r_indx = vi_buf;
881     return lgL;
882 }
883 
reml_iteration(eigenMatrix & Vi_X,eigenMatrix & Xt_Vi_X_i,eigenMatrix & Hi,eigenVector & Py,eigenVector & varcmp,bool prior_var_flag,bool no_constrain,bool reml_bivar_fix_rg)884 double gcta::reml_iteration(eigenMatrix &Vi_X, eigenMatrix &Xt_Vi_X_i, eigenMatrix &Hi, eigenVector &Py, eigenVector &varcmp, bool prior_var_flag, bool no_constrain, bool reml_bivar_fix_rg)
885 {
886     /*if(reml_bivar_fix_rg){
887         if(no_constrain){
888             no_constrain=false;
889             cout<<"Warning: --reml-no-constrain disabled. The genetic correlation is fixed so that all the variance components are constrained to be positive."<<endl;
890         }
891     }*/
892 
893     char *mtd_str[3] = {"AI-REML algorithm", "Fisher-scoring ...", "EM-REML algorithm ..."};
894     int i = 0, constrain_num = 0, iter = 0, reml_mtd_tmp = _reml_mtd;
895     double logdet = 0.0, logdet_Xt_Vi_X = 0.0, prev_lgL = -1e20, lgL = -1e20, dlogL = 1000.0;
896     eigenVector prev_prev_varcmp(varcmp), prev_varcmp(varcmp), varcomp_init(varcmp);
897     bool converged_flag = false;
898     for (iter = 0; iter < _reml_max_iter; iter++) {
899         if (reml_bivar_fix_rg) update_A(prev_varcmp);
900         if (iter == 0) {
901             prev_varcmp = varcomp_init;
902             if (prior_var_flag){
903                 if(_reml_fixed_var) cout << "Variance components are fixed at: " << varcmp.transpose() << endl;
904                 else cout << "Prior values of variance components: " << varcmp.transpose() << endl;
905             }
906             else {
907                 _reml_mtd = 2;
908                 cout << "Calculating prior values of variance components by EM-REML ..." << endl;
909             }
910         }
911         if (iter == 1) {
912             _reml_mtd = reml_mtd_tmp;
913             cout << "Running " << mtd_str[_reml_mtd] << " ..." << "\nIter.\tlogL\t";
914             for (i = 0; i < _r_indx.size(); i++) cout << _var_name[_r_indx[i]] << "\t";
915             cout << endl;
916         }
917         if (_bivar_reml) calcu_Vi_bivar(_Vi, prev_varcmp, logdet, iter); // Calculate Vi, bivariate analysis
918         else if (_within_family) calcu_Vi_within_family(_Vi, prev_varcmp, logdet, iter); // within-family REML
919         else {
920             if (!calcu_Vi(_Vi, prev_varcmp, logdet, iter)){ // Calculate Vi
921                 cout<<"Warning: V matrix is not positive-definite.\n";
922                 varcmp = prev_prev_varcmp;
923                 if(!calcu_Vi(_Vi, varcmp, logdet, iter)) throw("Error: V matrix is not positive-definite.");
924                 calcu_Hi(_P, Hi);
925                 Hi = 2 * Hi;
926                 break;
927             }
928         }
929         logdet_Xt_Vi_X = calcu_P(_Vi, Vi_X, Xt_Vi_X_i, _P); // Calculate P
930         if (_reml_mtd == 0) ai_reml(_P, Hi, Py, prev_varcmp, varcmp, dlogL);
931         else if (_reml_mtd == 1) reml_equation(_P, Hi, Py, varcmp);
932         else if (_reml_mtd == 2) em_reml(_P, Py, prev_varcmp, varcmp);
933         lgL = -0.5 * (logdet_Xt_Vi_X + logdet + (_y.transpose() * Py)(0, 0));
934 
935         if(_reml_force_converge && _reml_AI_not_invertible) break;
936             /*{
937             if(_reml_mtd != 1){
938                 cout<<"Warning: the information matrix is not invertible. Trying to fix the problem using the Fisher-scoring approach."<<endl;
939                 _reml_mtd = 1;
940                 _reml_AI_not_invertible = false;
941                 iter--;
942                 continue;
943             }
944             else {
945                 cout<<"Warning: the information matrix is not invertible using the Fisher-scoring approach."<<endl;
946                 break;
947             }
948         }*/
949 
950         // output log
951         if (!no_constrain) constrain_num = constrain_varcmp(varcmp);
952         if (_bivar_reml && !_bivar_no_constrain) constrain_rg(varcmp);
953         if (iter > 0) {
954             cout << iter << "\t" << setiosflags(ios::fixed) << setprecision(2) << lgL << "\t";
955             for (i = 0; i < _r_indx.size(); i++) cout << setprecision(5) << varcmp[i] << "\t";
956             if (constrain_num > 0) cout << "(" << constrain_num << " component(s) constrained)" << endl;
957             else cout << endl;
958         } else {
959             if (!prior_var_flag) cout << "Updated prior values: " << varcmp.transpose() << endl;
960             cout << "logL: " << lgL << endl;
961             //if(_reml_max_iter==1) cout<<"logL: "<<lgL<<endl;
962         }
963         if(_reml_fixed_var){
964             varcmp = prev_varcmp;
965             break;
966         }
967         if (constrain_num * 2 > _r_indx.size()) throw ("Error: analysis stopped because more than half of the variance components are constrained. The result would be unreliable.\n Please have a try to add the option --reml-no-constrain.");
968         // added by Jian Yang on 22 Oct 2014
969         //if (constrain_num == _r_indx.size()) throw ("Error: analysis stopped because all variance components are constrained. You may have a try of adding the option --reml-no-constrain.");
970 
971         if((_reml_force_converge || _reml_no_converge) && prev_lgL > lgL){
972             varcmp = prev_varcmp;
973             calcu_Hi(_P, Hi);
974             Hi = 2 * Hi;
975             break;
976         }
977 
978         // convergence
979         dlogL = lgL - prev_lgL;
980         if ((varcmp - prev_varcmp).squaredNorm() / varcmp.squaredNorm() < 1e-8 && (fabs(dlogL) < 1e-4 || (fabs(dlogL) < 1e-2 && dlogL < 0))) {
981             converged_flag = true;
982             if (_reml_mtd == 2) {
983                 calcu_Hi(_P, Hi);
984                 Hi = 2 * Hi;
985             } // for calculation of SE
986             break;
987         }
988         prev_prev_varcmp = prev_varcmp;
989         prev_varcmp = varcmp;
990         prev_lgL = lgL;
991     }
992 
993     if(_reml_fixed_var) cout << "Warning: the model is evaluated at fixed variance components. The likelihood might not be maximised." <<endl;
994     else {
995         if(converged_flag) cout << "Log-likelihood ratio converged." << endl;
996         else {
997             if(_reml_force_converge || _reml_no_converge) cout << "Warning: Log-likelihood not converged. Results are not reliable." <<endl;
998             else if(iter == _reml_max_iter){
999                 stringstream errmsg;
1000                 errmsg << "Error: Log-likelihood not converged (stop after " << _reml_max_iter << " iteractions). \nYou can specify the option --reml-maxit to allow for more iterations." << endl;
1001                 if (_reml_max_iter > 1) throw (errmsg.str());
1002             }
1003         }
1004     }
1005     return lgL;
1006 }
1007 
calcu_Vp(double & Vp,double & Vp2,double & VarVp,double & VarVp2,eigenVector & varcmp,eigenMatrix & Hi)1008 void gcta::calcu_Vp(double &Vp, double &Vp2, double &VarVp, double &VarVp2, eigenVector &varcmp, eigenMatrix &Hi) {
1009     int i = 0, j = 0;
1010     Vp = 0.0;
1011     VarVp = 0.0;
1012     Vp2 = 0.0;
1013     VarVp2 = 0.0;
1014     if (_bivar_reml) {
1015         for (i = 0; i < _bivar_pos[0].size(); i++) {
1016             Vp += varcmp[_bivar_pos[0][i]];
1017             for (j = 0; j < _bivar_pos[0].size(); j++) VarVp += Hi(_bivar_pos[0][i], _bivar_pos[0][j]);
1018         }
1019         for (i = 0; i < _bivar_pos[1].size(); i++) {
1020             Vp2 += varcmp[_bivar_pos[1][i]];
1021             for (j = 0; j < _bivar_pos[1].size(); j++) VarVp2 += Hi(_bivar_pos[1][i], _bivar_pos[1][j]);
1022         }
1023         return;
1024     }
1025     for (i = 0; i < _r_indx.size(); i++) {
1026         Vp += varcmp[i];
1027         for (j = 0; j < _r_indx.size(); j++) VarVp += Hi(i, j);
1028     }
1029 }
1030 
calcu_hsq(int i,double Vp,double Vp2,double VarVp,double VarVp2,double & hsq,double & var_hsq,eigenVector & varcmp,eigenMatrix & Hi)1031 void gcta::calcu_hsq(int i, double Vp, double Vp2, double VarVp, double VarVp2, double &hsq, double &var_hsq, eigenVector &varcmp, eigenMatrix &Hi) {
1032     int j = 0;
1033     double V1 = varcmp[i], VarV1 = Hi(i, i), Cov12 = 0.0;
1034 
1035     if (_bivar_reml) {
1036         vector<int>::iterator iter;
1037         iter = find(_bivar_pos[0].begin(), _bivar_pos[0].end(), i);
1038         if (iter != _bivar_pos[0].end()) {
1039             for (j = 0; j < _bivar_pos[0].size(); j++) {
1040                 Cov12 += Hi(*iter, _bivar_pos[0][j]);
1041             }
1042             hsq = V1 / Vp;
1043             var_hsq = (V1 / Vp)*(V1 / Vp)*(VarV1 / (V1 * V1) + VarVp / (Vp * Vp)-(2 * Cov12) / (V1 * Vp));
1044             return;
1045         }
1046         iter = find(_bivar_pos[1].begin(), _bivar_pos[1].end(), i);
1047         if (iter != _bivar_pos[1].end()) {
1048             for (j = 0; j < _bivar_pos[1].size(); j++) {
1049                 Cov12 += Hi(*iter, _bivar_pos[1][j]);
1050             }
1051             hsq = V1 / Vp2;
1052             var_hsq = (V1 / Vp2)*(V1 / Vp2)*(VarV1 / (V1 * V1) + VarVp2 / (Vp2 * Vp2)-(2 * Cov12) / (V1 * Vp2));
1053             return;
1054         }
1055         hsq = var_hsq = -2;
1056         return;
1057     }
1058 
1059     for (j = 0; j < _r_indx.size(); j++) {
1060         Cov12 += Hi(i, j);
1061     }
1062     hsq = V1 / Vp;
1063     var_hsq = (V1 / Vp)*(V1 / Vp)*(VarV1 / (V1 * V1) + VarVp / (Vp * Vp)-(2 * Cov12) / (V1 * Vp));
1064 }
1065 
calcu_sum_hsq(double Vp,double VarVp,double & sum_hsq,double & var_sum_hsq,eigenVector & varcmp,eigenMatrix & Hi)1066 void gcta::calcu_sum_hsq(double Vp, double VarVp, double &sum_hsq, double &var_sum_hsq, eigenVector &varcmp, eigenMatrix &Hi) {
1067     int i = 0, j = 0;
1068     double V1 = 0.0, VarV1 = 0.0, Cov12 = 0.0;
1069     for(i = 0; i < _r_indx.size()-1; i++) {
1070         V1 += varcmp[i];
1071         for(j = 0; j < _r_indx.size()-1; j++) VarV1 += Hi(i, j);
1072         for(j = 0; j < _r_indx.size(); j++) Cov12 += Hi(i, j);
1073     }
1074     sum_hsq = V1/Vp;
1075     var_sum_hsq = (V1/Vp)*(V1/Vp)*(VarV1/(V1*V1)+VarVp/(Vp*Vp)-(2*Cov12)/(V1*Vp));
1076 }
1077 
calcu_Vi(eigenMatrix & Vi,eigenVector & prev_varcmp,double & logdet,int & iter)1078 bool gcta::calcu_Vi(eigenMatrix &Vi, eigenVector &prev_varcmp, double &logdet, int &iter)
1079 {
1080     int i = 0, j = 0, k = 0;
1081     string errmsg = "\nError: the V (variance-covariance) matrix is not invertible.";
1082 
1083     Vi = eigenMatrix::Zero(_n, _n);
1084     if (_r_indx.size() == 1) {
1085         Vi.diagonal() = eigenVector::Constant(_n, 1.0 / prev_varcmp[0]);
1086         logdet = _n * log(prev_varcmp[0]);
1087     }
1088     else {
1089         for (i = 0; i < _r_indx.size(); i++) Vi += (_A[_r_indx[i]]) * prev_varcmp[i];
1090 
1091         if (_V_inv_mtd == 0) {
1092             if (!comput_inverse_logdet_LDLT_mkl(Vi, logdet)) {
1093                 if(_reml_force_inv) {
1094                     cout<<"Warning: the variance-covaraince matrix V is non-positive definite." << endl;
1095                     _V_inv_mtd = 1;
1096                     cout << "\nSwitching to the \"bending\" approach to invert V. This method hasn't been tested. The results might not be reliable!" << endl;
1097                 }
1098                 /*else {
1099                     if(_reml_no_converge){
1100                         cout<<"Warning: the variance-covaraince matrix is invertible. A small positive value is added to the diagonals. The results might not be reliable!"<<endl;
1101                         double d_buf = Vi.diagonal().mean() * 0.01;
1102                         for(j = 0; j < _n ; j++) Vi(j,j) += d_buf;
1103                         if(!comput_inverse_logdet_LDLT_mkl(Vi, logdet)) return false;
1104                     }
1105                     else throw("Error: the variance-covaraince matrix V is not positive definite.");
1106                 }*/
1107             }
1108         }
1109         if (_V_inv_mtd == 1) bend_V(Vi);
1110        /*if (_V_inv_mtd == 2) {
1111             if(!_reml_force_converge){
1112                 cout << "Switching from Cholesky to LU decomposition approach. The results might not be reliable!" << endl;
1113                 if (!comput_inverse_logdet_LU_mkl(Vi, logdet)){
1114                     if(_reml_no_converge){
1115                         cout<<"Warning: the variance-covaraince matrix is invertible. A small positive value is added to the diagonals. The results might not be reliable!"<<endl;
1116                         double d_buf = Vi.diagonal().mean() * 0.01;
1117                         for(j = 0; j < _n ; j++) Vi(j,j) += d_buf;
1118                         if(!comput_inverse_logdet_LDLT_mkl(Vi, logdet)) return false;
1119                     }
1120                     else throw ("Error: the variance-covaraince matrix V is still not invertible using LU decomposition.");
1121                 }
1122             }
1123             else{
1124                 cout<<"Warning: the variance-covaraince matrix is invertible. A small positive value is added to the diagonals. The results might not be reliable!"<<endl;
1125                 double d_buf = Vi.diagonal().mean() * 0.01;
1126                 for(j = 0; j < _n ; j++) Vi(j,j) += d_buf;
1127                 comput_inverse_logdet_LU_mkl(Vi, logdet);
1128             }
1129         }*/
1130     }
1131 
1132     return true;
1133 }
1134 
1135 
bend_V(eigenMatrix & Vi)1136 void gcta::bend_V(eigenMatrix &Vi)
1137 {
1138     SelfAdjointEigenSolver<eigenMatrix> eigensolver(Vi);
1139     eigenVector eval = eigensolver.eigenvalues();
1140     bending_eigenval(eval);
1141     eval.array() = 1.0 / eval.array();
1142     Vi = eigensolver.eigenvectors() * eigenDiagMat(eval) * eigensolver.eigenvectors().transpose();
1143 }
1144 
1145 
bend_A()1146 void gcta::bend_A() {
1147     _Vi.resize(0, 0);
1148     _P.resize(0, 0);
1149     cout << "Bending the GRM(s) to be positive-definite (may take a while if there are multiple GRMs)..." << endl;
1150     int i = 0;
1151     for (i = 0; i < _r_indx.size() - 1; i++) {
1152         #ifdef SINGLE_PRECISION
1153         SelfAdjointEigenSolver<eigenMatrix> eigensolver(_A[_r_indx[i]]);
1154         #else
1155         SelfAdjointEigenSolver<eigenMatrix> eigensolver((_A[_r_indx[i]]).cast<double>());
1156         #endif
1157         eigenVector eval = eigensolver.eigenvalues();
1158         if (bending_eigenval(eval)) {
1159             (_A[_r_indx[i]]) = eigensolver.eigenvectors() * eigenDiagMat(eval) * eigensolver.eigenvectors().transpose();
1160             cout << "Bending the " << i + 1 << "th GRM completed." << endl;
1161         }
1162     }
1163 }
1164 
bending_eigenval(eigenVector & eval)1165 bool gcta::bending_eigenval(eigenVector &eval) {
1166     int j = 0;
1167     double eval_m = eval.mean();
1168     if (eval.minCoeff() > 0.0) return false;
1169     double S = 0.0, P = 0.0;
1170     for (j = 0; j < eval.size(); j++) {
1171         if (eval[j] >= 0) continue;
1172         S += eval[j];
1173         P = -eval[j];
1174     }
1175     double W = S * S * 100.0 + 1;
1176     for (j = 0; j < eval.size(); j++) {
1177         if (eval[j] >= 0) continue;
1178         eval[j] = P * (S - eval[j])*(S - eval[j]) / W;
1179     }
1180     eval *= eval_m / eval.mean();
1181     return true;
1182 }
1183 
comput_inverse_logdet_LDLT(eigenMatrix & Vi,double & logdet)1184 bool gcta::comput_inverse_logdet_LDLT(eigenMatrix &Vi, double &logdet) {
1185     int i = 0, n = Vi.cols();
1186     LDLT<eigenMatrix> ldlt(Vi);
1187     eigenVector d = ldlt.vectorD();
1188 
1189     if (d.minCoeff() < 0) return false;
1190     else {
1191         logdet = 0.0;
1192         for (i = 0; i < n; i++) logdet += log(d[i]);
1193         Vi.setIdentity();
1194         ldlt.solveInPlace(Vi);
1195     }
1196     return true;
1197 }
1198 
comput_inverse_logdet_PLU(eigenMatrix & Vi,double & logdet)1199 bool gcta::comput_inverse_logdet_PLU(eigenMatrix &Vi, double &logdet)
1200 {
1201     int n = Vi.cols();
1202 
1203     PartialPivLU<eigenMatrix> lu(Vi);
1204     if (lu.determinant()<1e-6) return false;
1205     eigenVector u = lu.matrixLU().diagonal();
1206     logdet = 0.0;
1207     for (int i = 0; i < n; i++) logdet += log(fabs(u[i]));
1208     Vi = lu.inverse();
1209     return true;
1210 }
1211 
comput_inverse_logdet_LU(eigenMatrix & Vi,double & logdet)1212 bool gcta::comput_inverse_logdet_LU(eigenMatrix &Vi, double &logdet)
1213 {
1214     int n = Vi.cols();
1215 
1216     FullPivLU<eigenMatrix> lu(Vi);
1217     if (!lu.isInvertible()) return false;
1218     eigenVector u = lu.matrixLU().diagonal();
1219     logdet = 0.0;
1220     for (int i = 0; i < n; i++) logdet += log(fabs(u[i]));
1221     Vi = lu.inverse();
1222     return true;
1223 }
1224 
inverse_H(eigenMatrix & H)1225 bool gcta::inverse_H(eigenMatrix &H)
1226 {
1227     double d_buf = 0.0;
1228     if (!comput_inverse_logdet_LDLT_mkl(H, d_buf)) return false;
1229     /*{
1230         if(_reml_force_inv) {
1231             cout<<"Warning: the information matrix is non-positive definite. Switching from Cholesky to LU decomposition approach. The results might not be reliable!"<<endl;
1232             if (!comput_inverse_logdet_LU_mkl(H, d_buf)){
1233                 cout<<"Warning: the information matrix is invertible. A small positive value is added to the diagonals. The results might not be reliable!"<<endl;
1234                 int i = 0;
1235                 d_buf = H.diagonal().mean() * 0.001;
1236                 for(i = 0; i < H.rows(); i++) H(i,i) += d_buf;
1237                 if (!comput_inverse_logdet_LU_mkl(H, d_buf)) return false;
1238             }
1239         }
1240         else return false;
1241     }*/
1242     else return true;
1243 }
1244 
calcu_P(eigenMatrix & Vi,eigenMatrix & Vi_X,eigenMatrix & Xt_Vi_X_i,eigenMatrix & P)1245 double gcta::calcu_P(eigenMatrix &Vi, eigenMatrix &Vi_X, eigenMatrix &Xt_Vi_X_i, eigenMatrix &P)
1246 {
1247     Vi_X = Vi*_X;
1248     Xt_Vi_X_i = _X.transpose() * Vi_X;
1249     double logdet_Xt_Vi_X = 0.0;
1250     if(!comput_inverse_logdet_LU(Xt_Vi_X_i, logdet_Xt_Vi_X)) throw("\nError: the X^t * V^-1 * X matrix is not invertible. Please check the covariate(s) and/or the environmental factor(s).");
1251     P = Vi - Vi_X * Xt_Vi_X_i * Vi_X.transpose();
1252     return logdet_Xt_Vi_X;
1253 }
1254 
1255 // input P, calculate PA and Hi
calcu_Hi(eigenMatrix & P,eigenMatrix & Hi)1256 void gcta::calcu_Hi(eigenMatrix &P, eigenMatrix &Hi)
1257 {
1258     int i = 0, j = 0, k = 0, l = 0;
1259     double d_buf = 0.0;
1260 
1261     // Calculate PA
1262     vector<eigenMatrix> PA(_r_indx.size());
1263     for (i = 0; i < _r_indx.size(); i++) {
1264         (PA[i]).resize(_n, _n);
1265         if (_bivar_reml || _within_family) (PA[i]) = P * (_Asp[_r_indx[i]]);
1266         else (PA[i]) = P * (_A[_r_indx[i]]);
1267     }
1268 
1269     // Calculate Hi
1270     for (i = 0; i < _r_indx.size(); i++) {
1271         for (j = 0; j <= i; j++) {
1272             d_buf = 0.0;
1273             for (k = 0; k < _n; k++) {
1274                 for (l = 0; l < _n; l++) d_buf += (PA[i])(k, l)*(PA[j])(l, k);
1275             }
1276             Hi(i, j) = Hi(j, i) = d_buf;
1277         }
1278     }
1279 
1280     if (!inverse_H(Hi)){
1281         if(_reml_force_converge){
1282             cout << "Warning: the information matrix is not invertible." << endl;
1283             _reml_AI_not_invertible = true;
1284         }
1285         else throw ("Error: the information matrix is not invertible.");
1286     }
1287 }
1288 
1289 // use Fisher-scoring to estimate variance component
1290 // input P, calculate PA, H, R and varcmp
1291 
reml_equation(eigenMatrix & P,eigenMatrix & Hi,eigenVector & Py,eigenVector & varcmp)1292 void gcta::reml_equation(eigenMatrix &P, eigenMatrix &Hi, eigenVector &Py, eigenVector &varcmp)
1293 {
1294     // Calculate Hi
1295     calcu_Hi(P, Hi);
1296     if(_reml_AI_not_invertible) return;
1297 
1298     // Calculate R
1299     Py = P*_y;
1300     eigenVector R(_r_indx.size());
1301     for (int i = 0; i < _r_indx.size(); i++) {
1302         if (_bivar_reml || _within_family) R(i) = (Py.transpose()*(_Asp[_r_indx[i]]) * Py)(0, 0);
1303         else R(i) = (Py.transpose()*(_A[_r_indx[i]]) * Py)(0, 0);
1304     }
1305 
1306     // Calculate variance component
1307     varcmp = Hi*R;
1308     Hi = 2 * Hi; // for calculation of SE
1309 }
1310 
1311 // use Fisher-scoring to estimate variance component
1312 // input P, calculate PA, H, R and varcmp
1313 
ai_reml(eigenMatrix & P,eigenMatrix & Hi,eigenVector & Py,eigenVector & prev_varcmp,eigenVector & varcmp,double dlogL)1314 void gcta::ai_reml(eigenMatrix &P, eigenMatrix &Hi, eigenVector &Py, eigenVector &prev_varcmp, eigenVector &varcmp, double dlogL)
1315 {
1316     int i = 0, j = 0;
1317 
1318     Py = P*_y;
1319     eigenVector cvec(_n);
1320     eigenMatrix APy(_n, _r_indx.size());
1321     for (i = 0; i < _r_indx.size(); i++) {
1322         if (_bivar_reml || _within_family) (APy.col(i)) = (_Asp[_r_indx[i]]) * Py;
1323         else (APy.col(i)) = (_A[_r_indx[i]]) * Py;
1324     }
1325 
1326     // Calculate Hi
1327     eigenVector R(_r_indx.size());
1328     for (i = 0; i < _r_indx.size(); i++) {
1329         R(i) = (Py.transpose()*(APy.col(i)))(0, 0);
1330         cvec = P * (APy.col(i));
1331         Hi(i, i) = ((APy.col(i)).transpose() * cvec)(0, 0);
1332         for (j = 0; j < i; j++) Hi(j, i) = Hi(i, j) = ((APy.col(j)).transpose() * cvec)(0, 0);
1333     }
1334     Hi = 0.5 * Hi;
1335 
1336     // Calcualte tr(PA) and dL
1337     eigenVector tr_PA;
1338     calcu_tr_PA(P, tr_PA);
1339     R = -0.5 * (tr_PA - R);
1340 
1341     // Calculate variance component
1342     if (!inverse_H(Hi)){
1343         if(_reml_force_converge){
1344             cout << "Warning: the information matrix is not invertible." << endl;
1345             _reml_AI_not_invertible = true;
1346             return;
1347         }
1348         else throw ("Error: the information matrix is not invertible.");
1349     }
1350 
1351     eigenVector delta(_r_indx.size());
1352     delta = Hi*R;
1353     if (dlogL > 1.0) varcmp = prev_varcmp + 0.316 * delta;
1354     else varcmp = prev_varcmp + delta;
1355 }
1356 
1357 // input P, calculate varcmp
1358 
em_reml(eigenMatrix & P,eigenVector & Py,eigenVector & prev_varcmp,eigenVector & varcmp)1359 void gcta::em_reml(eigenMatrix &P, eigenVector &Py, eigenVector &prev_varcmp, eigenVector &varcmp)
1360 {
1361     int i = 0;
1362 
1363     // Calculate trace(PA)
1364     eigenVector tr_PA;
1365     calcu_tr_PA(P, tr_PA);
1366 
1367     // Calculate R
1368     Py = P*_y;
1369     eigenVector R(_r_indx.size());
1370     for (i = 0; i < _r_indx.size(); i++) {
1371         if (_bivar_reml || _within_family) R(i) = (Py.transpose()*(_Asp[_r_indx[i]]) * Py)(0, 0);
1372         else R(i) = (Py.transpose()*(_A[_r_indx[i]]) * Py)(0, 0);
1373     }
1374 
1375     // Calculate variance component
1376     for (i = 0; i < _r_indx.size(); i++) varcmp(i) = (prev_varcmp(i) * _n - prev_varcmp(i) * prev_varcmp(i) * tr_PA(i) + prev_varcmp(i) * prev_varcmp(i) * R(i)) / _n;
1377 
1378     // added by Jian Yang Dec 2014
1379     //varcmp = (varcmp.array() - prev_varcmp.array())*2 + prev_varcmp.array();
1380 }
1381 
1382 // input P, calculate tr(PA)
calcu_tr_PA(eigenMatrix & P,eigenVector & tr_PA)1383 void gcta::calcu_tr_PA(eigenMatrix &P, eigenVector &tr_PA) {
1384     int i = 0, k = 0, l = 0;
1385     double d_buf = 0.0;
1386 
1387     // Calculate trace(PA)
1388     tr_PA.resize(_r_indx.size());
1389     for (i = 0; i < _r_indx.size(); i++) {
1390         if (_bivar_reml || _within_family) tr_PA(i) = (P * (_Asp[_r_indx[i]])).diagonal().sum();
1391         else {
1392             d_buf = 0.0;
1393             for (k = 0; k < _n; k++) {
1394                 for (l = 0; l < _n; l++) d_buf += P(k, l)*(_A[_r_indx[i]])(k, l);
1395             }
1396             tr_PA(i) = d_buf;
1397         }
1398     }
1399 }
1400 
1401 // blue estimate of SNP effect
1402 
blup_snp_geno()1403 void gcta::blup_snp_geno() {
1404     check_autosome();
1405 
1406     if (_mu.empty()) calcu_mu();
1407 
1408     int i = 0, j = 0, k = 0, col_num = _varcmp_Py.cols();
1409     double x = 0.0, fcount = 0.0;
1410 
1411     // Calcuate A matrix
1412     cout << "Calculating the BLUP solution to SNP effects ..." << endl;
1413     vector<double> var_SNP(_include.size());
1414     eigenMatrix b_SNP = eigenMatrix::Zero(_include.size(), col_num); // variance of each SNP, 2pq
1415     for (j = 0; j < _include.size(); j++) {
1416         var_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
1417         if (fabs(var_SNP[j]) < 1.0e-50) var_SNP[j] = 0.0;
1418         else var_SNP[j] = 1.0 / var_SNP[j];
1419     }
1420 
1421     for (k = 0; k < _include.size(); k++) {
1422         fcount = 0.0;
1423         for (i = 0; i < _keep.size(); i++) {
1424             if (!_snp_1[_include[k]][_keep[i]] || _snp_2[_include[k]][_keep[i]]) {
1425                 if (_allele1[_include[k]] == _ref_A[_include[k]]) x = _snp_1[_include[k]][_keep[i]] + _snp_2[_include[k]][_keep[i]];
1426                 else x = 2.0 - (_snp_1[_include[k]][_keep[i]] + _snp_2[_include[k]][_keep[i]]);
1427                 x = (x - _mu[_include[k]]);
1428                 for (j = 0; j < col_num; j++) b_SNP(k, j) += x * _varcmp_Py(i, j);
1429                 fcount += 1.0;
1430             }
1431         }
1432         for (j = 0; j < col_num; j++) b_SNP(k, j) = (b_SNP(k, j) * var_SNP[k] / fcount)*((double) _keep.size() / (double) _include.size());
1433     }
1434     output_blup_snp(b_SNP);
1435 }
1436 
blup_snp_dosage()1437 void gcta::blup_snp_dosage() {
1438     check_autosome();
1439 
1440     if (_mu.empty()) calcu_mu();
1441 
1442     int i = 0, j = 0, k = 0, col_num = _varcmp_Py.cols();
1443 
1444     // Subtract each element by 2p
1445     for (i = 0; i < _keep.size(); i++) {
1446         for (j = 0; j < _include.size(); j++) _geno_dose[_keep[i]][_include[j]] -= _mu[_include[j]];
1447     }
1448 
1449     // Calculate A matrix
1450     cout << "Calculating the BLUP solution to SNP effects using imputed dosage scores ... " << endl;
1451     vector<double> var_SNP(_include.size()); // variance of each SNP, 2pq
1452     eigenMatrix b_SNP = eigenMatrix::Zero(_include.size(), col_num); // variance of each SNP, 2pq
1453     for (j = 0; j < _include.size(); j++) {
1454         for (i = 0; i < _keep.size(); i++) var_SNP[j] += _geno_dose[_keep[i]][_include[j]] * _geno_dose[_keep[i]][_include[j]];
1455         var_SNP[j] /= (double) (_keep.size() - 1);
1456         if (fabs(var_SNP[j]) < 1.0e-50) var_SNP[j] = 0.0;
1457         else var_SNP[j] = 1.0 / var_SNP[j];
1458     }
1459     for (k = 0; k < _include.size(); k++) {
1460         for (i = 0; i < _keep.size(); i++) {
1461             for (j = 0; j < col_num; j++) b_SNP(k, j) += _geno_dose[_keep[i]][_include[k]] * _varcmp_Py(i, j);
1462         }
1463         for (j = 0; j < col_num; j++) b_SNP(k, j) = b_SNP(k, j) * var_SNP[k] / (double) _include.size();
1464     }
1465     output_blup_snp(b_SNP);
1466 }
1467 
output_blup_snp(eigenMatrix & b_SNP)1468 void gcta::output_blup_snp(eigenMatrix &b_SNP) {
1469     string o_b_snp_file = _out + ".snp.blp";
1470     ofstream o_b_snp(o_b_snp_file.c_str());
1471     if (!o_b_snp) throw ("Error: can not open the file " + o_b_snp_file + " to write.");
1472     int i = 0, j = 0, col_num = b_SNP.cols();
1473     cout << "Writing BLUP solutions of SNP effects for " << _include.size() << " SNPs to [" + o_b_snp_file + "]." << endl;
1474     for (i = 0; i < _include.size(); i++) {
1475         o_b_snp << _snp_name[_include[i]] << "\t" << _ref_A[_include[i]] << "\t";
1476         for (j = 0; j < col_num; j++) o_b_snp << b_SNP(i, j) << "\t";
1477         o_b_snp << endl;
1478     }
1479     o_b_snp.close();
1480     cout << "BLUP solutions of SNP effects for " << _include.size() << " SNPs have been saved in the file [" + o_b_snp_file + "]." << endl;
1481 }
1482 
HE_reg(string grm_file,string phen_file,string keep_indi_file,string remove_indi_file,int mphen)1483 void gcta::HE_reg(string grm_file, string phen_file, string keep_indi_file, string remove_indi_file, int mphen) {
1484     int i = 0, j = 0, k = 0, l = 0;
1485     stringstream errmsg;
1486     vector<string> phen_ID, grm_id;
1487     vector< vector<string> > phen_buf; // save individuals by column
1488 
1489     read_grm(grm_file, grm_id, true, false, true);
1490     update_id_map_kp(grm_id, _id_map, _keep);
1491     read_phen(phen_file, phen_ID, phen_buf, mphen);
1492     update_id_map_kp(phen_ID, _id_map, _keep);
1493     if (!keep_indi_file.empty()) keep_indi(keep_indi_file);
1494     if (!remove_indi_file.empty()) remove_indi(remove_indi_file);
1495 
1496     vector<string> uni_id;
1497     map<string, int> uni_id_map;
1498     map<string, int>::iterator iter;
1499     for (i = 0; i < _keep.size(); i++) {
1500         uni_id.push_back(_fid[_keep[i]] + ":" + _pid[_keep[i]]);
1501         uni_id_map.insert(pair<string, int>(_fid[_keep[i]] + ":" + _pid[_keep[i]], i));
1502     }
1503     _n = _keep.size();
1504     if (_n < 1) throw ("Error: no individual is in common in the input files.");
1505 
1506     _y.setZero(_n);
1507     for (i = 0; i < phen_ID.size(); i++) {
1508         iter = uni_id_map.find(phen_ID[i]);
1509         if (iter == uni_id_map.end()) continue;
1510         _y[iter->second] = atof(phen_buf[i][mphen - 1].c_str());
1511     }
1512 
1513     cout << "\nPerforming Haseman-Elston regression ...\n" << endl;
1514     int n = _n * (_n - 1) / 2;
1515     /*   vector<bool> nomiss(_n*(_n-1));
1516        for(i=0, k=0; i<_n; i++){
1517            for(j=0; j<i; j++, k++){
1518                if(CommFunc::FloatNotEqual(_grm(i,j),0)){
1519                    nomiss[k]=true;
1520                    n++;
1521                }
1522                else nomiss[k]=false;
1523            }
1524        }*/
1525 
1526     // normalise phenotype
1527     cout << "Standardising the phenotype ..." << endl;
1528     eigenVector y = _y.array() - _y.mean();
1529     y = y.array() / sqrt(y.squaredNorm() / (_n - 1.0));
1530 
1531     vector<double> y_cp(n), y_sd(n), x(n), rst;
1532     for (i = 0, k = 0; i < _n; i++) {
1533         for (j = 0; j < i; j++, k++) {
1534             y_sd[k] = (y[i] - y[j])*(y[i] - y[j]);
1535             y_cp[k] = y[i]*y[j];
1536             x[k] = _grm(i, j);
1537         }
1538     }
1539     eigenMatrix reg_sum_sd = reg(y_sd, x, rst, true);
1540     eigenMatrix reg_sum_cp = reg(y_cp, x, rst, true);
1541 
1542     stringstream ss;
1543     ss << "HE-SD" << endl;
1544     ss << "Coefficient\tEstimate\tSE\tP\n";
1545     ss << "Intercept\t" << reg_sum_sd.row(0) << endl;
1546     ss << "Slope\t" << reg_sum_sd.row(1) << endl;
1547     ss << "V(G)/Vp\t" << -1 * reg_sum_sd(1, 0) / reg_sum_sd(0.0) << "\t" << reg_sum_sd(1, 1) / fabs(reg_sum_sd(0.0)) << endl;
1548     ss << "\nHE-CP" << endl;
1549     ss << "Coefficient\tEstimate\tSE\tP\n";
1550     ss << "Intercept\t" << reg_sum_cp.row(0) << endl;
1551     ss << "Slope\t" << reg_sum_cp.row(1) << endl;
1552     ss << "V(G)/Vp\t" << reg_sum_cp(1, 0) << "\t" << reg_sum_cp(1, 1) << endl;
1553 
1554     cout << ss.str() << endl;
1555     string ofile = _out + ".HEreg";
1556     ofstream os(ofile.c_str());
1557     if (!os) throw ("Error: can not open the file [" + ofile + "] to write.");
1558     os << ss.str() << endl;
1559     cout << "Results from Haseman-Elston regression have been saved in [" + ofile + "]." << endl;
1560 
1561 }