1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementations of functions for REML analysis
5 *
6 * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13 #include "gcta.h"
14
// Turn on the _reml_force_inv flag (read elsewhere in the class, outside this chunk).
void gcta::set_reml_force_inv()
{
    _reml_force_inv = true;
}
19
// Turn on the _reml_force_converge flag (read elsewhere in the class, outside this chunk).
void gcta::set_reml_force_converge()
{
    _reml_force_converge = true;
}
24
// Turn on the _reml_no_converge flag (read elsewhere in the class, outside this chunk).
void gcta::set_reml_no_converge()
{
    _reml_no_converge = true;
}
29
// Turn on the _reml_fixed_var flag (read elsewhere in the class, outside this chunk).
void gcta::set_reml_fixed_var()
{
    _reml_fixed_var = true;
}
34
// Select the REML algorithm; per the comment in reml(): 0 = AI, 1 = Fisher-scoring, 2 = EM.
void gcta::set_reml_mtd(int reml_mtd)
{
    _reml_mtd = reml_mtd;
}
39
read_phen(string phen_file,vector<string> & phen_ID,vector<vector<string>> & phen_buf,int mphen,int mphen2)40 void gcta::read_phen(string phen_file, vector<string> &phen_ID, vector< vector<string> > &phen_buf, int mphen, int mphen2) {
41 // Read phenotype data
42 ifstream in_phen(phen_file.c_str());
43 if (!in_phen) throw ("Error: can not open the file [" + phen_file + "] to read.");
44
45 int i = 0;
46 vector<string> fid, pid, vs_buf;
47 string str_buf, fid_buf, pid_buf;
48 phen_ID.clear();
49 phen_buf.clear();
50 cout << "Reading phenotypes from [" + phen_file + "]." << endl;
51 getline(in_phen, str_buf);
52 int phen_num = StrFunc::split_string(str_buf, vs_buf) - 2;
53 if (phen_num <= 0) throw ("Error: no phenotype data is found.");
54 if (phen_num > 1) cout << "There are " << phen_num << " traits specified in the file [" + phen_file + "]." << endl;
55 if (mphen > phen_num) {
56 stringstream errmsg;
57 errmsg << "Error: can not find the " << mphen << "th trait in the file [" + phen_file + "].";
58 throw (errmsg.str());
59 }
60 if (_bivar_reml && mphen2 > phen_num) {
61 stringstream errmsg;
62 errmsg << "Error: can not find the " << mphen2 << "th trait in the file [" + phen_file + "].";
63 throw (errmsg.str());
64 }
65 if (_bivar_reml) cout << "Traits " << mphen << " and " << mphen2 << " are included in the bivariate analysis." << endl;
66 else {
67 if (phen_num > 1) cout << "Trait #" << mphen << " is included for analysis." << endl;
68 }
69 in_phen.seekg(ios::beg);
70 mphen--;
71 mphen2--;
72 int line = 1;
73 while (in_phen) {
74 line++;
75 in_phen >> fid_buf;
76 if (in_phen.eof()) break;
77 in_phen >> pid_buf;
78 getline(in_phen, str_buf);
79 if (StrFunc::split_string(str_buf, vs_buf) != phen_num) {
80 stringstream errmsg;
81 errmsg << "Error: " << vs_buf.size() - phen_num << " phenotype values are missing in line #" << line << " in the file [" + phen_file + "]";
82 throw (errmsg.str());
83 }
84 if (_bivar_reml) {
85 if ((vs_buf[mphen] == "-9" || vs_buf[mphen] == "NA") && (vs_buf[mphen2] == "-9" || vs_buf[mphen2] == "NA")) continue;
86 } else {
87 if (vs_buf[mphen] == "-9" || vs_buf[mphen] == "NA") continue;
88 }
89 phen_ID.push_back(fid_buf + ":" + pid_buf);
90 fid.push_back(fid_buf);
91 pid.push_back(pid_buf);
92 phen_buf.push_back(vs_buf);
93 }
94 in_phen.close();
95 cout << "Non-missing phenotypes of " << phen_buf.size() << " individuals are included from [" + phen_file + "]." << endl;
96
97 if (_id_map.empty()) {
98 _fid = fid;
99 _pid = pid;
100 _indi_num = _fid.size();
101 init_keep();
102 }
103 }
104
read_covar(string covar_file,vector<string> & covar_ID,vector<vector<string>> & covar,bool qcovar_flag)105 int gcta::read_covar(string covar_file, vector<string> &covar_ID, vector< vector<string> > &covar, bool qcovar_flag) {
106 // Read covariate data
107 ifstream in_covar(covar_file.c_str());
108 if (!in_covar) throw ("Error: can not open the file [" + covar_file + "] to read.");
109
110 int i = 0, covar_num = 0;
111 string str_buf, id_buf;
112 vector<string> covar_buf, vs_buf;
113 covar_ID.clear();
114 covar.clear();
115 if (qcovar_flag) cout << "Reading quantitative covariates from [" + covar_file + "]." << endl;
116 else cout << "Reading discrete covariate(s) from [" + covar_file + "]." << endl;
117 covar_num = read_fac(in_covar, covar_ID, covar);
118 if (qcovar_flag) cout << covar_num << " quantitative covariate(s) of " << covar_ID.size() << " individuals read from [" + covar_file + "]." << endl;
119 else cout << covar_num << " discrete covariate(s) of " << covar_ID.size() << " individuals are included from [" + covar_file + "]." << endl;
120
121 return covar_num;
122 }
123
read_fac(ifstream & ifstrm,vector<string> & ID,vector<vector<string>> & fac)124 int gcta::read_fac(ifstream &ifstrm, vector<string> &ID, vector< vector<string> > &fac) {
125 int i = 0, line = 0, fac_num = 0, prev_fac_num = 0;
126 string str_buf, id_buf;
127 vector<string> vs_buf;
128 while (ifstrm) {
129 ifstrm >> str_buf;
130 if (ifstrm.eof()) break;
131 id_buf = str_buf;
132 ifstrm >> str_buf;
133 id_buf += ":" + str_buf;
134 getline(ifstrm, str_buf);
135 fac_num = StrFunc::split_string(str_buf, vs_buf);
136 if (line > 0 && fac_num != prev_fac_num) throw ("Error: each row should have the same number of columns.\n" + id_buf + "\t" + str_buf);
137 line++;
138 prev_fac_num = fac_num;
139 bool continue_flag = false;
140 for (i = 0; i < fac_num; i++) {
141 if (vs_buf[i] == "-9" || vs_buf[i] == "NA") continue_flag = true;
142 }
143 if (continue_flag) continue;
144 ID.push_back(id_buf);
145 fac.push_back(vs_buf);
146 }
147 ifstrm.close();
148 return fac_num;
149 }
150
read_GE(string GE_file,vector<string> & GE_ID,vector<vector<string>> & GE,bool qGE_flag)151 int gcta::read_GE(string GE_file, vector<string> &GE_ID, vector< vector<string> > &GE, bool qGE_flag) {
152 // Read phenotype data
153 ifstream in_GE(GE_file.c_str());
154 if (!in_GE) throw ("Error: can not open the file [" + GE_file + "] to read.");
155
156 string str_buf, id_buf;
157 vector<string> vs_buf;
158 GE_ID.clear();
159 GE.clear();
160 string env = "environmental";
161 if (qGE_flag == true) env = "continuous " + env;
162 else env = "categorical " + env;
163 cout << "Reading " << env << " factor(s) for the analysis of GE interaction from [" + GE_file + "]." << endl;
164 int GE_num = read_fac(in_GE, GE_ID, GE);
165 if (GE_num == 0) throw ("Error: no " + env + " factor is specified. Please check the format of the file: " + GE_file + ".");
166 cout << GE_num << " " << env << " factor(s) for " << GE_ID.size() << " individuals are included from [" + GE_file + "]." << endl;
167
168 return GE_num;
169 }
170
// Top-level driver of a (uni- or bivariate) REML analysis.
// Steps: read GRM(s), phenotype, covariates and GxE factors; intersect the
// individual IDs across all inputs; build the variance-component matrices _A
// (GRM terms, GxE Hadamard-product terms, residual identity) and the
// fixed-effect design matrix _X; then hand over to reml().
// Key inputs: grm_file (single GRM prefix, or a list file when m_grm_flag),
// mphen (1-based trait column), drop (components to drop for the LRT),
// reml_priors / reml_priors_var (starting values), mlmassoc (MLM-association
// mode: variance components are returned in _varcmp instead of being printed).
void gcta::fit_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string qGE_file, string GE_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, bool no_constrain, bool mlmassoc, bool within_family, bool reml_bending, bool reml_diag_one) {
    _within_family = within_family;
    _reml_mtd = reml_mtd;
    _reml_max_iter = MaxIter;
    _reml_diag_one = reml_diag_one;
    int i = 0, j = 0, k = 0;
    // Which optional inputs were supplied?
    bool grm_flag = (!grm_file.empty());
    bool qcovar_flag = (!qcovar_file.empty());
    bool covar_flag = (!covar_file.empty());
    bool GE_flag = (!GE_file.empty());
    bool qGE_flag = (!qGE_file.empty());
    if (m_grm_flag) grm_flag = false; // multi-GRM mode overrides single-GRM mode

    // Read data
    stringstream errmsg;
    int qcovar_num = 0, covar_num = 0, qE_fac_num = 0, E_fac_num = 0;
    vector<string> phen_ID, qcovar_ID, covar_ID, qGE_ID, GE_ID, grm_id, grm_files;
    vector< vector<string> > phen_buf, qcovar, covar, GE, qGE; // save individuals by column

    // Each input file shrinks the kept-individual set (_keep) via update_id_map_kp.
    if (grm_flag) {
        read_grm(grm_file, grm_id, true, false, !(adj_grm_fac > -1.0));
        update_id_map_kp(grm_id, _id_map, _keep);
        grm_files.push_back(grm_file);
    }
    else if (m_grm_flag) {
        // grm_file is a list file containing one GRM prefix per line.
        read_grm_filenames(grm_file, grm_files, false);
        for (i = 0; i < grm_files.size(); i++) {
            read_grm(grm_files[i], grm_id, false, true, !(adj_grm_fac > -1.0));
            update_id_map_kp(grm_id, _id_map, _keep);
        }
    }
    read_phen(phen_file, phen_ID, phen_buf, mphen);
    update_id_map_kp(phen_ID, _id_map, _keep);
    if (qcovar_flag) {
        qcovar_num = read_covar(qcovar_file, qcovar_ID, qcovar, true);
        update_id_map_kp(qcovar_ID, _id_map, _keep);
    }
    if (covar_flag) {
        covar_num = read_covar(covar_file, covar_ID, covar, false);
        update_id_map_kp(covar_ID, _id_map, _keep);
    }
    if (qGE_flag) {
        qE_fac_num = read_GE(qGE_file, qGE_ID, qGE, true);
        update_id_map_kp(qGE_ID, _id_map, _keep);
    }
    if (GE_flag) {
        E_fac_num = read_GE(GE_file, GE_ID, GE, false);
        update_id_map_kp(GE_ID, _id_map, _keep);
    }
    // Explicit keep/remove lists are ignored in MLM-association mode.
    if (!mlmassoc) {
        if (!keep_indi_file.empty()) keep_indi(keep_indi_file);
        if (!remove_indi_file.empty()) remove_indi(remove_indi_file);
    }
    if (grm_flag) {
        // Optional GRM post-processing (relatedness pruning, adjustments).
        if (grm_cutoff>-1.0) rm_cor_indi(grm_cutoff);
        if (!sex_file.empty()) update_sex(sex_file);
        if (adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
        if (dosage_compen>-1) dc(dosage_compen);
        _grm_N.resize(0, 0); // pair-count matrix no longer needed
    }

    // Final analysis sample: map "FID:IID" -> row index 0.._n-1.
    vector<string> uni_id;
    map<string, int> uni_id_map;
    map<string, int>::iterator iter;
    for (i = 0; i < _keep.size(); i++) {
        uni_id.push_back(_fid[_keep[i]] + ":" + _pid[_keep[i]]);
        uni_id_map.insert(pair<string, int>(_fid[_keep[i]] + ":" + _pid[_keep[i]], i));
    }
    _n = _keep.size();
    if (_n < 1) throw ("Error: no individual is in common in the input files.");

    // construct model terms
    _y.setZero(_n);
    for (i = 0; i < phen_ID.size(); i++) {
        iter = uni_id_map.find(phen_ID[i]);
        if (iter == uni_id_map.end()) continue; // individual was filtered out
        _y[iter->second] = atof(phen_buf[i][mphen - 1].c_str());
    }

    // case-control
    _ncase = 0.0;
    _flag_CC = check_case_control(_ncase, _y);
    cout << endl;
    if (_flag_CC) {
        //if(mlmassoc) throw("Error: the option --mlm-assoc is valid for the quantitative trait only.");
        if (!_bivar_reml) {
            if (!mlmassoc && prevalence<-1) cout << "Note: you can specify the disease prevalence by the option --prevalence so that GCTA can transform the variance explained to the underlying liability scale." << endl;
        } else {
            cout << "Note: you can specify the prevalences of the two diseases by the option --reml-bivar-prevalence so that GCTA can transform the estimates of variance explained to the underlying liability scale." << endl;
        }
    }

    // Build the list of variance components (_r_indx) and their matrices (_A).
    // 'pos' tracks the next free slot in _A as GRM and GxE terms are appended.
    int pos = 0;
    _r_indx.clear();
    vector<int> kp;
    if (grm_flag) {
        // one GRM term + one term per (q)E factor + residual
        for (i = 0; i < 1 + qE_fac_num + E_fac_num + 1; i++) _r_indx.push_back(i);
        if (!no_lrt) drop_comp(drop);
        _A.resize(_r_indx.size());
        if (mlmassoc) StrFunc::match(uni_id, grm_id, kp);
        else kp = _keep;
        (_A[0]) = eigenMatrix::Zero(_n, _n);

        // Copy the symmetric GRM entries for the kept individuals.
#pragma omp parallel for private(j)
        for (i = 0; i < _n; i++) {
            for (j = 0; j <= i; j++) (_A[0])(j, i) = (_A[0])(i, j) = _grm(kp[i], kp[j]);
        }
        if (_reml_diag_one) {
            double diag_mean = (_A[0]).diagonal().mean();
            cout << "Mean of diagonal elements of the GRM = " << diag_mean << endl;
            // NOTE: each row is scaled by that individual's own diagonal element;
            // diag_mean is only printed, not used in the scaling.
#pragma omp parallel for private(j)
            for (i = 0; i < _n; i++) {
                for (j = 0; j <= i; j++) {
                    (_A[0])(i, j) /= (_A[0])(i, i);
                    (_A[0])(j, i) = (_A[0])(i, j);
                }
            }
        }

        pos++;
        _grm.resize(0, 0); // raw GRM no longer needed
    } else if (m_grm_flag) {
        if (!sex_file.empty()) update_sex(sex_file);
        // one term per GRM per (1 + q/E factor) + residual
        for (i = 0; i < (1 + qE_fac_num + E_fac_num) * grm_files.size() + 1; i++) _r_indx.push_back(i);
        if (!no_lrt) drop_comp(drop);
        _A.resize(_r_indx.size());
        string prev_file = grm_files[0];
        vector<string> prev_grm_id(grm_id);
        cout << "There are " << grm_files.size() << " GRM file names specified in the file [" + grm_file + "]." << endl;
        for (i = 0; i < grm_files.size(); i++, pos++) {
            cout << "Reading the GRM from the " << i + 1 << "th file ..." << endl;
            read_grm(grm_files[i], grm_id, true, false, !(adj_grm_fac > -1.0));
            if (adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
            if (dosage_compen>-1) dc(dosage_compen);
            StrFunc::match(uni_id, grm_id, kp);
            (_A[pos]) = eigenMatrix::Zero(_n, _n);

            // _grm is stored lower-triangular: index with the larger id first.
#pragma omp parallel for private(j)
            for (j = 0; j < _n; j++) {
                for (k = 0; k <= j; k++) {
                    if (kp[j] >= kp[k]) (_A[pos])(k, j) = (_A[pos])(j, k) = _grm(kp[j], kp[k]);
                    else (_A[pos])(k, j) = (_A[pos])(j, k) = _grm(kp[k], kp[j]);
                }
            }

            if (_reml_diag_one) {
                double diag_mean = (_A[pos]).diagonal().mean();
                cout << "Mean of diagonal elements of the GRM = " << diag_mean << endl;
#pragma omp parallel for private(j)
                for (j = 0; j < _n; j++) {
                    //(_A[pos])(j,j)=diag_mean;
                    for (k = 0; k <= j; k++) {
                        (_A[pos])(j, k) /= (_A[pos])(j, j);
                        (_A[pos])(k, j) = (_A[pos])(j, k);
                    }
                }
            }

            prev_file = grm_files[i];
            prev_grm_id = grm_id;
        }
        _grm_N.resize(0, 0);
        _grm.resize(0, 0);
    }
    else {
        // No GRM: residual-only model.
        _r_indx.push_back(0);
        _A.resize(_r_indx.size());
    }
    // Last component is always the residual (identity matrix).
    _A[_r_indx.size() - 1] = eigenMatrix::Identity(_n, _n);

    // GE interaction
    vector<eigenMatrix> E_float(E_fac_num);
    eigenMatrix qE_float, mbuf;
    if (qGE_flag) {
        // Gather the continuous factor values into an n x qE_fac_num matrix.
        qE_float.resize(_n, qE_fac_num);
        for (i = 0; i < qGE_ID.size(); i++) {
            iter = uni_id_map.find(qGE_ID[i]);
            if (iter == uni_id_map.end()) continue;
            for (j = 0; j < qE_fac_num; j++) qE_float(iter->second, j) = atof(qGE[i][j].c_str());
        }
        // GxE term = elementwise product of the GRM with the factor outer product.
        for (j = 0; j < qE_fac_num; j++) {
            mbuf = ((qE_float.block(0, j, _n, 1))*(qE_float.block(0, j, _n, 1)).transpose());
            for (i = 0; i < grm_files.size(); i++, pos++) (_A[pos]) = (_A[i]).array() * mbuf.array();
        }
    }
    if (GE_flag) {
        // Gather the categorical factor labels, dummy-code them, then build
        // the corresponding GxE matrices like above.
        vector< vector<string> > E_str(E_fac_num);
        for (i = 0; i < E_fac_num; i++) E_str[i].resize(_n);
        for (i = 0; i < GE_ID.size(); i++) {
            iter = uni_id_map.find(GE_ID[i]);
            if (iter != uni_id_map.end()) {
                for (j = 0; j < E_fac_num; j++) E_str[j][iter->second] = GE[i][j];
            }
        }
        for (j = 0; j < E_fac_num; j++) {
            stringstream errmsg;
            errmsg << "Error: too many classes for the " << j + 1 << "th environmental factor. \nPlease make sure you input a discrete variable as the environmental factor.";
            string errmsg1 = errmsg.str();
            errmsg.str("");
            errmsg << "Error: the " << j + 1 << "th environmental factor has only one class.";
            string errmsg2 = errmsg.str();
            coeff_mat(E_str[j], E_float[j], errmsg1, errmsg2);
            mbuf = ((E_float[j])*(E_float[j]).transpose());
            for (i = 0; i < grm_files.size(); i++, pos++) (_A[pos]) = (_A[i]).array() * mbuf.array();
        }
    }

    // construct X matrix
    construct_X(_n, uni_id_map, qcovar_flag, qcovar_num, qcovar_ID, qcovar, covar_flag, covar_num, covar_ID, covar, E_float, qE_float);

    // names of variance component
    // (index suffixes are omitted when there is only one GRM / one factor)
    for (i = 0; i < grm_files.size(); i++) {
        stringstream strstrm;
        if (grm_files.size() == 1) strstrm << "";
        else strstrm << i + 1;
        _var_name.push_back("V(G" + strstrm.str() + ")");
        _hsq_name.push_back("V(G" + strstrm.str() + ")/Vp");
    }
    for (j = 0; j < qE_fac_num; j++) {
        for (i = 0; i < grm_files.size(); i++) {
            stringstream strstrm1, strstrm2;
            if (grm_files.size() == 1) strstrm1 << "";
            else strstrm1 << i + 1;
            if (qE_fac_num == 1) strstrm2 << "";
            else strstrm2 << j + 1;
            _var_name.push_back("V(G" + strstrm1.str() + "xqE" + strstrm2.str() + ")");
            _hsq_name.push_back("V(G" + strstrm1.str() + "xqE" + strstrm2.str() + ")" + "/Vp");
        }
    }
    for (j = 0; j < E_fac_num; j++) {
        for (i = 0; i < grm_files.size(); i++) {
            stringstream strstrm1, strstrm2;
            if (grm_files.size() == 1) strstrm1 << "";
            else strstrm1 << i + 1;
            if (E_fac_num == 1) strstrm2 << "";
            else strstrm2 << j + 1;
            _var_name.push_back("V(G" + strstrm1.str() + "xE" + strstrm2.str() + ")");
            _hsq_name.push_back("V(G" + strstrm1.str() + "xE" + strstrm2.str() + ")" + "/Vp");
        }
    }
    _var_name.push_back("V(e)");

    cout << _n << " individuals are in common in these files." << endl;

    // within family
    if (_within_family) detect_family();

    // bending
    if (reml_bending) bend_A();

    // run REML algorithm
    reml(pred_rand_eff, est_fix_eff, reml_priors, reml_priors_var, prevalence, -2.0, no_constrain, no_lrt, mlmassoc);
}
424
drop_comp(vector<int> & drop)425 void gcta::drop_comp(vector<int> &drop) {
426 int i = 0;
427 stringstream errmsg;
428 _r_indx_drop = _r_indx;
429 stable_sort(drop.begin(), drop.end());
430 drop.erase(unique(drop.begin(), drop.end()), drop.end());
431 for (i = drop.size() - 1; i >= 0; i--) {
432 if (drop[i] < 1 || drop[i] > _r_indx.size() - 1) {
433 errmsg << "Error: there " << (_r_indx.size() > 2 ? "are" : "is") << " only " << _r_indx.size() - 1 << " genetic variance component in the model. You can't drop the " << drop[i] << "-th component.";
434 throw (errmsg.str());
435 }
436 _r_indx_drop.erase(_r_indx_drop.begin() + drop[i] - 1);
437 }
438 if (_r_indx.size() == _r_indx_drop.size()) throw ("Error: no component has been dropped from the model. Please check the --reml-lrt option.");
439 }
440
// Build the fixed-effect design matrix _X (n x _X_c) and set _X_c.
// Columns, in order: intercept; quantitative covariates; dummy-coded discrete
// covariates (first class dropped as reference); continuous GxE factor
// columns; dummy-coded categorical GxE factor columns (first class dropped).
// Rows follow uni_id_map ("FID:IID" -> row index); individuals absent from a
// covariate file are simply skipped when filling values.
void gcta::construct_X(int n, map<string, int> &uni_id_map, bool qcovar_flag, int qcovar_num, vector<string> &qcovar_ID, vector< vector<string> > &qcovar, bool covar_flag, int covar_num, vector<string> &covar_ID, vector< vector<string> > &covar, vector<eigenMatrix> &E_float, eigenMatrix &qE_float) {
    int i = 0, j = 0;
    map<string, int>::iterator iter;
    stringstream errmsg;

    _X_c = 1; // column 0 is the intercept
    // quantitative covariates
    eigenMatrix X_q;
    if (qcovar_flag) {
        X_q.resize(n, qcovar_num);
        for (i = 0; i < qcovar_ID.size(); i++) {
            iter = uni_id_map.find(qcovar_ID[i]);
            if (iter == uni_id_map.end()) continue; // not in the analysis sample
            for (j = 0; j < qcovar_num; j++) X_q(iter->second, j) = atof(qcovar[i][j].c_str());
        }
        if (qcovar_num == 0) throw ("Error: no quantitative covariate is found.");
        cout << qcovar_num << " quantitative variable(s) included as covariate(s)." << endl;
        _X_c += qcovar_num;
    }
    // discrete covariates
    vector<eigenMatrix> X_d;
    if (covar_flag) {
        // Transpose the per-individual records into per-covariate columns.
        vector< vector<string> > covar_tmp(covar_num);
        for (i = 0; i < covar_num; i++) covar_tmp[i].resize(n);
        for (i = 0; i < covar_ID.size(); i++) {
            iter = uni_id_map.find(covar_ID[i]);
            if (iter == uni_id_map.end()) continue;
            for (j = 0; j < covar_num; j++) covar_tmp[j][iter->second] = covar[i][j];
        }
        cout << covar_num << " discrete variable(s) included as covariate(s)." << endl;
        X_d.resize(covar_num);
        for (i = 0; i < covar_num; i++) {
            stringstream errmsg;
            errmsg << "Error: too many classes for the " << i + 1 << "th discrete variable. \nPlease use the --qcovar if it is a quantitative covariate.";
            string errmsg1 = errmsg.str();
            errmsg.str("");
            errmsg << "Error: the " << i + 1 << "th discrete variable has only one class.";
            string errmsg2 = errmsg.str();
            // Dummy-code this covariate: one 0/1 column per class.
            coeff_mat(covar_tmp[i], X_d[i], errmsg1, errmsg2);
            _X_c += (X_d[i]).cols() - 1; // first class serves as the reference level
        }
    }

    // E factor
    _X_c += qE_float.cols();
    for (i = 0; i < E_float.size(); i++) _X_c += (E_float[i]).cols() - 1;

    // Construct _X
    // 'col' is the running column offset as each section is copied in.
    int col = 0;
    _X.resize(n, _X_c);
    _X.block(0, col, n, 1) = eigenMatrix::Ones(n, 1); // intercept
    col++;
    if (qcovar_flag) {
        _X.block(0, col, n, X_q.cols()) = X_q;
        col += X_q.cols();
    }
    for (i = 0; i < X_d.size(); i++) {
        // Copy all dummy columns except the first (reference) one.
        _X.block(0, col, n, (X_d[i]).cols() - 1) = (X_d[i]).block(0, 1, n, (X_d[i]).cols() - 1);
        col += (X_d[i]).cols() - 1;
    }
    if (qE_float.cols() > 0) {
        _X.block(0, col, n, qE_float.cols()) = qE_float;
        col += qE_float.cols();
    }
    for (i = 0; i < E_float.size(); i++) {
        _X.block(0, col, n, (E_float[i]).cols() - 1) = (E_float[i]).block(0, 1, n, (E_float[i]).cols() - 1);
        col += (E_float[i]).cols() - 1;
    }
}
510
coeff_mat(const vector<string> & vec,eigenMatrix & coeff_mat,string errmsg1,string errmsg2)511 void gcta::coeff_mat(const vector<string> &vec, eigenMatrix &coeff_mat, string errmsg1, string errmsg2) {
512 vector<string> value(vec);
513 stable_sort(value.begin(), value.end());
514 value.erase(unique(value.begin(), value.end()), value.end());
515 if (value.size() > 0.5 * vec.size()) throw (errmsg1); // throw("Error: too many classes for the envronmental factor. \nPlease make sure you input a discrete variable as the environmental factor.");
516 if (value.size() == 1) throw (errmsg2); //throw("Error: the envronmental factor should has more than one classes.");
517
518 int i = 0, j = 0, row_num = vec.size(), column_num = value.size();
519 map<string, int> val_map;
520 for (i = 0; i < value.size(); i++) val_map.insert(pair<string, int>(value[i], i));
521
522 coeff_mat.resize(row_num, column_num);
523 coeff_mat.setZero(row_num, column_num);
524 map<string, int>::iterator iter;
525 for (i = 0; i < row_num; i++) {
526 iter = val_map.find(vec[i]);
527 coeff_mat(i, iter->second) = 1.0;
528 }
529 }
530
check_case_control(double & ncase,eigenVector & y)531 bool gcta::check_case_control(double &ncase, eigenVector &y) {
532 int n = y.size();
533 double case_num = 0.0;
534 vector<double> value(n);
535 for (int i = 0; i < n; i++) value[i] = y(i);
536 stable_sort(value.begin(), value.end());
537 value.erase(unique(value.begin(), value.end()), value.end());
538 if (value.size() == 2) {
539 if (CommFunc::FloatEqual(value[0], 0.0) && CommFunc::FloatEqual(value[1], 1.0)) case_num = y.sum();
540 else if (CommFunc::FloatEqual(value[0], 1.0) && CommFunc::FloatEqual(value[1], 2.0)) case_num = (y.sum() - n);
541 if (!_bivar_reml) cout << "Assuming a disease phenotype for a case-control study: ";
542 cout << (int) case_num << " cases and " << (int) (n - case_num) << " controls ";
543 ncase = case_num / (double) n;
544 return true;
545 } else if (value.size() < 2) throw ("Error: invalid phenotype. Please check the phenotype file.");
546 return false;
547 }
548
transform_hsq_L(double P,double K,double hsq)549 double gcta::transform_hsq_L(double P, double K, double hsq) {
550 double t = StatFunc::qnorm(1.0 - K);
551 double z = StatFunc::dnorm(t);
552 double C = (K * (1 - K) / (z * z))*(K * (1 - K) / (P * (1 - P)));
553 return (hsq * C);
554 }
555
constrain_varcmp(eigenVector & varcmp)556 int gcta::constrain_varcmp(eigenVector &varcmp) {
557 int pos = 0;
558 double delta = 0.0, constr_scale = 1e-6;
559 int i = 0, num = 0;
560 vector<int> constrain(_r_indx.size());
561
562 if (_bivar_reml) {
563 for (i = 0, num = 0; i < _bivar_pos[0].size(); i++) {
564 pos = _bivar_pos[0][i];
565 if (varcmp[pos] < 0) {
566 delta += _y_Ssq * constr_scale - varcmp[pos];
567 varcmp[pos] = _y_Ssq * constr_scale;
568 constrain[i] = 1;
569 num++;
570 }
571 }
572 delta /= (_bivar_pos[0].size() - num);
573 for (i = 0; i < _bivar_pos[0].size(); i++) {
574 pos = _bivar_pos[0][i];
575 if (constrain[pos] < 1 && varcmp[pos] > delta) varcmp[pos] -= delta;
576 }
577
578 for (i = 0, num = 0; i < _bivar_pos[1].size(); i++) {
579 pos = _bivar_pos[1][i];
580 if (varcmp[pos] < 0) {
581 delta += _y_Ssq * constr_scale - varcmp[pos];
582 varcmp[pos] = _y_Ssq * constr_scale;
583 constrain[i] = 1;
584 num++;
585 }
586 }
587 delta /= (_bivar_pos[1].size() - num);
588 for (i = 0; i < _bivar_pos[1].size(); i++) {
589 pos = _bivar_pos[1][i];
590 if (constrain[pos] < 1 && varcmp[pos] > delta) varcmp[pos] -= delta;
591 }
592
593 for (i = 0, num = 0; i < constrain.size(); i++) {
594 if (constrain[i] == 1) num++;
595
596 }
597 return num;
598 }
599
600 for (i = 0; i < _r_indx.size(); i++) {
601 if (varcmp[i] < 0) {
602 delta += _y_Ssq * constr_scale - varcmp[i];
603 varcmp[i] = _y_Ssq * constr_scale;
604 constrain[i] = 1;
605 num++;
606 }
607 }
608 delta /= (_r_indx.size() - num);
609 for (i = 0; i < _r_indx.size(); i++) {
610 if (constrain[i] < 1 && varcmp[i] > delta) varcmp[i] -= delta;
611 }
612
613 return num;
614 }
615
reml(bool pred_rand_eff,bool est_fix_eff,vector<double> & reml_priors,vector<double> & reml_priors_var,double prevalence,double prevalence2,bool no_constrain,bool no_lrt,bool mlmassoc)616 void gcta::reml(bool pred_rand_eff, bool est_fix_eff, vector<double> &reml_priors, vector<double> &reml_priors_var, double prevalence, double prevalence2, bool no_constrain, bool no_lrt, bool mlmassoc)
617 {
618 int i = 0, j = 0, k = 0;
619
620 // Initialize variance component
621 // 0: AI; 1: Fisher-scoring; 2: EM
622 stringstream errmsg;
623 double d_buf = 0.0;
624 eigenVector y_tmp = _y.array() - _y.mean();
625 if (!_bivar_reml) {
626 _y_Ssq = y_tmp.squaredNorm() / (_n - 1.0);
627 if (!(fabs(_y_Ssq) < 1e30)) throw ("Error: the phenotypic variance is infinite. Please check the missing data in your phenotype file. Missing values should be represented by \"NA\" or \"-9\".");
628 }
629 bool reml_priors_flag = !reml_priors.empty(), reml_priors_var_flag = !reml_priors_var.empty();
630 if (reml_priors_flag && reml_priors.size() < _r_indx.size() - 1) {
631 errmsg << "Error: in option --reml-priors. There are " << _r_indx.size() << " variance components. At least " << _r_indx.size() - 1 << " prior values should be specified.";
632 throw (errmsg.str());
633 }
634 if (reml_priors_var_flag && reml_priors_var.size() < _r_indx.size() - 1) {
635 errmsg << "Error: in option --reml-priors-var. There are " << _r_indx.size() << " variance components. At least " << _r_indx.size() - 1 << " prior values should be specified.";
636 throw (errmsg.str());
637 }
638
639 cout << "\nPerforming " << (_bivar_reml ? "bivariate" : "") << " REML analysis ... (Note: may take hours depending on sample size)." << endl;
640 if (_n < 10) throw ("Error: sample size is too small.");
641 cout << _n << " observations, " << _X_c << " fixed effect(s), and " << _r_indx.size() << " variance component(s)(including residual variance)." << endl;
642 eigenMatrix Vi_X(_n, _X_c), Xt_Vi_X_i(_X_c, _X_c), Hi(_r_indx.size(), _r_indx.size());
643 eigenVector Py(_n), varcmp;
644 init_varcomp(reml_priors_var, reml_priors, varcmp);
645 double lgL = reml_iteration(Vi_X, Xt_Vi_X_i, Hi, Py, varcmp, reml_priors_var_flag | reml_priors_flag, no_constrain);
646 eigenMatrix u;
647 if (pred_rand_eff) {
648 u.resize(_n, _r_indx.size());
649 for (i = 0; i < _r_indx.size(); i++) {
650 if (_bivar_reml || _within_family)(u.col(i)) = (((_Asp[_r_indx[i]]) * Py) * varcmp[i]);
651 else (u.col(i)) = (((_A[_r_indx[i]]) * Py) * varcmp[i]);
652 }
653 }
654 if (est_fix_eff) _b = Xt_Vi_X_i * (Vi_X.transpose() * _y);
655
656 // calculate Hsq and SE
657 double Vp = 0.0, Vp2 = 0.0, VarVp = 0.0, VarVp2 = 0.0, Vp_f = 0.0, VarVp_f = 0.0;
658 vector<double> Hsq(_r_indx.size() - 1), VarHsq(_r_indx.size() - 1);
659 calcu_Vp(Vp, Vp2, VarVp, VarVp2, varcmp, Hi);
660 for (i = 0; i < Hsq.size(); i++) calcu_hsq(i, Vp, Vp2, VarVp, VarVp2, Hsq[i], VarHsq[i], varcmp, Hi);
661
662 // calculate the logL for a reduce model
663 double lgL_rdu_mdl = 0.0, LRT = 0.0;
664 if (!no_lrt) {
665 lgL_rdu_mdl = lgL_reduce_mdl(no_constrain);
666 LRT = 2.0 * (lgL - lgL_rdu_mdl);
667 if (LRT < 0.0) LRT = 0.0;
668 }
669
670 // calcuate the logL given a rG in a bivariate analysis
671 double lgL_fixed_rg = 0.0;
672 if (_bivar_reml && !_fixed_rg_val.empty()) {
673 lgL_fixed_rg = lgL_fix_rg(varcmp, no_constrain);
674 LRT = 2.0 * (lgL - lgL_fixed_rg);
675 if (LRT < 0.0) LRT = 0.0;
676 }
677
678 if (mlmassoc) {
679 eigenVector2Vector(varcmp, _varcmp);
680 return;
681 }
682
683 // output results
684 double sum_hsq = 0.0, var_sum_hsq = 0.0;
685 if (!_bivar_reml && _r_indx.size() > 2) calcu_sum_hsq(Vp, VarVp, sum_hsq, var_sum_hsq, varcmp, Hi);
686 cout << "\nSummary result of REML analysis:" << endl;
687 cout << "Source\tVariance\tSE" << setiosflags(ios::fixed) << setprecision(6) << endl;
688 for (i = 0; i < _r_indx.size(); i++) cout << _var_name[i] << "\t" << varcmp[i] << "\t" << sqrt(Hi(i, i)) << endl;
689 if (_bivar_reml) {
690 cout << "Vp_tr1\t" << Vp << "\t" << sqrt(VarVp) << endl;
691 cout << "Vp_tr2\t" << Vp2 << "\t" << sqrt(VarVp2) << endl;
692 for (i = 0, j = 0; i < _bivar_pos[0].size() - 1; i++, j += 2) {
693 cout << _hsq_name[j] << "\t" << Hsq[_bivar_pos[0][i]] << "\t" << sqrt(VarHsq[_bivar_pos[0][i]]) << endl;
694 cout << _hsq_name[j + 1] << "\t" << Hsq[_bivar_pos[1][i]] << "\t" << sqrt(VarHsq[_bivar_pos[1][i]]) << endl;
695 }
696 } else {
697 cout << "Vp\t" << Vp << "\t" << sqrt(VarVp) << endl;
698 for (i = 0; i < Hsq.size(); i++) cout << _hsq_name[i] << "\t" << Hsq[i] << "\t" << sqrt(VarHsq[i]) << endl;
699 if (_r_indx.size() > 2) cout << "\nSum of V(G)/Vp\t" << sum_hsq << "\t" << sqrt(var_sum_hsq) << endl;
700 }
701 if ((_flag_CC && prevalence>-1) || (_flag_CC2 && prevalence2>-1)) {
702 cout << "The estimate of variance explained on the observed scale is transformed to that on the underlying scale:" << endl;
703 if (_bivar_reml) {
704 if (_flag_CC) cout << "Proportion of cases in the sample = " << _ncase << " for trait #1; User-specified disease prevalence = " << prevalence << " for trait #1" << endl;
705 if (_flag_CC2) cout << "Proportion of cases in the sample = " << _ncase2 << " for trait #2; User-specified disease prevalence = " << prevalence2 << " for trait #2" << endl;
706 for (i = 0, j = 0; i < _bivar_pos[0].size() - 1; i++, j += 2) {
707 if (_flag_CC) cout << _hsq_name[j] << "_L\t" << transform_hsq_L(_ncase, prevalence, Hsq[_bivar_pos[0][i]]) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(VarHsq[_bivar_pos[0][i]])) << endl;
708 if (_flag_CC2) cout << _hsq_name[j + 1] << "_L\t" << transform_hsq_L(_ncase2, prevalence2, Hsq[_bivar_pos[1][i]]) << "\t" << transform_hsq_L(_ncase2, prevalence2, sqrt(VarHsq[_bivar_pos[1][i]])) << endl;
709 }
710 } else {
711 cout << "(Proportion of cases in the sample = " << _ncase << "; User-specified disease prevalence = " << prevalence << ")" << endl;
712 for (i = 0; i < Hsq.size(); i++) cout << _hsq_name[i] << "_L\t" << transform_hsq_L(_ncase, prevalence, Hsq[i]) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(VarHsq[i])) << endl;
713 if (_r_indx.size() > 2) cout << "\nSum of V(G)_L/Vp\t" << transform_hsq_L(_ncase, prevalence, sum_hsq) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(var_sum_hsq)) << endl;
714 }
715 }
716 // output genetic correlation
717 eigenVector rg, rg_var;
718 vector<string> rg_name;
719 if (_bivar_reml) {
720 calcu_rg(varcmp, Hi, rg, rg_var, rg_name);
721 for (i = 0; i < rg_name.size(); i++) {
722 cout << rg_name[i] << "\t" << rg[i] << "\t" << sqrt(rg_var[i]) << endl;
723 }
724 }
725 if(!_reml_force_converge || !_reml_AI_not_invertible){
726 cout << "\nSampling variance/covariance of the estimates of variance components:" << endl;
727 for (i = 0; i < _r_indx.size(); i++) {
728 for (j = 0; j < _r_indx.size(); j++) cout << setiosflags(ios::scientific) << Hi(i, j) << "\t";
729 cout << endl;
730 }
731 }
732 if (est_fix_eff) {
733 cout << "Estimate" << (_X_c > 1 ? "s" : "") << "of fixed effect" << (_X_c > 1 ? "s" : "") << ":" << endl;
734 cout << "\nSource\tEstimate\tSE" << endl;
735 for (i = 0; i < _X_c; i++) {
736 if (i == 0) cout << "mean\t";
737 else cout << "X_" << i + 1 << "\t";
738 cout << setiosflags(ios::fixed) << _b[i] << "\t" << sqrt(Xt_Vi_X_i(i, i)) << endl;
739 }
740 }
741
742 // save summary result into a file
743 string reml_rst_file = _out + ".hsq";
744 ofstream o_reml(reml_rst_file.c_str());
745 if (!o_reml) throw ("Error: can not open the file [" + reml_rst_file + "] to write.");
746 o_reml << "Source\tVariance\tSE" << setiosflags(ios::fixed) << setprecision(6) << endl;
747 for (i = 0; i < _r_indx.size(); i++) o_reml << _var_name[i] << "\t" << varcmp[i] << "\t" << sqrt(Hi(i, i)) << endl;
748 if (_bivar_reml) {
749 o_reml << "Vp_tr1\t" << Vp << "\t" << sqrt(VarVp) << endl;
750 o_reml << "Vp_tr2\t" << Vp2 << "\t" << sqrt(VarVp2) << endl;
751 for (i = 0, j = 0; i < _bivar_pos[0].size() - 1; i++, j += 2) {
752 o_reml << _hsq_name[j] << "\t" << Hsq[_bivar_pos[0][i]] << "\t" << sqrt(VarHsq[_bivar_pos[0][i]]) << endl;
753 o_reml << _hsq_name[j + 1] << "\t" << Hsq[_bivar_pos[1][i]] << "\t" << sqrt(VarHsq[_bivar_pos[1][i]]) << endl;
754 }
755 } else {
756 o_reml << "Vp\t" << Vp << "\t" << sqrt(VarVp) << endl;
757 for (i = 0; i < Hsq.size(); i++) o_reml << _hsq_name[i] << "\t" << Hsq[i] << "\t" << sqrt(VarHsq[i]) << endl;
758 if (_r_indx.size() > 2) o_reml << "\nSum of V(G)/Vp\t" << sum_hsq << "\t" << sqrt(var_sum_hsq) << endl;
759 }
760 if (_flag_CC && prevalence>-1) {
761 o_reml << "The estimate of variance explained on the observed scale is transformed to that on the underlying scale:" << endl;
762 if (_bivar_reml) {
763 o_reml << "(Proportion of cases in the sample = " << _ncase << "; User-specified disease prevalence = " << prevalence << " for disease 1 and = " << prevalence2 << " for disease 2)" << endl;
764 for (i = 0, j = 0; i < _bivar_pos[0].size() - 1; i++, j += 2) {
765 o_reml << _hsq_name[j] << "_L\t" << transform_hsq_L(_ncase, prevalence, Hsq[_bivar_pos[0][i]]) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(VarHsq[_bivar_pos[0][i]])) << endl;
766 o_reml << _hsq_name[j + 1] << "_L\t" << transform_hsq_L(_ncase2, prevalence2, Hsq[_bivar_pos[1][i]]) << "\t" << transform_hsq_L(_ncase2, prevalence2, sqrt(VarHsq[_bivar_pos[1][i]])) << endl;
767 }
768 } else {
769 o_reml << "(Proportion of cases in the sample = " << _ncase << "; User-specified disease prevalence = " << prevalence << ")" << endl;
770 for (i = 0; i < Hsq.size(); i++) o_reml << _hsq_name[i] << "_L\t" << transform_hsq_L(_ncase, prevalence, Hsq[i]) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(VarHsq[i])) << endl;
771 if (_r_indx.size() > 2) o_reml << "\nSum of V(G)_L/Vp\t" << transform_hsq_L(_ncase, prevalence, sum_hsq) << "\t" << transform_hsq_L(_ncase, prevalence, sqrt(var_sum_hsq)) << endl;
772 }
773 }
774 if (_bivar_reml) {
775 for (i = 0; i < rg_name.size(); i++) o_reml << rg_name[i] << "\t" << rg[i] << "\t" << sqrt(rg_var[i]) << endl;
776 }
777 o_reml << "logL\t" << setprecision(3) << lgL << endl;
778 if (!no_lrt && _r_indx.size() - 1 > 0) {
779 o_reml << "logL0\t" << setprecision(3) << lgL_rdu_mdl << endl;
780 o_reml << "LRT\t" << setprecision(3) << LRT << endl;
781 o_reml << "df\t" << setprecision(1) << _r_indx.size() - _r_indx_drop.size() << endl;
782 o_reml << "Pval\t" << setprecision(4) << setiosflags(ios::scientific) << 0.5 * StatFunc::chi_prob(_r_indx.size() - _r_indx_drop.size(), LRT) << setiosflags(ios::fixed) << endl;
783 }
784 if (_bivar_reml && !_fixed_rg_val.empty()) {
785 o_reml << "logL0\t" << setprecision(3) << lgL_fixed_rg << " (when rG fixed at ";
786 for (i = 0; i < _fixed_rg_val.size() - 1; i++) o_reml << _fixed_rg_val[i] << "\t";
787 o_reml << _fixed_rg_val[_fixed_rg_val.size() - 1] << ")" << endl;
788 o_reml << "LRT\t" << setprecision(3) << LRT << endl;
789 o_reml << "df\t" << setprecision(1) << _fixed_rg_val.size() << endl;
790 o_reml << "Pval\t" << setprecision(4) << setiosflags(ios::scientific) << 0.5 * StatFunc::chi_prob(_fixed_rg_val.size(), LRT) << setiosflags(ios::fixed) << " (one-tailed test)" << endl;
791 }
792 o_reml << "n\t" << _n << endl;
793 if (est_fix_eff) {
794 o_reml << "\nFix_eff\tSE" << endl;
795 for (i = 0; i < _X_c; i++) o_reml << setprecision(6) << _b[i] << "\t" << sqrt(Xt_Vi_X_i(i, i)) << endl;
796 o_reml.close();
797 }
798 cout << "\nSummary result of REML analysis has been saved in the file [" + reml_rst_file + "]." << endl;
799
800 // save random effect to a file
801 if (pred_rand_eff) {
802 string rand_eff_file = _out + ".indi.blp";
803 ofstream o_rand_eff(rand_eff_file.c_str());
804 for (i = 0; i < _keep.size(); i++) {
805 o_rand_eff << _fid[_keep[i]] << "\t" << _pid[_keep[i]] << "\t";
806 for (j = 0; j < _r_indx.size(); j++) o_rand_eff << setprecision(6) << Py[i] * varcmp[j] << "\t" << u(i, j) << "\t";
807 o_rand_eff << endl;
808 }
809 cout << "\nBLUP of the genetic effects for " << _keep.size() << " individuals has been saved in the file [" + rand_eff_file + "]." << endl;
810 }
811 }
812
// Initialise the variance-component vector (varcmp) before REML iteration.
// Priority: user-supplied absolute values (reml_priors_var) > user-supplied
// proportions of phenotypic variance (reml_priors) > equal split of the
// observed phenotypic variance (_y_Ssq; _y2_Ssq for trait 2 in bivariate mode).
void gcta::init_varcomp(vector<double> &reml_priors_var, vector<double> &reml_priors, eigenVector &varcmp) {
    int i = 0, pos = 0;
    double d_buf = 0.0;

    varcmp = eigenVector::Zero(_r_indx.size());
    if (_bivar_reml) {
        if (!reml_priors_var.empty()) {
            // absolute starting values supplied for all components
            for (i = 0; i < _r_indx.size(); i++) varcmp[i] = reml_priors_var[i];
        }
        else if (!reml_priors.empty()) {
            // proportions supplied: scale by the phenotypic variance; the last
            // component of each trait gets the remainder (the residual variance)
            for (i = 0, d_buf = 0; i < _bivar_pos[0].size() - 1; i++) {
                pos = _bivar_pos[0][i];
                varcmp[pos] = reml_priors[pos] * _y_Ssq;
                d_buf += reml_priors[pos];
            }
            if (d_buf > 1.0) throw ("\nError: --reml-priors. The sum of all prior values for trait 1 should not exceed 1.0.");
            varcmp[_bivar_pos[0][_bivar_pos[0].size() - 1]] = (1.0 - d_buf) * _y_Ssq;
            for (i = 0, d_buf = 0; i < _bivar_pos[1].size() - 1; i++) {
                pos = _bivar_pos[1][i];
                // NOTE(review): trait-2 components are scaled by _y_Ssq here, yet the
                // trait-2 residual below uses _y2_Ssq — looks inconsistent; confirm intended.
                varcmp[pos] = reml_priors[pos] * _y_Ssq;
                d_buf += reml_priors[pos];
            }
            if (d_buf > 1.0) throw ("\nError: --reml-priors. The sum of all prior values for trait 2 should not exceed 1.0.");
            varcmp[_bivar_pos[1][_bivar_pos[1].size() - 1]] = (1.0 - d_buf) * _y2_Ssq;
            // covariance components scaled by the geometric mean of the two phenotypic variances
            for (i = 0; i < _bivar_pos[2].size(); i++) varcmp[_bivar_pos[2][i]] = reml_priors[_bivar_pos[2][i]] * sqrt(_y_Ssq * _y2_Ssq);
        }
        else {
            // no priors: split each trait's phenotypic variance equally across its components
            for (i = 0; i < _bivar_pos[0].size(); i++) varcmp[_bivar_pos[0][i]] = _y_Ssq / _bivar_pos[0].size();
            for (i = 0; i < _bivar_pos[1].size(); i++) varcmp[_bivar_pos[1][i]] = _y2_Ssq / _bivar_pos[1].size();
            // NOTE(review): indexes _bivar_pos[0][i] and _bivar_pos[1][i] while looping over
            // _bivar_pos[2] — out of bounds if the lists differ in length; confirm sizes match.
            for (i = 0; i < _bivar_pos[2].size(); i++) varcmp[_bivar_pos[2][i]] = 0.5 * sqrt(varcmp[_bivar_pos[0][i]] * varcmp[_bivar_pos[1][i]]);
        }

        return;
    }

    // univariate case
    if (!reml_priors_var.empty()) {
        // absolute values; if the residual was not supplied, it defaults to the remainder
        for (i = 0; i < _r_indx.size() - 1; i++) varcmp[i] = reml_priors_var[i];
        if (reml_priors_var.size() < _r_indx.size()) varcmp[_r_indx.size() - 1] = _y_Ssq - varcmp.sum();
        else varcmp[_r_indx.size() - 1] = reml_priors_var[_r_indx.size() - 1];
    }
    else if (!reml_priors.empty()) {
        // proportions of _y_Ssq; the residual takes the remaining share
        for (i = 0, d_buf = 0; i < _r_indx.size() - 1; i++) {
            varcmp[i] = reml_priors[i] * _y_Ssq;
            d_buf += reml_priors[i];
        }
        if (d_buf > 1.0) throw ("\nError: --reml-priors. The sum of all prior values should not exceed 1.0.");
        varcmp[_r_indx.size() - 1] = (1.0 - d_buf) * _y_Ssq;
    }
    else varcmp.setConstant(_y_Ssq / (_r_indx.size()));
}
863
lgL_reduce_mdl(bool no_constrain)864 double gcta::lgL_reduce_mdl(bool no_constrain) {
865 if (_r_indx.size() - 1 == 0) return 0;
866 bool multi_comp = (_r_indx.size() - _r_indx_drop.size() > 1);
867 cout << "\nCalculating the logLikelihood for the reduced model ...\n(variance component" << (multi_comp ? "s " : " ");
868 for (int i = 0; i < _r_indx.size() - 1; i++) {
869 if (find(_r_indx_drop.begin(), _r_indx_drop.end(), _r_indx[i]) == _r_indx_drop.end()) cout << _r_indx[i] + 1 << " ";
870 }
871 cout << (multi_comp ? "are" : "is") << " dropped from the model)" << endl;
872 vector<int> vi_buf(_r_indx);
873 _r_indx = _r_indx_drop;
874 eigenMatrix Vi_X(_n, _X_c), Xt_Vi_X_i(_X_c, _X_c), Hi(_r_indx.size(), _r_indx.size());
875 eigenVector Py(_n);
876 eigenVector varcmp;
877 vector<double> reml_priors_var, reml_priors;
878 init_varcomp(reml_priors_var, reml_priors, varcmp);
879 double lgL = reml_iteration(Vi_X, Xt_Vi_X_i, Hi, Py, varcmp, false, no_constrain);
880 _r_indx = vi_buf;
881 return lgL;
882 }
883
reml_iteration(eigenMatrix & Vi_X,eigenMatrix & Xt_Vi_X_i,eigenMatrix & Hi,eigenVector & Py,eigenVector & varcmp,bool prior_var_flag,bool no_constrain,bool reml_bivar_fix_rg)884 double gcta::reml_iteration(eigenMatrix &Vi_X, eigenMatrix &Xt_Vi_X_i, eigenMatrix &Hi, eigenVector &Py, eigenVector &varcmp, bool prior_var_flag, bool no_constrain, bool reml_bivar_fix_rg)
885 {
886 /*if(reml_bivar_fix_rg){
887 if(no_constrain){
888 no_constrain=false;
889 cout<<"Warning: --reml-no-constrain disabled. The genetic correlation is fixed so that all the variance components are constrained to be positive."<<endl;
890 }
891 }*/
892
893 char *mtd_str[3] = {"AI-REML algorithm", "Fisher-scoring ...", "EM-REML algorithm ..."};
894 int i = 0, constrain_num = 0, iter = 0, reml_mtd_tmp = _reml_mtd;
895 double logdet = 0.0, logdet_Xt_Vi_X = 0.0, prev_lgL = -1e20, lgL = -1e20, dlogL = 1000.0;
896 eigenVector prev_prev_varcmp(varcmp), prev_varcmp(varcmp), varcomp_init(varcmp);
897 bool converged_flag = false;
898 for (iter = 0; iter < _reml_max_iter; iter++) {
899 if (reml_bivar_fix_rg) update_A(prev_varcmp);
900 if (iter == 0) {
901 prev_varcmp = varcomp_init;
902 if (prior_var_flag){
903 if(_reml_fixed_var) cout << "Variance components are fixed at: " << varcmp.transpose() << endl;
904 else cout << "Prior values of variance components: " << varcmp.transpose() << endl;
905 }
906 else {
907 _reml_mtd = 2;
908 cout << "Calculating prior values of variance components by EM-REML ..." << endl;
909 }
910 }
911 if (iter == 1) {
912 _reml_mtd = reml_mtd_tmp;
913 cout << "Running " << mtd_str[_reml_mtd] << " ..." << "\nIter.\tlogL\t";
914 for (i = 0; i < _r_indx.size(); i++) cout << _var_name[_r_indx[i]] << "\t";
915 cout << endl;
916 }
917 if (_bivar_reml) calcu_Vi_bivar(_Vi, prev_varcmp, logdet, iter); // Calculate Vi, bivariate analysis
918 else if (_within_family) calcu_Vi_within_family(_Vi, prev_varcmp, logdet, iter); // within-family REML
919 else {
920 if (!calcu_Vi(_Vi, prev_varcmp, logdet, iter)){ // Calculate Vi
921 cout<<"Warning: V matrix is not positive-definite.\n";
922 varcmp = prev_prev_varcmp;
923 if(!calcu_Vi(_Vi, varcmp, logdet, iter)) throw("Error: V matrix is not positive-definite.");
924 calcu_Hi(_P, Hi);
925 Hi = 2 * Hi;
926 break;
927 }
928 }
929 logdet_Xt_Vi_X = calcu_P(_Vi, Vi_X, Xt_Vi_X_i, _P); // Calculate P
930 if (_reml_mtd == 0) ai_reml(_P, Hi, Py, prev_varcmp, varcmp, dlogL);
931 else if (_reml_mtd == 1) reml_equation(_P, Hi, Py, varcmp);
932 else if (_reml_mtd == 2) em_reml(_P, Py, prev_varcmp, varcmp);
933 lgL = -0.5 * (logdet_Xt_Vi_X + logdet + (_y.transpose() * Py)(0, 0));
934
935 if(_reml_force_converge && _reml_AI_not_invertible) break;
936 /*{
937 if(_reml_mtd != 1){
938 cout<<"Warning: the information matrix is not invertible. Trying to fix the problem using the Fisher-scoring approach."<<endl;
939 _reml_mtd = 1;
940 _reml_AI_not_invertible = false;
941 iter--;
942 continue;
943 }
944 else {
945 cout<<"Warning: the information matrix is not invertible using the Fisher-scoring approach."<<endl;
946 break;
947 }
948 }*/
949
950 // output log
951 if (!no_constrain) constrain_num = constrain_varcmp(varcmp);
952 if (_bivar_reml && !_bivar_no_constrain) constrain_rg(varcmp);
953 if (iter > 0) {
954 cout << iter << "\t" << setiosflags(ios::fixed) << setprecision(2) << lgL << "\t";
955 for (i = 0; i < _r_indx.size(); i++) cout << setprecision(5) << varcmp[i] << "\t";
956 if (constrain_num > 0) cout << "(" << constrain_num << " component(s) constrained)" << endl;
957 else cout << endl;
958 } else {
959 if (!prior_var_flag) cout << "Updated prior values: " << varcmp.transpose() << endl;
960 cout << "logL: " << lgL << endl;
961 //if(_reml_max_iter==1) cout<<"logL: "<<lgL<<endl;
962 }
963 if(_reml_fixed_var){
964 varcmp = prev_varcmp;
965 break;
966 }
967 if (constrain_num * 2 > _r_indx.size()) throw ("Error: analysis stopped because more than half of the variance components are constrained. The result would be unreliable.\n Please have a try to add the option --reml-no-constrain.");
968 // added by Jian Yang on 22 Oct 2014
969 //if (constrain_num == _r_indx.size()) throw ("Error: analysis stopped because all variance components are constrained. You may have a try of adding the option --reml-no-constrain.");
970
971 if((_reml_force_converge || _reml_no_converge) && prev_lgL > lgL){
972 varcmp = prev_varcmp;
973 calcu_Hi(_P, Hi);
974 Hi = 2 * Hi;
975 break;
976 }
977
978 // convergence
979 dlogL = lgL - prev_lgL;
980 if ((varcmp - prev_varcmp).squaredNorm() / varcmp.squaredNorm() < 1e-8 && (fabs(dlogL) < 1e-4 || (fabs(dlogL) < 1e-2 && dlogL < 0))) {
981 converged_flag = true;
982 if (_reml_mtd == 2) {
983 calcu_Hi(_P, Hi);
984 Hi = 2 * Hi;
985 } // for calculation of SE
986 break;
987 }
988 prev_prev_varcmp = prev_varcmp;
989 prev_varcmp = varcmp;
990 prev_lgL = lgL;
991 }
992
993 if(_reml_fixed_var) cout << "Warning: the model is evaluated at fixed variance components. The likelihood might not be maximised." <<endl;
994 else {
995 if(converged_flag) cout << "Log-likelihood ratio converged." << endl;
996 else {
997 if(_reml_force_converge || _reml_no_converge) cout << "Warning: Log-likelihood not converged. Results are not reliable." <<endl;
998 else if(iter == _reml_max_iter){
999 stringstream errmsg;
1000 errmsg << "Error: Log-likelihood not converged (stop after " << _reml_max_iter << " iteractions). \nYou can specify the option --reml-maxit to allow for more iterations." << endl;
1001 if (_reml_max_iter > 1) throw (errmsg.str());
1002 }
1003 }
1004 }
1005 return lgL;
1006 }
1007
calcu_Vp(double & Vp,double & Vp2,double & VarVp,double & VarVp2,eigenVector & varcmp,eigenMatrix & Hi)1008 void gcta::calcu_Vp(double &Vp, double &Vp2, double &VarVp, double &VarVp2, eigenVector &varcmp, eigenMatrix &Hi) {
1009 int i = 0, j = 0;
1010 Vp = 0.0;
1011 VarVp = 0.0;
1012 Vp2 = 0.0;
1013 VarVp2 = 0.0;
1014 if (_bivar_reml) {
1015 for (i = 0; i < _bivar_pos[0].size(); i++) {
1016 Vp += varcmp[_bivar_pos[0][i]];
1017 for (j = 0; j < _bivar_pos[0].size(); j++) VarVp += Hi(_bivar_pos[0][i], _bivar_pos[0][j]);
1018 }
1019 for (i = 0; i < _bivar_pos[1].size(); i++) {
1020 Vp2 += varcmp[_bivar_pos[1][i]];
1021 for (j = 0; j < _bivar_pos[1].size(); j++) VarVp2 += Hi(_bivar_pos[1][i], _bivar_pos[1][j]);
1022 }
1023 return;
1024 }
1025 for (i = 0; i < _r_indx.size(); i++) {
1026 Vp += varcmp[i];
1027 for (j = 0; j < _r_indx.size(); j++) VarVp += Hi(i, j);
1028 }
1029 }
1030
calcu_hsq(int i,double Vp,double Vp2,double VarVp,double VarVp2,double & hsq,double & var_hsq,eigenVector & varcmp,eigenMatrix & Hi)1031 void gcta::calcu_hsq(int i, double Vp, double Vp2, double VarVp, double VarVp2, double &hsq, double &var_hsq, eigenVector &varcmp, eigenMatrix &Hi) {
1032 int j = 0;
1033 double V1 = varcmp[i], VarV1 = Hi(i, i), Cov12 = 0.0;
1034
1035 if (_bivar_reml) {
1036 vector<int>::iterator iter;
1037 iter = find(_bivar_pos[0].begin(), _bivar_pos[0].end(), i);
1038 if (iter != _bivar_pos[0].end()) {
1039 for (j = 0; j < _bivar_pos[0].size(); j++) {
1040 Cov12 += Hi(*iter, _bivar_pos[0][j]);
1041 }
1042 hsq = V1 / Vp;
1043 var_hsq = (V1 / Vp)*(V1 / Vp)*(VarV1 / (V1 * V1) + VarVp / (Vp * Vp)-(2 * Cov12) / (V1 * Vp));
1044 return;
1045 }
1046 iter = find(_bivar_pos[1].begin(), _bivar_pos[1].end(), i);
1047 if (iter != _bivar_pos[1].end()) {
1048 for (j = 0; j < _bivar_pos[1].size(); j++) {
1049 Cov12 += Hi(*iter, _bivar_pos[1][j]);
1050 }
1051 hsq = V1 / Vp2;
1052 var_hsq = (V1 / Vp2)*(V1 / Vp2)*(VarV1 / (V1 * V1) + VarVp2 / (Vp2 * Vp2)-(2 * Cov12) / (V1 * Vp2));
1053 return;
1054 }
1055 hsq = var_hsq = -2;
1056 return;
1057 }
1058
1059 for (j = 0; j < _r_indx.size(); j++) {
1060 Cov12 += Hi(i, j);
1061 }
1062 hsq = V1 / Vp;
1063 var_hsq = (V1 / Vp)*(V1 / Vp)*(VarV1 / (V1 * V1) + VarVp / (Vp * Vp)-(2 * Cov12) / (V1 * Vp));
1064 }
1065
calcu_sum_hsq(double Vp,double VarVp,double & sum_hsq,double & var_sum_hsq,eigenVector & varcmp,eigenMatrix & Hi)1066 void gcta::calcu_sum_hsq(double Vp, double VarVp, double &sum_hsq, double &var_sum_hsq, eigenVector &varcmp, eigenMatrix &Hi) {
1067 int i = 0, j = 0;
1068 double V1 = 0.0, VarV1 = 0.0, Cov12 = 0.0;
1069 for(i = 0; i < _r_indx.size()-1; i++) {
1070 V1 += varcmp[i];
1071 for(j = 0; j < _r_indx.size()-1; j++) VarV1 += Hi(i, j);
1072 for(j = 0; j < _r_indx.size(); j++) Cov12 += Hi(i, j);
1073 }
1074 sum_hsq = V1/Vp;
1075 var_sum_hsq = (V1/Vp)*(V1/Vp)*(VarV1/(V1*V1)+VarVp/(Vp*Vp)-(2*Cov12)/(V1*Vp));
1076 }
1077
calcu_Vi(eigenMatrix & Vi,eigenVector & prev_varcmp,double & logdet,int & iter)1078 bool gcta::calcu_Vi(eigenMatrix &Vi, eigenVector &prev_varcmp, double &logdet, int &iter)
1079 {
1080 int i = 0, j = 0, k = 0;
1081 string errmsg = "\nError: the V (variance-covariance) matrix is not invertible.";
1082
1083 Vi = eigenMatrix::Zero(_n, _n);
1084 if (_r_indx.size() == 1) {
1085 Vi.diagonal() = eigenVector::Constant(_n, 1.0 / prev_varcmp[0]);
1086 logdet = _n * log(prev_varcmp[0]);
1087 }
1088 else {
1089 for (i = 0; i < _r_indx.size(); i++) Vi += (_A[_r_indx[i]]) * prev_varcmp[i];
1090
1091 if (_V_inv_mtd == 0) {
1092 if (!comput_inverse_logdet_LDLT_mkl(Vi, logdet)) {
1093 if(_reml_force_inv) {
1094 cout<<"Warning: the variance-covaraince matrix V is non-positive definite." << endl;
1095 _V_inv_mtd = 1;
1096 cout << "\nSwitching to the \"bending\" approach to invert V. This method hasn't been tested. The results might not be reliable!" << endl;
1097 }
1098 /*else {
1099 if(_reml_no_converge){
1100 cout<<"Warning: the variance-covaraince matrix is invertible. A small positive value is added to the diagonals. The results might not be reliable!"<<endl;
1101 double d_buf = Vi.diagonal().mean() * 0.01;
1102 for(j = 0; j < _n ; j++) Vi(j,j) += d_buf;
1103 if(!comput_inverse_logdet_LDLT_mkl(Vi, logdet)) return false;
1104 }
1105 else throw("Error: the variance-covaraince matrix V is not positive definite.");
1106 }*/
1107 }
1108 }
1109 if (_V_inv_mtd == 1) bend_V(Vi);
1110 /*if (_V_inv_mtd == 2) {
1111 if(!_reml_force_converge){
1112 cout << "Switching from Cholesky to LU decomposition approach. The results might not be reliable!" << endl;
1113 if (!comput_inverse_logdet_LU_mkl(Vi, logdet)){
1114 if(_reml_no_converge){
1115 cout<<"Warning: the variance-covaraince matrix is invertible. A small positive value is added to the diagonals. The results might not be reliable!"<<endl;
1116 double d_buf = Vi.diagonal().mean() * 0.01;
1117 for(j = 0; j < _n ; j++) Vi(j,j) += d_buf;
1118 if(!comput_inverse_logdet_LDLT_mkl(Vi, logdet)) return false;
1119 }
1120 else throw ("Error: the variance-covaraince matrix V is still not invertible using LU decomposition.");
1121 }
1122 }
1123 else{
1124 cout<<"Warning: the variance-covaraince matrix is invertible. A small positive value is added to the diagonals. The results might not be reliable!"<<endl;
1125 double d_buf = Vi.diagonal().mean() * 0.01;
1126 for(j = 0; j < _n ; j++) Vi(j,j) += d_buf;
1127 comput_inverse_logdet_LU_mkl(Vi, logdet);
1128 }
1129 }*/
1130 }
1131
1132 return true;
1133 }
1134
1135
bend_V(eigenMatrix & Vi)1136 void gcta::bend_V(eigenMatrix &Vi)
1137 {
1138 SelfAdjointEigenSolver<eigenMatrix> eigensolver(Vi);
1139 eigenVector eval = eigensolver.eigenvalues();
1140 bending_eigenval(eval);
1141 eval.array() = 1.0 / eval.array();
1142 Vi = eigensolver.eigenvectors() * eigenDiagMat(eval) * eigensolver.eigenvectors().transpose();
1143 }
1144
1145
bend_A()1146 void gcta::bend_A() {
1147 _Vi.resize(0, 0);
1148 _P.resize(0, 0);
1149 cout << "Bending the GRM(s) to be positive-definite (may take a while if there are multiple GRMs)..." << endl;
1150 int i = 0;
1151 for (i = 0; i < _r_indx.size() - 1; i++) {
1152 #ifdef SINGLE_PRECISION
1153 SelfAdjointEigenSolver<eigenMatrix> eigensolver(_A[_r_indx[i]]);
1154 #else
1155 SelfAdjointEigenSolver<eigenMatrix> eigensolver((_A[_r_indx[i]]).cast<double>());
1156 #endif
1157 eigenVector eval = eigensolver.eigenvalues();
1158 if (bending_eigenval(eval)) {
1159 (_A[_r_indx[i]]) = eigensolver.eigenvectors() * eigenDiagMat(eval) * eigensolver.eigenvectors().transpose();
1160 cout << "Bending the " << i + 1 << "th GRM completed." << endl;
1161 }
1162 }
1163 }
1164
bending_eigenval(eigenVector & eval)1165 bool gcta::bending_eigenval(eigenVector &eval) {
1166 int j = 0;
1167 double eval_m = eval.mean();
1168 if (eval.minCoeff() > 0.0) return false;
1169 double S = 0.0, P = 0.0;
1170 for (j = 0; j < eval.size(); j++) {
1171 if (eval[j] >= 0) continue;
1172 S += eval[j];
1173 P = -eval[j];
1174 }
1175 double W = S * S * 100.0 + 1;
1176 for (j = 0; j < eval.size(); j++) {
1177 if (eval[j] >= 0) continue;
1178 eval[j] = P * (S - eval[j])*(S - eval[j]) / W;
1179 }
1180 eval *= eval_m / eval.mean();
1181 return true;
1182 }
1183
comput_inverse_logdet_LDLT(eigenMatrix & Vi,double & logdet)1184 bool gcta::comput_inverse_logdet_LDLT(eigenMatrix &Vi, double &logdet) {
1185 int i = 0, n = Vi.cols();
1186 LDLT<eigenMatrix> ldlt(Vi);
1187 eigenVector d = ldlt.vectorD();
1188
1189 if (d.minCoeff() < 0) return false;
1190 else {
1191 logdet = 0.0;
1192 for (i = 0; i < n; i++) logdet += log(d[i]);
1193 Vi.setIdentity();
1194 ldlt.solveInPlace(Vi);
1195 }
1196 return true;
1197 }
1198
comput_inverse_logdet_PLU(eigenMatrix & Vi,double & logdet)1199 bool gcta::comput_inverse_logdet_PLU(eigenMatrix &Vi, double &logdet)
1200 {
1201 int n = Vi.cols();
1202
1203 PartialPivLU<eigenMatrix> lu(Vi);
1204 if (lu.determinant()<1e-6) return false;
1205 eigenVector u = lu.matrixLU().diagonal();
1206 logdet = 0.0;
1207 for (int i = 0; i < n; i++) logdet += log(fabs(u[i]));
1208 Vi = lu.inverse();
1209 return true;
1210 }
1211
comput_inverse_logdet_LU(eigenMatrix & Vi,double & logdet)1212 bool gcta::comput_inverse_logdet_LU(eigenMatrix &Vi, double &logdet)
1213 {
1214 int n = Vi.cols();
1215
1216 FullPivLU<eigenMatrix> lu(Vi);
1217 if (!lu.isInvertible()) return false;
1218 eigenVector u = lu.matrixLU().diagonal();
1219 logdet = 0.0;
1220 for (int i = 0; i < n; i++) logdet += log(fabs(u[i]));
1221 Vi = lu.inverse();
1222 return true;
1223 }
1224
inverse_H(eigenMatrix & H)1225 bool gcta::inverse_H(eigenMatrix &H)
1226 {
1227 double d_buf = 0.0;
1228 if (!comput_inverse_logdet_LDLT_mkl(H, d_buf)) return false;
1229 /*{
1230 if(_reml_force_inv) {
1231 cout<<"Warning: the information matrix is non-positive definite. Switching from Cholesky to LU decomposition approach. The results might not be reliable!"<<endl;
1232 if (!comput_inverse_logdet_LU_mkl(H, d_buf)){
1233 cout<<"Warning: the information matrix is invertible. A small positive value is added to the diagonals. The results might not be reliable!"<<endl;
1234 int i = 0;
1235 d_buf = H.diagonal().mean() * 0.001;
1236 for(i = 0; i < H.rows(); i++) H(i,i) += d_buf;
1237 if (!comput_inverse_logdet_LU_mkl(H, d_buf)) return false;
1238 }
1239 }
1240 else return false;
1241 }*/
1242 else return true;
1243 }
1244
calcu_P(eigenMatrix & Vi,eigenMatrix & Vi_X,eigenMatrix & Xt_Vi_X_i,eigenMatrix & P)1245 double gcta::calcu_P(eigenMatrix &Vi, eigenMatrix &Vi_X, eigenMatrix &Xt_Vi_X_i, eigenMatrix &P)
1246 {
1247 Vi_X = Vi*_X;
1248 Xt_Vi_X_i = _X.transpose() * Vi_X;
1249 double logdet_Xt_Vi_X = 0.0;
1250 if(!comput_inverse_logdet_LU(Xt_Vi_X_i, logdet_Xt_Vi_X)) throw("\nError: the X^t * V^-1 * X matrix is not invertible. Please check the covariate(s) and/or the environmental factor(s).");
1251 P = Vi - Vi_X * Xt_Vi_X_i * Vi_X.transpose();
1252 return logdet_Xt_Vi_X;
1253 }
1254
1255 // input P, calculate PA and Hi
// Given P, compute PA_i = P*A_i for every variance component and build the
// information matrix H with H(i,j) = tr(PA_i * PA_j), then invert it in place
// so Hi holds H^-1 on return. When inversion fails: sets _reml_AI_not_invertible
// under --reml-force-converge, otherwise throws.
void gcta::calcu_Hi(eigenMatrix &P, eigenMatrix &Hi)
{
    int i = 0, j = 0, k = 0, l = 0;
    double d_buf = 0.0;

    // Calculate PA_i = P * A_i (sparse GRMs are used in bivariate / within-family mode)
    vector<eigenMatrix> PA(_r_indx.size());
    for (i = 0; i < _r_indx.size(); i++) {
        (PA[i]).resize(_n, _n);
        if (_bivar_reml || _within_family) (PA[i]) = P * (_Asp[_r_indx[i]]);
        else (PA[i]) = P * (_A[_r_indx[i]]);
    }

    // H(i,j) = tr(PA_i * PA_j) accumulated as sum_k sum_l PA_i(k,l) * PA_j(l,k);
    // H is symmetric, so only the lower triangle is computed and mirrored
    for (i = 0; i < _r_indx.size(); i++) {
        for (j = 0; j <= i; j++) {
            d_buf = 0.0;
            for (k = 0; k < _n; k++) {
                for (l = 0; l < _n; l++) d_buf += (PA[i])(k, l)*(PA[j])(l, k);
            }
            Hi(i, j) = Hi(j, i) = d_buf;
        }
    }

    if (!inverse_H(Hi)){
        if(_reml_force_converge){
            cout << "Warning: the information matrix is not invertible." << endl;
            _reml_AI_not_invertible = true;
        }
        else throw ("Error: the information matrix is not invertible.");
    }
}
1288
1289 // use Fisher-scoring to estimate variance component
1290 // input P, calculate PA, H, R and varcmp
1291
reml_equation(eigenMatrix & P,eigenMatrix & Hi,eigenVector & Py,eigenVector & varcmp)1292 void gcta::reml_equation(eigenMatrix &P, eigenMatrix &Hi, eigenVector &Py, eigenVector &varcmp)
1293 {
1294 // Calculate Hi
1295 calcu_Hi(P, Hi);
1296 if(_reml_AI_not_invertible) return;
1297
1298 // Calculate R
1299 Py = P*_y;
1300 eigenVector R(_r_indx.size());
1301 for (int i = 0; i < _r_indx.size(); i++) {
1302 if (_bivar_reml || _within_family) R(i) = (Py.transpose()*(_Asp[_r_indx[i]]) * Py)(0, 0);
1303 else R(i) = (Py.transpose()*(_A[_r_indx[i]]) * Py)(0, 0);
1304 }
1305
1306 // Calculate variance component
1307 varcmp = Hi*R;
1308 Hi = 2 * Hi; // for calculation of SE
1309 }
1310
// use the AI (average information) algorithm to estimate variance components
// input P; calculate APy, the AI matrix (Hi), R and varcmp
1313
ai_reml(eigenMatrix & P,eigenMatrix & Hi,eigenVector & Py,eigenVector & prev_varcmp,eigenVector & varcmp,double dlogL)1314 void gcta::ai_reml(eigenMatrix &P, eigenMatrix &Hi, eigenVector &Py, eigenVector &prev_varcmp, eigenVector &varcmp, double dlogL)
1315 {
1316 int i = 0, j = 0;
1317
1318 Py = P*_y;
1319 eigenVector cvec(_n);
1320 eigenMatrix APy(_n, _r_indx.size());
1321 for (i = 0; i < _r_indx.size(); i++) {
1322 if (_bivar_reml || _within_family) (APy.col(i)) = (_Asp[_r_indx[i]]) * Py;
1323 else (APy.col(i)) = (_A[_r_indx[i]]) * Py;
1324 }
1325
1326 // Calculate Hi
1327 eigenVector R(_r_indx.size());
1328 for (i = 0; i < _r_indx.size(); i++) {
1329 R(i) = (Py.transpose()*(APy.col(i)))(0, 0);
1330 cvec = P * (APy.col(i));
1331 Hi(i, i) = ((APy.col(i)).transpose() * cvec)(0, 0);
1332 for (j = 0; j < i; j++) Hi(j, i) = Hi(i, j) = ((APy.col(j)).transpose() * cvec)(0, 0);
1333 }
1334 Hi = 0.5 * Hi;
1335
1336 // Calcualte tr(PA) and dL
1337 eigenVector tr_PA;
1338 calcu_tr_PA(P, tr_PA);
1339 R = -0.5 * (tr_PA - R);
1340
1341 // Calculate variance component
1342 if (!inverse_H(Hi)){
1343 if(_reml_force_converge){
1344 cout << "Warning: the information matrix is not invertible." << endl;
1345 _reml_AI_not_invertible = true;
1346 return;
1347 }
1348 else throw ("Error: the information matrix is not invertible.");
1349 }
1350
1351 eigenVector delta(_r_indx.size());
1352 delta = Hi*R;
1353 if (dlogL > 1.0) varcmp = prev_varcmp + 0.316 * delta;
1354 else varcmp = prev_varcmp + delta;
1355 }
1356
1357 // input P, calculate varcmp
1358
em_reml(eigenMatrix & P,eigenVector & Py,eigenVector & prev_varcmp,eigenVector & varcmp)1359 void gcta::em_reml(eigenMatrix &P, eigenVector &Py, eigenVector &prev_varcmp, eigenVector &varcmp)
1360 {
1361 int i = 0;
1362
1363 // Calculate trace(PA)
1364 eigenVector tr_PA;
1365 calcu_tr_PA(P, tr_PA);
1366
1367 // Calculate R
1368 Py = P*_y;
1369 eigenVector R(_r_indx.size());
1370 for (i = 0; i < _r_indx.size(); i++) {
1371 if (_bivar_reml || _within_family) R(i) = (Py.transpose()*(_Asp[_r_indx[i]]) * Py)(0, 0);
1372 else R(i) = (Py.transpose()*(_A[_r_indx[i]]) * Py)(0, 0);
1373 }
1374
1375 // Calculate variance component
1376 for (i = 0; i < _r_indx.size(); i++) varcmp(i) = (prev_varcmp(i) * _n - prev_varcmp(i) * prev_varcmp(i) * tr_PA(i) + prev_varcmp(i) * prev_varcmp(i) * R(i)) / _n;
1377
1378 // added by Jian Yang Dec 2014
1379 //varcmp = (varcmp.array() - prev_varcmp.array())*2 + prev_varcmp.array();
1380 }
1381
1382 // input P, calculate tr(PA)
calcu_tr_PA(eigenMatrix & P,eigenVector & tr_PA)1383 void gcta::calcu_tr_PA(eigenMatrix &P, eigenVector &tr_PA) {
1384 int i = 0, k = 0, l = 0;
1385 double d_buf = 0.0;
1386
1387 // Calculate trace(PA)
1388 tr_PA.resize(_r_indx.size());
1389 for (i = 0; i < _r_indx.size(); i++) {
1390 if (_bivar_reml || _within_family) tr_PA(i) = (P * (_Asp[_r_indx[i]])).diagonal().sum();
1391 else {
1392 d_buf = 0.0;
1393 for (k = 0; k < _n; k++) {
1394 for (l = 0; l < _n; l++) d_buf += P(k, l)*(_A[_r_indx[i]])(k, l);
1395 }
1396 tr_PA(i) = d_buf;
1397 }
1398 }
1399 }
1400
// BLUP estimate of SNP effects
// BLUP solutions for SNP effects from the individual-level BLUP matrix
// (_varcmp_Py), using hard genotype calls. Results are written via output_blup_snp().
void gcta::blup_snp_geno() {
    check_autosome();

    if (_mu.empty()) calcu_mu(); // allele means (2p), needed for centring and 2pq below

    int i = 0, j = 0, k = 0, col_num = _varcmp_Py.cols();
    double x = 0.0, fcount = 0.0;

    // Calculate the BLUP solution to SNP effects
    cout << "Calculating the BLUP solution to SNP effects ..." << endl;
    vector<double> var_SNP(_include.size()); // 1 / (2pq): reciprocal variance of each SNP
    eigenMatrix b_SNP = eigenMatrix::Zero(_include.size(), col_num);
    for (j = 0; j < _include.size(); j++) {
        var_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]); // 2p * (1 - p)
        if (fabs(var_SNP[j]) < 1.0e-50) var_SNP[j] = 0.0; // monomorphic SNP: avoid division by ~0
        else var_SNP[j] = 1.0 / var_SNP[j];
    }

    for (k = 0; k < _include.size(); k++) {
        fcount = 0.0; // count of individuals with non-missing genotype at SNP k
        for (i = 0; i < _keep.size(); i++) {
            // skips missing genotypes — presumably coded as _snp_1==1 && _snp_2==0; confirm against the bed reader
            if (!_snp_1[_include[k]][_keep[i]] || _snp_2[_include[k]][_keep[i]]) {
                // allele count oriented to the reference allele _ref_A
                if (_allele1[_include[k]] == _ref_A[_include[k]]) x = _snp_1[_include[k]][_keep[i]] + _snp_2[_include[k]][_keep[i]];
                else x = 2.0 - (_snp_1[_include[k]][_keep[i]] + _snp_2[_include[k]][_keep[i]]);
                x = (x - _mu[_include[k]]); // centre by the allele mean (2p)
                for (j = 0; j < col_num; j++) b_SNP(k, j) += x * _varcmp_Py(i, j);
                fcount += 1.0;
            }
        }
        // scale by 1/(2pq), average over non-missing individuals, rescale by N/M
        for (j = 0; j < col_num; j++) b_SNP(k, j) = (b_SNP(k, j) * var_SNP[k] / fcount)*((double) _keep.size() / (double) _include.size());
    }
    output_blup_snp(b_SNP);
}
1436
// BLUP solutions for SNP effects from the individual-level BLUP matrix
// (_varcmp_Py), using imputed dosage scores instead of hard genotype calls.
void gcta::blup_snp_dosage() {
    check_autosome();

    if (_mu.empty()) calcu_mu(); // allele means (2p)

    int i = 0, j = 0, k = 0, col_num = _varcmp_Py.cols();

    // Centre each dosage by its allele mean (2p); note: mutates _geno_dose in place
    for (i = 0; i < _keep.size(); i++) {
        for (j = 0; j < _include.size(); j++) _geno_dose[_keep[i]][_include[j]] -= _mu[_include[j]];
    }

    // Calculate the BLUP solution to SNP effects
    cout << "Calculating the BLUP solution to SNP effects using imputed dosage scores ... " << endl;
    vector<double> var_SNP(_include.size()); // reciprocal sample variance of each SNP dosage
    eigenMatrix b_SNP = eigenMatrix::Zero(_include.size(), col_num);
    for (j = 0; j < _include.size(); j++) {
        // sample variance of the (already centred) dosages
        for (i = 0; i < _keep.size(); i++) var_SNP[j] += _geno_dose[_keep[i]][_include[j]] * _geno_dose[_keep[i]][_include[j]];
        var_SNP[j] /= (double) (_keep.size() - 1);
        if (fabs(var_SNP[j]) < 1.0e-50) var_SNP[j] = 0.0; // monomorphic SNP: avoid division by ~0
        else var_SNP[j] = 1.0 / var_SNP[j];
    }
    for (k = 0; k < _include.size(); k++) {
        for (i = 0; i < _keep.size(); i++) {
            for (j = 0; j < col_num; j++) b_SNP(k, j) += _geno_dose[_keep[i]][_include[k]] * _varcmp_Py(i, j);
        }
        // scale by the reciprocal variance and by 1/M (number of SNPs)
        for (j = 0; j < col_num; j++) b_SNP(k, j) = b_SNP(k, j) * var_SNP[k] / (double) _include.size();
    }
    output_blup_snp(b_SNP);
}
1467
output_blup_snp(eigenMatrix & b_SNP)1468 void gcta::output_blup_snp(eigenMatrix &b_SNP) {
1469 string o_b_snp_file = _out + ".snp.blp";
1470 ofstream o_b_snp(o_b_snp_file.c_str());
1471 if (!o_b_snp) throw ("Error: can not open the file " + o_b_snp_file + " to write.");
1472 int i = 0, j = 0, col_num = b_SNP.cols();
1473 cout << "Writing BLUP solutions of SNP effects for " << _include.size() << " SNPs to [" + o_b_snp_file + "]." << endl;
1474 for (i = 0; i < _include.size(); i++) {
1475 o_b_snp << _snp_name[_include[i]] << "\t" << _ref_A[_include[i]] << "\t";
1476 for (j = 0; j < col_num; j++) o_b_snp << b_SNP(i, j) << "\t";
1477 o_b_snp << endl;
1478 }
1479 o_b_snp.close();
1480 cout << "BLUP solutions of SNP effects for " << _include.size() << " SNPs have been saved in the file [" + o_b_snp_file + "]." << endl;
1481 }
1482
HE_reg(string grm_file,string phen_file,string keep_indi_file,string remove_indi_file,int mphen)1483 void gcta::HE_reg(string grm_file, string phen_file, string keep_indi_file, string remove_indi_file, int mphen) {
1484 int i = 0, j = 0, k = 0, l = 0;
1485 stringstream errmsg;
1486 vector<string> phen_ID, grm_id;
1487 vector< vector<string> > phen_buf; // save individuals by column
1488
1489 read_grm(grm_file, grm_id, true, false, true);
1490 update_id_map_kp(grm_id, _id_map, _keep);
1491 read_phen(phen_file, phen_ID, phen_buf, mphen);
1492 update_id_map_kp(phen_ID, _id_map, _keep);
1493 if (!keep_indi_file.empty()) keep_indi(keep_indi_file);
1494 if (!remove_indi_file.empty()) remove_indi(remove_indi_file);
1495
1496 vector<string> uni_id;
1497 map<string, int> uni_id_map;
1498 map<string, int>::iterator iter;
1499 for (i = 0; i < _keep.size(); i++) {
1500 uni_id.push_back(_fid[_keep[i]] + ":" + _pid[_keep[i]]);
1501 uni_id_map.insert(pair<string, int>(_fid[_keep[i]] + ":" + _pid[_keep[i]], i));
1502 }
1503 _n = _keep.size();
1504 if (_n < 1) throw ("Error: no individual is in common in the input files.");
1505
1506 _y.setZero(_n);
1507 for (i = 0; i < phen_ID.size(); i++) {
1508 iter = uni_id_map.find(phen_ID[i]);
1509 if (iter == uni_id_map.end()) continue;
1510 _y[iter->second] = atof(phen_buf[i][mphen - 1].c_str());
1511 }
1512
1513 cout << "\nPerforming Haseman-Elston regression ...\n" << endl;
1514 int n = _n * (_n - 1) / 2;
1515 /* vector<bool> nomiss(_n*(_n-1));
1516 for(i=0, k=0; i<_n; i++){
1517 for(j=0; j<i; j++, k++){
1518 if(CommFunc::FloatNotEqual(_grm(i,j),0)){
1519 nomiss[k]=true;
1520 n++;
1521 }
1522 else nomiss[k]=false;
1523 }
1524 }*/
1525
1526 // normalise phenotype
1527 cout << "Standardising the phenotype ..." << endl;
1528 eigenVector y = _y.array() - _y.mean();
1529 y = y.array() / sqrt(y.squaredNorm() / (_n - 1.0));
1530
1531 vector<double> y_cp(n), y_sd(n), x(n), rst;
1532 for (i = 0, k = 0; i < _n; i++) {
1533 for (j = 0; j < i; j++, k++) {
1534 y_sd[k] = (y[i] - y[j])*(y[i] - y[j]);
1535 y_cp[k] = y[i]*y[j];
1536 x[k] = _grm(i, j);
1537 }
1538 }
1539 eigenMatrix reg_sum_sd = reg(y_sd, x, rst, true);
1540 eigenMatrix reg_sum_cp = reg(y_cp, x, rst, true);
1541
1542 stringstream ss;
1543 ss << "HE-SD" << endl;
1544 ss << "Coefficient\tEstimate\tSE\tP\n";
1545 ss << "Intercept\t" << reg_sum_sd.row(0) << endl;
1546 ss << "Slope\t" << reg_sum_sd.row(1) << endl;
1547 ss << "V(G)/Vp\t" << -1 * reg_sum_sd(1, 0) / reg_sum_sd(0.0) << "\t" << reg_sum_sd(1, 1) / fabs(reg_sum_sd(0.0)) << endl;
1548 ss << "\nHE-CP" << endl;
1549 ss << "Coefficient\tEstimate\tSE\tP\n";
1550 ss << "Intercept\t" << reg_sum_cp.row(0) << endl;
1551 ss << "Slope\t" << reg_sum_cp.row(1) << endl;
1552 ss << "V(G)/Vp\t" << reg_sum_cp(1, 0) << "\t" << reg_sum_cp(1, 1) << endl;
1553
1554 cout << ss.str() << endl;
1555 string ofile = _out + ".HEreg";
1556 ofstream os(ofile.c_str());
1557 if (!os) throw ("Error: can not open the file [" + ofile + "] to write.");
1558 os << ss.str() << endl;
1559 cout << "Results from Haseman-Elston regression have been saved in [" + ofile + "]." << endl;
1560
1561 }