1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementations of functions for BLUP analysis using sumamry data from EWAS
5 *
6 * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13
14 #include "gcta.h"
15
calcu_eR()16 void gcta::calcu_eR()
17 {
18 // _probe_data rotated
19 eigenMatrix X(_probe_data);
20 _probe_data.resize(_e_include.size(), _keep.size());
21
22 int i=0, j=0;
23 #pragma omp parallel for private(j)
24 for(i=0; i<_keep.size(); i++){
25 for(j=0; j<_e_include.size(); j++) _probe_data(j,i)=X(_keep[i], _e_include[j]);
26 }
27 eigenVector m(_e_include.size()), nonmiss(_e_include.size());
28
29 #pragma omp parallel for private(i)
30 for(j=0; j<_e_include.size(); j++){
31 m(j)=0.0;
32 nonmiss(j)=0.0;
33 for(i=0; i<_keep.size(); i++){
34 if(_probe_data(j,i)<1e9){
35 m(j)+=_probe_data(j,i);
36 nonmiss(j)+=1.0;
37 }
38 }
39 m(j)/=nonmiss(j);
40 }
41
42 #pragma omp parallel for private(j)
43 for(i=0; i<_keep.size(); i++){
44 for(j=0; j<_e_include.size(); j++){
45 if(_probe_data(j,i)<1e9) _probe_data(j,i)-=m(j);
46 else _probe_data(j,i)=0.0;
47 }
48 }
49 eigenVector d(_e_include.size());
50
51 #pragma omp parallel for
52 for(j=0; j<_e_include.size(); j++){
53 d(j)=_probe_data.row(j).dot(_probe_data.row(j));
54 }
55
56 _ecojo_wholeR=_probe_data*_probe_data.transpose();
57
58 #pragma omp parallel for private(j)
59 for(i=0; i<_e_include.size(); i++){
60 _ecojo_wholeR(i,i)=1.0;
61 for(j=i+1; j<_e_include.size(); j++) _ecojo_wholeR(i,j)=_ecojo_wholeR(j,i)=_ecojo_wholeR(i,j)/sqrt(d(i)*d(j));
62 }
63 _probe_data.resize(0,0);
64 }
65
read_eR(string eR_file)66 void gcta::read_eR(string eR_file)
67 {
68 ifstream eR_inf(eR_file.c_str());
69 if (!eR_inf.is_open()) throw ("Error: can not open the file [" + eR_file + "] to read.");
70 cout << "Reading correlation matrix of gene expression from [" + eR_file + "] ..." << endl;
71
72 string str_buf="";
73 getline(eR_inf, str_buf); // reading the probe names
74 _probe_num = StrFunc::split_string(str_buf, _probe_name, " \t\n");
75 cout<<_probe_num<<" probes found in the file. \nReading correlation matrix ..."<<endl;
76 int i = 0, j = 0;
77 _ecojo_wholeR.resize(_probe_num, _probe_num);
78 for(i = 0; i < _probe_num; i++) {
79 for(j = 0; j < _probe_num; j++) {
80 if(!(eR_inf >> _ecojo_wholeR(i,j))) throw("Error: incorrect format of [" + eR_file + "].");
81 }
82 }
83 eR_inf.close();
84 cout<<"Correlation matrix for "<<_probe_num<<" probes have been included from the file [" + eR_file + "]."<<endl;
85
86 init_e_include();
87 }
88
read_e_metafile(string e_metafile)89 void gcta::read_e_metafile(string e_metafile)
90 {
91 cout << "\nReading expression-trait association summary-level statistics from [" + e_metafile + "] ..." << endl;
92 ifstream e_meta(e_metafile.c_str());
93 if (!e_meta) throw ("Error: can not open the file [" + e_metafile + "] to read.");
94
95 string str_buf="";
96 double d_buf=0.0;
97 vector<string> vs_buf, probe_buf;
98 vector<double> z_buf, n_buf;
99
100 getline(e_meta, str_buf); // the header line
101 if (StrFunc::split_string(str_buf, vs_buf) < 3) throw ("Error: there needs be at least 3 columns in the file [" + e_metafile + "].");
102 stringstream errmsg;
103 int line=1;
104 while(getline(e_meta, str_buf)){
105 stringstream iss(str_buf);
106 if(!(iss >> str_buf)){ errmsg<<"Error: in line "<<line<<"."; throw(errmsg.str()); }
107 if (_probe_name_map.find(str_buf) == _probe_name_map.end()) continue;
108 probe_buf.push_back(str_buf);
109 if(!(iss >> d_buf)){ errmsg<<"Error: in line "<<line<<"."; throw(errmsg.str()); }
110 z_buf.push_back(d_buf);
111 if(!(iss >> d_buf)){ errmsg<<"Error: in line "<<line<<"."; throw(errmsg.str()); }
112 n_buf.push_back(d_buf);
113 line++;
114 }
115 e_meta.close();
116 if(probe_buf.size()<1) throw("Error: no probe remains in the analysis.");
117 cout << "GWAS summary statistics of " << probe_buf.size() << " probs read from [" + e_metafile + "]." << endl;
118
119 cout << "Matching the summary data to the genotype data ..." << endl;
120 update_id_map_kp(probe_buf, _probe_name_map, _e_include);
121 vector<int> indx(_e_include.size());
122 map<string, int> id_map;
123 int i=0;
124 for (i = 0; i < probe_buf.size(); i++) id_map.insert(pair<string, int>(probe_buf[i], i));
125 map<string, int>::iterator iter;
126 for (i = 0; i < _e_include.size(); i++) {
127 iter = id_map.find(_probe_name[_e_include[i]]);
128 indx[i] = iter->second;
129 }
130 _ecojo_z.resize(_e_include.size());
131 _ecojo_b.resize(_e_include.size());
132 _ecojo_se.resize(_e_include.size());
133 _ecojo_n.resize(_e_include.size());
134 _ecojo_pval.resize(_e_include.size());
135
136 //#pragma omp parallel for private(d_buf)
137 for (i = 0; i < _e_include.size(); i++) {
138 _ecojo_z[i]=z_buf[indx[i]];
139 _ecojo_pval[i] = StatFunc::pchisq(_ecojo_z[i]*_ecojo_z[i], 1);
140 _ecojo_n[i]=n_buf[indx[i]];
141 d_buf=sqrt(_ecojo_z[i]*_ecojo_z[i] + _ecojo_n[i]);
142 if(d_buf<1e-30){
143 _ecojo_b[i]=0.0;
144 _ecojo_se[i]=-1.0;
145 }
146 else{
147 _ecojo_b[i]=_ecojo_z[i]/d_buf;
148 _ecojo_se[i]=1.0/d_buf;
149 }
150 }
151 }
152
run_ecojo_slct(string e_metafile,double p_cutoff,double collinear)153 void gcta::run_ecojo_slct(string e_metafile, double p_cutoff, double collinear)
154 {
155 bool joint_only=false, backward=false;
156 _ecojo_p_cutoff = p_cutoff;
157 _ecojo_collinear = collinear;
158 read_e_metafile(e_metafile);
159 calcu_eR();
160
161 int i = 0, j = 0;
162 vector<int> slct, remain;
163 eigenVector bC, bC_se, pC;
164 cout << endl;
165 if (!joint_only && !backward) {
166 cout << "Performing stepwise model selection on " << _e_include.size() << " probes to select association signals ... (p cutoff = " << _ecojo_p_cutoff << "; ";
167 cout << "collinearity cutoff = " << _ecojo_collinear << ")"<< endl;
168 ecojo_slct(slct, remain, bC, bC_se, pC);
169 if (slct.empty()) {
170 cout << "No probe has been selected." << endl;
171 return;
172 }
173 }
174 else {
175 for (i = 0; i < _e_include.size(); i++) slct.push_back(i);
176 if (backward) {
177 cout << "Performing backward selection on " << _e_include.size() << " probes at threshold p-value = " << _ecojo_p_cutoff << " ..." << endl;
178 ecojo_slct_stay(slct, bC, bC_se, pC);
179 }
180 }
181
182 // joint analysis
183 eigenVector bJ, bJ_se, pJ;
184 cout << "Performing joint analysis on all the " << slct.size();
185 if (joint_only) cout << " probes ..." << endl;
186 else cout << " selected signals ..." << endl;
187 if (slct.size() >= _keep.size()) throw ("Error: too many probes. The number of probes in a joint analysis should not be larger than the sample size.");
188 ecojo_joint(slct, bJ, bJ_se, pJ);
189 ecojo_slct_output(joint_only, slct, bJ, bJ_se, pJ);
190 }
191
ecojo_slct_output(bool joint_only,vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)192 void gcta::ecojo_slct_output(bool joint_only, vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ)
193 {
194 string filename = _out + ".slct.ecojo";
195 if (joint_only) cout << "Saving the joint analysis result of " << slct.size() << " probes to [" + filename + "] ..." << endl;
196 else cout << "Saving the " << slct.size() << " independent signals to [" + filename + "] ..." << endl;
197 ofstream ofile(filename.c_str());
198 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
199 ofile << "Probe\tb\tse\tz\tn\tbJ\tbJ_se\tzJ\tpJ"<< endl;
200 int i = 0, j = 0;
201 for (i = 0; i < slct.size(); i++) {
202 j = slct[i];
203 ofile << _probe_name[_e_include[j]] << "\t" << _ecojo_b[j] << "\t" <<_ecojo_se[j] << "\t" << _ecojo_z[j] << "\t" << _ecojo_n[j] << "\t"<< bJ[i] << "\t" << bJ_se[i] << "\t" << bJ[i]/bJ_se[i] << "\t" << pJ[i] << "\t" << endl;
204 }
205 ofile.close();
206 }
207
ecojo_slct(vector<int> & slct,vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)208 void gcta::ecojo_slct(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC)
209 {
210 int i = 0, i_buf = 0;
211 vector<double> p_buf;
212 eigenVector2Vector(_ecojo_pval, p_buf);
213 int m = min_element(p_buf.begin(), p_buf.end()) - p_buf.begin();
214 if (p_buf[m] >= _ecojo_p_cutoff) return;
215 slct.push_back(m);
216 for (i = 0; i < _e_include.size(); i++) {
217 if (i != m) remain.push_back(i);
218 }
219 int prev_num = 0;
220 ecojo_init_R(slct);
221 ecojo_init_RC(slct, remain);
222 if (_ecojo_p_cutoff > 1e-3) cout << "Performing forward model selection because the significance level is too low..." << endl;
223
224 while (!remain.empty()) {
225 if (ecojo_slct_entry(slct, remain, bC, bC_se, pC)) {
226 if (_ecojo_p_cutoff <= 1e-3) ecojo_slct_stay(slct, bC, bC_se, pC);
227 ecojo_init_RC(slct, remain);
228 }
229 else break;
230 if (slct.size() % 5 == 0 && slct.size() > prev_num) cout << slct.size() << " associated probes have been selected." << endl;
231 if (slct.size() > prev_num) prev_num = slct.size();
232 }
233 if (_ecojo_p_cutoff > 1e-3) {
234 cout << "Performing backward elimination..." << endl;
235 ecojo_slct_stay(slct, bC, bC_se, pC);
236 }
237 cout << "Finally, " << slct.size() << " associated probes are selected." << endl;
238 }
239
ecojo_slct_entry(vector<int> & slct,vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)240 bool gcta::ecojo_slct_entry(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC)
241 {
242 int i = 0, m = 0;
243 ecojo_cond(slct, remain, bC, bC_se, pC);
244 vector<double> pC_buf;
245 eigenVector2Vector(pC, pC_buf);
246
247 while (true) {
248 m = min_element(pC_buf.begin(), pC_buf.end()) - pC_buf.begin();
249 if (pC_buf[m] >= _ecojo_p_cutoff){
250
251 ecojo_init_RC(slct, remain);
252 ecojo_cond(slct, remain, bC, bC_se, pC);
253
254
255 // debug
256 /* cout<<"here"<<endl;
257 ofstream tmp("cond.txt");
258 for(int j=0; j<remain.size(); j++){
259 tmp<<_probe_name[_e_include[remain[j]]]<<" "<<bC[j]<<" "<<bC_se[j]<<" "<<pC[j]<<endl;
260 }
261 tmp.close();
262 ofstream oR("R.txt");
263 for(int j=0; j<_ecojo_R.rows(); j++){
264 for(int k=0; k<_ecojo_R.cols(); k++) oR<<_ecojo_R(j,k)<<" ";
265 oR<<endl;
266 }
267 oR.close();
268 ofstream oRC("RC.txt");
269 for(int j=0; j<_ecojo_RC.rows(); j++){
270 for(int k=0; k<_ecojo_RC.cols(); k++) oRC<<_ecojo_RC(j,k)<<" ";
271 oRC<<endl;
272 }
273 oRC.close();*/
274
275 return false;
276 }
277 if (ecojo_insert_R(slct, remain[m])){
278 slct.push_back(remain[m]);
279 stable_sort(slct.begin(), slct.end());
280 remain.erase(remain.begin() + m);
281 return (true);
282 }
283 pC_buf.erase(pC_buf.begin() + m);
284 remain.erase(remain.begin() + m);
285
286 // debug
287 //cout<<"remain = "<<remain.size()<<endl;
288 }
289 }
290
ecojo_slct_stay(vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)291 void gcta::ecojo_slct_stay(vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ)
292 {
293 vector<double> pJ_buf;
294 while(!slct.empty()){
295 ecojo_joint(slct, bJ, bJ_se, pJ);
296 eigenVector2Vector(pJ, pJ_buf);
297 int m = max_element(pJ_buf.begin(), pJ_buf.end()) - pJ_buf.begin();
298 if(pJ[m] > _ecojo_p_cutoff){
299 slct.erase(slct.begin() + m);
300 ecojo_erase_R(slct);
301 }
302 else break;
303 }
304 }
305
ecojo_joint(const vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)306 void gcta::ecojo_joint(const vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ)
307 {
308 int i = 0, size = slct.size();
309 eigenVector b(size), n(size);
310 for(i = 0; i < size; i++){
311 b[i] = _ecojo_b[slct[i]];
312 n[i] = _ecojo_n[slct[i]];
313 }
314 bJ = _ecojo_R*b;
315 bJ_se = eigenVector::Constant(size, -1);
316 pJ = eigenVector::Constant(size, 2);
317 double chisq=0.0;
318 for(i = 0; i < size; i++){
319 bJ_se[i] = _ecojo_R(i,i) / n[i];
320 if (bJ_se[i] > 1.0e-30) {
321 bJ_se[i] = sqrt(bJ_se[i]);
322 chisq = bJ[i] / bJ_se[i];
323 pJ[i] = StatFunc::pchisq(chisq*chisq, 1);
324 }
325 }
326 }
327
ecojo_cond(const vector<int> & slct,const vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)328 void gcta::ecojo_cond(const vector<int> &slct, const vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC)
329 {
330 int i=0, j=0;
331 eigenVector b1(slct.size()), b2(remain.size());
332 for(i=0; i<slct.size(); i++) b1[i]=_ecojo_b[slct[i]];
333 for(i=0; i<remain.size(); i++) b2[i]=_ecojo_b[remain[i]];
334
335 bC = eigenVector::Constant(remain.size(), 0);
336 bC_se = eigenVector::Constant(remain.size(), -1);
337 pC = eigenVector::Constant(remain.size(), 2);
338 double chisq = 0.0;
339 eigenVector RCRi;
340 //#pragma omp parallel for private(RCRi)
341 for (i = 0; i < remain.size(); i++) {
342 RCRi=_ecojo_R*_ecojo_RC.col(i);
343 bC[i]=b2[i]-RCRi.dot(b1);
344 bC_se[i] = (1 - _ecojo_RC.col(i).dot(RCRi)) / _ecojo_n[remain[i]];
345 if (bC_se[i] > 1e-30) {
346 bC_se[i] = sqrt(bC_se[i]);
347 chisq = bC[i] / bC_se[i];
348 pC[i] = StatFunc::pchisq(chisq*chisq, 1);
349 }
350 }
351 }
352
ecojo_init_R(const vector<int> & slct)353 bool gcta::ecojo_init_R(const vector<int> &slct)
354 {
355 int i=0, j=0, size=slct.size();
356 _ecojo_R.resize(size, size);
357
358 #pragma omp parallel for private(j)
359 for(i=0; i<size; i++){
360 for(j=0; j<size; j++) _ecojo_R(i,j)=_ecojo_wholeR(slct[i],slct[j]);
361 }
362 ecojo_inv_R();
363 if ((1 - eigenVector::Constant(size, 1).array() / _ecojo_R.diagonal().array()).maxCoeff() > _ecojo_collinear) return false;
364
365 return true;
366 }
367
ecojo_init_RC(const vector<int> & slct,const vector<int> & remain)368 void gcta::ecojo_init_RC(const vector<int> &slct, const vector<int> &remain) {
369 int i = 0, j = 0;
370 _ecojo_RC.resize(slct.size(), remain.size());
371
372 #pragma omp parallel for private(j)
373 for (i = 0; i < slct.size(); i++){
374 for (j = 0; j < remain.size(); j++) _ecojo_RC(i,j)=_ecojo_wholeR(slct[i], remain[j]);
375 }
376 }
377
ecojo_insert_R(const vector<int> & slct,int insert_indx)378 bool gcta::ecojo_insert_R(const vector<int> &slct, int insert_indx)
379 {
380 eigenMatrix R_buf(_ecojo_R);
381 vector<int> ix(slct);
382 ix.push_back(insert_indx);
383 stable_sort(ix.begin(), ix.end());
384 int i = 0, j = 0;
385 _ecojo_R.resize(ix.size(), ix.size());
386
387 #pragma omp parallel for private(j)
388 for(i=0; i<ix.size(); i++){
389 for(j=0; j<ix.size(); j++) _ecojo_R(i,j)=_ecojo_wholeR(ix[i],ix[j]);
390 }
391
392 ecojo_inv_R();
393 if((1 - eigenVector::Constant(ix.size(), 1).array() / _ecojo_R.diagonal().array()).maxCoeff() > _ecojo_collinear){
394 _ecojo_R=R_buf;
395 return false;
396 }
397
398 return true;
399 }
400
ecojo_erase_R(const vector<int> & slct)401 void gcta::ecojo_erase_R(const vector<int> &slct)
402 {
403 int i = 0, j = 0;
404 _ecojo_R.resize(slct.size(), slct.size());
405
406 #pragma omp parallel for private(j)
407 for(i=0; i<slct.size(); i++){
408 for(j=0; j<slct.size(); j++) _ecojo_R(i,j)=_ecojo_wholeR(slct[i],slct[j]);
409 }
410 ecojo_inv_R();
411 }
412
run_ecojo_blup_efile(string e_metafile,double lambda)413 void gcta::run_ecojo_blup_efile(string e_metafile, double lambda)
414 {
415 read_e_metafile(e_metafile);
416 cout << "Recoding gene expression data ..." << endl;
417 calcu_eR();
418 ecojo_blup(lambda);
419 }
420
run_ecojo_blup_eR(string e_metafile,double lambda)421 void gcta::run_ecojo_blup_eR(string e_metafile, double lambda)
422 {
423 read_e_metafile(e_metafile);
424 ecojo_blup(lambda);
425 }
426
ecojo_blup(double lambda)427 void gcta::ecojo_blup(double lambda)
428 {
429 cout << "\nPerforming joint analysis on all the " << _e_include.size() << " probes ..." << endl;
430 int i = 0, j=0;
431 double d_n = _ecojo_n.mean();
432 double diag_val=1.0+_e_include.size()*(1.0/lambda-1.0)/d_n;
433 for(i=0; i<_e_include.size(); i++) _ecojo_wholeR(i,i)=diag_val;
434 double logdet=0.0;
435 if (!comput_inverse_logdet_LDLT_mkl(_ecojo_wholeR, logdet)) {
436 cout<<"Note: no solution to LDLT decomposition. Switching to LU decomposition."<<endl;
437 _ecojo_wholeR = _ecojo_wholeR.lu().solve(eigenMatrix::Identity(_e_include.size(), _e_include.size()));
438 //if (!comput_inverse_logdet_LU_mkl(_ecojo_wholeR, logdet)) throw("\nError: the correlation matrix is not invertible.");
439 }
440 eigenVector bJ=_ecojo_wholeR*_ecojo_b;
441
442 string filename = _out + ".blup.ecojo";
443 cout << "Saving the BLUP analysis result of " << _e_include.size() << " probes to [" + filename + "] ..." << endl;
444 ofstream ofile(filename.c_str());
445 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
446 ofile << "Probe\tb\tse\tz\tn\tb_blup"<< endl;
447 for (i = 0; i < _e_include.size(); i++) {
448 ofile << _probe_name[_e_include[i]] << "\t" << _ecojo_b[i] << "\t" <<_ecojo_se[i] << "\t" << _ecojo_z[i] << "\t" << _ecojo_n[i] << "\t"<< bJ[i] << endl;
449 }
450 ofile.close();
451 }
452
ecojo_inv_R()453 void gcta::ecojo_inv_R() {
454 int i = 0, j = 0, k = 0;
455 string errmsg = "\nError: the correlation matrix is not invertible.";
456
457 double logdet=0.0;
458 if (!comput_inverse_logdet_LDLT_mkl(_ecojo_R, logdet)) {
459 cout<<"Note: no solution to LDLT decomposition. Switching to LU decomposition."<<endl;
460 _ecojo_R = _ecojo_R.lu().solve(eigenMatrix::Identity(_ecojo_R.cols(), _ecojo_R.cols()));
461 }
462 }
463
464