1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementations of functions for BLUP analysis using sumamry data from EWAS
5  *
6  * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 
14 #include "gcta.h"
15 
calcu_eR()16 void gcta::calcu_eR()
17 {
18     // _probe_data rotated
19     eigenMatrix X(_probe_data);
20     _probe_data.resize(_e_include.size(), _keep.size());
21 
22     int i=0, j=0;
23     #pragma omp parallel for private(j)
24     for(i=0; i<_keep.size(); i++){
25         for(j=0; j<_e_include.size(); j++) _probe_data(j,i)=X(_keep[i], _e_include[j]);
26     }
27     eigenVector m(_e_include.size()), nonmiss(_e_include.size());
28 
29     #pragma omp parallel for private(i)
30     for(j=0; j<_e_include.size(); j++){
31         m(j)=0.0;
32         nonmiss(j)=0.0;
33         for(i=0; i<_keep.size(); i++){
34             if(_probe_data(j,i)<1e9){
35                 m(j)+=_probe_data(j,i);
36                 nonmiss(j)+=1.0;
37             }
38         }
39         m(j)/=nonmiss(j);
40     }
41 
42     #pragma omp parallel for private(j)
43     for(i=0; i<_keep.size(); i++){
44         for(j=0; j<_e_include.size(); j++){
45             if(_probe_data(j,i)<1e9) _probe_data(j,i)-=m(j);
46             else _probe_data(j,i)=0.0;
47         }
48     }
49     eigenVector d(_e_include.size());
50 
51     #pragma omp parallel for
52     for(j=0; j<_e_include.size(); j++){
53         d(j)=_probe_data.row(j).dot(_probe_data.row(j));
54     }
55 
56     _ecojo_wholeR=_probe_data*_probe_data.transpose();
57 
58     #pragma omp parallel for private(j)
59     for(i=0; i<_e_include.size(); i++){
60         _ecojo_wholeR(i,i)=1.0;
61         for(j=i+1; j<_e_include.size(); j++) _ecojo_wholeR(i,j)=_ecojo_wholeR(j,i)=_ecojo_wholeR(i,j)/sqrt(d(i)*d(j));
62     }
63     _probe_data.resize(0,0);
64 }
65 
read_eR(string eR_file)66 void gcta::read_eR(string eR_file)
67 {
68     ifstream eR_inf(eR_file.c_str());
69     if (!eR_inf.is_open()) throw ("Error: can not open the file [" + eR_file + "] to read.");
70     cout << "Reading correlation matrix of gene expression from [" + eR_file + "] ..." << endl;
71 
72     string str_buf="";
73     getline(eR_inf, str_buf); // reading the probe names
74     _probe_num = StrFunc::split_string(str_buf, _probe_name, " \t\n");
75     cout<<_probe_num<<" probes found in the file. \nReading correlation matrix ..."<<endl;
76     int i = 0, j = 0;
77     _ecojo_wholeR.resize(_probe_num, _probe_num);
78     for(i = 0; i < _probe_num; i++) {
79         for(j = 0; j < _probe_num; j++) {
80             if(!(eR_inf >> _ecojo_wholeR(i,j))) throw("Error: incorrect format of [" + eR_file + "].");
81         }
82     }
83     eR_inf.close();
84     cout<<"Correlation matrix for "<<_probe_num<<" probes have been included from the file [" + eR_file + "]."<<endl;
85 
86     init_e_include();
87 }
88 
read_e_metafile(string e_metafile)89 void gcta::read_e_metafile(string e_metafile)
90 {
91     cout << "\nReading expression-trait association summary-level statistics from [" + e_metafile + "] ..." << endl;
92     ifstream e_meta(e_metafile.c_str());
93     if (!e_meta) throw ("Error: can not open the file [" + e_metafile + "] to read.");
94 
95     string str_buf="";
96     double d_buf=0.0;
97     vector<string> vs_buf, probe_buf;
98     vector<double> z_buf, n_buf;
99 
100     getline(e_meta, str_buf); // the header line
101     if (StrFunc::split_string(str_buf, vs_buf) < 3) throw ("Error: there needs be at least 3 columns in the file [" + e_metafile + "].");
102     stringstream errmsg;
103     int line=1;
104     while(getline(e_meta, str_buf)){
105         stringstream iss(str_buf);
106         if(!(iss >> str_buf)){ errmsg<<"Error: in line "<<line<<"."; throw(errmsg.str()); }
107         if (_probe_name_map.find(str_buf) == _probe_name_map.end()) continue;
108         probe_buf.push_back(str_buf);
109         if(!(iss >> d_buf)){ errmsg<<"Error: in line "<<line<<"."; throw(errmsg.str()); }
110         z_buf.push_back(d_buf);
111         if(!(iss >> d_buf)){ errmsg<<"Error: in line "<<line<<"."; throw(errmsg.str()); }
112         n_buf.push_back(d_buf);
113         line++;
114     }
115     e_meta.close();
116     if(probe_buf.size()<1) throw("Error: no probe remains in the analysis.");
117     cout << "GWAS summary statistics of " << probe_buf.size() << " probs read from [" + e_metafile + "]." << endl;
118 
119     cout << "Matching the summary data to the genotype data ..." << endl;
120     update_id_map_kp(probe_buf, _probe_name_map, _e_include);
121     vector<int> indx(_e_include.size());
122     map<string, int> id_map;
123     int i=0;
124     for (i = 0; i < probe_buf.size(); i++) id_map.insert(pair<string, int>(probe_buf[i], i));
125     map<string, int>::iterator iter;
126     for (i = 0; i < _e_include.size(); i++) {
127         iter = id_map.find(_probe_name[_e_include[i]]);
128         indx[i] = iter->second;
129     }
130     _ecojo_z.resize(_e_include.size());
131     _ecojo_b.resize(_e_include.size());
132     _ecojo_se.resize(_e_include.size());
133     _ecojo_n.resize(_e_include.size());
134     _ecojo_pval.resize(_e_include.size());
135 
136     //#pragma omp parallel for private(d_buf)
137     for (i = 0; i < _e_include.size(); i++) {
138         _ecojo_z[i]=z_buf[indx[i]];
139         _ecojo_pval[i] = StatFunc::pchisq(_ecojo_z[i]*_ecojo_z[i], 1);
140         _ecojo_n[i]=n_buf[indx[i]];
141         d_buf=sqrt(_ecojo_z[i]*_ecojo_z[i] + _ecojo_n[i]);
142         if(d_buf<1e-30){
143             _ecojo_b[i]=0.0;
144             _ecojo_se[i]=-1.0;
145         }
146         else{
147             _ecojo_b[i]=_ecojo_z[i]/d_buf;
148             _ecojo_se[i]=1.0/d_buf;
149         }
150     }
151  }
152 
run_ecojo_slct(string e_metafile,double p_cutoff,double collinear)153 void gcta::run_ecojo_slct(string e_metafile, double p_cutoff, double collinear)
154 {
155     bool joint_only=false, backward=false;
156     _ecojo_p_cutoff = p_cutoff;
157     _ecojo_collinear = collinear;
158     read_e_metafile(e_metafile);
159     calcu_eR();
160 
161     int i = 0, j = 0;
162     vector<int> slct, remain;
163     eigenVector bC, bC_se, pC;
164     cout << endl;
165     if (!joint_only && !backward) {
166         cout << "Performing stepwise model selection on " << _e_include.size() << " probes to select association signals ... (p cutoff = " << _ecojo_p_cutoff << "; ";
167         cout << "collinearity cutoff = " << _ecojo_collinear << ")"<< endl;
168         ecojo_slct(slct, remain, bC, bC_se, pC);
169         if (slct.empty()) {
170             cout << "No probe has been selected." << endl;
171             return;
172         }
173     }
174     else {
175         for (i = 0; i < _e_include.size(); i++) slct.push_back(i);
176         if (backward) {
177             cout << "Performing backward selection on " << _e_include.size() << " probes at threshold p-value = " << _ecojo_p_cutoff << " ..." << endl;
178             ecojo_slct_stay(slct, bC, bC_se, pC);
179         }
180     }
181 
182     // joint analysis
183     eigenVector bJ, bJ_se, pJ;
184     cout << "Performing joint analysis on all the " << slct.size();
185     if (joint_only) cout << " probes ..." << endl;
186     else cout << " selected signals ..." << endl;
187     if (slct.size() >= _keep.size()) throw ("Error: too many probes. The number of probes in a joint analysis should not be larger than the sample size.");
188     ecojo_joint(slct, bJ, bJ_se, pJ);
189     ecojo_slct_output(joint_only, slct, bJ, bJ_se, pJ);
190 }
191 
ecojo_slct_output(bool joint_only,vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)192 void gcta::ecojo_slct_output(bool joint_only, vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ)
193 {
194     string filename = _out + ".slct.ecojo";
195     if (joint_only) cout << "Saving the joint analysis result of " << slct.size() << " probes to [" + filename + "] ..." << endl;
196     else cout << "Saving the " << slct.size() << " independent signals to [" + filename + "] ..." << endl;
197     ofstream ofile(filename.c_str());
198     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
199     ofile << "Probe\tb\tse\tz\tn\tbJ\tbJ_se\tzJ\tpJ"<< endl;
200     int i = 0, j = 0;
201     for (i = 0; i < slct.size(); i++) {
202         j = slct[i];
203         ofile << _probe_name[_e_include[j]] << "\t" << _ecojo_b[j] << "\t" <<_ecojo_se[j] << "\t" << _ecojo_z[j] << "\t" << _ecojo_n[j] << "\t"<< bJ[i] << "\t" << bJ_se[i] << "\t" << bJ[i]/bJ_se[i] << "\t" << pJ[i] << "\t" << endl;
204     }
205     ofile.close();
206 }
207 
ecojo_slct(vector<int> & slct,vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)208 void gcta::ecojo_slct(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC)
209 {
210     int i = 0, i_buf = 0;
211     vector<double> p_buf;
212     eigenVector2Vector(_ecojo_pval, p_buf);
213     int m = min_element(p_buf.begin(), p_buf.end()) - p_buf.begin();
214     if (p_buf[m] >= _ecojo_p_cutoff) return;
215     slct.push_back(m);
216     for (i = 0; i < _e_include.size(); i++) {
217         if (i != m) remain.push_back(i);
218     }
219     int prev_num = 0;
220     ecojo_init_R(slct);
221     ecojo_init_RC(slct, remain);
222     if (_ecojo_p_cutoff > 1e-3) cout << "Performing forward model selection because the significance level is too low..." << endl;
223 
224     while (!remain.empty()) {
225         if (ecojo_slct_entry(slct, remain, bC, bC_se, pC)) {
226             if (_ecojo_p_cutoff <= 1e-3) ecojo_slct_stay(slct, bC, bC_se, pC);
227             ecojo_init_RC(slct, remain);
228         }
229         else break;
230         if (slct.size() % 5 == 0 && slct.size() > prev_num) cout << slct.size() << " associated probes have been selected." << endl;
231         if (slct.size() > prev_num) prev_num = slct.size();
232     }
233     if (_ecojo_p_cutoff > 1e-3) {
234         cout << "Performing backward elimination..." << endl;
235         ecojo_slct_stay(slct, bC, bC_se, pC);
236     }
237     cout << "Finally, " << slct.size() << " associated probes are selected." << endl;
238 }
239 
ecojo_slct_entry(vector<int> & slct,vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)240 bool gcta::ecojo_slct_entry(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC)
241 {
242     int i = 0, m = 0;
243     ecojo_cond(slct, remain, bC, bC_se, pC);
244     vector<double> pC_buf;
245     eigenVector2Vector(pC, pC_buf);
246 
247     while (true) {
248         m = min_element(pC_buf.begin(), pC_buf.end()) - pC_buf.begin();
249         if (pC_buf[m] >= _ecojo_p_cutoff){
250 
251             ecojo_init_RC(slct, remain);
252             ecojo_cond(slct, remain, bC, bC_se, pC);
253 
254 
255             // debug
256 /*            cout<<"here"<<endl;
257             ofstream tmp("cond.txt");
258             for(int j=0; j<remain.size(); j++){
259                 tmp<<_probe_name[_e_include[remain[j]]]<<" "<<bC[j]<<" "<<bC_se[j]<<" "<<pC[j]<<endl;
260             }
261             tmp.close();
262             ofstream oR("R.txt");
263             for(int j=0; j<_ecojo_R.rows(); j++){
264                 for(int k=0; k<_ecojo_R.cols(); k++) oR<<_ecojo_R(j,k)<<" ";
265                 oR<<endl;
266             }
267             oR.close();
268             ofstream oRC("RC.txt");
269             for(int j=0; j<_ecojo_RC.rows(); j++){
270                 for(int k=0; k<_ecojo_RC.cols(); k++) oRC<<_ecojo_RC(j,k)<<" ";
271                 oRC<<endl;
272             }
273             oRC.close();*/
274 
275             return false;
276         }
277         if (ecojo_insert_R(slct, remain[m])){
278             slct.push_back(remain[m]);
279             stable_sort(slct.begin(), slct.end());
280             remain.erase(remain.begin() + m);
281             return (true);
282         }
283         pC_buf.erase(pC_buf.begin() + m);
284         remain.erase(remain.begin() + m);
285 
286         // debug
287         //cout<<"remain = "<<remain.size()<<endl;
288     }
289 }
290 
ecojo_slct_stay(vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)291 void gcta::ecojo_slct_stay(vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ)
292 {
293     vector<double> pJ_buf;
294     while(!slct.empty()){
295         ecojo_joint(slct, bJ, bJ_se, pJ);
296         eigenVector2Vector(pJ, pJ_buf);
297         int m = max_element(pJ_buf.begin(), pJ_buf.end()) - pJ_buf.begin();
298         if(pJ[m] > _ecojo_p_cutoff){
299             slct.erase(slct.begin() + m);
300             ecojo_erase_R(slct);
301         }
302         else break;
303     }
304 }
305 
ecojo_joint(const vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)306 void gcta::ecojo_joint(const vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ)
307 {
308     int i = 0, size = slct.size();
309     eigenVector b(size), n(size);
310     for(i = 0; i < size; i++){
311         b[i] = _ecojo_b[slct[i]];
312         n[i] = _ecojo_n[slct[i]];
313     }
314     bJ = _ecojo_R*b;
315     bJ_se = eigenVector::Constant(size, -1);
316     pJ = eigenVector::Constant(size, 2);
317     double chisq=0.0;
318     for(i = 0; i < size; i++){
319         bJ_se[i] = _ecojo_R(i,i) / n[i];
320         if (bJ_se[i] > 1.0e-30) {
321             bJ_se[i] = sqrt(bJ_se[i]);
322              chisq = bJ[i] / bJ_se[i];
323             pJ[i] = StatFunc::pchisq(chisq*chisq, 1);
324         }
325     }
326 }
327 
ecojo_cond(const vector<int> & slct,const vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)328 void gcta::ecojo_cond(const vector<int> &slct, const vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC)
329 {
330     int i=0, j=0;
331     eigenVector b1(slct.size()), b2(remain.size());
332     for(i=0; i<slct.size(); i++) b1[i]=_ecojo_b[slct[i]];
333     for(i=0; i<remain.size(); i++) b2[i]=_ecojo_b[remain[i]];
334 
335     bC = eigenVector::Constant(remain.size(), 0);
336     bC_se = eigenVector::Constant(remain.size(), -1);
337     pC = eigenVector::Constant(remain.size(), 2);
338     double chisq = 0.0;
339     eigenVector RCRi;
340     //#pragma omp parallel for private(RCRi)
341     for (i = 0; i < remain.size(); i++) {
342         RCRi=_ecojo_R*_ecojo_RC.col(i);
343         bC[i]=b2[i]-RCRi.dot(b1);
344         bC_se[i] = (1 - _ecojo_RC.col(i).dot(RCRi)) / _ecojo_n[remain[i]];
345         if (bC_se[i] > 1e-30) {
346             bC_se[i] = sqrt(bC_se[i]);
347             chisq = bC[i] / bC_se[i];
348             pC[i] = StatFunc::pchisq(chisq*chisq, 1);
349         }
350     }
351 }
352 
ecojo_init_R(const vector<int> & slct)353 bool gcta::ecojo_init_R(const vector<int> &slct)
354 {
355     int i=0, j=0, size=slct.size();
356     _ecojo_R.resize(size, size);
357 
358     #pragma omp parallel for private(j)
359     for(i=0; i<size; i++){
360         for(j=0; j<size; j++) _ecojo_R(i,j)=_ecojo_wholeR(slct[i],slct[j]);
361     }
362     ecojo_inv_R();
363     if ((1 - eigenVector::Constant(size, 1).array() / _ecojo_R.diagonal().array()).maxCoeff() > _ecojo_collinear) return false;
364 
365     return true;
366 }
367 
ecojo_init_RC(const vector<int> & slct,const vector<int> & remain)368 void gcta::ecojo_init_RC(const vector<int> &slct, const vector<int> &remain) {
369     int i = 0, j = 0;
370     _ecojo_RC.resize(slct.size(), remain.size());
371 
372     #pragma omp parallel for private(j)
373     for (i = 0; i < slct.size(); i++){
374         for (j = 0; j < remain.size(); j++) _ecojo_RC(i,j)=_ecojo_wholeR(slct[i], remain[j]);
375     }
376 }
377 
ecojo_insert_R(const vector<int> & slct,int insert_indx)378 bool gcta::ecojo_insert_R(const vector<int> &slct, int insert_indx)
379 {
380     eigenMatrix R_buf(_ecojo_R);
381     vector<int> ix(slct);
382     ix.push_back(insert_indx);
383     stable_sort(ix.begin(), ix.end());
384     int i = 0, j = 0;
385     _ecojo_R.resize(ix.size(), ix.size());
386 
387     #pragma omp parallel for private(j)
388     for(i=0; i<ix.size(); i++){
389         for(j=0; j<ix.size(); j++) _ecojo_R(i,j)=_ecojo_wholeR(ix[i],ix[j]);
390     }
391 
392     ecojo_inv_R();
393     if((1 - eigenVector::Constant(ix.size(), 1).array() / _ecojo_R.diagonal().array()).maxCoeff() > _ecojo_collinear){
394         _ecojo_R=R_buf;
395         return false;
396     }
397 
398     return true;
399 }
400 
ecojo_erase_R(const vector<int> & slct)401 void gcta::ecojo_erase_R(const vector<int> &slct)
402 {
403     int i = 0, j = 0;
404     _ecojo_R.resize(slct.size(), slct.size());
405 
406     #pragma omp parallel for private(j)
407     for(i=0; i<slct.size(); i++){
408         for(j=0; j<slct.size(); j++) _ecojo_R(i,j)=_ecojo_wholeR(slct[i],slct[j]);
409     }
410     ecojo_inv_R();
411 }
412 
run_ecojo_blup_efile(string e_metafile,double lambda)413 void gcta::run_ecojo_blup_efile(string e_metafile, double lambda)
414 {
415     read_e_metafile(e_metafile);
416     cout << "Recoding gene expression data ..." << endl;
417     calcu_eR();
418     ecojo_blup(lambda);
419 }
420 
run_ecojo_blup_eR(string e_metafile,double lambda)421 void gcta::run_ecojo_blup_eR(string e_metafile, double lambda)
422 {
423     read_e_metafile(e_metafile);
424     ecojo_blup(lambda);
425 }
426 
ecojo_blup(double lambda)427 void gcta::ecojo_blup(double lambda)
428 {
429     cout << "\nPerforming joint analysis on all the " << _e_include.size() << " probes ..." << endl;
430     int i = 0, j=0;
431     double d_n = _ecojo_n.mean();
432     double diag_val=1.0+_e_include.size()*(1.0/lambda-1.0)/d_n;
433     for(i=0; i<_e_include.size(); i++) _ecojo_wholeR(i,i)=diag_val;
434     double logdet=0.0;
435     if (!comput_inverse_logdet_LDLT_mkl(_ecojo_wholeR, logdet)) {
436         cout<<"Note: no solution to LDLT decomposition. Switching to LU decomposition."<<endl;
437         _ecojo_wholeR = _ecojo_wholeR.lu().solve(eigenMatrix::Identity(_e_include.size(), _e_include.size()));
438         //if (!comput_inverse_logdet_LU_mkl(_ecojo_wholeR, logdet)) throw("\nError: the correlation matrix is not invertible.");
439     }
440     eigenVector bJ=_ecojo_wholeR*_ecojo_b;
441 
442     string filename = _out + ".blup.ecojo";
443     cout << "Saving the BLUP analysis result of " << _e_include.size() << " probes to [" + filename + "] ..." << endl;
444     ofstream ofile(filename.c_str());
445     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
446     ofile << "Probe\tb\tse\tz\tn\tb_blup"<< endl;
447     for (i = 0; i < _e_include.size(); i++) {
448         ofile << _probe_name[_e_include[i]] << "\t" << _ecojo_b[i] << "\t" <<_ecojo_se[i] << "\t" << _ecojo_z[i] << "\t" << _ecojo_n[i] << "\t"<< bJ[i] << endl;
449     }
450     ofile.close();
451 }
452 
ecojo_inv_R()453 void gcta::ecojo_inv_R() {
454     int i = 0, j = 0, k = 0;
455     string errmsg = "\nError: the correlation matrix is not invertible.";
456 
457     double logdet=0.0;
458     if (!comput_inverse_logdet_LDLT_mkl(_ecojo_R, logdet)) {
459         cout<<"Note: no solution to LDLT decomposition. Switching to LU decomposition."<<endl;
460         _ecojo_R = _ecojo_R.lu().solve(eigenMatrix::Identity(_ecojo_R.cols(), _ecojo_R.cols()));
461     }
462 }
463 
464