1 #include "gcta.h"
2 
read_metafile(string metafile,bool GC,double GC_val)3 void gcta::read_metafile(string metafile, bool GC, double GC_val) {
4     cout << "\nReading GWAS summary-level statistics from [" + metafile + "] ..." << endl;
5     ifstream Meta(metafile.c_str());
6     if (!Meta) throw ("Error: can not open the file [" + metafile + "] to read.");
7 
8     int i = 0, count = 0;
9     double f_buf = 0.0, b_buf = 0.0, se_buf = 0.0, p_buf = 0.0, N_buf = 0.0, Vp_buf = 0.0, GC_buf = 0.0, chi_buf = 0.0, h_buf = 0.0;
10     string A1_buf, A2_buf;
11     string snp_buf, str_buf0, str_buf;
12 
13     vector<string> snplist, vs_buf, bad_snp;
14     vector<string> ref_A_buf, bad_A1, bad_A2, bad_refA;
15     vector<double> freq_buf, beta_buf, beta_se_buf, pval_buf, N_o_buf, Vp_v_buf, GC_v_buf;
16     map<string, int>::iterator iter;
17     getline(Meta, str_buf); // the header line
18     if (StrFunc::split_string(str_buf, vs_buf) < 7) throw ("Error: format error in the input file [" + metafile + "].");
19     _jma_Vp = 0.0;
20     _GC_val = -1;
21     while (Meta) {
22         getline(Meta, str_buf0);
23         stringstream iss(str_buf0);
24         iss >> snp_buf >> A1_buf >> A2_buf;
25         StrFunc::to_upper(A1_buf);
26         StrFunc::to_upper(A2_buf);
27         iss >> str_buf;
28         f_buf = atof(str_buf.c_str());
29         iss >> str_buf;
30         if (str_buf == "NA" || str_buf == ".") continue;
31         b_buf = atof(str_buf.c_str());
32         iss >> str_buf;
33         if (str_buf == "NA" || str_buf == "." || str_buf == "0") continue;
34         se_buf = atof(str_buf.c_str());
35         iss >> str_buf;
36         if (str_buf == "NA" || str_buf == ".") continue;
37         p_buf = atof(str_buf.c_str());
38         iss >> str_buf;
39         if (str_buf == "NA" || str_buf == ".") continue;
40         N_buf = atof(str_buf.c_str());
41         if (N_buf < 10) throw ("Error: invalid sample size in line:\n\"" + str_buf0 + "\"");
42         if (Meta.eof()) break;
43         iter = _snp_name_map.find(snp_buf);
44         h_buf = 2.0 * f_buf * (1.0 - f_buf);
45         Vp_buf = h_buf * N_buf * se_buf * se_buf + h_buf * b_buf * b_buf * N_buf / (N_buf - 1.0);
46         if (Vp_buf < 0.0) throw ("Error: in line:\n\"" + str_buf0 + "\"");
47         Vp_v_buf.push_back(Vp_buf);
48         if (GC) {
49             GC_buf = b_buf * b_buf / se_buf / se_buf;
50             if (GC_buf < 0) throw ("Error: in line:\n\"" + str_buf0 + "\"");
51             GC_v_buf.push_back(GC_buf);
52         }
53         count++;
54         if (iter == _snp_name_map.end()) continue;
55         i = iter->second;
56         if (A1_buf != _allele1[i] && A1_buf != _allele2[i]) {
57             bad_snp.push_back(_snp_name[i]);
58             bad_A1.push_back(_allele1[i]);
59             bad_A2.push_back(_allele2[i]);
60             bad_refA.push_back(A1_buf);
61             continue;
62         }
63         snplist.push_back(snp_buf);
64         ref_A_buf.push_back(A1_buf);
65         freq_buf.push_back(f_buf);
66         beta_buf.push_back(b_buf);
67         beta_se_buf.push_back(se_buf);
68         pval_buf.push_back(p_buf);
69         N_o_buf.push_back(N_buf);
70     }
71     Meta.close();
72     cout << "GWAS summary statistics of " << count << " SNPs read from [" + metafile + "]." << endl;
73     _jma_Vp = CommFunc::median(Vp_v_buf);
74     cout << "Phenotypic variance estimated from summary statistics of all " << count << " SNPs: " << _jma_Vp << " (variance of logit for case-control studies)." << endl;
75     if (GC) {
76         if (GC_val > 0) {
77             _GC_val = GC_val;
78             cout << "User specified genomic inflation factor: " << _GC_val << endl;
79         } else {
80             _GC_val = CommFunc::median(GC_v_buf) / 0.455;
81             cout << "Genomic inflation factor calcualted from " << count << " SNPs: " << _GC_val << endl;
82         }
83         cout << "p-values wil be adjusted by the genomic control approach." << endl;
84     }
85 
86     cout << "Matching the GWAS meta-analysis results to the genotype data ..." << endl;
87     update_id_map_kp(snplist, _snp_name_map, _include);
88     vector<int> indx(_include.size());
89     map<string, int> id_map;
90     for (i = 0; i < snplist.size(); i++) id_map.insert(pair<string, int>(snplist[i], i));
91     for (i = 0; i < _include.size(); i++) {
92         iter = id_map.find(_snp_name[_include[i]]);
93         indx[i] = iter->second;
94         _ref_A[_include[i]] = ref_A_buf[iter->second];
95         if (!_mu.empty() && ref_A_buf[iter->second] == _allele2[_include[i]]) _mu[_include[i]] = 2.0 - _mu[_include[i]];
96     }
97     if (!bad_snp.empty()) {
98         string badsnpfile = _out + ".badsnps";
99         ofstream obadsnp(badsnpfile.c_str());
100         obadsnp << "SNP\tA1\tA2\tRefA" << endl;
101         for (i = 0; i < bad_snp.size(); i++) obadsnp << bad_snp[i] << "\t" << bad_A1[i] << "\t" << bad_A2[i] << "\t" << bad_refA[i] << endl;
102         obadsnp.close();
103         cout << "Warning: can't match the reference alleles of " << bad_snp.size() << " SNPs to those in the genotype data. These SNPs have been saved in [" + badsnpfile + "]." << endl;
104     }
105     if (_include.empty()) throw ("Error: all the SNPs in the GWAS meta-analysis results can't be found in the genotype data.");
106     else cout << _include.size() << " SNPs are matched to the genotype data." << endl;
107 
108     if (_mu.empty()) calcu_mu();
109 
110     _freq.resize(_include.size());
111     _beta.resize(_include.size());
112     _beta_se.resize(_include.size());
113     _pval.resize(_include.size());
114     _N_o.resize(_include.size());
115     for (i = 0; i < _include.size(); i++) {
116         _freq[i] = freq_buf[indx[i]];
117         _beta[i] = beta_buf[indx[i]];
118         _beta_se[i] = beta_se_buf[indx[i]];
119         if (GC) {
120             chi_buf = _beta[i] / _beta_se[i];
121             _pval[i] = StatFunc::pchisq(chi_buf * chi_buf / _GC_val, 1);
122         } else _pval[i] = pval_buf[indx[i]];
123         _N_o[i] = N_o_buf[indx[i]];
124     }
125     _jma_Ve = _jma_Vp;
126 }
127 
init_massoc(string metafile,bool GC,double GC_val)128 void gcta::init_massoc(string metafile, bool GC, double GC_val)
129 {
130     read_metafile(metafile, GC, GC_val);
131 
132     int i = 0, j = 0, n = _keep.size(), m = _include.size();
133     cout << "Calculating the variance of SNP genotypes ..." << endl;
134     _MSX_B.resize(m);
135     _Nd.resize(m);
136 
137     if (_mu.empty()) calcu_mu();
138     #pragma omp parallel for
139     for (i = 0; i < m; i++){
140         eigenVector x;
141         makex_eigenVector(i, x, true, true);
142         _MSX_B[i] = x.squaredNorm() / (double)n;
143     }
144     if (_jma_actual_geno) {
145         _MSX = _MSX_B;
146         _Nd = _N_o;
147     } else {
148         _MSX = 2.0 * _freq.array()*(1.0 - _freq.array());
149         for (i = 0; i < m; i++) _Nd[i] = (_jma_Vp - _MSX[i] * _beta[i] * _beta[i]) / (_MSX[i] * _beta_se[i] * _beta_se[i]) + 1; // revised by JY 25/11/13 according to Eq. 13, Yang et al. 2012 NG
150     }
151 }
152 
read_fixed_snp(string snplistfile,string msg,vector<int> & pgiven,vector<int> & remain)153 void gcta::read_fixed_snp(string snplistfile, string msg, vector<int> &pgiven, vector<int> &remain) {
154     int i = 0, j = 0;
155     vector<string> givenSNPs;
156     read_snplist(snplistfile, givenSNPs, msg);
157     if (givenSNPs.empty()) throw ("Error: failed to read any SNP from the file [" + snplistfile + "].");
158     map<string, int> givenSNPs_map;
159     pgiven.clear();
160     remain.clear();
161     for (i = 0; i < givenSNPs.size(); i++) givenSNPs_map.insert(pair<string, int>(givenSNPs[i], i));
162     for (i = 0; i < _include.size(); i++) {
163         if (givenSNPs_map.find(_snp_name[_include[i]]) != givenSNPs_map.end()) pgiven.push_back(i);
164         else remain.push_back(i);
165     }
166     if (pgiven.size() > 0) cout << pgiven.size() << " of them are matched to the genotype and summary data." << endl;
167     else throw ("Error: none of the given SNPs can be matched to the genotype and summary data.");
168 }
169 
run_massoc_slct(string metafile,int wind_size,double p_cutoff,double collinear,int top_SNPs,bool joint_only,bool GC,double GC_val,bool actual_geno,int mld_slct_alg)170 void gcta::run_massoc_slct(string metafile, int wind_size, double p_cutoff, double collinear, int top_SNPs, bool joint_only, bool GC, double GC_val, bool actual_geno, int mld_slct_alg)
171 {
172     _jma_actual_geno = actual_geno;
173     _jma_wind_size = wind_size;
174     _jma_p_cutoff = p_cutoff;
175     _jma_collinear = collinear;
176     _jma_snpnum_backward = 0;
177     _jma_snpnum_collienar = 0;
178     init_massoc(metafile, GC, GC_val);
179     if (top_SNPs < 0) top_SNPs = 1e30;
180     else {
181         _jma_p_cutoff = 0.5;
182         cout << "The threshold p-value has been set to 0.5 because of the --massoc-top-SNPs option." << endl;
183     }
184     int i = 0, j = 0;
185     vector<int> slct, remain;
186     eigenVector bC, bC_se, pC;
187     cout << endl;
188     if (!joint_only && mld_slct_alg<2) {
189         cout << "Performing "<< ((mld_slct_alg==0)?"stepwise":"forward") << " model selection on " << _include.size() << " SNPs to select association signals ... (p cutoff = " << _jma_p_cutoff << "; ";
190         cout << "collinearity cutoff = " << _jma_collinear << ")"<< endl;
191         if (!_jma_actual_geno) cout << "(Assuming complete linkage equilibrium between SNPs which are more than " << _jma_wind_size / 1e6 << "Mb away from each other)" << endl;
192         stepwise_slct(slct, remain, bC, bC_se, pC, mld_slct_alg, top_SNPs);
193         if (slct.empty()) {
194             cout << "No SNPs have been selected." << endl;
195             return;
196         }
197     } else { // mld_slct_alg = 2 for backward selection
198         for (i = 0; i < _include.size(); i++) slct.push_back(i);
199         if (mld_slct_alg==2) {
200             cout << "Performing backward selection on " << _include.size() << " SNPs at threshold p-value = " << _jma_p_cutoff << " ..." << endl;
201             slct_stay(slct, bC, bC_se, pC);
202         }
203     }
204 
205     // joint analysis
206     eigenVector bJ, bJ_se, pJ;
207     cout << "Performing joint analysis on all the " << slct.size();
208     if (joint_only) cout << " SNPs ..." << endl;
209     else cout << " selected signals ..." << endl;
210     if (slct.size() >= _keep.size()) throw ("Error: too many SNPs. The number of SNPs in a joint analysis should not be larger than the sample size.");
211     massoc_joint(slct, bJ, bJ_se, pJ);
212     eigenMatrix rval(slct.size(), slct.size());
213     LD_rval(slct, rval);
214     if (_jma_actual_geno) cout << "Residual varaince = " << _jma_Ve << endl;
215     massoc_slct_output(joint_only, slct, bJ, bJ_se, pJ, rval);
216 
217     // output conditional results
218     if (!joint_only && !mld_slct_alg==2) {
219         massoc_cond_output(remain, bC, bC_se, pC);
220         cout << "(" << _jma_snpnum_backward << " SNPs eliminated by backward selection and " << _jma_snpnum_collienar << " SNPs filtered by collinearity test are not included in the output)" << endl;
221     }
222 }
223 
run_massoc_cond(string metafile,string snplistfile,int wind_size,double collinear,bool GC,double GC_val,bool acutal_geno)224 void gcta::run_massoc_cond(string metafile, string snplistfile, int wind_size, double collinear, bool GC, double GC_val, bool acutal_geno) {
225     _jma_actual_geno = acutal_geno;
226     _jma_wind_size = wind_size;
227     _jma_collinear = collinear;
228     init_massoc(metafile, GC, GC_val);
229     vector<int> pgiven, remain;
230     read_fixed_snp(snplistfile, "given SNPs", pgiven, remain);
231 
232     eigenVector bC, bC_se, pC;
233     cout << "Performing single-SNP association analysis conditional on the " << pgiven.size() << " given SNPs ... ";
234     cout << "(collinearity cutoff = " << _jma_collinear << ")" << endl;
235     if (!_jma_actual_geno) cout << "(Assuming complete linkage equilibrium between SNPs which are more than " << _jma_wind_size / 1e6 << "Mb away from each other)" << endl;
236 
237     string filename = _out + ".given.cojo";
238     cout<<"Saving the summary statistics of the given SNPs in the file [" + filename + "] ..." << endl;
239     ofstream ofile(filename.c_str());
240     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
241     ofile << "Chr\tSNP\tbp\trefA\tfreq\tb\tse\tp" << endl;
242     int i = 0, j = 0;
243     for (i = 0; i < pgiven.size(); i++) {
244         j = pgiven[i];
245         ofile << _chr[_include[j]] << "\t" << _snp_name[_include[j]] << "\t" << _bp[_include[j]] << "\t" << _ref_A[_include[j]] << "\t" << _freq[j] << "\t" << _beta[j] << "\t" << _beta_se[j] << "\t" << _pval[j] << endl;
246     }
247     ofile.close();
248 
249     massoc_cond(pgiven, remain, bC, bC_se, pC);
250     massoc_cond_output(remain, bC, bC_se, pC);
251 }
252 
massoc_slct_output(bool joint_only,vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ,eigenMatrix & rval)253 void gcta::massoc_slct_output(bool joint_only, vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ, eigenMatrix &rval)
254 {
255     string filename = _out + ".jma.cojo";
256     if (joint_only) cout << "Saving the joint analysis result of " << slct.size() << " SNPs to [" + filename + "] ..." << endl;
257     else cout << "Saving the " << slct.size() << " independent signals to [" + filename + "] ..." << endl;
258     ofstream ofile(filename.c_str());
259     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
260     ofile << "Chr\tSNP\tbp\trefA\tfreq\tb\tse\tp" << ((_GC_val > 0) ? "_GC" : "") << "\tn\tfreq_geno\tbJ\tbJ_se\tpJ" << ((_GC_val > 0) ? "_GC" : "") << "\tLD_r" << endl;
261     int i = 0, j = 0;
262     for (i = 0; i < slct.size(); i++) {
263         j = slct[i];
264         ofile << _chr[_include[j]] << "\t" << _snp_name[_include[j]] << "\t" << _bp[_include[j]] << "\t";
265         ofile << _ref_A[_include[j]] << "\t" << _freq[j] << "\t" << _beta[j] << "\t" << _beta_se[j] << "\t";
266         ofile << _pval[j] << "\t" << _Nd[j] << "\t" << 0.5 * _mu[_include[j]] << "\t" << bJ[i] << "\t" << bJ_se[i] << "\t" << pJ[i] << "\t";
267         if (i == slct.size() - 1) ofile << 0 << endl;
268         else ofile << rval(i, i + 1) << endl;
269     }
270     ofile.close();
271 
272     filename = _out + ".ldr.cojo";
273     if (joint_only) cout << "Saving the LD structure of " << slct.size() << " SNPs to [" + filename + "] ..." << endl;
274     else cout << "Saving the LD structure of " << slct.size() << " independent signals to [" + filename + "] ..." << endl;
275     ofile.open(filename.c_str());
276     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
277     ofile << "SNP\t";
278     for (i = 0; i < slct.size(); i++) ofile << _snp_name[_include[slct[i]]] << "\t";
279     ofile << endl;
280     for (i = 0; i < slct.size(); i++) {
281         ofile << _snp_name[_include[slct[i]]] << "\t";
282         for (j = 0; j < slct.size(); j++) ofile << rval(i, j) << "\t";
283         ofile << endl;
284     }
285     ofile.close();
286 }
287 
massoc_cond_output(vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)288 void gcta::massoc_cond_output(vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC)
289 {
290     int i = 0, j = 0;
291     string filename = _out + ".cma.cojo";
292     cout << "Saving the conditional analysis results of " << remain.size() << " remaining SNPs to [" + filename + "] ..." << endl;
293     ofstream ofile(filename.c_str());
294     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
295     ofile << "Chr\tSNP\tbp\trefA\tfreq\tb\tse\tp" << ((_GC_val > 0) ? "_GC" : "") << "\tn\tfreq_geno\tbC\tbC_se\tpC" << ((_GC_val > 0) ? "_GC" : "") << endl;
296     for (i = 0; i < remain.size(); i++) {
297         j = remain[i];
298         ofile << _chr[_include[j]] << "\t" << _snp_name[_include[j]] << "\t" << _bp[_include[j]] << "\t";
299         ofile << _ref_A[_include[j]] << "\t" << _freq[j] << "\t" << _beta[j] << "\t" << _beta_se[j] << "\t";
300         ofile << _pval[j] << "\t" << _Nd[j] << "\t" << 0.5 * _mu[_include[j]] << "\t";
301         if (pC[i] > 1.5) ofile << "NA\tNA\tNA" << endl;
302         else ofile << bC[i] << "\t" << bC_se[i] << "\t" << pC[i] << endl;
303     }
304     ofile.close();
305 }
306 
stepwise_slct(vector<int> & slct,vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC,int mld_slct_alg,int top_SNPs)307 void gcta::stepwise_slct(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC, int mld_slct_alg, int top_SNPs)
308 {
309     int i = 0, i_buf = 0;
310     vector<double> p_buf;
311     eigenVector2Vector(_pval, p_buf);
312     int m = min_element(p_buf.begin(), p_buf.end()) - p_buf.begin();
313     if (p_buf[m] >= _jma_p_cutoff) return;
314     slct.push_back(m);
315     for (i = 0; i < _include.size(); i++) {
316         if (i != m) remain.push_back(i);
317     }
318     int prev_num = 0;
319     if (mld_slct_alg==0 && _jma_p_cutoff > 1e-3){
320         cout << "Switched to perform a forward model selection because the significance level is too low..." << endl;
321         mld_slct_alg=1;
322     }
323     while (!remain.empty()) {
324         if (slct_entry(slct, remain, bC, bC_se, pC)) {
325             if (mld_slct_alg==0) slct_stay(slct, bC, bC_se, pC);
326         }
327         else break;
328         if (slct.size() % 5 == 0 && slct.size() > prev_num) cout << slct.size() << " associated SNPs have been selected." << endl;
329         if (slct.size() > prev_num) prev_num = slct.size();
330         if (slct.size() >= top_SNPs) break;
331     }
332     if (_jma_p_cutoff > 1e-3) {
333         cout << "Performing backward elimination..." << endl;
334         slct_stay(slct, bC, bC_se, pC);
335     }
336     cout << "Finally, " << slct.size() << " associated SNPs are selected." << endl;
337 }
338 
slct_entry(vector<int> & slct,vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)339 bool gcta::slct_entry(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC) {
340     int i = 0, m = 0;
341     massoc_cond(slct, remain, bC, bC_se, pC);
342     vector<double> pC_buf;
343     eigenVector2Vector(pC, pC_buf);
344     while (true) {
345         m = min_element(pC_buf.begin(), pC_buf.end()) - pC_buf.begin();
346         if (pC_buf[m] >= _jma_p_cutoff) return (false);
347         if (insert_B_and_Z(slct, remain[m])) {
348             slct.push_back(remain[m]);
349             stable_sort(slct.begin(), slct.end());
350             remain.erase(remain.begin() + m);
351             return (true);
352         }
353         pC_buf.erase(pC_buf.begin() + m);
354         remain.erase(remain.begin() + m);
355     }
356 }
357 
slct_stay(vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)358 void gcta::slct_stay(vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ) {
359     if (_B_N.cols() < 1) {
360         if (!init_B(slct)) throw ("Error: there is a collinearity problem of the given list of SNPs.\nYou can try the option --massoc-slct to get rid of one of each pair of highly correlated SNPs.");
361     }
362 
363     vector<double> pJ_buf;
364     while (!slct.empty()) {
365         massoc_joint(slct, bJ, bJ_se, pJ);
366         eigenVector2Vector(pJ, pJ_buf);
367         int m = max_element(pJ_buf.begin(), pJ_buf.end()) - pJ_buf.begin();
368         if (pJ[m] > _jma_p_cutoff) {
369             _jma_snpnum_backward++;
370             erase_B_and_Z(slct, slct[m]);
371             slct.erase(slct.begin() + m);
372         } else break;
373     }
374 }
375 
eigenVector2Vector(eigenVector & x,vector<double> & y)376 void gcta::eigenVector2Vector(eigenVector &x, vector<double> &y) {
377     y.resize(x.size());
378     for (int i = 0; i < x.size(); i++) y[i] = x[i];
379 }
380 
massoc_calcu_Ve(const vector<int> & slct,eigenVector & bJ,eigenVector & b)381 double gcta::massoc_calcu_Ve(const vector<int> &slct, eigenVector &bJ, eigenVector &b) {
382     double Ve = 0.0;
383     int n = bJ.size();
384     vector<double> Nd_buf(n);
385     for (int k = 0; k < n; k++) {
386         Nd_buf[k] = _Nd[slct[k]];
387         Ve += _D_N[k] * bJ[k] * b[k];
388     }
389     double d_buf = CommFunc::median(Nd_buf);
390     if (d_buf - n < 1) throw ("Error: no degree of freedom left for the residues, the model is over-fitting. Please specify a more stringent p cutoff value.");
391     Ve = ((d_buf - 1) * _jma_Vp - Ve) / (d_buf - n);
392     if (Ve <= 0.0) throw ("Error: residual variance is out of boundary, the model is over-fitting. Please specify a more stringent p cutoff value.");
393     return Ve;
394 }
395 
massoc_joint(const vector<int> & indx,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)396 void gcta::massoc_joint(const vector<int> &indx, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ) {
397     if (_B_N.cols() < 1) {
398         if (!init_B(indx)) throw ("Error: there is a collinearity problem of the given list of SNPs.\nYou can try the option --massoc-slct to get rid of one of each pair of highly correlated SNPs.");
399     }
400 
401     int i = 0, n = indx.size();
402     double chisq = 0.0;
403     eigenVector b(n);
404     for (i = 0; i < n; i++) b[i] = _beta[indx[i]];
405     bJ.resize(n);
406     bJ_se.resize(n);
407     pJ.resize(n);
408     bJ = _B_N_i * _D_N.asDiagonal() * b;
409     bJ_se = _B_N_i.diagonal();
410     pJ = eigenVector::Ones(n);
411     if (_jma_actual_geno) _jma_Ve = massoc_calcu_Ve(indx, bJ, b);
412     bJ_se *= _jma_Ve;
413     for (i = 0; i < n; i++) {
414         if (bJ_se[i] > 1.0e-7) {
415             bJ_se[i] = sqrt(bJ_se[i]);
416             chisq = bJ[i] / bJ_se[i];
417             if (_GC_val > 0) pJ[i] = StatFunc::pchisq(chisq * chisq / _GC_val, 1);
418             else pJ[i] = StatFunc::pchisq(chisq*chisq, 1);
419         } else {
420             bJ[i] = 0.0;
421             bJ_se[i] = 0.0;
422         }
423     }
424 }
425 
massoc_cond(const vector<int> & slct,const vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)426 void gcta::massoc_cond(const vector<int> &slct, const vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC) {
427     if (_B_N.cols() < 1) {
428         if (!init_B(slct)) throw ("Error: there is a collinearity problem of the given list of SNPs.\nYou can try the option --massoc-slct to get rid of one of each pair of highly correlated SNPs.");
429     }
430     if (_Z_N.cols() < 1) init_Z(slct);
431 
432     int i = 0, j = 0, n = slct.size();
433     double chisq = 0.0;
434     eigenVector b(n), se(n);
435     for (i = 0; i < n; i++) {
436         b[i] = _beta[slct[i]];
437         se[i] = _beta_se[slct[i]];
438     }
439     eigenVector bJ1 = _B_N_i * _D_N.asDiagonal() * b;
440     if (_jma_actual_geno) _jma_Ve = massoc_calcu_Ve(slct, bJ1, b);
441 
442     double B2 = 0.0;
443     bC = eigenVector::Zero(remain.size());
444     bC_se = eigenVector::Zero(remain.size());
445     pC = eigenVector::Constant(remain.size(), 2);
446     eigenVector Z_Bi(n), Z_Bi_buf(n);
447     for (i = 0; i < remain.size(); i++) {
448         j = remain[i];
449         B2 = _MSX[j] * _Nd[j];
450         if (!CommFunc::FloatEqual(B2, 0.0)) {
451             Z_Bi = _Z_N.col(j).transpose() * _B_N_i;
452             Z_Bi_buf = _Z.col(j).transpose() * _B_i;
453             if (_Z.col(j).dot(Z_Bi_buf) / _MSX_B[j] < _jma_collinear) {
454                 bC[i] = _beta[j] - Z_Bi.cwiseProduct(_D_N).dot(b) / B2;
455                 bC_se[i] = (B2 - _Z_N.col(j).dot(Z_Bi)) / (B2 * B2);
456             }
457         }
458         if (_jma_actual_geno) bC_se[i] *= _jma_Ve - (B2 * bC[i] * _beta[j]) / (_Nd[j] - n - 1);
459         else bC_se[i] *= _jma_Ve;
460         if (bC_se[i] > 1e-7) {
461             bC_se[i] = sqrt(bC_se[i]);
462             chisq = bC[i] / bC_se[i];
463             if (_GC_val > 0) pC[i] = StatFunc::pchisq(chisq * chisq / _GC_val, 1);
464             else pC[i] = StatFunc::pchisq(chisq*chisq, 1);
465         }
466     }
467 }
468 
init_B(const vector<int> & indx)469 bool gcta::init_B(const vector<int> &indx)
470 {
471     int i = 0, j = 0, k = 0, n = _keep.size();
472     double d_buf = 0.0;
473     eigenVector diagB(indx.size());
474     _B.resize(indx.size(), indx.size());
475     _B_N.resize(indx.size(), indx.size());
476     _D_N.resize(indx.size());
477     eigenVector x_i(_keep.size()), x_j(_keep.size());
478     for (i = 0; i < indx.size(); i++) {
479         _D_N[i] = _MSX[indx[i]] * _Nd[indx[i]];
480         _B.startVec(i);
481         _B_N.startVec(i);
482         _B.insertBack(i, i) = _MSX_B[indx[i]];
483         _B_N.insertBack(i, i) = _D_N[i];
484         diagB[i] = _MSX_B[indx[i]];
485         makex_eigenVector(indx[i], x_i, false, true);
486         for (j = i + 1; j < indx.size(); j++) {
487             if (_jma_actual_geno || (_chr[_include[indx[i]]] == _chr[_include[indx[j]]] && abs(_bp[_include[indx[i]]] - _bp[_include[indx[j]]]) < _jma_wind_size)) {
488                 makex_eigenVector(indx[j], x_j, false, true);
489                 d_buf = x_i.dot(x_j) / (double)n;
490                 _B.insertBack(j, i) = d_buf;
491                 _B_N.insertBack(j, i) = d_buf * min(_Nd[indx[i]], _Nd[indx[j]]) * sqrt(_MSX[indx[i]] * _MSX[indx[j]] / (_MSX_B[indx[i]] * _MSX_B[indx[j]]));
492             }
493         }
494     }
495     _B.finalize();
496     _B_N.finalize();
497 
498     LDLT<eigenMatrix> ldlt_B(_B);
499     if (ldlt_B.vectorD().minCoeff() < 0 || sqrt(ldlt_B.vectorD().maxCoeff() / ldlt_B.vectorD().minCoeff()) > 30) return false;
500     _B_i = eigenMatrix::Identity(indx.size(), indx.size());
501     ldlt_B.solveInPlace(_B_i);
502     if ((1 - eigenVector::Constant(indx.size(), 1).array() / (diagB.array() * _B_i.diagonal().array())).maxCoeff() > _jma_collinear) return false;
503     LDLT<eigenMatrix> ldlt_B_N(_B_N);
504     _B_N_i = eigenMatrix::Identity(indx.size(), indx.size());
505     ldlt_B_N.solveInPlace(_B_N_i);
506     return true;
507 }
508 
init_Z(const vector<int> & indx)509 void gcta::init_Z(const vector<int> &indx)
510 {
511     int i = 0, j = 0, n = _keep.size();
512     double d_buf = 0.0;
513     _Z.resize(indx.size(), _include.size());
514     _Z_N.resize(indx.size(), _include.size());
515     eigenVector x_i(_keep.size()), x_j(_keep.size());
516     for (j = 0; j < _include.size(); j++) {
517         _Z.startVec(j);
518         _Z_N.startVec(j);
519         makex_eigenVector(j, x_j, false, true);
520         for (i = 0; i < indx.size(); i++) {
521             if (_jma_actual_geno || (indx[i] != j && _chr[_include[indx[i]]] == _chr[_include[j]] && abs(_bp[_include[indx[i]]] - _bp[_include[j]]) < _jma_wind_size)) {
522                 makex_eigenVector(indx[i], x_i, false, true);
523                 d_buf = x_j.dot(x_i) / (double)n;
524                 _Z.insertBack(i, j) = d_buf;
525                 _Z_N.insertBack(i, j) = d_buf * min(_Nd[indx[i]], _Nd[j]) * sqrt(_MSX[indx[i]] * _MSX[j] / (_MSX_B[indx[i]] * _MSX_B[j])); // added by Jian Yang 18/12/2013
526             }
527         }
528     }
529     _Z.finalize();
530     _Z_N.finalize();
531 }
532 
insert_B_and_Z(const vector<int> & indx,int insert_indx)533 bool gcta::insert_B_and_Z(const vector<int> &indx, int insert_indx)
534 {
535     int i = 0, j = 0, n = _keep.size();
536     double d_buf = 0.0;
537     vector<int> ix(indx);
538     ix.push_back(insert_indx);
539     stable_sort(ix.begin(), ix.end());
540     eigenSparseMat B_buf(_B), B_N_buf(_B_N);
541     _B.resize(ix.size(), ix.size());
542     _B_N.resize(ix.size(), ix.size());
543     bool get_insert_col = false, get_insert_row = false;
544     int pos = find(ix.begin(), ix.end(), insert_indx) - ix.begin();
545     eigenVector diagB(ix.size());
546     eigenVector x_i(_keep.size()), x_j(_keep.size());
547     for (j = 0; j < ix.size(); j++) {
548         _B.startVec(j);
549         _B_N.startVec(j);
550         _B.insertBack(j, j) = _MSX_B[ix[j]];
551         _B_N.insertBack(j, j) = _MSX[ix[j]] * _Nd[ix[j]];
552         diagB[j] = _MSX_B[ix[j]];
553         if (insert_indx == ix[j]) get_insert_col = true;
554         get_insert_row = get_insert_col;
555         makex_eigenVector(ix[j], x_j, false, true);
556         for (i = j + 1; i < ix.size(); i++) {
557             if (insert_indx == ix[i]) get_insert_row = true;
558             if (insert_indx == ix[j] || insert_indx == ix[i]) {
559                 if (_jma_actual_geno || (_chr[_include[ix[i]]] == _chr[_include[ix[j]]] && abs(_bp[_include[ix[i]]] - _bp[_include[ix[j]]]) < _jma_wind_size)) {
560                     makex_eigenVector(ix[i], x_i, false, true);
561                     d_buf = x_i.dot(x_j) / (double)n;
562                     _B.insertBack(i, j) = d_buf;
563                     _B_N.insertBack(i, j) = d_buf * min(_Nd[ix[i]], _Nd[ix[j]]) * sqrt(_MSX[ix[i]] * _MSX[ix[j]] / (_MSX_B[ix[i]] * _MSX_B[ix[j]]));
564                 }
565             } else {
566                 if (B_buf.coeff(i - get_insert_row, j - get_insert_col) != 0) {
567                     _B.insertBack(i, j) = B_buf.coeff(i - get_insert_row, j - get_insert_col);
568                     _B_N.insertBack(i, j) = B_N_buf.coeff(i - get_insert_row, j - get_insert_col);
569                 }
570             }
571         }
572     }
573     _B.finalize();
574     _B_N.finalize();
575     LDLT<eigenMatrix> ldlt_B(_B);
576     _B_i = eigenMatrix::Identity(ix.size(), ix.size());
577     ldlt_B.solveInPlace(_B_i);
578     if (ldlt_B.vectorD().minCoeff() < 0 || sqrt(ldlt_B.vectorD().maxCoeff() / ldlt_B.vectorD().minCoeff()) > 30 || (1 - eigenVector::Constant(ix.size(), 1).array() / (diagB.array() * _B_i.diagonal().array())).maxCoeff() > _jma_collinear) {
579         _jma_snpnum_collienar++;
580         _B = B_buf;
581         _B_N = B_N_buf;
582         return false;
583     }
584     LDLT<eigenMatrix> ldlt_B_N(_B_N);
585     _B_N_i = eigenMatrix::Identity(ix.size(), ix.size());
586     ldlt_B_N.solveInPlace(_B_N_i);
587     _D_N.resize(ix.size());
588     for (j = 0; j < ix.size(); j++) {
589         _D_N[j] = _MSX[ix[j]] * _Nd[ix[j]];
590     }
591 
592     if (_Z_N.cols() < 1) return true;
593     eigenSparseMat Z_buf(_Z), Z_N_buf(_Z_N);
594     _Z.resize(ix.size(), _include.size());
595     _Z_N.resize(ix.size(), _include.size());
596     for (j = 0; j < _include.size(); j++) {
597         _Z.startVec(j);
598         _Z_N.startVec(j);
599         get_insert_row = false;
600         makex_eigenVector(j, x_j, false, true);
601         for (i = 0; i < ix.size(); i++) {
602             if (insert_indx == ix[i]) {
603                 if (_jma_actual_geno || (ix[i] != j && _chr[_include[ix[i]]] == _chr[_include[j]] && abs(_bp[_include[ix[i]]] - _bp[_include[j]]) < _jma_wind_size)) {
604                     makex_eigenVector(ix[i], x_i, false, true);
605                     d_buf = x_j.dot(x_i) / (double)n;
606                     _Z.insertBack(i, j) = d_buf;
607                     _Z_N.insertBack(i, j) = d_buf * min(_Nd[ix[i]], _Nd[j]) * sqrt(_MSX[ix[i]] * _MSX[j] / (_MSX_B[ix[i]] * _MSX_B[j])); // added by Jian Yang 18/12/2013
608                 }
609                 get_insert_row = true;
610             } else {
611                 if (Z_buf.coeff(i - get_insert_row, j) != 0) {
612                     _Z.insertBack(i, j) = Z_buf.coeff(i - get_insert_row, j);
613                     _Z_N.insertBack(i, j) = Z_N_buf.coeff(i - get_insert_row, j);
614                 }
615             }
616         }
617     }
618     _Z.finalize();
619     _Z_N.finalize();
620 
621     return true;
622 }
623 
erase_B_and_Z(const vector<int> & indx,int erase_indx)624 void gcta::erase_B_and_Z(const vector<int> &indx, int erase_indx) {
625     int i = 0, j = 0;
626     eigenSparseMat B_dense(_B), B_N_dense(_B_N);
627     _B.resize(indx.size() - 1, indx.size() - 1);
628     _B_N.resize(indx.size() - 1, indx.size() - 1);
629     _D_N.resize(indx.size() - 1);
630     int pos = find(indx.begin(), indx.end(), erase_indx) - indx.begin();
631     bool get_insert_col = false, get_insert_row = false;
632     for (j = 0; j < indx.size(); j++) {
633         if (erase_indx == indx[j]) {
634             get_insert_col = true;
635             continue;
636         }
637         _B.startVec(j - get_insert_col);
638         _B_N.startVec(j - get_insert_col);
639         _D_N[j - get_insert_col] = _MSX[indx[j]] * _Nd[indx[j]];
640         get_insert_row = get_insert_col;
641         for (i = j; i < indx.size(); i++) {
642             if (erase_indx == indx[i]) {
643                 get_insert_row = true;
644                 continue;
645             }
646             if (B_dense.coeff(i, j) != 0) {
647                 _B.insertBack(i - get_insert_row, j - get_insert_col) = B_dense.coeff(i, j);
648                 _B_N.insertBack(i - get_insert_row, j - get_insert_col) = B_N_dense.coeff(i, j);
649             }
650         }
651     }
652     _B.finalize();
653     _B_N.finalize();
654 
655     if (_Z_N.cols() < 1) return;
656 
657     LDLT<eigenMatrix> ldlt_B(_B);
658     _B_i = eigenMatrix::Identity(indx.size() - 1, indx.size() - 1);
659     ldlt_B.solveInPlace(_B_i);
660     LDLT<eigenMatrix> ldlt_B_N(_B_N);
661     _B_N_i = eigenMatrix::Identity(indx.size() - 1, indx.size() - 1);
662     ldlt_B_N.solveInPlace(_B_N_i);
663 
664     eigenSparseMat Z_buf(_Z), Z_N_buf(_Z_N);
665     _Z.resize(indx.size() - 1, _include.size());
666     _Z_N.resize(indx.size() - 1, _include.size());
667     for (j = 0; j < _include.size(); j++) {
668         _Z.startVec(j);
669         _Z_N.startVec(j);
670         get_insert_row = false;
671         for (i = 0; i < indx.size(); i++) {
672             if (erase_indx == indx[i]) {
673                 get_insert_row = true;
674                 continue;
675             }
676             if (Z_buf.coeff(i, j) != 0) {
677                 _Z.insertBack(i - get_insert_row, j) = Z_buf.coeff(i, j);
678                 _Z_N.insertBack(i - get_insert_row, j) = Z_N_buf.coeff(i, j);
679             }
680         }
681     }
682     _Z.finalize();
683     _Z_N.finalize();
684 }
685 
686 /*
687 double gcta::crossprod(vector<float> &x_i, vector<float> &x_j) {
688     double prod = 0.0;
689     for (int i = 0; i < x_i.size(); i++) prod += x_i[i] * x_j[i];
690     return (prod / x_i.size());
691 }
692 */
693 
LD_rval(const vector<int> & indx,eigenMatrix & rval)694 void gcta::LD_rval(const vector<int> &indx, eigenMatrix &rval) {
695     int i = 0, j = 0;
696     eigenVector sd(indx.size());
697     for (i = 0; i < indx.size(); i++) sd[i] = sqrt(_MSX_B[indx[i]]);
698     for (j = 0; j < indx.size(); j++) {
699         rval(j, j) = 1.0;
700         for (i = j + 1; i < indx.size(); i++) rval(i, j) = rval(j, i) = _B.coeff(i, j) / sd[i] / sd[j];
701     }
702 }
703 
run_massoc_sblup(string metafile,int wind_size,double lambda)704 void gcta::run_massoc_sblup(string metafile, int wind_size, double lambda)
705 {
706     _jma_wind_size = wind_size;
707     init_massoc(metafile, false, -1);
708 
709     int j = 0;
710     eigenVector bJ;
711     cout << "\nPerforming joint analysis on all the " << _include.size() << " SNPs ..." << endl;
712     cout << "(Assuming complete linkage equilibrium between SNPs which are more than " << _jma_wind_size / 1e6 << "Mb away from each other)" << endl;
713     if (massoc_sblup(lambda, bJ)) {
714         string filename = _out + ".sblup.cojo";
715         cout << "Saving the joint analysis result of " << _include.size() << " SNPs to [" + filename + "] ..." << endl;
716         ofstream ofile(filename.c_str());
717         if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
718         for (j = 0; j < _include.size(); j++) {
719             ofile << _snp_name[_include[j]] << "\t" << _ref_A[_include[j]] << "\t" << _beta[j] << "\t" << bJ[j] << endl;
720         }
721         ofile.close();
722     } else throw ("Jacobi iteration not converged. You can increase the maximum number of iterations by the option --massoc-sblup-maxit.");
723 }
724 
massoc_sblup(double lambda,eigenVector & bJ)725 bool gcta::massoc_sblup(double lambda, eigenVector &bJ)
726 {
727     int i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size();
728     double prod = 0.0;
729     eigenVector D(_include.size());
730     eigenSparseMat B(_include.size(), _include.size());
731 
732     vector<int> nz_i(m), nz_j(m);
733     for (i = 0; i < _include.size(); i++){
734         nz_i[i]=0;
735         nz_j[i]=0;
736     }
737     cout << "Calculating the LD correlation matrix of all the " << _include.size() << " SNPs..." << endl;
738     for (i = 0; i < _include.size(); i++) {
739         D[i] = _MSX[i] * _Nd[i];
740         B.startVec(i);
741         B.insertBack(i, i) = D[i] + lambda;
742         for (j = i + 1; j < _include.size(); j++) {
743             if (_chr[_include[i]] == _chr[_include[j]] && abs(_bp[_include[i]] - _bp[_include[j]]) < _jma_wind_size) {
744                 B.insertBack(j, i) = 1.0;
745             }
746         }
747     }
748     B.finalize();
749 
750     eigenVector x_i(_keep.size()), x_j(_keep.size());
751     for (i = 0; i < _include.size(); i++) {
752         makex_eigenVector(i, x_i, false, true);
753         for (j = i + 1; j < _include.size(); j++) {
754             if (_chr[_include[i]] == _chr[_include[j]] && abs(_bp[_include[i]] - _bp[_include[j]]) < _jma_wind_size) {
755                 makex_eigenVector(j, x_j, false, true);
756                 prod = x_i.dot(x_j) / (double)n;
757                 B.coeffRef(j, i) = prod * min(_Nd[i], _Nd[j]) * sqrt(_MSX[i] * _MSX[j] / (_MSX_B[i] * _MSX_B[j]));
758             }
759         }
760         if((i + 1) % 1000 == 0 || (i + 1) == _include.size()) cout << i + 1 << " of " << _include.size() << " SNPs.\r";
761     }
762 
763     cout << "Estimating the joint effects of all SNPs ..." << endl;
764     eigenVector Xty = D.array() * _beta.array();
765     SimplicialLDLT<eigenSparseMat> solver;
766     solver.compute(B);
767     if(solver.info()!=Success) {
768         throw("Error: decomposition failed! The SNP correlation matrix is not positive definite.");
769     }
770     bJ = solver.solve(Xty);
771     if(solver.info()!=Success) {
772         throw("Error: solving failed! Unable to solve the BLUP equation.");
773     }
774 
775     // debug
776     //cout<<"_MSX: "<<_MSX<<endl;
777     //cout<<"Nd: "<<_Nd<<endl;
778     //cout<<"B: "<<B<<endl;
779     //cout<<"D: "<<D<<endl;
780     //cout<<"_beta: "<<_beta<<endl;
781     //cout<<"bJ: "<<bJ<<endl;
782 
783     //cout<<"B inverse: "<<ldlt.solve(eigenMatrix::Identity(_include.size(), _include.size()))<<endl;
784 
785 
786     return (true);
787 }
788 
789 
790 
791