1 #include "gcta.h"
2
read_metafile(string metafile,bool GC,double GC_val)3 void gcta::read_metafile(string metafile, bool GC, double GC_val) {
4 cout << "\nReading GWAS summary-level statistics from [" + metafile + "] ..." << endl;
5 ifstream Meta(metafile.c_str());
6 if (!Meta) throw ("Error: can not open the file [" + metafile + "] to read.");
7
8 int i = 0, count = 0;
9 double f_buf = 0.0, b_buf = 0.0, se_buf = 0.0, p_buf = 0.0, N_buf = 0.0, Vp_buf = 0.0, GC_buf = 0.0, chi_buf = 0.0, h_buf = 0.0;
10 string A1_buf, A2_buf;
11 string snp_buf, str_buf0, str_buf;
12
13 vector<string> snplist, vs_buf, bad_snp;
14 vector<string> ref_A_buf, bad_A1, bad_A2, bad_refA;
15 vector<double> freq_buf, beta_buf, beta_se_buf, pval_buf, N_o_buf, Vp_v_buf, GC_v_buf;
16 map<string, int>::iterator iter;
17 getline(Meta, str_buf); // the header line
18 if (StrFunc::split_string(str_buf, vs_buf) < 7) throw ("Error: format error in the input file [" + metafile + "].");
19 _jma_Vp = 0.0;
20 _GC_val = -1;
21 while (Meta) {
22 getline(Meta, str_buf0);
23 stringstream iss(str_buf0);
24 iss >> snp_buf >> A1_buf >> A2_buf;
25 StrFunc::to_upper(A1_buf);
26 StrFunc::to_upper(A2_buf);
27 iss >> str_buf;
28 f_buf = atof(str_buf.c_str());
29 iss >> str_buf;
30 if (str_buf == "NA" || str_buf == ".") continue;
31 b_buf = atof(str_buf.c_str());
32 iss >> str_buf;
33 if (str_buf == "NA" || str_buf == "." || str_buf == "0") continue;
34 se_buf = atof(str_buf.c_str());
35 iss >> str_buf;
36 if (str_buf == "NA" || str_buf == ".") continue;
37 p_buf = atof(str_buf.c_str());
38 iss >> str_buf;
39 if (str_buf == "NA" || str_buf == ".") continue;
40 N_buf = atof(str_buf.c_str());
41 if (N_buf < 10) throw ("Error: invalid sample size in line:\n\"" + str_buf0 + "\"");
42 if (Meta.eof()) break;
43 iter = _snp_name_map.find(snp_buf);
44 h_buf = 2.0 * f_buf * (1.0 - f_buf);
45 Vp_buf = h_buf * N_buf * se_buf * se_buf + h_buf * b_buf * b_buf * N_buf / (N_buf - 1.0);
46 if (Vp_buf < 0.0) throw ("Error: in line:\n\"" + str_buf0 + "\"");
47 Vp_v_buf.push_back(Vp_buf);
48 if (GC) {
49 GC_buf = b_buf * b_buf / se_buf / se_buf;
50 if (GC_buf < 0) throw ("Error: in line:\n\"" + str_buf0 + "\"");
51 GC_v_buf.push_back(GC_buf);
52 }
53 count++;
54 if (iter == _snp_name_map.end()) continue;
55 i = iter->second;
56 if (A1_buf != _allele1[i] && A1_buf != _allele2[i]) {
57 bad_snp.push_back(_snp_name[i]);
58 bad_A1.push_back(_allele1[i]);
59 bad_A2.push_back(_allele2[i]);
60 bad_refA.push_back(A1_buf);
61 continue;
62 }
63 snplist.push_back(snp_buf);
64 ref_A_buf.push_back(A1_buf);
65 freq_buf.push_back(f_buf);
66 beta_buf.push_back(b_buf);
67 beta_se_buf.push_back(se_buf);
68 pval_buf.push_back(p_buf);
69 N_o_buf.push_back(N_buf);
70 }
71 Meta.close();
72 cout << "GWAS summary statistics of " << count << " SNPs read from [" + metafile + "]." << endl;
73 _jma_Vp = CommFunc::median(Vp_v_buf);
74 cout << "Phenotypic variance estimated from summary statistics of all " << count << " SNPs: " << _jma_Vp << " (variance of logit for case-control studies)." << endl;
75 if (GC) {
76 if (GC_val > 0) {
77 _GC_val = GC_val;
78 cout << "User specified genomic inflation factor: " << _GC_val << endl;
79 } else {
80 _GC_val = CommFunc::median(GC_v_buf) / 0.455;
81 cout << "Genomic inflation factor calcualted from " << count << " SNPs: " << _GC_val << endl;
82 }
83 cout << "p-values wil be adjusted by the genomic control approach." << endl;
84 }
85
86 cout << "Matching the GWAS meta-analysis results to the genotype data ..." << endl;
87 update_id_map_kp(snplist, _snp_name_map, _include);
88 vector<int> indx(_include.size());
89 map<string, int> id_map;
90 for (i = 0; i < snplist.size(); i++) id_map.insert(pair<string, int>(snplist[i], i));
91 for (i = 0; i < _include.size(); i++) {
92 iter = id_map.find(_snp_name[_include[i]]);
93 indx[i] = iter->second;
94 _ref_A[_include[i]] = ref_A_buf[iter->second];
95 if (!_mu.empty() && ref_A_buf[iter->second] == _allele2[_include[i]]) _mu[_include[i]] = 2.0 - _mu[_include[i]];
96 }
97 if (!bad_snp.empty()) {
98 string badsnpfile = _out + ".badsnps";
99 ofstream obadsnp(badsnpfile.c_str());
100 obadsnp << "SNP\tA1\tA2\tRefA" << endl;
101 for (i = 0; i < bad_snp.size(); i++) obadsnp << bad_snp[i] << "\t" << bad_A1[i] << "\t" << bad_A2[i] << "\t" << bad_refA[i] << endl;
102 obadsnp.close();
103 cout << "Warning: can't match the reference alleles of " << bad_snp.size() << " SNPs to those in the genotype data. These SNPs have been saved in [" + badsnpfile + "]." << endl;
104 }
105 if (_include.empty()) throw ("Error: all the SNPs in the GWAS meta-analysis results can't be found in the genotype data.");
106 else cout << _include.size() << " SNPs are matched to the genotype data." << endl;
107
108 if (_mu.empty()) calcu_mu();
109
110 _freq.resize(_include.size());
111 _beta.resize(_include.size());
112 _beta_se.resize(_include.size());
113 _pval.resize(_include.size());
114 _N_o.resize(_include.size());
115 for (i = 0; i < _include.size(); i++) {
116 _freq[i] = freq_buf[indx[i]];
117 _beta[i] = beta_buf[indx[i]];
118 _beta_se[i] = beta_se_buf[indx[i]];
119 if (GC) {
120 chi_buf = _beta[i] / _beta_se[i];
121 _pval[i] = StatFunc::pchisq(chi_buf * chi_buf / _GC_val, 1);
122 } else _pval[i] = pval_buf[indx[i]];
123 _N_o[i] = N_o_buf[indx[i]];
124 }
125 _jma_Ve = _jma_Vp;
126 }
127
init_massoc(string metafile,bool GC,double GC_val)128 void gcta::init_massoc(string metafile, bool GC, double GC_val)
129 {
130 read_metafile(metafile, GC, GC_val);
131
132 int i = 0, j = 0, n = _keep.size(), m = _include.size();
133 cout << "Calculating the variance of SNP genotypes ..." << endl;
134 _MSX_B.resize(m);
135 _Nd.resize(m);
136
137 if (_mu.empty()) calcu_mu();
138 #pragma omp parallel for
139 for (i = 0; i < m; i++){
140 eigenVector x;
141 makex_eigenVector(i, x, true, true);
142 _MSX_B[i] = x.squaredNorm() / (double)n;
143 }
144 if (_jma_actual_geno) {
145 _MSX = _MSX_B;
146 _Nd = _N_o;
147 } else {
148 _MSX = 2.0 * _freq.array()*(1.0 - _freq.array());
149 for (i = 0; i < m; i++) _Nd[i] = (_jma_Vp - _MSX[i] * _beta[i] * _beta[i]) / (_MSX[i] * _beta_se[i] * _beta_se[i]) + 1; // revised by JY 25/11/13 according to Eq. 13, Yang et al. 2012 NG
150 }
151 }
152
read_fixed_snp(string snplistfile,string msg,vector<int> & pgiven,vector<int> & remain)153 void gcta::read_fixed_snp(string snplistfile, string msg, vector<int> &pgiven, vector<int> &remain) {
154 int i = 0, j = 0;
155 vector<string> givenSNPs;
156 read_snplist(snplistfile, givenSNPs, msg);
157 if (givenSNPs.empty()) throw ("Error: failed to read any SNP from the file [" + snplistfile + "].");
158 map<string, int> givenSNPs_map;
159 pgiven.clear();
160 remain.clear();
161 for (i = 0; i < givenSNPs.size(); i++) givenSNPs_map.insert(pair<string, int>(givenSNPs[i], i));
162 for (i = 0; i < _include.size(); i++) {
163 if (givenSNPs_map.find(_snp_name[_include[i]]) != givenSNPs_map.end()) pgiven.push_back(i);
164 else remain.push_back(i);
165 }
166 if (pgiven.size() > 0) cout << pgiven.size() << " of them are matched to the genotype and summary data." << endl;
167 else throw ("Error: none of the given SNPs can be matched to the genotype and summary data.");
168 }
169
run_massoc_slct(string metafile,int wind_size,double p_cutoff,double collinear,int top_SNPs,bool joint_only,bool GC,double GC_val,bool actual_geno,int mld_slct_alg)170 void gcta::run_massoc_slct(string metafile, int wind_size, double p_cutoff, double collinear, int top_SNPs, bool joint_only, bool GC, double GC_val, bool actual_geno, int mld_slct_alg)
171 {
172 _jma_actual_geno = actual_geno;
173 _jma_wind_size = wind_size;
174 _jma_p_cutoff = p_cutoff;
175 _jma_collinear = collinear;
176 _jma_snpnum_backward = 0;
177 _jma_snpnum_collienar = 0;
178 init_massoc(metafile, GC, GC_val);
179 if (top_SNPs < 0) top_SNPs = 1e30;
180 else {
181 _jma_p_cutoff = 0.5;
182 cout << "The threshold p-value has been set to 0.5 because of the --massoc-top-SNPs option." << endl;
183 }
184 int i = 0, j = 0;
185 vector<int> slct, remain;
186 eigenVector bC, bC_se, pC;
187 cout << endl;
188 if (!joint_only && mld_slct_alg<2) {
189 cout << "Performing "<< ((mld_slct_alg==0)?"stepwise":"forward") << " model selection on " << _include.size() << " SNPs to select association signals ... (p cutoff = " << _jma_p_cutoff << "; ";
190 cout << "collinearity cutoff = " << _jma_collinear << ")"<< endl;
191 if (!_jma_actual_geno) cout << "(Assuming complete linkage equilibrium between SNPs which are more than " << _jma_wind_size / 1e6 << "Mb away from each other)" << endl;
192 stepwise_slct(slct, remain, bC, bC_se, pC, mld_slct_alg, top_SNPs);
193 if (slct.empty()) {
194 cout << "No SNPs have been selected." << endl;
195 return;
196 }
197 } else { // mld_slct_alg = 2 for backward selection
198 for (i = 0; i < _include.size(); i++) slct.push_back(i);
199 if (mld_slct_alg==2) {
200 cout << "Performing backward selection on " << _include.size() << " SNPs at threshold p-value = " << _jma_p_cutoff << " ..." << endl;
201 slct_stay(slct, bC, bC_se, pC);
202 }
203 }
204
205 // joint analysis
206 eigenVector bJ, bJ_se, pJ;
207 cout << "Performing joint analysis on all the " << slct.size();
208 if (joint_only) cout << " SNPs ..." << endl;
209 else cout << " selected signals ..." << endl;
210 if (slct.size() >= _keep.size()) throw ("Error: too many SNPs. The number of SNPs in a joint analysis should not be larger than the sample size.");
211 massoc_joint(slct, bJ, bJ_se, pJ);
212 eigenMatrix rval(slct.size(), slct.size());
213 LD_rval(slct, rval);
214 if (_jma_actual_geno) cout << "Residual varaince = " << _jma_Ve << endl;
215 massoc_slct_output(joint_only, slct, bJ, bJ_se, pJ, rval);
216
217 // output conditional results
218 if (!joint_only && !mld_slct_alg==2) {
219 massoc_cond_output(remain, bC, bC_se, pC);
220 cout << "(" << _jma_snpnum_backward << " SNPs eliminated by backward selection and " << _jma_snpnum_collienar << " SNPs filtered by collinearity test are not included in the output)" << endl;
221 }
222 }
223
run_massoc_cond(string metafile,string snplistfile,int wind_size,double collinear,bool GC,double GC_val,bool acutal_geno)224 void gcta::run_massoc_cond(string metafile, string snplistfile, int wind_size, double collinear, bool GC, double GC_val, bool acutal_geno) {
225 _jma_actual_geno = acutal_geno;
226 _jma_wind_size = wind_size;
227 _jma_collinear = collinear;
228 init_massoc(metafile, GC, GC_val);
229 vector<int> pgiven, remain;
230 read_fixed_snp(snplistfile, "given SNPs", pgiven, remain);
231
232 eigenVector bC, bC_se, pC;
233 cout << "Performing single-SNP association analysis conditional on the " << pgiven.size() << " given SNPs ... ";
234 cout << "(collinearity cutoff = " << _jma_collinear << ")" << endl;
235 if (!_jma_actual_geno) cout << "(Assuming complete linkage equilibrium between SNPs which are more than " << _jma_wind_size / 1e6 << "Mb away from each other)" << endl;
236
237 string filename = _out + ".given.cojo";
238 cout<<"Saving the summary statistics of the given SNPs in the file [" + filename + "] ..." << endl;
239 ofstream ofile(filename.c_str());
240 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
241 ofile << "Chr\tSNP\tbp\trefA\tfreq\tb\tse\tp" << endl;
242 int i = 0, j = 0;
243 for (i = 0; i < pgiven.size(); i++) {
244 j = pgiven[i];
245 ofile << _chr[_include[j]] << "\t" << _snp_name[_include[j]] << "\t" << _bp[_include[j]] << "\t" << _ref_A[_include[j]] << "\t" << _freq[j] << "\t" << _beta[j] << "\t" << _beta_se[j] << "\t" << _pval[j] << endl;
246 }
247 ofile.close();
248
249 massoc_cond(pgiven, remain, bC, bC_se, pC);
250 massoc_cond_output(remain, bC, bC_se, pC);
251 }
252
massoc_slct_output(bool joint_only,vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ,eigenMatrix & rval)253 void gcta::massoc_slct_output(bool joint_only, vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ, eigenMatrix &rval)
254 {
255 string filename = _out + ".jma.cojo";
256 if (joint_only) cout << "Saving the joint analysis result of " << slct.size() << " SNPs to [" + filename + "] ..." << endl;
257 else cout << "Saving the " << slct.size() << " independent signals to [" + filename + "] ..." << endl;
258 ofstream ofile(filename.c_str());
259 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
260 ofile << "Chr\tSNP\tbp\trefA\tfreq\tb\tse\tp" << ((_GC_val > 0) ? "_GC" : "") << "\tn\tfreq_geno\tbJ\tbJ_se\tpJ" << ((_GC_val > 0) ? "_GC" : "") << "\tLD_r" << endl;
261 int i = 0, j = 0;
262 for (i = 0; i < slct.size(); i++) {
263 j = slct[i];
264 ofile << _chr[_include[j]] << "\t" << _snp_name[_include[j]] << "\t" << _bp[_include[j]] << "\t";
265 ofile << _ref_A[_include[j]] << "\t" << _freq[j] << "\t" << _beta[j] << "\t" << _beta_se[j] << "\t";
266 ofile << _pval[j] << "\t" << _Nd[j] << "\t" << 0.5 * _mu[_include[j]] << "\t" << bJ[i] << "\t" << bJ_se[i] << "\t" << pJ[i] << "\t";
267 if (i == slct.size() - 1) ofile << 0 << endl;
268 else ofile << rval(i, i + 1) << endl;
269 }
270 ofile.close();
271
272 filename = _out + ".ldr.cojo";
273 if (joint_only) cout << "Saving the LD structure of " << slct.size() << " SNPs to [" + filename + "] ..." << endl;
274 else cout << "Saving the LD structure of " << slct.size() << " independent signals to [" + filename + "] ..." << endl;
275 ofile.open(filename.c_str());
276 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
277 ofile << "SNP\t";
278 for (i = 0; i < slct.size(); i++) ofile << _snp_name[_include[slct[i]]] << "\t";
279 ofile << endl;
280 for (i = 0; i < slct.size(); i++) {
281 ofile << _snp_name[_include[slct[i]]] << "\t";
282 for (j = 0; j < slct.size(); j++) ofile << rval(i, j) << "\t";
283 ofile << endl;
284 }
285 ofile.close();
286 }
287
massoc_cond_output(vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)288 void gcta::massoc_cond_output(vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC)
289 {
290 int i = 0, j = 0;
291 string filename = _out + ".cma.cojo";
292 cout << "Saving the conditional analysis results of " << remain.size() << " remaining SNPs to [" + filename + "] ..." << endl;
293 ofstream ofile(filename.c_str());
294 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
295 ofile << "Chr\tSNP\tbp\trefA\tfreq\tb\tse\tp" << ((_GC_val > 0) ? "_GC" : "") << "\tn\tfreq_geno\tbC\tbC_se\tpC" << ((_GC_val > 0) ? "_GC" : "") << endl;
296 for (i = 0; i < remain.size(); i++) {
297 j = remain[i];
298 ofile << _chr[_include[j]] << "\t" << _snp_name[_include[j]] << "\t" << _bp[_include[j]] << "\t";
299 ofile << _ref_A[_include[j]] << "\t" << _freq[j] << "\t" << _beta[j] << "\t" << _beta_se[j] << "\t";
300 ofile << _pval[j] << "\t" << _Nd[j] << "\t" << 0.5 * _mu[_include[j]] << "\t";
301 if (pC[i] > 1.5) ofile << "NA\tNA\tNA" << endl;
302 else ofile << bC[i] << "\t" << bC_se[i] << "\t" << pC[i] << endl;
303 }
304 ofile.close();
305 }
306
stepwise_slct(vector<int> & slct,vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC,int mld_slct_alg,int top_SNPs)307 void gcta::stepwise_slct(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC, int mld_slct_alg, int top_SNPs)
308 {
309 int i = 0, i_buf = 0;
310 vector<double> p_buf;
311 eigenVector2Vector(_pval, p_buf);
312 int m = min_element(p_buf.begin(), p_buf.end()) - p_buf.begin();
313 if (p_buf[m] >= _jma_p_cutoff) return;
314 slct.push_back(m);
315 for (i = 0; i < _include.size(); i++) {
316 if (i != m) remain.push_back(i);
317 }
318 int prev_num = 0;
319 if (mld_slct_alg==0 && _jma_p_cutoff > 1e-3){
320 cout << "Switched to perform a forward model selection because the significance level is too low..." << endl;
321 mld_slct_alg=1;
322 }
323 while (!remain.empty()) {
324 if (slct_entry(slct, remain, bC, bC_se, pC)) {
325 if (mld_slct_alg==0) slct_stay(slct, bC, bC_se, pC);
326 }
327 else break;
328 if (slct.size() % 5 == 0 && slct.size() > prev_num) cout << slct.size() << " associated SNPs have been selected." << endl;
329 if (slct.size() > prev_num) prev_num = slct.size();
330 if (slct.size() >= top_SNPs) break;
331 }
332 if (_jma_p_cutoff > 1e-3) {
333 cout << "Performing backward elimination..." << endl;
334 slct_stay(slct, bC, bC_se, pC);
335 }
336 cout << "Finally, " << slct.size() << " associated SNPs are selected." << endl;
337 }
338
slct_entry(vector<int> & slct,vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)339 bool gcta::slct_entry(vector<int> &slct, vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC) {
340 int i = 0, m = 0;
341 massoc_cond(slct, remain, bC, bC_se, pC);
342 vector<double> pC_buf;
343 eigenVector2Vector(pC, pC_buf);
344 while (true) {
345 m = min_element(pC_buf.begin(), pC_buf.end()) - pC_buf.begin();
346 if (pC_buf[m] >= _jma_p_cutoff) return (false);
347 if (insert_B_and_Z(slct, remain[m])) {
348 slct.push_back(remain[m]);
349 stable_sort(slct.begin(), slct.end());
350 remain.erase(remain.begin() + m);
351 return (true);
352 }
353 pC_buf.erase(pC_buf.begin() + m);
354 remain.erase(remain.begin() + m);
355 }
356 }
357
slct_stay(vector<int> & slct,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)358 void gcta::slct_stay(vector<int> &slct, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ) {
359 if (_B_N.cols() < 1) {
360 if (!init_B(slct)) throw ("Error: there is a collinearity problem of the given list of SNPs.\nYou can try the option --massoc-slct to get rid of one of each pair of highly correlated SNPs.");
361 }
362
363 vector<double> pJ_buf;
364 while (!slct.empty()) {
365 massoc_joint(slct, bJ, bJ_se, pJ);
366 eigenVector2Vector(pJ, pJ_buf);
367 int m = max_element(pJ_buf.begin(), pJ_buf.end()) - pJ_buf.begin();
368 if (pJ[m] > _jma_p_cutoff) {
369 _jma_snpnum_backward++;
370 erase_B_and_Z(slct, slct[m]);
371 slct.erase(slct.begin() + m);
372 } else break;
373 }
374 }
375
eigenVector2Vector(eigenVector & x,vector<double> & y)376 void gcta::eigenVector2Vector(eigenVector &x, vector<double> &y) {
377 y.resize(x.size());
378 for (int i = 0; i < x.size(); i++) y[i] = x[i];
379 }
380
massoc_calcu_Ve(const vector<int> & slct,eigenVector & bJ,eigenVector & b)381 double gcta::massoc_calcu_Ve(const vector<int> &slct, eigenVector &bJ, eigenVector &b) {
382 double Ve = 0.0;
383 int n = bJ.size();
384 vector<double> Nd_buf(n);
385 for (int k = 0; k < n; k++) {
386 Nd_buf[k] = _Nd[slct[k]];
387 Ve += _D_N[k] * bJ[k] * b[k];
388 }
389 double d_buf = CommFunc::median(Nd_buf);
390 if (d_buf - n < 1) throw ("Error: no degree of freedom left for the residues, the model is over-fitting. Please specify a more stringent p cutoff value.");
391 Ve = ((d_buf - 1) * _jma_Vp - Ve) / (d_buf - n);
392 if (Ve <= 0.0) throw ("Error: residual variance is out of boundary, the model is over-fitting. Please specify a more stringent p cutoff value.");
393 return Ve;
394 }
395
massoc_joint(const vector<int> & indx,eigenVector & bJ,eigenVector & bJ_se,eigenVector & pJ)396 void gcta::massoc_joint(const vector<int> &indx, eigenVector &bJ, eigenVector &bJ_se, eigenVector &pJ) {
397 if (_B_N.cols() < 1) {
398 if (!init_B(indx)) throw ("Error: there is a collinearity problem of the given list of SNPs.\nYou can try the option --massoc-slct to get rid of one of each pair of highly correlated SNPs.");
399 }
400
401 int i = 0, n = indx.size();
402 double chisq = 0.0;
403 eigenVector b(n);
404 for (i = 0; i < n; i++) b[i] = _beta[indx[i]];
405 bJ.resize(n);
406 bJ_se.resize(n);
407 pJ.resize(n);
408 bJ = _B_N_i * _D_N.asDiagonal() * b;
409 bJ_se = _B_N_i.diagonal();
410 pJ = eigenVector::Ones(n);
411 if (_jma_actual_geno) _jma_Ve = massoc_calcu_Ve(indx, bJ, b);
412 bJ_se *= _jma_Ve;
413 for (i = 0; i < n; i++) {
414 if (bJ_se[i] > 1.0e-7) {
415 bJ_se[i] = sqrt(bJ_se[i]);
416 chisq = bJ[i] / bJ_se[i];
417 if (_GC_val > 0) pJ[i] = StatFunc::pchisq(chisq * chisq / _GC_val, 1);
418 else pJ[i] = StatFunc::pchisq(chisq*chisq, 1);
419 } else {
420 bJ[i] = 0.0;
421 bJ_se[i] = 0.0;
422 }
423 }
424 }
425
massoc_cond(const vector<int> & slct,const vector<int> & remain,eigenVector & bC,eigenVector & bC_se,eigenVector & pC)426 void gcta::massoc_cond(const vector<int> &slct, const vector<int> &remain, eigenVector &bC, eigenVector &bC_se, eigenVector &pC) {
427 if (_B_N.cols() < 1) {
428 if (!init_B(slct)) throw ("Error: there is a collinearity problem of the given list of SNPs.\nYou can try the option --massoc-slct to get rid of one of each pair of highly correlated SNPs.");
429 }
430 if (_Z_N.cols() < 1) init_Z(slct);
431
432 int i = 0, j = 0, n = slct.size();
433 double chisq = 0.0;
434 eigenVector b(n), se(n);
435 for (i = 0; i < n; i++) {
436 b[i] = _beta[slct[i]];
437 se[i] = _beta_se[slct[i]];
438 }
439 eigenVector bJ1 = _B_N_i * _D_N.asDiagonal() * b;
440 if (_jma_actual_geno) _jma_Ve = massoc_calcu_Ve(slct, bJ1, b);
441
442 double B2 = 0.0;
443 bC = eigenVector::Zero(remain.size());
444 bC_se = eigenVector::Zero(remain.size());
445 pC = eigenVector::Constant(remain.size(), 2);
446 eigenVector Z_Bi(n), Z_Bi_buf(n);
447 for (i = 0; i < remain.size(); i++) {
448 j = remain[i];
449 B2 = _MSX[j] * _Nd[j];
450 if (!CommFunc::FloatEqual(B2, 0.0)) {
451 Z_Bi = _Z_N.col(j).transpose() * _B_N_i;
452 Z_Bi_buf = _Z.col(j).transpose() * _B_i;
453 if (_Z.col(j).dot(Z_Bi_buf) / _MSX_B[j] < _jma_collinear) {
454 bC[i] = _beta[j] - Z_Bi.cwiseProduct(_D_N).dot(b) / B2;
455 bC_se[i] = (B2 - _Z_N.col(j).dot(Z_Bi)) / (B2 * B2);
456 }
457 }
458 if (_jma_actual_geno) bC_se[i] *= _jma_Ve - (B2 * bC[i] * _beta[j]) / (_Nd[j] - n - 1);
459 else bC_se[i] *= _jma_Ve;
460 if (bC_se[i] > 1e-7) {
461 bC_se[i] = sqrt(bC_se[i]);
462 chisq = bC[i] / bC_se[i];
463 if (_GC_val > 0) pC[i] = StatFunc::pchisq(chisq * chisq / _GC_val, 1);
464 else pC[i] = StatFunc::pchisq(chisq*chisq, 1);
465 }
466 }
467 }
468
init_B(const vector<int> & indx)469 bool gcta::init_B(const vector<int> &indx)
470 {
471 int i = 0, j = 0, k = 0, n = _keep.size();
472 double d_buf = 0.0;
473 eigenVector diagB(indx.size());
474 _B.resize(indx.size(), indx.size());
475 _B_N.resize(indx.size(), indx.size());
476 _D_N.resize(indx.size());
477 eigenVector x_i(_keep.size()), x_j(_keep.size());
478 for (i = 0; i < indx.size(); i++) {
479 _D_N[i] = _MSX[indx[i]] * _Nd[indx[i]];
480 _B.startVec(i);
481 _B_N.startVec(i);
482 _B.insertBack(i, i) = _MSX_B[indx[i]];
483 _B_N.insertBack(i, i) = _D_N[i];
484 diagB[i] = _MSX_B[indx[i]];
485 makex_eigenVector(indx[i], x_i, false, true);
486 for (j = i + 1; j < indx.size(); j++) {
487 if (_jma_actual_geno || (_chr[_include[indx[i]]] == _chr[_include[indx[j]]] && abs(_bp[_include[indx[i]]] - _bp[_include[indx[j]]]) < _jma_wind_size)) {
488 makex_eigenVector(indx[j], x_j, false, true);
489 d_buf = x_i.dot(x_j) / (double)n;
490 _B.insertBack(j, i) = d_buf;
491 _B_N.insertBack(j, i) = d_buf * min(_Nd[indx[i]], _Nd[indx[j]]) * sqrt(_MSX[indx[i]] * _MSX[indx[j]] / (_MSX_B[indx[i]] * _MSX_B[indx[j]]));
492 }
493 }
494 }
495 _B.finalize();
496 _B_N.finalize();
497
498 LDLT<eigenMatrix> ldlt_B(_B);
499 if (ldlt_B.vectorD().minCoeff() < 0 || sqrt(ldlt_B.vectorD().maxCoeff() / ldlt_B.vectorD().minCoeff()) > 30) return false;
500 _B_i = eigenMatrix::Identity(indx.size(), indx.size());
501 ldlt_B.solveInPlace(_B_i);
502 if ((1 - eigenVector::Constant(indx.size(), 1).array() / (diagB.array() * _B_i.diagonal().array())).maxCoeff() > _jma_collinear) return false;
503 LDLT<eigenMatrix> ldlt_B_N(_B_N);
504 _B_N_i = eigenMatrix::Identity(indx.size(), indx.size());
505 ldlt_B_N.solveInPlace(_B_N_i);
506 return true;
507 }
508
init_Z(const vector<int> & indx)509 void gcta::init_Z(const vector<int> &indx)
510 {
511 int i = 0, j = 0, n = _keep.size();
512 double d_buf = 0.0;
513 _Z.resize(indx.size(), _include.size());
514 _Z_N.resize(indx.size(), _include.size());
515 eigenVector x_i(_keep.size()), x_j(_keep.size());
516 for (j = 0; j < _include.size(); j++) {
517 _Z.startVec(j);
518 _Z_N.startVec(j);
519 makex_eigenVector(j, x_j, false, true);
520 for (i = 0; i < indx.size(); i++) {
521 if (_jma_actual_geno || (indx[i] != j && _chr[_include[indx[i]]] == _chr[_include[j]] && abs(_bp[_include[indx[i]]] - _bp[_include[j]]) < _jma_wind_size)) {
522 makex_eigenVector(indx[i], x_i, false, true);
523 d_buf = x_j.dot(x_i) / (double)n;
524 _Z.insertBack(i, j) = d_buf;
525 _Z_N.insertBack(i, j) = d_buf * min(_Nd[indx[i]], _Nd[j]) * sqrt(_MSX[indx[i]] * _MSX[j] / (_MSX_B[indx[i]] * _MSX_B[j])); // added by Jian Yang 18/12/2013
526 }
527 }
528 }
529 _Z.finalize();
530 _Z_N.finalize();
531 }
532
insert_B_and_Z(const vector<int> & indx,int insert_indx)533 bool gcta::insert_B_and_Z(const vector<int> &indx, int insert_indx)
534 {
535 int i = 0, j = 0, n = _keep.size();
536 double d_buf = 0.0;
537 vector<int> ix(indx);
538 ix.push_back(insert_indx);
539 stable_sort(ix.begin(), ix.end());
540 eigenSparseMat B_buf(_B), B_N_buf(_B_N);
541 _B.resize(ix.size(), ix.size());
542 _B_N.resize(ix.size(), ix.size());
543 bool get_insert_col = false, get_insert_row = false;
544 int pos = find(ix.begin(), ix.end(), insert_indx) - ix.begin();
545 eigenVector diagB(ix.size());
546 eigenVector x_i(_keep.size()), x_j(_keep.size());
547 for (j = 0; j < ix.size(); j++) {
548 _B.startVec(j);
549 _B_N.startVec(j);
550 _B.insertBack(j, j) = _MSX_B[ix[j]];
551 _B_N.insertBack(j, j) = _MSX[ix[j]] * _Nd[ix[j]];
552 diagB[j] = _MSX_B[ix[j]];
553 if (insert_indx == ix[j]) get_insert_col = true;
554 get_insert_row = get_insert_col;
555 makex_eigenVector(ix[j], x_j, false, true);
556 for (i = j + 1; i < ix.size(); i++) {
557 if (insert_indx == ix[i]) get_insert_row = true;
558 if (insert_indx == ix[j] || insert_indx == ix[i]) {
559 if (_jma_actual_geno || (_chr[_include[ix[i]]] == _chr[_include[ix[j]]] && abs(_bp[_include[ix[i]]] - _bp[_include[ix[j]]]) < _jma_wind_size)) {
560 makex_eigenVector(ix[i], x_i, false, true);
561 d_buf = x_i.dot(x_j) / (double)n;
562 _B.insertBack(i, j) = d_buf;
563 _B_N.insertBack(i, j) = d_buf * min(_Nd[ix[i]], _Nd[ix[j]]) * sqrt(_MSX[ix[i]] * _MSX[ix[j]] / (_MSX_B[ix[i]] * _MSX_B[ix[j]]));
564 }
565 } else {
566 if (B_buf.coeff(i - get_insert_row, j - get_insert_col) != 0) {
567 _B.insertBack(i, j) = B_buf.coeff(i - get_insert_row, j - get_insert_col);
568 _B_N.insertBack(i, j) = B_N_buf.coeff(i - get_insert_row, j - get_insert_col);
569 }
570 }
571 }
572 }
573 _B.finalize();
574 _B_N.finalize();
575 LDLT<eigenMatrix> ldlt_B(_B);
576 _B_i = eigenMatrix::Identity(ix.size(), ix.size());
577 ldlt_B.solveInPlace(_B_i);
578 if (ldlt_B.vectorD().minCoeff() < 0 || sqrt(ldlt_B.vectorD().maxCoeff() / ldlt_B.vectorD().minCoeff()) > 30 || (1 - eigenVector::Constant(ix.size(), 1).array() / (diagB.array() * _B_i.diagonal().array())).maxCoeff() > _jma_collinear) {
579 _jma_snpnum_collienar++;
580 _B = B_buf;
581 _B_N = B_N_buf;
582 return false;
583 }
584 LDLT<eigenMatrix> ldlt_B_N(_B_N);
585 _B_N_i = eigenMatrix::Identity(ix.size(), ix.size());
586 ldlt_B_N.solveInPlace(_B_N_i);
587 _D_N.resize(ix.size());
588 for (j = 0; j < ix.size(); j++) {
589 _D_N[j] = _MSX[ix[j]] * _Nd[ix[j]];
590 }
591
592 if (_Z_N.cols() < 1) return true;
593 eigenSparseMat Z_buf(_Z), Z_N_buf(_Z_N);
594 _Z.resize(ix.size(), _include.size());
595 _Z_N.resize(ix.size(), _include.size());
596 for (j = 0; j < _include.size(); j++) {
597 _Z.startVec(j);
598 _Z_N.startVec(j);
599 get_insert_row = false;
600 makex_eigenVector(j, x_j, false, true);
601 for (i = 0; i < ix.size(); i++) {
602 if (insert_indx == ix[i]) {
603 if (_jma_actual_geno || (ix[i] != j && _chr[_include[ix[i]]] == _chr[_include[j]] && abs(_bp[_include[ix[i]]] - _bp[_include[j]]) < _jma_wind_size)) {
604 makex_eigenVector(ix[i], x_i, false, true);
605 d_buf = x_j.dot(x_i) / (double)n;
606 _Z.insertBack(i, j) = d_buf;
607 _Z_N.insertBack(i, j) = d_buf * min(_Nd[ix[i]], _Nd[j]) * sqrt(_MSX[ix[i]] * _MSX[j] / (_MSX_B[ix[i]] * _MSX_B[j])); // added by Jian Yang 18/12/2013
608 }
609 get_insert_row = true;
610 } else {
611 if (Z_buf.coeff(i - get_insert_row, j) != 0) {
612 _Z.insertBack(i, j) = Z_buf.coeff(i - get_insert_row, j);
613 _Z_N.insertBack(i, j) = Z_N_buf.coeff(i - get_insert_row, j);
614 }
615 }
616 }
617 }
618 _Z.finalize();
619 _Z_N.finalize();
620
621 return true;
622 }
623
erase_B_and_Z(const vector<int> & indx,int erase_indx)624 void gcta::erase_B_and_Z(const vector<int> &indx, int erase_indx) {
625 int i = 0, j = 0;
626 eigenSparseMat B_dense(_B), B_N_dense(_B_N);
627 _B.resize(indx.size() - 1, indx.size() - 1);
628 _B_N.resize(indx.size() - 1, indx.size() - 1);
629 _D_N.resize(indx.size() - 1);
630 int pos = find(indx.begin(), indx.end(), erase_indx) - indx.begin();
631 bool get_insert_col = false, get_insert_row = false;
632 for (j = 0; j < indx.size(); j++) {
633 if (erase_indx == indx[j]) {
634 get_insert_col = true;
635 continue;
636 }
637 _B.startVec(j - get_insert_col);
638 _B_N.startVec(j - get_insert_col);
639 _D_N[j - get_insert_col] = _MSX[indx[j]] * _Nd[indx[j]];
640 get_insert_row = get_insert_col;
641 for (i = j; i < indx.size(); i++) {
642 if (erase_indx == indx[i]) {
643 get_insert_row = true;
644 continue;
645 }
646 if (B_dense.coeff(i, j) != 0) {
647 _B.insertBack(i - get_insert_row, j - get_insert_col) = B_dense.coeff(i, j);
648 _B_N.insertBack(i - get_insert_row, j - get_insert_col) = B_N_dense.coeff(i, j);
649 }
650 }
651 }
652 _B.finalize();
653 _B_N.finalize();
654
655 if (_Z_N.cols() < 1) return;
656
657 LDLT<eigenMatrix> ldlt_B(_B);
658 _B_i = eigenMatrix::Identity(indx.size() - 1, indx.size() - 1);
659 ldlt_B.solveInPlace(_B_i);
660 LDLT<eigenMatrix> ldlt_B_N(_B_N);
661 _B_N_i = eigenMatrix::Identity(indx.size() - 1, indx.size() - 1);
662 ldlt_B_N.solveInPlace(_B_N_i);
663
664 eigenSparseMat Z_buf(_Z), Z_N_buf(_Z_N);
665 _Z.resize(indx.size() - 1, _include.size());
666 _Z_N.resize(indx.size() - 1, _include.size());
667 for (j = 0; j < _include.size(); j++) {
668 _Z.startVec(j);
669 _Z_N.startVec(j);
670 get_insert_row = false;
671 for (i = 0; i < indx.size(); i++) {
672 if (erase_indx == indx[i]) {
673 get_insert_row = true;
674 continue;
675 }
676 if (Z_buf.coeff(i, j) != 0) {
677 _Z.insertBack(i - get_insert_row, j) = Z_buf.coeff(i, j);
678 _Z_N.insertBack(i - get_insert_row, j) = Z_N_buf.coeff(i, j);
679 }
680 }
681 }
682 _Z.finalize();
683 _Z_N.finalize();
684 }
685
686 /*
687 double gcta::crossprod(vector<float> &x_i, vector<float> &x_j) {
688 double prod = 0.0;
689 for (int i = 0; i < x_i.size(); i++) prod += x_i[i] * x_j[i];
690 return (prod / x_i.size());
691 }
692 */
693
LD_rval(const vector<int> & indx,eigenMatrix & rval)694 void gcta::LD_rval(const vector<int> &indx, eigenMatrix &rval) {
695 int i = 0, j = 0;
696 eigenVector sd(indx.size());
697 for (i = 0; i < indx.size(); i++) sd[i] = sqrt(_MSX_B[indx[i]]);
698 for (j = 0; j < indx.size(); j++) {
699 rval(j, j) = 1.0;
700 for (i = j + 1; i < indx.size(); i++) rval(i, j) = rval(j, i) = _B.coeff(i, j) / sd[i] / sd[j];
701 }
702 }
703
run_massoc_sblup(string metafile,int wind_size,double lambda)704 void gcta::run_massoc_sblup(string metafile, int wind_size, double lambda)
705 {
706 _jma_wind_size = wind_size;
707 init_massoc(metafile, false, -1);
708
709 int j = 0;
710 eigenVector bJ;
711 cout << "\nPerforming joint analysis on all the " << _include.size() << " SNPs ..." << endl;
712 cout << "(Assuming complete linkage equilibrium between SNPs which are more than " << _jma_wind_size / 1e6 << "Mb away from each other)" << endl;
713 if (massoc_sblup(lambda, bJ)) {
714 string filename = _out + ".sblup.cojo";
715 cout << "Saving the joint analysis result of " << _include.size() << " SNPs to [" + filename + "] ..." << endl;
716 ofstream ofile(filename.c_str());
717 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
718 for (j = 0; j < _include.size(); j++) {
719 ofile << _snp_name[_include[j]] << "\t" << _ref_A[_include[j]] << "\t" << _beta[j] << "\t" << bJ[j] << endl;
720 }
721 ofile.close();
722 } else throw ("Jacobi iteration not converged. You can increase the maximum number of iterations by the option --massoc-sblup-maxit.");
723 }
724
massoc_sblup(double lambda,eigenVector & bJ)725 bool gcta::massoc_sblup(double lambda, eigenVector &bJ)
726 {
727 int i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size();
728 double prod = 0.0;
729 eigenVector D(_include.size());
730 eigenSparseMat B(_include.size(), _include.size());
731
732 vector<int> nz_i(m), nz_j(m);
733 for (i = 0; i < _include.size(); i++){
734 nz_i[i]=0;
735 nz_j[i]=0;
736 }
737 cout << "Calculating the LD correlation matrix of all the " << _include.size() << " SNPs..." << endl;
738 for (i = 0; i < _include.size(); i++) {
739 D[i] = _MSX[i] * _Nd[i];
740 B.startVec(i);
741 B.insertBack(i, i) = D[i] + lambda;
742 for (j = i + 1; j < _include.size(); j++) {
743 if (_chr[_include[i]] == _chr[_include[j]] && abs(_bp[_include[i]] - _bp[_include[j]]) < _jma_wind_size) {
744 B.insertBack(j, i) = 1.0;
745 }
746 }
747 }
748 B.finalize();
749
750 eigenVector x_i(_keep.size()), x_j(_keep.size());
751 for (i = 0; i < _include.size(); i++) {
752 makex_eigenVector(i, x_i, false, true);
753 for (j = i + 1; j < _include.size(); j++) {
754 if (_chr[_include[i]] == _chr[_include[j]] && abs(_bp[_include[i]] - _bp[_include[j]]) < _jma_wind_size) {
755 makex_eigenVector(j, x_j, false, true);
756 prod = x_i.dot(x_j) / (double)n;
757 B.coeffRef(j, i) = prod * min(_Nd[i], _Nd[j]) * sqrt(_MSX[i] * _MSX[j] / (_MSX_B[i] * _MSX_B[j]));
758 }
759 }
760 if((i + 1) % 1000 == 0 || (i + 1) == _include.size()) cout << i + 1 << " of " << _include.size() << " SNPs.\r";
761 }
762
763 cout << "Estimating the joint effects of all SNPs ..." << endl;
764 eigenVector Xty = D.array() * _beta.array();
765 SimplicialLDLT<eigenSparseMat> solver;
766 solver.compute(B);
767 if(solver.info()!=Success) {
768 throw("Error: decomposition failed! The SNP correlation matrix is not positive definite.");
769 }
770 bJ = solver.solve(Xty);
771 if(solver.info()!=Success) {
772 throw("Error: solving failed! Unable to solve the BLUP equation.");
773 }
774
775 // debug
776 //cout<<"_MSX: "<<_MSX<<endl;
777 //cout<<"Nd: "<<_Nd<<endl;
778 //cout<<"B: "<<B<<endl;
779 //cout<<"D: "<<D<<endl;
780 //cout<<"_beta: "<<_beta<<endl;
781 //cout<<"bJ: "<<bJ<<endl;
782
783 //cout<<"B inverse: "<<ldlt.solve(eigenMatrix::Identity(_include.size(), _include.size()))<<endl;
784
785
786 return (true);
787 }
788
789
790
791