1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementations of functions for estimating the LD structure
5 *
6 * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13 #include "gcta.h"
14
set_ldscore_adj_flag(bool ldscore_adj)15 void gcta::set_ldscore_adj_flag(bool ldscore_adj){
16 _ldscore_adj = ldscore_adj;
17 }
18
read_LD_target_SNPs(string snplistfile)19 void gcta::read_LD_target_SNPs(string snplistfile)
20 {
21 // Read snplist file
22 _ld_target_snp.clear();
23 read_snplist(snplistfile, _ld_target_snp);
24
25 int i = 0, prev_target_snp_num = _ld_target_snp.size(), prev_include_size = _include.size();
26 map<string, int> snp_map_buf;
27 map<string, int>::iterator iter;
28 for (i = 0; i < _snp_num; i++) snp_map_buf.insert(pair<string, int>(_snp_name[i], i));
29 for (i = _ld_target_snp.size() - 1; i >= 0; i--) {
30 iter = snp_map_buf.find(_ld_target_snp[i]);
31 if (iter != snp_map_buf.end()) _snp_name_map.insert(*iter);
32 else _ld_target_snp.erase(_ld_target_snp.begin() + i);
33 }
34 _include.clear();
35 for (iter = _snp_name_map.begin(); iter != _snp_name_map.end(); iter++) _include.push_back(iter->second);
36 stable_sort(_include.begin(), _include.end());
37 if (_ld_target_snp.size() == 0) throw ("Error: no target SNPs are retained to estimate the LD structure.");
38 else cout << prev_target_snp_num << " target SNPs read from [" + snplistfile + "], " << _ld_target_snp.size() << " of which exist in the data." << endl;
39 }
40
LD_Blocks(int stp,double wind_size,double alpha,bool IncldQ,bool save_ram)41 void gcta::LD_Blocks(int stp, double wind_size, double alpha, bool IncldQ, bool save_ram)
42 {
43 int i = 0, j = 0;
44 if (_mu.empty()) calcu_mu();
45
46 // Read snplist file
47 vector<int> smpl_buf, smpl;
48 vector<string> uni_snp;
49 for (i = 0; i < _include.size(); i++) uni_snp.push_back(_snp_name[_include[i]]);
50 StrFunc::match(_ld_target_snp, uni_snp, smpl_buf);
51 for (i = 0; i < smpl_buf.size(); i++) {
52 if (smpl_buf[i]>-1) smpl.push_back(smpl_buf[i]);
53 }
54 int SNP_SmplNum = smpl.size(); // smpl is the position of _include
55
56 // Calculate LD structure
57 cout << "Estimating LD structure..." << endl;
58 vector<int> K(SNP_SmplNum);
59 vector<double> r2(SNP_SmplNum), md_r2(SNP_SmplNum), max_r2(SNP_SmplNum), dL(SNP_SmplNum), dR(SNP_SmplNum);
60 vector<string> max_r2_snp(SNP_SmplNum);
61 vector< vector<double> > r(SNP_SmplNum);
62 vector<string> L_SNP(SNP_SmplNum), R_SNP(SNP_SmplNum);
63 vector< vector<string> > snp_ls(SNP_SmplNum);
64 EstLD(smpl, wind_size, snp_ls, r, r2, md_r2, max_r2, max_r2_snp, dL, dR, K, L_SNP, R_SNP, alpha, IncldQ);
65
66 // Save result
67 string SavFileName = _out + ".rsq.ld";
68 ofstream SavFile(SavFileName.c_str());
69 if (!SavFile) throw ("Error: can not open the file [" + SavFileName + "] to save result!");
70 SavFile << "target_SNP\tfreq\tL_region\tR_region\tL_snp\tR_snp\tnSNPs\tmean_rsq\tmedian_rsq\tmax_rsq\tmax_rsq_snp" << endl;
71 for (i = 0; i < SNP_SmplNum; i++) SavFile << _snp_name[_include[smpl[i]]] << "\t" << 0.5 * _mu[_include[smpl[i]]] << "\t" << dL[i] << "\t" << dR[i] << "\t" << L_SNP[i] << "\t" << R_SNP[i] << "\t" << K[i] << "\t" << r2[i] << "\t" << md_r2[i] << "\t" << max_r2[i] << "\t" << max_r2_snp[i] << endl;
72 SavFile.close();
73 SavFileName = _out + ".r.ld";
74 SavFile.open(SavFileName.c_str());
75 if (!SavFile) throw ("Error: can not open the file [" + SavFileName + "] to save result.");
76 for (i = 0; i < SNP_SmplNum; i++) {
77 for (j = 0; j < r[i].size(); j++) SavFile << r[i][j] << " ";
78 SavFile << endl;
79 }
80 SavFile.close();
81 SavFileName = _out + ".snp.ld";
82 SavFile.open(SavFileName.c_str());
83 if (!SavFile) throw ("Can not open the file [" + SavFileName + "] to save result.");
84 for (i = 0; i < SNP_SmplNum; i++) {
85 for (j = 0; j < snp_ls[i].size(); j++) SavFile << snp_ls[i][j] << " ";
86 SavFile << endl;
87 }
88 SavFile.close();
89 cout << "Results have been saved in [" + _out + ".rsq.ld]" + ", [" + _out + ".r.ld]" + " and [" + _out + ".snp.ld].\n" << endl;
90 }
91
EstLD(vector<int> & smpl,double wind_size,vector<vector<string>> & snp,vector<vector<double>> & r,vector<double> & r2,vector<double> & md_r2,vector<double> & max_r2,vector<string> & max_r2_snp,vector<double> & dL,vector<double> & dR,vector<int> & K,vector<string> & L_SNP,vector<string> & R_SNP,double alpha,bool IncldQ)92 void gcta::EstLD(vector<int> &smpl, double wind_size, vector< vector<string> > &snp, vector< vector<double> > &r, vector<double> &r2, vector<double> &md_r2, vector<double> &max_r2, vector<string> &max_r2_snp, vector<double> &dL, vector<double> &dR, vector<int> &K, vector<string> &L_SNP, vector<string> &R_SNP, double alpha, bool IncldQ)
93 {
94 int i = 0, j = 0, L = 0, R = 0, maxL = 0, maxR = 0, i_buf = 0;
95 map<int, int> smpl_snp_map;
96 for (i = 0; i < smpl.size(); i++) smpl_snp_map.insert(pair<int, int>(_include[smpl[i]], i));
97
98 cout << "Parameters used to search SNPs in LD with the given SNPs: window size=" << (int) (wind_size * 0.001) << "Kb, significant level=" << alpha << endl;
99 vector<double> rst, y, x;
100 for (i = 0; i < smpl.size(); i++) {
101 vector<int> buf;
102 vector<double> r_buf, rsq_buf;
103 maxL = maxR = L = R = smpl[i];
104 makex(L, y);
105 if (IncldQ) {
106 buf.push_back(L);
107 rsq_buf.push_back(1.0);
108 r_buf.push_back(1.0);
109 }
110 while (1) {
111 if (R == _include.size() - 1) break;
112 if (_chr[_include[R]] != _chr[_include[R + 1]]) break;
113 if (_bp[_include[R + 1]] - _bp[_include[smpl[i]]] > wind_size) break;
114 R++;
115 if (smpl_snp_map.find(_include[R]) != smpl_snp_map.end()) continue;
116 makex(R, x);
117 reg(y, x, rst);
118 if (rst[2] < alpha) {
119 maxR = R;
120 buf.push_back(R);
121 rsq_buf.push_back(rst[3]);
122 r_buf.push_back(rst[4]);
123 }
124 }
125 while (1) {
126 if (L == 0) break;
127 if (_chr[_include[L]] != _chr[_include[L - 1]]) break;
128 if (_bp[_include[smpl[i]]] - _bp[_include[L - 1]] > wind_size) break;
129 L--;
130 if (smpl_snp_map.find(_include[L]) != smpl_snp_map.end()) continue;
131 makex(L, x);
132 reg(y, x, rst);
133 if (rst[2] < alpha) {
134 maxL = L;
135 buf.insert(buf.begin(), L);
136 rsq_buf.insert(rsq_buf.begin(), rst[3]);
137 r_buf.insert(r_buf.begin(), rst[4]);
138 }
139 }
140 if (buf.size() == 0) {
141 K[i] = 0;
142 dL[i] = 0;
143 dR[i] = 0;
144 L_SNP[i] = "NA";
145 R_SNP[i] = "NA";
146 r[i].push_back(0.0);
147 snp[i].push_back("NA");
148 r2[i] = 0.0;
149 md_r2[i] = 0.0;
150 max_r2[i] = 0.0;
151 max_r2_snp[i] = "NA";
152 } else {
153 K[i] = buf.size();
154 dL[i] = _bp[_include[smpl[i]]] - _bp[_include[maxL]];
155 dR[i] = _bp[_include[maxR]] - _bp[_include[smpl[i]]];
156 L_SNP[i] = _snp_name[_include[maxL]];
157 R_SNP[i] = _snp_name[_include[maxR]];
158 for (j = 0; j < K[i]; j++) {
159 r[i].push_back(r_buf[j]);
160 snp[i].push_back(_snp_name[_include[buf[j]]]);
161 }
162 r2[i] = CommFunc::mean(rsq_buf);
163 md_r2[i] = CommFunc::median(rsq_buf);
164 i_buf = max_element(rsq_buf.begin(), rsq_buf.end()) - rsq_buf.begin();
165 max_r2[i] = rsq_buf[i_buf];
166 max_r2_snp[i] = snp[i][i_buf];
167 }
168 cout << i + 1 << " of " << smpl.size() << " target SNPs.\r";
169 }
170 }
171
reg(vector<double> & y,vector<double> & x,vector<double> & rst,bool table)172 eigenMatrix gcta::reg(vector<double> &y, vector<double> &x, vector<double> &rst, bool table) {
173 int N = x.size();
174 if (N != y.size() || N < 1) throw ("Error: The lengths of x and y do not match.");
175
176 int i = 0;
177 double d_buf = 0.0, y_mu = 0.0, x_mu = 0.0, x_var = 0.0, y_var = 0.0, cov = 0.0;
178 for (i = 0; i < N; i++) {
179 x_mu += x[i];
180 y_mu += y[i];
181 }
182 x_mu /= (double) N;
183 y_mu /= (double) N;
184 for (i = 0; i < N; i++) {
185 d_buf = (x[i] - x_mu);
186 x_var += d_buf*d_buf;
187 d_buf = (y[i] - y_mu);
188 y_var += d_buf*d_buf;
189 }
190 x_var /= (double) (N - 1.0);
191 y_var /= (double) (N - 1.0);
192 for (i = 0; i < N; i++) cov += (x[i] - x_mu)*(y[i] - y_mu);
193 cov /= (double) (N - 1);
194 double a = 0.0, b = 0.0, sse = 0.0, a_se = 0.0, b_se = 0.0, p = 0.0, rsq = 0.0, r = 0.0;
195 if (x_var > 0.0) b = cov / x_var;
196 a = y_mu - b*x_mu;
197 for (i = 0; i < N; i++) {
198 d_buf = y[i] - a - b * x[i];
199 sse += d_buf*d_buf;
200 }
201 if (x_var > 0.0) {
202 a_se = sqrt((sse / (N - 2.0))*(1.0 / N + x_mu * x_mu / (x_var * (N - 1.0))));
203 b_se = sqrt(sse / x_var / (N - 1.0) / (N - 2.0));
204 }
205 if (x_var > 0.0 && y_var > 0.0) {
206 r = cov / sqrt(y_var * x_var);
207 rsq = r*r;
208 }
209 double t = 0.0;
210 if (b_se > 0.0) t = fabs(b / b_se);
211 p = StatFunc::t_prob(N - 2.0, t, true);
212 rst.clear();
213 rst.push_back(b);
214 rst.push_back(b_se);
215 rst.push_back(p);
216 rst.push_back(rsq);
217 rst.push_back(r);
218
219 eigenMatrix reg_sum(3, 3);
220 if (table) {
221 reg_sum(2, 0) = rsq;
222 reg_sum(1, 0) = b;
223 reg_sum(1, 1) = b_se;
224 reg_sum(1, 2) = p;
225 if (a_se > 0.0) t = fabs(a / a_se);
226 p = StatFunc::t_prob(N - 2.0, t, true);
227 reg_sum(0, 0) = a;
228 reg_sum(0, 1) = a_se;
229 reg_sum(0, 2) = p;
230 return (reg_sum);
231 }
232 return (reg_sum);
233 }
234
rm_cor_snp(int m,int start,float * rsq,double rsq_cutoff,vector<int> & rm_snp_ID1)235 void gcta::rm_cor_snp(int m, int start, float *rsq, double rsq_cutoff, vector<int> &rm_snp_ID1)
236 {
237 int i = 0, j = 0, i_buf = 0;
238
239 rm_snp_ID1.clear();
240 vector<int> rm_snp_ID2;
241 for (i = 0; i < m; i++) {
242 for (j = 0; j < i; j++) {
243 if (rsq[i * m + j] > rsq_cutoff) {
244 rm_snp_ID1.push_back(i + start);
245 rm_snp_ID2.push_back(j + start);
246 }
247 }
248 }
249 vector<int> rm_uni_ID(rm_snp_ID1);
250 rm_uni_ID.insert(rm_uni_ID.end(), rm_snp_ID2.begin(), rm_snp_ID2.end());
251 stable_sort(rm_uni_ID.begin(), rm_uni_ID.end());
252 rm_uni_ID.erase(unique(rm_uni_ID.begin(), rm_uni_ID.end()), rm_uni_ID.end());
253 map<int, int> rm_uni_ID_count;
254 for (i = 0; i < rm_uni_ID.size(); i++) {
255 i_buf = count(rm_snp_ID1.begin(), rm_snp_ID1.end(), rm_uni_ID[i]) + count(rm_snp_ID2.begin(), rm_snp_ID2.end(), rm_uni_ID[i]);
256 rm_uni_ID_count.insert(pair<int, int>(rm_uni_ID[i], i_buf));
257 }
258 map<int, int>::iterator iter1, iter2;
259 #pragma omp parallel for
260 for (i = 0; i < rm_snp_ID1.size(); i++) {
261 iter1 = rm_uni_ID_count.find(rm_snp_ID1[i]);
262 iter2 = rm_uni_ID_count.find(rm_snp_ID2[i]);
263 if (iter1->second < iter2->second) {
264 i_buf = rm_snp_ID1[i];
265 rm_snp_ID1[i] = rm_snp_ID2[i];
266 rm_snp_ID2[i] = i_buf;
267 }
268 }
269 stable_sort(rm_snp_ID1.begin(), rm_snp_ID1.end());
270 rm_snp_ID1.erase(unique(rm_snp_ID1.begin(), rm_snp_ID1.end()), rm_snp_ID1.end());
271 }
272
calcu_mean_rsq(int wind_size,double rsq_cutoff,bool dominance_flag)273 void gcta::calcu_mean_rsq(int wind_size, double rsq_cutoff, bool dominance_flag)
274 {
275 check_autosome();
276
277 int i = 0, m = _include.size();
278
279 cout << "\nCalculating LD score for SNPs (block size of " << wind_size / 1000 << "Kb with an overlap of "<<wind_size/2000<<"Kb between blocks); LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
280 if(dominance_flag) cout<<"(SNP genotypes are coded for dominance effects)"<<endl;
281 if(_ldscore_adj) cout << "LD rsq will be adjusted for chance correlation, i.e. rsq_adj = rsq - (1 - rsq) / (n -2)." << endl;
282 vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
283 get_ld_blk_pnt(brk_pnt1, brk_pnt2, brk_pnt3, wind_size);
284
285 eigenVector mean_rsq = eigenVector::Zero(m), snp_num = eigenVector::Zero(m), max_rsq = eigenVector::Zero(m);
286 calcu_ld_blk(brk_pnt1, brk_pnt3, mean_rsq, snp_num, max_rsq, false, rsq_cutoff, dominance_flag);
287 if (brk_pnt2.size() > 1) calcu_ld_blk(brk_pnt2, brk_pnt3, mean_rsq, snp_num, max_rsq, true, rsq_cutoff, dominance_flag);
288
289 string mrsq_file = "";
290 if(dominance_flag) mrsq_file = _out + ".d.score.ld";
291 else mrsq_file = _out + ".score.ld";
292 ofstream o_mrsq(mrsq_file.data());
293 o_mrsq<<"SNP chr bp MAF mean_rsq snp_num max_rsq ldscore"<<endl;
294 double ldscore = 0.0;
295 for (i = 0; i < m; i++){
296 o_mrsq << _snp_name[_include[i]] << " " << _chr[_include[i]] << " " << _bp[_include[i]] << " ";
297 double MAF = 0.5 * _mu[_include[i]];
298 if(MAF > 0.5) MAF = 1.0 - MAF;
299 ldscore = 1.0 + mean_rsq[i] * snp_num[i];
300 o_mrsq << MAF << " " << mean_rsq[i] << " " << snp_num[i] << " " << max_rsq[i] << " " << ldscore << "\n";
301 }
302 o_mrsq << endl;
303 cout << "LD score for " << m << " SNPs have been saved in the file [" + mrsq_file + "]." << endl;
304 }
305
calcu_ssx_sqrt_i(eigenVector & ssx_sqrt_i)306 void gcta::calcu_ssx_sqrt_i(eigenVector &ssx_sqrt_i)
307 {
308 int i = 0, m = _include.size();
309 ssx_sqrt_i.resize(m);
310 for (i = 0; i < m; i++){
311 ssx_sqrt_i[i] = _geno.col(i).squaredNorm();
312 if (ssx_sqrt_i[i] < 1.0e-30) ssx_sqrt_i[i] = 0.0;
313 else ssx_sqrt_i[i] = 1.0 / sqrt(ssx_sqrt_i[i]);
314 }
315 }
316
get_ld_blk_pnt(vector<int> & brk_pnt1,vector<int> & brk_pnt2,vector<int> & brk_pnt3,int wind_bp,int wind_snp)317 void gcta::get_ld_blk_pnt(vector<int> &brk_pnt1, vector<int> &brk_pnt2, vector<int> &brk_pnt3, int wind_bp, int wind_snp)
318 {
319 unsigned long i = 0, j = 0, k = 0, m = _include.size();
320
321 brk_pnt1.clear();
322 brk_pnt1.push_back(0);
323 bool chr_start = true;
324 for (i = 1, j = 0; i < m; i++) {
325 if (i == (m - 1)){
326 if(chr_start
327 || ((_bp[_include[i]] - _bp[_include[brk_pnt1[j]]] > 0.5 * wind_bp)
328 && (i - brk_pnt1[j] > 0.5 * wind_snp))) brk_pnt1.push_back(m - 1);
329 else brk_pnt1[j - 1] = brk_pnt1[j] = m - 1;
330 }
331 else if (_chr[_include[i]] != _chr[_include[brk_pnt1[j]]] || _bp[_include[i]] - _bp[_include[i-1]] > 1e6) {
332 if(chr_start
333 || ((_bp[_include[i-1]] - _bp[_include[brk_pnt1[j]]] > 0.5 * wind_bp)
334 && (i - 1 - brk_pnt1[j] > 0.5 * wind_snp))){
335 brk_pnt1.push_back(i - 1);
336 j++;
337 brk_pnt1.push_back(i);
338 j++;
339 }
340 else{
341 brk_pnt1[j - 1] = i - 1;
342 brk_pnt1[j] = i;
343 }
344 chr_start = true;
345 }
346 else if ((_bp[_include[i]] - _bp[_include[brk_pnt1[j]]] > wind_bp) && (i - brk_pnt1[j] >= wind_snp)) {
347 chr_start = false;
348 brk_pnt1.push_back(i - 1);
349 j++;
350 brk_pnt1.push_back(i);
351 j++;
352 }
353 }
354 stable_sort(brk_pnt1.begin(), brk_pnt1.end());
355 brk_pnt1.erase(unique(brk_pnt1.begin(), brk_pnt1.end()), brk_pnt1.end());
356
357 brk_pnt2.clear();
358 brk_pnt3.clear();
359 for (i = 1; i < brk_pnt1.size() && brk_pnt1.size() > 2; i++) {
360 if ((_chr[_include[brk_pnt1[i - 1]]] == _chr[_include[brk_pnt1[i]]]) && (brk_pnt1[i] - brk_pnt1[i - 1] > 1)) {
361 int i_buf = (brk_pnt1[i - 1] + brk_pnt1[i]) / 2;
362 brk_pnt2.push_back(i_buf);
363 brk_pnt2.push_back(i_buf + 1);
364 brk_pnt3.push_back(brk_pnt1[i]);
365 brk_pnt3.push_back(brk_pnt1[i]);
366 }
367 }
368 }
369
calcu_ld_blk(vector<int> & brk_pnt,vector<int> & brk_pnt3,eigenVector & mean_rsq,eigenVector & snp_num,eigenVector & max_rsq,bool second,double rsq_cutoff,bool dominance_flag)370 void gcta::calcu_ld_blk(vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff, bool dominance_flag)
371 {
372 int i = 0, j = 0, k = 0, s1 = 0, s2 = 0, n = _keep.size(), m = _include.size(), size = 0, size_limit = 10000;
373
374 for (i = 0; i < brk_pnt.size() - 1; i++) {
375 if (_chr[_include[brk_pnt[i]]] != _chr[_include[brk_pnt[i + 1]]]) continue;
376 size = brk_pnt[i + 1] - brk_pnt[i] + 1;
377 if (size < 3) continue;
378 if (second) {
379 s1 = brk_pnt3[i] - brk_pnt[i];
380 s2 = s1 + 1;
381 }
382 else {
383 s1 = 0;
384 s2 = size - 1;
385 }
386
387 eigenVector rsq_size(size), mean_rsq_sub(size), max_rsq_sub = eigenVector::Constant(size, -1.0);
388
389 // make genotype matrix
390 vector<int> snp_indx(size);
391 for (j = brk_pnt[i], k = 0; j <= brk_pnt[i + 1]; j++, k++) snp_indx[k] = j;
392 MatrixXf X_sub;
393 if(dominance_flag) make_XMat_d_subset(X_sub, snp_indx, true);
394 else make_XMat_subset(X_sub, snp_indx, true);
395 eigenVector ssx_sqrt_i_sub(size);
396 for (j = 0; j < size; j++){
397 ssx_sqrt_i_sub[j] = X_sub.col(j).squaredNorm();
398 if (ssx_sqrt_i_sub[j] < 1.0e-30) ssx_sqrt_i_sub[j] = 0.0;
399 else ssx_sqrt_i_sub[j] = 1.0 / sqrt(ssx_sqrt_i_sub[j]);
400 }
401
402 if (size > size_limit) calcu_ld_blk_split(size, size_limit, X_sub, ssx_sqrt_i_sub, rsq_cutoff, rsq_size, mean_rsq_sub, max_rsq_sub, s1, s2, second);
403 else {
404 MatrixXf rsq_sub = X_sub.transpose() * X_sub;
405 #pragma omp parallel for private(k)
406 for (j = 0; j < size; j++) {
407 rsq_size[j] = 0.0;
408 mean_rsq_sub[j] = 0.0;
409 for (k = 0; k < size; k++) {
410 if (second) {
411 if (j <= s1 && k <= s1) continue;
412 if (j >= s2 && k >= s2) continue;
413 }
414 if (k == j) continue;
415 rsq_sub(j,k) *= (ssx_sqrt_i_sub[j] * ssx_sqrt_i_sub[k]);
416 rsq_sub(j,k) = rsq_sub(j,k) * rsq_sub(j,k);
417 if (rsq_sub(j,k) >= rsq_cutoff) {
418 if(_ldscore_adj) mean_rsq_sub[j] += rsq_sub(j,k) - (1.0 - rsq_sub(j,k)) / (n - 2.0);
419 else mean_rsq_sub[j] += rsq_sub(j,k);
420 rsq_size[j] += 1.0;
421 }
422 if (rsq_sub(j,k) > max_rsq_sub[j]) max_rsq_sub[j] = rsq_sub(j,k);
423 }
424 if (rsq_size[j] > 0.0) mean_rsq_sub[j] /= rsq_size[j];
425 }
426 }
427
428 for (j = 0, k = brk_pnt[i]; j < size; j++, k++) {
429 if (second) {
430 if (rsq_size[j] > 0.0) {
431 mean_rsq[k] = (mean_rsq[k] * snp_num[k] + mean_rsq_sub[j] * rsq_size[j]) / (snp_num[k] + rsq_size[j]);
432 snp_num[k] = (snp_num[k] + rsq_size[j]);
433 if(max_rsq[k] < max_rsq_sub[j]) max_rsq[k] = max_rsq_sub[j];
434 }
435 }
436 else {
437 mean_rsq[k] = mean_rsq_sub[j];
438 snp_num[k] = rsq_size[j];
439 max_rsq[k] = max_rsq_sub[j];
440 }
441 }
442 }
443 }
444
calcu_ld_blk_split(int size,int size_limit,MatrixXf & X_sub,eigenVector & ssx_sqrt_i_sub,double rsq_cutoff,eigenVector & rsq_size,eigenVector & mean_rsq_sub,eigenVector & max_rsq_sub,int s1,int s2,bool second)445 void gcta::calcu_ld_blk_split(int size, int size_limit, MatrixXf &X_sub, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second)
446 {
447 int i = 0, j = 0, k = 0, m = 0, n = _keep.size();
448 vector<int> brk_pnt_sub;
449 brk_pnt_sub.push_back(0);
450 for (i = size_limit; i < size - size_limit; i += size_limit) {
451 brk_pnt_sub.push_back(i - 1);
452 brk_pnt_sub.push_back(i);
453 j = i;
454 }
455 j = (size - j) / 2 + j;
456 brk_pnt_sub.push_back(j - 1);
457 brk_pnt_sub.push_back(j);
458 brk_pnt_sub.push_back(size - 1);
459
460 for (i = 0; i < brk_pnt_sub.size() - 1; i++) {
461 int size_sub = brk_pnt_sub[i + 1] - brk_pnt_sub[i] + 1;
462 if (size_sub < 3) continue;
463
464 eigenVector ssx_sqrt_i_sub_sub = ssx_sqrt_i_sub.segment(brk_pnt_sub[i], size_sub);
465 MatrixXf rsq_sub_sub = X_sub.block(0,brk_pnt_sub[i],n,size_sub).transpose() * X_sub;
466 eigenVector rsq_size_sub(size_sub), mean_rsq_sub_sub(size_sub), max_rsq_sub_sub = eigenVector::Constant(size_sub, -1.0);
467
468 #pragma omp parallel for private(k)
469 for (j = 0; j < size_sub; j++) {
470 unsigned long s = j + brk_pnt_sub[i];
471 rsq_size_sub[j] = 0.0;
472 mean_rsq_sub_sub[j] = 0.0;
473 for (k = 0; k < size; k++) {
474 if (second) {
475 if (s <= s1 && k <= s1) continue;
476 if (s >= s2 && k >= s2) continue;
477 }
478 if (k == s) continue;
479 rsq_sub_sub(j,k) *= (ssx_sqrt_i_sub_sub[j] * ssx_sqrt_i_sub[k]);
480 rsq_sub_sub(j,k) = rsq_sub_sub(j,k) * rsq_sub_sub(j,k);
481 if (rsq_sub_sub(j,k) >= rsq_cutoff) {
482 if(_ldscore_adj) mean_rsq_sub_sub[j] += rsq_sub_sub(j,k) - (1.0 - rsq_sub_sub(j,k)) / (n - 2.0);
483 else mean_rsq_sub_sub[j] += rsq_sub_sub(j,k);
484 rsq_size_sub[j] += 1.0;
485 }
486 if(rsq_sub_sub(j,k) > max_rsq_sub_sub[j]) max_rsq_sub_sub[j] = rsq_sub_sub(j,k);
487 }
488
489 if (rsq_size_sub[j] > 0.0) mean_rsq_sub_sub[j] /= rsq_size_sub[j];
490 }
491
492 for (j = 0, k = brk_pnt_sub[i]; j < size_sub; j++, k++) {
493 mean_rsq_sub[k] = mean_rsq_sub_sub[j];
494 rsq_size[k] = rsq_size_sub[j];
495 max_rsq_sub[k] = max_rsq_sub_sub[j];
496 }
497 }
498 }
499
calcu_mean_rsq_multiSet(string snpset_filenames_file,int wind_size,double rsq_cutoff,bool dominance_flag)500 void gcta::calcu_mean_rsq_multiSet(string snpset_filenames_file, int wind_size, double rsq_cutoff, bool dominance_flag)
501 {
502 check_autosome();
503 int i = 0, j = 0, m = _include.size();
504
505 ifstream in_snpset_filenames(snpset_filenames_file.c_str());
506 if (!in_snpset_filenames) throw ("Error: can not open the file [" + snpset_filenames_file + "] to read.");
507 string str_buf;
508 vector<string> snpset_filenaems, vs_buf;
509 while (getline(in_snpset_filenames, str_buf)) {
510 if (!str_buf.empty()) {
511 if (StrFunc::split_string(str_buf, vs_buf) == 1) snpset_filenaems.push_back(vs_buf[0]);
512 }
513 }
514 in_snpset_filenames.close();
515
516 int set_num = snpset_filenaems.size();
517 cout << "There are " << set_num << " filenames specified in [" + snpset_filenames_file + "]." << endl;
518 if (set_num < 1) throw ("Error: no filename found in [" + snpset_filenames_file + "].");
519
520 vector< vector<string> > snplist(set_num);
521 for(i = 0; i < set_num; i++) {
522 ifstream i_snplist(snpset_filenaems[i].c_str());
523 if (!i_snplist) throw ("Error: can not open the file [" + snpset_filenaems[i] + "] to read.");
524 cout << "Reading the list of SNPs from [" + snpset_filenaems[i] + "]." << endl;
525 while (i_snplist >> str_buf) {
526 snplist[i].push_back(str_buf);
527 getline(i_snplist, str_buf);
528 }
529 i_snplist.close();
530 }
531 vector< vector<bool> > set_flag(set_num);
532 for(i = 0; i < set_num; i++){
533 set_flag[i].resize(m);
534 for(j = 0; j < m; j++) set_flag[i][j] = false;
535 }
536
537 map<string, int> snp_map;
538 for (i = 0; i < m; i++) snp_map.insert(pair<string, int>(_snp_name[_include[i]], i));
539 map<string, int>::iterator iter, end = snp_map.end();
540 for(i = 0; i < set_num; i++){
541 int snp_count = 0;
542 for(j = 0; j < snplist[i].size(); j++){
543 iter = snp_map.find(snplist[i][j]);
544 if(iter == end) continue;
545 set_flag[i][iter->second] = true;
546 snp_count++;
547 }
548 cout << snp_count << " SNPs included from [" << snpset_filenaems[i] << "]. " <<endl;
549 }
550
551 cout << "\nCalculating multi-component LD score for SNPs (block size of " << wind_size / 1000 << "Kb with an overlap of "<<wind_size/2000<<"Kb between blocks); LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
552 if(dominance_flag) cout<<"(SNP genotypes are coded for dominance effects)"<<endl;
553 if(_ldscore_adj) cout << "LD rsq will be adjusted for chance correlation, i.e. rsq_adj = rsq - (1 - rsq) / (n -2)." << endl;
554 vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
555 get_ld_blk_pnt(brk_pnt1, brk_pnt2, brk_pnt3, wind_size);
556
557 vector<eigenVector> mean_rsq(set_num), snp_num(set_num), max_rsq(set_num);
558 for(i = 0; i < set_num; i++){
559 mean_rsq[i] = eigenVector::Zero(m);
560 snp_num[i] = eigenVector::Zero(m);
561 max_rsq[i] = eigenVector::Zero(m);
562 }
563 calcu_ld_blk_multiSet(brk_pnt1, brk_pnt3, set_flag, mean_rsq, snp_num, max_rsq, false, rsq_cutoff, dominance_flag);
564 if (brk_pnt2.size() > 1) calcu_ld_blk_multiSet(brk_pnt2, brk_pnt3, set_flag, mean_rsq, snp_num, max_rsq, true, rsq_cutoff, dominance_flag);
565
566 string mrsq_file = "";
567 double ldscore = 0.0;
568 if(dominance_flag) mrsq_file = _out + ".d.multi_score.ld";
569 else mrsq_file = _out + ".multi_score.ld";
570 ofstream o_mrsq(mrsq_file.data());
571 o_mrsq<<"SNP chr bp MAF";
572 for(j = 0; j < set_num; j++) o_mrsq << " mean_rsq_" << j+1 << " snp_num_" << j+1 << " max_rsq" << j+1 << " ldscore" << j+1;
573 o_mrsq << endl;
574 for (i = 0; i < m; i++){
575 o_mrsq << _snp_name[_include[i]] << " " << _chr[_include[i]] << " " << _bp[_include[i]] << " ";
576 double MAF = 0.5 * _mu[_include[i]];
577 if(MAF > 0.5) MAF = 1.0 - MAF;
578 o_mrsq << MAF;
579 for(j = 0; j < set_num; j++){
580 ldscore = 1.0 + (mean_rsq[j])[i] * (snp_num[j])[i];
581 o_mrsq << " " << (mean_rsq[j])[i] << " " << (snp_num[j])[i] << " " << (max_rsq[j])[i];
582 }
583 o_mrsq << endl;
584 }
585 o_mrsq << endl;
586 cout << "LD score for " << m << " SNPs have been saved in the file [" + mrsq_file + "]." << endl;
587 }
588
calcu_ld_blk_multiSet(vector<int> & brk_pnt,vector<int> & brk_pnt3,vector<vector<bool>> & set_flag,vector<eigenVector> & mean_rsq,vector<eigenVector> & snp_num,vector<eigenVector> & max_rsq,bool second,double rsq_cutoff,bool dominance_flag)589 void gcta::calcu_ld_blk_multiSet(vector<int> &brk_pnt, vector<int> &brk_pnt3, vector< vector<bool> > &set_flag, vector<eigenVector> &mean_rsq, vector<eigenVector> &snp_num, vector<eigenVector> &max_rsq, bool second, double rsq_cutoff, bool dominance_flag)
590 {
591 int i = 0, j = 0, k = 0, l = 0, s1 = 0, s2 = 0, n = _keep.size(), m = _include.size(), size = 0, size_limit = 10000;
592 int s = 0, set_num = set_flag.size();
593
594 for (i = 0; i < brk_pnt.size() - 1; i++) {
595 if (_chr[_include[brk_pnt[i]]] != _chr[_include[brk_pnt[i + 1]]]) continue;
596 size = brk_pnt[i + 1] - brk_pnt[i] + 1;
597 if (size < 3) continue;
598 if (second) {
599 s1 = brk_pnt3[i] - brk_pnt[i];
600 s2 = s1 + 1;
601 }
602 else {
603 s1 = 0;
604 s2 = size - 1;
605 }
606
607 vector<int> snp_indx(size);
608 for (j = brk_pnt[i], k = 0; j <= brk_pnt[i + 1]; j++, k++) snp_indx[k] = j;
609 MatrixXf X_sub;
610 if(dominance_flag) make_XMat_d_subset(X_sub, snp_indx, true);
611 else make_XMat_subset(X_sub, snp_indx, true);
612 eigenVector ssx_sqrt_i_sub(size);
613 for (j = 0; j < size; j++){
614 ssx_sqrt_i_sub[j] = X_sub.col(j).squaredNorm();
615 if (ssx_sqrt_i_sub[j] < 1.0e-30) ssx_sqrt_i_sub[j] = 0.0;
616 else ssx_sqrt_i_sub[j] = 1.0 / sqrt(ssx_sqrt_i_sub[j]);
617 }
618
619 for(s = 0; s < set_num; s++){
620 vector<int> used_in_this_set;
621 vector<bool> set_flag_sub(size);
622 for (j = 0, k = brk_pnt[i]; j < size; j++, k++){
623 if(set_flag[s][k] == true){
624 used_in_this_set.push_back(j);
625 set_flag_sub[j] = true;
626 }
627 else set_flag_sub[j] = false;
628 }
629
630 int size_of_this_set = used_in_this_set.size();
631 if(size_of_this_set < 1) continue;
632
633 MatrixXf X_sub2(n, size_of_this_set);
634 for(j = 0; j < size_of_this_set; j++) X_sub2.col(j) = X_sub.col(used_in_this_set[j]);
635
636 eigenVector rsq_size = eigenVector::Zero(size);
637 eigenVector mean_rsq_sub = eigenVector::Zero(size);
638 eigenVector max_rsq_sub = eigenVector::Constant(size, -1.0);
639
640 if (size > size_limit) calcu_ld_blk_split_multiSet(size, size_limit, X_sub, X_sub2, used_in_this_set, ssx_sqrt_i_sub, rsq_cutoff, rsq_size, mean_rsq_sub, max_rsq_sub, s1, s2, second);
641 else {
642 MatrixXf rsq_sub = X_sub.transpose() * X_sub2;
643
644 #pragma omp parallel for private(k,l)
645 for (j = 0; j < size; j++) {
646 rsq_size[j] = 0.0;
647 mean_rsq_sub[j] = 0.0;
648 for (k = 0; k < size_of_this_set; k++) {
649 l = used_in_this_set[k];
650 if (second) {
651 if (j <= s1 && l <= s1) continue;
652 if (j >= s2 && l >= s2) continue;
653 }
654 if (l == j) continue;
655 rsq_sub(j,k) *= (ssx_sqrt_i_sub[j] * ssx_sqrt_i_sub[l]);
656 rsq_sub(j,k) = rsq_sub(j,k) * rsq_sub(j,k);
657 if (rsq_sub(j,k) >= rsq_cutoff) {
658 if(_ldscore_adj) mean_rsq_sub[j] += rsq_sub(j,k) - (1.0 - rsq_sub(j,k)) / (n - 2.0);
659 else mean_rsq_sub[j] += rsq_sub(j,k);
660 rsq_size[j] += 1.0;
661 }
662 if (rsq_sub(j,k) > max_rsq_sub[j]) max_rsq_sub[j] = rsq_sub(j,k);
663 }
664 if (rsq_size[j] > 0.0) mean_rsq_sub[j] /= rsq_size[j];
665 }
666 }
667 for (j = 0, k = brk_pnt[i]; j < size; j++, k++) {
668 if (second) {
669 if (rsq_size[j] > 0.0) {
670 (mean_rsq[s])[k] = ((mean_rsq[s])[k] * (snp_num[s])[k] + mean_rsq_sub[j] * rsq_size[j]) / ((snp_num[s])[k] + rsq_size[j]);
671 (snp_num[s])[k] = ((snp_num[s])[k] + rsq_size[j]);
672 if((max_rsq[s])[k] < max_rsq_sub[j]) (max_rsq[s])[k] = max_rsq_sub[j];
673 }
674 }
675 else {
676 (mean_rsq[s])[k] = mean_rsq_sub[j];
677 (snp_num[s])[k] = rsq_size[j];
678 (max_rsq[s])[k] = max_rsq_sub[j];
679 }
680 }
681 }
682 }
683 }
684
calcu_ld_blk_split_multiSet(int size,int size_limit,MatrixXf & X_sub,MatrixXf & X_sub2,vector<int> & used_in_this_set,eigenVector & ssx_sqrt_i_sub,double rsq_cutoff,eigenVector & rsq_size,eigenVector & mean_rsq_sub,eigenVector & max_rsq_sub,int s1,int s2,bool second)685 void gcta::calcu_ld_blk_split_multiSet(int size, int size_limit, MatrixXf &X_sub, MatrixXf &X_sub2, vector<int> &used_in_this_set, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second)
686 {
687 int i = 0, j = 0, k = 0, l = 0, m = 0, n = _keep.size();
688 vector<int> brk_pnt_sub;
689 brk_pnt_sub.push_back(0);
690 for (i = size_limit; i < size - size_limit; i += size_limit) {
691 brk_pnt_sub.push_back(i - 1);
692 brk_pnt_sub.push_back(i);
693 j = i;
694 }
695 j = (size - j) / 2 + j;
696 brk_pnt_sub.push_back(j - 1);
697 brk_pnt_sub.push_back(j);
698 brk_pnt_sub.push_back(size - 1);
699
700 for (i = 0; i < brk_pnt_sub.size() - 1; i++) {
701 int size_sub = brk_pnt_sub[i + 1] - brk_pnt_sub[i] + 1;
702 if (size_sub < 3) continue;
703
704 eigenVector ssx_sqrt_i_sub_sub = ssx_sqrt_i_sub.segment(brk_pnt_sub[i], size_sub);
705 MatrixXf rsq_sub_sub = X_sub.block(0,brk_pnt_sub[i],n,size_sub).transpose() * X_sub2;
706 eigenVector rsq_size_sub = eigenVector::Zero(size_sub);
707 eigenVector mean_rsq_sub_sub = eigenVector::Zero(size_sub);
708 eigenVector max_rsq_sub_sub = eigenVector::Constant(size_sub, -1.0);
709
710 #pragma omp parallel for private(k, l)
711 for (j = 0; j < size_sub; j++) {
712 unsigned long s = j + brk_pnt_sub[i];
713 rsq_size_sub[j] = 0.0;
714 mean_rsq_sub_sub[j] = 0.0;
715 for (k = 0; k < used_in_this_set.size(); k++) {
716 l = used_in_this_set[k];
717 if (second) {
718 if (s <= s1 && l <= s1) continue;
719 if (s >= s2 && l >= s2) continue;
720 }
721 if (l == s) continue;
722 rsq_sub_sub(j,k) *= (ssx_sqrt_i_sub_sub[j] * ssx_sqrt_i_sub[l]);
723 rsq_sub_sub(j,k) = rsq_sub_sub(j,k) * rsq_sub_sub(j,k);
724 if (rsq_sub_sub(j,k) >= rsq_cutoff) {
725 if(_ldscore_adj) mean_rsq_sub_sub[j] += rsq_sub_sub(j,k) - (1.0 - rsq_sub_sub(j,k)) / (n - 2.0);
726 else mean_rsq_sub_sub[j] += rsq_sub_sub(j,k);
727 rsq_size_sub[j] += 1.0;
728 }
729 if(rsq_sub_sub(j,k) > max_rsq_sub_sub[j]) max_rsq_sub_sub[j] = rsq_sub_sub(j,k);
730 }
731
732 if (rsq_size_sub[j] > 0.0) mean_rsq_sub_sub[j] /= rsq_size_sub[j];
733 }
734
735 for (j = 0, k = brk_pnt_sub[i]; j < size_sub; j++, k++) {
736 mean_rsq_sub[k] = mean_rsq_sub_sub[j];
737 rsq_size[k] = rsq_size_sub[j];
738 max_rsq_sub[k] = max_rsq_sub_sub[j];
739 }
740 }
741 }
742
ld_seg(string i_ld_file,int seg_size,int wind_size,double rsq_cutoff,bool dominance_flag)743 void gcta::ld_seg(string i_ld_file, int seg_size, int wind_size, double rsq_cutoff, bool dominance_flag)
744 {
745 int i = 0, j = 0, k = 0, m = 0;
746 vector<float> mrsq, snp_num, max_rsq, ldscore;
747 vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
748
749 if(!i_ld_file.empty()){
750 ifstream ild(i_ld_file.c_str());
751 if (!ild) throw ("Error: can not open the file [" + i_ld_file + "] to read.");
752
753 string str_buf;
754 cout << "Reading per-SNP LD score from [" + i_ld_file + "] ..." << endl;
755 getline(ild, str_buf); // get the header
756 while(getline(ild, str_buf)){
757 if(str_buf.size() > 0) m++;
758 }
759 ild.close();
760 ild.open(i_ld_file.c_str());
761 _snp_num = m;
762 _snp_name.resize(m);
763 _chr.resize(m);
764 _bp.resize(m);
765 _mu.resize(m);
766 mrsq.resize(m);
767 snp_num.resize(m);
768 max_rsq.resize(m);
769 ldscore.resize(m);
770 i = 0;
771 getline(ild, str_buf); // get the header
772 while (ild) {
773 ild >> str_buf;
774 if (ild.eof() || str_buf.size() < 1) break;
775 _snp_name[i] = str_buf;
776 if(!(ild >> _chr[i])) throw("Error: in the file [" + i_ld_file + "].");
777 if(!(ild >> _bp[i])) throw("Error: in the file [" + i_ld_file + "].");
778 if(!(ild >> _mu[i])) throw("Error: in the file [" + i_ld_file + "].");
779 _mu[i] *= 2.0;
780 if(!(ild >> mrsq[i])) throw("Error: in the file [" + i_ld_file + "].");
781 if(!(ild >> snp_num[i])) throw("Error: in the file [" + i_ld_file + "].");
782 if(!(ild >> max_rsq[i])) throw("Error: in the file [" + i_ld_file + "].");
783 if(!(ild >> ldscore[i])) throw("Error: in the file [" + i_ld_file + "].");
784 getline(ild, str_buf);
785 i++;
786 }
787 ild.close();
788 cout << "Per-SNP LD score for " << m << " SNPs read from [" + i_ld_file + "]." << endl;
789
790 _include.resize(m);
791 for(i = 0; i < m; i++) _include[i] = i;
792 }
793 else {
794 m = _include.size();
795 check_autosome();
796 cout << "\nCalculating LD score between SNPs (block size of " << wind_size / 1000 << "Kb with an overlap of "<<wind_size / 2000<<"Kb between blocks); LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
797 if(dominance_flag) cout<<"(SNP genotypes are coded for dominance effects)"<<endl;
798 get_ld_blk_pnt(brk_pnt1, brk_pnt2, brk_pnt3, wind_size);
799 eigenVector mrsq_buf = eigenVector::Zero(m), snp_num_buf = eigenVector::Zero(m), max_rsq_buf = eigenVector::Zero(m);
800 calcu_ld_blk(brk_pnt1, brk_pnt3, mrsq_buf, snp_num_buf, max_rsq_buf, false, rsq_cutoff, dominance_flag);
801 if (brk_pnt2.size() > 1) calcu_ld_blk(brk_pnt2, brk_pnt3, mrsq_buf, snp_num_buf, max_rsq_buf, true, rsq_cutoff, dominance_flag);
802 mrsq.resize(m);
803 snp_num.resize(m);
804 max_rsq.resize(m);
805 ldscore.resize(m);
806 for(i = 0; i < m; i++){
807 mrsq[i] = mrsq_buf(i);
808 snp_num[i] = snp_num_buf(i);
809 max_rsq[i] = max_rsq_buf(i);
810 ldscore[i] = 1.0 + mrsq_buf(i) * snp_num_buf(i);
811 }
812 }
813
814 cout << "Calculating regional mean LD score (region width = " << seg_size / 1000 << "Kb with an overlap of " << seg_size / 2000 << "Kb between regions) ... " << endl;
815
816 get_lds_brkpnt(brk_pnt1, brk_pnt2, seg_size);
817 int size = 0, mean_size = 0, count = 0;
818 for (i = 0; i < brk_pnt1.size() - 1; i++) {
819 size = brk_pnt1[i + 1] - brk_pnt1[i] + 1;
820 if (size > 2){
821 mean_size += size;
822 count++;
823 }
824 }
825 mean_size /= count;
826 get_lds_brkpnt(brk_pnt1, brk_pnt2, 0, mean_size);
827
828 vector<double> lds(m);
829 for (i = 0; i < brk_pnt1.size() - 1; i++){
830 size = brk_pnt1[i + 1] - brk_pnt1[i] + 1;
831 if(size < 3) continue;
832 double ld_score = 0.0;
833 for(j = brk_pnt1[i]; j <= brk_pnt1[i + 1]; j++) ld_score += ldscore[j]; //mrsq[j] * snp_num[j] + 1.0;
834 ld_score /= (double)size;
835 for(j = brk_pnt1[i]; j <= brk_pnt1[i + 1]; j++) lds[j] = ld_score;
836 }
837 for (i = 0; i < brk_pnt2.size() - 1; i++){
838 size = brk_pnt2[i + 1] - brk_pnt2[i] + 1;
839 if(size < 3) continue;
840 double ld_score = 0.0;
841 for(j = brk_pnt2[i]; j <= brk_pnt2[i + 1]; j++) ld_score += ldscore[j]; // mrsq[j] * snp_num[j] + 1.0;
842 ld_score /= (double)size;
843 for(j = brk_pnt2[i]; j <= brk_pnt2[i + 1]; j++) lds[j] = 0.5*(ld_score + lds[j]);
844 }
845
846 string lds_file;
847 if(dominance_flag) lds_file = _out + ".d.score.ld";
848 else lds_file = _out + ".score.ld";
849 cout << "Writing the regional LD score to file ["+ lds_file +"] ..." << endl;
850 ofstream o_lds(lds_file.data());
851 o_lds << "SNP chr bp freq mean_rsq snp_num max_rsq ldscore_SNP ldscore_region"<<endl;
852 for (i = 0; i < m; i++){
853 o_lds << _snp_name[_include[i]] << " " << _chr[_include[i]] << " " << _bp[_include[i]] << " " << 0.5 * _mu[_include[i]] ;
854 o_lds << " " << mrsq[i] << " " << snp_num[i] << " " << max_rsq[i] << " " << ldscore[i] << " " << lds[i] << "\n";
855 }
856 }
857
get_lds_brkpnt(vector<int> & brk_pnt1,vector<int> & brk_pnt2,int ldwt_seg,int wind_snp_num)858 void gcta::get_lds_brkpnt(vector<int> &brk_pnt1, vector<int> &brk_pnt2, int ldwt_seg, int wind_snp_num)
859 {
860 unsigned long i = 0, j = 0, k = 0, m = _include.size();
861
862 brk_pnt1.clear();
863 brk_pnt1.push_back(0);
864 bool chr_start = true;
865 int prev_length = 0;
866 for (i = 1, j = 0; i < m; i++) {
867 if (i == (m - 1)){
868 if(!chr_start && prev_length < 0.5 * wind_snp_num) brk_pnt1[j - 1] = brk_pnt1[j] = m - 1;
869 else brk_pnt1.push_back(m - 1);
870 }
871 else if (_chr[_include[i]] != _chr[_include[brk_pnt1[j]]]) {
872 if(!chr_start && prev_length < 0.5 * wind_snp_num){
873 brk_pnt1[j - 1] = i - 1;
874 brk_pnt1[j] = i;
875 }
876 else{
877 brk_pnt1.push_back(i - 1);
878 j++;
879 brk_pnt1.push_back(i);
880 j++;
881 }
882 chr_start = true;
883 }
884 else if ((_bp[_include[i]] - _bp[_include[brk_pnt1[j]]] > ldwt_seg) && (i - brk_pnt1[j] > wind_snp_num)) {
885 prev_length = i - brk_pnt1[j];
886 chr_start = false;
887 brk_pnt1.push_back(i - 1);
888 j++;
889 brk_pnt1.push_back(i);
890 j++;
891 }
892 }
893 stable_sort(brk_pnt1.begin(), brk_pnt1.end());
894 brk_pnt1.erase(unique(brk_pnt1.begin(), brk_pnt1.end()), brk_pnt1.end());
895
896 brk_pnt2.clear();
897 for (i = 1; i < brk_pnt1.size() && brk_pnt1.size() > 2; i++) {
898 if ((_chr[_include[brk_pnt1[i - 1]]] == _chr[_include[brk_pnt1[i]]]) && (brk_pnt1[i] - brk_pnt1[i - 1] > 1)) {
899 int i_buf = (brk_pnt1[i - 1] + brk_pnt1[i]) / 2;
900 brk_pnt2.push_back(i_buf);
901 brk_pnt2.push_back(i_buf + 1);
902 }
903 }
904 }
905
906 // calculate maximum LD rsq between SNPs
calcu_max_ld_rsq(int wind_size,double rsq_cutoff,bool dominance_flag)907 void gcta::calcu_max_ld_rsq(int wind_size, double rsq_cutoff, bool dominance_flag)
908 {
909 int i = 0, m = _include.size(), max_size = 0.3 * _keep.size();
910
911 cout << "Calculating maximum LD rsq between SNPs (block size of " << wind_size / 1000 << "Kb with an overlap of "<<wind_size/2000<<"Kb between blocks; LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
912 cout << "(Maximum number of SNPs allowed in a block = " << max_size << " due to computational limitation)" << endl;
913 if(dominance_flag) cout<<"(SNP genotypes are coded for dominance effects)"<<endl;
914
915 vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
916 get_ld_blk_pnt_max_limit(brk_pnt1, brk_pnt2, brk_pnt3, wind_size, max_size);
917
918 eigenVector multi_rsq = eigenVector::Constant(m, -1.0), max_rsq = eigenVector::Constant(m, -999), multi_rsq_adj = eigenVector::Constant(m, -999);
919 vector<int> max_pos(m);
920 calcu_max_ld_rsq_blk(multi_rsq, multi_rsq_adj, max_rsq, max_pos, brk_pnt1, rsq_cutoff, dominance_flag);
921 if (brk_pnt2.size() > 1) calcu_max_ld_rsq_blk(multi_rsq, multi_rsq_adj, max_rsq, max_pos, brk_pnt2, rsq_cutoff, dominance_flag);
922
923 string max_rsq_file = "";
924 if(dominance_flag) max_rsq_file = _out + ".d.max_rsq.ld";
925 else max_rsq_file = _out + ".max_rsq.ld";
926 ofstream o_max_rsq(max_rsq_file.data());
927 o_max_rsq<<"SNP freq max_rsq max_snp multi_rsq multi_rsq_adj"<<endl;
928 for (i = 0; i < m; i++){
929 if(max_rsq[i] > 0.0 || multi_rsq[i] > -998) o_max_rsq << _snp_name[_include[i]] << " " << 0.5 * _mu[_include[i]] << " " << max_rsq[i] << " " << _snp_name[_include[max_pos[i]]] << " " << multi_rsq[i] << " " <<multi_rsq_adj[i] << endl;
930 else o_max_rsq << _snp_name[_include[i]] << " " << 0.5 * _mu[_include[i]] << " NA NA NA NA" << endl;
931 }
932 o_max_rsq << endl;
933 cout << "Maximum LD rsq for " << m << " SNPs have been saved in the file [" + max_rsq_file + "]." << endl;
934 }
935
get_ld_blk_pnt_max_limit(vector<int> & brk_pnt1,vector<int> & brk_pnt2,vector<int> & brk_pnt3,int wind_bp,int wind_snp)936 void gcta::get_ld_blk_pnt_max_limit(vector<int> &brk_pnt1, vector<int> &brk_pnt2, vector<int> &brk_pnt3, int wind_bp, int wind_snp)
937 {
938 unsigned long i = 0, j = 0, k = 0, m = _include.size();
939
940 brk_pnt1.clear();
941 brk_pnt1.push_back(0);
942 for (i = 1, j = 0; i < m; i++) {
943 if (i == (m - 1)) brk_pnt1.push_back(m - 1);
944 else if (_chr[_include[i]] != _chr[_include[brk_pnt1[j]]] || _bp[_include[i]] - _bp[_include[i-1]] > 1e6) {
945 brk_pnt1.push_back(i - 1);
946 j++;
947 brk_pnt1.push_back(i);
948 j++;
949 }
950 else if ((_bp[_include[i]] - _bp[_include[brk_pnt1[j]]] > wind_bp) || (i - brk_pnt1[j] >= wind_snp)) {
951 brk_pnt1.push_back(i - 1);
952 j++;
953 brk_pnt1.push_back(i);
954 j++;
955 }
956 }
957 stable_sort(brk_pnt1.begin(), brk_pnt1.end());
958 brk_pnt1.erase(unique(brk_pnt1.begin(), brk_pnt1.end()), brk_pnt1.end());
959
960 brk_pnt2.clear();
961 brk_pnt3.clear();
962 for (i = 1; i < brk_pnt1.size() && brk_pnt1.size() > 2; i++) {
963 if ((_chr[_include[brk_pnt1[i - 1]]] == _chr[_include[brk_pnt1[i]]]) && (brk_pnt1[i] - brk_pnt1[i - 1] > 1)) {
964 int i_buf = (brk_pnt1[i - 1] + brk_pnt1[i]) / 2;
965 brk_pnt2.push_back(i_buf);
966 brk_pnt2.push_back(i_buf + 1);
967 brk_pnt3.push_back(brk_pnt1[i]);
968 brk_pnt3.push_back(brk_pnt1[i]);
969 }
970 }
971 }
972
calcu_max_ld_rsq_blk(eigenVector & multi_rsq,eigenVector & multi_rsq_adj,eigenVector & max_rsq,vector<int> & max_pos,vector<int> & brk_pnt,double rsq_cutoff,bool dominance_flag)973 void gcta::calcu_max_ld_rsq_blk(eigenVector &multi_rsq, eigenVector &multi_rsq_adj, eigenVector &max_rsq, vector<int> &max_pos, vector<int> &brk_pnt, double rsq_cutoff, bool dominance_flag)
974 {
975 int i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size(), size = 0;
976
977 for (i = 0; i < brk_pnt.size() - 1; i++)
978 {
979 if (_chr[_include[brk_pnt[i]]] != _chr[_include[brk_pnt[i + 1]]]) continue;
980 size = brk_pnt[i + 1] - brk_pnt[i] + 1;
981 if (size < 3) continue;
982
983 // make genotype matrix
984 vector<int> snp_indx(size);
985 for (j = brk_pnt[i], k = 0; j <= brk_pnt[i + 1]; j++, k++) snp_indx[k] = j;
986 MatrixXf X_sub;
987 if(dominance_flag) make_XMat_d_subset(X_sub, snp_indx, true);
988 else make_XMat_subset(X_sub, snp_indx, true);
989 eigenVector ssx_sqrt_i_sub(size);
990 for (j = 0; j < size; j++){
991 ssx_sqrt_i_sub[j] = X_sub.col(j).squaredNorm();
992 if (ssx_sqrt_i_sub[j] < 1.0e-30) ssx_sqrt_i_sub[j] = 0.0;
993 else ssx_sqrt_i_sub[j] = 1.0 / sqrt(ssx_sqrt_i_sub[j]);
994 }
995
996 MatrixXf rsq_sub = X_sub.transpose() * X_sub;
997 #pragma omp parallel for private(k)
998 for (j = 0; j < size; j++) {
999 rsq_sub(j,j) = 1.0;
1000 for (k = j + 1; k < size; k++){
1001 rsq_sub(j,k) *= (ssx_sqrt_i_sub[j] * ssx_sqrt_i_sub[k]);
1002 rsq_sub(k,j) = rsq_sub(j,k);
1003 }
1004 }
1005
1006
1007 // Fixed compile by removing .array(). Not sure about the validity
1008 // of this change, but it seemed reasonable based on constructor docs.
1009 SelfAdjointEigenSolver<MatrixXf> pca(rsq_sub);
1010
1011 // debug
1012 // ofstream tmp("tmp_R.txt");
1013 //tmp<< rsq_sub.col(1998) << endl;
1014
1015
1016 VectorXf d_i = pca.eigenvalues();
1017 double eff_m = 0;
1018 for(j = 0; j < size; j ++){
1019 if(d_i(j) < 1e-5) d_i(j) = 0.0;
1020 else{
1021 d_i(j) = 1.0 / d_i(j);
1022 eff_m++;
1023 }
1024 }
1025
1026 // debug
1027 //ofstream tmp2("tmp_eval.txt");
1028 //tmp2<<pca.eigenvalues()<< endl;
1029
1030
1031
1032 // debug
1033 cout << "size = " << size << "; eff_m = " << eff_m << endl;
1034 MatrixXf R_i = pca.eigenvectors() * d_i.asDiagonal() * pca.eigenvectors().transpose();
1035 R_i = R_i.array();
1036
1037
1038 //JacobiSVD<MatrixXf> svd;
1039 /*svd.compute(rsq_sub, ComputeThinV);
1040 VectorXf d_i = svd.singularValues();
1041 double eff_m = 0;
1042 for(j = 0; j < size; j ++){
1043 if(d_i(j) < 1e-6) d_i(j) = 0.0;
1044 else{
1045 d_i(j) = 1.0 / d_i(j);
1046 eff_m++;
1047 }
1048 }
1049
1050 // debug
1051 cout << "size = " << size << "; eff_m = " << eff_m << endl;
1052
1053 MatrixXf R_i = svd.matrixV() * d_i.asDiagonal() * svd.matrixV().transpose();*/
1054 VectorXf Q_diag(size);
1055 for(j = 0; j < size; j ++) Q_diag(j) = R_i.col(j).dot(rsq_sub.row(j).transpose());
1056 VectorXf multi_rsq_buf(size);
1057 for(j = 0; j < size; j ++){
1058 if(fabs(Q_diag[j] - 1.0) < 0.01) multi_rsq_buf[j] = 1.0 - 1.0 / R_i(j,j);
1059 else multi_rsq_buf[j] = 1.0;
1060 if(multi_rsq_buf[j] > 1.0) multi_rsq_buf[j] = 1.0;
1061 }
1062 VectorXf multi_rsq_buf_adj = multi_rsq_buf.array() - (1.0 - multi_rsq_buf.array()) * (eff_m / ((double)n - eff_m - 1.0));
1063
1064 rsq_sub.diagonal() = VectorXf::Zero(size);
1065 rsq_sub = rsq_sub.array() * rsq_sub.array();
1066 VectorXf max_rsq_buf(size);
1067 vector<int> max_pos_buf(size);
1068 VectorXf::Index max_pos_pnt;
1069 for(j = 0; j < size; j++){
1070 rsq_sub.col(j).maxCoeff(&max_pos_pnt);
1071 max_pos_buf[j] = (int)max_pos_pnt;
1072 max_rsq_buf[j] = rsq_sub.col(j)[max_pos_buf[j]];
1073 if(multi_rsq_buf[j] < max_rsq_buf[j]) multi_rsq_buf[j] = max_rsq_buf[j];
1074 }
1075
1076 /* for(j = 0; j < size; j++){
1077 if(multi_rsq_buf[j] > 1.0) multi_rsq_buf[j] = 1.0;
1078 if(max_rsq_buf[j] > 1.0) max_rsq_buf[j] = 1.0;
1079 }
1080 */
1081 for (j = 0, k = brk_pnt[i]; j < size; j++, k++) {
1082 if(multi_rsq_adj[k] <= multi_rsq_buf_adj[j]){
1083 multi_rsq[k] = multi_rsq_buf[j];
1084 multi_rsq_adj[k] = multi_rsq_buf_adj[j];
1085 }
1086 if(max_rsq[k] < max_rsq_buf[j]){
1087 max_rsq[k] = max_rsq_buf[j];
1088 max_pos[k] = max_pos_buf[j] + brk_pnt[i];
1089 }
1090 }
1091 }
1092 }
1093
bending_eigenval_Xf(VectorXf & eval)1094 bool gcta::bending_eigenval_Xf(VectorXf &eval)
1095 {
1096 int j = 0;
1097 double eval_m = eval.mean();
1098 if (eval.minCoeff() > 0.0) return false;
1099 double S = 0.0, P = 0.0;
1100 for (j = 0; j < eval.size(); j++) {
1101 if (eval[j] >= 0) continue;
1102 S += eval[j];
1103 P = -eval[j];
1104 }
1105 double W = S * S * 100.0 + 1;
1106 for (j = 0; j < eval.size(); j++) {
1107 if (eval[j] >= 0) continue;
1108 eval[j] = P * (S - eval[j])*(S - eval[j]) / W;
1109 }
1110 eval *= eval_m / eval.mean();
1111 return true;
1112 }
1113
1114