1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementations of functions for estimating the LD structure
5  *
6  * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #include "gcta.h"
14 
set_ldscore_adj_flag(bool ldscore_adj)15 void gcta::set_ldscore_adj_flag(bool ldscore_adj){
16     _ldscore_adj = ldscore_adj;
17 }
18 
read_LD_target_SNPs(string snplistfile)19 void gcta::read_LD_target_SNPs(string snplistfile)
20 {
21     // Read snplist file
22     _ld_target_snp.clear();
23     read_snplist(snplistfile, _ld_target_snp);
24 
25     int i = 0, prev_target_snp_num = _ld_target_snp.size(), prev_include_size = _include.size();
26     map<string, int> snp_map_buf;
27     map<string, int>::iterator iter;
28     for (i = 0; i < _snp_num; i++) snp_map_buf.insert(pair<string, int>(_snp_name[i], i));
29     for (i = _ld_target_snp.size() - 1; i >= 0; i--) {
30         iter = snp_map_buf.find(_ld_target_snp[i]);
31         if (iter != snp_map_buf.end()) _snp_name_map.insert(*iter);
32         else _ld_target_snp.erase(_ld_target_snp.begin() + i);
33     }
34     _include.clear();
35     for (iter = _snp_name_map.begin(); iter != _snp_name_map.end(); iter++) _include.push_back(iter->second);
36     stable_sort(_include.begin(), _include.end());
37     if (_ld_target_snp.size() == 0) throw ("Error: no target SNPs are retained to estimate the LD structure.");
38     else cout << prev_target_snp_num << " target SNPs read from [" + snplistfile + "], " << _ld_target_snp.size() << " of which exist in the data." << endl;
39 }
40 
LD_Blocks(int stp,double wind_size,double alpha,bool IncldQ,bool save_ram)41 void gcta::LD_Blocks(int stp, double wind_size, double alpha, bool IncldQ, bool save_ram)
42 {
43     int i = 0, j = 0;
44     if (_mu.empty()) calcu_mu();
45 
46     // Read snplist file
47     vector<int> smpl_buf, smpl;
48     vector<string> uni_snp;
49     for (i = 0; i < _include.size(); i++) uni_snp.push_back(_snp_name[_include[i]]);
50     StrFunc::match(_ld_target_snp, uni_snp, smpl_buf);
51     for (i = 0; i < smpl_buf.size(); i++) {
52         if (smpl_buf[i]>-1) smpl.push_back(smpl_buf[i]);
53     }
54     int SNP_SmplNum = smpl.size(); // smpl is the position of _include
55 
56     // Calculate LD structure
57     cout << "Estimating LD structure..." << endl;
58     vector<int> K(SNP_SmplNum);
59     vector<double> r2(SNP_SmplNum), md_r2(SNP_SmplNum), max_r2(SNP_SmplNum), dL(SNP_SmplNum), dR(SNP_SmplNum);
60     vector<string> max_r2_snp(SNP_SmplNum);
61     vector< vector<double> > r(SNP_SmplNum);
62     vector<string> L_SNP(SNP_SmplNum), R_SNP(SNP_SmplNum);
63     vector< vector<string> > snp_ls(SNP_SmplNum);
64     EstLD(smpl, wind_size, snp_ls, r, r2, md_r2, max_r2, max_r2_snp, dL, dR, K, L_SNP, R_SNP, alpha, IncldQ);
65 
66     // Save result
67     string SavFileName = _out + ".rsq.ld";
68     ofstream SavFile(SavFileName.c_str());
69     if (!SavFile) throw ("Error: can not open the file [" + SavFileName + "] to save result!");
70     SavFile << "target_SNP\tfreq\tL_region\tR_region\tL_snp\tR_snp\tnSNPs\tmean_rsq\tmedian_rsq\tmax_rsq\tmax_rsq_snp" << endl;
71     for (i = 0; i < SNP_SmplNum; i++) SavFile << _snp_name[_include[smpl[i]]] << "\t" << 0.5 * _mu[_include[smpl[i]]] << "\t" << dL[i] << "\t" << dR[i] << "\t" << L_SNP[i] << "\t" << R_SNP[i] << "\t" << K[i] << "\t" << r2[i] << "\t" << md_r2[i] << "\t" << max_r2[i] << "\t" << max_r2_snp[i] << endl;
72     SavFile.close();
73     SavFileName = _out + ".r.ld";
74     SavFile.open(SavFileName.c_str());
75     if (!SavFile) throw ("Error: can not open the file [" + SavFileName + "] to save result.");
76     for (i = 0; i < SNP_SmplNum; i++) {
77         for (j = 0; j < r[i].size(); j++) SavFile << r[i][j] << " ";
78         SavFile << endl;
79     }
80     SavFile.close();
81     SavFileName = _out + ".snp.ld";
82     SavFile.open(SavFileName.c_str());
83     if (!SavFile) throw ("Can not open the file [" + SavFileName + "] to save result.");
84     for (i = 0; i < SNP_SmplNum; i++) {
85         for (j = 0; j < snp_ls[i].size(); j++) SavFile << snp_ls[i][j] << " ";
86         SavFile << endl;
87     }
88     SavFile.close();
89     cout << "Results have been saved in [" + _out + ".rsq.ld]" + ", [" + _out + ".r.ld]" + " and [" + _out + ".snp.ld].\n" << endl;
90 }
91 
EstLD(vector<int> & smpl,double wind_size,vector<vector<string>> & snp,vector<vector<double>> & r,vector<double> & r2,vector<double> & md_r2,vector<double> & max_r2,vector<string> & max_r2_snp,vector<double> & dL,vector<double> & dR,vector<int> & K,vector<string> & L_SNP,vector<string> & R_SNP,double alpha,bool IncldQ)92 void gcta::EstLD(vector<int> &smpl, double wind_size, vector< vector<string> > &snp, vector< vector<double> > &r, vector<double> &r2, vector<double> &md_r2, vector<double> &max_r2, vector<string> &max_r2_snp, vector<double> &dL, vector<double> &dR, vector<int> &K, vector<string> &L_SNP, vector<string> &R_SNP, double alpha, bool IncldQ)
93 {
94     int i = 0, j = 0, L = 0, R = 0, maxL = 0, maxR = 0, i_buf = 0;
95     map<int, int> smpl_snp_map;
96     for (i = 0; i < smpl.size(); i++) smpl_snp_map.insert(pair<int, int>(_include[smpl[i]], i));
97 
98     cout << "Parameters used to search SNPs in LD with the given SNPs: window size=" << (int) (wind_size * 0.001) << "Kb, significant level=" << alpha << endl;
99     vector<double> rst, y, x;
100     for (i = 0; i < smpl.size(); i++) {
101         vector<int> buf;
102         vector<double> r_buf, rsq_buf;
103         maxL = maxR = L = R = smpl[i];
104         makex(L, y);
105         if (IncldQ) {
106             buf.push_back(L);
107             rsq_buf.push_back(1.0);
108             r_buf.push_back(1.0);
109         }
110         while (1) {
111             if (R == _include.size() - 1) break;
112             if (_chr[_include[R]] != _chr[_include[R + 1]]) break;
113             if (_bp[_include[R + 1]] - _bp[_include[smpl[i]]] > wind_size) break;
114             R++;
115             if (smpl_snp_map.find(_include[R]) != smpl_snp_map.end()) continue;
116             makex(R, x);
117             reg(y, x, rst);
118             if (rst[2] < alpha) {
119                 maxR = R;
120                 buf.push_back(R);
121                 rsq_buf.push_back(rst[3]);
122                 r_buf.push_back(rst[4]);
123             }
124         }
125         while (1) {
126             if (L == 0) break;
127             if (_chr[_include[L]] != _chr[_include[L - 1]]) break;
128             if (_bp[_include[smpl[i]]] - _bp[_include[L - 1]] > wind_size) break;
129             L--;
130             if (smpl_snp_map.find(_include[L]) != smpl_snp_map.end()) continue;
131             makex(L, x);
132             reg(y, x, rst);
133             if (rst[2] < alpha) {
134                 maxL = L;
135                 buf.insert(buf.begin(), L);
136                 rsq_buf.insert(rsq_buf.begin(), rst[3]);
137                 r_buf.insert(r_buf.begin(), rst[4]);
138             }
139         }
140         if (buf.size() == 0) {
141             K[i] = 0;
142             dL[i] = 0;
143             dR[i] = 0;
144             L_SNP[i] = "NA";
145             R_SNP[i] = "NA";
146             r[i].push_back(0.0);
147             snp[i].push_back("NA");
148             r2[i] = 0.0;
149             md_r2[i] = 0.0;
150             max_r2[i] = 0.0;
151             max_r2_snp[i] = "NA";
152         } else {
153             K[i] = buf.size();
154             dL[i] = _bp[_include[smpl[i]]] - _bp[_include[maxL]];
155             dR[i] = _bp[_include[maxR]] - _bp[_include[smpl[i]]];
156             L_SNP[i] = _snp_name[_include[maxL]];
157             R_SNP[i] = _snp_name[_include[maxR]];
158             for (j = 0; j < K[i]; j++) {
159                 r[i].push_back(r_buf[j]);
160                 snp[i].push_back(_snp_name[_include[buf[j]]]);
161             }
162             r2[i] = CommFunc::mean(rsq_buf);
163             md_r2[i] = CommFunc::median(rsq_buf);
164             i_buf = max_element(rsq_buf.begin(), rsq_buf.end()) - rsq_buf.begin();
165             max_r2[i] = rsq_buf[i_buf];
166             max_r2_snp[i] = snp[i][i_buf];
167         }
168         cout << i + 1 << " of " << smpl.size() << " target SNPs.\r";
169     }
170 }
171 
reg(vector<double> & y,vector<double> & x,vector<double> & rst,bool table)172 eigenMatrix gcta::reg(vector<double> &y, vector<double> &x, vector<double> &rst, bool table) {
173     int N = x.size();
174     if (N != y.size() || N < 1) throw ("Error: The lengths of x and y do not match.");
175 
176     int i = 0;
177     double d_buf = 0.0, y_mu = 0.0, x_mu = 0.0, x_var = 0.0, y_var = 0.0, cov = 0.0;
178     for (i = 0; i < N; i++) {
179         x_mu += x[i];
180         y_mu += y[i];
181     }
182     x_mu /= (double) N;
183     y_mu /= (double) N;
184     for (i = 0; i < N; i++) {
185         d_buf = (x[i] - x_mu);
186         x_var += d_buf*d_buf;
187         d_buf = (y[i] - y_mu);
188         y_var += d_buf*d_buf;
189     }
190     x_var /= (double) (N - 1.0);
191     y_var /= (double) (N - 1.0);
192     for (i = 0; i < N; i++) cov += (x[i] - x_mu)*(y[i] - y_mu);
193     cov /= (double) (N - 1);
194     double a = 0.0, b = 0.0, sse = 0.0, a_se = 0.0, b_se = 0.0, p = 0.0, rsq = 0.0, r = 0.0;
195     if (x_var > 0.0) b = cov / x_var;
196     a = y_mu - b*x_mu;
197     for (i = 0; i < N; i++) {
198         d_buf = y[i] - a - b * x[i];
199         sse += d_buf*d_buf;
200     }
201     if (x_var > 0.0) {
202         a_se = sqrt((sse / (N - 2.0))*(1.0 / N + x_mu * x_mu / (x_var * (N - 1.0))));
203         b_se = sqrt(sse / x_var / (N - 1.0) / (N - 2.0));
204     }
205     if (x_var > 0.0 && y_var > 0.0) {
206         r = cov / sqrt(y_var * x_var);
207         rsq = r*r;
208     }
209     double t = 0.0;
210     if (b_se > 0.0) t = fabs(b / b_se);
211     p = StatFunc::t_prob(N - 2.0, t, true);
212     rst.clear();
213     rst.push_back(b);
214     rst.push_back(b_se);
215     rst.push_back(p);
216     rst.push_back(rsq);
217     rst.push_back(r);
218 
219     eigenMatrix reg_sum(3, 3);
220     if (table) {
221         reg_sum(2, 0) = rsq;
222         reg_sum(1, 0) = b;
223         reg_sum(1, 1) = b_se;
224         reg_sum(1, 2) = p;
225         if (a_se > 0.0) t = fabs(a / a_se);
226         p = StatFunc::t_prob(N - 2.0, t, true);
227         reg_sum(0, 0) = a;
228         reg_sum(0, 1) = a_se;
229         reg_sum(0, 2) = p;
230         return (reg_sum);
231     }
232     return (reg_sum);
233 }
234 
rm_cor_snp(int m,int start,float * rsq,double rsq_cutoff,vector<int> & rm_snp_ID1)235 void gcta::rm_cor_snp(int m, int start, float *rsq, double rsq_cutoff, vector<int> &rm_snp_ID1)
236 {
237     int i = 0, j = 0, i_buf = 0;
238 
239     rm_snp_ID1.clear();
240     vector<int> rm_snp_ID2;
241     for (i = 0; i < m; i++) {
242         for (j = 0; j < i; j++) {
243             if (rsq[i * m + j] > rsq_cutoff) {
244                 rm_snp_ID1.push_back(i + start);
245                 rm_snp_ID2.push_back(j + start);
246             }
247         }
248     }
249     vector<int> rm_uni_ID(rm_snp_ID1);
250     rm_uni_ID.insert(rm_uni_ID.end(), rm_snp_ID2.begin(), rm_snp_ID2.end());
251     stable_sort(rm_uni_ID.begin(), rm_uni_ID.end());
252     rm_uni_ID.erase(unique(rm_uni_ID.begin(), rm_uni_ID.end()), rm_uni_ID.end());
253     map<int, int> rm_uni_ID_count;
254     for (i = 0; i < rm_uni_ID.size(); i++) {
255         i_buf = count(rm_snp_ID1.begin(), rm_snp_ID1.end(), rm_uni_ID[i]) + count(rm_snp_ID2.begin(), rm_snp_ID2.end(), rm_uni_ID[i]);
256         rm_uni_ID_count.insert(pair<int, int>(rm_uni_ID[i], i_buf));
257     }
258     map<int, int>::iterator iter1, iter2;
259     #pragma omp parallel for
260     for (i = 0; i < rm_snp_ID1.size(); i++) {
261         iter1 = rm_uni_ID_count.find(rm_snp_ID1[i]);
262         iter2 = rm_uni_ID_count.find(rm_snp_ID2[i]);
263         if (iter1->second < iter2->second) {
264             i_buf = rm_snp_ID1[i];
265             rm_snp_ID1[i] = rm_snp_ID2[i];
266             rm_snp_ID2[i] = i_buf;
267         }
268     }
269     stable_sort(rm_snp_ID1.begin(), rm_snp_ID1.end());
270     rm_snp_ID1.erase(unique(rm_snp_ID1.begin(), rm_snp_ID1.end()), rm_snp_ID1.end());
271 }
272 
calcu_mean_rsq(int wind_size,double rsq_cutoff,bool dominance_flag)273 void gcta::calcu_mean_rsq(int wind_size, double rsq_cutoff, bool dominance_flag)
274 {
275     check_autosome();
276 
277     int i = 0, m = _include.size();
278 
279     cout << "\nCalculating LD score for SNPs (block size of " << wind_size / 1000 << "Kb with an overlap of "<<wind_size/2000<<"Kb between blocks); LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
280     if(dominance_flag) cout<<"(SNP genotypes are coded for dominance effects)"<<endl;
281     if(_ldscore_adj) cout << "LD rsq will be adjusted for chance correlation, i.e. rsq_adj = rsq - (1 - rsq) / (n -2)." << endl;
282     vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
283     get_ld_blk_pnt(brk_pnt1, brk_pnt2, brk_pnt3, wind_size);
284 
285     eigenVector mean_rsq = eigenVector::Zero(m), snp_num = eigenVector::Zero(m), max_rsq = eigenVector::Zero(m);
286     calcu_ld_blk(brk_pnt1, brk_pnt3, mean_rsq, snp_num, max_rsq, false, rsq_cutoff, dominance_flag);
287     if (brk_pnt2.size() > 1) calcu_ld_blk(brk_pnt2, brk_pnt3, mean_rsq, snp_num, max_rsq, true, rsq_cutoff, dominance_flag);
288 
289     string mrsq_file = "";
290     if(dominance_flag) mrsq_file = _out + ".d.score.ld";
291     else mrsq_file = _out + ".score.ld";
292     ofstream o_mrsq(mrsq_file.data());
293     o_mrsq<<"SNP chr bp MAF mean_rsq snp_num max_rsq ldscore"<<endl;
294     double ldscore = 0.0;
295     for (i = 0; i < m; i++){
296         o_mrsq << _snp_name[_include[i]] << " " << _chr[_include[i]] << " " << _bp[_include[i]] << " ";
297         double MAF = 0.5 * _mu[_include[i]];
298         if(MAF > 0.5) MAF = 1.0 - MAF;
299         ldscore = 1.0 + mean_rsq[i] * snp_num[i];
300         o_mrsq << MAF << " " << mean_rsq[i] << " " << snp_num[i] << " " << max_rsq[i] << " " << ldscore << "\n";
301     }
302     o_mrsq << endl;
303     cout << "LD score for " << m << " SNPs have been saved in the file [" + mrsq_file + "]." << endl;
304 }
305 
calcu_ssx_sqrt_i(eigenVector & ssx_sqrt_i)306 void gcta::calcu_ssx_sqrt_i(eigenVector &ssx_sqrt_i)
307 {
308     int i = 0, m = _include.size();
309     ssx_sqrt_i.resize(m);
310     for (i = 0; i < m; i++){
311         ssx_sqrt_i[i] = _geno.col(i).squaredNorm();
312         if (ssx_sqrt_i[i] < 1.0e-30) ssx_sqrt_i[i] = 0.0;
313         else ssx_sqrt_i[i] = 1.0 / sqrt(ssx_sqrt_i[i]);
314     }
315 }
316 
get_ld_blk_pnt(vector<int> & brk_pnt1,vector<int> & brk_pnt2,vector<int> & brk_pnt3,int wind_bp,int wind_snp)317 void gcta::get_ld_blk_pnt(vector<int> &brk_pnt1, vector<int> &brk_pnt2, vector<int> &brk_pnt3, int wind_bp, int wind_snp)
318 {
319     unsigned long i = 0, j = 0, k = 0, m = _include.size();
320 
321     brk_pnt1.clear();
322     brk_pnt1.push_back(0);
323     bool chr_start = true;
324     for (i = 1, j = 0; i < m; i++) {
325         if (i == (m - 1)){
326             if(chr_start
327                 || ((_bp[_include[i]] - _bp[_include[brk_pnt1[j]]] > 0.5 * wind_bp)
328                 && (i - brk_pnt1[j] > 0.5 * wind_snp))) brk_pnt1.push_back(m - 1);
329             else brk_pnt1[j - 1] = brk_pnt1[j] = m - 1;
330         }
331         else if (_chr[_include[i]] != _chr[_include[brk_pnt1[j]]] || _bp[_include[i]] - _bp[_include[i-1]] > 1e6) {
332             if(chr_start
333                 || ((_bp[_include[i-1]] - _bp[_include[brk_pnt1[j]]] > 0.5 * wind_bp)
334                 && (i - 1 - brk_pnt1[j] > 0.5 * wind_snp))){
335                 brk_pnt1.push_back(i - 1);
336                 j++;
337                 brk_pnt1.push_back(i);
338                 j++;
339             }
340             else{
341                 brk_pnt1[j - 1] = i - 1;
342                 brk_pnt1[j] = i;
343             }
344             chr_start = true;
345         }
346         else if ((_bp[_include[i]] - _bp[_include[brk_pnt1[j]]] > wind_bp) && (i - brk_pnt1[j] >= wind_snp)) {
347             chr_start = false;
348             brk_pnt1.push_back(i - 1);
349             j++;
350             brk_pnt1.push_back(i);
351             j++;
352         }
353     }
354     stable_sort(brk_pnt1.begin(), brk_pnt1.end());
355     brk_pnt1.erase(unique(brk_pnt1.begin(), brk_pnt1.end()), brk_pnt1.end());
356 
357     brk_pnt2.clear();
358     brk_pnt3.clear();
359     for (i = 1; i < brk_pnt1.size() && brk_pnt1.size() > 2; i++) {
360         if ((_chr[_include[brk_pnt1[i - 1]]] == _chr[_include[brk_pnt1[i]]]) && (brk_pnt1[i] - brk_pnt1[i - 1] > 1)) {
361             int i_buf = (brk_pnt1[i - 1] + brk_pnt1[i]) / 2;
362             brk_pnt2.push_back(i_buf);
363             brk_pnt2.push_back(i_buf + 1);
364             brk_pnt3.push_back(brk_pnt1[i]);
365             brk_pnt3.push_back(brk_pnt1[i]);
366         }
367     }
368 }
369 
calcu_ld_blk(vector<int> & brk_pnt,vector<int> & brk_pnt3,eigenVector & mean_rsq,eigenVector & snp_num,eigenVector & max_rsq,bool second,double rsq_cutoff,bool dominance_flag)370 void gcta::calcu_ld_blk(vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff, bool dominance_flag)
371 {
372     int i = 0, j = 0, k = 0, s1 = 0, s2 = 0, n = _keep.size(), m = _include.size(), size = 0, size_limit = 10000;
373 
374     for (i = 0; i < brk_pnt.size() - 1; i++) {
375         if (_chr[_include[brk_pnt[i]]] != _chr[_include[brk_pnt[i + 1]]]) continue;
376         size = brk_pnt[i + 1] - brk_pnt[i] + 1;
377         if (size < 3) continue;
378         if (second) {
379             s1 = brk_pnt3[i] - brk_pnt[i];
380             s2 = s1 + 1;
381         }
382         else {
383             s1 = 0;
384             s2 = size - 1;
385         }
386 
387         eigenVector rsq_size(size), mean_rsq_sub(size), max_rsq_sub = eigenVector::Constant(size, -1.0);
388 
389         // make genotype matrix
390         vector<int> snp_indx(size);
391         for (j = brk_pnt[i], k = 0; j <= brk_pnt[i + 1]; j++, k++) snp_indx[k] = j;
392         MatrixXf X_sub;
393         if(dominance_flag) make_XMat_d_subset(X_sub, snp_indx, true);
394         else make_XMat_subset(X_sub, snp_indx, true);
395         eigenVector ssx_sqrt_i_sub(size);
396         for (j = 0; j < size; j++){
397             ssx_sqrt_i_sub[j] = X_sub.col(j).squaredNorm();
398             if (ssx_sqrt_i_sub[j] < 1.0e-30) ssx_sqrt_i_sub[j] = 0.0;
399             else ssx_sqrt_i_sub[j] = 1.0 / sqrt(ssx_sqrt_i_sub[j]);
400         }
401 
402         if (size > size_limit) calcu_ld_blk_split(size, size_limit, X_sub, ssx_sqrt_i_sub, rsq_cutoff, rsq_size, mean_rsq_sub, max_rsq_sub, s1, s2, second);
403         else {
404             MatrixXf rsq_sub = X_sub.transpose() * X_sub;
405             #pragma omp parallel for private(k)
406             for (j = 0; j < size; j++) {
407                 rsq_size[j] = 0.0;
408                 mean_rsq_sub[j] = 0.0;
409                 for (k = 0; k < size; k++) {
410                     if (second) {
411                         if (j <= s1 && k <= s1) continue;
412                         if (j >= s2 && k >= s2) continue;
413                     }
414                     if (k == j) continue;
415                     rsq_sub(j,k) *= (ssx_sqrt_i_sub[j] * ssx_sqrt_i_sub[k]);
416                     rsq_sub(j,k) = rsq_sub(j,k) * rsq_sub(j,k);
417                     if (rsq_sub(j,k) >= rsq_cutoff) {
418                         if(_ldscore_adj) mean_rsq_sub[j] += rsq_sub(j,k) - (1.0 - rsq_sub(j,k)) / (n - 2.0);
419                         else mean_rsq_sub[j] += rsq_sub(j,k);
420                         rsq_size[j] += 1.0;
421                     }
422                     if (rsq_sub(j,k) > max_rsq_sub[j]) max_rsq_sub[j] = rsq_sub(j,k);
423                 }
424                 if (rsq_size[j] > 0.0) mean_rsq_sub[j] /= rsq_size[j];
425             }
426         }
427 
428         for (j = 0, k = brk_pnt[i]; j < size; j++, k++) {
429             if (second) {
430                 if (rsq_size[j] > 0.0) {
431                     mean_rsq[k] = (mean_rsq[k] * snp_num[k] + mean_rsq_sub[j] * rsq_size[j]) / (snp_num[k] + rsq_size[j]);
432                     snp_num[k] = (snp_num[k] + rsq_size[j]);
433                     if(max_rsq[k] < max_rsq_sub[j]) max_rsq[k] = max_rsq_sub[j];
434                 }
435             }
436             else {
437                 mean_rsq[k] = mean_rsq_sub[j];
438                 snp_num[k] = rsq_size[j];
439                 max_rsq[k] = max_rsq_sub[j];
440             }
441         }
442     }
443 }
444 
calcu_ld_blk_split(int size,int size_limit,MatrixXf & X_sub,eigenVector & ssx_sqrt_i_sub,double rsq_cutoff,eigenVector & rsq_size,eigenVector & mean_rsq_sub,eigenVector & max_rsq_sub,int s1,int s2,bool second)445 void gcta::calcu_ld_blk_split(int size, int size_limit, MatrixXf &X_sub, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second)
446 {
447     int i = 0, j = 0, k = 0, m = 0, n = _keep.size();
448     vector<int> brk_pnt_sub;
449     brk_pnt_sub.push_back(0);
450     for (i = size_limit; i < size - size_limit; i += size_limit) {
451         brk_pnt_sub.push_back(i - 1);
452         brk_pnt_sub.push_back(i);
453         j = i;
454     }
455     j = (size - j) / 2 + j;
456     brk_pnt_sub.push_back(j - 1);
457     brk_pnt_sub.push_back(j);
458     brk_pnt_sub.push_back(size - 1);
459 
460     for (i = 0; i < brk_pnt_sub.size() - 1; i++) {
461         int size_sub = brk_pnt_sub[i + 1] - brk_pnt_sub[i] + 1;
462         if (size_sub < 3) continue;
463 
464         eigenVector ssx_sqrt_i_sub_sub = ssx_sqrt_i_sub.segment(brk_pnt_sub[i], size_sub);
465         MatrixXf rsq_sub_sub = X_sub.block(0,brk_pnt_sub[i],n,size_sub).transpose() * X_sub;
466         eigenVector rsq_size_sub(size_sub), mean_rsq_sub_sub(size_sub), max_rsq_sub_sub = eigenVector::Constant(size_sub, -1.0);
467 
468         #pragma omp parallel for private(k)
469         for (j = 0; j < size_sub; j++) {
470             unsigned long s = j + brk_pnt_sub[i];
471             rsq_size_sub[j] = 0.0;
472             mean_rsq_sub_sub[j] = 0.0;
473             for (k = 0; k < size; k++) {
474                 if (second) {
475                     if (s <= s1 && k <= s1) continue;
476                     if (s >= s2 && k >= s2) continue;
477                 }
478                 if (k == s) continue;
479                 rsq_sub_sub(j,k) *= (ssx_sqrt_i_sub_sub[j] * ssx_sqrt_i_sub[k]);
480                 rsq_sub_sub(j,k) = rsq_sub_sub(j,k) * rsq_sub_sub(j,k);
481                 if (rsq_sub_sub(j,k) >= rsq_cutoff) {
482                     if(_ldscore_adj) mean_rsq_sub_sub[j] += rsq_sub_sub(j,k) - (1.0 - rsq_sub_sub(j,k)) / (n - 2.0);
483                     else mean_rsq_sub_sub[j] += rsq_sub_sub(j,k);
484                     rsq_size_sub[j] += 1.0;
485                 }
486                 if(rsq_sub_sub(j,k) > max_rsq_sub_sub[j]) max_rsq_sub_sub[j] = rsq_sub_sub(j,k);
487             }
488 
489             if (rsq_size_sub[j] > 0.0) mean_rsq_sub_sub[j] /= rsq_size_sub[j];
490         }
491 
492         for (j = 0, k = brk_pnt_sub[i]; j < size_sub; j++, k++) {
493             mean_rsq_sub[k] = mean_rsq_sub_sub[j];
494             rsq_size[k] = rsq_size_sub[j];
495             max_rsq_sub[k] = max_rsq_sub_sub[j];
496         }
497     }
498 }
499 
calcu_mean_rsq_multiSet(string snpset_filenames_file,int wind_size,double rsq_cutoff,bool dominance_flag)500 void gcta::calcu_mean_rsq_multiSet(string snpset_filenames_file, int wind_size, double rsq_cutoff, bool dominance_flag)
501 {
502     check_autosome();
503     int i = 0,  j = 0, m = _include.size();
504 
505     ifstream in_snpset_filenames(snpset_filenames_file.c_str());
506     if (!in_snpset_filenames) throw ("Error: can not open the file [" + snpset_filenames_file + "] to read.");
507     string str_buf;
508     vector<string> snpset_filenaems, vs_buf;
509     while (getline(in_snpset_filenames, str_buf)) {
510         if (!str_buf.empty()) {
511             if (StrFunc::split_string(str_buf, vs_buf) == 1) snpset_filenaems.push_back(vs_buf[0]);
512         }
513     }
514     in_snpset_filenames.close();
515 
516     int set_num = snpset_filenaems.size();
517     cout << "There are " << set_num << " filenames specified in [" + snpset_filenames_file + "]." << endl;
518     if (set_num < 1) throw ("Error: no filename found in [" + snpset_filenames_file + "].");
519 
520     vector< vector<string> > snplist(set_num);
521     for(i = 0; i < set_num; i++) {
522         ifstream i_snplist(snpset_filenaems[i].c_str());
523         if (!i_snplist) throw ("Error: can not open the file [" + snpset_filenaems[i] + "] to read.");
524         cout << "Reading the list of SNPs from [" + snpset_filenaems[i] + "]." << endl;
525         while (i_snplist >> str_buf) {
526             snplist[i].push_back(str_buf);
527             getline(i_snplist, str_buf);
528         }
529         i_snplist.close();
530     }
531     vector< vector<bool> > set_flag(set_num);
532     for(i = 0; i < set_num; i++){
533         set_flag[i].resize(m);
534         for(j = 0; j < m; j++) set_flag[i][j] = false;
535     }
536 
537     map<string, int> snp_map;
538     for (i = 0; i < m; i++) snp_map.insert(pair<string, int>(_snp_name[_include[i]], i));
539     map<string, int>::iterator iter, end = snp_map.end();
540     for(i = 0; i < set_num; i++){
541         int snp_count = 0;
542         for(j = 0; j < snplist[i].size(); j++){
543             iter = snp_map.find(snplist[i][j]);
544             if(iter == end) continue;
545             set_flag[i][iter->second] = true;
546             snp_count++;
547         }
548         cout << snp_count << " SNPs included from [" << snpset_filenaems[i] << "]. " <<endl;
549     }
550 
551     cout << "\nCalculating multi-component LD score for SNPs (block size of " << wind_size / 1000 << "Kb with an overlap of "<<wind_size/2000<<"Kb between blocks); LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
552     if(dominance_flag) cout<<"(SNP genotypes are coded for dominance effects)"<<endl;
553     if(_ldscore_adj) cout << "LD rsq will be adjusted for chance correlation, i.e. rsq_adj = rsq - (1 - rsq) / (n -2)." << endl;
554     vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
555     get_ld_blk_pnt(brk_pnt1, brk_pnt2, brk_pnt3, wind_size);
556 
557     vector<eigenVector> mean_rsq(set_num), snp_num(set_num), max_rsq(set_num);
558     for(i = 0; i < set_num; i++){
559         mean_rsq[i] = eigenVector::Zero(m);
560         snp_num[i] = eigenVector::Zero(m);
561         max_rsq[i] = eigenVector::Zero(m);
562     }
563     calcu_ld_blk_multiSet(brk_pnt1, brk_pnt3, set_flag, mean_rsq, snp_num, max_rsq, false, rsq_cutoff, dominance_flag);
564     if (brk_pnt2.size() > 1) calcu_ld_blk_multiSet(brk_pnt2, brk_pnt3, set_flag, mean_rsq, snp_num, max_rsq, true, rsq_cutoff, dominance_flag);
565 
566     string mrsq_file = "";
567     double ldscore = 0.0;
568     if(dominance_flag) mrsq_file = _out + ".d.multi_score.ld";
569     else mrsq_file = _out + ".multi_score.ld";
570     ofstream o_mrsq(mrsq_file.data());
571     o_mrsq<<"SNP chr bp MAF";
572     for(j = 0; j < set_num; j++) o_mrsq << " mean_rsq_" << j+1 << " snp_num_" << j+1 << " max_rsq" << j+1 << " ldscore" << j+1;
573     o_mrsq << endl;
574     for (i = 0; i < m; i++){
575         o_mrsq << _snp_name[_include[i]] << " " << _chr[_include[i]] << " " << _bp[_include[i]] << " ";
576         double MAF = 0.5 * _mu[_include[i]];
577         if(MAF > 0.5) MAF = 1.0 - MAF;
578         o_mrsq << MAF;
579         for(j = 0; j < set_num; j++){
580             ldscore = 1.0 + (mean_rsq[j])[i] * (snp_num[j])[i];
581             o_mrsq << " " << (mean_rsq[j])[i] << " " << (snp_num[j])[i] << " " << (max_rsq[j])[i];
582         }
583         o_mrsq << endl;
584     }
585     o_mrsq << endl;
586     cout << "LD score for " << m << " SNPs have been saved in the file [" + mrsq_file + "]." << endl;
587 }
588 
calcu_ld_blk_multiSet(vector<int> & brk_pnt,vector<int> & brk_pnt3,vector<vector<bool>> & set_flag,vector<eigenVector> & mean_rsq,vector<eigenVector> & snp_num,vector<eigenVector> & max_rsq,bool second,double rsq_cutoff,bool dominance_flag)589 void gcta::calcu_ld_blk_multiSet(vector<int> &brk_pnt, vector<int> &brk_pnt3, vector< vector<bool> > &set_flag, vector<eigenVector> &mean_rsq, vector<eigenVector> &snp_num, vector<eigenVector> &max_rsq, bool second, double rsq_cutoff, bool dominance_flag)
590 {
591     int i = 0, j = 0, k = 0, l = 0, s1 = 0, s2 = 0, n = _keep.size(), m = _include.size(), size = 0, size_limit = 10000;
592     int s = 0, set_num = set_flag.size();
593 
594     for (i = 0; i < brk_pnt.size() - 1; i++) {
595         if (_chr[_include[brk_pnt[i]]] != _chr[_include[brk_pnt[i + 1]]]) continue;
596         size = brk_pnt[i + 1] - brk_pnt[i] + 1;
597         if (size < 3) continue;
598         if (second) {
599             s1 = brk_pnt3[i] - brk_pnt[i];
600             s2 = s1 + 1;
601         }
602         else {
603             s1 = 0;
604             s2 = size - 1;
605         }
606 
607         vector<int> snp_indx(size);
608         for (j = brk_pnt[i], k = 0; j <= brk_pnt[i + 1]; j++, k++) snp_indx[k] = j;
609         MatrixXf X_sub;
610         if(dominance_flag) make_XMat_d_subset(X_sub, snp_indx, true);
611         else make_XMat_subset(X_sub, snp_indx, true);
612         eigenVector ssx_sqrt_i_sub(size);
613         for (j = 0; j < size; j++){
614             ssx_sqrt_i_sub[j] = X_sub.col(j).squaredNorm();
615             if (ssx_sqrt_i_sub[j] < 1.0e-30) ssx_sqrt_i_sub[j] = 0.0;
616             else ssx_sqrt_i_sub[j] = 1.0 / sqrt(ssx_sqrt_i_sub[j]);
617         }
618 
619         for(s = 0; s < set_num; s++){
620             vector<int> used_in_this_set;
621             vector<bool> set_flag_sub(size);
622              for (j = 0, k = brk_pnt[i]; j < size; j++, k++){
623                 if(set_flag[s][k] == true){
624                     used_in_this_set.push_back(j);
625                     set_flag_sub[j] = true;
626                 }
627                 else set_flag_sub[j] = false;
628              }
629 
630             int size_of_this_set = used_in_this_set.size();
631             if(size_of_this_set < 1) continue;
632 
633             MatrixXf X_sub2(n, size_of_this_set);
634             for(j = 0; j < size_of_this_set; j++) X_sub2.col(j) = X_sub.col(used_in_this_set[j]);
635 
636             eigenVector rsq_size = eigenVector::Zero(size);
637             eigenVector mean_rsq_sub = eigenVector::Zero(size);
638             eigenVector max_rsq_sub = eigenVector::Constant(size, -1.0);
639 
640             if (size > size_limit) calcu_ld_blk_split_multiSet(size, size_limit, X_sub, X_sub2, used_in_this_set, ssx_sqrt_i_sub, rsq_cutoff, rsq_size, mean_rsq_sub, max_rsq_sub, s1, s2, second);
641             else {
642                 MatrixXf rsq_sub = X_sub.transpose() * X_sub2;
643 
644                 #pragma omp parallel for private(k,l)
645                 for (j = 0; j < size; j++) {
646                     rsq_size[j] = 0.0;
647                     mean_rsq_sub[j] = 0.0;
648                     for (k = 0; k < size_of_this_set; k++) {
649                         l = used_in_this_set[k];
650                         if (second) {
651                             if (j <= s1 && l <= s1) continue;
652                             if (j >= s2 && l >= s2) continue;
653                         }
654                         if (l == j) continue;
655                         rsq_sub(j,k) *= (ssx_sqrt_i_sub[j] * ssx_sqrt_i_sub[l]);
656                         rsq_sub(j,k) = rsq_sub(j,k) * rsq_sub(j,k);
657                         if (rsq_sub(j,k) >= rsq_cutoff) {
658                             if(_ldscore_adj) mean_rsq_sub[j] += rsq_sub(j,k) - (1.0 - rsq_sub(j,k)) / (n - 2.0);
659                             else mean_rsq_sub[j] += rsq_sub(j,k);
660                             rsq_size[j] += 1.0;
661                         }
662                         if (rsq_sub(j,k) > max_rsq_sub[j]) max_rsq_sub[j] = rsq_sub(j,k);
663                     }
664                     if (rsq_size[j] > 0.0) mean_rsq_sub[j] /= rsq_size[j];
665                 }
666             }
667             for (j = 0, k = brk_pnt[i]; j < size; j++, k++) {
668                 if (second) {
669                     if (rsq_size[j] > 0.0) {
670                         (mean_rsq[s])[k] = ((mean_rsq[s])[k] * (snp_num[s])[k] + mean_rsq_sub[j] * rsq_size[j]) / ((snp_num[s])[k] + rsq_size[j]);
671                         (snp_num[s])[k] = ((snp_num[s])[k] + rsq_size[j]);
672                         if((max_rsq[s])[k] < max_rsq_sub[j]) (max_rsq[s])[k] = max_rsq_sub[j];
673                     }
674                 }
675                 else {
676                     (mean_rsq[s])[k] = mean_rsq_sub[j];
677                     (snp_num[s])[k] = rsq_size[j];
678                     (max_rsq[s])[k] = max_rsq_sub[j];
679                 }
680             }
681         }
682     }
683 }
684 
calcu_ld_blk_split_multiSet(int size,int size_limit,MatrixXf & X_sub,MatrixXf & X_sub2,vector<int> & used_in_this_set,eigenVector & ssx_sqrt_i_sub,double rsq_cutoff,eigenVector & rsq_size,eigenVector & mean_rsq_sub,eigenVector & max_rsq_sub,int s1,int s2,bool second)685 void gcta::calcu_ld_blk_split_multiSet(int size, int size_limit, MatrixXf &X_sub, MatrixXf &X_sub2, vector<int> &used_in_this_set, eigenVector &ssx_sqrt_i_sub, double rsq_cutoff, eigenVector &rsq_size, eigenVector &mean_rsq_sub, eigenVector &max_rsq_sub, int s1, int s2, bool second)
686 {
687     int i = 0, j = 0, k = 0, l = 0, m = 0, n = _keep.size();
688     vector<int> brk_pnt_sub;
689     brk_pnt_sub.push_back(0);
690     for (i = size_limit; i < size - size_limit; i += size_limit) {
691         brk_pnt_sub.push_back(i - 1);
692         brk_pnt_sub.push_back(i);
693         j = i;
694     }
695     j = (size - j) / 2 + j;
696     brk_pnt_sub.push_back(j - 1);
697     brk_pnt_sub.push_back(j);
698     brk_pnt_sub.push_back(size - 1);
699 
700     for (i = 0; i < brk_pnt_sub.size() - 1; i++) {
701         int size_sub = brk_pnt_sub[i + 1] - brk_pnt_sub[i] + 1;
702         if (size_sub < 3) continue;
703 
704         eigenVector ssx_sqrt_i_sub_sub = ssx_sqrt_i_sub.segment(brk_pnt_sub[i], size_sub);
705         MatrixXf rsq_sub_sub = X_sub.block(0,brk_pnt_sub[i],n,size_sub).transpose() * X_sub2;
706         eigenVector rsq_size_sub = eigenVector::Zero(size_sub);
707         eigenVector mean_rsq_sub_sub = eigenVector::Zero(size_sub);
708         eigenVector max_rsq_sub_sub = eigenVector::Constant(size_sub, -1.0);
709 
710         #pragma omp parallel for private(k, l)
711         for (j = 0; j < size_sub; j++) {
712             unsigned long s = j + brk_pnt_sub[i];
713             rsq_size_sub[j] = 0.0;
714             mean_rsq_sub_sub[j] = 0.0;
715             for (k = 0; k < used_in_this_set.size(); k++) {
716                 l = used_in_this_set[k];
717                 if (second) {
718                     if (s <= s1 && l <= s1) continue;
719                     if (s >= s2 && l >= s2) continue;
720                 }
721                 if (l == s) continue;
722                 rsq_sub_sub(j,k) *= (ssx_sqrt_i_sub_sub[j] * ssx_sqrt_i_sub[l]);
723                 rsq_sub_sub(j,k) = rsq_sub_sub(j,k) * rsq_sub_sub(j,k);
724                 if (rsq_sub_sub(j,k) >= rsq_cutoff) {
725                     if(_ldscore_adj) mean_rsq_sub_sub[j] += rsq_sub_sub(j,k) - (1.0 - rsq_sub_sub(j,k)) / (n - 2.0);
726                     else mean_rsq_sub_sub[j] += rsq_sub_sub(j,k);
727                     rsq_size_sub[j] += 1.0;
728                 }
729                 if(rsq_sub_sub(j,k) > max_rsq_sub_sub[j]) max_rsq_sub_sub[j] = rsq_sub_sub(j,k);
730             }
731 
732             if (rsq_size_sub[j] > 0.0) mean_rsq_sub_sub[j] /= rsq_size_sub[j];
733         }
734 
735         for (j = 0, k = brk_pnt_sub[i]; j < size_sub; j++, k++) {
736             mean_rsq_sub[k] = mean_rsq_sub_sub[j];
737             rsq_size[k] = rsq_size_sub[j];
738             max_rsq_sub[k] = max_rsq_sub_sub[j];
739         }
740     }
741 }
742 
ld_seg(string i_ld_file,int seg_size,int wind_size,double rsq_cutoff,bool dominance_flag)743 void gcta::ld_seg(string i_ld_file, int seg_size, int wind_size, double rsq_cutoff, bool dominance_flag)
744 {
745     int i = 0, j = 0, k = 0, m = 0;
746     vector<float> mrsq, snp_num, max_rsq, ldscore;
747     vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
748 
749     if(!i_ld_file.empty()){
750         ifstream ild(i_ld_file.c_str());
751         if (!ild) throw ("Error: can not open the file [" + i_ld_file + "] to read.");
752 
753         string str_buf;
754         cout << "Reading per-SNP LD score from [" + i_ld_file + "] ..." << endl;
755         getline(ild, str_buf); // get the header
756         while(getline(ild, str_buf)){
757             if(str_buf.size() > 0) m++;
758         }
759         ild.close();
760         ild.open(i_ld_file.c_str());
761         _snp_num = m;
762         _snp_name.resize(m);
763         _chr.resize(m);
764         _bp.resize(m);
765         _mu.resize(m);
766         mrsq.resize(m);
767         snp_num.resize(m);
768         max_rsq.resize(m);
769         ldscore.resize(m);
770         i = 0;
771         getline(ild, str_buf); // get the header
772         while (ild) {
773             ild >> str_buf;
774             if (ild.eof() || str_buf.size() < 1) break;
775             _snp_name[i] = str_buf;
776             if(!(ild >> _chr[i])) throw("Error: in the file [" + i_ld_file + "].");
777             if(!(ild >> _bp[i])) throw("Error: in the file [" + i_ld_file + "].");
778             if(!(ild >> _mu[i])) throw("Error: in the file [" + i_ld_file + "].");
779             _mu[i] *= 2.0;
780             if(!(ild >> mrsq[i])) throw("Error: in the file [" + i_ld_file + "].");
781             if(!(ild >> snp_num[i])) throw("Error: in the file [" + i_ld_file + "].");
782             if(!(ild >> max_rsq[i])) throw("Error: in the file [" + i_ld_file + "].");
783             if(!(ild >> ldscore[i])) throw("Error: in the file [" + i_ld_file + "].");
784             getline(ild, str_buf);
785             i++;
786         }
787         ild.close();
788         cout << "Per-SNP LD score for " << m << " SNPs read from [" + i_ld_file + "]." << endl;
789 
790         _include.resize(m);
791         for(i = 0; i < m; i++) _include[i] = i;
792     }
793     else {
794         m = _include.size();
795         check_autosome();
796         cout << "\nCalculating LD score between SNPs (block size of " << wind_size / 1000 << "Kb with an overlap of "<<wind_size / 2000<<"Kb between blocks); LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
797         if(dominance_flag) cout<<"(SNP genotypes are coded for dominance effects)"<<endl;
798         get_ld_blk_pnt(brk_pnt1, brk_pnt2, brk_pnt3, wind_size);
799         eigenVector mrsq_buf = eigenVector::Zero(m), snp_num_buf = eigenVector::Zero(m), max_rsq_buf = eigenVector::Zero(m);
800         calcu_ld_blk(brk_pnt1, brk_pnt3, mrsq_buf, snp_num_buf, max_rsq_buf, false, rsq_cutoff, dominance_flag);
801         if (brk_pnt2.size() > 1) calcu_ld_blk(brk_pnt2, brk_pnt3, mrsq_buf, snp_num_buf, max_rsq_buf, true, rsq_cutoff, dominance_flag);
802         mrsq.resize(m);
803         snp_num.resize(m);
804         max_rsq.resize(m);
805         ldscore.resize(m);
806         for(i = 0; i < m; i++){
807             mrsq[i] = mrsq_buf(i);
808             snp_num[i] = snp_num_buf(i);
809             max_rsq[i] = max_rsq_buf(i);
810             ldscore[i] = 1.0 + mrsq_buf(i) * snp_num_buf(i);
811         }
812     }
813 
814     cout << "Calculating regional mean LD score (region width =  " << seg_size / 1000 << "Kb with an overlap of " << seg_size / 2000 << "Kb between regions) ... " << endl;
815 
816     get_lds_brkpnt(brk_pnt1, brk_pnt2, seg_size);
817     int size = 0, mean_size = 0, count = 0;
818     for (i = 0; i < brk_pnt1.size() - 1; i++) {
819         size = brk_pnt1[i + 1] - brk_pnt1[i] + 1;
820         if (size > 2){
821             mean_size += size;
822             count++;
823         }
824     }
825     mean_size /= count;
826     get_lds_brkpnt(brk_pnt1, brk_pnt2, 0, mean_size);
827 
828     vector<double> lds(m);
829     for (i = 0; i < brk_pnt1.size() - 1; i++){
830         size = brk_pnt1[i + 1] - brk_pnt1[i] + 1;
831         if(size < 3) continue;
832         double ld_score = 0.0;
833         for(j = brk_pnt1[i]; j <= brk_pnt1[i + 1]; j++) ld_score += ldscore[j]; //mrsq[j] * snp_num[j] + 1.0;
834         ld_score /= (double)size;
835         for(j = brk_pnt1[i]; j <= brk_pnt1[i + 1]; j++) lds[j] = ld_score;
836     }
837     for (i = 0; i < brk_pnt2.size() - 1; i++){
838         size = brk_pnt2[i + 1] - brk_pnt2[i] + 1;
839         if(size < 3) continue;
840         double ld_score = 0.0;
841         for(j = brk_pnt2[i]; j <= brk_pnt2[i + 1]; j++) ld_score += ldscore[j]; // mrsq[j] * snp_num[j] + 1.0;
842         ld_score /= (double)size;
843         for(j = brk_pnt2[i]; j <= brk_pnt2[i + 1]; j++) lds[j] = 0.5*(ld_score + lds[j]);
844     }
845 
846     string lds_file;
847     if(dominance_flag) lds_file = _out + ".d.score.ld";
848     else lds_file = _out + ".score.ld";
849     cout << "Writing the regional LD score to file ["+ lds_file +"] ..." << endl;
850     ofstream o_lds(lds_file.data());
851     o_lds << "SNP chr bp freq mean_rsq snp_num max_rsq ldscore_SNP ldscore_region"<<endl;
852     for (i = 0; i < m; i++){
853         o_lds << _snp_name[_include[i]] << " " << _chr[_include[i]] << " " << _bp[_include[i]] << " " << 0.5 * _mu[_include[i]] ;
854         o_lds << " " << mrsq[i] << " " << snp_num[i] << " " << max_rsq[i] << " " << ldscore[i] << " " << lds[i] << "\n";
855     }
856 }
857 
get_lds_brkpnt(vector<int> & brk_pnt1,vector<int> & brk_pnt2,int ldwt_seg,int wind_snp_num)858 void gcta::get_lds_brkpnt(vector<int> &brk_pnt1, vector<int> &brk_pnt2, int ldwt_seg, int wind_snp_num)
859 {
860     unsigned long i = 0, j = 0, k = 0, m = _include.size();
861 
862     brk_pnt1.clear();
863     brk_pnt1.push_back(0);
864     bool chr_start = true;
865     int prev_length = 0;
866     for (i = 1, j = 0; i < m; i++) {
867         if (i == (m - 1)){
868             if(!chr_start && prev_length < 0.5 * wind_snp_num) brk_pnt1[j - 1] = brk_pnt1[j] = m - 1;
869             else brk_pnt1.push_back(m - 1);
870         }
871         else if (_chr[_include[i]] != _chr[_include[brk_pnt1[j]]]) {
872             if(!chr_start && prev_length < 0.5 * wind_snp_num){
873                 brk_pnt1[j - 1] = i - 1;
874                 brk_pnt1[j] = i;
875             }
876             else{
877                 brk_pnt1.push_back(i - 1);
878                 j++;
879                 brk_pnt1.push_back(i);
880                 j++;
881             }
882             chr_start = true;
883         }
884         else if ((_bp[_include[i]] - _bp[_include[brk_pnt1[j]]] > ldwt_seg) && (i - brk_pnt1[j] > wind_snp_num)) {
885             prev_length  = i - brk_pnt1[j];
886             chr_start = false;
887             brk_pnt1.push_back(i - 1);
888             j++;
889             brk_pnt1.push_back(i);
890             j++;
891         }
892     }
893     stable_sort(brk_pnt1.begin(), brk_pnt1.end());
894     brk_pnt1.erase(unique(brk_pnt1.begin(), brk_pnt1.end()), brk_pnt1.end());
895 
896     brk_pnt2.clear();
897     for (i = 1; i < brk_pnt1.size() && brk_pnt1.size() > 2; i++) {
898         if ((_chr[_include[brk_pnt1[i - 1]]] == _chr[_include[brk_pnt1[i]]]) && (brk_pnt1[i] - brk_pnt1[i - 1] > 1)) {
899             int i_buf = (brk_pnt1[i - 1] + brk_pnt1[i]) / 2;
900             brk_pnt2.push_back(i_buf);
901             brk_pnt2.push_back(i_buf + 1);
902         }
903     }
904 }
905 
906 // calculate maximum LD rsq between SNPs
calcu_max_ld_rsq(int wind_size,double rsq_cutoff,bool dominance_flag)907 void gcta::calcu_max_ld_rsq(int wind_size, double rsq_cutoff, bool dominance_flag)
908 {
909     int i = 0, m = _include.size(), max_size = 0.3 * _keep.size();
910 
911     cout << "Calculating maximum LD rsq between SNPs (block size of " << wind_size / 1000 << "Kb with an overlap of "<<wind_size/2000<<"Kb between blocks; LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
912     cout << "(Maximum number of SNPs allowed in a block = " << max_size << " due to computational limitation)" << endl;
913     if(dominance_flag) cout<<"(SNP genotypes are coded for dominance effects)"<<endl;
914 
915     vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
916     get_ld_blk_pnt_max_limit(brk_pnt1, brk_pnt2, brk_pnt3, wind_size, max_size);
917 
918     eigenVector multi_rsq = eigenVector::Constant(m, -1.0), max_rsq = eigenVector::Constant(m, -999), multi_rsq_adj = eigenVector::Constant(m, -999);
919     vector<int> max_pos(m);
920     calcu_max_ld_rsq_blk(multi_rsq, multi_rsq_adj, max_rsq, max_pos, brk_pnt1, rsq_cutoff, dominance_flag);
921     if (brk_pnt2.size() > 1) calcu_max_ld_rsq_blk(multi_rsq, multi_rsq_adj, max_rsq, max_pos, brk_pnt2, rsq_cutoff, dominance_flag);
922 
923     string max_rsq_file = "";
924     if(dominance_flag) max_rsq_file = _out + ".d.max_rsq.ld";
925     else max_rsq_file = _out + ".max_rsq.ld";
926     ofstream o_max_rsq(max_rsq_file.data());
927     o_max_rsq<<"SNP freq max_rsq max_snp multi_rsq multi_rsq_adj"<<endl;
928     for (i = 0; i < m; i++){
929         if(max_rsq[i] > 0.0 || multi_rsq[i] > -998) o_max_rsq << _snp_name[_include[i]] << " " << 0.5 * _mu[_include[i]] << " " << max_rsq[i] << " " << _snp_name[_include[max_pos[i]]] << " " << multi_rsq[i] << " " <<multi_rsq_adj[i] << endl;
930         else o_max_rsq << _snp_name[_include[i]] << " " << 0.5 * _mu[_include[i]] << " NA NA NA NA" << endl;
931     }
932     o_max_rsq << endl;
933     cout << "Maximum LD rsq for " << m << " SNPs have been saved in the file [" + max_rsq_file + "]." << endl;
934 }
935 
get_ld_blk_pnt_max_limit(vector<int> & brk_pnt1,vector<int> & brk_pnt2,vector<int> & brk_pnt3,int wind_bp,int wind_snp)936 void gcta::get_ld_blk_pnt_max_limit(vector<int> &brk_pnt1, vector<int> &brk_pnt2, vector<int> &brk_pnt3, int wind_bp, int wind_snp)
937 {
938     unsigned long i = 0, j = 0, k = 0, m = _include.size();
939 
940     brk_pnt1.clear();
941     brk_pnt1.push_back(0);
942     for (i = 1, j = 0; i < m; i++) {
943         if (i == (m - 1)) brk_pnt1.push_back(m - 1);
944         else if (_chr[_include[i]] != _chr[_include[brk_pnt1[j]]] || _bp[_include[i]] - _bp[_include[i-1]] > 1e6) {
945             brk_pnt1.push_back(i - 1);
946             j++;
947             brk_pnt1.push_back(i);
948             j++;
949         }
950         else if ((_bp[_include[i]] - _bp[_include[brk_pnt1[j]]] > wind_bp) || (i - brk_pnt1[j] >= wind_snp)) {
951             brk_pnt1.push_back(i - 1);
952             j++;
953             brk_pnt1.push_back(i);
954             j++;
955         }
956     }
957     stable_sort(brk_pnt1.begin(), brk_pnt1.end());
958     brk_pnt1.erase(unique(brk_pnt1.begin(), brk_pnt1.end()), brk_pnt1.end());
959 
960     brk_pnt2.clear();
961     brk_pnt3.clear();
962     for (i = 1; i < brk_pnt1.size() && brk_pnt1.size() > 2; i++) {
963         if ((_chr[_include[brk_pnt1[i - 1]]] == _chr[_include[brk_pnt1[i]]]) && (brk_pnt1[i] - brk_pnt1[i - 1] > 1)) {
964             int i_buf = (brk_pnt1[i - 1] + brk_pnt1[i]) / 2;
965             brk_pnt2.push_back(i_buf);
966             brk_pnt2.push_back(i_buf + 1);
967             brk_pnt3.push_back(brk_pnt1[i]);
968             brk_pnt3.push_back(brk_pnt1[i]);
969         }
970     }
971 }
972 
calcu_max_ld_rsq_blk(eigenVector & multi_rsq,eigenVector & multi_rsq_adj,eigenVector & max_rsq,vector<int> & max_pos,vector<int> & brk_pnt,double rsq_cutoff,bool dominance_flag)973 void gcta::calcu_max_ld_rsq_blk(eigenVector &multi_rsq, eigenVector &multi_rsq_adj, eigenVector &max_rsq, vector<int> &max_pos, vector<int> &brk_pnt, double rsq_cutoff, bool dominance_flag)
974 {
975 	int i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size(), size = 0;
976 
977     for (i = 0; i < brk_pnt.size() - 1; i++)
978     {
979         if (_chr[_include[brk_pnt[i]]] != _chr[_include[brk_pnt[i + 1]]]) continue;
980         size = brk_pnt[i + 1] - brk_pnt[i] + 1;
981         if (size < 3) continue;
982 
983         // make genotype matrix
984         vector<int> snp_indx(size);
985         for (j = brk_pnt[i], k = 0; j <= brk_pnt[i + 1]; j++, k++) snp_indx[k] = j;
986         MatrixXf X_sub;
987         if(dominance_flag) make_XMat_d_subset(X_sub, snp_indx, true);
988         else make_XMat_subset(X_sub, snp_indx, true);
989         eigenVector ssx_sqrt_i_sub(size);
990         for (j = 0; j < size; j++){
991             ssx_sqrt_i_sub[j] = X_sub.col(j).squaredNorm();
992             if (ssx_sqrt_i_sub[j] < 1.0e-30) ssx_sqrt_i_sub[j] = 0.0;
993             else ssx_sqrt_i_sub[j] = 1.0 / sqrt(ssx_sqrt_i_sub[j]);
994         }
995 
996         MatrixXf rsq_sub = X_sub.transpose() * X_sub;
997         #pragma omp parallel for private(k)
998         for (j = 0; j < size; j++) {
999             rsq_sub(j,j) = 1.0;
1000             for (k = j + 1; k < size; k++){
1001                 rsq_sub(j,k) *= (ssx_sqrt_i_sub[j] * ssx_sqrt_i_sub[k]);
1002                 rsq_sub(k,j) = rsq_sub(j,k);
1003             }
1004         }
1005 
1006 
1007 	// Fixed compile by removing .array().  Not sure about the validity
1008 	// of this change, but it seemed reasonable based on constructor docs.
1009         SelfAdjointEigenSolver<MatrixXf> pca(rsq_sub);
1010 
1011                 // debug
1012        // ofstream tmp("tmp_R.txt");
1013         //tmp<< rsq_sub.col(1998) << endl;
1014 
1015 
1016         VectorXf d_i = pca.eigenvalues();
1017         double eff_m = 0;
1018         for(j = 0; j < size; j ++){
1019         	if(d_i(j) < 1e-5) d_i(j) = 0.0;
1020         	else{
1021         		d_i(j) = 1.0 / d_i(j);
1022         		eff_m++;
1023         	}
1024         }
1025 
1026         // debug
1027         //ofstream tmp2("tmp_eval.txt");
1028         //tmp2<<pca.eigenvalues()<< endl;
1029 
1030 
1031 
1032         // debug
1033         cout << "size = " << size << "; eff_m = " << eff_m << endl;
1034         MatrixXf R_i = pca.eigenvectors() * d_i.asDiagonal() * pca.eigenvectors().transpose();
1035         R_i = R_i.array();
1036 
1037 
1038         //JacobiSVD<MatrixXf> svd;
1039         /*svd.compute(rsq_sub, ComputeThinV);
1040         VectorXf d_i = svd.singularValues();
1041         double eff_m = 0;
1042         for(j = 0; j < size; j ++){
1043         	if(d_i(j) < 1e-6) d_i(j) = 0.0;
1044         	else{
1045         		d_i(j) = 1.0 / d_i(j);
1046         		eff_m++;
1047         	}
1048         }
1049 
1050         // debug
1051         cout << "size = " << size << "; eff_m = " << eff_m << endl;
1052 
1053         MatrixXf R_i = svd.matrixV() * d_i.asDiagonal() * svd.matrixV().transpose();*/
1054         VectorXf Q_diag(size);
1055         for(j = 0; j < size; j ++) Q_diag(j) = R_i.col(j).dot(rsq_sub.row(j).transpose());
1056         VectorXf multi_rsq_buf(size);
1057     	for(j = 0; j < size; j ++){
1058     		if(fabs(Q_diag[j] - 1.0) < 0.01) multi_rsq_buf[j] = 1.0 - 1.0 / R_i(j,j);
1059     		else multi_rsq_buf[j] = 1.0;
1060             if(multi_rsq_buf[j] > 1.0) multi_rsq_buf[j] = 1.0;
1061     	}
1062         VectorXf multi_rsq_buf_adj = multi_rsq_buf.array() - (1.0 - multi_rsq_buf.array()) * (eff_m / ((double)n - eff_m - 1.0));
1063 
1064         rsq_sub.diagonal() = VectorXf::Zero(size);
1065         rsq_sub = rsq_sub.array() * rsq_sub.array();
1066         VectorXf max_rsq_buf(size);
1067         vector<int> max_pos_buf(size);
1068         VectorXf::Index max_pos_pnt;
1069         for(j = 0; j < size; j++){
1070             rsq_sub.col(j).maxCoeff(&max_pos_pnt);
1071             max_pos_buf[j] = (int)max_pos_pnt;
1072             max_rsq_buf[j] = rsq_sub.col(j)[max_pos_buf[j]];
1073             if(multi_rsq_buf[j] < max_rsq_buf[j]) multi_rsq_buf[j] = max_rsq_buf[j];
1074         }
1075 
1076 /*      for(j = 0; j < size; j++){
1077             if(multi_rsq_buf[j] > 1.0) multi_rsq_buf[j] = 1.0;
1078             if(max_rsq_buf[j] > 1.0) max_rsq_buf[j] = 1.0;
1079         }
1080         */
1081         for (j = 0, k = brk_pnt[i]; j < size; j++, k++) {
1082             if(multi_rsq_adj[k] <= multi_rsq_buf_adj[j]){
1083             	multi_rsq[k] = multi_rsq_buf[j];
1084             	multi_rsq_adj[k] = multi_rsq_buf_adj[j];
1085             }
1086             if(max_rsq[k] < max_rsq_buf[j]){
1087                 max_rsq[k] = max_rsq_buf[j];
1088                 max_pos[k] = max_pos_buf[j] + brk_pnt[i];
1089             }
1090         }
1091     }
1092 }
1093 
bending_eigenval_Xf(VectorXf & eval)1094 bool gcta::bending_eigenval_Xf(VectorXf &eval)
1095 {
1096     int j = 0;
1097     double eval_m = eval.mean();
1098     if (eval.minCoeff() > 0.0) return false;
1099     double S = 0.0, P = 0.0;
1100     for (j = 0; j < eval.size(); j++) {
1101         if (eval[j] >= 0) continue;
1102         S += eval[j];
1103         P = -eval[j];
1104     }
1105     double W = S * S * 100.0 + 1;
1106     for (j = 0; j < eval.size(); j++) {
1107         if (eval[j] >= 0) continue;
1108         eval[j] = P * (S - eval[j])*(S - eval[j]) / W;
1109     }
1110     eval *= eval_m / eval.mean();
1111     return true;
1112 }
1113 
1114