1 /*
2  *  mkl.cpp
3  *  gcta
4  *
5  *  Created by Jian Yang on 24/01/13.
6  *  Copyright 2013 QIMR. All rights reserved.
7  *
8  */
9 
10 #include "gcta.h"
11 
12 /////////////////
13 // data functions
14 
make_XMat_mkl(float * X,bool grm_d_flag)15 void gcta::make_XMat_mkl(float *X, bool grm_d_flag)
16 {
17     if (_mu.empty()) calcu_mu();
18 
19     cout << "Recoding genotypes (individual major mode) ..." << endl;
20     unsigned long i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size();
21 
22     if (!grm_d_flag) {
23         #pragma omp parallel for private(j)
24         for (i = 0; i < n; i++) {
25             if (_dosage_flag) {
26                 for (j = 0; j < m; j++) {
27                     if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
28                         if (_allele1[_include[j]] == _ref_A[_include[j]]) X[i * m + j] = _geno_dose[_keep[i]][_include[j]];
29                         else X[i * m + j] = 2.0 - _geno_dose[_keep[i]][_include[j]];
30                     } else X[i * m + j] = 1e6;
31                 }
32                 _geno_dose[i].clear();
33             } else {
34                 for (j = 0; j < _include.size(); j++) {
35                     if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
36                         if (_allele1[_include[j]] == _ref_A[_include[j]]) X[i * m + j] = _snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]];
37                         else X[i * m + j] = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
38                     } else X[i * m + j] = 1e6;
39                 }
40             }
41         }
42     }
43     else {
44         #pragma omp parallel for private(j, k)
45         for (i = 0; i < n; i++) {
46             if (_dosage_flag) {
47                 for (j = 0; j < m; j++) {
48                     if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
49                         k = i * m + j;
50                         if (_allele1[_include[j]] == _ref_A[_include[j]]) X[k] = _geno_dose[_keep[i]][_include[j]];
51                         else X[k] = 2.0 - _geno_dose[_keep[i]][_include[j]];
52                         if (X[k] < 0.5) X[k] = 0.0;
53                         else if (X[k] < 1.5) X[k] = _mu[_include[j]];
54                         else X[k] = (2.0 * _mu[_include[j]] - 2.0);
55                     } else X[k] = 1e6;
56                 }
57                 _geno_dose[i].clear();
58             }
59             else {
60                 for (j = 0; j < _include.size(); j++) {
61                     if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
62                         k = i * m + j;
63                         if (_allele1[_include[j]] == _ref_A[_include[j]]) X[k] = _snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]];
64                         else X[k] = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
65                         if (X[k] < 0.5) X[k] = 0.0;
66                         else if (X[k] < 1.5) X[k] = _mu[_include[j]];
67                         else X[k] = (2.0 * _mu[_include[j]] - 2.0);
68                     } else X[i * m + j] = 1e6;
69                 }
70             }
71         }
72     }
73 }
74 
std_XMat_mkl(float * X,vector<double> & sd_SNP,bool grm_xchr_flag,bool miss_with_mu,bool divid_by_std)75 void gcta::std_XMat_mkl(float *X, vector<double> &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std) {
76     if (_mu.empty()) calcu_mu();
77 
78     unsigned long i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size();
79     sd_SNP.clear();
80     sd_SNP.resize(m);
81     if (_dosage_flag) {
82 #pragma omp parallel for private(i)
83         for (j = 0; j < m; j++) {
84             for (i = 0; i < n; i++) {
85                 double d_buf = X[i * m + j] - _mu[_include[j]];
86                 sd_SNP[j] += d_buf*d_buf;
87             }
88             sd_SNP[j] /= (n - 1.0);
89         }
90     } else {
91         for (j = 0; j < m; j++) sd_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
92     }
93     if (divid_by_std) {
94         for (j = 0; j < m; j++) {
95             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
96             else sd_SNP[j] = sqrt(1.0 / sd_SNP[j]);
97         }
98     }
99 
100 #pragma omp parallel for private(j, k)
101     for (i = 0; i < n; i++) {
102         for (j = 0; j < m; j++) {
103             k = i * m + j;
104             if (X[k] < 1e5) {
105                 X[k] -= _mu[_include[j]];
106                 if (divid_by_std) X[k] *= sd_SNP[j];
107             } else if (miss_with_mu) X[k] = 0.0;
108         }
109     }
110 
111     if (!grm_xchr_flag) return;
112     // for the X-chromosome
113     check_sex();
114     double f_buf = sqrt(0.5);
115 
116 #pragma omp parallel for private(j, k)
117     for (i = 0; i < n; i++) {
118         if (_sex[_keep[i]] == 1) {
119             for (j = 0; j < m; j++) {
120                 k = i * m + j;
121                 if (X[k] < 1e5) X[k] *= f_buf;
122                 else if (miss_with_mu) X[k] = 0.0;
123             }
124         }
125     }
126 }
127 
std_XMat_d_mkl(float * X,vector<double> & sd_SNP,bool miss_with_mu,bool divid_by_std)128 void gcta::std_XMat_d_mkl(float *X, vector<double> &sd_SNP, bool miss_with_mu, bool divid_by_std) {
129     if (_mu.empty()) calcu_mu();
130 
131     unsigned long i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size();
132     sd_SNP.clear();
133     sd_SNP.resize(m);
134     if (_dosage_flag) {
135 #pragma omp parallel for private(i)
136         for (j = 0; j < m; j++) {
137             for (i = 0; i < n; i++) {
138                 double d_buf = (X[i * m + j] - _mu[_include[j]]);
139                 sd_SNP[j] += d_buf*d_buf;
140             }
141             sd_SNP[j] /= (n - 1.0);
142         }
143     } else {
144         for (j = 0; j < m; j++) sd_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
145     }
146     if (divid_by_std) {
147         for (j = 0; j < m; j++) {
148             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
149             else sd_SNP[j] = 1.0 / sd_SNP[j];
150         }
151     } else {
152         for (j = 0; j < m; j++) sd_SNP[j] = sd_SNP[j] * sd_SNP[j];
153     }
154     vector<double> psq(m);
155     for (j = 0; j < m; j++) psq[j] = 0.5 * _mu[_include[j]] * _mu[_include[j]];
156 
157 #pragma omp parallel for private(j, k)
158     for (i = 0; i < n; i++) {
159         for (j = 0; j < m; j++) {
160             k = i * m + j;
161             if (X[k] < 1e5) {
162                 X[k] -= psq[j];
163                 if (divid_by_std) X[k] *= sd_SNP[j];
164             } else if (miss_with_mu) X[k] = 0.0;
165         }
166     }
167 }
168 
169 ////////////
170 // GRM
make_grm_mkl(bool grm_d_flag,bool grm_xchr_flag,bool inbred,bool output_bin,int grm_mtd,bool mlmassoc,bool diag_f3_flag)171 void gcta::make_grm_mkl(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, bool mlmassoc, bool diag_f3_flag)
172 {
173     if (!grm_d_flag && grm_xchr_flag) check_chrX();
174     else check_autosome();
175 
176     unsigned long i = 0, j = 0, k = 0, l = 0, n = _keep.size(), m = _include.size();
177     _geno_mkl = new float[n * m]; // alloc memory to X matrix
178 
179     make_XMat_mkl(_geno_mkl, grm_d_flag);
180     vector<double> sd_SNP, sd_SNP_buf;
181     if (grm_mtd == 0) {
182         if (grm_d_flag) std_XMat_d_mkl(_geno_mkl, sd_SNP, false, true);
183         else std_XMat_mkl(_geno_mkl, sd_SNP, grm_xchr_flag, false, true);
184     }
185     else {
186         if (grm_d_flag) std_XMat_d_mkl(_geno_mkl, sd_SNP, false, false);
187         else std_XMat_mkl(_geno_mkl, sd_SNP, grm_xchr_flag, false, false);
188     }
189 
190     if (!mlmassoc) cout << "\nCalculating the" << ((grm_d_flag) ? " dominance" : "") << " genetic relationship matrix (GRM)" << (grm_xchr_flag ? " for the X chromosome" : "") << (_dosage_flag ? " using imputed dosage data" : "") << " ... (Note: default speed-optimized mode, may use huge RAM)" << endl;
191     else cout << "\nCalculating the genetic relationship matrix (GRM) ... " << endl;
192 
193     // count the number of missing genotypes
194     vector< vector<int> > miss_pos(n);
195     bool * X_bool = new bool[n * m];
196     for (i = 0; i < n; i++) {
197         for (j = 0; j < m; j++) {
198             k = i * m + j;
199             if (_geno_mkl[k] < 1e5) X_bool[k] = true;
200             else {
201                 _geno_mkl[k] = 0.0;
202                 miss_pos[i].push_back(j);
203                 X_bool[k] = false;
204             }
205         }
206     }
207 
208     // Calculate A_N matrix
209     _grm_N.resize(n, n);
210     #pragma omp parallel for private(j, k)
211     for (i = 0; i < n; i++) {
212         for (j = 0; j <= i; j++) {
213             int miss_j = 0;
214             for (k = 0; k < miss_pos[j].size(); k++) miss_j += (int) X_bool[i * m + miss_pos[j][k]];
215             _grm_N(i,j) = m - miss_pos[i].size() - miss_j;
216         }
217     }
218 
219     // Calculate sum of LD weights
220     if (grm_mtd == 1) {
221         double denominator = 0.0;
222         for (j = 0; j < m; j++) denominator += sd_SNP[j];
223         denominator = denominator / (double)m;
224         #pragma omp parallel for private(j, k)
225         for (i = 0; i < n; i++) {
226             for (j = 0; j <= i; j++) _grm_N(i,j) = _grm_N(i,j) * denominator;
227         }
228     }
229 
230     // Calcuate WW'
231     _grm_mkl = new float[n * n]; // alloc memory to A
232     cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, n, n, m, 1.0, _geno_mkl, m, _geno_mkl, m, 0.0, _grm_mkl, n);
233 
234     // re-calcuate the diagonals (Fhat3+1)
235     if (diag_f3_flag) {
236         #pragma omp parallel for private(j,k,l)
237         for (i = 0; i < n; i++) {
238             l = i * n + i;
239             _grm_mkl[l] = 0.0;
240             for (j = 0; j < m; j++) {
241                 k = i * m + j;
242                 _grm_mkl[l] += _geno_mkl[k]*(_geno_mkl[k]+(_mu[_include[j]] - 1.0) * sd_SNP[j]);
243             }
244         }
245     }
246 
247     // Calculate A matrix
248     #pragma omp parallel for private(j)
249     for (i = 0; i < n; i++) {
250         for (j = 0; j <= i; j++) {
251             if (_grm_N(i,j) > 0.0) _grm_mkl[i * n + j] /= _grm_N(i,j);
252             else _grm_mkl[i * n + j] = 0.0;
253         }
254     }
255 
256     if (inbred) {
257         #pragma omp parallel for private(j)
258         for (i = 0; i < n; i++) {
259             for (j = 0; j <= i; j++) _grm_mkl[i * n + j] *= 0.5;
260         }
261     }
262 
263     if (mlmassoc && grm_mtd == 0) {
264         for (j = 0; j < m; j++) {
265             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
266             else sd_SNP[j] = 1.0 / sd_SNP[j];
267         }
268         #pragma omp parallel for private(j, k)
269         for (i = 0; i < n; i++) {
270             for (j = 0; j < m; j++) {
271                 k = i * m + j;
272                 if (_geno_mkl[k] < 1e5) _geno_mkl[k] *= sd_SNP[j];
273                 else _geno_mkl[k] = 0.0;
274             }
275         }
276         delete[] X_bool;
277     } else {
278         // Output A_N and A
279         string out_buf = _out;
280         if (grm_d_flag) _out += ".d";
281         output_grm_mkl(_grm_mkl, output_bin);
282         _out = out_buf;
283 
284         // free memory
285         delete[] _geno_mkl;
286         delete[] X_bool;
287         delete[] _grm_mkl;
288     }
289 }
290 
output_grm_mkl(float * A,bool output_grm_bin)291 void gcta::output_grm_mkl(float* A, bool output_grm_bin)
292 {
293     unsigned long i = 0, j = 0, n = _keep.size();
294     string grm_file;
295 
296     if (output_grm_bin) {
297         // Save matrix A in binary file
298         grm_file = _out + ".grm.bin";
299         fstream A_Bin(grm_file.c_str(), ios::out | ios::binary);
300         if (!A_Bin) throw ("Error: can not open the file [" + grm_file + "] to write.");
301         int size = sizeof (float);
302         for (i = 0; i < n; i++) {
303             for (j = 0; j <= i; j++) A_Bin.write((char*) &(A[i * n + j]), size);
304         }
305         A_Bin.close();
306         cout << "GRM of " << n << " individuals has been saved in the file [" + grm_file + "] (in binary format)." << endl;
307 
308         string grm_N_file = _out + ".grm.N.bin";
309         fstream N_Bin(grm_N_file.c_str(), ios::out | ios::binary);
310         if (!N_Bin) throw ("Error: can not open the file [" + grm_N_file + "] to write.");
311         size = sizeof (float);
312         for (i = 0; i < n; i++) {
313             for (j = 0; j <= i; j++) N_Bin.write((char*) &(_grm_N(i,j)), size);
314         }
315         N_Bin.close();
316         cout << "Number of SNPs to calcuate the genetic relationship between each pair of individuals has been saved in the file [" + grm_N_file + "] (in binary format)." << endl;
317     } else {
318         // Save A matrix in txt format
319         grm_file = _out + ".grm.gz";
320         gzofstream zoutf;
321         zoutf.open(grm_file.c_str());
322         if (!zoutf.is_open()) throw ("Error: can not open the file [" + grm_file + "] to write.");
323         cout << "Saving the genetic relationship matrix to the file [" + grm_file + "] (in compressed text format)." << endl;
324         zoutf.setf(ios::scientific);
325         zoutf.precision(6);
326         for (i = 0; i < n; i++) {
327             for (j = 0; j <= i; j++) zoutf << i + 1 << '\t' << j + 1 << '\t' << _grm_N(i,j) << '\t' << A[i * n + j] << endl;
328         }
329         zoutf.close();
330         cout << "The genetic relationship matrix has been saved in the file [" + grm_file + "] (in compressed text format)." << endl;
331     }
332 
333     string famfile = _out + ".grm.id";
334     ofstream Fam(famfile.c_str());
335     if (!Fam) throw ("Error: can not open the file [" + famfile + "] to write.");
336     for (i = 0; i < n; i++) Fam << _fid[_keep[i]] + "\t" + _pid[_keep[i]] << endl;
337     Fam.close();
338     cout << "IDs for the GRM file [" + grm_file + "] have been saved in the file [" + famfile + "]." << endl;
339 }
340 
341 ///////////
342 // reml
343 
comput_inverse_logdet_LDLT_mkl(eigenMatrix & Vi,double & logdet)344 bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet)
345 {
346     unsigned long i = 0, j = 0, n = Vi.cols();
347     double* Vi_mkl = new double[n * n];
348     //float* Vi_mkl=new float[n*n];
349 
350     #pragma omp parallel for private(j)
351     for (i = 0; i < n; i++) {
352         for (j = 0; j < n; j++) {
353             Vi_mkl[i * n + j] = Vi(i, j);
354         }
355     }
356 
357     // MKL's Cholesky decomposition
358     int info = 0, int_n = (int) n;
359     char uplo = 'L';
360     dpotrf_(&uplo, &int_n, Vi_mkl, &int_n, &info);
361     //spotrf( &uplo, &n, Vi_mkl, &n, &info );
362     if (info < 0) throw ("Error: Cholesky decomposition failed. Invalid values found in the matrix.\n");
363     else if (info > 0) return false;
364     else {
365         logdet = 0.0;
366         for (i = 0; i < n; i++) {
367             double d_buf = Vi_mkl[i * n + i];
368             logdet += log(d_buf * d_buf);
369         }
370 
371         // Calcualte V inverse
372         dpotri_(&uplo, &int_n, Vi_mkl, &int_n, &info);
373         //spotri( &uplo, &n, Vi_mkl, &n, &info );
374         if (info < 0) throw ("Error: invalid values found in the varaince-covaraince (V) matrix.\n");
375         else if (info > 0) return false;
376         else {
377             #pragma omp parallel for private(j)
378             for (j = 0; j < n; j++) {
379                 for (i = 0; i <= j; i++) Vi(i, j) = Vi(j, i) = Vi_mkl[i * n + j];
380             }
381         }
382     }
383 
384     // free memory
385     delete[] Vi_mkl;
386 
387     return true;
388 
389 }
390 
comput_inverse_logdet_LU_mkl(eigenMatrix & Vi,double & logdet)391 bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet)
392 {
393     unsigned long i = 0, j = 0, n = Vi.cols();
394     double* Vi_mkl = new double[n * n];
395 
396     #pragma omp parallel for private(j)
397     for (i = 0; i < n; i++) {
398         for (j = 0; j < n; j++) {
399             Vi_mkl[i * n + j] = Vi(i, j);
400         }
401     }
402 
403     int N = (int) n;
404     int *IPIV = new int[n + 1];
405     int LWORK = N*N;
406     double *WORK = new double[n * n];
407     int INFO;
408     dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO);
409     if (INFO < 0) throw ("Error: LU decomposition failed. Invalid values found in the matrix.\n");
410     else if (INFO > 0) {
411         delete[] Vi_mkl;
412         return false;
413     } else {
414         logdet = 0.0;
415         for (i = 0; i < n; i++) {
416             double d_buf = Vi_mkl[i * n + i];
417             logdet += log(fabs(d_buf));
418         }
419 
420         // Calcualte V inverse
421         dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO);
422         if (INFO < 0) throw ("Error: invalid values found in the varaince-covaraince (V) matrix.\n");
423         else if (INFO > 0) return false;
424         else {
425             #pragma omp parallel for private(j)
426             for (j = 0; j < n; j++) {
427                 for (i = 0; i <= j; i++) Vi(i, j) = Vi(j, i) = Vi_mkl[i * n + j];
428             }
429         }
430     }
431 
432     // free memory
433     delete[] Vi_mkl;
434     delete[] IPIV;
435     delete[] WORK;
436 
437     return true;
438 }
439 
comput_inverse_logdet_LU_mkl_array(int n,float * Vi,double & logdet)440 bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet) {
441     unsigned long i = 0, j = 0;
442     double* Vi_mkl = new double[n * n];
443 
444     #pragma omp parallel for private(j)
445     for (i = 0; i < n; i++) {
446         for (j = 0; j < n; j++) {
447             Vi_mkl[i * n + j] = Vi[i * n + j];
448         }
449     }
450 
451     int N = (int) n;
452     int *IPIV = new int[n + 1];
453     int LWORK = N*N;
454     double *WORK = new double[n * n];
455     int INFO;
456     dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO);
457     if (INFO < 0) throw ("Error: LU decomposition failed. Invalid values found in the matrix.\n");
458     else if (INFO > 0) {
459         delete[] Vi_mkl;
460         return (false); //Vi.diagonal()=Vi.diagonal().array()+Vi.diagonal().mean()*1e-3;
461     }
462     else {
463         logdet = 0.0;
464         for (i = 0; i < n; i++) {
465             double d_buf = Vi_mkl[i * n + i];
466             logdet += log(fabs(d_buf));
467         }
468 
469         // Calcualte V inverse
470         dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO);
471         if (INFO < 0) throw ("Error: invalid values found in the varaince-covaraince (V) matrix.\n");
472         else if (INFO > 0) return (false); // Vi.diagonal()=Vi.diagonal().array()+Vi.diagonal().mean()*1e-3;
473         else {
474             #pragma omp parallel for private(j)
475             for (j = 0; j < n; j++) {
476                 for (i = 0; i < n; i++) Vi[i * n + j] = Vi_mkl[i * n + j];
477             }
478         }
479     }
480 
481     // free memory
482     delete[] Vi_mkl;
483     delete[] IPIV;
484     delete[] WORK;
485 
486     return true;
487 }
488 
489 //////////////
490 // LD
491 
LD_pruning_mkl(double rsq_cutoff,int wind_size)492 void gcta::LD_pruning_mkl(double rsq_cutoff, int wind_size) {
493     check_autosome();
494 
495     unsigned long i = 0, j = 0, k = 0, l = 0, n = _keep.size(), m = _include.size();
496     _geno_mkl = new float[n * m]; // alloc memory to X matrix
497     make_XMat_mkl(_geno_mkl, false);
498     vector<double> sd_SNP;
499     std_XMat_mkl(_geno_mkl, sd_SNP, false, true, true);
500 
501     cout << "\nPruning SNPs for LD ..." << endl;
502     vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
503     get_ld_blk_pnt(brk_pnt1, brk_pnt2, brk_pnt3, wind_size*2);
504 
505     vector<int> rm_snp_indx;
506     LD_pruning_blk_mkl(_geno_mkl, brk_pnt1, rsq_cutoff, rm_snp_indx);
507     if (brk_pnt2.size() > 1) LD_pruning_blk_mkl(_geno_mkl, brk_pnt2, rsq_cutoff, rm_snp_indx);
508     stable_sort(rm_snp_indx.begin(), rm_snp_indx.end());
509     rm_snp_indx.erase(unique(rm_snp_indx.begin(), rm_snp_indx.end()), rm_snp_indx.end());
510     int m_sub = rm_snp_indx.size();
511     vector<string> rm_snp_name(m_sub);
512 #pragma omp parallel for
513     for (i = 0; i < m_sub; i++) rm_snp_name[i] = _snp_name[_include[rm_snp_indx[i]]];
514     update_id_map_rm(rm_snp_name, _snp_name_map, _include);
515     m = _include.size();
516 
517     cout << "After LD-pruning, " << m << " SNPs are remaining." << endl;
518     string pruned_file = _out + ".prune.in";
519     ofstream oprune(pruned_file.data());
520     for (i = 0; i < m; i++) oprune << _snp_name[_include[i]] << endl;
521     oprune << endl;
522     cout << "The list of " << m << " LD-pruned SNPs (pruned in) have been saved in the file [" + pruned_file + "]." << endl;
523 }
524 
LD_pruning_blk_mkl(float * X,vector<int> & brk_pnt,double rsq_cutoff,vector<int> & rm_snp_ID1)525 void gcta::LD_pruning_blk_mkl(float *X, vector<int> &brk_pnt, double rsq_cutoff, vector<int> &rm_snp_ID1)
526 {
527     unsigned long i = 0, j = 0, k = 0, l = 0, n = _keep.size(), m = _include.size(), size = 0;
528 
529     for (i = 0; i < brk_pnt.size() - 1; i++) {
530         if (_chr[_include[brk_pnt[i]]] != _chr[_include[brk_pnt[i + 1]]]) continue;
531         size = brk_pnt[i + 1] - brk_pnt[i] + 1;
532         if (size < 3) continue;
533 
534         float *X_sub = new float[n * size];
535         #pragma omp parallel for private(k, l)
536         for (j = 0; j < n; j++) {
537             for (k = 0, l = brk_pnt[i]; k < size; k++, l++) X_sub[j * size + k] = X[j * m + l];
538         }
539 
540         float *rsq_sub = new float[size * size];
541         cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, size, size, n, 1.0, X_sub, size, X_sub, size, 0.0, rsq_sub, size);
542         #pragma omp parallel for private(k, l)
543         for (j = 0; j < size; j++) {
544             for (k = 0; k < size; k++) {
545                 l = j * size + k;
546                 rsq_sub[l] /= (float) n;
547                 rsq_sub[l] = rsq_sub[l] * rsq_sub[l];
548             }
549         }
550         vector<int> rm_snp_buf;
551         rm_cor_snp((int) size, brk_pnt[i], rsq_sub, rsq_cutoff, rm_snp_buf);
552         rm_snp_ID1.insert(rm_snp_ID1.end(), rm_snp_buf.begin(), rm_snp_buf.end());
553 
554         delete[] X_sub;
555         delete[] rsq_sub;
556     }
557 }
558 
calcu_mean_rsq_mkl(int wind_size,double rsq_cutoff)559 void gcta::calcu_mean_rsq_mkl(int wind_size, double rsq_cutoff)
560 {
561     check_autosome();
562 
563     unsigned long i = 0, j = 0, k = 0, l = 0, n = _keep.size(), m = _include.size();
564     _geno_mkl = new float[n * m];
565     make_XMat_mkl(_geno_mkl, false);
566     vector<double> sd_SNP;
567     std_XMat_mkl(_geno_mkl, sd_SNP, false, true, true);
568     calcu_ssx_sqrt_i_mkl(_geno_mkl, sd_SNP);
569 
570     cout << "Calculating mean and maximum LD rsq (window size = at least " << wind_size / 1000 << "Kb in either direction; LD rsq threshold = " << rsq_cutoff << ") ... " << endl;
571     vector<int> brk_pnt1, brk_pnt2, brk_pnt3;
572     get_ld_blk_pnt(brk_pnt1, brk_pnt2, brk_pnt3, wind_size*2);
573 
574     eigenVector mean_rsq = eigenVector::Zero(m);
575     eigenVector snp_num = eigenVector::Zero(m);
576     eigenVector max_rsq = eigenVector::Zero(m);
577     calcu_ld_blk_mkl(_geno_mkl, sd_SNP, brk_pnt1, brk_pnt3, mean_rsq, snp_num, max_rsq, false, rsq_cutoff);
578     if (brk_pnt2.size() > 1) calcu_ld_blk_mkl(_geno_mkl, sd_SNP, brk_pnt2, brk_pnt3, mean_rsq, snp_num, max_rsq, true, rsq_cutoff);
579 
580     string mrsq_file = _out + ".mrsq.ld";
581     ofstream o_mrsq(mrsq_file.data());
582     for (i = 0; i < m; i++) o_mrsq << _snp_name[_include[i]] << " " << 0.5 * _mu[_include[i]] << " " << mean_rsq[i] << " " << snp_num[i] << " " << max_rsq[i] << endl;
583     o_mrsq << endl;
584     cout << "Mean and maximum LD rsq for " << m << " SNPs have been saved in the file [" + mrsq_file + "]." << endl;
585 }
586 
calcu_ssx_sqrt_i_mkl(float * X_std,vector<double> & ssx_sqrt_i)587 void gcta::calcu_ssx_sqrt_i_mkl(float *X_std, vector<double> &ssx_sqrt_i)
588 {
589     unsigned long i = 0, j = 0, k = 0, m = _include.size(), n = _keep.size();
590 
591     ssx_sqrt_i.clear();
592     ssx_sqrt_i.resize(m);
593     #pragma omp parallel for private(i,k)
594     for (j = 0; j < m; j++) {
595         ssx_sqrt_i[j] = 0.0;
596         for (i = 0; i < n; i++) {
597             k = i * m + j;
598             ssx_sqrt_i[j] += X_std[k] * X_std[k];
599         }
600         ssx_sqrt_i[j] = sqrt(ssx_sqrt_i[j]);
601         if (ssx_sqrt_i[j] < 1.0e-50) ssx_sqrt_i[j] = 0.0;
602         else ssx_sqrt_i[j] = 1.0 / ssx_sqrt_i[j];
603     }
604 }
605 
calcu_ld_blk_mkl(float * X,vector<double> & ssx,vector<int> & brk_pnt,vector<int> & brk_pnt3,eigenVector & mean_rsq,eigenVector & snp_num,eigenVector & max_rsq,bool second,double rsq_cutoff)606 void gcta::calcu_ld_blk_mkl(float *X, vector<double> &ssx, vector<int> &brk_pnt, vector<int> &brk_pnt3, eigenVector &mean_rsq, eigenVector &snp_num, eigenVector &max_rsq, bool second, double rsq_cutoff)
607 {
608     unsigned long i = 0, j = 0, k = 0, l = 0, s1 = 0, s2 = 0, n = _keep.size(), m = _include.size(), size = 0, size_limit = 10000;
609 
610     for (i = 0; i < brk_pnt.size() - 1; i++) {
611         if (_chr[_include[brk_pnt[i]]] != _chr[_include[brk_pnt[i + 1]]]) continue;
612         size = brk_pnt[i + 1] - brk_pnt[i] + 1;
613         if (size < 3) continue;
614         if (second) {
615             s1 = brk_pnt3[i] - brk_pnt[i];
616             s2 = s1 + 1;
617         }
618         else {
619             s1 = 0;
620             s2 = size - 1;
621         }
622 
623         float *X_sub = new float[n * size];
624         #pragma omp parallel for private(k,l)
625         for (j = 0; j < n; j++) {
626             for (k = 0, l = brk_pnt[i]; k < size; k++, l++) X_sub[j * size + k] = X[j * m + l];
627         }
628         vector<double> ssx_sub(size);
629         for (k = 0, l = brk_pnt[i]; k < size; k++, l++) ssx_sub[k] = ssx[l];
630         vector<double> rsq_size(size), mean_rsq_sub(size), max_rsq_sub(size);
631         for(j = 0; j < size; j++) max_rsq_sub[j] = -1.0;
632 
633         if (size > size_limit) calcu_ld_blk_split_mkl(size, size_limit, X_sub, ssx_sub, rsq_cutoff, rsq_size, mean_rsq_sub, max_rsq_sub, s1, s2, second);
634         else {
635             float *rsq_sub = new float[size * size];
636             cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, size, size, n, 1.0, X_sub, size, X_sub, size, 0.0, rsq_sub, size);
637             #pragma omp parallel for private(k,l)
638             for (j = 0; j < size; j++) {
639                 rsq_size[j] = 0.0;
640                 mean_rsq_sub[j] = 0.0;
641                 for (k = 0; k < size; k++) {
642                     if (second) {
643                         if (j <= s1 && k <= s1) continue;
644                         if (j >= s2 && k >= s2) continue;
645                     }
646                     if (k == j) continue;
647                     l = j * size + k;
648                     rsq_sub[l] *= (ssx_sub[j] * ssx_sub[k]);
649                     if (rsq_sub[l] > 1.0) rsq_sub[l] = 1.0;
650                     rsq_sub[l] = rsq_sub[l] * rsq_sub[l];
651                     if (rsq_sub[l] >= rsq_cutoff) {
652                         mean_rsq_sub[j] += rsq_sub[l];
653                         rsq_size[j] += 1.0;
654                     }
655                     if (rsq_sub[l] > max_rsq_sub[j]) max_rsq_sub[j] = rsq_sub[l];
656                 }
657                 if (rsq_size[j] > 0.0) mean_rsq_sub[j] /= rsq_size[j];
658             }
659             delete[] rsq_sub;
660         }
661         delete[] X_sub;
662 
663         for (j = 0, k = brk_pnt[i]; j < size; j++, k++) {
664             if (second) {
665                 if (rsq_size[j] > 0.0) {
666                     mean_rsq[k] = (mean_rsq[k] * snp_num[k] + mean_rsq_sub[j] * rsq_size[j]) / (snp_num[k] + rsq_size[j]);
667                     snp_num[k] = (snp_num[k] + rsq_size[j]);
668                     if(max_rsq[k] < max_rsq_sub[j]) max_rsq[k] = max_rsq_sub[j];
669                 }
670             }
671             else {
672                 mean_rsq[k] = mean_rsq_sub[j];
673                 snp_num[k] = rsq_size[j];
674                 max_rsq[k] = max_rsq_sub[j];
675             }
676         }
677     }
678 }
679 
calcu_ld_blk_split_mkl(int size,int size_limit,float * X_sub,vector<double> & ssx_sub,double rsq_cutoff,vector<double> & rsq_size,vector<double> & mean_rsq_sub,vector<double> & max_rsq_sub,int s1,int s2,bool second)680 void gcta::calcu_ld_blk_split_mkl(int size, int size_limit, float *X_sub, vector<double> &ssx_sub, double rsq_cutoff, vector<double> &rsq_size, vector<double> &mean_rsq_sub, vector<double> &max_rsq_sub, int s1, int s2, bool second)
681 {
682     unsigned long i = 0, j = 0, k = 0, l = 0, m = 0, n = _keep.size();
683     vector<int> brk_pnt;
684     brk_pnt.push_back(0);
685     for (i = size_limit; i < size - size_limit; i += size_limit) {
686         brk_pnt.push_back(i - 1);
687         brk_pnt.push_back(i);
688         j = i;
689     }
690     j = (size - j) / 2 + j;
691     brk_pnt.push_back(j - 1);
692     brk_pnt.push_back(j);
693     brk_pnt.push_back(size - 1);
694 
695     for (i = 0; i < brk_pnt.size() - 1; i++) {
696         int size_sub = brk_pnt[i + 1] - brk_pnt[i] + 1;
697         if (size_sub < 3) continue;
698 
699         float *X_sub_sub = new float[size_sub * n];
700         #pragma omp parallel for private(k,l)
701         for (j = 0; j < n; j++) {
702             for (k = 0, l = brk_pnt[i]; k < size_sub; k++, l++) X_sub_sub[k * n + j] = X_sub[j * size + l];
703         }
704         vector<double> ssx_sub_sub(size_sub);
705         for (k = 0, l = brk_pnt[i]; k < size_sub; k++, l++) ssx_sub_sub[k] = ssx_sub[l];
706 
707         float *rsq_sub_sub = new float[size_sub * size];
708         cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, size_sub, size, n, 1.0, X_sub_sub, n, X_sub, size, 0.0, rsq_sub_sub, size);
709         delete[] X_sub_sub;
710 
711         vector<double> rsq_size_sub(size_sub), mean_rsq_sub_sub(size_sub), max_rsq_sub_sub(size_sub);
712         for (j = 0; j < size_sub; j++) max_rsq_sub_sub[j] = -1.0;
713         #pragma omp parallel for private(k,l)
714         for (j = 0; j < size_sub; j++) {
715             unsigned long s = j + brk_pnt[i];
716             rsq_size_sub[j] = 0.0;
717             mean_rsq_sub_sub[j] = 0.0;
718             for (k = 0; k < size; k++) {
719                 if (second) {
720                     if (s <= s1 && k <= s1) continue;
721                     if (s >= s2 && k >= s2) continue;
722                 }
723                 if (k == s) continue;
724                 l = j * size + k;
725                 rsq_sub_sub[l] *= (ssx_sub_sub[j] * ssx_sub[k]);
726                 rsq_sub_sub[l] = rsq_sub_sub[l] * rsq_sub_sub[l];
727                 if (rsq_sub_sub[l] > 1.0) rsq_sub_sub[l] = 1.0;
728                 if (rsq_sub_sub[l] >= rsq_cutoff) {
729                     mean_rsq_sub_sub[j] += rsq_sub_sub[l];
730                     rsq_size_sub[j] += 1.0;
731                 }
732                 if(rsq_sub_sub[l] > max_rsq_sub_sub[j]) max_rsq_sub_sub[j] = rsq_sub_sub[l];
733             }
734 
735             if (rsq_size_sub[j] > 0.0) mean_rsq_sub_sub[j] /= rsq_size_sub[j];
736         }
737         delete[] rsq_sub_sub;
738 
739         for (j = 0, k = brk_pnt[i]; j < size_sub; j++, k++) {
740             mean_rsq_sub[k] = mean_rsq_sub_sub[j];
741             rsq_size[k] = rsq_size_sub[j];
742             max_rsq_sub[k] = max_rsq_sub_sub[j];
743         }
744     }
745 }
746 
747 
748