1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementations of functions for estimating the genetic relationship matrix
5 *
6 * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13 #include "gcta.h"
14
enable_grm_bin_flag()15 void gcta::enable_grm_bin_flag() {
16 _grm_bin_flag = true;
17 }
18
check_autosome()19 void gcta::check_autosome() {
20 for (int i = 0; i < _include.size(); i++) {
21 if (_chr[_include[i]] > _autosome_num) throw ("Error: this option is for the autosomal SNPs only. Please check the option --autosome.");
22 }
23 }
24
check_chrX()25 void gcta::check_chrX() {
26 for (int i = 0; i < _include.size(); i++) {
27 if (_chr[_include[i]] != (_autosome_num + 1)) throw ("Error: this option is for SNPs on the X chromosome only.");
28 }
29 }
30
check_sex()31 void gcta::check_sex() {
32 for (int i = 0; i < _keep.size(); i++) {
33 if (_sex[_keep[i]] != 1 && _sex[_keep[i]] != 2) throw ("Error: Sex information of the individual \"" + _fid[_keep[i]] + " " + _pid[_keep[i]] + "\" is missing.\nUse --update-sex option to update the sex information of the individuals.");
34 }
35 }
36
make_grm(bool grm_d_flag,bool grm_xchr_flag,bool inbred,bool output_bin,int grm_mtd,bool mlmassoc,bool diag_f3_flag,string subpopu_file)37 void gcta::make_grm(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, bool mlmassoc, bool diag_f3_flag, string subpopu_file)
38 {
39 bool have_mis = false;
40
41 if (!grm_d_flag && grm_xchr_flag) check_chrX();
42 else check_autosome();
43
44 unsigned long i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size();
45 if(grm_d_flag) have_mis = make_XMat_d(_geno);
46 else have_mis = make_XMat(_geno);
47
48 eigenVector sd_SNP;
49 if (grm_mtd == 0) {
50 if (grm_d_flag) std_XMat_d(_geno, sd_SNP, false, true);
51 else{
52 if(subpopu_file.empty()) std_XMat(_geno, sd_SNP, grm_xchr_flag, false, true);
53 else std_XMat_subpopu(subpopu_file, _geno, sd_SNP, grm_xchr_flag, false, true);
54 }
55 }
56 else {
57 if (grm_d_flag) std_XMat_d(_geno, sd_SNP, false, false);
58 else std_XMat(_geno, sd_SNP, grm_xchr_flag, false, false);
59 }
60
61 if (!mlmassoc) cout << "\nCalculating the" << ((grm_d_flag) ? " dominance" : "") << " genetic relationship matrix (GRM)" << (grm_xchr_flag ? " for the X chromosome" : "") << (_dosage_flag ? " using imputed dosage data" : "") << " ... (Note: default speed-optimized mode, may use huge RAM)" << endl;
62 else cout << "\nCalculating the genetic relationship matrix (GRM) ... " << endl;
63
64 // count the number of missing genotypes
65 vector< vector<int> > miss_pos;
66 vector< vector<bool> > X_bool;
67 if(have_mis){
68 miss_pos.resize(n);
69 X_bool.resize(n);
70 #pragma omp parallel for private(j)
71 for (i = 0; i < n; i++) {
72 X_bool[i].resize(m);
73 for (j = 0; j < m; j++) {
74 if (_geno(i,j) < 1e5) X_bool[i][j] = true;
75 else {
76 _geno(i,j) = 0.0;
77 miss_pos[i].push_back(j);
78 X_bool[i][j] = false;
79 }
80 }
81 }
82 }
83
84 // Calculate A_N matrix
85 if(have_mis){
86 _grm_N.resize(n, n);
87 #pragma omp parallel for private(j, k)
88 for (i = 0; i < n; i++) {
89 for (j = 0; j <= i; j++) {
90 int miss_j = 0;
91 for (k = 0; k < miss_pos[j].size(); k++) miss_j += (int) X_bool[i][miss_pos[j][k]];
92 _grm_N(i,j) = m - miss_pos[i].size() - miss_j;
93 _grm_N(j,i) = _grm_N(i,j);
94 }
95 }
96 }
97 else _grm_N = MatrixXf::Constant(n,n,m);
98
99 // Calcuate WW'
100 #ifdef SINGLE_PRECISION
101 _grm = _geno * _geno.transpose();
102 #else
103 _grm = (_geno * _geno.transpose()).cast<double>();
104 #endif
105
106 // Calculate A matrix
107 if (grm_mtd == 1) _grm_N = _grm_N.array() * sd_SNP.mean();
108
109 #pragma omp parallel for private(j)
110 for (i = 0; i < n; i++) {
111 for (j = 0; j <= i; j++) {
112 if(_grm_N(i,j) > 0) _grm(i,j) /= _grm_N(i,j);
113 else _grm(i,j) = 0.0;
114 _grm(j,i) = _grm(i,j);
115 }
116 }
117
118 // GRM summary
119 double diag_m = 0.0, diag_v = 0.0, off_m = 0.0, off_v = 0.0;
120 calcu_grm_var(diag_m, diag_v, off_m, off_v);
121 cout<<"\nSummary of the GRM:" << endl;
122 cout<<"Mean of diagonals = "<<diag_m<<endl;
123 cout<<"Variance of diagonals = "<<diag_v<<endl;
124 cout<<"Mean of off-diagonals = " << off_m <<endl;
125 cout<<"Variance of off-diagonals = " << off_v <<endl;
126
127 // re-calcuate the diagonals (Fhat3+1)
128 if (diag_f3_flag) {
129 #pragma omp parallel for private(j)
130 for (i = 0; i < n; i++) {
131 _grm(i,i) = 0.0;
132 double non_missing = 0.0;
133 for (j = 0; j < m; j++) {
134 if (_geno(i,j) < 1e5){
135 _grm(i,i) += _geno(i,j)*(_geno(i,j)+(_mu[_include[j]] - 1.0) * sd_SNP[j]);
136 non_missing += 1.0;
137 }
138 }
139 _grm(i,i) /= non_missing;
140 }
141 }
142
143 if (inbred) {
144 #pragma omp parallel for private(j)
145 for (i = 0; i < n; i++) {
146 for (j = 0; j <= i; j++) _grm(i,j) *= 0.5;
147 }
148 }
149
150 if (mlmassoc && grm_mtd == 0) {
151 for (j = 0; j < m; j++) {
152 if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
153 else sd_SNP[j] = 1.0 / sd_SNP[j];
154 }
155 #pragma omp parallel for private(j)
156 for (i = 0; i < n; i++) {
157 for (j = 0; j < m; j++) {
158 if (_geno(i,j) < 1e5) _geno(i,j) *= sd_SNP[j];
159 else _geno(i,j) = 0.0;
160 }
161 }
162 }
163 else {
164 // Output A_N and A
165 string out_buf = _out;
166 if (grm_d_flag) _out += ".d";
167 output_grm(output_bin);
168 _out = out_buf;
169 }
170 }
171
calcu_grm_var(double & diag_m,double & diag_v,double & off_m,double & off_v)172 void gcta::calcu_grm_var(double &diag_m, double &diag_v, double &off_m, double &off_v)
173 {
174 int i = 0, n = _keep.size();
175 diag_m = _grm.diagonal().mean();
176 diag_v = (_grm.diagonal() - eigenVector::Constant(n, diag_m)).squaredNorm() / ((double)n - 1.0);
177 double off_num = 0.5*n*(n - 1.0);
178 off_m = 0.0;
179 for (i = 1; i < n; i++) off_m += _grm.row(i).segment(0, i).sum();
180 off_m /= off_num;
181 off_v = 0.0;
182 for (i = 1; i < n; i++) off_v += (_grm.row(i).segment(0, i) - eigenVector::Constant(i, off_m).transpose()).squaredNorm();
183 off_v /= (off_num - 1.0);
184 }
185
output_grm(bool output_grm_bin)186 void gcta::output_grm(bool output_grm_bin)
187 {
188 int i = 0, j = 0;
189 string grm_file;
190 if (output_grm_bin) {
191 // Save matrix A in binary file
192 grm_file = _out + ".grm.bin";
193 fstream A_Bin(grm_file.c_str(), ios::out | ios::binary);
194 if (!A_Bin) throw ("Error: can not open the file [" + grm_file + "] to write.");
195 float f_buf = 0.0;
196 int size = sizeof (float);
197 for (i = 0; i < _keep.size(); i++) {
198 for (j = 0; j <= i; j++) {
199 f_buf = (float) (_grm(i, j));
200 A_Bin.write((char*) &f_buf, size);
201 }
202 }
203 A_Bin.close();
204 cout << "GRM of " << _keep.size() << " individuals has been saved in the file [" + grm_file + "] (in binary format)." << endl;
205
206 string grm_N_file = _out + ".grm.N.bin";
207 fstream N_Bin(grm_N_file.c_str(), ios::out | ios::binary);
208 if (!N_Bin) throw ("Error: can not open the file [" + grm_N_file + "] to write.");
209 f_buf = 0.0;
210 size = sizeof (int);
211 for (i = 0; i < _keep.size(); i++) {
212 for (j = 0; j <= i; j++) {
213 f_buf = (float) (_grm_N(i, j));
214 N_Bin.write((char*) &f_buf, size);
215 }
216 }
217 N_Bin.close();
218 cout << "Number of SNPs to calcuate the genetic relationship between each pair of individuals has been saved in the file [" + grm_N_file + "] (in binary format)." << endl;
219 }
220 else {
221 // Save A matrix in txt format
222 grm_file = _out + ".grm.gz";
223 gzofstream zoutf;
224 zoutf.open(grm_file.c_str());
225 if (!zoutf.is_open()) throw ("Error: can not open the file [" + grm_file + "] to write.");
226 cout << "Saving the genetic relationship matrix to the file [" + grm_file + "] (in compressed text format)." << endl;
227 zoutf.setf(ios::scientific);
228 zoutf.precision(6);
229 for (i = 0; i < _keep.size(); i++) {
230 if (_grm_N.rows() > 0){
231 zoutf.setf(ios::scientific);
232 zoutf.precision(6);
233 for (j = 0; j <= i; j++) zoutf << i + 1 << '\t' << j + 1 << '\t' << _grm_N(i, j) << '\t' << _grm(i, j) << endl;
234 }
235 else{
236 for (j = 0; j <= i; j++) zoutf << i + 1 << '\t' << j + 1 << "\t0\t" << _grm(i, j) << endl;
237 }
238 }
239 zoutf.close();
240 cout << "The genetic relationship matrix has been saved in the file [" + grm_file + "] (in compressed text format)." << endl;
241 }
242
243 string famfile = _out + ".grm.id";
244 ofstream Fam(famfile.c_str());
245 if (!Fam) throw ("Error: can not open the file [" + famfile + "] to write.");
246 for (i = 0; i < _keep.size(); i++) Fam << _fid[_keep[i]] + "\t" + _pid[_keep[i]] << endl;
247 Fam.close();
248 cout << "IDs for the GRM file [" + grm_file + "] have been saved in the file [" + famfile + "]." << endl;
249 }
250
read_grm_id(string grm_file,vector<string> & grm_id,bool out_id_log,bool read_id_only)251 int gcta::read_grm_id(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only)
252 {
253 // read GRM IDs
254 string grm_id_file = grm_file + ".grm.id";
255 if (out_id_log) cout << "Reading IDs of the GRM from [" + grm_id_file + "]." << endl;
256 ifstream i_grm_id(grm_id_file.c_str());
257 if (!i_grm_id) throw ("Error: can not open the file [" + grm_id_file + "] to read.");
258 string str_buf, id_buf;
259 vector<string> fid, pid;
260 grm_id.clear();
261 while (i_grm_id) {
262 i_grm_id >> str_buf;
263 if (i_grm_id.eof()) break;
264 fid.push_back(str_buf);
265 id_buf = str_buf + ":";
266 i_grm_id >> str_buf;
267 pid.push_back(str_buf);
268 id_buf += str_buf;
269 grm_id.push_back(id_buf);
270 getline(i_grm_id, str_buf);
271 }
272 i_grm_id.close();
273 int n = grm_id.size();
274 if (out_id_log) cout << n << " IDs read from [" + grm_id_file + "]." << endl;
275
276 if (_id_map.empty()) {
277 _fid = fid;
278 _pid = pid;
279 _indi_num = _fid.size();
280 _sex.resize(_fid.size());
281 init_keep();
282 }
283
284 return (n);
285 }
286
read_grm(string grm_file,vector<string> & grm_id,bool out_id_log,bool read_id_only,bool dont_read_N)287 void gcta::read_grm(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only, bool dont_read_N)
288 {
289 if (_grm_bin_flag) read_grm_bin(grm_file, grm_id, out_id_log, read_id_only, dont_read_N);
290 else read_grm_gz(grm_file, grm_id, out_id_log, read_id_only);
291 }
292
read_grm_gz(string grm_file,vector<string> & grm_id,bool out_id_log,bool read_id_only)293 void gcta::read_grm_gz(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only) {
294 int n = read_grm_id(grm_file, grm_id, out_id_log, read_id_only);
295
296 if (read_id_only) return;
297
298 string grm_gzfile = grm_file + ".grm.gz", str_buf;
299 const int MAX_LINE_LENGTH = 1000;
300 char buf[MAX_LINE_LENGTH];
301 gzifstream zinf;
302 zinf.open(grm_gzfile.c_str());
303 if (!zinf.is_open()) throw ("Error: can not open the file [" + grm_gzfile + "] to read.");
304
305 int indx1 = 0, indx2 = 0, nline = 0;
306 double grm_buf = 0.0, grm_N_buf;
307 string errmsg = "Error: failed to read [" + grm_gzfile + "]. The format of the GRM file has been changed?\nError occurs in line:\n";
308 cout << "Reading the GRM from [" + grm_gzfile + "]." << endl;
309 _grm.resize(n, n);
310 _grm_N.resize(n, n);
311 while (1) {
312 zinf.getline(buf, MAX_LINE_LENGTH, '\n');
313 if (zinf.fail() || !zinf.good()) break;
314 stringstream ss(buf);
315 if (!(ss >> indx1)) throw (errmsg + buf);
316 if (!(ss >> indx2)) throw (errmsg + buf);
317 if (!(ss >> grm_N_buf)) throw (errmsg + buf);
318 if (!(ss >> grm_buf)) throw (errmsg + buf);
319 if (indx1 < indx2 || indx1 > n || indx2 > n) throw (errmsg + buf);
320 if (grm_N_buf == 0) cout << "Warning: " << buf << endl;
321 _grm_N(indx1 - 1, indx2 - 1) = _grm_N(indx2 - 1, indx1 - 1) = grm_N_buf;
322 _grm(indx1 - 1, indx2 - 1) = _grm(indx2 - 1, indx1 - 1) = grm_buf;
323 nline++;
324 if (ss >> str_buf) throw (errmsg + buf);
325 }
326 zinf.close();
327 if (!_within_family && nline != (int) (n * (n + 1)*0.5)){
328 stringstream errmsg_tmp;
329 errmsg_tmp << "Error: there are " << nline << " lines in the [" << grm_gzfile << "] file. The expected number of lines is " << (int) (n * (n + 1)*0.5) << "." << endl;
330 throw(errmsg_tmp.str());
331 //throw ("Error: incorrect number of lines in the grm file. *.grm.gz file and *.grm.id file are mismatched?");
332 }
333 cout << "GRM for " << n << " individuals are included from [" + grm_gzfile + "]." << endl;
334 }
335
read_grm_bin(string grm_file,vector<string> & grm_id,bool out_id_log,bool read_id_only,bool dont_read_N)336 void gcta::read_grm_bin(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only, bool dont_read_N)
337 {
338 int i = 0, j = 0, n = read_grm_id(grm_file, grm_id, out_id_log, read_id_only);
339
340 if (read_id_only) return;
341
342 string grm_binfile = grm_file + ".grm.bin";
343 ifstream A_bin(grm_binfile.c_str(), ios::in | ios::binary);
344 if (!A_bin.is_open()) throw ("Error: can not open the file [" + grm_binfile + "] to read.");
345 _grm.resize(n, n);
346 cout << "Reading the GRM from [" + grm_binfile + "]." << endl;
347 int size = sizeof (float);
348 float f_buf = 0.0;
349 for (i = 0; i < n; i++) {
350 for (j = 0; j <= i; j++) {
351 if (!(A_bin.read((char*) &f_buf, size))) throw ("Error: the size of the [" + grm_binfile + "] file is incomplete?");
352 _grm(j, i) = _grm(i, j) = f_buf;
353 }
354 }
355 A_bin.close();
356
357 if(!dont_read_N){
358 string grm_Nfile = grm_file + ".grm.N.bin";
359 ifstream N_bin(grm_Nfile.c_str(), ios::in | ios::binary);
360 if (!N_bin.is_open()) throw ("Error: can not open the file [" + grm_Nfile + "] to read.");
361 _grm_N.resize(n, n);
362 cout << "Reading the number of SNPs for the GRM from [" + grm_Nfile + "]." << endl;
363 size = sizeof (float);
364 f_buf = 0.0;
365 for (i = 0; i < n; i++) {
366 for (j = 0; j <= i; j++) {
367 if (!(N_bin.read((char*) &f_buf, size))) throw ("Error: the size of the [" + grm_Nfile + "] file is incomplete?");
368 _grm_N(j, i) = _grm_N(i, j) = f_buf;
369 }
370 }
371 N_bin.close();
372 }
373
374 cout << "GRM for " << n << " individuals are included from [" + grm_binfile + "]." << endl;
375 }
376
rm_cor_indi(double grm_cutoff)377 void gcta::rm_cor_indi(double grm_cutoff) {
378 cout << "Pruning the GRM with a cutoff of " << grm_cutoff << " ..." << endl;
379
380 int i = 0, j = 0, i_buf = 0;
381
382 // identify the positions where you see a value > than the threshold
383 vector<int> rm_grm_ID1, rm_grm_ID2;
384 for (i = 0; i < _keep.size(); i++) {
385 for (j = 0; j < i; j++) {
386 if (_grm(_keep[i], _keep[j]) > grm_cutoff) {
387 rm_grm_ID1.push_back(_keep[i]);
388 rm_grm_ID2.push_back(_keep[j]);
389 }
390 }
391 }
392
393 // count the number of appearance of each "position" in the vector, which involves a few steps
394 vector<int> rm_uni_ID(rm_grm_ID1);
395 rm_uni_ID.insert(rm_uni_ID.end(), rm_grm_ID2.begin(), rm_grm_ID2.end());
396 stable_sort(rm_uni_ID.begin(), rm_uni_ID.end());
397 rm_uni_ID.erase(unique(rm_uni_ID.begin(), rm_uni_ID.end()), rm_uni_ID.end());
398 map<int, int> rm_uni_ID_count;
399 for (i = 0; i < rm_uni_ID.size(); i++) {
400 i_buf = count(rm_grm_ID1.begin(), rm_grm_ID1.end(), rm_uni_ID[i]) + count(rm_grm_ID2.begin(), rm_grm_ID2.end(), rm_uni_ID[i]);
401 rm_uni_ID_count.insert(pair<int, int>(rm_uni_ID[i], i_buf));
402 }
403
404 // swapping
405 map<int, int>::iterator iter1, iter2;
406 for (i = 0; i < rm_grm_ID1.size(); i++) {
407 iter1 = rm_uni_ID_count.find(rm_grm_ID1[i]);
408 iter2 = rm_uni_ID_count.find(rm_grm_ID2[i]);
409 if (iter1->second < iter2->second) {
410 i_buf = rm_grm_ID1[i];
411 rm_grm_ID1[i] = rm_grm_ID2[i];
412 rm_grm_ID2[i] = i_buf;
413 }
414 }
415
416 stable_sort(rm_grm_ID1.begin(), rm_grm_ID1.end());
417 rm_grm_ID1.erase(unique(rm_grm_ID1.begin(), rm_grm_ID1.end()), rm_grm_ID1.end());
418 vector<string> removed_ID;
419 for (i = 0; i < rm_grm_ID1.size(); i++) removed_ID.push_back(_fid[rm_grm_ID1[i]] + ":" + _pid[rm_grm_ID1[i]]);
420
421 // update _keep and _id_map
422 update_id_map_rm(removed_ID, _id_map, _keep);
423
424 cout << "After pruning the GRM, there are " << _keep.size() << " individuals (" << removed_ID.size() << " individuals removed)." << endl;
425 }
426
adj_grm(double adj_grm_fac)427 void gcta::adj_grm(double adj_grm_fac) {
428 cout << "Adjusting the GRM for sampling errors ..." << endl;
429 int i = 0, j = 0, n = _keep.size();
430 double off_mean = 0.0, diag_mean = 0.0, off_var = 0.0, diag_var = 0.0, d_buf = 0.0;
431 for (i = 0; i < n; i++) {
432 diag_mean += _grm(_keep[i], _keep[i]);
433 for (j = 0; j < i; j++) off_mean += _grm(_keep[i], _keep[j]);
434 }
435 diag_mean /= n;
436 off_mean /= 0.5 * n * (n - 1.0);
437 for (i = 0; i < n; i++) {
438 d_buf = _grm(_keep[i], _keep[i]) - diag_mean;
439 diag_var += d_buf*d_buf;
440 for (j = 0; j < i; j++) {
441 d_buf = _grm(_keep[i], _keep[j]) - off_mean;
442 off_var += d_buf*d_buf;
443 }
444 }
445 diag_var /= n - 1.0;
446 off_var /= 0.5 * n * (n - 1.0) - 1.0;
447 for (i = 0; i < _keep.size(); i++) {
448 d_buf = 1.0 - (adj_grm_fac + 1.0 / _grm_N(_keep[i], _keep[i])) / diag_var;
449 if (_grm(_keep[i], _keep[i]) > 0) _grm(_keep[i], _keep[i]) = 1.0 + d_buf * (_grm(_keep[i], _keep[i]) - 1.0);
450 for (j = 0; j < i; j++) {
451 if (_grm_N(_keep[i], _keep[j]) > 0) _grm(_keep[i], _keep[j]) *= 1.0 - (adj_grm_fac + 1.0 / _grm_N(_keep[i], _keep[j])) / off_var;
452 }
453 }
454 }
455
dc(int dosage_compen)456 void gcta::dc(int dosage_compen) {
457 cout << "Parameterizing the GRM under the assumption of ";
458 if (dosage_compen == 1) cout << "full dosage compensation ..." << endl;
459 else if (dosage_compen == 0) cout << "no dosage compensation ..." << endl;
460
461 int i = 0, j = 0, i_buf = 0;
462 double c1 = 1.0, c2 = 1.0;
463 if (dosage_compen == 1) {
464 c1 = 2.0;
465 c2 = sqrt(2.0);
466 }// full dosage compensation
467 else if (dosage_compen == 0) {
468 c1 = 0.5;
469 c2 = sqrt(0.5);
470 } // on dosage compensation
471 for (i = 0; i < _keep.size(); i++) {
472 for (j = 0; j <= i; j++) {
473 i_buf = _sex[_keep[i]] * _sex[_keep[j]];
474 if (i_buf == 1) _grm(i, j) *= c1;
475 else if (i_buf == 2) _grm(i, j) *= c2;
476 }
477 }
478 }
479
manipulate_grm(string grm_file,string keep_indi_file,string remove_indi_file,string sex_file,double grm_cutoff,double adj_grm_fac,int dosage_compen,bool merge_grm_flag,bool dont_read_N)480 void gcta::manipulate_grm(string grm_file, string keep_indi_file, string remove_indi_file, string sex_file, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool merge_grm_flag, bool dont_read_N)
481 {
482 int i = 0, j = 0;
483
484 vector<string> grm_id;
485 if (merge_grm_flag) merge_grm(grm_file);
486 else read_grm(grm_file, grm_id, true, false, dont_read_N);
487
488 if (!keep_indi_file.empty()) keep_indi(keep_indi_file);
489 if (!remove_indi_file.empty()) remove_indi(remove_indi_file);
490 if (grm_cutoff>-1.0) rm_cor_indi(grm_cutoff);
491 if (!sex_file.empty()) update_sex(sex_file);
492 if (adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
493 if (dosage_compen>-1) dc(dosage_compen);
494 if (grm_cutoff>-1.0 || !keep_indi_file.empty() || !remove_indi_file.empty()) {
495 eigenMatrix grm_buf(_grm);
496 _grm.resize(_keep.size(), _keep.size());
497 for (i = 0; i < _keep.size(); i++) {
498 for (j = 0; j <= i; j++) _grm(i, j) = grm_buf(_keep[i], _keep[j]);
499 }
500 grm_buf.resize(0,0);
501 MatrixXf grm_N_buf = _grm_N;
502 _grm_N.resize(_keep.size(), _keep.size());
503 for (i = 0; i < _keep.size(); i++) {
504 for (j = 0; j <= i; j++) _grm_N(i, j) = grm_N_buf(_keep[i], _keep[j]);
505 }
506 }
507 }
508
save_grm(string grm_file,string keep_indi_file,string remove_indi_file,string sex_file,double grm_cutoff,double adj_grm_fac,int dosage_compen,bool merge_grm_flag,bool output_grm_bin)509 void gcta::save_grm(string grm_file, string keep_indi_file, string remove_indi_file, string sex_file, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool merge_grm_flag, bool output_grm_bin) {
510 if (dosage_compen>-1) check_sex();
511 manipulate_grm(grm_file, keep_indi_file, remove_indi_file, sex_file, grm_cutoff, adj_grm_fac, dosage_compen, merge_grm_flag);
512 output_grm(output_grm_bin);
513 }
514
merge_grm(string merge_grm_file)515 void gcta::merge_grm(string merge_grm_file) {
516 vector<string> grm_files, grm_id;
517 read_grm_filenames(merge_grm_file, grm_files);
518
519 int f = 0, i = 0, j = 0;
520 for (f = 0; f < grm_files.size(); f++) {
521 read_grm(grm_files[f], grm_id, false, true);
522 update_id_map_kp(grm_id, _id_map, _keep);
523 }
524 vector<string> uni_id;
525 for (i = 0; i < _keep.size(); i++) uni_id.push_back(_fid[_keep[i]] + ":" + _pid[_keep[i]]);
526 _n = uni_id.size();
527 if (_n == 0) throw ("Error: no individual is in common in the GRM files.");
528 else cout << _n << " individuals in common in the GRM files." << endl;
529
530 vector<int> kp;
531 eigenMatrix grm = eigenMatrix::Zero(_n, _n);
532 eigenMatrix grm_N = eigenMatrix::Zero(_n, _n);
533 for (f = 0; f < grm_files.size(); f++) {
534 cout << "Reading the GRM from the " << f + 1 << "th file ..." << endl;
535 read_grm(grm_files[f], grm_id);
536 StrFunc::match(uni_id, grm_id, kp);
537 for (i = 0; i < _n; i++) {
538 for (j = 0; j <= i; j++) {
539 if (kp[i] >= kp[j]) {
540 grm(i, j) += _grm(kp[i], kp[j]) * _grm_N(kp[i], kp[j]);
541 grm_N(i, j) += _grm_N(kp[i], kp[j]);
542 } else {
543 grm(i, j) += _grm(kp[j], kp[i]) * _grm_N(kp[j], kp[i]);
544 grm_N(i, j) += _grm_N(kp[j], kp[i]);
545 }
546 }
547 }
548 }
549 for (i = 0; i < _n; i++) {
550 for (j = 0; j <= i; j++) {
551 if (grm_N(i, j) == 0) _grm(i, j) = 0;
552 else _grm(i, j) = grm(i, j) / grm_N(i, j);
553 _grm_N(i, j) = grm_N(i, j);
554 }
555 }
556 grm.resize(0, 0);
557 grm_N.resize(0, 0);
558 cout << "\n" << grm_files.size() << " GRMs have been merged together." << endl;
559 }
560
read_grm_filenames(string merge_grm_file,vector<string> & grm_files,bool out_log)561 void gcta::read_grm_filenames(string merge_grm_file, vector<string> &grm_files, bool out_log) {
562 ifstream merge_grm(merge_grm_file.c_str());
563 if (!merge_grm) throw ("Error: can not open the file [" + merge_grm_file + "] to read.");
564 string str_buf;
565 grm_files.clear();
566 vector<string> vs_buf;
567 while (getline(merge_grm, str_buf)) {
568 if (!str_buf.empty()) {
569 if (StrFunc::split_string(str_buf, vs_buf) == 1) grm_files.push_back(vs_buf[0]);
570 }
571 }
572 if (out_log) cout << "There are " << grm_files.size() << " GRM file names specified in [" + merge_grm_file + "]." << endl;
573 if (grm_files.size() > 1000) throw ("Error: too many GRM file names specified in [" + merge_grm_file + "]. Maximum is 1000.");
574 if (grm_files.size() < 1) throw ("Error: no GRM file name is found in [" + merge_grm_file + "].");
575 }
576
grm_bK(string grm_file,string keep_indi_file,string remove_indi_file,double threshold,bool grm_out_bin_flag)577 void gcta::grm_bK(string grm_file, string keep_indi_file, string remove_indi_file, double threshold, bool grm_out_bin_flag)
578 {
579 int i = 0, j = 0;
580 vector<string> grm_id;
581 read_grm(grm_file, grm_id);
582 if (!keep_indi_file.empty()) keep_indi(keep_indi_file);
583 if (!remove_indi_file.empty()) remove_indi(remove_indi_file);
584 if (!keep_indi_file.empty() || !remove_indi_file.empty()) {
585 eigenMatrix grm_buf(_grm);
586 _grm.resize(_keep.size(), _keep.size());
587 for (i = 0; i < _keep.size(); i++) {
588 for (j = 0; j <= i; j++) _grm(i, j) = grm_buf(_keep[i], _keep[j]);
589 }
590 grm_buf.resize(0,0);
591 MatrixXf grm_N_buf = _grm_N;
592 _grm_N.resize(_keep.size(), _keep.size());
593 for (i = 0; i < _keep.size(); i++) {
594 for (j = 0; j <= i; j++) _grm_N(i, j) = grm_N_buf(_keep[i], _keep[j]);
595 }
596 }
597
598 cout << "\nThe off-diagonals that are < " << threshold << " are set to zero.\n" << endl;
599 for (i = 0; i < _keep.size(); i++) {
600 for (j = 0; j < i; j++){
601 if(_grm(i, j) < threshold) _grm(i, j) = 0.0;
602 }
603 }
604
605 output_grm(grm_out_bin_flag);
606 }
607
pca(string grm_file,string keep_indi_file,string remove_indi_file,double grm_cutoff,bool merge_grm_flag,int out_pc_num)608 void gcta::pca(string grm_file, string keep_indi_file, string remove_indi_file, double grm_cutoff, bool merge_grm_flag, int out_pc_num)
609 {
610 manipulate_grm(grm_file, keep_indi_file, remove_indi_file, "", grm_cutoff, -2.0, -2, merge_grm_flag, true);
611 _grm_N.resize(0, 0);
612 int i = 0, j = 0, n = _keep.size();
613 cout << "\nPerforming principal component analysis ..." << endl;
614
615 SelfAdjointEigenSolver<MatrixXd> eigensolver(_grm.cast<double>());
616 MatrixXd evec = (eigensolver.eigenvectors());
617 VectorXd eval = eigensolver.eigenvalues();
618
619 string eval_file = _out + ".eigenval";
620 ofstream o_eval(eval_file.c_str());
621 if (!o_eval) throw ("Error: can not open the file [" + eval_file + "] to read.");
622 for (i = n - 1; i >= 0; i--) o_eval << eval(i) << endl;
623 o_eval.close();
624 cout << "Eigenvalues of " << n << " individuals have been saved in [" + eval_file + "]." << endl;
625 string evec_file = _out + ".eigenvec";
626 ofstream o_evec(evec_file.c_str());
627 if (!o_evec) throw ("Error: can not open the file [" + evec_file + "] to read.");
628 if (out_pc_num > n) out_pc_num = n;
629 for (i = 0; i < n; i++) {
630 o_evec << _fid[_keep[i]] << " " << _pid[_keep[i]];
631 for (j = n - 1; j >= (n - out_pc_num); j--) o_evec << " " << evec(i, j);
632 o_evec << endl;
633 }
634 o_evec.close();
635 cout << "The first " << out_pc_num << " eigenvectors of " << n << " individuals have been saved in [" + evec_file + "]." << endl;
636 }
637
snp_pc_loading(string pc_file,int grm_N)638 void gcta::snp_pc_loading(string pc_file, int grm_N)
639 {
640 // read eigenvectors and eigenvalues
641 string eigenval_file = pc_file + ".eigenval";
642 ifstream in_eigenval(eigenval_file.c_str());
643 if (!in_eigenval) throw ("Error: can not open the file [" + eigenval_file + "] to read.");
644 string eigenvec_file = pc_file + ".eigenvec";
645 ifstream in_eigenvec(eigenvec_file.c_str());
646 if (!in_eigenvec) throw ("Error: can not open the file [" + eigenvec_file + "] to read.");
647
648 cout << "Reading eigenvectors from [" + eigenvec_file + "]." << endl;
649 vector<string> eigenvec_ID;
650 vector< vector<string> > eigenvec_str;
651 int eigenvec_num = read_fac(in_eigenvec, eigenvec_ID, eigenvec_str);
652 cout << eigenvec_num << " eigenvectors of " << eigenvec_ID.size() << " individuals are included from [" + eigenvec_file + "]." << endl;
653 update_id_map_kp(eigenvec_ID, _id_map, _keep);
654
655 cout << "\nReading eigenvalues from [" + eigenval_file + "]." << endl;
656 vector<double> eigenval_buf;
657 double d_buf = 0.0;
658 int eigenval_num = 0;
659 while(in_eigenval && eigenval_num < eigenvec_num){
660 in_eigenval >> d_buf;
661 if(d_buf > 1e10 || d_buf < 1e-10) throw("Error: invalid eigenvalue in the file [" + eigenval_file + "].");
662 eigenval_buf.push_back(d_buf);
663 eigenval_num++;
664 }
665 if(eigenvec_num != eigenval_num) throw("Error: inconsistent numbers of eigenvalues and eigenvectors in the files [" + eigenval_file + "] and [" + eigenvec_file + "]");
666 cout << eigenval_num << " eigenvalues read from [" + eigenval_file + "]" << endl;
667
668 int i = 0, j = 0;
669 vector<string> uni_id;
670 map<string, int> uni_id_map;
671 map<string, int>::iterator iter;
672 for(i=0; i<_keep.size(); i++){
673 uni_id.push_back(_fid[_keep[i]]+":"+_pid[_keep[i]]);
674 uni_id_map.insert(pair<string,int>(_fid[_keep[i]]+":"+_pid[_keep[i]], i));
675 }
676 _n=_keep.size();
677 if(_n < 1) throw("Error: no individual is in common between the input files.");
678 cout << _n << " individuals in common between the input files are included in the analysis."<<endl;
679
680 eigenMatrix eigenvec(eigenvec_num, _n);
681 for(i = 0; i < eigenvec_ID.size(); i++){
682 iter = uni_id_map.find(eigenvec_ID[i]);
683 if(iter == uni_id_map.end()) continue;
684 for(j = 0; j < eigenvec_num; j++) eigenvec(j, iter->second) = atof(eigenvec_str[i][j].c_str());
685 }
686
687 eigenVector inv_eigenval(eigenval_num);
688 for(i = 0; i < eigenval_num; i++) inv_eigenval(i) = 1.0 / (eigenval_buf[i] * grm_N);
689
690 // calculating SNP loading
691 if (_mu.empty()) calcu_mu();
692 cout << "\nCalculating SNP loading ..." << endl;
693 int m = _include.size();
694 eigenMatrix snp_loading(m, eigenvec_num);
695 eigenVector x(_n);
696 for(j = 0; j < m ; j++) {
697 makex_eigenVector(j, x, false, true);
698 x = x.array() / sqrt(_mu[_include[j]]*(1.0 - 0.5*_mu[_include[j]]));
699 snp_loading.row(j) = (eigenvec * x).array() * inv_eigenval.array();
700 }
701
702 string filename = _out + ".pcl";
703 cout << "\nSaving the PC loading of " << m << " SNPs to [" + filename + "] ..." << endl;
704 ofstream ofile(filename.c_str());
705 if(!ofile) throw("Can not open the file [" + filename + "] to write.");
706 ofile << "SNP\trefA";
707 for(i = 0; i < eigenval_num; i++) ofile << "\tpc" << i+1 << "_loading";
708 ofile << endl;
709 for(i = 0; i < m; i++){
710 ofile << _snp_name[_include[i]] << "\t" << _ref_A[_include[i]];
711 for(j = 0; j < eigenvec_num; j++) ofile << "\t" << snp_loading(i, j);
712 ofile << "\n";
713 }
714 ofile.close();
715 }
716
717
718
719