1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementations of functions for estimating the genetic relationship matrix
5  *
6  * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #include "gcta.h"
14 
enable_grm_bin_flag()15 void gcta::enable_grm_bin_flag() {
16     _grm_bin_flag = true;
17 }
18 
check_autosome()19 void gcta::check_autosome() {
20     for (int i = 0; i < _include.size(); i++) {
21         if (_chr[_include[i]] > _autosome_num) throw ("Error: this option is for the autosomal SNPs only. Please check the option --autosome.");
22     }
23 }
24 
check_chrX()25 void gcta::check_chrX() {
26     for (int i = 0; i < _include.size(); i++) {
27         if (_chr[_include[i]] != (_autosome_num + 1)) throw ("Error: this option is for SNPs on the X chromosome only.");
28     }
29 }
30 
check_sex()31 void gcta::check_sex() {
32     for (int i = 0; i < _keep.size(); i++) {
33         if (_sex[_keep[i]] != 1 && _sex[_keep[i]] != 2) throw ("Error: Sex information of the individual \"" + _fid[_keep[i]] + " " + _pid[_keep[i]] + "\" is missing.\nUse --update-sex option to update the sex information of the individuals.");
34     }
35 }
36 
make_grm(bool grm_d_flag,bool grm_xchr_flag,bool inbred,bool output_bin,int grm_mtd,bool mlmassoc,bool diag_f3_flag,string subpopu_file)37 void gcta::make_grm(bool grm_d_flag, bool grm_xchr_flag, bool inbred, bool output_bin, int grm_mtd, bool mlmassoc, bool diag_f3_flag, string subpopu_file)
38 {
39     bool have_mis = false;
40 
41     if (!grm_d_flag && grm_xchr_flag) check_chrX();
42     else check_autosome();
43 
44     unsigned long i = 0, j = 0, k = 0, n = _keep.size(), m = _include.size();
45     if(grm_d_flag) have_mis = make_XMat_d(_geno);
46     else  have_mis = make_XMat(_geno);
47 
48     eigenVector sd_SNP;
49     if (grm_mtd == 0) {
50         if (grm_d_flag) std_XMat_d(_geno, sd_SNP, false, true);
51         else{
52             if(subpopu_file.empty()) std_XMat(_geno, sd_SNP, grm_xchr_flag, false, true);
53             else std_XMat_subpopu(subpopu_file, _geno, sd_SNP, grm_xchr_flag, false, true);
54         }
55     }
56     else {
57         if (grm_d_flag) std_XMat_d(_geno, sd_SNP, false, false);
58         else std_XMat(_geno, sd_SNP, grm_xchr_flag, false, false);
59     }
60 
61     if (!mlmassoc) cout << "\nCalculating the" << ((grm_d_flag) ? " dominance" : "") << " genetic relationship matrix (GRM)" << (grm_xchr_flag ? " for the X chromosome" : "") << (_dosage_flag ? " using imputed dosage data" : "") << " ... (Note: default speed-optimized mode, may use huge RAM)" << endl;
62     else cout << "\nCalculating the genetic relationship matrix (GRM) ... " << endl;
63 
64     // count the number of missing genotypes
65     vector< vector<int> > miss_pos;
66     vector< vector<bool> > X_bool;
67     if(have_mis){
68         miss_pos.resize(n);
69         X_bool.resize(n);
70         #pragma omp parallel for private(j)
71         for (i = 0; i < n; i++) {
72             X_bool[i].resize(m);
73             for (j = 0; j < m; j++) {
74                 if (_geno(i,j) < 1e5) X_bool[i][j] = true;
75                 else {
76                     _geno(i,j) = 0.0;
77                     miss_pos[i].push_back(j);
78                     X_bool[i][j] = false;
79                 }
80             }
81         }
82     }
83 
84     // Calculate A_N matrix
85     if(have_mis){
86         _grm_N.resize(n, n);
87         #pragma omp parallel for private(j, k)
88         for (i = 0; i < n; i++) {
89             for (j = 0; j <= i; j++) {
90                 int miss_j = 0;
91                 for (k = 0; k < miss_pos[j].size(); k++) miss_j += (int) X_bool[i][miss_pos[j][k]];
92                 _grm_N(i,j) = m - miss_pos[i].size() - miss_j;
93                 _grm_N(j,i) = _grm_N(i,j);
94             }
95         }
96     }
97     else _grm_N = MatrixXf::Constant(n,n,m);
98 
99     // Calcuate WW'
100     #ifdef SINGLE_PRECISION
101     _grm = _geno * _geno.transpose();
102     #else
103     _grm = (_geno * _geno.transpose()).cast<double>();
104     #endif
105 
106     // Calculate A matrix
107     if (grm_mtd == 1) _grm_N = _grm_N.array() * sd_SNP.mean();
108 
109     #pragma omp parallel for private(j)
110     for (i = 0; i < n; i++) {
111         for (j = 0; j <= i; j++) {
112             if(_grm_N(i,j) > 0) _grm(i,j) /= _grm_N(i,j);
113             else _grm(i,j) = 0.0;
114             _grm(j,i) = _grm(i,j);
115         }
116     }
117 
118     // GRM summary
119     double diag_m = 0.0, diag_v = 0.0, off_m = 0.0, off_v = 0.0;
120     calcu_grm_var(diag_m, diag_v, off_m, off_v);
121     cout<<"\nSummary of the GRM:" << endl;
122     cout<<"Mean of diagonals = "<<diag_m<<endl;
123     cout<<"Variance of diagonals = "<<diag_v<<endl;
124     cout<<"Mean of off-diagonals = " << off_m <<endl;
125     cout<<"Variance of off-diagonals = " << off_v <<endl;
126 
127     // re-calcuate the diagonals (Fhat3+1)
128     if (diag_f3_flag) {
129         #pragma omp parallel for private(j)
130         for (i = 0; i < n; i++) {
131             _grm(i,i) = 0.0;
132             double non_missing = 0.0;
133             for (j = 0; j < m; j++) {
134                 if (_geno(i,j) < 1e5){
135                     _grm(i,i) += _geno(i,j)*(_geno(i,j)+(_mu[_include[j]] - 1.0) * sd_SNP[j]);
136                     non_missing += 1.0;
137                 }
138             }
139             _grm(i,i) /= non_missing;
140         }
141     }
142 
143     if (inbred) {
144         #pragma omp parallel for private(j)
145         for (i = 0; i < n; i++) {
146             for (j = 0; j <= i; j++) _grm(i,j) *= 0.5;
147         }
148     }
149 
150     if (mlmassoc && grm_mtd == 0) {
151         for (j = 0; j < m; j++) {
152             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
153             else sd_SNP[j] = 1.0 / sd_SNP[j];
154         }
155         #pragma omp parallel for private(j)
156         for (i = 0; i < n; i++) {
157             for (j = 0; j < m; j++) {
158                 if (_geno(i,j) < 1e5) _geno(i,j) *= sd_SNP[j];
159                 else _geno(i,j) = 0.0;
160             }
161         }
162     }
163     else {
164         // Output A_N and A
165         string out_buf = _out;
166         if (grm_d_flag) _out += ".d";
167         output_grm(output_bin);
168         _out = out_buf;
169     }
170 }
171 
calcu_grm_var(double & diag_m,double & diag_v,double & off_m,double & off_v)172 void gcta::calcu_grm_var(double &diag_m, double &diag_v, double &off_m, double &off_v)
173 {
174     int i = 0, n = _keep.size();
175     diag_m = _grm.diagonal().mean();
176     diag_v = (_grm.diagonal() - eigenVector::Constant(n, diag_m)).squaredNorm() / ((double)n - 1.0);
177     double off_num = 0.5*n*(n - 1.0);
178     off_m = 0.0;
179     for (i = 1; i < n; i++) off_m += _grm.row(i).segment(0, i).sum();
180     off_m /= off_num;
181     off_v = 0.0;
182     for (i = 1; i < n; i++) off_v += (_grm.row(i).segment(0, i) -  eigenVector::Constant(i, off_m).transpose()).squaredNorm();
183     off_v /= (off_num - 1.0);
184 }
185 
output_grm(bool output_grm_bin)186 void gcta::output_grm(bool output_grm_bin)
187 {
188     int i = 0, j = 0;
189     string grm_file;
190     if (output_grm_bin) {
191         // Save matrix A in binary file
192         grm_file = _out + ".grm.bin";
193         fstream A_Bin(grm_file.c_str(), ios::out | ios::binary);
194         if (!A_Bin) throw ("Error: can not open the file [" + grm_file + "] to write.");
195         float f_buf = 0.0;
196         int size = sizeof (float);
197         for (i = 0; i < _keep.size(); i++) {
198             for (j = 0; j <= i; j++) {
199                 f_buf = (float) (_grm(i, j));
200                 A_Bin.write((char*) &f_buf, size);
201             }
202         }
203         A_Bin.close();
204         cout << "GRM of " << _keep.size() << " individuals has been saved in the file [" + grm_file + "] (in binary format)." << endl;
205 
206         string grm_N_file = _out + ".grm.N.bin";
207         fstream N_Bin(grm_N_file.c_str(), ios::out | ios::binary);
208         if (!N_Bin) throw ("Error: can not open the file [" + grm_N_file + "] to write.");
209         f_buf = 0.0;
210         size = sizeof (int);
211         for (i = 0; i < _keep.size(); i++) {
212             for (j = 0; j <= i; j++) {
213                 f_buf = (float) (_grm_N(i, j));
214                 N_Bin.write((char*) &f_buf, size);
215             }
216         }
217         N_Bin.close();
218         cout << "Number of SNPs to calcuate the genetic relationship between each pair of individuals has been saved in the file [" + grm_N_file + "] (in binary format)." << endl;
219     }
220     else {
221         // Save A matrix in txt format
222         grm_file = _out + ".grm.gz";
223         gzofstream zoutf;
224         zoutf.open(grm_file.c_str());
225         if (!zoutf.is_open()) throw ("Error: can not open the file [" + grm_file + "] to write.");
226         cout << "Saving the genetic relationship matrix to the file [" + grm_file + "] (in compressed text format)." << endl;
227         zoutf.setf(ios::scientific);
228         zoutf.precision(6);
229         for (i = 0; i < _keep.size(); i++) {
230             if (_grm_N.rows() > 0){
231                 zoutf.setf(ios::scientific);
232                 zoutf.precision(6);
233                 for (j = 0; j <= i; j++) zoutf << i + 1 << '\t' << j + 1 << '\t' << _grm_N(i, j) << '\t' << _grm(i, j) << endl;
234             }
235             else{
236                 for (j = 0; j <= i; j++) zoutf << i + 1 << '\t' << j + 1 << "\t0\t" << _grm(i, j) << endl;
237             }
238         }
239         zoutf.close();
240         cout << "The genetic relationship matrix has been saved in the file [" + grm_file + "] (in compressed text format)." << endl;
241     }
242 
243     string famfile = _out + ".grm.id";
244     ofstream Fam(famfile.c_str());
245     if (!Fam) throw ("Error: can not open the file [" + famfile + "] to write.");
246     for (i = 0; i < _keep.size(); i++) Fam << _fid[_keep[i]] + "\t" + _pid[_keep[i]] << endl;
247     Fam.close();
248     cout << "IDs for the GRM file [" + grm_file + "] have been saved in the file [" + famfile + "]." << endl;
249 }
250 
read_grm_id(string grm_file,vector<string> & grm_id,bool out_id_log,bool read_id_only)251 int gcta::read_grm_id(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only)
252 {
253     // read GRM IDs
254     string grm_id_file = grm_file + ".grm.id";
255     if (out_id_log) cout << "Reading IDs of the GRM from [" + grm_id_file + "]." << endl;
256     ifstream i_grm_id(grm_id_file.c_str());
257     if (!i_grm_id) throw ("Error: can not open the file [" + grm_id_file + "] to read.");
258     string str_buf, id_buf;
259     vector<string> fid, pid;
260     grm_id.clear();
261     while (i_grm_id) {
262         i_grm_id >> str_buf;
263         if (i_grm_id.eof()) break;
264         fid.push_back(str_buf);
265         id_buf = str_buf + ":";
266         i_grm_id >> str_buf;
267         pid.push_back(str_buf);
268         id_buf += str_buf;
269         grm_id.push_back(id_buf);
270         getline(i_grm_id, str_buf);
271     }
272     i_grm_id.close();
273     int n = grm_id.size();
274     if (out_id_log) cout << n << " IDs read from [" + grm_id_file + "]." << endl;
275 
276     if (_id_map.empty()) {
277         _fid = fid;
278         _pid = pid;
279         _indi_num = _fid.size();
280         _sex.resize(_fid.size());
281         init_keep();
282     }
283 
284     return (n);
285 }
286 
read_grm(string grm_file,vector<string> & grm_id,bool out_id_log,bool read_id_only,bool dont_read_N)287 void gcta::read_grm(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only, bool dont_read_N)
288 {
289     if (_grm_bin_flag) read_grm_bin(grm_file, grm_id, out_id_log, read_id_only, dont_read_N);
290     else read_grm_gz(grm_file, grm_id, out_id_log, read_id_only);
291 }
292 
read_grm_gz(string grm_file,vector<string> & grm_id,bool out_id_log,bool read_id_only)293 void gcta::read_grm_gz(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only) {
294     int n = read_grm_id(grm_file, grm_id, out_id_log, read_id_only);
295 
296     if (read_id_only) return;
297 
298     string grm_gzfile = grm_file + ".grm.gz", str_buf;
299     const int MAX_LINE_LENGTH = 1000;
300     char buf[MAX_LINE_LENGTH];
301     gzifstream zinf;
302     zinf.open(grm_gzfile.c_str());
303     if (!zinf.is_open()) throw ("Error: can not open the file [" + grm_gzfile + "] to read.");
304 
305     int indx1 = 0, indx2 = 0, nline = 0;
306     double grm_buf = 0.0, grm_N_buf;
307     string errmsg = "Error: failed to read [" + grm_gzfile + "]. The format of the GRM file has been changed?\nError occurs in line:\n";
308     cout << "Reading the GRM from [" + grm_gzfile + "]." << endl;
309     _grm.resize(n, n);
310     _grm_N.resize(n, n);
311     while (1) {
312         zinf.getline(buf, MAX_LINE_LENGTH, '\n');
313         if (zinf.fail() || !zinf.good()) break;
314         stringstream ss(buf);
315         if (!(ss >> indx1)) throw (errmsg + buf);
316         if (!(ss >> indx2)) throw (errmsg + buf);
317         if (!(ss >> grm_N_buf)) throw (errmsg + buf);
318         if (!(ss >> grm_buf)) throw (errmsg + buf);
319         if (indx1 < indx2 || indx1 > n || indx2 > n) throw (errmsg + buf);
320         if (grm_N_buf == 0) cout << "Warning: " << buf << endl;
321         _grm_N(indx1 - 1, indx2 - 1) = _grm_N(indx2 - 1, indx1 - 1) = grm_N_buf;
322         _grm(indx1 - 1, indx2 - 1) = _grm(indx2 - 1, indx1 - 1) = grm_buf;
323         nline++;
324         if (ss >> str_buf) throw (errmsg + buf);
325     }
326     zinf.close();
327     if (!_within_family && nline != (int) (n * (n + 1)*0.5)){
328         stringstream errmsg_tmp;
329         errmsg_tmp << "Error: there are " << nline << " lines in the [" << grm_gzfile << "] file. The expected number of lines is " << (int) (n * (n + 1)*0.5) << "." << endl;
330         throw(errmsg_tmp.str());
331         //throw ("Error: incorrect number of lines in the grm file. *.grm.gz file and *.grm.id file are mismatched?");
332     }
333     cout << "GRM for " << n << " individuals are included from [" + grm_gzfile + "]." << endl;
334 }
335 
read_grm_bin(string grm_file,vector<string> & grm_id,bool out_id_log,bool read_id_only,bool dont_read_N)336 void gcta::read_grm_bin(string grm_file, vector<string> &grm_id, bool out_id_log, bool read_id_only, bool dont_read_N)
337 {
338     int i = 0, j = 0, n = read_grm_id(grm_file, grm_id, out_id_log, read_id_only);
339 
340     if (read_id_only) return;
341 
342     string grm_binfile = grm_file + ".grm.bin";
343     ifstream A_bin(grm_binfile.c_str(), ios::in | ios::binary);
344     if (!A_bin.is_open()) throw ("Error: can not open the file [" + grm_binfile + "] to read.");
345     _grm.resize(n, n);
346     cout << "Reading the GRM from [" + grm_binfile + "]." << endl;
347     int size = sizeof (float);
348     float f_buf = 0.0;
349     for (i = 0; i < n; i++) {
350         for (j = 0; j <= i; j++) {
351             if (!(A_bin.read((char*) &f_buf, size))) throw ("Error: the size of the [" + grm_binfile + "] file is incomplete?");
352             _grm(j, i) = _grm(i, j) = f_buf;
353         }
354     }
355     A_bin.close();
356 
357     if(!dont_read_N){
358         string grm_Nfile = grm_file + ".grm.N.bin";
359         ifstream N_bin(grm_Nfile.c_str(), ios::in | ios::binary);
360         if (!N_bin.is_open()) throw ("Error: can not open the file [" + grm_Nfile + "] to read.");
361         _grm_N.resize(n, n);
362         cout << "Reading the number of SNPs for the GRM from [" + grm_Nfile + "]." << endl;
363         size = sizeof (float);
364         f_buf = 0.0;
365         for (i = 0; i < n; i++) {
366             for (j = 0; j <= i; j++) {
367                 if (!(N_bin.read((char*) &f_buf, size))) throw ("Error: the size of the [" + grm_Nfile + "] file is incomplete?");
368                 _grm_N(j, i) = _grm_N(i, j) = f_buf;
369             }
370         }
371         N_bin.close();
372     }
373 
374     cout << "GRM for " << n << " individuals are included from [" + grm_binfile + "]." << endl;
375 }
376 
rm_cor_indi(double grm_cutoff)377 void gcta::rm_cor_indi(double grm_cutoff) {
378     cout << "Pruning the GRM with a cutoff of " << grm_cutoff << " ..." << endl;
379 
380     int i = 0, j = 0, i_buf = 0;
381 
382     // identify the positions where you see a value > than the threshold
383     vector<int> rm_grm_ID1, rm_grm_ID2;
384     for (i = 0; i < _keep.size(); i++) {
385         for (j = 0; j < i; j++) {
386             if (_grm(_keep[i], _keep[j]) > grm_cutoff) {
387                 rm_grm_ID1.push_back(_keep[i]);
388                 rm_grm_ID2.push_back(_keep[j]);
389             }
390         }
391     }
392 
393     // count the number of appearance of each "position" in the vector, which involves a few steps
394     vector<int> rm_uni_ID(rm_grm_ID1);
395     rm_uni_ID.insert(rm_uni_ID.end(), rm_grm_ID2.begin(), rm_grm_ID2.end());
396     stable_sort(rm_uni_ID.begin(), rm_uni_ID.end());
397     rm_uni_ID.erase(unique(rm_uni_ID.begin(), rm_uni_ID.end()), rm_uni_ID.end());
398     map<int, int> rm_uni_ID_count;
399     for (i = 0; i < rm_uni_ID.size(); i++) {
400         i_buf = count(rm_grm_ID1.begin(), rm_grm_ID1.end(), rm_uni_ID[i]) + count(rm_grm_ID2.begin(), rm_grm_ID2.end(), rm_uni_ID[i]);
401         rm_uni_ID_count.insert(pair<int, int>(rm_uni_ID[i], i_buf));
402     }
403 
404     // swapping
405     map<int, int>::iterator iter1, iter2;
406     for (i = 0; i < rm_grm_ID1.size(); i++) {
407         iter1 = rm_uni_ID_count.find(rm_grm_ID1[i]);
408         iter2 = rm_uni_ID_count.find(rm_grm_ID2[i]);
409         if (iter1->second < iter2->second) {
410             i_buf = rm_grm_ID1[i];
411             rm_grm_ID1[i] = rm_grm_ID2[i];
412             rm_grm_ID2[i] = i_buf;
413         }
414     }
415 
416     stable_sort(rm_grm_ID1.begin(), rm_grm_ID1.end());
417     rm_grm_ID1.erase(unique(rm_grm_ID1.begin(), rm_grm_ID1.end()), rm_grm_ID1.end());
418     vector<string> removed_ID;
419     for (i = 0; i < rm_grm_ID1.size(); i++) removed_ID.push_back(_fid[rm_grm_ID1[i]] + ":" + _pid[rm_grm_ID1[i]]);
420 
421     // update _keep and _id_map
422     update_id_map_rm(removed_ID, _id_map, _keep);
423 
424     cout << "After pruning the GRM, there are " << _keep.size() << " individuals (" << removed_ID.size() << " individuals removed)." << endl;
425 }
426 
adj_grm(double adj_grm_fac)427 void gcta::adj_grm(double adj_grm_fac) {
428     cout << "Adjusting the GRM for sampling errors ..." << endl;
429     int i = 0, j = 0, n = _keep.size();
430     double off_mean = 0.0, diag_mean = 0.0, off_var = 0.0, diag_var = 0.0, d_buf = 0.0;
431     for (i = 0; i < n; i++) {
432         diag_mean += _grm(_keep[i], _keep[i]);
433         for (j = 0; j < i; j++) off_mean += _grm(_keep[i], _keep[j]);
434     }
435     diag_mean /= n;
436     off_mean /= 0.5 * n * (n - 1.0);
437     for (i = 0; i < n; i++) {
438         d_buf = _grm(_keep[i], _keep[i]) - diag_mean;
439         diag_var += d_buf*d_buf;
440         for (j = 0; j < i; j++) {
441             d_buf = _grm(_keep[i], _keep[j]) - off_mean;
442             off_var += d_buf*d_buf;
443         }
444     }
445     diag_var /= n - 1.0;
446     off_var /= 0.5 * n * (n - 1.0) - 1.0;
447     for (i = 0; i < _keep.size(); i++) {
448         d_buf = 1.0 - (adj_grm_fac + 1.0 / _grm_N(_keep[i], _keep[i])) / diag_var;
449         if (_grm(_keep[i], _keep[i]) > 0) _grm(_keep[i], _keep[i]) = 1.0 + d_buf * (_grm(_keep[i], _keep[i]) - 1.0);
450         for (j = 0; j < i; j++) {
451             if (_grm_N(_keep[i], _keep[j]) > 0) _grm(_keep[i], _keep[j]) *= 1.0 - (adj_grm_fac + 1.0 / _grm_N(_keep[i], _keep[j])) / off_var;
452         }
453     }
454 }
455 
dc(int dosage_compen)456 void gcta::dc(int dosage_compen) {
457     cout << "Parameterizing the GRM under the assumption of ";
458     if (dosage_compen == 1) cout << "full dosage compensation ..." << endl;
459     else if (dosage_compen == 0) cout << "no dosage compensation ..." << endl;
460 
461     int i = 0, j = 0, i_buf = 0;
462     double c1 = 1.0, c2 = 1.0;
463     if (dosage_compen == 1) {
464         c1 = 2.0;
465         c2 = sqrt(2.0);
466     }// full dosage compensation
467     else if (dosage_compen == 0) {
468         c1 = 0.5;
469         c2 = sqrt(0.5);
470     } // on dosage compensation
471     for (i = 0; i < _keep.size(); i++) {
472         for (j = 0; j <= i; j++) {
473             i_buf = _sex[_keep[i]] * _sex[_keep[j]];
474             if (i_buf == 1) _grm(i, j) *= c1;
475             else if (i_buf == 2) _grm(i, j) *= c2;
476         }
477     }
478 }
479 
manipulate_grm(string grm_file,string keep_indi_file,string remove_indi_file,string sex_file,double grm_cutoff,double adj_grm_fac,int dosage_compen,bool merge_grm_flag,bool dont_read_N)480 void gcta::manipulate_grm(string grm_file, string keep_indi_file, string remove_indi_file, string sex_file, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool merge_grm_flag, bool dont_read_N)
481 {
482     int i = 0, j = 0;
483 
484     vector<string> grm_id;
485     if (merge_grm_flag) merge_grm(grm_file);
486     else read_grm(grm_file, grm_id, true, false, dont_read_N);
487 
488     if (!keep_indi_file.empty()) keep_indi(keep_indi_file);
489     if (!remove_indi_file.empty()) remove_indi(remove_indi_file);
490     if (grm_cutoff>-1.0) rm_cor_indi(grm_cutoff);
491     if (!sex_file.empty()) update_sex(sex_file);
492     if (adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
493     if (dosage_compen>-1) dc(dosage_compen);
494     if (grm_cutoff>-1.0 || !keep_indi_file.empty() || !remove_indi_file.empty()) {
495         eigenMatrix grm_buf(_grm);
496         _grm.resize(_keep.size(), _keep.size());
497         for (i = 0; i < _keep.size(); i++) {
498             for (j = 0; j <= i; j++) _grm(i, j) = grm_buf(_keep[i], _keep[j]);
499         }
500         grm_buf.resize(0,0);
501         MatrixXf grm_N_buf = _grm_N;
502         _grm_N.resize(_keep.size(), _keep.size());
503         for (i = 0; i < _keep.size(); i++) {
504             for (j = 0; j <= i; j++) _grm_N(i, j) = grm_N_buf(_keep[i], _keep[j]);
505         }
506     }
507 }
508 
save_grm(string grm_file,string keep_indi_file,string remove_indi_file,string sex_file,double grm_cutoff,double adj_grm_fac,int dosage_compen,bool merge_grm_flag,bool output_grm_bin)509 void gcta::save_grm(string grm_file, string keep_indi_file, string remove_indi_file, string sex_file, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool merge_grm_flag, bool output_grm_bin) {
510     if (dosage_compen>-1) check_sex();
511     manipulate_grm(grm_file, keep_indi_file, remove_indi_file, sex_file, grm_cutoff, adj_grm_fac, dosage_compen, merge_grm_flag);
512     output_grm(output_grm_bin);
513 }
514 
merge_grm(string merge_grm_file)515 void gcta::merge_grm(string merge_grm_file) {
516     vector<string> grm_files, grm_id;
517     read_grm_filenames(merge_grm_file, grm_files);
518 
519     int f = 0, i = 0, j = 0;
520     for (f = 0; f < grm_files.size(); f++) {
521         read_grm(grm_files[f], grm_id, false, true);
522         update_id_map_kp(grm_id, _id_map, _keep);
523     }
524     vector<string> uni_id;
525     for (i = 0; i < _keep.size(); i++) uni_id.push_back(_fid[_keep[i]] + ":" + _pid[_keep[i]]);
526     _n = uni_id.size();
527     if (_n == 0) throw ("Error: no individual is in common in the GRM files.");
528     else cout << _n << " individuals in common in the GRM files." << endl;
529 
530     vector<int> kp;
531     eigenMatrix grm = eigenMatrix::Zero(_n, _n);
532     eigenMatrix grm_N = eigenMatrix::Zero(_n, _n);
533     for (f = 0; f < grm_files.size(); f++) {
534         cout << "Reading the GRM from the " << f + 1 << "th file ..." << endl;
535         read_grm(grm_files[f], grm_id);
536         StrFunc::match(uni_id, grm_id, kp);
537         for (i = 0; i < _n; i++) {
538             for (j = 0; j <= i; j++) {
539                 if (kp[i] >= kp[j]) {
540                     grm(i, j) += _grm(kp[i], kp[j]) * _grm_N(kp[i], kp[j]);
541                     grm_N(i, j) += _grm_N(kp[i], kp[j]);
542                 } else {
543                     grm(i, j) += _grm(kp[j], kp[i]) * _grm_N(kp[j], kp[i]);
544                     grm_N(i, j) += _grm_N(kp[j], kp[i]);
545                 }
546             }
547         }
548     }
549     for (i = 0; i < _n; i++) {
550         for (j = 0; j <= i; j++) {
551             if (grm_N(i, j) == 0) _grm(i, j) = 0;
552             else _grm(i, j) = grm(i, j) / grm_N(i, j);
553             _grm_N(i, j) = grm_N(i, j);
554         }
555     }
556     grm.resize(0, 0);
557     grm_N.resize(0, 0);
558     cout << "\n" << grm_files.size() << " GRMs have been merged together." << endl;
559 }
560 
read_grm_filenames(string merge_grm_file,vector<string> & grm_files,bool out_log)561 void gcta::read_grm_filenames(string merge_grm_file, vector<string> &grm_files, bool out_log) {
562     ifstream merge_grm(merge_grm_file.c_str());
563     if (!merge_grm) throw ("Error: can not open the file [" + merge_grm_file + "] to read.");
564     string str_buf;
565     grm_files.clear();
566     vector<string> vs_buf;
567     while (getline(merge_grm, str_buf)) {
568         if (!str_buf.empty()) {
569             if (StrFunc::split_string(str_buf, vs_buf) == 1) grm_files.push_back(vs_buf[0]);
570         }
571     }
572     if (out_log) cout << "There are " << grm_files.size() << " GRM file names specified in [" + merge_grm_file + "]." << endl;
573     if (grm_files.size() > 1000) throw ("Error: too many GRM file names specified in [" + merge_grm_file + "]. Maximum is 1000.");
574     if (grm_files.size() < 1) throw ("Error: no GRM file name is found in [" + merge_grm_file + "].");
575 }
576 
grm_bK(string grm_file,string keep_indi_file,string remove_indi_file,double threshold,bool grm_out_bin_flag)577 void gcta::grm_bK(string grm_file, string keep_indi_file, string remove_indi_file, double threshold, bool grm_out_bin_flag)
578 {
579     int i = 0, j = 0;
580     vector<string> grm_id;
581     read_grm(grm_file, grm_id);
582     if (!keep_indi_file.empty()) keep_indi(keep_indi_file);
583     if (!remove_indi_file.empty()) remove_indi(remove_indi_file);
584     if (!keep_indi_file.empty() || !remove_indi_file.empty()) {
585         eigenMatrix grm_buf(_grm);
586         _grm.resize(_keep.size(), _keep.size());
587         for (i = 0; i < _keep.size(); i++) {
588             for (j = 0; j <= i; j++) _grm(i, j) = grm_buf(_keep[i], _keep[j]);
589         }
590         grm_buf.resize(0,0);
591         MatrixXf grm_N_buf = _grm_N;
592         _grm_N.resize(_keep.size(), _keep.size());
593         for (i = 0; i < _keep.size(); i++) {
594             for (j = 0; j <= i; j++) _grm_N(i, j) = grm_N_buf(_keep[i], _keep[j]);
595         }
596     }
597 
598     cout << "\nThe off-diagonals that are < " << threshold << " are set to zero.\n" << endl;
599     for (i = 0; i < _keep.size(); i++) {
600         for (j = 0; j < i; j++){
601             if(_grm(i, j) < threshold) _grm(i, j) = 0.0;
602         }
603     }
604 
605     output_grm(grm_out_bin_flag);
606 }
607 
pca(string grm_file,string keep_indi_file,string remove_indi_file,double grm_cutoff,bool merge_grm_flag,int out_pc_num)608 void gcta::pca(string grm_file, string keep_indi_file, string remove_indi_file, double grm_cutoff, bool merge_grm_flag, int out_pc_num)
609 {
610     manipulate_grm(grm_file, keep_indi_file, remove_indi_file, "", grm_cutoff, -2.0, -2, merge_grm_flag, true);
611     _grm_N.resize(0, 0);
612     int i = 0, j = 0, n = _keep.size();
613     cout << "\nPerforming principal component analysis ..." << endl;
614 
615     SelfAdjointEigenSolver<MatrixXd> eigensolver(_grm.cast<double>());
616     MatrixXd evec = (eigensolver.eigenvectors());
617     VectorXd eval = eigensolver.eigenvalues();
618 
619     string eval_file = _out + ".eigenval";
620     ofstream o_eval(eval_file.c_str());
621     if (!o_eval) throw ("Error: can not open the file [" + eval_file + "] to read.");
622     for (i = n - 1; i >= 0; i--) o_eval << eval(i) << endl;
623     o_eval.close();
624     cout << "Eigenvalues of " << n << " individuals have been saved in [" + eval_file + "]." << endl;
625     string evec_file = _out + ".eigenvec";
626     ofstream o_evec(evec_file.c_str());
627     if (!o_evec) throw ("Error: can not open the file [" + evec_file + "] to read.");
628     if (out_pc_num > n) out_pc_num = n;
629     for (i = 0; i < n; i++) {
630         o_evec << _fid[_keep[i]] << " " << _pid[_keep[i]];
631         for (j = n - 1; j >= (n - out_pc_num); j--) o_evec << " " << evec(i, j);
632         o_evec << endl;
633     }
634     o_evec.close();
635     cout << "The first " << out_pc_num << " eigenvectors of " << n << " individuals have been saved in [" + evec_file + "]." << endl;
636 }
637 
snp_pc_loading(string pc_file,int grm_N)638 void gcta::snp_pc_loading(string pc_file, int grm_N)
639 {
640     // read eigenvectors and eigenvalues
641     string eigenval_file = pc_file + ".eigenval";
642     ifstream in_eigenval(eigenval_file.c_str());
643     if (!in_eigenval) throw ("Error: can not open the file [" + eigenval_file + "] to read.");
644     string eigenvec_file = pc_file + ".eigenvec";
645     ifstream in_eigenvec(eigenvec_file.c_str());
646     if (!in_eigenvec) throw ("Error: can not open the file [" + eigenvec_file + "] to read.");
647 
648     cout << "Reading eigenvectors from [" + eigenvec_file + "]." << endl;
649     vector<string> eigenvec_ID;
650     vector< vector<string> > eigenvec_str;
651     int eigenvec_num = read_fac(in_eigenvec, eigenvec_ID, eigenvec_str);
652     cout << eigenvec_num << " eigenvectors of " << eigenvec_ID.size() << " individuals are included from [" + eigenvec_file + "]." << endl;
653     update_id_map_kp(eigenvec_ID, _id_map, _keep);
654 
655     cout << "\nReading eigenvalues from [" + eigenval_file + "]." << endl;
656     vector<double> eigenval_buf;
657     double d_buf = 0.0;
658     int eigenval_num = 0;
659     while(in_eigenval && eigenval_num < eigenvec_num){
660         in_eigenval >> d_buf;
661         if(d_buf > 1e10 || d_buf < 1e-10) throw("Error: invalid eigenvalue in the file [" + eigenval_file + "].");
662         eigenval_buf.push_back(d_buf);
663         eigenval_num++;
664     }
665     if(eigenvec_num != eigenval_num) throw("Error: inconsistent numbers of eigenvalues and eigenvectors in the files [" + eigenval_file + "] and [" + eigenvec_file + "]");
666     cout << eigenval_num << " eigenvalues read from [" + eigenval_file + "]" << endl;
667 
668     int i = 0, j = 0;
669     vector<string> uni_id;
670     map<string, int> uni_id_map;
671     map<string, int>::iterator iter;
672     for(i=0; i<_keep.size(); i++){
673         uni_id.push_back(_fid[_keep[i]]+":"+_pid[_keep[i]]);
674         uni_id_map.insert(pair<string,int>(_fid[_keep[i]]+":"+_pid[_keep[i]], i));
675     }
676     _n=_keep.size();
677     if(_n < 1) throw("Error: no individual is in common between the input files.");
678     cout << _n << " individuals in common between the input files are included in the analysis."<<endl;
679 
680     eigenMatrix eigenvec(eigenvec_num, _n);
681     for(i = 0; i < eigenvec_ID.size(); i++){
682         iter = uni_id_map.find(eigenvec_ID[i]);
683         if(iter == uni_id_map.end()) continue;
684         for(j = 0; j < eigenvec_num; j++) eigenvec(j, iter->second) = atof(eigenvec_str[i][j].c_str());
685     }
686 
687     eigenVector inv_eigenval(eigenval_num);
688     for(i = 0; i < eigenval_num; i++)  inv_eigenval(i) = 1.0 / (eigenval_buf[i] * grm_N);
689 
690     // calculating SNP loading
691     if (_mu.empty()) calcu_mu();
692     cout << "\nCalculating SNP loading ..." << endl;
693     int m = _include.size();
694     eigenMatrix snp_loading(m, eigenvec_num);
695     eigenVector x(_n);
696     for(j = 0; j < m ; j++) {
697         makex_eigenVector(j, x, false, true);
698         x = x.array() / sqrt(_mu[_include[j]]*(1.0 - 0.5*_mu[_include[j]]));
699         snp_loading.row(j) = (eigenvec * x).array() * inv_eigenval.array();
700     }
701 
702     string filename = _out + ".pcl";
703     cout << "\nSaving the PC loading of " << m << " SNPs to [" + filename + "] ..." << endl;
704     ofstream ofile(filename.c_str());
705     if(!ofile) throw("Can not open the file [" + filename + "] to write.");
706     ofile << "SNP\trefA";
707     for(i = 0; i < eigenval_num; i++) ofile << "\tpc" << i+1 << "_loading";
708     ofile << endl;
709     for(i = 0; i < m; i++){
710         ofile << _snp_name[_include[i]] << "\t" << _ref_A[_include[i]];
711         for(j = 0; j < eigenvec_num; j++) ofile << "\t" << snp_loading(i, j);
712         ofile << "\n";
713     }
714     ofile.close();
715 }
716 
717 
718 
719