1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementations of functions for data management
5  *
6  * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #include "gcta.h"
14 
gcta(int autosome_num,double rm_ld_cutoff,string out)15 gcta::gcta(int autosome_num, double rm_ld_cutoff, string out)
16 {
17     _autosome_num = autosome_num;
18     _rm_ld_cutoff = rm_ld_cutoff;
19     _out = out;
20     _dosage_flag = false;
21     _grm_bin_flag = false;
22     _reml_mtd = 0;
23     _reml_inv_mtd = 0;
24     _reml_max_iter = 30;
25     _jma_actual_geno = false;
26     _jma_wind_size = 1e7;
27     _jma_p_cutoff = 5e-8;
28     _jma_collinear = 0.9;
29     _jma_Vp = 1;
30     _GC_val = 1;
31     _bivar_reml = false;
32     _ignore_Ce = false;
33     _bivar_no_constrain = false;
34     _y_Ssq = 0.0;
35     _y2_Ssq = 0.0;
36     _ncase = 0;
37     _ncase2 = 0;
38     _flag_CC = false;
39     _flag_CC2 = false;
40     _within_family = false;
41     _reml_have_bend_A = false;
42     _V_inv_mtd = 0;
43     _reml_force_inv = false;
44     _reml_AI_not_invertible = false;
45     _reml_force_converge = false;
46     _reml_no_converge = false;
47     _reml_fixed_var = false;
48     _ldscore_adj = false;
49 }
50 
gcta()51 gcta::gcta() {
52     _dosage_flag = false;
53     _grm_bin_flag = false;
54     _reml_mtd = 0;
55     _reml_inv_mtd = 0;
56     _reml_max_iter = 30;
57     _jma_actual_geno = false;
58     _jma_wind_size = 1e7;
59     _jma_p_cutoff = 5e-8;
60     _jma_collinear = 0.9;
61     _jma_Vp = 1;
62     _GC_val = 1;
63     _bivar_reml = false;
64     _ignore_Ce = false;
65     _bivar_no_constrain = false;
66     _y_Ssq = 0.0;
67     _y2_Ssq = 0.0;
68     _ncase = 0;
69     _ncase2 = 0;
70     _flag_CC = false;
71     _flag_CC2 = false;
72     _within_family = false;
73     _reml_have_bend_A = false;
74     _V_inv_mtd = 0;
75     _reml_force_inv = false;
76     _reml_AI_not_invertible = false;
77     _reml_force_converge = false;
78     _reml_no_converge = false;
79     _reml_fixed_var = false;
80     _ldscore_adj = false;
81 }
82 
~gcta()83 gcta::~gcta() {
84 
85 }
86 
read_famfile(string famfile)87 void gcta::read_famfile(string famfile) {
88     ifstream Fam(famfile.c_str());
89     if (!Fam) throw ("Error: can not open the file [" + famfile + "] to read.");
90     cout << "Reading PLINK FAM file from [" + famfile + "]." << endl;
91 
92     int i = 0;
93     string str_buf;
94     _fid.clear();
95     _pid.clear();
96     _fa_id.clear();
97     _mo_id.clear();
98     _sex.clear();
99     _pheno.clear();
100     while (Fam) {
101         Fam >> str_buf;
102         if (Fam.eof()) break;
103         _fid.push_back(str_buf);
104         Fam >> str_buf;
105         _pid.push_back(str_buf);
106         Fam >> str_buf;
107         _fa_id.push_back(str_buf);
108         Fam >> str_buf;
109         _mo_id.push_back(str_buf);
110         Fam >> str_buf;
111         _sex.push_back(atoi(str_buf.c_str()));
112         Fam >> str_buf;
113         _pheno.push_back(atoi(str_buf.c_str()));
114     }
115     Fam.clear();
116     Fam.close();
117     _indi_num = _fid.size();
118     cout << _indi_num << " individuals to be included from [" + famfile + "]." << endl;
119 
120     // Initialize _keep
121     init_keep();
122 }
123 
init_keep()124 void gcta::init_keep() {
125     _keep.clear();
126     _keep.resize(_indi_num);
127     _id_map.clear();
128     int i = 0, size = 0;
129     for (i = 0; i < _indi_num; i++) {
130         _keep[i] = i;
131         _id_map.insert(pair<string, int>(_fid[i] + ":" + _pid[i], i));
132         if (size == _id_map.size()) throw ("Error: Duplicate individual ID found: \"" + _fid[i] + "\t" + _pid[i] + "\".");
133         size = _id_map.size();
134     }
135 }
136 
read_bimfile(string bimfile)137 void gcta::read_bimfile(string bimfile) {
138     // Read bim file: recombination rate is defined between SNP i and SNP i-1
139     int ibuf = 0;
140     string cbuf = "0";
141     double dbuf = 0.0;
142     string str_buf;
143     ifstream Bim(bimfile.c_str());
144     if (!Bim) throw ("Error: can not open the file [" + bimfile + "] to read.");
145     cout << "Reading PLINK BIM file from [" + bimfile + "]." << endl;
146     _chr.clear();
147     _snp_name.clear();
148     _genet_dst.clear();
149     _bp.clear();
150     _allele1.clear();
151     _allele2.clear();
152     while (Bim) {
153         Bim >> ibuf;
154         if (Bim.eof()) break;
155         _chr.push_back(ibuf);
156         Bim >> str_buf;
157         _snp_name.push_back(str_buf);
158         Bim >> dbuf;
159         _genet_dst.push_back(dbuf);
160         Bim >> ibuf;
161         _bp.push_back(ibuf);
162         Bim >> cbuf;
163         StrFunc::to_upper(cbuf);
164         _allele1.push_back(cbuf);
165         Bim >> cbuf;
166         StrFunc::to_upper(cbuf);
167         _allele2.push_back(cbuf);
168     }
169     Bim.close();
170     _snp_num = _chr.size();
171     _ref_A = _allele1;
172     _other_A = _allele2;
173     cout << _snp_num << " SNPs to be included from [" + bimfile + "]." << endl;
174 
175     // Initialize _include
176     init_include();
177 }
178 
init_include()179 void gcta::init_include()
180 {
181     _include.clear();
182     _include.resize(_snp_num);
183     _snp_name_map.clear();
184     int i = 0, size = 0;
185     for (i = 0; i < _snp_num; i++) {
186         _include[i] = i;
187         if(_snp_name_map.find(_snp_name[i]) != _snp_name_map.end()){
188             cout << "Warning: Duplicated SNP ID \"" + _snp_name[i] + "\" ";
189             stringstream ss;
190             ss << _snp_name[i] << "_" << i + 1;
191             _snp_name[i] = ss.str();
192             cout<<"has been changed to \"" + _snp_name[i] + "\"\n.";
193         }
194         _snp_name_map.insert(pair<string, int>(_snp_name[i], i));
195     }
196 }
197 
198 // some code are adopted from PLINK with modifications
read_bedfile(string bedfile)199 void gcta::read_bedfile(string bedfile)
200 {
201     int i = 0, j = 0, k = 0;
202 
203     // Flag for reading individuals and SNPs
204     vector<int> rindi, rsnp;
205     get_rindi(rindi);
206     get_rsnp(rsnp);
207 
208     if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
209     if (_keep.size() == 0) throw ("Error: No individual is retained for analysis.");
210 
211     // Read bed file
212     char ch[1];
213     bitset<8> b;
214     _snp_1.resize(_include.size());
215     _snp_2.resize(_include.size());
216     for (i = 0; i < _include.size(); i++) {
217         _snp_1[i].reserve(_keep.size());
218         _snp_2[i].reserve(_keep.size());
219     }
220     fstream BIT(bedfile.c_str(), ios::in | ios::binary);
221     if (!BIT) throw ("Error: can not open the file [" + bedfile + "] to read.");
222     cout << "Reading PLINK BED file from [" + bedfile + "] in SNP-major format ..." << endl;
223     for (i = 0; i < 3; i++) BIT.read(ch, 1); // skip the first three bytes
224     int snp_indx = 0, indi_indx = 0;
225     for (j = 0, snp_indx = 0; j < _snp_num; j++) { // Read genotype in SNP-major mode, 00: homozygote AA; 11: homozygote BB; 01: hetezygote; 10: missing
226         if (!rsnp[j]) {
227             for (i = 0; i < _indi_num; i += 4) BIT.read(ch, 1);
228             continue;
229         }
230         for (i = 0, indi_indx = 0; i < _indi_num;) {
231             BIT.read(ch, 1);
232             if (!BIT) throw ("Error: problem with the BED file ... has the FAM/BIM file been changed?");
233             b = ch[0];
234             k = 0;
235             while (k < 7 && i < _indi_num) { // change code: 11 for AA; 00 for BB;
236                 if (!rindi[i]) k += 2;
237                 else {
238                     _snp_2[snp_indx][indi_indx] = (!b[k++]);
239                     _snp_1[snp_indx][indi_indx] = (!b[k++]);
240                     indi_indx++;
241                 }
242                 i++;
243             }
244         }
245         if (snp_indx == _include.size()) break;
246         snp_indx++;
247     }
248     BIT.clear();
249     BIT.close();
250     cout << "Genotype data for " << _keep.size() << " individuals and " << _include.size() << " SNPs to be included from [" + bedfile + "]." << endl;
251 
252     update_fam(rindi);
253     update_bim(rsnp);
254 }
255 
get_rsnp(vector<int> & rsnp)256 void gcta::get_rsnp(vector<int> &rsnp) {
257     rsnp.clear();
258     rsnp.resize(_snp_num);
259     for (int i = 0; i < _snp_num; i++) {
260         if (_snp_name_map.find(_snp_name[i]) != _snp_name_map.end()) rsnp[i] = 1;
261         else rsnp[i] = 0;
262     }
263 }
264 
get_rindi(vector<int> & rindi)265 void gcta::get_rindi(vector<int> &rindi) {
266     rindi.clear();
267     rindi.resize(_indi_num);
268     for (int i = 0; i < _indi_num; i++) {
269         if (_id_map.find(_fid[i] + ":" + _pid[i]) != _id_map.end()) rindi[i] = 1;
270         else rindi[i] = 0;
271     }
272 }
273 
update_bim(vector<int> & rsnp)274 void gcta::update_bim(vector<int> &rsnp) {
275     int i = 0;
276 
277     //update bim information
278     vector<int> chr_buf, bp_buf;
279     vector<string> a1_buf, a2_buf, ref_A_buf, other_A_buf;
280     vector<string> snp_name_buf;
281     vector<double> genet_dst_buf, impRsq_buf;
282     for (i = 0; i < _snp_num; i++) {
283         if (!rsnp[i]) continue;
284         chr_buf.push_back(_chr[i]);
285         snp_name_buf.push_back(_snp_name[i]);
286         genet_dst_buf.push_back(_genet_dst[i]);
287         bp_buf.push_back(_bp[i]);
288         a1_buf.push_back(_allele1[i]);
289         a2_buf.push_back(_allele2[i]);
290         ref_A_buf.push_back(_ref_A[i]);
291         other_A_buf.push_back(_other_A[i]);
292         if(_impRsq.size()>0) impRsq_buf.push_back(_impRsq[i]);
293     }
294     _chr.clear();
295     _snp_name.clear();
296     _genet_dst.clear();
297     _bp.clear();
298     _allele1.clear();
299     _allele2.clear();
300     _ref_A.clear();
301     _other_A.clear();
302     _impRsq.clear();
303     _chr = chr_buf;
304     _snp_name = snp_name_buf;
305     _genet_dst = genet_dst_buf;
306     _bp = bp_buf;
307     _allele1 = a1_buf;
308     _allele2 = a2_buf;
309     _ref_A = ref_A_buf;
310     _other_A = other_A_buf;
311     _impRsq=impRsq_buf;
312     _snp_num = _chr.size();
313     _include.clear();
314     _include.resize(_snp_num);
315     _snp_name_map.clear();
316 
317     for (i = 0; i < _snp_num; i++) {
318         _include[i] = i;
319         _snp_name_map.insert(pair<string, int>(_snp_name[i], i));
320     }
321 }
322 
update_fam(vector<int> & rindi)323 void gcta::update_fam(vector<int> &rindi) {
324     //update fam information
325     int i = 0;
326     vector<string> fid_buf, pid_buf, fa_id_buf, mo_id_buf;
327     vector<int> sex_buf;
328     vector<double> pheno_buf;
329     for (i = 0; i < _indi_num; i++) {
330         if (!rindi[i]) continue;
331         fid_buf.push_back(_fid[i]);
332         pid_buf.push_back(_pid[i]);
333         fa_id_buf.push_back(_fa_id[i]);
334         mo_id_buf.push_back(_mo_id[i]);
335         sex_buf.push_back(_sex[i]);
336         pheno_buf.push_back(_pheno[i]);
337     }
338     _fid.clear();
339     _pid.clear();
340     _fa_id.clear();
341     _mo_id.clear();
342     _sex.clear();
343     _pheno.clear();
344     _fid = fid_buf;
345     _pid = pid_buf;
346     _fa_id = fa_id_buf;
347     _mo_id = mo_id_buf;
348     _sex = sex_buf;
349     _pheno = pheno_buf;
350 
351     _indi_num = _fid.size();
352     _keep.clear();
353     _keep.resize(_indi_num);
354     _id_map.clear();
355     for (i = 0; i < _indi_num; i++) {
356         _keep[i] = i;
357         _id_map.insert(pair<string, int>(_fid[i] + ":" + _pid[i], i));
358     }
359 }
360 
read_imp_info_mach_gz(string zinfofile)361 void gcta::read_imp_info_mach_gz(string zinfofile)
362 {
363     _dosage_flag = true;
364 
365     int i = 0;
366     gzifstream zinf;
367     zinf.open(zinfofile.c_str());
368     if (!zinf.is_open()) throw ("Error: can not open the file [" + zinfofile + "] to read.");
369 
370     string buf, str_buf, errmsg = "Reading dosage data failed. Please check the format of the map file.";
371     string c_buf;
372     double f_buf = 0.0;
373     cout << "Reading map file of the imputed dosage data from [" + zinfofile + "]." << endl;
374     getline(zinf, buf); // skip the header
375     vector<string> vs_buf;
376     int col_num = StrFunc::split_string(buf, vs_buf, " \t\n");
377     if (col_num < 7) throw (errmsg);
378     if (vs_buf[6] != "Rsq") throw (errmsg);
379     _snp_name.clear();
380     _allele1.clear();
381     _allele2.clear();
382     _impRsq.clear();
383     while (1) {
384         getline(zinf, buf);
385         stringstream ss(buf);
386         string nerr = errmsg + "\nError occurs in line: " + ss.str();
387         if (!(ss >> str_buf)) break;
388         _snp_name.push_back(str_buf);
389         if (!(ss >> c_buf)) throw (nerr);
390         _allele1.push_back(c_buf);
391         if (!(ss >> c_buf)) throw (nerr);
392         _allele2.push_back(c_buf);
393         for (i = 0; i < 4; i++) if (!(ss >> f_buf)) throw (nerr);
394         _impRsq.push_back(f_buf);
395         if (zinf.fail() || !zinf.good()) break;
396     }
397     zinf.clear();
398     zinf.close();
399     _snp_num = _snp_name.size();
400     _chr.resize(_snp_num);
401     _bp.resize(_snp_num);
402     _genet_dst.resize(_snp_num);
403     _ref_A = _allele1;
404     _other_A = _allele2;
405 
406     // Initialize _include
407     init_include();
408     cout << _snp_num << " SNPs to be included from [" + zinfofile + "]." << endl;
409 }
410 
read_imp_info_mach(string infofile)411 void gcta::read_imp_info_mach(string infofile)
412 {
413     _dosage_flag = true;
414     if(infofile.substr(infofile.length()-3,3)==".gz") throw("Error: the --dosage-mach option doesn't support gz file any more. Please check --dosage-mach-gz option.");
415 
416     int i = 0;
417     ifstream inf(infofile.c_str());
418     if (!inf.is_open()) throw ("Error: can not open the file [" + infofile + "] to read.");
419 
420     string buf, str_buf, errmsg = "Reading dosage data failed. Please check the format of the map file.";
421     string c_buf;
422     double f_buf = 0.0;
423     cout << "Reading map file of the imputed dosage data from [" + infofile + "]." << endl;
424     getline(inf, buf); // skip the header
425     vector<string> vs_buf;
426     int col_num = StrFunc::split_string(buf, vs_buf, " \t\n");
427     if (col_num < 7) throw (errmsg);
428     if (vs_buf[6] != "Rsq" && vs_buf[6] != "Rsq_hat") throw (errmsg);
429     _snp_name.clear();
430     _allele1.clear();
431     _allele2.clear();
432     _impRsq.clear();
433     while (getline(inf, buf)) {
434         stringstream ss(buf);
435         string nerr = errmsg + "\nError occurs in line: " + ss.str();
436         if (!(ss >> str_buf)) break;
437         _snp_name.push_back(str_buf);
438         if (!(ss >> c_buf)) throw (nerr);
439         _allele1.push_back(c_buf);
440         if (!(ss >> c_buf)) throw (nerr);
441         _allele2.push_back(c_buf);
442         for (i = 0; i < 3; i++) if (!(ss >> f_buf)) throw (nerr);
443         _impRsq.push_back(f_buf);
444     }
445     inf.close();
446     _snp_num = _snp_name.size();
447     _chr.resize(_snp_num);
448     _bp.resize(_snp_num);
449     _genet_dst.resize(_snp_num);
450     _ref_A = _allele1;
451     _other_A = _allele2;
452 
453     // Initialize _include
454     init_include();
455     cout << _snp_num << " SNPs to be included from [" + infofile + "]." << endl;
456 }
457 
read_imp_dose_mach_gz(string zdosefile,string kp_indi_file,string rm_indi_file,string blup_indi_file)458 void gcta::read_imp_dose_mach_gz(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) {
459     if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
460 
461     int i = 0, j = 0, k = 0, line = 0;
462     vector<int> rsnp;
463     get_rsnp(rsnp);
464 
465     gzifstream zinf;
466     zinf.open(zdosefile.c_str());
467     if (!zinf.is_open()) throw ("Error: can not open the file [" + zdosefile + "] to read.");
468 
469     vector<string> indi_ls;
470     map<string, int> kp_id_map, blup_id_map, rm_id_map;
471     bool kp_indi_flag = !kp_indi_file.empty(), blup_indi_flag = !blup_indi_file.empty(), rm_indi_flag = !rm_indi_file.empty();
472     if (kp_indi_flag) read_indi_list(kp_indi_file, indi_ls);
473     for (i = 0; i < indi_ls.size(); i++) kp_id_map.insert(pair<string, int>(indi_ls[i], i));
474     if (blup_indi_flag) read_indi_list(blup_indi_file, indi_ls);
475     for (i = 0; i < indi_ls.size(); i++) blup_id_map.insert(pair<string, int>(indi_ls[i], i));
476     if (rm_indi_flag) read_indi_list(rm_indi_file, indi_ls);
477     for (i = 0; i < indi_ls.size(); i++) rm_id_map.insert(pair<string, int>(indi_ls[i], i));
478 
479     bool missing = false;
480     string buf, str_buf, id_buf, err_msg = "Error: reading dosage data failed. Are the map file and the dosage file matched?";
481     double f_buf = 0.0;
482     vector<string> kept_id, vs_buf;
483     cout << "Reading dosage data from [" + zdosefile + "] in individual-major format (Note: may use huge RAM)." << endl;
484     _fid.clear();
485     _pid.clear();
486     _geno_dose.clear();
487 
488     vector<int> kp_it;
489     while (1) {
490         bool kp_flag = true;
491         getline(zinf, buf);
492         stringstream ss(buf);
493         if (!(ss >> str_buf)) break;
494         int ibuf = StrFunc::split_string(str_buf, vs_buf, ">");
495         if (ibuf > 1) {
496             if (vs_buf[0].empty()) throw ("Error: family ID of the individual [" + str_buf + "] is missing.");
497             else vs_buf[0].erase(vs_buf[0].end() - 1);
498         } else if (ibuf == 1) vs_buf.push_back(vs_buf[0]);
499         else break;
500         id_buf = vs_buf[0] + ":" + vs_buf[1];
501         if (kp_indi_flag && kp_id_map.find(id_buf) == kp_id_map.end()) kp_flag = false;
502         if (kp_flag && blup_indi_flag && blup_id_map.find(id_buf) == blup_id_map.end()) kp_flag = false;
503         if (kp_flag && rm_indi_flag && rm_id_map.find(id_buf) != rm_id_map.end()) kp_flag = false;
504         if (kp_flag) {
505             kp_it.push_back(1);
506             _fid.push_back(vs_buf[0]);
507             _pid.push_back(vs_buf[1]);
508             kept_id.push_back(id_buf);
509         } else kp_it.push_back(0);
510         if (zinf.fail() || !zinf.good()) break;
511     }
512     zinf.clear();
513     zinf.close();
514     cout << "(Imputed dosage data for " << kp_it.size() << " individuals detected)." << endl;
515     _indi_num = _fid.size();
516 
517     zinf.open(zdosefile.c_str());
518     _geno_dose.resize(_indi_num);
519     for (line = 0; line < _indi_num; line++) _geno_dose[line].resize(_include.size());
520     for (line = 0, k = 0; line < kp_it.size(); line++) {
521         getline(zinf, buf);
522         if (kp_it[line] == 0) continue;
523         stringstream ss(buf);
524         if (!(ss >> str_buf)) break;
525         if (!(ss >> str_buf)) break;
526         for (i = 0, j = 0; i < _snp_num; i++) {
527             ss >> str_buf;
528             f_buf = atof(str_buf.c_str());
529             if (str_buf == "X" || str_buf == "NA") {
530                 if (!missing) {
531                     cout << "Warning: missing values detected in the dosage data." << endl;
532                     missing = true;
533                 }
534                 f_buf = 1e6;
535             }
536             if (rsnp[i]) {
537                 _geno_dose[k][j] = (f_buf);
538                 j++;
539             }
540         }
541         k++;
542     }
543     zinf.clear();
544     zinf.close();
545 
546     cout << "Imputed dosage data for " << kept_id.size() << " individuals are included from [" << zdosefile << "]." << endl;
547     _fa_id.resize(_indi_num);
548     _mo_id.resize(_indi_num);
549     _sex.resize(_indi_num);
550     _pheno.resize(_indi_num);
551     for (i = 0; i < _indi_num; i++) {
552         _fa_id[i] = _mo_id[i] = "0";
553         _sex[i] = -9;
554         _pheno[i] = -9;
555     }
556 
557     // initialize keep
558     init_keep();
559     update_id_map_kp(kept_id, _id_map, _keep);
560     if (_keep.size() == 0) throw ("Error: No individual is retained for analysis.");
561 
562     if (blup_indi_flag) read_indi_blup(blup_indi_file);
563 
564     // update data
565     update_bim(rsnp);
566 }
567 
read_imp_dose_mach(string dosefile,string kp_indi_file,string rm_indi_file,string blup_indi_file)568 void gcta::read_imp_dose_mach(string dosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) {
569     if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
570     if(dosefile.substr(dosefile.length()-3,3)==".gz") throw("Error: the --dosage-mach option doesn't support gz file any more. Please check --dosage-mach-gz option.");
571 
572     int i = 0, j = 0, k = 0, line = 0;
573     vector<int> rsnp;
574     get_rsnp(rsnp);
575 
576     ifstream idose(dosefile.c_str());
577     if (!idose) throw ("Error: can not open the file [" + dosefile + "] to read.");
578 
579     vector<string> indi_ls;
580     map<string, int> kp_id_map, blup_id_map, rm_id_map;
581     bool kp_indi_flag = !kp_indi_file.empty(), blup_indi_flag = !blup_indi_file.empty(), rm_indi_flag = !rm_indi_file.empty();
582     if (kp_indi_flag) read_indi_list(kp_indi_file, indi_ls);
583     for (i = 0; i < indi_ls.size(); i++) kp_id_map.insert(pair<string, int>(indi_ls[i], i));
584     if (blup_indi_flag) read_indi_list(blup_indi_file, indi_ls);
585     for (i = 0; i < indi_ls.size(); i++) blup_id_map.insert(pair<string, int>(indi_ls[i], i));
586     if (rm_indi_flag) read_indi_list(rm_indi_file, indi_ls);
587     for (i = 0; i < indi_ls.size(); i++) rm_id_map.insert(pair<string, int>(indi_ls[i], i));
588 
589     bool missing = false;
590     string buf, str_buf, id_buf, err_msg = "Error: reading dosage data failed. Are the map file and the dosage file matched?";
591     double f_buf = 0.0;
592     vector<string> kept_id, vs_buf;
593     cout << "Reading dosage data from [" + dosefile + "] in individual-major format (Note: may use huge RAM)." << endl;
594     _fid.clear();
595     _pid.clear();
596     _geno_dose.clear();
597 
598     vector<int> kp_it;
599     while (getline(idose, buf)) {
600         bool kp_flag = true;
601         stringstream ss(buf);
602         if (!(ss >> str_buf)) break;
603         int ibuf = StrFunc::split_string(str_buf, vs_buf, ">");
604         if (ibuf > 1) {
605             if (vs_buf[0].empty()) throw ("Error: family ID of the individual [" + str_buf + "] is missing.");
606             else vs_buf[0].erase(vs_buf[0].end() - 1);
607         } else if (ibuf == 1) vs_buf.push_back(vs_buf[0]);
608         else break;
609         id_buf = vs_buf[0] + ":" + vs_buf[1];
610         if (kp_indi_flag && kp_id_map.find(id_buf) == kp_id_map.end()) kp_flag = false;
611         if (kp_flag && blup_indi_flag && blup_id_map.find(id_buf) == blup_id_map.end()) kp_flag = false;
612         if (kp_flag && rm_indi_flag && rm_id_map.find(id_buf) != rm_id_map.end()) kp_flag = false;
613         if (kp_flag) {
614             kp_it.push_back(1);
615             _fid.push_back(vs_buf[0]);
616             _pid.push_back(vs_buf[1]);
617             kept_id.push_back(id_buf);
618         } else kp_it.push_back(0);
619     }
620     idose.close();
621     cout << "(Imputed dosage data for " << kp_it.size() << " individuals detected)." << endl;
622     _indi_num = _fid.size();
623 
624     idose.open(dosefile.c_str());
625     _geno_dose.resize(_indi_num);
626     for (line = 0; line < _indi_num; line++) _geno_dose[line].resize(_include.size());
627     for (line = 0, k = 0; line < kp_it.size(); line++) {
628         getline(idose, buf);
629         if (kp_it[line] == 0) continue;
630         stringstream ss(buf);
631         if (!(ss >> str_buf)) break;
632         if (!(ss >> str_buf)) break;
633         for (i = 0, j = 0; i < _snp_num; i++) {
634             ss >> str_buf;
635             f_buf = atof(str_buf.c_str());
636             if (str_buf == "X" || str_buf == "NA") {
637                 if (!missing) {
638                     cout << "Warning: missing values detected in the dosage data." << endl;
639                     missing = true;
640                 }
641                 f_buf = 1e6;
642             }
643             if (rsnp[i]) {
644                 _geno_dose[k][j] = (f_buf);
645                 j++;
646             }
647         }
648         k++;
649     }
650     idose.close();
651 
652     cout << "Imputed dosage data for " << kept_id.size() << " individuals are included from [" << dosefile << "]." << endl;
653     _fa_id.resize(_indi_num);
654     _mo_id.resize(_indi_num);
655     _sex.resize(_indi_num);
656     _pheno.resize(_indi_num);
657     for (i = 0; i < _indi_num; i++) {
658         _fa_id[i] = _mo_id[i] = "0";
659         _sex[i] = -9;
660         _pheno[i] = -9;
661     }
662 
663     // initialize keep
664     init_keep();
665     update_id_map_kp(kept_id, _id_map, _keep);
666     if (_keep.size() == 0) throw ("Error: No individual is retained for analysis.");
667 
668     if (blup_indi_flag) read_indi_blup(blup_indi_file);
669 
670     // update data
671     update_bim(rsnp);
672 }
673 
read_imp_info_beagle(string zinfofile)674 void gcta::read_imp_info_beagle(string zinfofile) {
675     _dosage_flag = true;
676 
677     const int MAX_LINE_LENGTH = 1000;
678     char buf[MAX_LINE_LENGTH];
679     string str_buf, errmsg = "Error: Reading SNP summary information filed? Please check the format of [" + zinfofile + "].";
680 
681     string c_buf;
682     int i_buf;
683     double f_buf = 0.0;
684     gzifstream zinf;
685     zinf.open(zinfofile.c_str());
686     if (!zinf.is_open()) throw ("Error: can not open the file [" + zinfofile + "] to read.");
687     cout << "Reading summary information of the imputed SNPs (BEAGLE) ..." << endl;
688     zinf.getline(buf, MAX_LINE_LENGTH, '\n'); // skip the header
689     while (1) {
690         zinf.getline(buf, MAX_LINE_LENGTH, '\n');
691         if (zinf.fail() || !zinf.good()) break;
692         stringstream ss(buf);
693         string nerr = errmsg + "\nError line: " + ss.str();
694         if (!(ss >> i_buf)) throw (nerr);
695         _chr.push_back(i_buf);
696         if (!(ss >> str_buf)) throw (nerr);
697         _snp_name.push_back(str_buf);
698         if (!(ss >> i_buf)) throw (nerr);
699         _bp.push_back(i_buf);
700         if (!(ss >> c_buf)) throw (nerr);
701         _allele1.push_back(c_buf);
702         if (!(ss >> c_buf)) throw (nerr);
703         _allele2.push_back(c_buf);
704         if (!(ss >> str_buf)) throw (nerr);
705         if (!(ss >> str_buf)) throw (nerr);
706         if (!(ss >> str_buf)) throw (nerr);
707         if (!(ss >> f_buf)) throw (nerr);
708         if (!(ss >> f_buf)) throw (nerr);
709         if (!(ss >> f_buf)) throw (nerr);
710         _impRsq.push_back(f_buf);
711         if (!(ss >> f_buf)) throw (nerr);
712         if (!(ss >> f_buf)) throw (nerr);
713         if (ss >> f_buf) throw (nerr);
714     }
715     zinf.clear();
716     zinf.close();
717     _snp_num = _snp_name.size();
718     cout << _snp_num << " SNPs to be included from [" + zinfofile + "]." << endl;
719     _genet_dst.resize(_snp_num);
720     _ref_A = _allele1;
721     _other_A = _allele2;
722 
723     // Initialize _include
724     init_include();
725 }
726 
read_imp_dose_beagle(string zdosefile,string kp_indi_file,string rm_indi_file,string blup_indi_file)727 void gcta::read_imp_dose_beagle(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) {
728     if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
729     int i = 0, j = 0;
730     vector<int> rsnp;
731     get_rsnp(rsnp);
732 
733     const int MAX_LINE_LENGTH = 10000000;
734     char buf[MAX_LINE_LENGTH];
735     string str_buf;
736 
737     gzifstream zinf;
738     zinf.open(zdosefile.c_str());
739     if (!zinf.is_open()) throw ("Error: can not open the file [" + zdosefile + "] to read.");
740     cout << "Reading imputed dosage scores (BEAGLE output) ..." << endl;
741     zinf.getline(buf, MAX_LINE_LENGTH, '\n');
742     stringstream ss(buf);
743     for (i = 0; i < 3; i++) ss >> str_buf;
744     while (ss >> str_buf) {
745         _fid.push_back(str_buf);
746     }
747     _pid = _fid;
748     _indi_num = _fid.size();
749     _fa_id.resize(_indi_num);
750     _mo_id.resize(_indi_num);
751     _sex.resize(_indi_num);
752     _pheno.resize(_indi_num);
753     cout << _indi_num << " individuals to be included from [" + zdosefile + "]." << endl;
754     init_keep();
755     if (!kp_indi_file.empty()) keep_indi(kp_indi_file);
756     if (!blup_indi_file.empty()) read_indi_blup(blup_indi_file);
757     if (!rm_indi_file.empty()) remove_indi(rm_indi_file);
758 
759     _geno_dose.resize(_keep.size());
760     for (i = 0; i < _keep.size(); i++) _geno_dose[i].resize(_include.size());
761 
762     vector<int> rindi;
763     get_rindi(rindi);
764 
765     int line = 0;
766     int k = 0;
767     double d_buf = 0.0;
768     while (1) {
769         zinf.getline(buf, MAX_LINE_LENGTH, '\n');
770         if (zinf.fail() || !zinf.good()) break;
771         if (!rsnp[line++]) continue;
772         stringstream ss(buf);
773         ss >> str_buf;
774         if (str_buf != _snp_name[line - 1]) {
775             stringstream errmsg;
776             errmsg << "Error: the " << line << " th SNP [" + _snp_name[line - 1] + "] in the summary file doesn't match to that in the dosage file." << endl;
777             throw (errmsg.str());
778         }
779         ss >> str_buf >> str_buf;
780         for (i = 0, j = 0; i < _indi_num; i++) {
781             ss >> d_buf;
782             if (rindi[i]) {
783                 _geno_dose[j][k] = d_buf;
784                 j++;
785             }
786         }
787         k++;
788     }
789     zinf.clear();
790     zinf.close();
791 }
792 
save_plink()793 void gcta::save_plink() {
794     if (_dosage_flag) dose2bed();
795     save_famfile();
796     save_bimfile();
797     save_bedfile();
798 }
799 
save_bedfile()800 void gcta::save_bedfile() {
801     int i = 0, pos = 0, j = 0;
802     string OutBedFile = _out + ".bed";
803     fstream OutBed(OutBedFile.c_str(), ios::out | ios::binary);
804     if (!OutBed) throw ("Error: can not open the file [" + OutBedFile + "] to write.");
805     cout << "Writing genotypes to PLINK BED file [" + OutBedFile + "] ..." << endl;
806     bitset<8> b;
807     char ch[1];
808     b.reset();
809     b.set(2);
810     b.set(3);
811     b.set(5);
812     b.set(6);
813     ch[0] = (char) b.to_ulong();
814     OutBed.write(ch, 1);
815     b.reset();
816     b.set(0);
817     b.set(1);
818     b.set(3);
819     b.set(4);
820     ch[0] = (char) b.to_ulong();
821     OutBed.write(ch, 1);
822     b.reset();
823     b.set(0);
824     ch[0] = (char) b.to_ulong();
825     OutBed.write(ch, 1);
826     for (i = 0; i < _include.size(); i++) {
827         pos = 0;
828         b.reset();
829         for (j = 0; j < _keep.size(); j++) {
830             b[pos++] = (!_snp_2[_include[i]][_keep[j]]);
831             b[pos++] = (!_snp_1[_include[i]][_keep[j]]);
832             if (pos > 7 || j == _keep.size() - 1) {
833                 ch[0] = (char) b.to_ulong();
834                 OutBed.write(ch, 1);
835                 pos = 0;
836                 b.reset();
837             }
838         }
839     }
840     OutBed.close();
841 }
842 
save_famfile()843 void gcta::save_famfile() {
844     string famfile = _out + ".fam";
845     ofstream Fam(famfile.c_str());
846     if (!Fam) throw ("Error: can not open the fam file " + famfile + " to save!");
847     cout << "Writing PLINK FAM file to [" + famfile + "] ..." << endl;
848     int i = 0;
849     for (i = 0; i < _keep.size(); i++) {
850         Fam << _fid[_keep[i]] << "\t" << _pid[_keep[i]] << "\t" << _fa_id[_keep[i]] << "\t" << _mo_id[_keep[i]] << "\t" << _sex[_keep[i]] << "\t" << _pheno[_keep[i]] << endl;
851     }
852     Fam.close();
853     cout << _keep.size() << " individuals to be saved to [" + famfile + "]." << endl;
854 }
855 
save_bimfile()856 void gcta::save_bimfile() {
857     int i = 0;
858     string bimfile = _out + ".bim";
859     ofstream Bim(bimfile.c_str());
860     if (!Bim) throw ("Error: can not open the file [" + bimfile + "] to write.");
861     cout << "Writing PLINK bim file to [" + bimfile + "] ..." << endl;
862     for (i = 0; i < _include.size(); i++) {
863         Bim << _chr[_include[i]] << "\t" << _snp_name[_include[i]] << "\t" << _genet_dst[_include[i]] << "\t" << _bp[_include[i]] << "\t" << _allele1[_include[i]] << "\t" << _allele2[_include[i]] << endl;
864     }
865     Bim.close();
866     cout << _include.size() << " SNPs to be saved to [" + bimfile + "]." << endl;
867 }
868 
dose2bed()869 void gcta::dose2bed() {
870     int i = 0, j = 0;
871     double d_buf = 0.0;
872 
873     cout << "Converting dosage data into PLINK binary PED format ... " << endl;
874     _snp_1.resize(_snp_num);
875     _snp_2.resize(_snp_num);
876     for (i = 0; i < _snp_num; i++){
877         _snp_1[i].resize(_indi_num);
878         _snp_2[i].resize(_indi_num);
879     }
880     for (i = 0; i < _include.size(); i++) {
881         for (j = 0; j < _keep.size(); j++) {
882            d_buf = _geno_dose[_keep[j]][_include[i]];
883             if (d_buf > 1e5) {
884                 _snp_2[_include[i]][_keep[j]] = false;
885                 _snp_1[_include[i]][_keep[j]] = true;
886             } else if (d_buf >= 1.5) _snp_1[_include[i]][_keep[j]] = _snp_2[_include[i]][_keep[j]] = true;
887             else if (d_buf > 0.5) {
888                 _snp_2[_include[i]][_keep[j]] = true;
889                 _snp_1[_include[i]][_keep[j]] = false;
890             } else if (d_buf <= 0.5) _snp_1[_include[i]][_keep[j]] = _snp_2[_include[i]][_keep[j]] = false;
891         }
892     }
893 }
894 
update_id_map_kp(const vector<string> & id_list,map<string,int> & id_map,vector<int> & keep)895 void gcta::update_id_map_kp(const vector<string> &id_list, map<string, int> &id_map, vector<int> &keep) {
896     int i = 0;
897     map<string, int> id_map_buf(id_map);
898     for (i = 0; i < id_list.size(); i++) id_map_buf.erase(id_list[i]);
899     map<string, int>::iterator iter;
900     for (iter = id_map_buf.begin(); iter != id_map_buf.end(); iter++) id_map.erase(iter->first);
901 
902     keep.clear();
903     for (iter = id_map.begin(); iter != id_map.end(); iter++) keep.push_back(iter->second);
904     stable_sort(keep.begin(), keep.end());
905 }
906 
update_id_map_rm(const vector<string> & id_list,map<string,int> & id_map,vector<int> & keep)907 void gcta::update_id_map_rm(const vector<string> &id_list, map<string, int> &id_map, vector<int> &keep)
908 {
909     int i = 0;
910     for (i = 0; i < id_list.size(); i++) id_map.erase(id_list[i]);
911 
912     keep.clear();
913     map<string, int>::iterator iter;
914     for (iter = id_map.begin(); iter != id_map.end(); iter++) keep.push_back(iter->second);
915     stable_sort(keep.begin(), keep.end());
916 }
917 
read_snplist(string snplistfile,vector<string> & snplist,string msg)918 void gcta::read_snplist(string snplistfile, vector<string> &snplist, string msg)
919 {
920     // Read snplist file
921     snplist.clear();
922     string StrBuf;
923     ifstream i_snplist(snplistfile.c_str());
924     if (!i_snplist) throw ("Error: can not open the file [" + snplistfile + "] to read.");
925     cout << "Reading a list of " << msg << " from [" + snplistfile + "]." << endl;
926     while (i_snplist >> StrBuf) {
927         snplist.push_back(StrBuf);
928         getline(i_snplist, StrBuf);
929     }
930     i_snplist.close();
931 }
932 
extract_snp(string snplistfile)933 void gcta::extract_snp(string snplistfile)
934 {
935     vector<string> snplist;
936     read_snplist(snplistfile, snplist);
937     update_id_map_kp(snplist, _snp_name_map, _include);
938     cout << _include.size() << " SNPs are extracted from [" + snplistfile + "]." << endl;
939 }
940 
extract_single_snp(string snpname)941 void gcta::extract_single_snp(string snpname)
942 {
943     vector<string> snplist;
944     snplist.push_back(snpname);
945     update_id_map_kp(snplist, _snp_name_map, _include);
946     if (_include.empty()) throw ("Error: can not find the SNP [" + snpname + "] in the data.");
947     else cout << "Only the SNP [" + snpname + "] is included in the analysis." << endl;
948 }
949 
extract_region_snp(string snpname,int wind_size)950 void gcta::extract_region_snp(string snpname, int wind_size)
951 {
952     cout << "Extracting SNPs " << wind_size/1000 << "kb away from the SNP [" << snpname << "] in either direction ..." << endl;
953     map<string, int>::iterator iter;
954     iter = _snp_name_map.find(snpname);
955     int i = 0, j = 0;
956     vector<string> snplist;
957     if(iter==_snp_name_map.end()) throw ("Error: can not find the SNP [" + snpname + "] in the data.");
958     else{
959         int bp = _bp[iter->second];
960         int chr = _chr[iter->second];
961         for(i = 0; i < _include.size(); i++){
962             j = _include[i];
963             if(_chr[j] == chr && abs(_bp[j]-bp) <= wind_size) snplist.push_back(_snp_name[j]);
964         }
965     }
966     if(snplist.empty()) throw ("Error: on SNP found in this region.");
967     update_id_map_kp(snplist, _snp_name_map, _include);
968     cout << _include.size() << " SNPs are extracted." << endl;
969 }
970 
extract_region_bp(int chr,int bp,int wind_size)971 void gcta::extract_region_bp(int chr, int bp, int wind_size)
972 {
973     cout << "Extracting SNPs " << wind_size/1000 << "kb away from the position [chr=" << chr <<"; bp="<< bp << "] in either direction ..." << endl;
974     int i = 0, j = 0;
975     vector<string> snplist;
976     for(i = 0; i < _include.size(); i++){
977         j = _include[i];
978         if(_chr[j] == chr && abs(_bp[j]-bp) <= wind_size) snplist.push_back(_snp_name[j]);
979     }
980     if(snplist.empty()) throw ("Error: on SNP found in this region.");
981     update_id_map_kp(snplist, _snp_name_map, _include);
982     cout << _include.size() << " SNPs are extracted." << endl;
983 }
984 
exclude_snp(string snplistfile)985 void gcta::exclude_snp(string snplistfile)
986 {
987     vector<string> snplist;
988     read_snplist(snplistfile, snplist);
989     int prev_size = _include.size();
990     update_id_map_rm(snplist, _snp_name_map, _include);
991     cout << prev_size - _include.size() << " SNPs are excluded from [" + snplistfile + "] and there are " << _include.size() << " SNPs remaining." << endl;
992 }
993 
exclude_region_snp(string snpname,int wind_size)994 void gcta::exclude_region_snp(string snpname, int wind_size)
995 {
996     cout << "Excluding SNPs " << wind_size/1000 << "kb away from the SNP [" << snpname << "] in either direction ..." << endl;
997     map<string, int>::iterator iter;
998     iter = _snp_name_map.find(snpname);
999     int i = 0, j = 0;
1000     vector<string> snplist;
1001     if(iter==_snp_name_map.end()) throw ("Error: can not find the SNP [" + snpname + "] in the data.");
1002     else{
1003         int bp = _bp[iter->second];
1004         int chr = _chr[iter->second];
1005         for(i = 0; i < _include.size(); i++){
1006             j = _include[i];
1007             if(_chr[j] == chr && abs(_bp[j]-bp) <= wind_size) snplist.push_back(_snp_name[j]);
1008         }
1009     }
1010     if(snplist.empty()) throw ("Error: on SNP found in this region.");
1011     update_id_map_rm(snplist, _snp_name_map, _include);
1012     cout << _include.size() << " SNPs have been excluded." << endl;
1013 }
1014 
exclude_region_bp(int chr,int bp,int wind_size)1015 void gcta::exclude_region_bp(int chr, int bp, int wind_size)
1016 {
1017     cout << "Extracting SNPs " << wind_size/1000 << "kb away from the position [chr=" << chr <<"; bp="<< bp << "] in either direction ..." << endl;
1018     int i = 0, j = 0;
1019     vector<string> snplist;
1020     for(i = 0; i < _include.size(); i++){
1021         j = _include[i];
1022         if(_chr[j] == chr && abs(_bp[j]-bp) <= wind_size) snplist.push_back(_snp_name[j]);
1023     }
1024     if(snplist.empty()) throw ("Error: on SNP found in this region.");
1025     update_id_map_rm(snplist, _snp_name_map, _include);
1026     cout << _include.size() << " SNPs are excludeed." << endl;
1027 }
1028 
exclude_single_snp(string snpname)1029 void gcta::exclude_single_snp(string snpname)
1030 {
1031     vector<string> snplist;
1032     snplist.push_back(snpname);
1033     int include_size = _include.size();
1034     update_id_map_rm(snplist, _snp_name_map, _include);
1035     if (_include.size() == include_size) throw ("Error: can not find the SNP [" + snpname + "] in the data.");
1036     else cout << "The SNP[" + snpname + "] has been excluded from the analysis." << endl;
1037 }
1038 
extract_chr(int chr_start,int chr_end)1039 void gcta::extract_chr(int chr_start, int chr_end)
1040 {
1041     map<string, int> id_map_buf(_snp_name_map);
1042     map<string, int>::iterator iter, end = id_map_buf.end();
1043     _snp_name_map.clear();
1044     _include.clear();
1045     for (iter = id_map_buf.begin(); iter != end; iter++) {
1046         if (_chr[iter->second] >= chr_start && _chr[iter->second] <= chr_end) {
1047             _snp_name_map.insert(*iter);
1048             _include.push_back(iter->second);
1049         }
1050     }
1051     stable_sort(_include.begin(), _include.end());
1052     if (chr_start != chr_end) cout << _include.size() << " SNPs from chromosome " << chr_start << " to chromosome " << chr_end << " are included in the analysis." << endl;
1053     else cout << _include.size() << " SNPs on chromosome " << chr_start << " are included in the analysis." << endl;
1054 }
1055 
filter_snp_maf(double maf)1056 void gcta::filter_snp_maf(double maf)
1057 {
1058     if (_mu.empty()) calcu_mu();
1059 
1060     cout << "Filtering SNPs with MAF > " << maf << " ..." << endl;
1061     map<string, int> id_map_buf(_snp_name_map);
1062     map<string, int>::iterator iter, end = id_map_buf.end();
1063     int prev_size = _include.size();
1064     double fbuf = 0.0;
1065     _include.clear();
1066     _snp_name_map.clear();
1067     for (iter = id_map_buf.begin(); iter != end; iter++) {
1068         fbuf = _mu[iter->second]*0.5;
1069         if (fbuf <= maf || (1.0 - fbuf) <= maf) continue;
1070         _snp_name_map.insert(*iter);
1071         _include.push_back(iter->second);
1072     }
1073     if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
1074     else {
1075         stable_sort(_include.begin(), _include.end());
1076         cout << "After filtering SNPs with MAF > " << maf << ", there are " << _include.size() << " SNPs (" << prev_size - _include.size() << " SNPs with MAF < " << maf << ")." << endl;
1077     }
1078 }
1079 
filter_snp_max_maf(double max_maf)1080 void gcta::filter_snp_max_maf(double max_maf)
1081 {
1082     if (_mu.empty()) calcu_mu();
1083 
1084     cout << "Filtering SNPs with MAF < " << max_maf << " ..." << endl;
1085     map<string, int> id_map_buf(_snp_name_map);
1086     map<string, int>::iterator iter, end = id_map_buf.end();
1087     int prev_size = _include.size();
1088     double fbuf = 0.0;
1089     _include.clear();
1090     _snp_name_map.clear();
1091     for (iter = id_map_buf.begin(); iter != end; iter++) {
1092         fbuf = _mu[iter->second]*0.5;
1093         if (fbuf > max_maf && 1.0 - fbuf > max_maf) continue;
1094         _snp_name_map.insert(*iter);
1095         _include.push_back(iter->second);
1096     }
1097     if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
1098     else {
1099         stable_sort(_include.begin(), _include.end());
1100         cout << "After filtering SNPs with MAF < " << max_maf << ", there are " << _include.size() << " SNPs (" << prev_size - _include.size() << " SNPs with MAF > " << max_maf << ")." << endl;
1101     }
1102 }
1103 
filter_impRsq(double rsq_cutoff)1104 void gcta::filter_impRsq(double rsq_cutoff)
1105 {
1106     if (_impRsq.empty()) cout << "Warning: the option --imput-rsq is inactive because GCTA can't find the imputation quality scores for the SNPs. Use the option --update-imput-rsq to input the imputation quality scores." << endl;
1107     cout << "Filtering SNPs with imputation Rsq > " << rsq_cutoff << " ..." << endl;
1108     map<string, int> id_map_buf(_snp_name_map);
1109     map<string, int>::iterator iter, end = id_map_buf.end();
1110     int prev_size = _include.size();
1111     _include.clear();
1112     _snp_name_map.clear();
1113     for (iter = id_map_buf.begin(); iter != end; iter++) {
1114         if (_impRsq[iter->second] < rsq_cutoff) continue;
1115         _snp_name_map.insert(*iter);
1116         _include.push_back(iter->second);
1117     }
1118     if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
1119     else {
1120         stable_sort(_include.begin(), _include.end());
1121         cout << "After filtering for imputation Rsq > " << rsq_cutoff << ", there are " << _include.size() << " SNPs (" << prev_size - _include.size() << " SNPs with imputation Rsq < " << rsq_cutoff << ")." << endl;
1122     }
1123 }
1124 
read_indi_list(string indi_list_file,vector<string> & indi_list)1125 void gcta::read_indi_list(string indi_list_file, vector<string> &indi_list)
1126 {
1127     ifstream i_indi_list(indi_list_file.c_str());
1128     if (!i_indi_list) throw ("Error: can not open the file [" + indi_list_file + "] to read.");
1129     string str_buf, id_buf;
1130     indi_list.clear();
1131     while (i_indi_list) {
1132         i_indi_list >> str_buf;
1133         if (i_indi_list.eof()) break;
1134         id_buf = str_buf + ":";
1135         i_indi_list >> str_buf;
1136         id_buf += str_buf;
1137         indi_list.push_back(id_buf);
1138         getline(i_indi_list, str_buf);
1139     }
1140     i_indi_list.close();
1141 }
1142 
keep_indi(string indi_list_file)1143 void gcta::keep_indi(string indi_list_file) {
1144     vector<string> indi_list;
1145     read_indi_list(indi_list_file, indi_list);
1146     update_id_map_kp(indi_list, _id_map, _keep);
1147     cout << _keep.size() << " individuals are kept from [" + indi_list_file + "]." << endl;
1148 }
1149 
remove_indi(string indi_list_file)1150 void gcta::remove_indi(string indi_list_file) {
1151     vector<string> indi_list;
1152     read_indi_list(indi_list_file, indi_list);
1153     int prev_size = _keep.size();
1154     update_id_map_rm(indi_list, _id_map, _keep);
1155     cout << prev_size - _keep.size() << " individuals are removed from [" + indi_list_file + "] and there are " << _keep.size() << " individuals remaining." << endl;
1156 }
1157 
update_sex(string sex_file)1158 void gcta::update_sex(string sex_file) {
1159     ifstream isex(sex_file.c_str());
1160     if (!isex) throw ("Error: can not open the file [" + sex_file + "] to read.");
1161     int sex_buf = 0, icount = 0;
1162     string str_buf, fid, pid;
1163     cout << "Reading sex information from [" + sex_file + "]." << endl;
1164     map<string, int>::iterator iter, End = _id_map.end();
1165     _sex.clear();
1166     _sex.resize(_indi_num);
1167     vector<int> confirm(_indi_num);
1168     while (isex) {
1169         isex >> fid;
1170         if (isex.eof()) break;
1171         isex >> pid;
1172         isex >> str_buf;
1173         if (str_buf != "1" && str_buf != "2" && str_buf != "M" && str_buf != "F") throw ("Error: unrecognized sex code: \"" + fid + " " + pid + " " + str_buf + "\" in [" + sex_file + "].");
1174         iter = _id_map.find(fid + ":" + pid);
1175         if (iter != End) {
1176             if (str_buf == "M" || str_buf == "1") _sex[iter->second] = 1;
1177             else if (str_buf == "F" || str_buf == "2") _sex[iter->second] = 2;
1178             confirm[iter->second] = 1;
1179             icount++;
1180         }
1181         getline(isex, str_buf);
1182     }
1183     isex.close();
1184 
1185     for (int i = 0; i < _keep.size(); i++) {
1186         if (confirm[_keep[i]] != 1) throw ("Error: sex information for all of the included individuals should be updated.");
1187     }
1188     cout << "Sex information for " << icount << " individuals are update from [" + sex_file + "]." << endl;
1189 }
1190 
update_ref_A(string ref_A_file)1191 void gcta::update_ref_A(string ref_A_file) {
1192     ifstream i_ref_A(ref_A_file.c_str());
1193     if (!i_ref_A) throw ("Error: can not open the file [" + ref_A_file + "] to read.");
1194     int i = 0;
1195     string str_buf, ref_A_buf;
1196     cout << "Reading reference alleles of SNPs from [" + ref_A_file + "]." << endl;
1197     map<string, int>::iterator iter, End = _snp_name_map.end();
1198     int icount = 0;
1199     while (i_ref_A) {
1200         i_ref_A >> str_buf;
1201         if (i_ref_A.eof()) break;
1202         iter = _snp_name_map.find(str_buf);
1203         i_ref_A >> ref_A_buf;
1204         if (iter != End) {
1205             if (ref_A_buf == _allele1[iter->second]) {
1206                 _ref_A[iter->second] = _allele1[iter->second];
1207                 _other_A[iter->second] = _allele2[iter->second];
1208             } else if (ref_A_buf == _allele2[iter->second]) {
1209                 _ref_A[iter->second] = _allele2[iter->second];
1210                 _other_A[iter->second] = _allele1[iter->second];
1211             } else throw ("Error: invalid reference allele for SNP \"" + _snp_name[iter->second] + "\".");
1212             icount++;
1213         }
1214         getline(i_ref_A, str_buf);
1215     }
1216     i_ref_A.close();
1217     cout << "Reference alleles of " << icount << " SNPs are update from [" + ref_A_file + "]." << endl;
1218     if (icount != _snp_num) cout << "Warning: reference alleles of " << _snp_num - icount << " SNPs have not been updated." << endl;
1219 }
1220 
calcu_mu(bool ssq_flag)1221 void gcta::calcu_mu(bool ssq_flag) {
1222     int i = 0, j = 0;
1223 
1224     vector<double> auto_fac(_keep.size()), xfac(_keep.size()), fac(_keep.size());
1225     for (i = 0; i < _keep.size(); i++) {
1226         auto_fac[i] = 1.0;
1227         if (_sex[_keep[i]] == 1) xfac[i] = 0.5;
1228         else if (_sex[_keep[i]] == 2) xfac[i] = 1.0;
1229         fac[i] = 0.5;
1230     }
1231 
1232     cout << "Calculating allele frequencies ..." << endl;
1233     _mu.clear();
1234     _mu.resize(_snp_num);
1235 
1236     #pragma omp parallel for
1237     for (j = 0; j < _include.size(); j++) {
1238         if (_chr[_include[j]]<(_autosome_num + 1)) mu_func(j, auto_fac);
1239         else if (_chr[_include[j]] == (_autosome_num + 1)) mu_func(j, xfac);
1240         else mu_func(j, fac);
1241     }
1242 }
1243 
calcu_maf()1244 void gcta::calcu_maf()
1245 {
1246     if (_mu.empty()) calcu_mu();
1247 
1248     int i = 0, m = _include.size();
1249     _maf.resize(m);
1250     #pragma omp parallel for
1251     for(i = 0; i < m; i++){
1252         _maf[i] = 0.5*_mu[_include[i]];
1253         if(_maf[i] > 0.5) _maf[i] = 1.0 - _maf[i];
1254     }
1255 }
1256 
mu_func(int j,vector<double> & fac)1257 void gcta::mu_func(int j, vector<double> &fac) {
1258     int i = 0;
1259     double fcount = 0.0, f_buf = 0.0;
1260     if (_dosage_flag) {
1261         for (i = 0; i < _keep.size(); i++) {
1262             if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
1263                 _mu[_include[j]] += fac[i] * _geno_dose[_keep[i]][_include[j]];
1264                 fcount += fac[i];
1265             }
1266         }
1267     } else {
1268         for (i = 0; i < _keep.size(); i++) {
1269             if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1270                 f_buf = (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1271                 if (_allele2[_include[j]] == _ref_A[_include[j]]) f_buf = 2.0 - f_buf;
1272                 _mu[_include[j]] += fac[i] * f_buf;
1273                 fcount += fac[i];
1274             }
1275         }
1276     }
1277 
1278     if (fcount > 0.0)_mu[_include[j]] /= fcount;
1279 }
1280 
update_impRsq(string zinfofile)1281 void gcta::update_impRsq(string zinfofile) {
1282     ifstream iRsq(zinfofile.c_str());
1283     if (!iRsq) throw ("Error: can not open the file [" + zinfofile + "] to read.");
1284 
1285     string snp_name_buf, str_buf;
1286     double fbuf = 0.0;
1287     cout << "Reading imputation Rsq of the SNPs from [" + zinfofile + "]." << endl;
1288     _impRsq.clear();
1289     _impRsq.resize(_snp_num, 0.0);
1290     map<string, int>::iterator iter, End = _snp_name_map.end();
1291     int icount = 0;
1292     while (iRsq) {
1293         iRsq >> snp_name_buf;
1294         if (iRsq.eof()) break;
1295         iter = _snp_name_map.find(snp_name_buf);
1296         iRsq >> str_buf;
1297         fbuf = atof(str_buf.c_str());
1298         if (iter != End) {
1299             if (fbuf > 2.0 || fbuf < 0.0) throw ("Error: invalid value of imputation Rsq for the SNP " + snp_name_buf + ".");
1300             _impRsq[iter->second] = fbuf;
1301             icount++;
1302         }
1303         getline(iRsq, str_buf);
1304     }
1305     iRsq.close();
1306 
1307     cout << "Imputation Rsq of " << icount << " SNPs are update from [" + zinfofile + "]." << endl;
1308     if (icount != _snp_num) cout << "Warning: imputation Rsq of " << _snp_num - icount << " SNPs have not been updated." << endl;
1309 }
1310 
update_freq(string freq)1311 void gcta::update_freq(string freq) {
1312     ifstream ifreq(freq.c_str());
1313     if (!ifreq) throw ("Error: can not open the file [" + freq + "] to read.");
1314     int i = 0;
1315     string ref_A_buf;
1316     double fbuf = 0.0;
1317     string snp_name_buf, str_buf;
1318     cout << "Reading allele frequencies of the SNPs from [" + freq + "]." << endl;
1319     map<string, int>::iterator iter, End = _snp_name_map.end();
1320     _mu.clear();
1321     _mu.resize(_snp_num, 0.0);
1322     int icount = 0;
1323     while (ifreq) {
1324         ifreq >> snp_name_buf;
1325         if (ifreq.eof()) break;
1326         iter = _snp_name_map.find(snp_name_buf);
1327         ifreq >> ref_A_buf;
1328         ifreq >> str_buf;
1329         fbuf = atof(str_buf.c_str());
1330         if (iter != End) {
1331             if (fbuf > 1.0 || fbuf < 0.0) throw ("Error: invalid value of allele frequency for the SNP " + snp_name_buf + ".");
1332             if (ref_A_buf != _allele1[iter->second] && ref_A_buf != _allele2[iter->second]) {
1333                 throw ("Invalid allele type \"" + ref_A_buf + "\" for the SNP " + _snp_name[iter->second] + ".");
1334             }
1335             if (ref_A_buf == _ref_A[iter->second]) _mu[iter->second] = fbuf * 2.0;
1336             else _mu[iter->second] = (1.0 - fbuf)*2.0;
1337             icount++;
1338         }
1339         getline(ifreq, str_buf);
1340     }
1341     ifreq.close();
1342 
1343     cout << "Allele frequencies of " << icount << " SNPs are update from [" + freq + "]." << endl;
1344     if (icount != _snp_num) cout << "Warning: allele frequencies of " << _snp_num - icount << " SNPs have not been updated." << endl;
1345 }
1346 
save_freq(bool ssq_flag)1347 void gcta::save_freq(bool ssq_flag) {
1348     if (_mu.empty()) calcu_mu(ssq_flag);
1349     string save_freq = _out + ".freq";
1350     ofstream ofreq(save_freq.c_str());
1351     if (!ofreq) throw ("Error: can not open the file [" + save_freq + "] to write.");
1352     int i = 0;
1353     cout << "Writing allele frequencies of " << _include.size() << " SNPs to [" + save_freq + "]." << endl;
1354     for (i = 0; i < _include.size(); i++) {
1355         ofreq << _snp_name[_include[i]] << "\t" << _ref_A[_include[i]] << "\t" << setprecision(15) << _mu[_include[i]]*0.5;
1356         //        if(ssq_flag) ofreq<<"\t"<<_ssq[_include[i]]<<"\t"<<_w[_include[i]];
1357         ofreq << endl;
1358     }
1359     ofreq.close();
1360     cout << "Allele frequencies of " << _include.size() << " SNPs have been saved in the file [" + save_freq + "]." << endl;
1361 }
1362 
read_indi_blup(string blup_indi_file)1363 void gcta::read_indi_blup(string blup_indi_file) {
1364     vector< vector<string> > g_buf;
1365     ifstream i_indi_blup(blup_indi_file.c_str());
1366     if (!i_indi_blup) throw ("Error: can not open the file [" + blup_indi_file + "] to read.");
1367     string str_buf, id_buf;
1368     vector<string> id, vs_buf;
1369     int i = 0, j = 0, k = 0, col_num = 0;
1370     while (i_indi_blup) {
1371         i_indi_blup >> str_buf;
1372         if (i_indi_blup.eof()) break;
1373         id_buf = str_buf + ":";
1374         i_indi_blup >> str_buf;
1375         id_buf += str_buf;
1376         getline(i_indi_blup, str_buf);
1377         col_num = StrFunc::split_string(str_buf, vs_buf, " \t\n");
1378         if (col_num < 1) continue;
1379         id.push_back(id_buf);
1380         g_buf.push_back(vs_buf);
1381     }
1382     i_indi_blup.close();
1383 
1384     update_id_map_kp(id, _id_map, _keep);
1385     map<string, int> uni_id_map;
1386     map<string, int>::iterator iter;
1387     for (i = 0; i < _keep.size(); i++) uni_id_map.insert(pair<string, int>(_fid[_keep[i]] + ":" + _pid[_keep[i]], i));
1388     _varcmp_Py.setZero(_keep.size(), col_num / 2);
1389     for (i = 0; i < id.size(); i++) {
1390         iter = uni_id_map.find(id[i]);
1391         if (iter == uni_id_map.end()) continue;
1392         for (j = 0, k = 0; j < col_num; j += 2, k++) _varcmp_Py(iter->second, k) = atof(g_buf[i][j].c_str());
1393     }
1394     cout << "BLUP solution to the total genetic effects for " << _keep.size() << " individuals have been read from [" + blup_indi_file + "]." << endl;
1395 }
1396 
make_XMat(MatrixXf & X)1397 bool gcta::make_XMat(MatrixXf &X)
1398 {
1399     if (_mu.empty()) calcu_mu();
1400 
1401     cout << "Recoding genotypes (individual major mode) ..." << endl;
1402     bool have_mis = false;
1403     unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
1404 
1405     X.resize(0,0);
1406     X.resize(n, m);
1407     #pragma omp parallel for private(j)
1408     for (i = 0; i < n; i++) {
1409         if (_dosage_flag) {
1410             for (j = 0; j < m; j++) {
1411                 if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
1412                     if (_allele1[_include[j]] == _ref_A[_include[j]]) X(i,j) = _geno_dose[_keep[i]][_include[j]];
1413                     else X(i,j) = 2.0 - _geno_dose[_keep[i]][_include[j]];
1414                 }
1415                 else {
1416                     X(i,j) = 1e6;
1417                     have_mis = true;
1418                 }
1419             }
1420             _geno_dose[i].clear();
1421         }
1422         else {
1423             for (j = 0; j < _include.size(); j++) {
1424                 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1425                     if (_allele1[_include[j]] == _ref_A[_include[j]]) X(i,j) = _snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]];
1426                     else X(i,j) = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1427                 }
1428                 else {
1429                     X(i,j) = 1e6;
1430                     have_mis = true;
1431                 }
1432             }
1433         }
1434     }
1435     return have_mis;
1436 }
1437 
make_XMat_d(MatrixXf & X)1438 bool gcta::make_XMat_d(MatrixXf &X)
1439 {
1440     if (_mu.empty()) calcu_mu();
1441 
1442     cout << "Recoding genotypes for dominance effects (individual major mode) ..." << endl;
1443     unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
1444     bool have_mis = false;
1445 
1446     X.resize(0,0);
1447     X.resize(n, m);
1448     #pragma omp parallel for private(j)
1449     for (i = 0; i < n; i++) {
1450         if (_dosage_flag) {
1451             for (j = 0; j < m; j++) {
1452                 if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
1453                     if (_allele1[_include[j]] == _ref_A[_include[j]]) X(i,j) = _geno_dose[_keep[i]][_include[j]];
1454                     else X(i,j) = 2.0 - _geno_dose[_keep[i]][_include[j]];
1455                     if (X(i,j) < 0.5) X(i,j) = 0.0;
1456                     else if (X(i,j) < 1.5) X(i,j) = _mu[_include[j]];
1457                     else X(i,j) = (2.0 * _mu[_include[j]] - 2.0);
1458                 }
1459                 else {
1460                     X(i,j) = 1e6;
1461                     have_mis = true;
1462                 }
1463             }
1464             _geno_dose[i].clear();
1465         }
1466         else {
1467             for (j = 0; j < _include.size(); j++) {
1468                 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1469                     if (_allele1[_include[j]] == _ref_A[_include[j]]) X(i,j) = _snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]];
1470                     else X(i,j) = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1471                     if (X(i,j) < 0.5) X(i,j) = 0.0;
1472                     else if (X(i,j) < 1.5) X(i,j) = _mu[_include[j]];
1473                     else X(i,j) = (2.0 * _mu[_include[j]] - 2.0);
1474                 }
1475                 else{
1476                     X(i,j) = 1e6;
1477                     have_mis = true;
1478                 }
1479             }
1480         }
1481     }
1482     return have_mis;
1483 }
1484 
std_XMat(MatrixXf & X,eigenVector & sd_SNP,bool grm_xchr_flag,bool miss_with_mu,bool divid_by_std)1485 void gcta::std_XMat(MatrixXf &X, eigenVector &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std)
1486 {
1487     if (_mu.empty()) calcu_mu();
1488 
1489     unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
1490     sd_SNP.resize(m);
1491     if (_dosage_flag) {
1492         for (j = 0; j < m; j++)  sd_SNP[j] = (X.col(j) - VectorXf::Constant(n, _mu[_include[j]])).squaredNorm() / (n - 1.0);
1493     }
1494     else {
1495         for (j = 0; j < m; j++) sd_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
1496     }
1497     if (divid_by_std) {
1498         for (j = 0; j < m; j++) {
1499             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
1500             else sd_SNP[j] = sqrt(1.0 / sd_SNP[j]);
1501         }
1502     }
1503 
1504     #pragma omp parallel for private(j)
1505     for (i = 0; i < n; i++) {
1506         for (j = 0; j < m; j++) {
1507             if (X(i,j) < 1e5) {
1508                 X(i,j) -= _mu[_include[j]];
1509                 if (divid_by_std) X(i,j) *= sd_SNP[j];
1510             }
1511             else if (miss_with_mu) X(i,j) = 0.0;
1512         }
1513     }
1514 
1515     if (!grm_xchr_flag) return;
1516 
1517     // for the X-chromosome
1518     check_sex();
1519     double f_buf = sqrt(0.5);
1520 
1521     #pragma omp parallel for private(j)
1522     for (i = 0; i < n; i++) {
1523         if (_sex[_keep[i]] == 1) {
1524             for (j = 0; j < m; j++) {
1525                 if (X(i,j) < 1e5) X(i,j) *= f_buf;
1526                 else if (miss_with_mu) X(i,j) = 0.0;
1527             }
1528         }
1529     }
1530 }
1531 
std_XMat_d(MatrixXf & X,eigenVector & sd_SNP,bool miss_with_mu,bool divid_by_std)1532 void gcta::std_XMat_d(MatrixXf &X, eigenVector &sd_SNP, bool miss_with_mu, bool divid_by_std)
1533 {
1534     if (_mu.empty()) calcu_mu();
1535 
1536     unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
1537     sd_SNP.resize(m);
1538     if (_dosage_flag) {
1539         #pragma omp parallel for private(i)
1540         for (j = 0; j < m; j++) {
1541             for (i = 0; i < n; i++) {
1542                 double d_buf = (X(i,j) - _mu[_include[j]]);
1543                 sd_SNP[j] += d_buf*d_buf;
1544             }
1545             sd_SNP[j] /= (n - 1.0);
1546         }
1547     }
1548     else {
1549         for (j = 0; j < m; j++) sd_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
1550     }
1551     if (divid_by_std) {
1552         for (j = 0; j < m; j++) {
1553             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
1554             else sd_SNP[j] = 1.0 / sd_SNP[j];
1555         }
1556     }
1557     else {
1558         for (j = 0; j < m; j++) sd_SNP[j] = sd_SNP[j] * sd_SNP[j];
1559     }
1560     vector<double> psq(m);
1561     for (j = 0; j < m; j++) psq[j] = 0.5 * _mu[_include[j]] * _mu[_include[j]];
1562 
1563     #pragma omp parallel for private(j)
1564     for (i = 0; i < n; i++) {
1565         for (j = 0; j < m; j++) {
1566             if (X(i,j) < 1e5) {
1567                 X(i,j) -= psq[j];
1568                 if (divid_by_std) X(i,j) *= sd_SNP[j];
1569             }
1570             else if (miss_with_mu) X(i,j) = 0.0;
1571         }
1572     }
1573 }
1574 
makex_eigenVector(int j,eigenVector & x,bool resize,bool minus_2p)1575 void gcta::makex_eigenVector(int j, eigenVector &x, bool resize, bool minus_2p)
1576 {
1577     int i = 0;
1578     if (resize) x.resize(_keep.size());
1579     #pragma omp parallel for
1580     for (i = 0; i < _keep.size(); i++) {
1581         if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1582             if (_allele1[_include[j]] == _ref_A[_include[j]]) x[i] = (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1583             else x[i] = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1584         }
1585         else x[i] = _mu[_include[j]];
1586         if (minus_2p) x[i] -= _mu[_include[j]];
1587     }
1588 }
1589 
save_XMat(bool miss_with_mu,bool std)1590 void gcta::save_XMat(bool miss_with_mu, bool std)
1591 {
1592     if(std && _dosage_flag) throw("Error: the --recode-std is invalid for dosage data.");
1593     if ( (miss_with_mu || std) && _mu.empty()) calcu_mu();
1594 
1595     int i = 0, j = 0, m = _include.size();
1596     eigenVector sd_SNP;
1597     if(std){
1598         sd_SNP.resize(m);
1599         for (j = 0; j < m; j++) {
1600             sd_SNP(j) = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
1601             if (fabs(sd_SNP(j)) < 1.0e-50) sd_SNP(j) = 0.0;
1602             else sd_SNP(j) = sqrt(1.0 / sd_SNP(j));
1603         }
1604     }
1605 
1606     // Save matrix X
1607     double x_buf = 0.0;
1608     string X_zFile = _out + ".xmat.gz";
1609     gzofstream zoutf;
1610     zoutf.open(X_zFile.c_str());
1611     if (!zoutf.is_open()) throw ("Error: can not open the file [" + X_zFile + "] to write.");
1612     cout << "Saving the recoded genotype matrix to the file [" + X_zFile + "]." << endl;
1613     zoutf << "FID IID ";
1614     for (j = 0; j < _include.size(); j++) zoutf << _snp_name[_include[j]] << " ";
1615     zoutf << endl;
1616     zoutf << "Reference Allele ";
1617     for (j = 0; j < _include.size(); j++) zoutf << _ref_A[_include[j]] << " ";
1618     zoutf << endl;
1619     for (i = 0; i < _keep.size(); i++) {
1620         zoutf << _fid[_keep[i]] << ' ' << _pid[_keep[i]] << ' ';
1621         if (_dosage_flag) {
1622             for (j = 0; j < _include.size(); j++) {
1623                 if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
1624                     if (_allele1[_include[j]] == _ref_A[_include[j]]) x_buf = _geno_dose[_keep[i]][_include[j]];
1625                     else x_buf = 2.0 - _geno_dose[_keep[i]][_include[j]];
1626                     if(std) x_buf = (x_buf - _mu[_include[j]]) * sd_SNP(j);
1627                     zoutf << x_buf << ' ';
1628                 } else {
1629                     if(std) zoutf << "0 ";
1630                     else{
1631                         if (miss_with_mu) zoutf << _mu[_include[j]] << ' ';
1632                         else zoutf << "NA ";
1633                     }
1634                 }
1635             }
1636         } else {
1637             for (j = 0; j < _include.size(); j++) {
1638                 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1639                     if (_allele1[_include[j]] == _ref_A[_include[j]]) x_buf = _snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]];
1640                     else x_buf = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1641                     if(std) x_buf = (x_buf - _mu[_include[j]]) * sd_SNP(j);
1642                     zoutf << x_buf << ' ';
1643                 } else {
1644                     if(std) zoutf << "0 ";
1645                     else {
1646                         if (miss_with_mu) zoutf << _mu[_include[j]] << ' ';
1647                         else zoutf << "NA ";
1648                     }
1649                 }
1650             }
1651         }
1652         zoutf << endl;
1653     }
1654     zoutf.close();
1655     cout << "The recoded genotype matrix has been saved in the file [" + X_zFile + "] (in compressed text format)." << endl;
1656 }
1657 
make_XMat_subset(MatrixXf & X,vector<int> & snp_indx,bool divid_by_std)1658 bool gcta::make_XMat_subset(MatrixXf &X, vector<int> &snp_indx, bool divid_by_std)
1659 {
1660     if(snp_indx.empty()) return false;
1661     if (_mu.empty()) calcu_mu();
1662 
1663     int i = 0, j = 0, k = 0, n = _keep.size(), m = snp_indx.size();
1664 
1665     X.resize(n, m);
1666     #pragma omp parallel for private(j, k)
1667     for (i = 0; i < n; i++) {
1668         for (j = 0; j < m; j++) {
1669             k = _include[snp_indx[j]];
1670             if (!_snp_1[k][_keep[i]] || _snp_2[k][_keep[i]]) {
1671                 if (_allele1[k] == _ref_A[k]) X(i,j) = _snp_1[k][_keep[i]] + _snp_2[k][_keep[i]];
1672                 else X(i,j) = 2.0 - (_snp_1[k][_keep[i]] + _snp_2[k][_keep[i]]);
1673                 X(i,j) -= _mu[k];
1674             }
1675             else X(i,j) = 0.0;
1676         }
1677     }
1678 
1679     if(divid_by_std){
1680         vector<double> sd_SNP(m);
1681         for (j = 0; j < m; j++){
1682             k = _include[snp_indx[j]];
1683             sd_SNP[j] = _mu[k]*(1.0 - 0.5 * _mu[k]);
1684             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
1685             else sd_SNP[j] = sqrt(1.0 / sd_SNP[j]);
1686         }
1687         for (j = 0; j < m; j++) X.col(j) = X.col(j).array() * sd_SNP[j];
1688     }
1689 
1690     return true;
1691 }
1692 
make_XMat_d_subset(MatrixXf & X,vector<int> & snp_indx,bool divid_by_std)1693 bool gcta::make_XMat_d_subset(MatrixXf &X, vector<int> &snp_indx, bool divid_by_std)
1694 {
1695     if(snp_indx.empty()) return false;
1696     if (_mu.empty()) calcu_mu();
1697 
1698     int i = 0, j = 0, k = 0, n = _keep.size(), m = snp_indx.size();
1699 
1700     X.resize(n, m);
1701     #pragma omp parallel for private(j, k)
1702     for (i = 0; i < n; i++) {
1703         for (j = 0; j < m; j++) {
1704             k = _include[snp_indx[j]];
1705             if (!_snp_1[k][_keep[i]] || _snp_2[k][_keep[i]]) {
1706                 if (_allele1[k] == _ref_A[k]) X(i,j) = _snp_1[k][_keep[i]] + _snp_2[k][_keep[i]];
1707                 else X(i,j) = 2.0 - (_snp_1[k][_keep[i]] + _snp_2[k][_keep[i]]);
1708                 if (X(i,j) < 0.5) X(i,j) = 0.0;
1709                 else if (X(i,j) < 1.5) X(i,j) = _mu[k];
1710                 else X(i,j) = (2.0 * _mu[k] - 2.0);
1711                 X(i,j) -= 0.5 * _mu[k] * _mu[k];
1712             }
1713             else X(i,j) = 0.0;
1714         }
1715     }
1716 
1717     if(divid_by_std){
1718         vector<double> sd_SNP(m);
1719         for (j = 0; j < m; j++){
1720             k = _include[snp_indx[j]];
1721             sd_SNP[j] = _mu[k]*(1.0 - 0.5 * _mu[k]);
1722             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
1723             else sd_SNP[j] = 1.0 / sd_SNP[j];
1724         }
1725         for (j = 0; j < m; j++) X.col(j) = X.col(j).array() * sd_SNP[j];
1726     }
1727 
1728     return true;
1729 }
1730