1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementations of functions for data management
5 *
6 * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13 #include "gcta.h"
14
gcta(int autosome_num,double rm_ld_cutoff,string out)15 gcta::gcta(int autosome_num, double rm_ld_cutoff, string out)
16 {
17 _autosome_num = autosome_num;
18 _rm_ld_cutoff = rm_ld_cutoff;
19 _out = out;
20 _dosage_flag = false;
21 _grm_bin_flag = false;
22 _reml_mtd = 0;
23 _reml_inv_mtd = 0;
24 _reml_max_iter = 30;
25 _jma_actual_geno = false;
26 _jma_wind_size = 1e7;
27 _jma_p_cutoff = 5e-8;
28 _jma_collinear = 0.9;
29 _jma_Vp = 1;
30 _GC_val = 1;
31 _bivar_reml = false;
32 _ignore_Ce = false;
33 _bivar_no_constrain = false;
34 _y_Ssq = 0.0;
35 _y2_Ssq = 0.0;
36 _ncase = 0;
37 _ncase2 = 0;
38 _flag_CC = false;
39 _flag_CC2 = false;
40 _within_family = false;
41 _reml_have_bend_A = false;
42 _V_inv_mtd = 0;
43 _reml_force_inv = false;
44 _reml_AI_not_invertible = false;
45 _reml_force_converge = false;
46 _reml_no_converge = false;
47 _reml_fixed_var = false;
48 _ldscore_adj = false;
49 }
50
gcta()51 gcta::gcta() {
52 _dosage_flag = false;
53 _grm_bin_flag = false;
54 _reml_mtd = 0;
55 _reml_inv_mtd = 0;
56 _reml_max_iter = 30;
57 _jma_actual_geno = false;
58 _jma_wind_size = 1e7;
59 _jma_p_cutoff = 5e-8;
60 _jma_collinear = 0.9;
61 _jma_Vp = 1;
62 _GC_val = 1;
63 _bivar_reml = false;
64 _ignore_Ce = false;
65 _bivar_no_constrain = false;
66 _y_Ssq = 0.0;
67 _y2_Ssq = 0.0;
68 _ncase = 0;
69 _ncase2 = 0;
70 _flag_CC = false;
71 _flag_CC2 = false;
72 _within_family = false;
73 _reml_have_bend_A = false;
74 _V_inv_mtd = 0;
75 _reml_force_inv = false;
76 _reml_AI_not_invertible = false;
77 _reml_force_converge = false;
78 _reml_no_converge = false;
79 _reml_fixed_var = false;
80 _ldscore_adj = false;
81 }
82
~gcta()83 gcta::~gcta() {
84
85 }
86
read_famfile(string famfile)87 void gcta::read_famfile(string famfile) {
88 ifstream Fam(famfile.c_str());
89 if (!Fam) throw ("Error: can not open the file [" + famfile + "] to read.");
90 cout << "Reading PLINK FAM file from [" + famfile + "]." << endl;
91
92 int i = 0;
93 string str_buf;
94 _fid.clear();
95 _pid.clear();
96 _fa_id.clear();
97 _mo_id.clear();
98 _sex.clear();
99 _pheno.clear();
100 while (Fam) {
101 Fam >> str_buf;
102 if (Fam.eof()) break;
103 _fid.push_back(str_buf);
104 Fam >> str_buf;
105 _pid.push_back(str_buf);
106 Fam >> str_buf;
107 _fa_id.push_back(str_buf);
108 Fam >> str_buf;
109 _mo_id.push_back(str_buf);
110 Fam >> str_buf;
111 _sex.push_back(atoi(str_buf.c_str()));
112 Fam >> str_buf;
113 _pheno.push_back(atoi(str_buf.c_str()));
114 }
115 Fam.clear();
116 Fam.close();
117 _indi_num = _fid.size();
118 cout << _indi_num << " individuals to be included from [" + famfile + "]." << endl;
119
120 // Initialize _keep
121 init_keep();
122 }
123
init_keep()124 void gcta::init_keep() {
125 _keep.clear();
126 _keep.resize(_indi_num);
127 _id_map.clear();
128 int i = 0, size = 0;
129 for (i = 0; i < _indi_num; i++) {
130 _keep[i] = i;
131 _id_map.insert(pair<string, int>(_fid[i] + ":" + _pid[i], i));
132 if (size == _id_map.size()) throw ("Error: Duplicate individual ID found: \"" + _fid[i] + "\t" + _pid[i] + "\".");
133 size = _id_map.size();
134 }
135 }
136
read_bimfile(string bimfile)137 void gcta::read_bimfile(string bimfile) {
138 // Read bim file: recombination rate is defined between SNP i and SNP i-1
139 int ibuf = 0;
140 string cbuf = "0";
141 double dbuf = 0.0;
142 string str_buf;
143 ifstream Bim(bimfile.c_str());
144 if (!Bim) throw ("Error: can not open the file [" + bimfile + "] to read.");
145 cout << "Reading PLINK BIM file from [" + bimfile + "]." << endl;
146 _chr.clear();
147 _snp_name.clear();
148 _genet_dst.clear();
149 _bp.clear();
150 _allele1.clear();
151 _allele2.clear();
152 while (Bim) {
153 Bim >> ibuf;
154 if (Bim.eof()) break;
155 _chr.push_back(ibuf);
156 Bim >> str_buf;
157 _snp_name.push_back(str_buf);
158 Bim >> dbuf;
159 _genet_dst.push_back(dbuf);
160 Bim >> ibuf;
161 _bp.push_back(ibuf);
162 Bim >> cbuf;
163 StrFunc::to_upper(cbuf);
164 _allele1.push_back(cbuf);
165 Bim >> cbuf;
166 StrFunc::to_upper(cbuf);
167 _allele2.push_back(cbuf);
168 }
169 Bim.close();
170 _snp_num = _chr.size();
171 _ref_A = _allele1;
172 _other_A = _allele2;
173 cout << _snp_num << " SNPs to be included from [" + bimfile + "]." << endl;
174
175 // Initialize _include
176 init_include();
177 }
178
init_include()179 void gcta::init_include()
180 {
181 _include.clear();
182 _include.resize(_snp_num);
183 _snp_name_map.clear();
184 int i = 0, size = 0;
185 for (i = 0; i < _snp_num; i++) {
186 _include[i] = i;
187 if(_snp_name_map.find(_snp_name[i]) != _snp_name_map.end()){
188 cout << "Warning: Duplicated SNP ID \"" + _snp_name[i] + "\" ";
189 stringstream ss;
190 ss << _snp_name[i] << "_" << i + 1;
191 _snp_name[i] = ss.str();
192 cout<<"has been changed to \"" + _snp_name[i] + "\"\n.";
193 }
194 _snp_name_map.insert(pair<string, int>(_snp_name[i], i));
195 }
196 }
197
198 // some code are adopted from PLINK with modifications
read_bedfile(string bedfile)199 void gcta::read_bedfile(string bedfile)
200 {
201 int i = 0, j = 0, k = 0;
202
203 // Flag for reading individuals and SNPs
204 vector<int> rindi, rsnp;
205 get_rindi(rindi);
206 get_rsnp(rsnp);
207
208 if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
209 if (_keep.size() == 0) throw ("Error: No individual is retained for analysis.");
210
211 // Read bed file
212 char ch[1];
213 bitset<8> b;
214 _snp_1.resize(_include.size());
215 _snp_2.resize(_include.size());
216 for (i = 0; i < _include.size(); i++) {
217 _snp_1[i].reserve(_keep.size());
218 _snp_2[i].reserve(_keep.size());
219 }
220 fstream BIT(bedfile.c_str(), ios::in | ios::binary);
221 if (!BIT) throw ("Error: can not open the file [" + bedfile + "] to read.");
222 cout << "Reading PLINK BED file from [" + bedfile + "] in SNP-major format ..." << endl;
223 for (i = 0; i < 3; i++) BIT.read(ch, 1); // skip the first three bytes
224 int snp_indx = 0, indi_indx = 0;
225 for (j = 0, snp_indx = 0; j < _snp_num; j++) { // Read genotype in SNP-major mode, 00: homozygote AA; 11: homozygote BB; 01: hetezygote; 10: missing
226 if (!rsnp[j]) {
227 for (i = 0; i < _indi_num; i += 4) BIT.read(ch, 1);
228 continue;
229 }
230 for (i = 0, indi_indx = 0; i < _indi_num;) {
231 BIT.read(ch, 1);
232 if (!BIT) throw ("Error: problem with the BED file ... has the FAM/BIM file been changed?");
233 b = ch[0];
234 k = 0;
235 while (k < 7 && i < _indi_num) { // change code: 11 for AA; 00 for BB;
236 if (!rindi[i]) k += 2;
237 else {
238 _snp_2[snp_indx][indi_indx] = (!b[k++]);
239 _snp_1[snp_indx][indi_indx] = (!b[k++]);
240 indi_indx++;
241 }
242 i++;
243 }
244 }
245 if (snp_indx == _include.size()) break;
246 snp_indx++;
247 }
248 BIT.clear();
249 BIT.close();
250 cout << "Genotype data for " << _keep.size() << " individuals and " << _include.size() << " SNPs to be included from [" + bedfile + "]." << endl;
251
252 update_fam(rindi);
253 update_bim(rsnp);
254 }
255
get_rsnp(vector<int> & rsnp)256 void gcta::get_rsnp(vector<int> &rsnp) {
257 rsnp.clear();
258 rsnp.resize(_snp_num);
259 for (int i = 0; i < _snp_num; i++) {
260 if (_snp_name_map.find(_snp_name[i]) != _snp_name_map.end()) rsnp[i] = 1;
261 else rsnp[i] = 0;
262 }
263 }
264
get_rindi(vector<int> & rindi)265 void gcta::get_rindi(vector<int> &rindi) {
266 rindi.clear();
267 rindi.resize(_indi_num);
268 for (int i = 0; i < _indi_num; i++) {
269 if (_id_map.find(_fid[i] + ":" + _pid[i]) != _id_map.end()) rindi[i] = 1;
270 else rindi[i] = 0;
271 }
272 }
273
update_bim(vector<int> & rsnp)274 void gcta::update_bim(vector<int> &rsnp) {
275 int i = 0;
276
277 //update bim information
278 vector<int> chr_buf, bp_buf;
279 vector<string> a1_buf, a2_buf, ref_A_buf, other_A_buf;
280 vector<string> snp_name_buf;
281 vector<double> genet_dst_buf, impRsq_buf;
282 for (i = 0; i < _snp_num; i++) {
283 if (!rsnp[i]) continue;
284 chr_buf.push_back(_chr[i]);
285 snp_name_buf.push_back(_snp_name[i]);
286 genet_dst_buf.push_back(_genet_dst[i]);
287 bp_buf.push_back(_bp[i]);
288 a1_buf.push_back(_allele1[i]);
289 a2_buf.push_back(_allele2[i]);
290 ref_A_buf.push_back(_ref_A[i]);
291 other_A_buf.push_back(_other_A[i]);
292 if(_impRsq.size()>0) impRsq_buf.push_back(_impRsq[i]);
293 }
294 _chr.clear();
295 _snp_name.clear();
296 _genet_dst.clear();
297 _bp.clear();
298 _allele1.clear();
299 _allele2.clear();
300 _ref_A.clear();
301 _other_A.clear();
302 _impRsq.clear();
303 _chr = chr_buf;
304 _snp_name = snp_name_buf;
305 _genet_dst = genet_dst_buf;
306 _bp = bp_buf;
307 _allele1 = a1_buf;
308 _allele2 = a2_buf;
309 _ref_A = ref_A_buf;
310 _other_A = other_A_buf;
311 _impRsq=impRsq_buf;
312 _snp_num = _chr.size();
313 _include.clear();
314 _include.resize(_snp_num);
315 _snp_name_map.clear();
316
317 for (i = 0; i < _snp_num; i++) {
318 _include[i] = i;
319 _snp_name_map.insert(pair<string, int>(_snp_name[i], i));
320 }
321 }
322
update_fam(vector<int> & rindi)323 void gcta::update_fam(vector<int> &rindi) {
324 //update fam information
325 int i = 0;
326 vector<string> fid_buf, pid_buf, fa_id_buf, mo_id_buf;
327 vector<int> sex_buf;
328 vector<double> pheno_buf;
329 for (i = 0; i < _indi_num; i++) {
330 if (!rindi[i]) continue;
331 fid_buf.push_back(_fid[i]);
332 pid_buf.push_back(_pid[i]);
333 fa_id_buf.push_back(_fa_id[i]);
334 mo_id_buf.push_back(_mo_id[i]);
335 sex_buf.push_back(_sex[i]);
336 pheno_buf.push_back(_pheno[i]);
337 }
338 _fid.clear();
339 _pid.clear();
340 _fa_id.clear();
341 _mo_id.clear();
342 _sex.clear();
343 _pheno.clear();
344 _fid = fid_buf;
345 _pid = pid_buf;
346 _fa_id = fa_id_buf;
347 _mo_id = mo_id_buf;
348 _sex = sex_buf;
349 _pheno = pheno_buf;
350
351 _indi_num = _fid.size();
352 _keep.clear();
353 _keep.resize(_indi_num);
354 _id_map.clear();
355 for (i = 0; i < _indi_num; i++) {
356 _keep[i] = i;
357 _id_map.insert(pair<string, int>(_fid[i] + ":" + _pid[i], i));
358 }
359 }
360
read_imp_info_mach_gz(string zinfofile)361 void gcta::read_imp_info_mach_gz(string zinfofile)
362 {
363 _dosage_flag = true;
364
365 int i = 0;
366 gzifstream zinf;
367 zinf.open(zinfofile.c_str());
368 if (!zinf.is_open()) throw ("Error: can not open the file [" + zinfofile + "] to read.");
369
370 string buf, str_buf, errmsg = "Reading dosage data failed. Please check the format of the map file.";
371 string c_buf;
372 double f_buf = 0.0;
373 cout << "Reading map file of the imputed dosage data from [" + zinfofile + "]." << endl;
374 getline(zinf, buf); // skip the header
375 vector<string> vs_buf;
376 int col_num = StrFunc::split_string(buf, vs_buf, " \t\n");
377 if (col_num < 7) throw (errmsg);
378 if (vs_buf[6] != "Rsq") throw (errmsg);
379 _snp_name.clear();
380 _allele1.clear();
381 _allele2.clear();
382 _impRsq.clear();
383 while (1) {
384 getline(zinf, buf);
385 stringstream ss(buf);
386 string nerr = errmsg + "\nError occurs in line: " + ss.str();
387 if (!(ss >> str_buf)) break;
388 _snp_name.push_back(str_buf);
389 if (!(ss >> c_buf)) throw (nerr);
390 _allele1.push_back(c_buf);
391 if (!(ss >> c_buf)) throw (nerr);
392 _allele2.push_back(c_buf);
393 for (i = 0; i < 4; i++) if (!(ss >> f_buf)) throw (nerr);
394 _impRsq.push_back(f_buf);
395 if (zinf.fail() || !zinf.good()) break;
396 }
397 zinf.clear();
398 zinf.close();
399 _snp_num = _snp_name.size();
400 _chr.resize(_snp_num);
401 _bp.resize(_snp_num);
402 _genet_dst.resize(_snp_num);
403 _ref_A = _allele1;
404 _other_A = _allele2;
405
406 // Initialize _include
407 init_include();
408 cout << _snp_num << " SNPs to be included from [" + zinfofile + "]." << endl;
409 }
410
read_imp_info_mach(string infofile)411 void gcta::read_imp_info_mach(string infofile)
412 {
413 _dosage_flag = true;
414 if(infofile.substr(infofile.length()-3,3)==".gz") throw("Error: the --dosage-mach option doesn't support gz file any more. Please check --dosage-mach-gz option.");
415
416 int i = 0;
417 ifstream inf(infofile.c_str());
418 if (!inf.is_open()) throw ("Error: can not open the file [" + infofile + "] to read.");
419
420 string buf, str_buf, errmsg = "Reading dosage data failed. Please check the format of the map file.";
421 string c_buf;
422 double f_buf = 0.0;
423 cout << "Reading map file of the imputed dosage data from [" + infofile + "]." << endl;
424 getline(inf, buf); // skip the header
425 vector<string> vs_buf;
426 int col_num = StrFunc::split_string(buf, vs_buf, " \t\n");
427 if (col_num < 7) throw (errmsg);
428 if (vs_buf[6] != "Rsq" && vs_buf[6] != "Rsq_hat") throw (errmsg);
429 _snp_name.clear();
430 _allele1.clear();
431 _allele2.clear();
432 _impRsq.clear();
433 while (getline(inf, buf)) {
434 stringstream ss(buf);
435 string nerr = errmsg + "\nError occurs in line: " + ss.str();
436 if (!(ss >> str_buf)) break;
437 _snp_name.push_back(str_buf);
438 if (!(ss >> c_buf)) throw (nerr);
439 _allele1.push_back(c_buf);
440 if (!(ss >> c_buf)) throw (nerr);
441 _allele2.push_back(c_buf);
442 for (i = 0; i < 3; i++) if (!(ss >> f_buf)) throw (nerr);
443 _impRsq.push_back(f_buf);
444 }
445 inf.close();
446 _snp_num = _snp_name.size();
447 _chr.resize(_snp_num);
448 _bp.resize(_snp_num);
449 _genet_dst.resize(_snp_num);
450 _ref_A = _allele1;
451 _other_A = _allele2;
452
453 // Initialize _include
454 init_include();
455 cout << _snp_num << " SNPs to be included from [" + infofile + "]." << endl;
456 }
457
read_imp_dose_mach_gz(string zdosefile,string kp_indi_file,string rm_indi_file,string blup_indi_file)458 void gcta::read_imp_dose_mach_gz(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) {
459 if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
460
461 int i = 0, j = 0, k = 0, line = 0;
462 vector<int> rsnp;
463 get_rsnp(rsnp);
464
465 gzifstream zinf;
466 zinf.open(zdosefile.c_str());
467 if (!zinf.is_open()) throw ("Error: can not open the file [" + zdosefile + "] to read.");
468
469 vector<string> indi_ls;
470 map<string, int> kp_id_map, blup_id_map, rm_id_map;
471 bool kp_indi_flag = !kp_indi_file.empty(), blup_indi_flag = !blup_indi_file.empty(), rm_indi_flag = !rm_indi_file.empty();
472 if (kp_indi_flag) read_indi_list(kp_indi_file, indi_ls);
473 for (i = 0; i < indi_ls.size(); i++) kp_id_map.insert(pair<string, int>(indi_ls[i], i));
474 if (blup_indi_flag) read_indi_list(blup_indi_file, indi_ls);
475 for (i = 0; i < indi_ls.size(); i++) blup_id_map.insert(pair<string, int>(indi_ls[i], i));
476 if (rm_indi_flag) read_indi_list(rm_indi_file, indi_ls);
477 for (i = 0; i < indi_ls.size(); i++) rm_id_map.insert(pair<string, int>(indi_ls[i], i));
478
479 bool missing = false;
480 string buf, str_buf, id_buf, err_msg = "Error: reading dosage data failed. Are the map file and the dosage file matched?";
481 double f_buf = 0.0;
482 vector<string> kept_id, vs_buf;
483 cout << "Reading dosage data from [" + zdosefile + "] in individual-major format (Note: may use huge RAM)." << endl;
484 _fid.clear();
485 _pid.clear();
486 _geno_dose.clear();
487
488 vector<int> kp_it;
489 while (1) {
490 bool kp_flag = true;
491 getline(zinf, buf);
492 stringstream ss(buf);
493 if (!(ss >> str_buf)) break;
494 int ibuf = StrFunc::split_string(str_buf, vs_buf, ">");
495 if (ibuf > 1) {
496 if (vs_buf[0].empty()) throw ("Error: family ID of the individual [" + str_buf + "] is missing.");
497 else vs_buf[0].erase(vs_buf[0].end() - 1);
498 } else if (ibuf == 1) vs_buf.push_back(vs_buf[0]);
499 else break;
500 id_buf = vs_buf[0] + ":" + vs_buf[1];
501 if (kp_indi_flag && kp_id_map.find(id_buf) == kp_id_map.end()) kp_flag = false;
502 if (kp_flag && blup_indi_flag && blup_id_map.find(id_buf) == blup_id_map.end()) kp_flag = false;
503 if (kp_flag && rm_indi_flag && rm_id_map.find(id_buf) != rm_id_map.end()) kp_flag = false;
504 if (kp_flag) {
505 kp_it.push_back(1);
506 _fid.push_back(vs_buf[0]);
507 _pid.push_back(vs_buf[1]);
508 kept_id.push_back(id_buf);
509 } else kp_it.push_back(0);
510 if (zinf.fail() || !zinf.good()) break;
511 }
512 zinf.clear();
513 zinf.close();
514 cout << "(Imputed dosage data for " << kp_it.size() << " individuals detected)." << endl;
515 _indi_num = _fid.size();
516
517 zinf.open(zdosefile.c_str());
518 _geno_dose.resize(_indi_num);
519 for (line = 0; line < _indi_num; line++) _geno_dose[line].resize(_include.size());
520 for (line = 0, k = 0; line < kp_it.size(); line++) {
521 getline(zinf, buf);
522 if (kp_it[line] == 0) continue;
523 stringstream ss(buf);
524 if (!(ss >> str_buf)) break;
525 if (!(ss >> str_buf)) break;
526 for (i = 0, j = 0; i < _snp_num; i++) {
527 ss >> str_buf;
528 f_buf = atof(str_buf.c_str());
529 if (str_buf == "X" || str_buf == "NA") {
530 if (!missing) {
531 cout << "Warning: missing values detected in the dosage data." << endl;
532 missing = true;
533 }
534 f_buf = 1e6;
535 }
536 if (rsnp[i]) {
537 _geno_dose[k][j] = (f_buf);
538 j++;
539 }
540 }
541 k++;
542 }
543 zinf.clear();
544 zinf.close();
545
546 cout << "Imputed dosage data for " << kept_id.size() << " individuals are included from [" << zdosefile << "]." << endl;
547 _fa_id.resize(_indi_num);
548 _mo_id.resize(_indi_num);
549 _sex.resize(_indi_num);
550 _pheno.resize(_indi_num);
551 for (i = 0; i < _indi_num; i++) {
552 _fa_id[i] = _mo_id[i] = "0";
553 _sex[i] = -9;
554 _pheno[i] = -9;
555 }
556
557 // initialize keep
558 init_keep();
559 update_id_map_kp(kept_id, _id_map, _keep);
560 if (_keep.size() == 0) throw ("Error: No individual is retained for analysis.");
561
562 if (blup_indi_flag) read_indi_blup(blup_indi_file);
563
564 // update data
565 update_bim(rsnp);
566 }
567
read_imp_dose_mach(string dosefile,string kp_indi_file,string rm_indi_file,string blup_indi_file)568 void gcta::read_imp_dose_mach(string dosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) {
569 if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
570 if(dosefile.substr(dosefile.length()-3,3)==".gz") throw("Error: the --dosage-mach option doesn't support gz file any more. Please check --dosage-mach-gz option.");
571
572 int i = 0, j = 0, k = 0, line = 0;
573 vector<int> rsnp;
574 get_rsnp(rsnp);
575
576 ifstream idose(dosefile.c_str());
577 if (!idose) throw ("Error: can not open the file [" + dosefile + "] to read.");
578
579 vector<string> indi_ls;
580 map<string, int> kp_id_map, blup_id_map, rm_id_map;
581 bool kp_indi_flag = !kp_indi_file.empty(), blup_indi_flag = !blup_indi_file.empty(), rm_indi_flag = !rm_indi_file.empty();
582 if (kp_indi_flag) read_indi_list(kp_indi_file, indi_ls);
583 for (i = 0; i < indi_ls.size(); i++) kp_id_map.insert(pair<string, int>(indi_ls[i], i));
584 if (blup_indi_flag) read_indi_list(blup_indi_file, indi_ls);
585 for (i = 0; i < indi_ls.size(); i++) blup_id_map.insert(pair<string, int>(indi_ls[i], i));
586 if (rm_indi_flag) read_indi_list(rm_indi_file, indi_ls);
587 for (i = 0; i < indi_ls.size(); i++) rm_id_map.insert(pair<string, int>(indi_ls[i], i));
588
589 bool missing = false;
590 string buf, str_buf, id_buf, err_msg = "Error: reading dosage data failed. Are the map file and the dosage file matched?";
591 double f_buf = 0.0;
592 vector<string> kept_id, vs_buf;
593 cout << "Reading dosage data from [" + dosefile + "] in individual-major format (Note: may use huge RAM)." << endl;
594 _fid.clear();
595 _pid.clear();
596 _geno_dose.clear();
597
598 vector<int> kp_it;
599 while (getline(idose, buf)) {
600 bool kp_flag = true;
601 stringstream ss(buf);
602 if (!(ss >> str_buf)) break;
603 int ibuf = StrFunc::split_string(str_buf, vs_buf, ">");
604 if (ibuf > 1) {
605 if (vs_buf[0].empty()) throw ("Error: family ID of the individual [" + str_buf + "] is missing.");
606 else vs_buf[0].erase(vs_buf[0].end() - 1);
607 } else if (ibuf == 1) vs_buf.push_back(vs_buf[0]);
608 else break;
609 id_buf = vs_buf[0] + ":" + vs_buf[1];
610 if (kp_indi_flag && kp_id_map.find(id_buf) == kp_id_map.end()) kp_flag = false;
611 if (kp_flag && blup_indi_flag && blup_id_map.find(id_buf) == blup_id_map.end()) kp_flag = false;
612 if (kp_flag && rm_indi_flag && rm_id_map.find(id_buf) != rm_id_map.end()) kp_flag = false;
613 if (kp_flag) {
614 kp_it.push_back(1);
615 _fid.push_back(vs_buf[0]);
616 _pid.push_back(vs_buf[1]);
617 kept_id.push_back(id_buf);
618 } else kp_it.push_back(0);
619 }
620 idose.close();
621 cout << "(Imputed dosage data for " << kp_it.size() << " individuals detected)." << endl;
622 _indi_num = _fid.size();
623
624 idose.open(dosefile.c_str());
625 _geno_dose.resize(_indi_num);
626 for (line = 0; line < _indi_num; line++) _geno_dose[line].resize(_include.size());
627 for (line = 0, k = 0; line < kp_it.size(); line++) {
628 getline(idose, buf);
629 if (kp_it[line] == 0) continue;
630 stringstream ss(buf);
631 if (!(ss >> str_buf)) break;
632 if (!(ss >> str_buf)) break;
633 for (i = 0, j = 0; i < _snp_num; i++) {
634 ss >> str_buf;
635 f_buf = atof(str_buf.c_str());
636 if (str_buf == "X" || str_buf == "NA") {
637 if (!missing) {
638 cout << "Warning: missing values detected in the dosage data." << endl;
639 missing = true;
640 }
641 f_buf = 1e6;
642 }
643 if (rsnp[i]) {
644 _geno_dose[k][j] = (f_buf);
645 j++;
646 }
647 }
648 k++;
649 }
650 idose.close();
651
652 cout << "Imputed dosage data for " << kept_id.size() << " individuals are included from [" << dosefile << "]." << endl;
653 _fa_id.resize(_indi_num);
654 _mo_id.resize(_indi_num);
655 _sex.resize(_indi_num);
656 _pheno.resize(_indi_num);
657 for (i = 0; i < _indi_num; i++) {
658 _fa_id[i] = _mo_id[i] = "0";
659 _sex[i] = -9;
660 _pheno[i] = -9;
661 }
662
663 // initialize keep
664 init_keep();
665 update_id_map_kp(kept_id, _id_map, _keep);
666 if (_keep.size() == 0) throw ("Error: No individual is retained for analysis.");
667
668 if (blup_indi_flag) read_indi_blup(blup_indi_file);
669
670 // update data
671 update_bim(rsnp);
672 }
673
read_imp_info_beagle(string zinfofile)674 void gcta::read_imp_info_beagle(string zinfofile) {
675 _dosage_flag = true;
676
677 const int MAX_LINE_LENGTH = 1000;
678 char buf[MAX_LINE_LENGTH];
679 string str_buf, errmsg = "Error: Reading SNP summary information filed? Please check the format of [" + zinfofile + "].";
680
681 string c_buf;
682 int i_buf;
683 double f_buf = 0.0;
684 gzifstream zinf;
685 zinf.open(zinfofile.c_str());
686 if (!zinf.is_open()) throw ("Error: can not open the file [" + zinfofile + "] to read.");
687 cout << "Reading summary information of the imputed SNPs (BEAGLE) ..." << endl;
688 zinf.getline(buf, MAX_LINE_LENGTH, '\n'); // skip the header
689 while (1) {
690 zinf.getline(buf, MAX_LINE_LENGTH, '\n');
691 if (zinf.fail() || !zinf.good()) break;
692 stringstream ss(buf);
693 string nerr = errmsg + "\nError line: " + ss.str();
694 if (!(ss >> i_buf)) throw (nerr);
695 _chr.push_back(i_buf);
696 if (!(ss >> str_buf)) throw (nerr);
697 _snp_name.push_back(str_buf);
698 if (!(ss >> i_buf)) throw (nerr);
699 _bp.push_back(i_buf);
700 if (!(ss >> c_buf)) throw (nerr);
701 _allele1.push_back(c_buf);
702 if (!(ss >> c_buf)) throw (nerr);
703 _allele2.push_back(c_buf);
704 if (!(ss >> str_buf)) throw (nerr);
705 if (!(ss >> str_buf)) throw (nerr);
706 if (!(ss >> str_buf)) throw (nerr);
707 if (!(ss >> f_buf)) throw (nerr);
708 if (!(ss >> f_buf)) throw (nerr);
709 if (!(ss >> f_buf)) throw (nerr);
710 _impRsq.push_back(f_buf);
711 if (!(ss >> f_buf)) throw (nerr);
712 if (!(ss >> f_buf)) throw (nerr);
713 if (ss >> f_buf) throw (nerr);
714 }
715 zinf.clear();
716 zinf.close();
717 _snp_num = _snp_name.size();
718 cout << _snp_num << " SNPs to be included from [" + zinfofile + "]." << endl;
719 _genet_dst.resize(_snp_num);
720 _ref_A = _allele1;
721 _other_A = _allele2;
722
723 // Initialize _include
724 init_include();
725 }
726
read_imp_dose_beagle(string zdosefile,string kp_indi_file,string rm_indi_file,string blup_indi_file)727 void gcta::read_imp_dose_beagle(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) {
728 if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
729 int i = 0, j = 0;
730 vector<int> rsnp;
731 get_rsnp(rsnp);
732
733 const int MAX_LINE_LENGTH = 10000000;
734 char buf[MAX_LINE_LENGTH];
735 string str_buf;
736
737 gzifstream zinf;
738 zinf.open(zdosefile.c_str());
739 if (!zinf.is_open()) throw ("Error: can not open the file [" + zdosefile + "] to read.");
740 cout << "Reading imputed dosage scores (BEAGLE output) ..." << endl;
741 zinf.getline(buf, MAX_LINE_LENGTH, '\n');
742 stringstream ss(buf);
743 for (i = 0; i < 3; i++) ss >> str_buf;
744 while (ss >> str_buf) {
745 _fid.push_back(str_buf);
746 }
747 _pid = _fid;
748 _indi_num = _fid.size();
749 _fa_id.resize(_indi_num);
750 _mo_id.resize(_indi_num);
751 _sex.resize(_indi_num);
752 _pheno.resize(_indi_num);
753 cout << _indi_num << " individuals to be included from [" + zdosefile + "]." << endl;
754 init_keep();
755 if (!kp_indi_file.empty()) keep_indi(kp_indi_file);
756 if (!blup_indi_file.empty()) read_indi_blup(blup_indi_file);
757 if (!rm_indi_file.empty()) remove_indi(rm_indi_file);
758
759 _geno_dose.resize(_keep.size());
760 for (i = 0; i < _keep.size(); i++) _geno_dose[i].resize(_include.size());
761
762 vector<int> rindi;
763 get_rindi(rindi);
764
765 int line = 0;
766 int k = 0;
767 double d_buf = 0.0;
768 while (1) {
769 zinf.getline(buf, MAX_LINE_LENGTH, '\n');
770 if (zinf.fail() || !zinf.good()) break;
771 if (!rsnp[line++]) continue;
772 stringstream ss(buf);
773 ss >> str_buf;
774 if (str_buf != _snp_name[line - 1]) {
775 stringstream errmsg;
776 errmsg << "Error: the " << line << " th SNP [" + _snp_name[line - 1] + "] in the summary file doesn't match to that in the dosage file." << endl;
777 throw (errmsg.str());
778 }
779 ss >> str_buf >> str_buf;
780 for (i = 0, j = 0; i < _indi_num; i++) {
781 ss >> d_buf;
782 if (rindi[i]) {
783 _geno_dose[j][k] = d_buf;
784 j++;
785 }
786 }
787 k++;
788 }
789 zinf.clear();
790 zinf.close();
791 }
792
save_plink()793 void gcta::save_plink() {
794 if (_dosage_flag) dose2bed();
795 save_famfile();
796 save_bimfile();
797 save_bedfile();
798 }
799
save_bedfile()800 void gcta::save_bedfile() {
801 int i = 0, pos = 0, j = 0;
802 string OutBedFile = _out + ".bed";
803 fstream OutBed(OutBedFile.c_str(), ios::out | ios::binary);
804 if (!OutBed) throw ("Error: can not open the file [" + OutBedFile + "] to write.");
805 cout << "Writing genotypes to PLINK BED file [" + OutBedFile + "] ..." << endl;
806 bitset<8> b;
807 char ch[1];
808 b.reset();
809 b.set(2);
810 b.set(3);
811 b.set(5);
812 b.set(6);
813 ch[0] = (char) b.to_ulong();
814 OutBed.write(ch, 1);
815 b.reset();
816 b.set(0);
817 b.set(1);
818 b.set(3);
819 b.set(4);
820 ch[0] = (char) b.to_ulong();
821 OutBed.write(ch, 1);
822 b.reset();
823 b.set(0);
824 ch[0] = (char) b.to_ulong();
825 OutBed.write(ch, 1);
826 for (i = 0; i < _include.size(); i++) {
827 pos = 0;
828 b.reset();
829 for (j = 0; j < _keep.size(); j++) {
830 b[pos++] = (!_snp_2[_include[i]][_keep[j]]);
831 b[pos++] = (!_snp_1[_include[i]][_keep[j]]);
832 if (pos > 7 || j == _keep.size() - 1) {
833 ch[0] = (char) b.to_ulong();
834 OutBed.write(ch, 1);
835 pos = 0;
836 b.reset();
837 }
838 }
839 }
840 OutBed.close();
841 }
842
save_famfile()843 void gcta::save_famfile() {
844 string famfile = _out + ".fam";
845 ofstream Fam(famfile.c_str());
846 if (!Fam) throw ("Error: can not open the fam file " + famfile + " to save!");
847 cout << "Writing PLINK FAM file to [" + famfile + "] ..." << endl;
848 int i = 0;
849 for (i = 0; i < _keep.size(); i++) {
850 Fam << _fid[_keep[i]] << "\t" << _pid[_keep[i]] << "\t" << _fa_id[_keep[i]] << "\t" << _mo_id[_keep[i]] << "\t" << _sex[_keep[i]] << "\t" << _pheno[_keep[i]] << endl;
851 }
852 Fam.close();
853 cout << _keep.size() << " individuals to be saved to [" + famfile + "]." << endl;
854 }
855
save_bimfile()856 void gcta::save_bimfile() {
857 int i = 0;
858 string bimfile = _out + ".bim";
859 ofstream Bim(bimfile.c_str());
860 if (!Bim) throw ("Error: can not open the file [" + bimfile + "] to write.");
861 cout << "Writing PLINK bim file to [" + bimfile + "] ..." << endl;
862 for (i = 0; i < _include.size(); i++) {
863 Bim << _chr[_include[i]] << "\t" << _snp_name[_include[i]] << "\t" << _genet_dst[_include[i]] << "\t" << _bp[_include[i]] << "\t" << _allele1[_include[i]] << "\t" << _allele2[_include[i]] << endl;
864 }
865 Bim.close();
866 cout << _include.size() << " SNPs to be saved to [" + bimfile + "]." << endl;
867 }
868
dose2bed()869 void gcta::dose2bed() {
870 int i = 0, j = 0;
871 double d_buf = 0.0;
872
873 cout << "Converting dosage data into PLINK binary PED format ... " << endl;
874 _snp_1.resize(_snp_num);
875 _snp_2.resize(_snp_num);
876 for (i = 0; i < _snp_num; i++){
877 _snp_1[i].resize(_indi_num);
878 _snp_2[i].resize(_indi_num);
879 }
880 for (i = 0; i < _include.size(); i++) {
881 for (j = 0; j < _keep.size(); j++) {
882 d_buf = _geno_dose[_keep[j]][_include[i]];
883 if (d_buf > 1e5) {
884 _snp_2[_include[i]][_keep[j]] = false;
885 _snp_1[_include[i]][_keep[j]] = true;
886 } else if (d_buf >= 1.5) _snp_1[_include[i]][_keep[j]] = _snp_2[_include[i]][_keep[j]] = true;
887 else if (d_buf > 0.5) {
888 _snp_2[_include[i]][_keep[j]] = true;
889 _snp_1[_include[i]][_keep[j]] = false;
890 } else if (d_buf <= 0.5) _snp_1[_include[i]][_keep[j]] = _snp_2[_include[i]][_keep[j]] = false;
891 }
892 }
893 }
894
update_id_map_kp(const vector<string> & id_list,map<string,int> & id_map,vector<int> & keep)895 void gcta::update_id_map_kp(const vector<string> &id_list, map<string, int> &id_map, vector<int> &keep) {
896 int i = 0;
897 map<string, int> id_map_buf(id_map);
898 for (i = 0; i < id_list.size(); i++) id_map_buf.erase(id_list[i]);
899 map<string, int>::iterator iter;
900 for (iter = id_map_buf.begin(); iter != id_map_buf.end(); iter++) id_map.erase(iter->first);
901
902 keep.clear();
903 for (iter = id_map.begin(); iter != id_map.end(); iter++) keep.push_back(iter->second);
904 stable_sort(keep.begin(), keep.end());
905 }
906
update_id_map_rm(const vector<string> & id_list,map<string,int> & id_map,vector<int> & keep)907 void gcta::update_id_map_rm(const vector<string> &id_list, map<string, int> &id_map, vector<int> &keep)
908 {
909 int i = 0;
910 for (i = 0; i < id_list.size(); i++) id_map.erase(id_list[i]);
911
912 keep.clear();
913 map<string, int>::iterator iter;
914 for (iter = id_map.begin(); iter != id_map.end(); iter++) keep.push_back(iter->second);
915 stable_sort(keep.begin(), keep.end());
916 }
917
read_snplist(string snplistfile,vector<string> & snplist,string msg)918 void gcta::read_snplist(string snplistfile, vector<string> &snplist, string msg)
919 {
920 // Read snplist file
921 snplist.clear();
922 string StrBuf;
923 ifstream i_snplist(snplistfile.c_str());
924 if (!i_snplist) throw ("Error: can not open the file [" + snplistfile + "] to read.");
925 cout << "Reading a list of " << msg << " from [" + snplistfile + "]." << endl;
926 while (i_snplist >> StrBuf) {
927 snplist.push_back(StrBuf);
928 getline(i_snplist, StrBuf);
929 }
930 i_snplist.close();
931 }
932
extract_snp(string snplistfile)933 void gcta::extract_snp(string snplistfile)
934 {
935 vector<string> snplist;
936 read_snplist(snplistfile, snplist);
937 update_id_map_kp(snplist, _snp_name_map, _include);
938 cout << _include.size() << " SNPs are extracted from [" + snplistfile + "]." << endl;
939 }
940
extract_single_snp(string snpname)941 void gcta::extract_single_snp(string snpname)
942 {
943 vector<string> snplist;
944 snplist.push_back(snpname);
945 update_id_map_kp(snplist, _snp_name_map, _include);
946 if (_include.empty()) throw ("Error: can not find the SNP [" + snpname + "] in the data.");
947 else cout << "Only the SNP [" + snpname + "] is included in the analysis." << endl;
948 }
949
extract_region_snp(string snpname,int wind_size)950 void gcta::extract_region_snp(string snpname, int wind_size)
951 {
952 cout << "Extracting SNPs " << wind_size/1000 << "kb away from the SNP [" << snpname << "] in either direction ..." << endl;
953 map<string, int>::iterator iter;
954 iter = _snp_name_map.find(snpname);
955 int i = 0, j = 0;
956 vector<string> snplist;
957 if(iter==_snp_name_map.end()) throw ("Error: can not find the SNP [" + snpname + "] in the data.");
958 else{
959 int bp = _bp[iter->second];
960 int chr = _chr[iter->second];
961 for(i = 0; i < _include.size(); i++){
962 j = _include[i];
963 if(_chr[j] == chr && abs(_bp[j]-bp) <= wind_size) snplist.push_back(_snp_name[j]);
964 }
965 }
966 if(snplist.empty()) throw ("Error: on SNP found in this region.");
967 update_id_map_kp(snplist, _snp_name_map, _include);
968 cout << _include.size() << " SNPs are extracted." << endl;
969 }
970
extract_region_bp(int chr,int bp,int wind_size)971 void gcta::extract_region_bp(int chr, int bp, int wind_size)
972 {
973 cout << "Extracting SNPs " << wind_size/1000 << "kb away from the position [chr=" << chr <<"; bp="<< bp << "] in either direction ..." << endl;
974 int i = 0, j = 0;
975 vector<string> snplist;
976 for(i = 0; i < _include.size(); i++){
977 j = _include[i];
978 if(_chr[j] == chr && abs(_bp[j]-bp) <= wind_size) snplist.push_back(_snp_name[j]);
979 }
980 if(snplist.empty()) throw ("Error: on SNP found in this region.");
981 update_id_map_kp(snplist, _snp_name_map, _include);
982 cout << _include.size() << " SNPs are extracted." << endl;
983 }
984
exclude_snp(string snplistfile)985 void gcta::exclude_snp(string snplistfile)
986 {
987 vector<string> snplist;
988 read_snplist(snplistfile, snplist);
989 int prev_size = _include.size();
990 update_id_map_rm(snplist, _snp_name_map, _include);
991 cout << prev_size - _include.size() << " SNPs are excluded from [" + snplistfile + "] and there are " << _include.size() << " SNPs remaining." << endl;
992 }
993
exclude_region_snp(string snpname,int wind_size)994 void gcta::exclude_region_snp(string snpname, int wind_size)
995 {
996 cout << "Excluding SNPs " << wind_size/1000 << "kb away from the SNP [" << snpname << "] in either direction ..." << endl;
997 map<string, int>::iterator iter;
998 iter = _snp_name_map.find(snpname);
999 int i = 0, j = 0;
1000 vector<string> snplist;
1001 if(iter==_snp_name_map.end()) throw ("Error: can not find the SNP [" + snpname + "] in the data.");
1002 else{
1003 int bp = _bp[iter->second];
1004 int chr = _chr[iter->second];
1005 for(i = 0; i < _include.size(); i++){
1006 j = _include[i];
1007 if(_chr[j] == chr && abs(_bp[j]-bp) <= wind_size) snplist.push_back(_snp_name[j]);
1008 }
1009 }
1010 if(snplist.empty()) throw ("Error: on SNP found in this region.");
1011 update_id_map_rm(snplist, _snp_name_map, _include);
1012 cout << _include.size() << " SNPs have been excluded." << endl;
1013 }
1014
exclude_region_bp(int chr,int bp,int wind_size)1015 void gcta::exclude_region_bp(int chr, int bp, int wind_size)
1016 {
1017 cout << "Extracting SNPs " << wind_size/1000 << "kb away from the position [chr=" << chr <<"; bp="<< bp << "] in either direction ..." << endl;
1018 int i = 0, j = 0;
1019 vector<string> snplist;
1020 for(i = 0; i < _include.size(); i++){
1021 j = _include[i];
1022 if(_chr[j] == chr && abs(_bp[j]-bp) <= wind_size) snplist.push_back(_snp_name[j]);
1023 }
1024 if(snplist.empty()) throw ("Error: on SNP found in this region.");
1025 update_id_map_rm(snplist, _snp_name_map, _include);
1026 cout << _include.size() << " SNPs are excludeed." << endl;
1027 }
1028
exclude_single_snp(string snpname)1029 void gcta::exclude_single_snp(string snpname)
1030 {
1031 vector<string> snplist;
1032 snplist.push_back(snpname);
1033 int include_size = _include.size();
1034 update_id_map_rm(snplist, _snp_name_map, _include);
1035 if (_include.size() == include_size) throw ("Error: can not find the SNP [" + snpname + "] in the data.");
1036 else cout << "The SNP[" + snpname + "] has been excluded from the analysis." << endl;
1037 }
1038
extract_chr(int chr_start,int chr_end)1039 void gcta::extract_chr(int chr_start, int chr_end)
1040 {
1041 map<string, int> id_map_buf(_snp_name_map);
1042 map<string, int>::iterator iter, end = id_map_buf.end();
1043 _snp_name_map.clear();
1044 _include.clear();
1045 for (iter = id_map_buf.begin(); iter != end; iter++) {
1046 if (_chr[iter->second] >= chr_start && _chr[iter->second] <= chr_end) {
1047 _snp_name_map.insert(*iter);
1048 _include.push_back(iter->second);
1049 }
1050 }
1051 stable_sort(_include.begin(), _include.end());
1052 if (chr_start != chr_end) cout << _include.size() << " SNPs from chromosome " << chr_start << " to chromosome " << chr_end << " are included in the analysis." << endl;
1053 else cout << _include.size() << " SNPs on chromosome " << chr_start << " are included in the analysis." << endl;
1054 }
1055
filter_snp_maf(double maf)1056 void gcta::filter_snp_maf(double maf)
1057 {
1058 if (_mu.empty()) calcu_mu();
1059
1060 cout << "Filtering SNPs with MAF > " << maf << " ..." << endl;
1061 map<string, int> id_map_buf(_snp_name_map);
1062 map<string, int>::iterator iter, end = id_map_buf.end();
1063 int prev_size = _include.size();
1064 double fbuf = 0.0;
1065 _include.clear();
1066 _snp_name_map.clear();
1067 for (iter = id_map_buf.begin(); iter != end; iter++) {
1068 fbuf = _mu[iter->second]*0.5;
1069 if (fbuf <= maf || (1.0 - fbuf) <= maf) continue;
1070 _snp_name_map.insert(*iter);
1071 _include.push_back(iter->second);
1072 }
1073 if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
1074 else {
1075 stable_sort(_include.begin(), _include.end());
1076 cout << "After filtering SNPs with MAF > " << maf << ", there are " << _include.size() << " SNPs (" << prev_size - _include.size() << " SNPs with MAF < " << maf << ")." << endl;
1077 }
1078 }
1079
filter_snp_max_maf(double max_maf)1080 void gcta::filter_snp_max_maf(double max_maf)
1081 {
1082 if (_mu.empty()) calcu_mu();
1083
1084 cout << "Filtering SNPs with MAF < " << max_maf << " ..." << endl;
1085 map<string, int> id_map_buf(_snp_name_map);
1086 map<string, int>::iterator iter, end = id_map_buf.end();
1087 int prev_size = _include.size();
1088 double fbuf = 0.0;
1089 _include.clear();
1090 _snp_name_map.clear();
1091 for (iter = id_map_buf.begin(); iter != end; iter++) {
1092 fbuf = _mu[iter->second]*0.5;
1093 if (fbuf > max_maf && 1.0 - fbuf > max_maf) continue;
1094 _snp_name_map.insert(*iter);
1095 _include.push_back(iter->second);
1096 }
1097 if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
1098 else {
1099 stable_sort(_include.begin(), _include.end());
1100 cout << "After filtering SNPs with MAF < " << max_maf << ", there are " << _include.size() << " SNPs (" << prev_size - _include.size() << " SNPs with MAF > " << max_maf << ")." << endl;
1101 }
1102 }
1103
filter_impRsq(double rsq_cutoff)1104 void gcta::filter_impRsq(double rsq_cutoff)
1105 {
1106 if (_impRsq.empty()) cout << "Warning: the option --imput-rsq is inactive because GCTA can't find the imputation quality scores for the SNPs. Use the option --update-imput-rsq to input the imputation quality scores." << endl;
1107 cout << "Filtering SNPs with imputation Rsq > " << rsq_cutoff << " ..." << endl;
1108 map<string, int> id_map_buf(_snp_name_map);
1109 map<string, int>::iterator iter, end = id_map_buf.end();
1110 int prev_size = _include.size();
1111 _include.clear();
1112 _snp_name_map.clear();
1113 for (iter = id_map_buf.begin(); iter != end; iter++) {
1114 if (_impRsq[iter->second] < rsq_cutoff) continue;
1115 _snp_name_map.insert(*iter);
1116 _include.push_back(iter->second);
1117 }
1118 if (_include.size() == 0) throw ("Error: No SNP is retained for analysis.");
1119 else {
1120 stable_sort(_include.begin(), _include.end());
1121 cout << "After filtering for imputation Rsq > " << rsq_cutoff << ", there are " << _include.size() << " SNPs (" << prev_size - _include.size() << " SNPs with imputation Rsq < " << rsq_cutoff << ")." << endl;
1122 }
1123 }
1124
read_indi_list(string indi_list_file,vector<string> & indi_list)1125 void gcta::read_indi_list(string indi_list_file, vector<string> &indi_list)
1126 {
1127 ifstream i_indi_list(indi_list_file.c_str());
1128 if (!i_indi_list) throw ("Error: can not open the file [" + indi_list_file + "] to read.");
1129 string str_buf, id_buf;
1130 indi_list.clear();
1131 while (i_indi_list) {
1132 i_indi_list >> str_buf;
1133 if (i_indi_list.eof()) break;
1134 id_buf = str_buf + ":";
1135 i_indi_list >> str_buf;
1136 id_buf += str_buf;
1137 indi_list.push_back(id_buf);
1138 getline(i_indi_list, str_buf);
1139 }
1140 i_indi_list.close();
1141 }
1142
keep_indi(string indi_list_file)1143 void gcta::keep_indi(string indi_list_file) {
1144 vector<string> indi_list;
1145 read_indi_list(indi_list_file, indi_list);
1146 update_id_map_kp(indi_list, _id_map, _keep);
1147 cout << _keep.size() << " individuals are kept from [" + indi_list_file + "]." << endl;
1148 }
1149
remove_indi(string indi_list_file)1150 void gcta::remove_indi(string indi_list_file) {
1151 vector<string> indi_list;
1152 read_indi_list(indi_list_file, indi_list);
1153 int prev_size = _keep.size();
1154 update_id_map_rm(indi_list, _id_map, _keep);
1155 cout << prev_size - _keep.size() << " individuals are removed from [" + indi_list_file + "] and there are " << _keep.size() << " individuals remaining." << endl;
1156 }
1157
update_sex(string sex_file)1158 void gcta::update_sex(string sex_file) {
1159 ifstream isex(sex_file.c_str());
1160 if (!isex) throw ("Error: can not open the file [" + sex_file + "] to read.");
1161 int sex_buf = 0, icount = 0;
1162 string str_buf, fid, pid;
1163 cout << "Reading sex information from [" + sex_file + "]." << endl;
1164 map<string, int>::iterator iter, End = _id_map.end();
1165 _sex.clear();
1166 _sex.resize(_indi_num);
1167 vector<int> confirm(_indi_num);
1168 while (isex) {
1169 isex >> fid;
1170 if (isex.eof()) break;
1171 isex >> pid;
1172 isex >> str_buf;
1173 if (str_buf != "1" && str_buf != "2" && str_buf != "M" && str_buf != "F") throw ("Error: unrecognized sex code: \"" + fid + " " + pid + " " + str_buf + "\" in [" + sex_file + "].");
1174 iter = _id_map.find(fid + ":" + pid);
1175 if (iter != End) {
1176 if (str_buf == "M" || str_buf == "1") _sex[iter->second] = 1;
1177 else if (str_buf == "F" || str_buf == "2") _sex[iter->second] = 2;
1178 confirm[iter->second] = 1;
1179 icount++;
1180 }
1181 getline(isex, str_buf);
1182 }
1183 isex.close();
1184
1185 for (int i = 0; i < _keep.size(); i++) {
1186 if (confirm[_keep[i]] != 1) throw ("Error: sex information for all of the included individuals should be updated.");
1187 }
1188 cout << "Sex information for " << icount << " individuals are update from [" + sex_file + "]." << endl;
1189 }
1190
update_ref_A(string ref_A_file)1191 void gcta::update_ref_A(string ref_A_file) {
1192 ifstream i_ref_A(ref_A_file.c_str());
1193 if (!i_ref_A) throw ("Error: can not open the file [" + ref_A_file + "] to read.");
1194 int i = 0;
1195 string str_buf, ref_A_buf;
1196 cout << "Reading reference alleles of SNPs from [" + ref_A_file + "]." << endl;
1197 map<string, int>::iterator iter, End = _snp_name_map.end();
1198 int icount = 0;
1199 while (i_ref_A) {
1200 i_ref_A >> str_buf;
1201 if (i_ref_A.eof()) break;
1202 iter = _snp_name_map.find(str_buf);
1203 i_ref_A >> ref_A_buf;
1204 if (iter != End) {
1205 if (ref_A_buf == _allele1[iter->second]) {
1206 _ref_A[iter->second] = _allele1[iter->second];
1207 _other_A[iter->second] = _allele2[iter->second];
1208 } else if (ref_A_buf == _allele2[iter->second]) {
1209 _ref_A[iter->second] = _allele2[iter->second];
1210 _other_A[iter->second] = _allele1[iter->second];
1211 } else throw ("Error: invalid reference allele for SNP \"" + _snp_name[iter->second] + "\".");
1212 icount++;
1213 }
1214 getline(i_ref_A, str_buf);
1215 }
1216 i_ref_A.close();
1217 cout << "Reference alleles of " << icount << " SNPs are update from [" + ref_A_file + "]." << endl;
1218 if (icount != _snp_num) cout << "Warning: reference alleles of " << _snp_num - icount << " SNPs have not been updated." << endl;
1219 }
1220
calcu_mu(bool ssq_flag)1221 void gcta::calcu_mu(bool ssq_flag) {
1222 int i = 0, j = 0;
1223
1224 vector<double> auto_fac(_keep.size()), xfac(_keep.size()), fac(_keep.size());
1225 for (i = 0; i < _keep.size(); i++) {
1226 auto_fac[i] = 1.0;
1227 if (_sex[_keep[i]] == 1) xfac[i] = 0.5;
1228 else if (_sex[_keep[i]] == 2) xfac[i] = 1.0;
1229 fac[i] = 0.5;
1230 }
1231
1232 cout << "Calculating allele frequencies ..." << endl;
1233 _mu.clear();
1234 _mu.resize(_snp_num);
1235
1236 #pragma omp parallel for
1237 for (j = 0; j < _include.size(); j++) {
1238 if (_chr[_include[j]]<(_autosome_num + 1)) mu_func(j, auto_fac);
1239 else if (_chr[_include[j]] == (_autosome_num + 1)) mu_func(j, xfac);
1240 else mu_func(j, fac);
1241 }
1242 }
1243
calcu_maf()1244 void gcta::calcu_maf()
1245 {
1246 if (_mu.empty()) calcu_mu();
1247
1248 int i = 0, m = _include.size();
1249 _maf.resize(m);
1250 #pragma omp parallel for
1251 for(i = 0; i < m; i++){
1252 _maf[i] = 0.5*_mu[_include[i]];
1253 if(_maf[i] > 0.5) _maf[i] = 1.0 - _maf[i];
1254 }
1255 }
1256
mu_func(int j,vector<double> & fac)1257 void gcta::mu_func(int j, vector<double> &fac) {
1258 int i = 0;
1259 double fcount = 0.0, f_buf = 0.0;
1260 if (_dosage_flag) {
1261 for (i = 0; i < _keep.size(); i++) {
1262 if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
1263 _mu[_include[j]] += fac[i] * _geno_dose[_keep[i]][_include[j]];
1264 fcount += fac[i];
1265 }
1266 }
1267 } else {
1268 for (i = 0; i < _keep.size(); i++) {
1269 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1270 f_buf = (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1271 if (_allele2[_include[j]] == _ref_A[_include[j]]) f_buf = 2.0 - f_buf;
1272 _mu[_include[j]] += fac[i] * f_buf;
1273 fcount += fac[i];
1274 }
1275 }
1276 }
1277
1278 if (fcount > 0.0)_mu[_include[j]] /= fcount;
1279 }
1280
update_impRsq(string zinfofile)1281 void gcta::update_impRsq(string zinfofile) {
1282 ifstream iRsq(zinfofile.c_str());
1283 if (!iRsq) throw ("Error: can not open the file [" + zinfofile + "] to read.");
1284
1285 string snp_name_buf, str_buf;
1286 double fbuf = 0.0;
1287 cout << "Reading imputation Rsq of the SNPs from [" + zinfofile + "]." << endl;
1288 _impRsq.clear();
1289 _impRsq.resize(_snp_num, 0.0);
1290 map<string, int>::iterator iter, End = _snp_name_map.end();
1291 int icount = 0;
1292 while (iRsq) {
1293 iRsq >> snp_name_buf;
1294 if (iRsq.eof()) break;
1295 iter = _snp_name_map.find(snp_name_buf);
1296 iRsq >> str_buf;
1297 fbuf = atof(str_buf.c_str());
1298 if (iter != End) {
1299 if (fbuf > 2.0 || fbuf < 0.0) throw ("Error: invalid value of imputation Rsq for the SNP " + snp_name_buf + ".");
1300 _impRsq[iter->second] = fbuf;
1301 icount++;
1302 }
1303 getline(iRsq, str_buf);
1304 }
1305 iRsq.close();
1306
1307 cout << "Imputation Rsq of " << icount << " SNPs are update from [" + zinfofile + "]." << endl;
1308 if (icount != _snp_num) cout << "Warning: imputation Rsq of " << _snp_num - icount << " SNPs have not been updated." << endl;
1309 }
1310
update_freq(string freq)1311 void gcta::update_freq(string freq) {
1312 ifstream ifreq(freq.c_str());
1313 if (!ifreq) throw ("Error: can not open the file [" + freq + "] to read.");
1314 int i = 0;
1315 string ref_A_buf;
1316 double fbuf = 0.0;
1317 string snp_name_buf, str_buf;
1318 cout << "Reading allele frequencies of the SNPs from [" + freq + "]." << endl;
1319 map<string, int>::iterator iter, End = _snp_name_map.end();
1320 _mu.clear();
1321 _mu.resize(_snp_num, 0.0);
1322 int icount = 0;
1323 while (ifreq) {
1324 ifreq >> snp_name_buf;
1325 if (ifreq.eof()) break;
1326 iter = _snp_name_map.find(snp_name_buf);
1327 ifreq >> ref_A_buf;
1328 ifreq >> str_buf;
1329 fbuf = atof(str_buf.c_str());
1330 if (iter != End) {
1331 if (fbuf > 1.0 || fbuf < 0.0) throw ("Error: invalid value of allele frequency for the SNP " + snp_name_buf + ".");
1332 if (ref_A_buf != _allele1[iter->second] && ref_A_buf != _allele2[iter->second]) {
1333 throw ("Invalid allele type \"" + ref_A_buf + "\" for the SNP " + _snp_name[iter->second] + ".");
1334 }
1335 if (ref_A_buf == _ref_A[iter->second]) _mu[iter->second] = fbuf * 2.0;
1336 else _mu[iter->second] = (1.0 - fbuf)*2.0;
1337 icount++;
1338 }
1339 getline(ifreq, str_buf);
1340 }
1341 ifreq.close();
1342
1343 cout << "Allele frequencies of " << icount << " SNPs are update from [" + freq + "]." << endl;
1344 if (icount != _snp_num) cout << "Warning: allele frequencies of " << _snp_num - icount << " SNPs have not been updated." << endl;
1345 }
1346
save_freq(bool ssq_flag)1347 void gcta::save_freq(bool ssq_flag) {
1348 if (_mu.empty()) calcu_mu(ssq_flag);
1349 string save_freq = _out + ".freq";
1350 ofstream ofreq(save_freq.c_str());
1351 if (!ofreq) throw ("Error: can not open the file [" + save_freq + "] to write.");
1352 int i = 0;
1353 cout << "Writing allele frequencies of " << _include.size() << " SNPs to [" + save_freq + "]." << endl;
1354 for (i = 0; i < _include.size(); i++) {
1355 ofreq << _snp_name[_include[i]] << "\t" << _ref_A[_include[i]] << "\t" << setprecision(15) << _mu[_include[i]]*0.5;
1356 // if(ssq_flag) ofreq<<"\t"<<_ssq[_include[i]]<<"\t"<<_w[_include[i]];
1357 ofreq << endl;
1358 }
1359 ofreq.close();
1360 cout << "Allele frequencies of " << _include.size() << " SNPs have been saved in the file [" + save_freq + "]." << endl;
1361 }
1362
read_indi_blup(string blup_indi_file)1363 void gcta::read_indi_blup(string blup_indi_file) {
1364 vector< vector<string> > g_buf;
1365 ifstream i_indi_blup(blup_indi_file.c_str());
1366 if (!i_indi_blup) throw ("Error: can not open the file [" + blup_indi_file + "] to read.");
1367 string str_buf, id_buf;
1368 vector<string> id, vs_buf;
1369 int i = 0, j = 0, k = 0, col_num = 0;
1370 while (i_indi_blup) {
1371 i_indi_blup >> str_buf;
1372 if (i_indi_blup.eof()) break;
1373 id_buf = str_buf + ":";
1374 i_indi_blup >> str_buf;
1375 id_buf += str_buf;
1376 getline(i_indi_blup, str_buf);
1377 col_num = StrFunc::split_string(str_buf, vs_buf, " \t\n");
1378 if (col_num < 1) continue;
1379 id.push_back(id_buf);
1380 g_buf.push_back(vs_buf);
1381 }
1382 i_indi_blup.close();
1383
1384 update_id_map_kp(id, _id_map, _keep);
1385 map<string, int> uni_id_map;
1386 map<string, int>::iterator iter;
1387 for (i = 0; i < _keep.size(); i++) uni_id_map.insert(pair<string, int>(_fid[_keep[i]] + ":" + _pid[_keep[i]], i));
1388 _varcmp_Py.setZero(_keep.size(), col_num / 2);
1389 for (i = 0; i < id.size(); i++) {
1390 iter = uni_id_map.find(id[i]);
1391 if (iter == uni_id_map.end()) continue;
1392 for (j = 0, k = 0; j < col_num; j += 2, k++) _varcmp_Py(iter->second, k) = atof(g_buf[i][j].c_str());
1393 }
1394 cout << "BLUP solution to the total genetic effects for " << _keep.size() << " individuals have been read from [" + blup_indi_file + "]." << endl;
1395 }
1396
make_XMat(MatrixXf & X)1397 bool gcta::make_XMat(MatrixXf &X)
1398 {
1399 if (_mu.empty()) calcu_mu();
1400
1401 cout << "Recoding genotypes (individual major mode) ..." << endl;
1402 bool have_mis = false;
1403 unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
1404
1405 X.resize(0,0);
1406 X.resize(n, m);
1407 #pragma omp parallel for private(j)
1408 for (i = 0; i < n; i++) {
1409 if (_dosage_flag) {
1410 for (j = 0; j < m; j++) {
1411 if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
1412 if (_allele1[_include[j]] == _ref_A[_include[j]]) X(i,j) = _geno_dose[_keep[i]][_include[j]];
1413 else X(i,j) = 2.0 - _geno_dose[_keep[i]][_include[j]];
1414 }
1415 else {
1416 X(i,j) = 1e6;
1417 have_mis = true;
1418 }
1419 }
1420 _geno_dose[i].clear();
1421 }
1422 else {
1423 for (j = 0; j < _include.size(); j++) {
1424 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1425 if (_allele1[_include[j]] == _ref_A[_include[j]]) X(i,j) = _snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]];
1426 else X(i,j) = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1427 }
1428 else {
1429 X(i,j) = 1e6;
1430 have_mis = true;
1431 }
1432 }
1433 }
1434 }
1435 return have_mis;
1436 }
1437
make_XMat_d(MatrixXf & X)1438 bool gcta::make_XMat_d(MatrixXf &X)
1439 {
1440 if (_mu.empty()) calcu_mu();
1441
1442 cout << "Recoding genotypes for dominance effects (individual major mode) ..." << endl;
1443 unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
1444 bool have_mis = false;
1445
1446 X.resize(0,0);
1447 X.resize(n, m);
1448 #pragma omp parallel for private(j)
1449 for (i = 0; i < n; i++) {
1450 if (_dosage_flag) {
1451 for (j = 0; j < m; j++) {
1452 if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
1453 if (_allele1[_include[j]] == _ref_A[_include[j]]) X(i,j) = _geno_dose[_keep[i]][_include[j]];
1454 else X(i,j) = 2.0 - _geno_dose[_keep[i]][_include[j]];
1455 if (X(i,j) < 0.5) X(i,j) = 0.0;
1456 else if (X(i,j) < 1.5) X(i,j) = _mu[_include[j]];
1457 else X(i,j) = (2.0 * _mu[_include[j]] - 2.0);
1458 }
1459 else {
1460 X(i,j) = 1e6;
1461 have_mis = true;
1462 }
1463 }
1464 _geno_dose[i].clear();
1465 }
1466 else {
1467 for (j = 0; j < _include.size(); j++) {
1468 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1469 if (_allele1[_include[j]] == _ref_A[_include[j]]) X(i,j) = _snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]];
1470 else X(i,j) = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1471 if (X(i,j) < 0.5) X(i,j) = 0.0;
1472 else if (X(i,j) < 1.5) X(i,j) = _mu[_include[j]];
1473 else X(i,j) = (2.0 * _mu[_include[j]] - 2.0);
1474 }
1475 else{
1476 X(i,j) = 1e6;
1477 have_mis = true;
1478 }
1479 }
1480 }
1481 }
1482 return have_mis;
1483 }
1484
std_XMat(MatrixXf & X,eigenVector & sd_SNP,bool grm_xchr_flag,bool miss_with_mu,bool divid_by_std)1485 void gcta::std_XMat(MatrixXf &X, eigenVector &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std)
1486 {
1487 if (_mu.empty()) calcu_mu();
1488
1489 unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
1490 sd_SNP.resize(m);
1491 if (_dosage_flag) {
1492 for (j = 0; j < m; j++) sd_SNP[j] = (X.col(j) - VectorXf::Constant(n, _mu[_include[j]])).squaredNorm() / (n - 1.0);
1493 }
1494 else {
1495 for (j = 0; j < m; j++) sd_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
1496 }
1497 if (divid_by_std) {
1498 for (j = 0; j < m; j++) {
1499 if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
1500 else sd_SNP[j] = sqrt(1.0 / sd_SNP[j]);
1501 }
1502 }
1503
1504 #pragma omp parallel for private(j)
1505 for (i = 0; i < n; i++) {
1506 for (j = 0; j < m; j++) {
1507 if (X(i,j) < 1e5) {
1508 X(i,j) -= _mu[_include[j]];
1509 if (divid_by_std) X(i,j) *= sd_SNP[j];
1510 }
1511 else if (miss_with_mu) X(i,j) = 0.0;
1512 }
1513 }
1514
1515 if (!grm_xchr_flag) return;
1516
1517 // for the X-chromosome
1518 check_sex();
1519 double f_buf = sqrt(0.5);
1520
1521 #pragma omp parallel for private(j)
1522 for (i = 0; i < n; i++) {
1523 if (_sex[_keep[i]] == 1) {
1524 for (j = 0; j < m; j++) {
1525 if (X(i,j) < 1e5) X(i,j) *= f_buf;
1526 else if (miss_with_mu) X(i,j) = 0.0;
1527 }
1528 }
1529 }
1530 }
1531
std_XMat_d(MatrixXf & X,eigenVector & sd_SNP,bool miss_with_mu,bool divid_by_std)1532 void gcta::std_XMat_d(MatrixXf &X, eigenVector &sd_SNP, bool miss_with_mu, bool divid_by_std)
1533 {
1534 if (_mu.empty()) calcu_mu();
1535
1536 unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
1537 sd_SNP.resize(m);
1538 if (_dosage_flag) {
1539 #pragma omp parallel for private(i)
1540 for (j = 0; j < m; j++) {
1541 for (i = 0; i < n; i++) {
1542 double d_buf = (X(i,j) - _mu[_include[j]]);
1543 sd_SNP[j] += d_buf*d_buf;
1544 }
1545 sd_SNP[j] /= (n - 1.0);
1546 }
1547 }
1548 else {
1549 for (j = 0; j < m; j++) sd_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
1550 }
1551 if (divid_by_std) {
1552 for (j = 0; j < m; j++) {
1553 if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
1554 else sd_SNP[j] = 1.0 / sd_SNP[j];
1555 }
1556 }
1557 else {
1558 for (j = 0; j < m; j++) sd_SNP[j] = sd_SNP[j] * sd_SNP[j];
1559 }
1560 vector<double> psq(m);
1561 for (j = 0; j < m; j++) psq[j] = 0.5 * _mu[_include[j]] * _mu[_include[j]];
1562
1563 #pragma omp parallel for private(j)
1564 for (i = 0; i < n; i++) {
1565 for (j = 0; j < m; j++) {
1566 if (X(i,j) < 1e5) {
1567 X(i,j) -= psq[j];
1568 if (divid_by_std) X(i,j) *= sd_SNP[j];
1569 }
1570 else if (miss_with_mu) X(i,j) = 0.0;
1571 }
1572 }
1573 }
1574
makex_eigenVector(int j,eigenVector & x,bool resize,bool minus_2p)1575 void gcta::makex_eigenVector(int j, eigenVector &x, bool resize, bool minus_2p)
1576 {
1577 int i = 0;
1578 if (resize) x.resize(_keep.size());
1579 #pragma omp parallel for
1580 for (i = 0; i < _keep.size(); i++) {
1581 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1582 if (_allele1[_include[j]] == _ref_A[_include[j]]) x[i] = (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1583 else x[i] = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1584 }
1585 else x[i] = _mu[_include[j]];
1586 if (minus_2p) x[i] -= _mu[_include[j]];
1587 }
1588 }
1589
save_XMat(bool miss_with_mu,bool std)1590 void gcta::save_XMat(bool miss_with_mu, bool std)
1591 {
1592 if(std && _dosage_flag) throw("Error: the --recode-std is invalid for dosage data.");
1593 if ( (miss_with_mu || std) && _mu.empty()) calcu_mu();
1594
1595 int i = 0, j = 0, m = _include.size();
1596 eigenVector sd_SNP;
1597 if(std){
1598 sd_SNP.resize(m);
1599 for (j = 0; j < m; j++) {
1600 sd_SNP(j) = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
1601 if (fabs(sd_SNP(j)) < 1.0e-50) sd_SNP(j) = 0.0;
1602 else sd_SNP(j) = sqrt(1.0 / sd_SNP(j));
1603 }
1604 }
1605
1606 // Save matrix X
1607 double x_buf = 0.0;
1608 string X_zFile = _out + ".xmat.gz";
1609 gzofstream zoutf;
1610 zoutf.open(X_zFile.c_str());
1611 if (!zoutf.is_open()) throw ("Error: can not open the file [" + X_zFile + "] to write.");
1612 cout << "Saving the recoded genotype matrix to the file [" + X_zFile + "]." << endl;
1613 zoutf << "FID IID ";
1614 for (j = 0; j < _include.size(); j++) zoutf << _snp_name[_include[j]] << " ";
1615 zoutf << endl;
1616 zoutf << "Reference Allele ";
1617 for (j = 0; j < _include.size(); j++) zoutf << _ref_A[_include[j]] << " ";
1618 zoutf << endl;
1619 for (i = 0; i < _keep.size(); i++) {
1620 zoutf << _fid[_keep[i]] << ' ' << _pid[_keep[i]] << ' ';
1621 if (_dosage_flag) {
1622 for (j = 0; j < _include.size(); j++) {
1623 if (_geno_dose[_keep[i]][_include[j]] < 1e5) {
1624 if (_allele1[_include[j]] == _ref_A[_include[j]]) x_buf = _geno_dose[_keep[i]][_include[j]];
1625 else x_buf = 2.0 - _geno_dose[_keep[i]][_include[j]];
1626 if(std) x_buf = (x_buf - _mu[_include[j]]) * sd_SNP(j);
1627 zoutf << x_buf << ' ';
1628 } else {
1629 if(std) zoutf << "0 ";
1630 else{
1631 if (miss_with_mu) zoutf << _mu[_include[j]] << ' ';
1632 else zoutf << "NA ";
1633 }
1634 }
1635 }
1636 } else {
1637 for (j = 0; j < _include.size(); j++) {
1638 if (!_snp_1[_include[j]][_keep[i]] || _snp_2[_include[j]][_keep[i]]) {
1639 if (_allele1[_include[j]] == _ref_A[_include[j]]) x_buf = _snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]];
1640 else x_buf = 2.0 - (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]);
1641 if(std) x_buf = (x_buf - _mu[_include[j]]) * sd_SNP(j);
1642 zoutf << x_buf << ' ';
1643 } else {
1644 if(std) zoutf << "0 ";
1645 else {
1646 if (miss_with_mu) zoutf << _mu[_include[j]] << ' ';
1647 else zoutf << "NA ";
1648 }
1649 }
1650 }
1651 }
1652 zoutf << endl;
1653 }
1654 zoutf.close();
1655 cout << "The recoded genotype matrix has been saved in the file [" + X_zFile + "] (in compressed text format)." << endl;
1656 }
1657
make_XMat_subset(MatrixXf & X,vector<int> & snp_indx,bool divid_by_std)1658 bool gcta::make_XMat_subset(MatrixXf &X, vector<int> &snp_indx, bool divid_by_std)
1659 {
1660 if(snp_indx.empty()) return false;
1661 if (_mu.empty()) calcu_mu();
1662
1663 int i = 0, j = 0, k = 0, n = _keep.size(), m = snp_indx.size();
1664
1665 X.resize(n, m);
1666 #pragma omp parallel for private(j, k)
1667 for (i = 0; i < n; i++) {
1668 for (j = 0; j < m; j++) {
1669 k = _include[snp_indx[j]];
1670 if (!_snp_1[k][_keep[i]] || _snp_2[k][_keep[i]]) {
1671 if (_allele1[k] == _ref_A[k]) X(i,j) = _snp_1[k][_keep[i]] + _snp_2[k][_keep[i]];
1672 else X(i,j) = 2.0 - (_snp_1[k][_keep[i]] + _snp_2[k][_keep[i]]);
1673 X(i,j) -= _mu[k];
1674 }
1675 else X(i,j) = 0.0;
1676 }
1677 }
1678
1679 if(divid_by_std){
1680 vector<double> sd_SNP(m);
1681 for (j = 0; j < m; j++){
1682 k = _include[snp_indx[j]];
1683 sd_SNP[j] = _mu[k]*(1.0 - 0.5 * _mu[k]);
1684 if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
1685 else sd_SNP[j] = sqrt(1.0 / sd_SNP[j]);
1686 }
1687 for (j = 0; j < m; j++) X.col(j) = X.col(j).array() * sd_SNP[j];
1688 }
1689
1690 return true;
1691 }
1692
make_XMat_d_subset(MatrixXf & X,vector<int> & snp_indx,bool divid_by_std)1693 bool gcta::make_XMat_d_subset(MatrixXf &X, vector<int> &snp_indx, bool divid_by_std)
1694 {
1695 if(snp_indx.empty()) return false;
1696 if (_mu.empty()) calcu_mu();
1697
1698 int i = 0, j = 0, k = 0, n = _keep.size(), m = snp_indx.size();
1699
1700 X.resize(n, m);
1701 #pragma omp parallel for private(j, k)
1702 for (i = 0; i < n; i++) {
1703 for (j = 0; j < m; j++) {
1704 k = _include[snp_indx[j]];
1705 if (!_snp_1[k][_keep[i]] || _snp_2[k][_keep[i]]) {
1706 if (_allele1[k] == _ref_A[k]) X(i,j) = _snp_1[k][_keep[i]] + _snp_2[k][_keep[i]];
1707 else X(i,j) = 2.0 - (_snp_1[k][_keep[i]] + _snp_2[k][_keep[i]]);
1708 if (X(i,j) < 0.5) X(i,j) = 0.0;
1709 else if (X(i,j) < 1.5) X(i,j) = _mu[k];
1710 else X(i,j) = (2.0 * _mu[k] - 2.0);
1711 X(i,j) -= 0.5 * _mu[k] * _mu[k];
1712 }
1713 else X(i,j) = 0.0;
1714 }
1715 }
1716
1717 if(divid_by_std){
1718 vector<double> sd_SNP(m);
1719 for (j = 0; j < m; j++){
1720 k = _include[snp_indx[j]];
1721 sd_SNP[j] = _mu[k]*(1.0 - 0.5 * _mu[k]);
1722 if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
1723 else sd_SNP[j] = 1.0 / sd_SNP[j];
1724 }
1725 for (j = 0; j < m; j++) X.col(j) = X.col(j).array() * sd_SNP[j];
1726 }
1727
1728 return true;
1729 }
1730