1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementations of functions for GWAS simulation
5 *
6 * 2014 by Jian Yang <jian.yang@uq.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13 #include "gcta.h"
14
read_QTL_file(string qtl_file,vector<string> & qtl_name,vector<int> & qtl_pos,vector<double> & qtl_eff,vector<int> & have_eff)15 int gcta::read_QTL_file(string qtl_file, vector<string> &qtl_name, vector<int> &qtl_pos, vector<double> &qtl_eff, vector<int> &have_eff)
16 {
17 qtl_name.clear();
18 qtl_pos.clear();
19 qtl_eff.clear();
20 have_eff.clear();
21
22 ifstream i_qtl(qtl_file.c_str());
23 if (!i_qtl) throw ("Error: can not open the file [" + qtl_file + "] to read.");
24 string qtl_buf, str_buf;
25 double qtl_eff_buf = 0.0;
26 cout << "Reading a list of SNPs (as causal variants) from [" + qtl_file + "]." << endl;
27 map<string, int>::iterator iter, End = _snp_name_map.end();
28 vector<string> vs_buf;
29 vector<int> confirm(_snp_num);
30 int icount = 0;
31 while (i_qtl) {
32 i_qtl >> qtl_buf;
33 if (i_qtl.eof()) break;
34 iter = _snp_name_map.find(qtl_buf);
35 if (getline(i_qtl, str_buf) && StrFunc::split_string(str_buf, vs_buf, " \t\n") > 0) {
36 have_eff.push_back(1);
37 qtl_eff_buf = atof(vs_buf[0].c_str());
38 if (fabs(qtl_eff_buf) > 1e5) throw ("Error: invalid effect size specified for the causal variant [" + str_buf + "].");
39 } else {
40 have_eff.push_back(0);
41 qtl_eff_buf = 0.0;
42 }
43 if (iter != End) {
44 qtl_name.push_back(qtl_buf);
45 qtl_pos.push_back(iter->second);
46 qtl_eff.push_back(qtl_eff_buf);
47 }
48 }
49 vector<string> qtl_name_buf=qtl_name;
50 stable_sort(qtl_name_buf.begin(), qtl_name_buf.end());
51 qtl_name_buf.erase(unique(qtl_name_buf.begin(), qtl_name_buf.end()), qtl_name_buf.end());
52 i_qtl.close();
53
54 if(qtl_name_buf.size() < qtl_name.size()) throw("Error: there are duplicated SNP IDs.");
55 cout << qtl_pos.size() << " SNPs (as causal variants) to be included from [" + qtl_file + "]." << endl;
56 return (qtl_pos.size());
57 }
58
output_simu_par(vector<string> & qtl_name,vector<int> & qtl_pos,vector<double> & qtl_eff,double Vp)59 void gcta::output_simu_par(vector<string> &qtl_name, vector<int> &qtl_pos, vector<double> &qtl_eff, double Vp)
60 {
61 int i = 0;
62 string out_parfile = _out + ".par";
63 ofstream out_par(out_parfile.c_str());
64 if (!out_par) throw ("Error: can not open par file [" + out_parfile + "] to write!");
65 out_par << "QTL\tRefAllele\tFrequency\tEffect" << endl;
66 for (i = 0; i < qtl_eff.size(); i++) out_par << qtl_name[i] << "\t" << _ref_A[qtl_pos[i]] << "\t" << 0.5 * _mu[qtl_pos[i]] << "\t" << qtl_eff[i] << endl;
67 out_par.close();
68 cout << "Simulated QTL effect(s) have been saved in [" + out_parfile + "]." << endl;
69 }
70
save_phenfile(vector<vector<double>> & y)71 void gcta::save_phenfile(vector< vector<double> > &y)
72 {
73 string phenfile = _out + ".phen";
74 ofstream phen(phenfile.c_str());
75 if (!phen) throw ("Error: can not open the file [" + phenfile + "] to write.");
76 int i = 0, j = 0;
77 for (i = 0; i < _keep.size(); i++) {
78 phen << _fid[_keep[i]] << " " << _pid[_keep[i]] << " ";
79 for (j = 0; j < y.size(); j++) phen << y[j][i] << " ";
80 phen << endl;
81 }
82 phen.close();
83 }
84
GWAS_simu(string bfile,int simu_num,string qtl_file,int case_num,int control_num,double hsq,double K,int seed,bool output_causal,bool simu_emb_flag,int eff_mod)85 void gcta::GWAS_simu(string bfile, int simu_num, string qtl_file, int case_num, int control_num, double hsq, double K, int seed, bool output_causal, bool simu_emb_flag, int eff_mod)
86 {
87 int i = 0, j = 0;
88 bool cc_flag = false;
89 if (case_num > 0 || control_num > 0) cc_flag = true;
90
91 cout << "Simulation parameters:" << endl;
92 cout << "Number of simulation replicate(s) = " << simu_num << " (Default = 1)" << endl;
93 cout << "Heritability " << (cc_flag ? "of liability = " : " = ") << hsq << " (Default = 0.1)" << endl;
94 if (cc_flag) {
95 cout << "Disease prevalence = " << K << " (Default = 0.1)" << endl;
96 cout << "Number of cases = " << case_num << endl;
97 cout << "Number of controls = " << control_num << endl;
98 }
99 cout << endl;
100
101 // Read QTL file
102 vector<string> qtl_name;
103 vector<int> qtl_pos, have_eff;
104 vector<double> qtl_eff;
105 int qtl_num = read_QTL_file(qtl_file, qtl_name, qtl_pos, qtl_eff, have_eff);
106 update_id_map_kp(qtl_name, _snp_name_map, _include);
107
108 // Generate QTL effects
109 int Seed = -CommFunc::rand_seed();
110 if (hsq > 0.0) {
111 int num_gener_qtl_eff = 0;
112 for (i = 0; i < qtl_num; i++) {
113 if (have_eff[i] == 0) {
114 qtl_eff[i] = StatFunc::gasdev(Seed);
115 num_gener_qtl_eff++;
116 }
117 }
118 if (qtl_num - num_gener_qtl_eff > 0) cout << qtl_num - num_gener_qtl_eff << " user-specified QTL effects." << endl;
119 if (num_gener_qtl_eff > 0) cout << num_gener_qtl_eff << " unspecified QTL effects are generated from standard normal distribution." << endl;
120
121 vector<string> vs_buf(qtl_num);
122 for (i = 0; i < qtl_num; i++) vs_buf[i] = _snp_name[_include[i]];
123 vector<int> indx;
124 StrFunc::match(vs_buf, qtl_name, indx);
125 qtl_name = vs_buf;
126 vector<double> qtl_eff_buf(qtl_eff);
127 vector<int> qtl_pos_buf(qtl_pos);
128 for (i = 0; i < qtl_num; i++){
129 qtl_eff[i] = qtl_eff_buf[indx[i]];
130 qtl_pos[i] = qtl_pos_buf[indx[i]];
131 }
132 } else {
133 qtl_eff.clear();
134 qtl_eff.resize(qtl_num, 0.0);
135 }
136
137 // Calculate allele frequency
138 MatrixXf X;
139 make_XMat(X);
140 if(eff_mod == 0){
141 eigenVector sd_SNP;
142 std_XMat(X, sd_SNP, false, true, true);
143 }
144
145 // Calculate Ve and threhold
146 double var_g = 0.0, var_e = 1.0;
147 vector<double> g(_keep.size());
148 if (hsq > 0.0) {
149 for (i = 0; i < _keep.size(); i++) {
150 for (j = 0; j < qtl_num; j++) g[i] += X(i,j) * qtl_eff[j];
151 }
152 var_g = CommFunc::var(g);
153 var_e = var_g * (1.0 / hsq - 1.0);
154 }
155 double sd_e = sqrt(var_e);
156
157 // Output par file
158 output_simu_par(qtl_name, qtl_pos, qtl_eff, var_e + var_g);
159
160 // Output phenotype file
161 cout << "Simulating GWAS based on the real genotyped data with " << simu_num << " replicate(s) ..." << endl;
162 vector< vector<double> > y(simu_num);
163 int case_num_buf = 0, control_num_buf = 0;
164 for (i = 0; i < simu_num; i++) {
165 y[i].resize(_keep.size());
166 for (j = 0; j < _keep.size(); j++) {
167 if (hsq < 1.0) y[i][j] = g[j] + sd_e * StatFunc::gasdev(Seed);
168 else y[i][j] = g[j];
169 }
170 if (cc_flag) {
171 case_num_buf = 0;
172 control_num_buf = 0;
173 vector<double> y_buf(y[i]);
174 stable_sort(y_buf.begin(), y_buf.end());
175 int n = (int) (_indi_num * (1.0 - K));
176 double Th = 0.5 * (y_buf[n] + y_buf[n - 1]);
177 for (j = 0; j < _keep.size(); j++) {
178 if (y[i][j] > Th) {
179 if (case_num_buf < case_num) {
180 y[i][j] = 2;
181 case_num_buf++;
182 } else y[i][j] = -9;
183 } else {
184 if (control_num_buf < control_num) {
185 y[i][j] = 1;
186 control_num_buf++;
187 } else y[i][j] = -9;
188 }
189 }
190 }
191 }
192
193 if (!simu_emb_flag) {
194 save_phenfile(y);
195 if (cc_flag) cout << "Simulated " << case_num_buf << " cases and " << control_num << " controls have been saved in [" + _out + ".phen" + "]." << endl;
196 else cout << "Simulated phenotypes of " << _keep.size() << " individuals have been saved in [" + _out + ".phen" + "]." << endl;
197 } else {
198 // emBayesB format
199 if (!output_causal) update_id_map_rm(qtl_name, _snp_name_map, _include);
200 string out_rstfile = _out + ".emb";
201 ofstream out_emBayesB(out_rstfile.c_str());
202 if (!out_emBayesB) throw ("Error: can not open the file [" + out_rstfile + "] to write.");
203 cout << "Saving the simulated data to the file [" + out_rstfile + "] (in emBayesB format)." << endl;
204 for (i = 0; i < _keep.size(); i++) {
205 if (y[0][i] == -9) continue;
206 out_emBayesB << _pid[_keep[i]] << " " << g[i] << " " << y[0][i] << endl;
207 for (j = 0; j < _include.size(); j++) {
208 if (_snp_1[_include[j]][_keep[i]] && !_snp_2[_include[j]][_keep[i]]) out_emBayesB << _mu[_include[j]] << " ";
209 else out_emBayesB << (double) (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]) << " ";
210 }
211 out_emBayesB << endl;
212 }
213 out_emBayesB.close();
214 cout << "Simulated data (" << _keep.size() << " individuals and " << _include.size() << " SNPs) has been saved in [" + out_rstfile + "]." << endl;
215 }
216 }
217
218 /////////////////////////////////////////
219 /* These codes are not used any more */
220 /////////////////////////////////////////
221 /*
222 void gcta::GenerCases(string bfile, string qtl_file, int case_num, int control_num, double hsq, double K, bool curr_popu, double gnrt)
223 {
224 int i=0, j=0, k=0;
225
226 if(gnrt<1.0 || gnrt>1e5) throw("Error: --simu-gener should be within the range from 1 to 100000.");
227 if(hsq>1.0 || hsq<0.0) throw("Error: --simu-h2 should be within the range from 0 to 1.");
228 if(K>0.5 || K<0.0001) throw("Error: --simu-K should be within the range from 0.0001 to 0.5.");
229 if(!curr_popu && (case_num>1e5 || case_num<1)) throw("Error: --simu-cc, Invalid number of cases.");
230 if(!curr_popu && (control_num>1e6 || control_num<1)) throw("Error: --simu-cc, Invalid number of controls.");
231
232 // Read bim file: recombination rate is defined between SNP i and SNP i-1
233 read_bimfile(bfile+".bim");
234 kosambi();
235
236 // calcualte the accumulative recombination probability over xxx generation
237 for(i=0; i<_snp_num; i++) _rc_rate[i]=1.0-pow((double)(1.0-_rc_rate[i]), (double)gnrt);
238
239 // Read QTL file
240 vector<string> qtl_name;
241 read_snplist(qtl_file, qtl_name);
242 vector<int> qtl_pos;
243 map<string, int>::iterator iter;
244 for(int i=0; i<qtl_name.size(); i++){
245 iter=_snp_name_map.find(qtl_name[i]);
246 if(iter!=_snp_name_map.end()) qtl_pos.push_back(iter->second);
247 }
248 stable_sort(qtl_pos.begin(), qtl_pos.end());
249 int qtl_num=qtl_pos.size();
250 cout<<qtl_num<<" SNPs (as QTLs) to be included from ["+qtl_file+"]."<<endl;
251
252 // Read fam and bed files
253 read_famfile(bfile+".fam");
254 read_bedfile(bfile+".bed");
255
256 // Generate QTL effects
257 int Seed=-CommFunc::rand_seed();
258 cout<<"Generating QTL effects ("<<"Random seed = "<<abs(Seed)<<")."<<endl;
259 vector<double> qtl_eff(qtl_num);
260 for(i=0; i<qtl_num && hsq>0.0; i++) qtl_eff[i]=StatFunc::gasdev(Seed);
261
262 // Calculate Ve and threhold
263 int n=0, x=0;
264 double var_g=0.0, var_e=1.0;
265 vector<double> AF(qtl_num), y(_indi_num), y_backup;
266 if(hsq>0.0){
267 for(n=0; n<_indi_num; n++){
268 for(i=0; i<qtl_num; i++){
269 j=qtl_pos[i];
270 if(_snp_a[n][j] && !_snp_b[n][j]) continue;
271 x=_snp_a[n][j]+_snp_b[n][j];
272 AF[i]+=x;
273 y[n]+=x*qtl_eff[i];
274 }
275 }
276 var_g=CommFunc::var(y);
277 var_e=var_g*(1.0/hsq-1.0);
278 }
279 double sd_e=sqrt(var_e);
280 for(n=0; n<_indi_num; n++) y[n]+=sd_e*StatFunc::gasdev(Seed);
281 if(curr_popu) y_backup=y;
282 stable_sort(y.begin(), y.end());
283 n=(int)(_indi_num*(1.0-K));
284 double Th=0.5*(y[n]+y[n-1]);
285
286 // ouput par file
287 vector<double> qsq(qtl_num);
288 for(i=0; i<qtl_num; i++){
289 AF[i]/=_indi_num;
290 AF[i]*=0.5;
291 qsq[i]=2.0*AF[i]*(1.0-AF[i])*qtl_eff[i]*qtl_eff[i];
292 qsq[i]/=(var_e+var_g);
293 }
294 string out_parfile=_out+".par";
295 ofstream out_par(out_parfile.c_str());
296 if(!out_par) throw("Error: can not open par file ["+out_parfile+"] to write!");
297 cout<<"Writing simulation parameters to file ["<<out_parfile<<"]."<<endl;
298 for(i=0; i<qtl_num; i++) out_par<<qtl_name[i]<<"\t"<<AF[i]<<"\t"<<qtl_eff[i]<<"\t"<<qsq[i]<<endl;
299 out_par.close();
300
301 // If generate cases and controls based on current population
302 if(curr_popu){
303 cout<<"Generating cases and control based on the current population."<<endl;
304 for(i=0; i<_indi_num; i++){
305 if(y_backup[i]>Th) _pheno[i]=2;
306 else _pheno[i]=1;
307 }
308 save_famfile();
309 return;
310 }
311
312 // Generate population
313 int fa_hap_buf=0, mo_hap_buf, fa=0, mo=0;
314 double score=0.0;
315 vector< vector<int> > case_fa, case_mo, case_fa_hap, case_mo_hap, control_fa, control_mo, control_fa_hap, control_mo_hap;
316 cout<<"Generating population derived from the current population."<<endl;
317 while(case_fa.size()<case_num || control_fa.size()<control_num){
318 vector<int> fa_vbuf, mo_vbuf, fa_hap_vbuf, mo_hap_vbuf;
319 for(i=0; i<_snp_num; i++){
320 if(StatFunc::ran1(Seed)<_rc_rate[i]){
321 fa_hap_vbuf.push_back(i);
322 fa_vbuf.push_back((int)(StatFunc::ran1(Seed)*_indi_num*2));
323 }
324 if(StatFunc::ran1(Seed)<_rc_rate[i]){
325 mo_hap_vbuf.push_back(i);
326 mo_vbuf.push_back((int)(StatFunc::ran1(Seed)*_indi_num*2));
327 }
328 }
329 score=0.0;
330 for(i=0; i<qtl_num; i++){
331 j=qtl_pos[i];
332 fa_hap_buf=upper_bound(fa_hap_vbuf.begin(), fa_hap_vbuf.end(), j)-fa_hap_vbuf.begin()-1;
333 mo_hap_buf=upper_bound(mo_hap_vbuf.begin(), mo_hap_vbuf.end(), j)-mo_hap_vbuf.begin()-1;
334 fa=(int)(fa_vbuf[fa_hap_buf]*0.5);
335 mo=(int)(mo_vbuf[mo_hap_buf]*0.5);
336 if( (_snp_a[fa][j] && !_snp_b[fa][j]) || (_snp_a[mo][j] && !_snp_b[mo][j]) ) continue;
337 if(fa_vbuf[fa_hap_buf]%2==0) x=_snp_a[fa][j];
338 else x=_snp_b[fa][j];
339 if(mo_vbuf[mo_hap_buf]%2==0) x+=_snp_a[mo][j];
340 else x+=_snp_b[mo][j];
341 score+=x*qtl_eff[i];
342 }
343 score+=sd_e*StatFunc::gasdev(Seed);
344 if(score>Th){
345 if(case_fa.size()<case_num){
346 case_fa.push_back(fa_vbuf);
347 case_mo.push_back(mo_vbuf);
348 case_fa_hap.push_back(fa_hap_vbuf);
349 case_mo_hap.push_back(mo_hap_vbuf);
350 }
351 }
352 else{
353 if(control_fa.size()<control_num){
354 control_fa.push_back(fa_vbuf);
355 control_mo.push_back(mo_vbuf);
356 control_fa_hap.push_back(fa_hap_vbuf);
357 control_mo_hap.push_back(mo_hap_vbuf);
358 }
359 }
360 }
361 cout<<case_num<<" cases and "<<control_num<<" controls have been generated."<<endl;
362 _indi_num=case_num+control_num;
363
364 // Output fam file
365 string out_famfile=_out+".fam";
366 ofstream out_fam(out_famfile.c_str());
367 if(!out_fam) throw("Error: can not open fam file ["+out_famfile+"] to write!");
368 cout<<"Writing IDs of cases and controls to file ["<<out_famfile<<"]."<<endl;
369 for(i=0; i<case_num; i++) out_fam<<i+1<<"\t"<<i+1<<"\t0\t0\t0\t2"<<endl;
370 for(i=case_num; i<_indi_num; i++) out_fam<<i+1<<"\t"<<i+1<<"\t0\t0\t0\t1"<<endl;
371 out_fam.close();
372
373 // Output bed file
374 // Output first three bytes
375 // Output genotype in binary format
376 case_fa.insert(case_fa.end(), control_fa.begin(), control_fa.end());
377 case_mo.insert(case_mo.end(), control_mo.begin(), control_mo.end());
378 case_fa_hap.insert(case_fa_hap.end(), control_fa_hap.begin(), control_fa_hap.end());
379 case_mo_hap.insert(case_mo_hap.end(), control_mo_hap.begin(), control_mo_hap.end());
380 save_bedfile(case_fa, case_mo, case_fa_hap, case_mo_hap);
381 }
382
383
384 void gcta::kosambi()
385 {
386 int i=0;
387 double c=0.0;
388 _rc_rate.clear();
389 _rc_rate.resize(_snp_num);
390 for(i=0; i<_snp_num; i++){
391 if(i==0 || _chr[i]!=_chr[i-1]) _rc_rate[i]=0.5;
392 else{
393 c=exp(_genet_dst[i]*0.04);
394 _rc_rate[i]=0.5*(c-1.0)/(c+1.0);
395 }
396 }
397 }
398 */
399
400 /*
401 void gcta::save_bedfile(vector< vector<int> > &fa_indx, vector< vector<int> > &mo_indx, vector< vector<int> > &fa_hap, vector< vector<int> > &mo_hap, bool GENOME)
402 {
403 bool fa_geno=false, mo_geno=false;
404 int i=0, pos=0, n=0, fa=0, mo=0, fa_hap_buf=0, mo_hap_buf=0;
405 string OutBedFile=_out+".bed";
406 fstream OutBed(OutBedFile.c_str(), ios::out|ios::binary);
407 if(!OutBed) throw("Error: can not open the file ["+OutBedFile+"] to write.");
408 cout<<"Writing genotypes to PLINK BED file ["+OutBedFile+"]."<<endl;
409 bitset<8> b;
410 char ch[1];
411 b.reset();
412 b.set(2); b.set(3); b.set(5); b.set(6);
413 ch[0] = (char)b.to_ulong();
414 OutBed.write(ch,1);
415 b.reset();
416 b.set(0); b.set(1); b.set(3); b.set(4);
417 ch[0] = (char)b.to_ulong();
418 OutBed.write(ch,1);
419 b.reset();
420 b.set(0);
421 ch[0] = (char)b.to_ulong();
422 OutBed.write(ch,1);
423 for(i=0; i<_snp_num; i++){
424 pos=0;
425 b.reset();
426 for(n=0; n<_indi_num; n++){
427 if(!GENOME){
428 fa_hap_buf=upper_bound(fa_hap[n].begin(), fa_hap[n].end(), i)-fa_hap[n].begin()-1;
429 mo_hap_buf=upper_bound(mo_hap[n].begin(), mo_hap[n].end(), i)-mo_hap[n].begin()-1;
430 fa=(int)(fa_indx[n][fa_hap_buf]*0.5);
431 mo=(int)(mo_indx[n][mo_hap_buf]*0.5);
432 }
433 else{
434 fa=n;
435 mo=n;
436 }
437 if( (_snp_a[fa][i] && !_snp_b[fa][i]) || (_snp_a[mo][i] && !_snp_b[mo][i]) ){
438 b[pos++]=1;
439 b[pos++]=0;
440 }
441 else{
442 if(!GENOME){
443 if(fa_indx[n][fa_hap_buf]%2==0) fa_geno=_snp_a[fa][i];
444 else fa_geno=_snp_b[fa][i];
445 if(mo_indx[n][mo_hap_buf]%2==0) mo_geno=_snp_a[mo][i];
446 else mo_geno=_snp_b[mo][i];
447 fa_geno=(!fa_geno);
448 mo_geno=(!mo_geno);
449 }
450 else{
451 fa_geno=_snp_a[fa][i];
452 mo_geno=_snp_b[mo][i];
453 }
454 b[pos++]=min(fa_geno, mo_geno);
455 b[pos++]=max(fa_geno, mo_geno);
456 }
457 if(pos>7 || n==_indi_num-1){
458 ch[0]=(char)b.to_ulong();
459 OutBed.write(ch,1);
460 pos=0;
461 b.reset();
462 }
463 }
464 }
465 OutBed.close();
466 }
467
468 void gcta::save_famfile()
469 {
470 string famfile=_out+".fam";
471 ofstream Fam(famfile.c_str());
472 if(!Fam) throw("Error: can not open the fam file "+famfile+" to save!");
473 cout<<"Writing PLINK FAM file to ["+famfile+"]."<<endl;
474 int i=0;
475 for(i=0; i<_indi_num; i++){
476 Fam<<_fid[i]<<"\t"<<_pid[i]<<"\t"<<_fa_id[i]<<"\t"<<_mo_id[i]<<"\t"<<_sex[i]<<"\t"<<_pheno[i]<<endl;
477 }
478 Fam.close();
479 cout<<_indi_num<<" individuals to be saved to ["+famfile+"]."<<endl;
480 }
481
482 void gcta::save_bimfile()
483 {
484 int i=0;
485 string bimfile=_out+".bim";
486 ofstream Bim(bimfile.c_str());
487 if(!Bim) throw("Error: can not open the file ["+bimfile+"] to write.");
488 cout<<"Writing PLINK bim file to ["<<bimfile<<"]."<<endl;
489 for(i=0; i<_snp_num; i++){
490 Bim<<_chr[i]<<"\t"<<_snp_name[i]<<"\t"<<_genet_dst[i]<<"\t"<<_bp[i]<<"\t"<<_allele1[i]<<"\t"<<_allele2[i]<<endl;
491 }
492 Bim.close();
493 cout<<_snp_num<<" SNPs to be saved to ["<<bimfile<<"]."<<endl;
494 }
495 */
genet_dst(string bfile,string hapmap_genet_map)496 void gcta::genet_dst(string bfile, string hapmap_genet_map)
497 {
498 // Read bim file
499 read_bimfile(bfile + ".bim");
500 int snp_num = _snp_name.size();
501
502 // Read HAPMAP genetic map files
503 int i = 0, j = 0;
504 string str_buf;
505 vector<string> vs_buf;
506 string genet_mapfile;
507 vector< vector< vector<double> > > hap_genet(_autosome_num);
508 for (i = 0; i < _autosome_num; i++) {
509 stringstream str_strm;
510 str_strm << hapmap_genet_map << i + 1 << "_CEU_b36.txt";
511 genet_mapfile = str_strm.str();
512 ifstream i_genet_map(genet_mapfile.c_str());
513 if (!i_genet_map) throw ("Error: can not open HAPMAP genetic map file " + genet_mapfile + "!");
514 hap_genet[i].resize(2);
515 getline(i_genet_map, str_buf);
516 while (getline(i_genet_map, str_buf)) {
517 if (StrFunc::split_string(str_buf, vs_buf) < 3) continue;
518 hap_genet[i][0].push_back(atof(vs_buf[0].c_str()));
519 hap_genet[i][1].push_back(atof(vs_buf[1].c_str()));
520 }
521 i_genet_map.close();
522 }
523
524 // calculate genetic distance
525 int pos1 = 0, pos2 = 0;
526 double prev_bp = 0.0;
527 vector<double> dst(snp_num);
528 vector<double>::iterator iter1, iter2;
529 for (i = 0; i < snp_num; i++) {
530 if (i == 0 || _chr[i - 1] != _chr[i]) {
531 dst[i] = 0.0;
532 continue;
533 }
534 iter1 = upper_bound(hap_genet[_chr[i] - 1][0].begin(), hap_genet[_chr[i] - 1][0].end(), _bp[i - 1]);
535 iter2 = upper_bound(hap_genet[_chr[i] - 1][0].begin(), hap_genet[_chr[i] - 1][0].end(), _bp[i]);
536 if (iter1 == hap_genet[_chr[i] - 1][0].end() || iter2 == hap_genet[_chr[i] - 1][0].end()) {
537 dst[i] = _bp[i] - _bp[i - 1];
538 continue;
539 }
540 pos1 = iter1 - hap_genet[_chr[i] - 1][0].begin();
541 pos2 = iter2 - hap_genet[_chr[i] - 1][0].begin();
542 prev_bp = _bp[i - 1];
543 while (pos1 < pos2) {
544 dst[i] += (hap_genet[_chr[i] - 1][0][pos1] - prev_bp) * hap_genet[_chr[i] - 1][1][pos1 - 1];
545 prev_bp = hap_genet[_chr[i] - 1][0][pos1];
546 pos1++;
547 }
548 dst[i] += (_bp[i] - prev_bp) * hap_genet[_chr[i] - 1][1][pos1 - 1];
549 }
550
551 // Output fam file
552 string out_bimfile = _out + ".genetdst";
553 ofstream out_bim(out_bimfile.c_str());
554 if (!out_bim) throw ("Error: can not open file " + out_bimfile + " to write!");
555 for (i = 0; i < snp_num; i++) out_bim << _chr[i] << "\t" << _snp_name[i] << "\t" << dst[i]*1e-6 << "\t" << _bp[i] << "\t" << _allele1[i] << "\t" << _allele2[i] << endl;
556 out_bim.close();
557 cout << "Genetic distances have been created, and been saved in [" + out_bimfile + "]." << endl;
558 }
559
560 /*
561 void gcta::simu_genome(
562 string popSize, // effective population size of each population
563 int nSubPOP, // number of populations
564 vector<int> & nSubSample, // number of samples (haplotypes) draw from each populations
565 int numPieces, // number of fragments for each sample (chromosome)
566 int pieceLen, // length in base pair of each fragment
567 int numIndepRegion, // number of independent regions (independent chromosome)
568 int s, // fixed number of SNPs want to simulate, randomly place s SNPs on the genealogy
569 string rec, // recombination rate between consecutive fragments per generation
570 double mut, // mutation rate per generation per base pair
571 double mig // migration rate per generation
572 )
573 {
574 int seed=CommFunc::rand_seed();
575 vector< vector<bool> > data; // the return chromosome by connecting all independent chromosome into one long chromosome
576
577 cout<<"\n*********************************************"<<endl;
578 cout<<"The following output is generated by the program GENOME\n"<<endl;
579 genome(popSize, nSubPOP, nSubSample, numPieces, pieceLen, numIndepRegion, s, rec, mut, mig, data, seed, false, false);
580 cout<<"\nEnd of output by GENOME"<<endl;
581 cout<<"*********************************************\n"<<endl;
582 _snp_a.clear();
583 _snp_b.clear();
584 _pheno.clear();
585 int i=0, j=0, k=0, index=0;
586 for(k=0; k<nSubPOP; k++){
587 for(i=0; i<nSubSample[k]; i++){
588 _snp_a.push_back(data[index++]);
589 _snp_b.push_back(data[index++]);
590 _pheno.push_back(k+1);
591 i++;
592 if(i==nSubSample[k]-2) break;
593 }
594 }
595 _indi_num=_snp_a.size();
596 _snp_num=_snp_a[0].size();
597
598 // save famfile
599 stringstream strstrm;
600 _fid.clear();
601 _fid.resize(_indi_num);
602 _pid.clear();
603 _pid.resize(_indi_num);
604 _fa_id.clear();
605 _fa_id.resize(_indi_num);
606 _mo_id.clear();
607 _mo_id.resize(_indi_num);
608 _sex.clear();
609 _sex.resize(_indi_num);
610 for(i=0; i<_indi_num; i++){
611 strstrm.str("");
612 strstrm<<i+1;
613 _fid[i]=strstrm.str();
614 _pid[i]=strstrm.str();
615 _fa_id[i]="0";
616 _mo_id[i]="0";
617 }
618 save_famfile();
619
620 // save bimfile
621 _chr.clear();
622 _chr.resize(_snp_num);
623 _snp_name.clear();
624 _snp_name.resize(_snp_num);
625 _genet_dst.clear();
626 _genet_dst.resize(_snp_num);
627 _bp.clear();
628 _bp.resize(_snp_num);
629 _allele1.clear();
630 _allele1.resize(_snp_num);
631 _allele2.clear();
632 _allele2.resize(_snp_num);
633 for(i=0; i<_snp_num; i++){
634 _chr[i]=1;
635 strstrm.str("");
636 strstrm<<"SNP_"<<i+1;
637 _snp_name[i]=strstrm.str();
638 _allele1[i]='A';
639 _allele2[i]='C';
640 }
641 save_bimfile();
642
643 // save bedfile
644 vector< vector<int> > tmp;
645 save_bedfile(tmp, tmp, tmp, tmp, true);
646 }
647 */
648
649 /*
650 void gcta::simu_geno_unlinked(int N, int M, double maf)
651 {
652 int i=0, j=0, x=0;
653
654 // fam file
655 _keep.resize(N);
656 _fid.resize(N);
657 _pid.resize(N);
658 _fa_id.resize(N);
659 _mo_id.resize(N);
660 _sex.resize(N);
661 _pheno.resize(N);
662 for(i=0; i<N; i++){
663 _keep[i]=i;
664 stringstream ss;
665 ss<<i+1;
666 _fid[i]=ss.str();
667 _pid[i]=_fid[i];
668 _fa_id[i]="0";
669 _mo_id[i]="0";
670 _sex[i]=-9;
671 _pid[i]=-9;
672 }
673
674 // bim file
675 _include.resize(M);
676 _snp_num=M;
677 _chr.resize(M);
678 _snp_name.resize(M);
679 _bp.resize(M);
680 _genet_dst.resize(M);
681 _allele1.resize(M);
682 _allele2.resize(M);
683 _snp_1.resize(M);
684 _snp_2.resize(M);
685
686 //double p=0.0;
687 std::tr1::minstd_rand eng;
688 eng.seed((unsigned int)time(NULL));
689
690 cout<<"maf "<<maf<<endl;
691
692 for(j=0; j<M; j++){
693 _include[j]=j;
694 stringstream ss;
695 ss<<"SNP"<<j+1;
696 _chr[j]=1;
697 _snp_name[j]=ss.str();
698 _bp[j]=j+1;
699 _genet_dst[j]=0.0;
700 _allele1[j]="A";
701 _allele2[j]="G";
702 _snp_1[j].resize(N);
703 _snp_2[j].resize(N);
704 std::tr1::uniform_real<double> runiform(maf,1-maf);
705 double p = runiform(eng)/1.0e10;
706
707 //debug
708 cout<<"p = "<<p<<endl;
709
710 for(i=0; i<N; i++){
711 std::tr1::binomial_distribution<int, double> rbinom(2,0.5);
712 x=rbinom(eng);
713
714 //debug
715 cout<<x<<"\t";
716
717
718 if(x==2) _snp_1[j][i]=_snp_2[j][i]=true;
719 else if(x==1){
720 _snp_1[j][i]=false;
721 _snp_2[j][i]=true;
722 }
723 else _snp_1[j][i]=_snp_2[j][i]=false;
724 }
725
726 //debug
727 cout<<endl;
728 }
729
730 save_plink();
731 }
732 */
733
734