1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementations of functions for GWAS simulation
5  *
6  * 2014 by Jian Yang <jian.yang@uq.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #include "gcta.h"
14 
read_QTL_file(string qtl_file,vector<string> & qtl_name,vector<int> & qtl_pos,vector<double> & qtl_eff,vector<int> & have_eff)15 int gcta::read_QTL_file(string qtl_file, vector<string> &qtl_name, vector<int> &qtl_pos, vector<double> &qtl_eff, vector<int> &have_eff)
16 {
17     qtl_name.clear();
18     qtl_pos.clear();
19     qtl_eff.clear();
20     have_eff.clear();
21 
22     ifstream i_qtl(qtl_file.c_str());
23     if (!i_qtl) throw ("Error: can not open the file [" + qtl_file + "] to read.");
24     string qtl_buf, str_buf;
25     double qtl_eff_buf = 0.0;
26     cout << "Reading a list of SNPs (as causal variants) from [" + qtl_file + "]." << endl;
27     map<string, int>::iterator iter, End = _snp_name_map.end();
28     vector<string> vs_buf;
29     vector<int> confirm(_snp_num);
30     int icount = 0;
31     while (i_qtl) {
32         i_qtl >> qtl_buf;
33         if (i_qtl.eof()) break;
34         iter = _snp_name_map.find(qtl_buf);
35         if (getline(i_qtl, str_buf) && StrFunc::split_string(str_buf, vs_buf, " \t\n") > 0) {
36             have_eff.push_back(1);
37             qtl_eff_buf = atof(vs_buf[0].c_str());
38             if (fabs(qtl_eff_buf) > 1e5) throw ("Error: invalid effect size specified for the causal variant [" + str_buf + "].");
39         } else {
40             have_eff.push_back(0);
41             qtl_eff_buf = 0.0;
42         }
43         if (iter != End) {
44             qtl_name.push_back(qtl_buf);
45             qtl_pos.push_back(iter->second);
46             qtl_eff.push_back(qtl_eff_buf);
47         }
48     }
49     vector<string> qtl_name_buf=qtl_name;
50     stable_sort(qtl_name_buf.begin(), qtl_name_buf.end());
51     qtl_name_buf.erase(unique(qtl_name_buf.begin(), qtl_name_buf.end()), qtl_name_buf.end());
52     i_qtl.close();
53 
54     if(qtl_name_buf.size() < qtl_name.size()) throw("Error: there are duplicated SNP IDs.");
55     cout << qtl_pos.size() << " SNPs (as causal variants) to be included from [" + qtl_file + "]." << endl;
56     return (qtl_pos.size());
57 }
58 
output_simu_par(vector<string> & qtl_name,vector<int> & qtl_pos,vector<double> & qtl_eff,double Vp)59 void gcta::output_simu_par(vector<string> &qtl_name, vector<int> &qtl_pos, vector<double> &qtl_eff, double Vp)
60 {
61     int i = 0;
62     string out_parfile = _out + ".par";
63     ofstream out_par(out_parfile.c_str());
64     if (!out_par) throw ("Error: can not open par file [" + out_parfile + "] to write!");
65     out_par << "QTL\tRefAllele\tFrequency\tEffect" << endl;
66     for (i = 0; i < qtl_eff.size(); i++) out_par << qtl_name[i] << "\t" << _ref_A[qtl_pos[i]] << "\t" << 0.5 * _mu[qtl_pos[i]] << "\t" << qtl_eff[i] << endl;
67     out_par.close();
68     cout << "Simulated QTL effect(s) have been saved in [" + out_parfile + "]." << endl;
69 }
70 
save_phenfile(vector<vector<double>> & y)71 void gcta::save_phenfile(vector< vector<double> > &y)
72 {
73     string phenfile = _out + ".phen";
74     ofstream phen(phenfile.c_str());
75     if (!phen) throw ("Error: can not open the file [" + phenfile + "] to write.");
76     int i = 0, j = 0;
77     for (i = 0; i < _keep.size(); i++) {
78         phen << _fid[_keep[i]] << " " << _pid[_keep[i]] << " ";
79         for (j = 0; j < y.size(); j++) phen << y[j][i] << " ";
80         phen << endl;
81     }
82     phen.close();
83 }
84 
GWAS_simu(string bfile,int simu_num,string qtl_file,int case_num,int control_num,double hsq,double K,int seed,bool output_causal,bool simu_emb_flag,int eff_mod)85 void gcta::GWAS_simu(string bfile, int simu_num, string qtl_file, int case_num, int control_num, double hsq, double K, int seed, bool output_causal, bool simu_emb_flag, int eff_mod)
86 {
87     int i = 0, j = 0;
88     bool cc_flag = false;
89     if (case_num > 0 || control_num > 0) cc_flag = true;
90 
91     cout << "Simulation parameters:" << endl;
92     cout << "Number of simulation replicate(s) = " << simu_num << " (Default = 1)" << endl;
93     cout << "Heritability " << (cc_flag ? "of liability = " : " = ") << hsq << " (Default = 0.1)" << endl;
94     if (cc_flag) {
95         cout << "Disease prevalence = " << K << " (Default = 0.1)" << endl;
96         cout << "Number of cases = " << case_num << endl;
97         cout << "Number of controls = " << control_num << endl;
98     }
99     cout << endl;
100 
101     // Read QTL file
102     vector<string> qtl_name;
103     vector<int> qtl_pos, have_eff;
104     vector<double> qtl_eff;
105     int qtl_num = read_QTL_file(qtl_file, qtl_name, qtl_pos, qtl_eff, have_eff);
106     update_id_map_kp(qtl_name, _snp_name_map, _include);
107 
108     // Generate QTL effects
109     int Seed = -CommFunc::rand_seed();
110     if (hsq > 0.0) {
111         int num_gener_qtl_eff = 0;
112         for (i = 0; i < qtl_num; i++) {
113             if (have_eff[i] == 0) {
114                 qtl_eff[i] = StatFunc::gasdev(Seed);
115                 num_gener_qtl_eff++;
116             }
117         }
118         if (qtl_num - num_gener_qtl_eff > 0) cout << qtl_num - num_gener_qtl_eff << " user-specified QTL effects." << endl;
119         if (num_gener_qtl_eff > 0) cout << num_gener_qtl_eff << " unspecified QTL effects are generated from standard normal distribution." << endl;
120 
121         vector<string> vs_buf(qtl_num);
122         for (i = 0; i < qtl_num; i++) vs_buf[i] = _snp_name[_include[i]];
123         vector<int> indx;
124         StrFunc::match(vs_buf, qtl_name, indx);
125         qtl_name = vs_buf;
126         vector<double> qtl_eff_buf(qtl_eff);
127         vector<int> qtl_pos_buf(qtl_pos);
128         for (i = 0; i < qtl_num; i++){
129             qtl_eff[i] = qtl_eff_buf[indx[i]];
130             qtl_pos[i] = qtl_pos_buf[indx[i]];
131         }
132     } else {
133         qtl_eff.clear();
134         qtl_eff.resize(qtl_num, 0.0);
135     }
136 
137     // Calculate allele frequency
138     MatrixXf X;
139     make_XMat(X);
140     if(eff_mod == 0){
141         eigenVector sd_SNP;
142         std_XMat(X, sd_SNP, false, true, true);
143     }
144 
145     // Calculate Ve and threhold
146     double var_g = 0.0, var_e = 1.0;
147     vector<double> g(_keep.size());
148     if (hsq > 0.0) {
149         for (i = 0; i < _keep.size(); i++) {
150             for (j = 0; j < qtl_num; j++) g[i] += X(i,j) * qtl_eff[j];
151         }
152         var_g = CommFunc::var(g);
153         var_e = var_g * (1.0 / hsq - 1.0);
154     }
155     double sd_e = sqrt(var_e);
156 
157     // Output par file
158     output_simu_par(qtl_name, qtl_pos, qtl_eff, var_e + var_g);
159 
160     // Output phenotype file
161     cout << "Simulating GWAS based on the real genotyped data with " << simu_num << " replicate(s) ..." << endl;
162     vector< vector<double> > y(simu_num);
163     int case_num_buf = 0, control_num_buf = 0;
164     for (i = 0; i < simu_num; i++) {
165         y[i].resize(_keep.size());
166         for (j = 0; j < _keep.size(); j++) {
167             if (hsq < 1.0) y[i][j] = g[j] + sd_e * StatFunc::gasdev(Seed);
168             else y[i][j] = g[j];
169         }
170         if (cc_flag) {
171             case_num_buf = 0;
172             control_num_buf = 0;
173             vector<double> y_buf(y[i]);
174             stable_sort(y_buf.begin(), y_buf.end());
175             int n = (int) (_indi_num * (1.0 - K));
176             double Th = 0.5 * (y_buf[n] + y_buf[n - 1]);
177             for (j = 0; j < _keep.size(); j++) {
178                 if (y[i][j] > Th) {
179                     if (case_num_buf < case_num) {
180                         y[i][j] = 2;
181                         case_num_buf++;
182                     } else y[i][j] = -9;
183                 } else {
184                     if (control_num_buf < control_num) {
185                         y[i][j] = 1;
186                         control_num_buf++;
187                     } else y[i][j] = -9;
188                 }
189             }
190         }
191     }
192 
193     if (!simu_emb_flag) {
194         save_phenfile(y);
195         if (cc_flag) cout << "Simulated " << case_num_buf << " cases and " << control_num << " controls have been saved in [" + _out + ".phen" + "]." << endl;
196         else cout << "Simulated phenotypes of " << _keep.size() << " individuals have been saved in [" + _out + ".phen" + "]." << endl;
197     } else {
198         // emBayesB format
199         if (!output_causal) update_id_map_rm(qtl_name, _snp_name_map, _include);
200         string out_rstfile = _out + ".emb";
201         ofstream out_emBayesB(out_rstfile.c_str());
202         if (!out_emBayesB) throw ("Error: can not open the file [" + out_rstfile + "] to write.");
203         cout << "Saving the simulated data to the file [" + out_rstfile + "] (in emBayesB format)." << endl;
204         for (i = 0; i < _keep.size(); i++) {
205             if (y[0][i] == -9) continue;
206             out_emBayesB << _pid[_keep[i]] << " " << g[i] << " " << y[0][i] << endl;
207             for (j = 0; j < _include.size(); j++) {
208                 if (_snp_1[_include[j]][_keep[i]] && !_snp_2[_include[j]][_keep[i]]) out_emBayesB << _mu[_include[j]] << " ";
209                 else out_emBayesB << (double) (_snp_1[_include[j]][_keep[i]] + _snp_2[_include[j]][_keep[i]]) << " ";
210             }
211             out_emBayesB << endl;
212         }
213         out_emBayesB.close();
214         cout << "Simulated data (" << _keep.size() << " individuals and " << _include.size() << " SNPs) has been saved in [" + out_rstfile + "]." << endl;
215     }
216 }
217 
218 /////////////////////////////////////////
219 /* These codes are not used any more */
220 /////////////////////////////////////////
221 /*
222 void gcta::GenerCases(string bfile, string qtl_file, int case_num, int control_num, double hsq, double K, bool curr_popu, double gnrt)
223 {
224         int i=0, j=0, k=0;
225 
226         if(gnrt<1.0 || gnrt>1e5) throw("Error: --simu-gener should be within the range from 1 to 100000.");
227         if(hsq>1.0 || hsq<0.0) throw("Error: --simu-h2 should be within the range from 0 to 1.");
228         if(K>0.5 || K<0.0001) throw("Error: --simu-K should be within the range from 0.0001 to 0.5.");
229         if(!curr_popu && (case_num>1e5 || case_num<1)) throw("Error: --simu-cc, Invalid number of cases.");
230         if(!curr_popu && (control_num>1e6 || control_num<1)) throw("Error: --simu-cc, Invalid number of controls.");
231 
232         // Read bim file: recombination rate is defined between SNP i and SNP i-1
233         read_bimfile(bfile+".bim");
234         kosambi();
235 
236         // calcualte the accumulative recombination probability over xxx generation
237         for(i=0; i<_snp_num; i++) _rc_rate[i]=1.0-pow((double)(1.0-_rc_rate[i]), (double)gnrt);
238 
239     // Read QTL file
240     vector<string> qtl_name;
241         read_snplist(qtl_file, qtl_name);
242     vector<int> qtl_pos;
243         map<string, int>::iterator iter;
244         for(int i=0; i<qtl_name.size(); i++){
245         iter=_snp_name_map.find(qtl_name[i]);
246         if(iter!=_snp_name_map.end()) qtl_pos.push_back(iter->second);
247     }
248     stable_sort(qtl_pos.begin(), qtl_pos.end());
249         int qtl_num=qtl_pos.size();
250         cout<<qtl_num<<" SNPs (as QTLs) to be included from ["+qtl_file+"]."<<endl;
251 
252         // Read fam and bed files
253         read_famfile(bfile+".fam");
254         read_bedfile(bfile+".bed");
255 
256         // Generate QTL effects
257         int Seed=-CommFunc::rand_seed();
258         cout<<"Generating QTL effects ("<<"Random seed = "<<abs(Seed)<<")."<<endl;
259         vector<double> qtl_eff(qtl_num);
260         for(i=0; i<qtl_num && hsq>0.0; i++) qtl_eff[i]=StatFunc::gasdev(Seed);
261 
262         // Calculate Ve and threhold
263     int n=0, x=0;
264     double var_g=0.0, var_e=1.0;
265     vector<double> AF(qtl_num), y(_indi_num), y_backup;
266     if(hsq>0.0){
267         for(n=0; n<_indi_num; n++){
268             for(i=0; i<qtl_num; i++){
269                 j=qtl_pos[i];
270                 if(_snp_a[n][j] && !_snp_b[n][j]) continue;
271                 x=_snp_a[n][j]+_snp_b[n][j];
272                 AF[i]+=x;
273                 y[n]+=x*qtl_eff[i];
274             }
275         }
276         var_g=CommFunc::var(y);
277             var_e=var_g*(1.0/hsq-1.0);
278     }
279         double sd_e=sqrt(var_e);
280         for(n=0; n<_indi_num; n++) y[n]+=sd_e*StatFunc::gasdev(Seed);
281         if(curr_popu) y_backup=y;
282         stable_sort(y.begin(), y.end());
283         n=(int)(_indi_num*(1.0-K));
284         double Th=0.5*(y[n]+y[n-1]);
285 
286         // ouput par file
287         vector<double> qsq(qtl_num);
288         for(i=0; i<qtl_num; i++){
289         AF[i]/=_indi_num;
290         AF[i]*=0.5;
291         qsq[i]=2.0*AF[i]*(1.0-AF[i])*qtl_eff[i]*qtl_eff[i];
292         qsq[i]/=(var_e+var_g);
293     }
294         string out_parfile=_out+".par";
295         ofstream out_par(out_parfile.c_str());
296         if(!out_par) throw("Error: can not open par file ["+out_parfile+"] to write!");
297         cout<<"Writing simulation parameters to file ["<<out_parfile<<"]."<<endl;
298         for(i=0; i<qtl_num; i++) out_par<<qtl_name[i]<<"\t"<<AF[i]<<"\t"<<qtl_eff[i]<<"\t"<<qsq[i]<<endl;
299     out_par.close();
300 
301         // If generate cases and controls based on current population
302         if(curr_popu){
303             cout<<"Generating cases and control based on the current population."<<endl;
304         for(i=0; i<_indi_num; i++){
305             if(y_backup[i]>Th) _pheno[i]=2;
306             else _pheno[i]=1;
307         }
308         save_famfile();
309         return;
310         }
311 
312         // Generate population
313     int fa_hap_buf=0, mo_hap_buf, fa=0, mo=0;
314     double score=0.0;
315         vector< vector<int> > case_fa, case_mo, case_fa_hap, case_mo_hap, control_fa, control_mo, control_fa_hap, control_mo_hap;
316         cout<<"Generating population derived from the current population."<<endl;
317         while(case_fa.size()<case_num || control_fa.size()<control_num){
318         vector<int> fa_vbuf, mo_vbuf, fa_hap_vbuf, mo_hap_vbuf;
319             for(i=0; i<_snp_num; i++){
320             if(StatFunc::ran1(Seed)<_rc_rate[i]){
321                 fa_hap_vbuf.push_back(i);
322                 fa_vbuf.push_back((int)(StatFunc::ran1(Seed)*_indi_num*2));
323             }
324             if(StatFunc::ran1(Seed)<_rc_rate[i]){
325                 mo_hap_vbuf.push_back(i);
326                 mo_vbuf.push_back((int)(StatFunc::ran1(Seed)*_indi_num*2));
327             }
328         }
329         score=0.0;
330         for(i=0; i<qtl_num; i++){
331             j=qtl_pos[i];
332             fa_hap_buf=upper_bound(fa_hap_vbuf.begin(), fa_hap_vbuf.end(), j)-fa_hap_vbuf.begin()-1;
333             mo_hap_buf=upper_bound(mo_hap_vbuf.begin(), mo_hap_vbuf.end(), j)-mo_hap_vbuf.begin()-1;
334             fa=(int)(fa_vbuf[fa_hap_buf]*0.5);
335             mo=(int)(mo_vbuf[mo_hap_buf]*0.5);
336             if( (_snp_a[fa][j] && !_snp_b[fa][j]) || (_snp_a[mo][j] && !_snp_b[mo][j]) ) continue;
337             if(fa_vbuf[fa_hap_buf]%2==0) x=_snp_a[fa][j];
338             else x=_snp_b[fa][j];
339             if(mo_vbuf[mo_hap_buf]%2==0) x+=_snp_a[mo][j];
340             else x+=_snp_b[mo][j];
341             score+=x*qtl_eff[i];
342         }
343         score+=sd_e*StatFunc::gasdev(Seed);
344         if(score>Th){
345             if(case_fa.size()<case_num){
346                 case_fa.push_back(fa_vbuf);
347                 case_mo.push_back(mo_vbuf);
348                 case_fa_hap.push_back(fa_hap_vbuf);
349                 case_mo_hap.push_back(mo_hap_vbuf);
350             }
351         }
352         else{
353             if(control_fa.size()<control_num){
354                 control_fa.push_back(fa_vbuf);
355                 control_mo.push_back(mo_vbuf);
356                 control_fa_hap.push_back(fa_hap_vbuf);
357                 control_mo_hap.push_back(mo_hap_vbuf);
358             }
359         }
360     }
361         cout<<case_num<<" cases and "<<control_num<<" controls have been generated."<<endl;
362         _indi_num=case_num+control_num;
363 
364         // Output fam file
365         string out_famfile=_out+".fam";
366     ofstream out_fam(out_famfile.c_str());
367     if(!out_fam) throw("Error: can not open fam file ["+out_famfile+"] to write!");
368     cout<<"Writing IDs of cases and controls to file ["<<out_famfile<<"]."<<endl;
369     for(i=0; i<case_num; i++) out_fam<<i+1<<"\t"<<i+1<<"\t0\t0\t0\t2"<<endl;
370     for(i=case_num; i<_indi_num; i++) out_fam<<i+1<<"\t"<<i+1<<"\t0\t0\t0\t1"<<endl;
371     out_fam.close();
372 
373         // Output bed file
374         // Output first three bytes
375     // Output genotype in binary format
376     case_fa.insert(case_fa.end(), control_fa.begin(), control_fa.end());
377     case_mo.insert(case_mo.end(), control_mo.begin(), control_mo.end());
378     case_fa_hap.insert(case_fa_hap.end(), control_fa_hap.begin(), control_fa_hap.end());
379     case_mo_hap.insert(case_mo_hap.end(), control_mo_hap.begin(), control_mo_hap.end());
380     save_bedfile(case_fa, case_mo, case_fa_hap, case_mo_hap);
381 }
382 
383 
384 void gcta::kosambi()
385 {
386     int i=0;
387         double c=0.0;
388         _rc_rate.clear();
389         _rc_rate.resize(_snp_num);
390     for(i=0; i<_snp_num; i++){
391         if(i==0 || _chr[i]!=_chr[i-1]) _rc_rate[i]=0.5;
392         else{
393             c=exp(_genet_dst[i]*0.04);
394             _rc_rate[i]=0.5*(c-1.0)/(c+1.0);
395         }
396     }
397 }
398  */
399 
400 /*
401 void gcta::save_bedfile(vector< vector<int> > &fa_indx, vector< vector<int> > &mo_indx, vector< vector<int> > &fa_hap, vector< vector<int> > &mo_hap, bool GENOME)
402 {
403     bool fa_geno=false, mo_geno=false;
404     int i=0, pos=0, n=0, fa=0, mo=0, fa_hap_buf=0, mo_hap_buf=0;
405     string OutBedFile=_out+".bed";
406         fstream OutBed(OutBedFile.c_str(), ios::out|ios::binary);
407         if(!OutBed) throw("Error: can not open the file ["+OutBedFile+"] to write.");
408         cout<<"Writing genotypes to PLINK BED file ["+OutBedFile+"]."<<endl;
409         bitset<8> b;
410         char ch[1];
411     b.reset();
412     b.set(2);  b.set(3);  b.set(5);  b.set(6);
413     ch[0] = (char)b.to_ulong();
414     OutBed.write(ch,1);
415     b.reset();
416     b.set(0);  b.set(1);  b.set(3);  b.set(4);
417     ch[0] = (char)b.to_ulong();
418     OutBed.write(ch,1);
419     b.reset();
420     b.set(0);
421     ch[0] = (char)b.to_ulong();
422     OutBed.write(ch,1);
423     for(i=0; i<_snp_num; i++){
424         pos=0;
425         b.reset();
426         for(n=0; n<_indi_num; n++){
427             if(!GENOME){
428                 fa_hap_buf=upper_bound(fa_hap[n].begin(), fa_hap[n].end(), i)-fa_hap[n].begin()-1;
429                 mo_hap_buf=upper_bound(mo_hap[n].begin(), mo_hap[n].end(), i)-mo_hap[n].begin()-1;
430                 fa=(int)(fa_indx[n][fa_hap_buf]*0.5);
431                 mo=(int)(mo_indx[n][mo_hap_buf]*0.5);
432             }
433             else{
434                 fa=n;
435                 mo=n;
436             }
437                         if( (_snp_a[fa][i] && !_snp_b[fa][i]) || (_snp_a[mo][i] && !_snp_b[mo][i]) ){
438                 b[pos++]=1;
439                 b[pos++]=0;
440             }
441             else{
442                 if(!GENOME){
443                     if(fa_indx[n][fa_hap_buf]%2==0) fa_geno=_snp_a[fa][i];
444                     else fa_geno=_snp_b[fa][i];
445                     if(mo_indx[n][mo_hap_buf]%2==0) mo_geno=_snp_a[mo][i];
446                     else mo_geno=_snp_b[mo][i];
447                     fa_geno=(!fa_geno);
448                     mo_geno=(!mo_geno);
449                 }
450                 else{
451                    fa_geno=_snp_a[fa][i];
452                    mo_geno=_snp_b[mo][i];
453                 }
454                 b[pos++]=min(fa_geno, mo_geno);
455                 b[pos++]=max(fa_geno, mo_geno);
456             }
457             if(pos>7 || n==_indi_num-1){
458                 ch[0]=(char)b.to_ulong();
459                 OutBed.write(ch,1);
460                 pos=0;
461                 b.reset();
462             }
463         }
464     }
465     OutBed.close();
466 }
467 
468 void gcta::save_famfile()
469 {
470     string famfile=_out+".fam";
471         ofstream Fam(famfile.c_str());
472         if(!Fam) throw("Error: can not open the fam file "+famfile+" to save!");
473         cout<<"Writing PLINK FAM file to ["+famfile+"]."<<endl;
474         int i=0;
475         for(i=0; i<_indi_num; i++){
476                 Fam<<_fid[i]<<"\t"<<_pid[i]<<"\t"<<_fa_id[i]<<"\t"<<_mo_id[i]<<"\t"<<_sex[i]<<"\t"<<_pheno[i]<<endl;
477         }
478         Fam.close();
479         cout<<_indi_num<<" individuals to be saved to ["+famfile+"]."<<endl;
480 }
481 
482 void gcta::save_bimfile()
483 {
484         int i=0;
485         string bimfile=_out+".bim";
486         ofstream Bim(bimfile.c_str());
487         if(!Bim) throw("Error: can not open the file ["+bimfile+"] to write.");
488         cout<<"Writing PLINK bim file to ["<<bimfile<<"]."<<endl;
489         for(i=0; i<_snp_num; i++){
490                 Bim<<_chr[i]<<"\t"<<_snp_name[i]<<"\t"<<_genet_dst[i]<<"\t"<<_bp[i]<<"\t"<<_allele1[i]<<"\t"<<_allele2[i]<<endl;
491         }
492         Bim.close();
493         cout<<_snp_num<<" SNPs to be saved to ["<<bimfile<<"]."<<endl;
494 }
495  */
genet_dst(string bfile,string hapmap_genet_map)496 void gcta::genet_dst(string bfile, string hapmap_genet_map)
497 {
498     // Read bim file
499     read_bimfile(bfile + ".bim");
500     int snp_num = _snp_name.size();
501 
502     // Read HAPMAP genetic map files
503     int i = 0, j = 0;
504     string str_buf;
505     vector<string> vs_buf;
506     string genet_mapfile;
507     vector< vector< vector<double> > > hap_genet(_autosome_num);
508     for (i = 0; i < _autosome_num; i++) {
509         stringstream str_strm;
510         str_strm << hapmap_genet_map << i + 1 << "_CEU_b36.txt";
511         genet_mapfile = str_strm.str();
512         ifstream i_genet_map(genet_mapfile.c_str());
513         if (!i_genet_map) throw ("Error: can not open HAPMAP genetic map file " + genet_mapfile + "!");
514         hap_genet[i].resize(2);
515         getline(i_genet_map, str_buf);
516         while (getline(i_genet_map, str_buf)) {
517             if (StrFunc::split_string(str_buf, vs_buf) < 3) continue;
518             hap_genet[i][0].push_back(atof(vs_buf[0].c_str()));
519             hap_genet[i][1].push_back(atof(vs_buf[1].c_str()));
520         }
521         i_genet_map.close();
522     }
523 
524     // calculate genetic distance
525     int pos1 = 0, pos2 = 0;
526     double prev_bp = 0.0;
527     vector<double> dst(snp_num);
528     vector<double>::iterator iter1, iter2;
529     for (i = 0; i < snp_num; i++) {
530         if (i == 0 || _chr[i - 1] != _chr[i]) {
531             dst[i] = 0.0;
532             continue;
533         }
534         iter1 = upper_bound(hap_genet[_chr[i] - 1][0].begin(), hap_genet[_chr[i] - 1][0].end(), _bp[i - 1]);
535         iter2 = upper_bound(hap_genet[_chr[i] - 1][0].begin(), hap_genet[_chr[i] - 1][0].end(), _bp[i]);
536         if (iter1 == hap_genet[_chr[i] - 1][0].end() || iter2 == hap_genet[_chr[i] - 1][0].end()) {
537             dst[i] = _bp[i] - _bp[i - 1];
538             continue;
539         }
540         pos1 = iter1 - hap_genet[_chr[i] - 1][0].begin();
541         pos2 = iter2 - hap_genet[_chr[i] - 1][0].begin();
542         prev_bp = _bp[i - 1];
543         while (pos1 < pos2) {
544             dst[i] += (hap_genet[_chr[i] - 1][0][pos1] - prev_bp) * hap_genet[_chr[i] - 1][1][pos1 - 1];
545             prev_bp = hap_genet[_chr[i] - 1][0][pos1];
546             pos1++;
547         }
548         dst[i] += (_bp[i] - prev_bp) * hap_genet[_chr[i] - 1][1][pos1 - 1];
549     }
550 
551     // Output fam file
552     string out_bimfile = _out + ".genetdst";
553     ofstream out_bim(out_bimfile.c_str());
554     if (!out_bim) throw ("Error: can not open file " + out_bimfile + " to write!");
555     for (i = 0; i < snp_num; i++) out_bim << _chr[i] << "\t" << _snp_name[i] << "\t" << dst[i]*1e-6 << "\t" << _bp[i] << "\t" << _allele1[i] << "\t" << _allele2[i] << endl;
556     out_bim.close();
557     cout << "Genetic distances have been created, and been saved in [" + out_bimfile + "]." << endl;
558 }
559 
560 /*
561 void gcta::simu_genome(
562           string popSize, 						// effective population size of each population
563                   int nSubPOP, 							// number of populations
564                   vector<int> & nSubSample, 			// number of samples (haplotypes) draw from each populations
565                   int numPieces, 						// number of fragments for each sample (chromosome)
566                   int pieceLen, 						// length in base pair of each fragment
567                   int numIndepRegion, 					// number of independent regions (independent chromosome)
568                   int s,								// fixed number of SNPs want to simulate, randomly place s SNPs on the genealogy
569                   string rec,							// recombination rate between consecutive fragments per generation
570                   double mut, 							// mutation rate per generation per base pair
571                   double mig 							// migration rate per generation
572                   )
573 {
574     int seed=CommFunc::rand_seed();
575     vector< vector<bool> > data; // the return chromosome by connecting all independent chromosome into one long chromosome
576 
577     cout<<"\n*********************************************"<<endl;
578     cout<<"The following output is generated by the program GENOME\n"<<endl;
579     genome(popSize, nSubPOP, nSubSample, numPieces, pieceLen, numIndepRegion, s, rec, mut, mig, data, seed, false, false);
580     cout<<"\nEnd of output by GENOME"<<endl;
581     cout<<"*********************************************\n"<<endl;
582     _snp_a.clear();
583     _snp_b.clear();
584     _pheno.clear();
585     int i=0, j=0, k=0, index=0;
586     for(k=0; k<nSubPOP; k++){
587         for(i=0; i<nSubSample[k]; i++){
588             _snp_a.push_back(data[index++]);
589             _snp_b.push_back(data[index++]);
590             _pheno.push_back(k+1);
591             i++;
592             if(i==nSubSample[k]-2) break;
593         }
594     }
595     _indi_num=_snp_a.size();
596     _snp_num=_snp_a[0].size();
597 
598     // save famfile
599         stringstream strstrm;
600     _fid.clear();
601     _fid.resize(_indi_num);
602     _pid.clear();
603     _pid.resize(_indi_num);
604     _fa_id.clear();
605     _fa_id.resize(_indi_num);
606     _mo_id.clear();
607     _mo_id.resize(_indi_num);
608     _sex.clear();
609     _sex.resize(_indi_num);
610     for(i=0; i<_indi_num; i++){
611         strstrm.str("");
612         strstrm<<i+1;
613         _fid[i]=strstrm.str();
614                 _pid[i]=strstrm.str();
615                 _fa_id[i]="0";
616                 _mo_id[i]="0";
617         }
618         save_famfile();
619 
620         // save bimfile
621         _chr.clear();
622         _chr.resize(_snp_num);
623         _snp_name.clear();
624         _snp_name.resize(_snp_num);
625         _genet_dst.clear();
626         _genet_dst.resize(_snp_num);
627         _bp.clear();
628         _bp.resize(_snp_num);
629         _allele1.clear();
630         _allele1.resize(_snp_num);
631         _allele2.clear();
632         _allele2.resize(_snp_num);
633     for(i=0; i<_snp_num; i++){
634         _chr[i]=1;
635         strstrm.str("");
636         strstrm<<"SNP_"<<i+1;
637         _snp_name[i]=strstrm.str();
638         _allele1[i]='A';
639         _allele2[i]='C';
640     }
641     save_bimfile();
642 
643     // save bedfile
644     vector< vector<int> > tmp;
645     save_bedfile(tmp, tmp, tmp, tmp, true);
646 }
647  */
648 
649 /*
650 void gcta::simu_geno_unlinked(int N, int M, double maf)
651 {
652     int i=0, j=0, x=0;
653 
654     // fam file
655     _keep.resize(N);
656     _fid.resize(N);
657     _pid.resize(N);
658     _fa_id.resize(N);
659     _mo_id.resize(N);
660     _sex.resize(N);
661     _pheno.resize(N);
662     for(i=0; i<N; i++){
663         _keep[i]=i;
664         stringstream ss;
665         ss<<i+1;
666         _fid[i]=ss.str();
667         _pid[i]=_fid[i];
668         _fa_id[i]="0";
669         _mo_id[i]="0";
670         _sex[i]=-9;
671         _pid[i]=-9;
672     }
673 
674     // bim file
675     _include.resize(M);
676     _snp_num=M;
677     _chr.resize(M);
678     _snp_name.resize(M);
679         _bp.resize(M);
680     _genet_dst.resize(M);
681     _allele1.resize(M);
682     _allele2.resize(M);
683     _snp_1.resize(M);
684     _snp_2.resize(M);
685 
686 //double p=0.0;
687     std::tr1::minstd_rand eng;
688         eng.seed((unsigned int)time(NULL));
689 
690         cout<<"maf "<<maf<<endl;
691 
692     for(j=0; j<M; j++){
693         _include[j]=j;
694         stringstream ss;
695         ss<<"SNP"<<j+1;
696         _chr[j]=1;
697         _snp_name[j]=ss.str();
698         _bp[j]=j+1;
699         _genet_dst[j]=0.0;
700         _allele1[j]="A";
701         _allele2[j]="G";
702         _snp_1[j].resize(N);
703         _snp_2[j].resize(N);
704                 std::tr1::uniform_real<double> runiform(maf,1-maf);
705         double p = runiform(eng)/1.0e10;
706 
707                 //debug
708                 cout<<"p = "<<p<<endl;
709 
710         for(i=0; i<N; i++){
711             std::tr1::binomial_distribution<int, double> rbinom(2,0.5);
712             x=rbinom(eng);
713 
714                         //debug
715                         cout<<x<<"\t";
716 
717 
718             if(x==2) _snp_1[j][i]=_snp_2[j][i]=true;
719             else if(x==1){
720                 _snp_1[j][i]=false;
721                 _snp_2[j][i]=true;
722             }
723             else _snp_1[j][i]=_snp_2[j][i]=false;
724         }
725 
726                 //debug
727                 cout<<endl;
728     }
729 
730     save_plink();
731 }
732  */
733 
734