1 //
2 // popu_genet.cpp
3 // gcta
4 //
5 // Created by Jian Yang on 14/03/13.
6 //
7 //
8
9 #include "gcta.h"
10
11 // paa: proportion of ancestral alleles
paa(string aa_file)12 void gcta::paa(string aa_file)
13 {
14 check_autosome();
15 if(_mu.empty()) calcu_mu();
16 //rm_high_ld();
17
18 int i=0, j=0, k=0;
19
20 // read ancestral alleles from a file
21 ifstream i_aa(aa_file.c_str());
22 if(!i_aa) throw("Error: can not open the file ["+aa_file+"] to read.");
23 string cbuf=".";
24 string str_buf;
25 cout<<"Reading ancestral alleles of the SNPs from ["+aa_file+"]."<<endl;
26 map<string, int>::iterator iter, End=_snp_name_map.end();
27 vector<string> aa(_snp_num);
28 for(i=0; i<_snp_num; i++) aa[i]=".";
29 int icount=0;
30 while(i_aa){
31 i_aa>>str_buf;
32 if(i_aa.eof()) break;
33 iter=_snp_name_map.find(str_buf);
34 i_aa>>cbuf;
35 if(iter!=End && cbuf!="."){
36 aa[iter->second]=cbuf;
37 icount++;
38 }
39 }
40 i_aa.close();
41 cout<<"Ancestral alleles for "<<icount<<" SNPs are included from ["+aa_file+"]."<<endl;
42
43 cout<<"Calculating proportion of ancestral alleles ..."<<endl;
44 double x=0.0;
45 vector<double> hom_aa_rare(_keep.size()), hom_aa_comm(_keep.size()), hom_da_rare(_keep.size()), hom_da_comm(_keep.size()), het_aa_rare(_keep.size()), het_aa_comm(_keep.size()), nomiss(_keep.size());
46 for(i=0; i<_keep.size(); i++){
47 for(k=0; k<_include.size(); k++){
48 if(aa[_include[k]]==".") continue;
49 if(!_snp_1[_include[k]][_keep[i]] || _snp_2[_include[k]][_keep[i]]){
50 x=_snp_1[_include[k]][_keep[i]]+_snp_2[_include[k]][_keep[i]];
51 if(x<0.1){
52 if(_ref_A[_include[k]]==aa[_include[k]]){
53 if(_mu[_include[k]]>1.0) hom_da_rare[i]+=1.0;
54 else hom_da_comm[i]+=1.0;
55 }
56 else{
57 if(_mu[_include[k]]>1.0) hom_aa_rare[i]+=1.0;
58 else hom_aa_comm[i]+=1.0;
59 }
60 }
61 else if(x>1.9){
62 if(_ref_A[_include[k]]==aa[_include[k]]){
63 if(_mu[_include[k]]>1.0) hom_aa_comm[i]+=1.0;
64 else hom_aa_rare[i]+=1.0;
65 }
66 else{
67 if(_mu[_include[k]]>1.0) hom_da_comm[i]+=1.0;
68 else hom_da_rare[i]+=1.0;
69 }
70 }
71 else{
72 if(_ref_A[_include[k]]==aa[_include[k]]){
73 if(_mu[_include[k]]>1.0) het_aa_comm[i]+=1.0;
74 else het_aa_rare[i]+=1.0;
75 }
76 else{
77 if(_mu[_include[k]]>1.0) het_aa_rare[i]+=1.0;
78 else het_aa_comm[i]+=1.0;
79 }
80 }
81 nomiss[i]+=1.0;
82 }
83 }
84 hom_aa_rare[i]/=nomiss[i];
85 hom_aa_comm[i]/=nomiss[i];
86 hom_da_rare[i]/=nomiss[i];
87 hom_da_comm[i]/=nomiss[i];
88 het_aa_rare[i]/=nomiss[i];
89 het_aa_comm[i]/=nomiss[i];
90 cout<<i+1<<" of "<<_keep.size()<<" individuals.\r";
91 }
92
93 // Save matrix A in binary file
94 string paa_file=_out+".paa";
95 ofstream o_paa(paa_file.c_str());
96 if(!o_paa) throw("Error: can not open the file ["+paa_file+"] to write.");
97 o_paa<<"FID\tIID\tNOMISS\tHOM_AA_RARE\tHOM_AA_COMM\tHOM_DA_RARE\tHOM_DA_COMM\tHET_AA_RARE\tHET_AA_COMM"<<endl;
98 for(i=0; i<_keep.size(); i++) o_paa<<_fid[i]<<"\t"<<_pid[i]<<"\t"<<nomiss[i]<<"\t"<<hom_aa_rare[i]<<"\t"<<hom_aa_comm[i]<<"\t"<<hom_da_rare[i]<<"\t"<<hom_da_comm[i]<<"\t"<<het_aa_rare[i]<<"\t"<<het_aa_comm[i]<<endl;
99 o_paa.close();
100 cout<<"Proportion of ancestral alleles has been saved in file ["+paa_file+"]."<<endl;
101 }
102
103 // inbreeding coefficient
ibc(bool ibc_all)104 void gcta::ibc(bool ibc_all)
105 {
106 check_autosome();
107 if(_mu.empty()) calcu_mu();
108 //rm_high_ld();
109
110 int i=0, j=0, k=0;
111
112 // Calcuate A matrix
113 cout<<"Calculating the inbreeding coefficients ..."<<endl;
114 vector<double> h(_include.size()), h_i(_include.size()), w(_include.size()), p(_include.size()), p_q(_include.size()), q_p(_include.size()); // variance of each SNP, 2pq
115 #pragma omp parallel for
116 for(j=0; j<_include.size(); j++){
117 p[j]=0.5*_mu[_include[j]];
118 h[j]=2.0*p[j]*(1.0-p[j]);
119 p_q[j]=p[j]/(1.0-p[j]);
120 if(fabs(p_q[j])<1.0e-50) q_p[j]=0.0;
121 else q_p[j]=1.0/p_q[j];
122 w[j]=h[j]/(1.0-h[j]);
123 if(fabs(h[j])<1.0e-50) h_i[j]=0.0;
124 else h_i[j]=1.0/h[j];
125 }
126 vector<double> Fhat1, Fhat1_w, Fhat2, Fhat2_w, Fhat3, Fhat4, Fhat5, Fhat6, Fhat7, rare_hom, comm_hom, nomiss;
127 Fhat1.resize(_keep.size());
128 Fhat2.resize(_keep.size());
129 Fhat3.resize(_keep.size());
130 nomiss.resize(_keep.size());
131 if(ibc_all){
132 Fhat4.resize(_keep.size());
133 Fhat5.resize(_keep.size());
134 Fhat6.resize(_keep.size());
135 Fhat7.resize(_keep.size());
136 Fhat1_w.resize(_keep.size());
137 Fhat2_w.resize(_keep.size());
138 rare_hom.resize(_keep.size());
139 comm_hom.resize(_keep.size());
140 }
141 #pragma omp parallel for private(k)
142 for(i=0; i<_keep.size(); i++){
143 double x=0.0, sum_w=0.0, sum_h=0.0, Fhat_buf=0.0;
144 for(k=0; k<_include.size(); k++){
145 if(!_snp_1[_include[k]][_keep[i]] || _snp_2[_include[k]][_keep[i]]){
146 x=_snp_1[_include[k]][_keep[i]]+_snp_2[_include[k]][_keep[i]];
147 if(_allele2[_include[k]]==_ref_A[_include[k]]) x=2.0-x;
148 Fhat_buf=(x-_mu[_include[k]])*(x-_mu[_include[k]]);
149 if(ibc_all) Fhat4[i]+=Fhat_buf;
150 Fhat_buf*=h_i[k];
151 Fhat1[i]+=Fhat_buf;
152 if(ibc_all) Fhat1_w[i]+=Fhat_buf*w[k];
153 Fhat_buf=h[k]-x*(2.0-x);
154 if(ibc_all) Fhat6[i]+=Fhat_buf;
155 Fhat_buf*=h_i[k];
156 Fhat2[i]+=Fhat_buf;
157 if(ibc_all) Fhat2_w[i]+=Fhat_buf*w[k];
158 Fhat_buf=(x*(x-1.0-_mu[_include[k]])+_mu[_include[k]]*p[k]);
159 Fhat3[i]+=Fhat_buf*h_i[k];
160 if(ibc_all){
161 Fhat5[i]+=Fhat_buf;
162 sum_w+=w[k];
163 sum_h+=h[k];
164 if(x<0.1) Fhat7[i]+=p_q[k];
165 else if(x>1.9) Fhat7[i]+=q_p[k];
166 // Count the number of rare and common homozygotes
167 if(x<0.1){
168 if(p[k]>0.5) rare_hom[i]+=1.0;
169 else comm_hom[i]+=1.0;
170 }
171 else if(x>1.9){
172 if(p[k]<0.5) rare_hom[i]+=1.0;
173 else comm_hom[i]+=1.0;
174 }
175 }
176 nomiss[i]+=1.0;
177 }
178 }
179 Fhat1[i]/=nomiss[i];
180 Fhat2[i]/=nomiss[i];
181 Fhat3[i]/=nomiss[i];
182 if(ibc_all){
183 Fhat1_w[i]/=sum_w;
184 Fhat2_w[i]/=sum_w;
185 Fhat4[i]/=sum_h;
186 Fhat5[i]/=sum_h;
187 Fhat6[i]/=sum_h;
188 rare_hom[i]/=nomiss[i];
189 comm_hom[i]/=nomiss[i];
190 Fhat7[i]/=nomiss[i];
191 }
192 }
193
194 // Save matrix A in binary file
195 string ibc_file=_out+".ibc";
196 ofstream o_ibc(ibc_file.c_str());
197 if(!o_ibc) throw("Error: can not open the file ["+ibc_file+"] to write.");
198 if(ibc_all){
199 o_ibc<<"FID\tIID\tNOMISS\tP_RARE_HOM\tP_COMM_HOM\tFhat1\tFhat1_w\tFhat2\tFhat2_w\tFhat3\tFhat4\tFhat5\tFhat6\tFhat7"<<endl;
200 for(i=0; i<_keep.size(); i++) o_ibc<<_fid[_keep[i]]<<"\t"<<_pid[_keep[i]]<<"\t"<<nomiss[i]<<"\t"<<rare_hom[i]<<"\t"<<comm_hom[i]<<"\t"<<Fhat1[i]-1.0<<"\t"<<Fhat1_w[i]-1.0<<"\t"<<Fhat2[i]<<"\t"<<Fhat2_w[i]<<"\t"<<Fhat3[i]<<"\t"<<Fhat4[i]-1.0<<"\t"<<Fhat5[i]<<"\t"<<Fhat6[i]<<"\t"<<Fhat7[i]<<endl;
201 }
202 else{
203 o_ibc<<"FID\tIID\tNOMISS\tFhat1\tFhat2\tFhat3"<<endl;
204 for(i=0; i<_keep.size(); i++) o_ibc<<_fid[_keep[i]]<<"\t"<<_pid[_keep[i]]<<"\t"<<nomiss[i]<<"\t"<<Fhat1[i]-1.0<<"\t"<<Fhat2[i]<<"\t"<<Fhat3[i]<<endl;
205 }
206 o_ibc.close();
207 cout<<"Inbreeding coefficients have been saved in the file ["+ibc_file+"]."<<endl;
208 }
209
Fst(string filename)210 void gcta::Fst(string filename)
211 {
212 vector<string> subpopu, subpopu_name;
213 read_subpopu(filename, subpopu, subpopu_name);
214
215 eigenMatrix S;
216 coeff_mat(subpopu, S, "Error: too many subpopulations specified in the file ["+filename+"].", "Error: at least two sub-populations are required.");
217 int r=subpopu_name.size();
218 eigenVector x(_keep.size()), tmp, n_sub(r), xt_S, p_sub(r), h_sub(r);//Fst(_include.size()), f_all(_include.size()), chisq(_include.size()), pval=eigenVector::Constant(_include.size(), 1.0);
219 if(_mu.empty()) calcu_mu();
220
221 int i = 0, j = 0;
222 for(i=0; i<r; i++) n_sub[i]=S.col(i).sum();
223 double n_bar = n_sub.mean();
224 double n_bar_r = n_bar * r;
225 double n_c = (n_bar_r - n_sub.squaredNorm() / n_bar_r) / (r - 1.0);
226
227 string outfile=_out+".fst";
228 cout<<"\nSaving the Fst test results"<<_include.size()<<" SNPs to ["+outfile+"] ..."<<endl;
229 ofstream ofile(outfile.c_str());
230 if(!ofile) throw("Can not open the file ["+outfile+"] to write.");
231 ofile<<"Chr\tSNP\tbp\trefA\t";
232 for(i=0; i<r; i++) ofile<<"freq_"<<subpopu_name[i]<<"(n="<<n_sub[i]<<")\t";
233 ofile<<"Fst"<<endl;
234 double p_bar=0.0, s_sq=0.0, h_bar=0.0, a=0.0, b=0.0, c=0.0, Fst=0.0, d_buf=0.0;
235 for(i=0; i<_include.size(); i++){
236 ofile<<_chr[_include[i]]<<"\t"<<_snp_name[_include[i]]<<"\t"<<_bp[_include[i]]<<"\t"<<_ref_A[_include[i]]<<"\t";
237 makex_eigenVector(i, x, false, false);
238 for(j=0; j<r; j++){
239 p_sub[j]=x.dot(S.col(j))*0.5/n_sub[j];
240 ofile<<p_sub[j]<<"\t";
241 }
242 p_bar = p_sub.dot(n_sub) / n_bar_r;
243 tmp = p_sub.array() - p_bar;
244 s_sq = (tmp.array() * tmp.array() * n_sub.array()).sum() / ((r - 1.0) * n_bar);
245 h_sub = 2 * p_sub.array() * (1.0 - p_sub.array());
246 h_bar = h_sub.dot(n_sub) / n_bar_r;
247 d_buf = p_bar * (1.0 - p_bar) - (r - 1.0) * s_sq / r;
248 a = (n_bar / n_c) * (s_sq - (1.0 / (n_bar - 1.0)) * (d_buf - 0.25 * h_bar));
249 b = (n_bar / (n_bar - 1.0)) * (d_buf - (2 * n_bar - 1.0) * h_bar / (4 * n_bar));
250 c = 0.5 * h_bar;
251
252 Fst= a / (a + b + c);
253
254 /*double p_mean = p_sub.mean();
255 tmp = p_sub.array() - p_mean;
256 double Fst_fixed = (tmp.array() * tmp.array() * n_sub.array()).sum();
257 Fst_fixed = Fst_fixed / (p_mean * (1.0 - p_mean));
258 Fst_fixed = Fst_fixed / ((r - 1.0) * n_bar);*/
259
260 ofile<<Fst<<endl;
261 }
262 ofile.close();
263 }
264
read_subpopu(string filename,vector<string> & subpopu,vector<string> & subpopu_name)265 void gcta::read_subpopu(string filename, vector<string> &subpopu, vector<string> &subpopu_name)
266 {
267 cout<<"Reading sub-population information from ["+filename+"]."<<endl;
268 ifstream ifstream_subpopu(filename.c_str());
269 if(!ifstream_subpopu) throw("Error: can not open the file ["+filename+"] to read.");
270
271 vector<string> ID;
272 vector< vector<string> > subpopu_buf;
273 read_fac(ifstream_subpopu, ID, subpopu_buf);
274 update_id_map_kp(ID, _id_map, _keep);
275 cout<<"Sub-population information for "<<_keep.size()<<" individuals matched to the genotype data."<<endl;
276
277 int i=0, j=0;
278 map<string, int> uni_id_map;
279 map<string, int>::iterator iter;
280 for(i=0; i<_keep.size(); i++) uni_id_map.insert(pair<string,int>(_fid[_keep[i]]+":"+_pid[_keep[i]], i));
281 subpopu.clear();
282 subpopu.resize(_keep.size());
283 for(i=0; i<ID.size(); i++){
284 iter=uni_id_map.find(ID[i]);
285 if(iter!=uni_id_map.end()) subpopu[iter->second]=subpopu_buf[i][0];
286 }
287
288 subpopu_name.clear();
289 subpopu_name=subpopu;
290 stable_sort(subpopu_name.begin(), subpopu_name.end());
291 subpopu_name.erase(unique(subpopu_name.begin(), subpopu_name.end()), subpopu_name.end());
292 }
293
std_XMat_subpopu(string subpopu_file,MatrixXf & X,eigenVector & sd_SNP,bool grm_xchr_flag,bool miss_with_mu,bool divid_by_std)294 void gcta::std_XMat_subpopu(string subpopu_file, MatrixXf &X, eigenVector &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std)
295 {
296 vector<string> subpopu, subpopu_name;
297 read_subpopu(subpopu_file, subpopu, subpopu_name);
298
299 eigenMatrix S;
300 coeff_mat(subpopu, S, "Error: too many subpopulations specified in the file ["+subpopu_file+"].", "Error: at least two sub-populations are required.");
301 int popu_num = subpopu_name.size();
302
303 if (_mu.empty()) calcu_mu();
304
305 unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
306 sd_SNP.resize(m);
307 if (_dosage_flag) {
308 for (j = 0; j < m; j++) sd_SNP[j] = (X.col(j) - VectorXf::Constant(n, _mu[_include[j]])).squaredNorm() / (n - 1.0);
309 }
310 else {
311 for (j = 0; j < m; j++) sd_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
312 }
313 if (divid_by_std) {
314 for (j = 0; j < m; j++) {
315 if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
316 else sd_SNP[j] = sqrt(1.0 / sd_SNP[j]);
317 }
318 }
319
320 int popu = 0;
321 vector<eigenVector> sd_SNP_sub(popu_num), mu_SNP_sub(popu_num);
322 for(popu = 0; popu < popu_num; popu++) {
323 (mu_SNP_sub[popu]).resize(m);
324 (sd_SNP_sub[popu]).resize(m);
325 #pragma omp parallel for private(i)
326 for (j = 0; j < m; j++){
327 double m_sub = 0.0;
328 (mu_SNP_sub[popu])(j) = 0.0;
329 for(i = 0; i < n; i++){
330 if(X(i,j) < 1e5 && S(i,popu)>0.0){
331 m_sub += 1.0;
332 (mu_SNP_sub[popu])(j) += X(i,j);
333 }
334 }
335 (mu_SNP_sub[popu])(j) /= m_sub;
336 if(divid_by_std){
337 if(_dosage_flag){
338 (sd_SNP_sub[popu])(j) = 0.0;
339 for(i = 0; i < n; i++){
340 if(X(i,j) < 1e5 && S(i,popu)>0.0) (sd_SNP_sub[popu])(j) += (X(i,j) - (mu_SNP_sub[popu])(j)) * (X(i,j) - (mu_SNP_sub[popu])(j));
341 }
342 (sd_SNP_sub[popu])(j) /= (m_sub - 1.0);
343 }
344 else (sd_SNP_sub[popu])(j) = (mu_SNP_sub[popu])(j)*(1.0 - 0.5 * (mu_SNP_sub[popu])(j));
345 if ((sd_SNP_sub[popu])(j) < 1.0e-50) (sd_SNP_sub[popu])(j) = 0.0;
346 else (sd_SNP_sub[popu])(j) = 1.0 / sqrt((sd_SNP_sub[popu])(j));
347 }
348 }
349 }
350
351 for(popu = 0; popu < popu_num; popu++) {
352 #pragma omp parallel for private(j)
353 for(i = 0; i < n; i++){
354 if(S(i,popu)>0.0){
355 for (j = 0; j < m; j++){
356 if(X(i,j) < 1e5){
357 X(i,j) -= (mu_SNP_sub[popu])(j);
358 if(divid_by_std) X(i,j) *= (sd_SNP_sub[popu])(j);
359 }
360 else if (miss_with_mu) X(i,j) = 0.0;
361 }
362 }
363 }
364 }
365
366 if (!grm_xchr_flag) return;
367
368 // for the X-chromosome
369 check_sex();
370 double f_buf = sqrt(0.5);
371
372 #pragma omp parallel for private(j)
373 for (i = 0; i < n; i++) {
374 if (_sex[_keep[i]] == 1) {
375 for (j = 0; j < m; j++) {
376 if (X(i,j) < 1e5) X(i,j) *= f_buf;
377 else if (miss_with_mu) X(i,j) = 0.0;
378 }
379 }
380 }
381 }
382
383