1 //
2 //  popu_genet.cpp
3 //  gcta
4 //
5 //  Created by Jian Yang on 14/03/13.
6 //
7 //
8 
9 #include "gcta.h"
10 
11 // paa: proportion of ancestral alleles
paa(string aa_file)12 void gcta::paa(string aa_file)
13 {
14     check_autosome();
15     if(_mu.empty()) calcu_mu();
16 	//rm_high_ld();
17 
18     int i=0, j=0, k=0;
19 
20     // read ancestral alleles from a file
21     ifstream i_aa(aa_file.c_str());
22     if(!i_aa) throw("Error: can not open the file ["+aa_file+"] to read.");
23     string cbuf=".";
24     string str_buf;
25 	cout<<"Reading ancestral alleles of the SNPs from ["+aa_file+"]."<<endl;
26     map<string, int>::iterator iter, End=_snp_name_map.end();
27     vector<string> aa(_snp_num);
28     for(i=0; i<_snp_num; i++) aa[i]=".";
29     int icount=0;
30 	while(i_aa){
31 		i_aa>>str_buf;
32 		if(i_aa.eof()) break;
33 		iter=_snp_name_map.find(str_buf);
34 		i_aa>>cbuf;
35 		if(iter!=End && cbuf!="."){
36 		    aa[iter->second]=cbuf;
37 		    icount++;
38 		}
39 	}
40     i_aa.close();
41 	cout<<"Ancestral alleles for "<<icount<<" SNPs are included from ["+aa_file+"]."<<endl;
42 
43 	cout<<"Calculating proportion of ancestral alleles ..."<<endl;
44 	double x=0.0;
45 	vector<double> hom_aa_rare(_keep.size()), hom_aa_comm(_keep.size()), hom_da_rare(_keep.size()), hom_da_comm(_keep.size()), het_aa_rare(_keep.size()), het_aa_comm(_keep.size()), nomiss(_keep.size());
46 	for(i=0; i<_keep.size(); i++){
47  		for(k=0; k<_include.size(); k++){
48  		    if(aa[_include[k]]==".") continue;
49             if(!_snp_1[_include[k]][_keep[i]] || _snp_2[_include[k]][_keep[i]]){
50                 x=_snp_1[_include[k]][_keep[i]]+_snp_2[_include[k]][_keep[i]];
51                 if(x<0.1){
52                     if(_ref_A[_include[k]]==aa[_include[k]]){
53                         if(_mu[_include[k]]>1.0) hom_da_rare[i]+=1.0;
54                         else hom_da_comm[i]+=1.0;
55                     }
56                     else{
57                         if(_mu[_include[k]]>1.0) hom_aa_rare[i]+=1.0;
58                         else hom_aa_comm[i]+=1.0;
59                     }
60                 }
61                 else if(x>1.9){
62                     if(_ref_A[_include[k]]==aa[_include[k]]){
63                         if(_mu[_include[k]]>1.0) hom_aa_comm[i]+=1.0;
64                         else hom_aa_rare[i]+=1.0;
65                     }
66                     else{
67                         if(_mu[_include[k]]>1.0) hom_da_comm[i]+=1.0;
68                         else hom_da_rare[i]+=1.0;
69                     }
70                 }
71                 else{
72                     if(_ref_A[_include[k]]==aa[_include[k]]){
73                         if(_mu[_include[k]]>1.0) het_aa_comm[i]+=1.0;
74                         else het_aa_rare[i]+=1.0;
75                     }
76                     else{
77                         if(_mu[_include[k]]>1.0) het_aa_rare[i]+=1.0;
78                         else het_aa_comm[i]+=1.0;
79                     }
80                 }
81                 nomiss[i]+=1.0;
82             }
83         }
84         hom_aa_rare[i]/=nomiss[i];
85         hom_aa_comm[i]/=nomiss[i];
86         hom_da_rare[i]/=nomiss[i];
87         hom_da_comm[i]/=nomiss[i];
88         het_aa_rare[i]/=nomiss[i];
89         het_aa_comm[i]/=nomiss[i];
90         cout<<i+1<<" of "<<_keep.size()<<" individuals.\r";
91 	}
92 
93     // Save matrix A in binary file
94 	string paa_file=_out+".paa";
95 	ofstream o_paa(paa_file.c_str());
96 	if(!o_paa) throw("Error: can not open the file ["+paa_file+"] to write.");
97 	o_paa<<"FID\tIID\tNOMISS\tHOM_AA_RARE\tHOM_AA_COMM\tHOM_DA_RARE\tHOM_DA_COMM\tHET_AA_RARE\tHET_AA_COMM"<<endl;
98 	for(i=0; i<_keep.size(); i++) o_paa<<_fid[i]<<"\t"<<_pid[i]<<"\t"<<nomiss[i]<<"\t"<<hom_aa_rare[i]<<"\t"<<hom_aa_comm[i]<<"\t"<<hom_da_rare[i]<<"\t"<<hom_da_comm[i]<<"\t"<<het_aa_rare[i]<<"\t"<<het_aa_comm[i]<<endl;
99 	o_paa.close();
100 	cout<<"Proportion of ancestral alleles has been saved in file ["+paa_file+"]."<<endl;
101 }
102 
103 // inbreeding coefficient
ibc(bool ibc_all)104 void gcta::ibc(bool ibc_all)
105 {
106     check_autosome();
107     if(_mu.empty()) calcu_mu();
108 	//rm_high_ld();
109 
110     int i=0, j=0, k=0;
111 
112 	// Calcuate A matrix
113 	cout<<"Calculating the inbreeding coefficients ..."<<endl;
114 	vector<double> h(_include.size()), h_i(_include.size()), w(_include.size()), p(_include.size()), p_q(_include.size()), q_p(_include.size()); // variance of each SNP, 2pq
115 	#pragma omp parallel for
116     for(j=0; j<_include.size(); j++){
117 	    p[j]=0.5*_mu[_include[j]];
118         h[j]=2.0*p[j]*(1.0-p[j]);
119         p_q[j]=p[j]/(1.0-p[j]);
120         if(fabs(p_q[j])<1.0e-50) q_p[j]=0.0;
121         else q_p[j]=1.0/p_q[j];
122         w[j]=h[j]/(1.0-h[j]);
123         if(fabs(h[j])<1.0e-50) h_i[j]=0.0;
124         else h_i[j]=1.0/h[j];
125 	}
126 	vector<double> Fhat1, Fhat1_w, Fhat2, Fhat2_w, Fhat3, Fhat4, Fhat5, Fhat6, Fhat7, rare_hom, comm_hom, nomiss;
127     Fhat1.resize(_keep.size());
128     Fhat2.resize(_keep.size());
129     Fhat3.resize(_keep.size());
130     nomiss.resize(_keep.size());
131     if(ibc_all){
132         Fhat4.resize(_keep.size());
133         Fhat5.resize(_keep.size());
134         Fhat6.resize(_keep.size());
135         Fhat7.resize(_keep.size());
136         Fhat1_w.resize(_keep.size());
137         Fhat2_w.resize(_keep.size());
138         rare_hom.resize(_keep.size());
139         comm_hom.resize(_keep.size());
140     }
141     #pragma omp parallel for private(k)
142     for(i=0; i<_keep.size(); i++){
143         double x=0.0, sum_w=0.0, sum_h=0.0, Fhat_buf=0.0;
144 		for(k=0; k<_include.size(); k++){
145             if(!_snp_1[_include[k]][_keep[i]] || _snp_2[_include[k]][_keep[i]]){
146                 x=_snp_1[_include[k]][_keep[i]]+_snp_2[_include[k]][_keep[i]];
147                 if(_allele2[_include[k]]==_ref_A[_include[k]]) x=2.0-x;
148                 Fhat_buf=(x-_mu[_include[k]])*(x-_mu[_include[k]]);
149                 if(ibc_all) Fhat4[i]+=Fhat_buf;
150                 Fhat_buf*=h_i[k];
151                 Fhat1[i]+=Fhat_buf;
152                 if(ibc_all) Fhat1_w[i]+=Fhat_buf*w[k];
153                 Fhat_buf=h[k]-x*(2.0-x);
154                 if(ibc_all) Fhat6[i]+=Fhat_buf;
155                 Fhat_buf*=h_i[k];
156                 Fhat2[i]+=Fhat_buf;
157                 if(ibc_all) Fhat2_w[i]+=Fhat_buf*w[k];
158                 Fhat_buf=(x*(x-1.0-_mu[_include[k]])+_mu[_include[k]]*p[k]);
159                 Fhat3[i]+=Fhat_buf*h_i[k];
160                 if(ibc_all){
161                     Fhat5[i]+=Fhat_buf;
162                     sum_w+=w[k];
163                     sum_h+=h[k];
164                     if(x<0.1) Fhat7[i]+=p_q[k];
165                     else if(x>1.9) Fhat7[i]+=q_p[k];
166                     // Count the number of rare and common homozygotes
167                     if(x<0.1){
168                         if(p[k]>0.5) rare_hom[i]+=1.0;
169                         else comm_hom[i]+=1.0;
170                     }
171                     else if(x>1.9){
172                         if(p[k]<0.5) rare_hom[i]+=1.0;
173                         else comm_hom[i]+=1.0;
174                     }
175                 }
176                 nomiss[i]+=1.0;
177             }
178         }
179         Fhat1[i]/=nomiss[i];
180         Fhat2[i]/=nomiss[i];
181         Fhat3[i]/=nomiss[i];
182         if(ibc_all){
183             Fhat1_w[i]/=sum_w;
184             Fhat2_w[i]/=sum_w;
185             Fhat4[i]/=sum_h;
186             Fhat5[i]/=sum_h;
187             Fhat6[i]/=sum_h;
188             rare_hom[i]/=nomiss[i];
189             comm_hom[i]/=nomiss[i];
190             Fhat7[i]/=nomiss[i];
191         }
192 	}
193 
194     // Save matrix A in binary file
195 	string ibc_file=_out+".ibc";
196 	ofstream o_ibc(ibc_file.c_str());
197 	if(!o_ibc) throw("Error: can not open the file ["+ibc_file+"] to write.");
198 	if(ibc_all){
199         o_ibc<<"FID\tIID\tNOMISS\tP_RARE_HOM\tP_COMM_HOM\tFhat1\tFhat1_w\tFhat2\tFhat2_w\tFhat3\tFhat4\tFhat5\tFhat6\tFhat7"<<endl;
200         for(i=0; i<_keep.size(); i++) o_ibc<<_fid[_keep[i]]<<"\t"<<_pid[_keep[i]]<<"\t"<<nomiss[i]<<"\t"<<rare_hom[i]<<"\t"<<comm_hom[i]<<"\t"<<Fhat1[i]-1.0<<"\t"<<Fhat1_w[i]-1.0<<"\t"<<Fhat2[i]<<"\t"<<Fhat2_w[i]<<"\t"<<Fhat3[i]<<"\t"<<Fhat4[i]-1.0<<"\t"<<Fhat5[i]<<"\t"<<Fhat6[i]<<"\t"<<Fhat7[i]<<endl;
201 	}
202 	else{
203         o_ibc<<"FID\tIID\tNOMISS\tFhat1\tFhat2\tFhat3"<<endl;
204         for(i=0; i<_keep.size(); i++) o_ibc<<_fid[_keep[i]]<<"\t"<<_pid[_keep[i]]<<"\t"<<nomiss[i]<<"\t"<<Fhat1[i]-1.0<<"\t"<<Fhat2[i]<<"\t"<<Fhat3[i]<<endl;
205 	}
206 	o_ibc.close();
207 	cout<<"Inbreeding coefficients have been saved in the file ["+ibc_file+"]."<<endl;
208 }
209 
Fst(string filename)210 void gcta::Fst(string filename)
211 {
212     vector<string> subpopu, subpopu_name;
213     read_subpopu(filename, subpopu, subpopu_name);
214 
215     eigenMatrix S;
216     coeff_mat(subpopu, S, "Error: too many subpopulations specified in the file ["+filename+"].", "Error: at least two sub-populations are required.");
217     int r=subpopu_name.size();
218     eigenVector x(_keep.size()), tmp, n_sub(r), xt_S, p_sub(r), h_sub(r);//Fst(_include.size()),  f_all(_include.size()), chisq(_include.size()), pval=eigenVector::Constant(_include.size(), 1.0);
219     if(_mu.empty()) calcu_mu();
220 
221     int i = 0, j = 0;
222     for(i=0; i<r; i++) n_sub[i]=S.col(i).sum();
223     double n_bar = n_sub.mean();
224     double n_bar_r = n_bar * r;
225     double n_c = (n_bar_r - n_sub.squaredNorm() / n_bar_r) / (r - 1.0);
226 
227     string outfile=_out+".fst";
228     cout<<"\nSaving the Fst test results"<<_include.size()<<" SNPs to ["+outfile+"] ..."<<endl;
229     ofstream ofile(outfile.c_str());
230     if(!ofile) throw("Can not open the file ["+outfile+"] to write.");
231     ofile<<"Chr\tSNP\tbp\trefA\t";
232     for(i=0; i<r; i++) ofile<<"freq_"<<subpopu_name[i]<<"(n="<<n_sub[i]<<")\t";
233     ofile<<"Fst"<<endl;
234     double p_bar=0.0, s_sq=0.0, h_bar=0.0, a=0.0, b=0.0, c=0.0, Fst=0.0, d_buf=0.0;
235     for(i=0; i<_include.size(); i++){
236         ofile<<_chr[_include[i]]<<"\t"<<_snp_name[_include[i]]<<"\t"<<_bp[_include[i]]<<"\t"<<_ref_A[_include[i]]<<"\t";
237         makex_eigenVector(i, x, false, false);
238         for(j=0; j<r; j++){
239             p_sub[j]=x.dot(S.col(j))*0.5/n_sub[j];
240             ofile<<p_sub[j]<<"\t";
241         }
242         p_bar = p_sub.dot(n_sub) / n_bar_r;
243         tmp = p_sub.array() - p_bar;
244         s_sq = (tmp.array() * tmp.array() * n_sub.array()).sum() / ((r - 1.0) * n_bar);
245         h_sub = 2 * p_sub.array() * (1.0 - p_sub.array());
246         h_bar = h_sub.dot(n_sub) / n_bar_r;
247         d_buf = p_bar * (1.0 - p_bar) - (r - 1.0) * s_sq / r;
248         a = (n_bar / n_c) * (s_sq - (1.0 / (n_bar - 1.0)) * (d_buf - 0.25 * h_bar));
249         b = (n_bar / (n_bar - 1.0)) * (d_buf - (2 * n_bar - 1.0) * h_bar / (4 * n_bar));
250         c = 0.5 * h_bar;
251 
252         Fst= a / (a + b + c);
253 
254         /*double p_mean = p_sub.mean();
255         tmp = p_sub.array() - p_mean;
256         double Fst_fixed = (tmp.array() * tmp.array() * n_sub.array()).sum();
257         Fst_fixed = Fst_fixed / (p_mean * (1.0 - p_mean));
258         Fst_fixed = Fst_fixed / ((r - 1.0) * n_bar);*/
259 
260         ofile<<Fst<<endl;
261     }
262     ofile.close();
263 }
264 
read_subpopu(string filename,vector<string> & subpopu,vector<string> & subpopu_name)265 void gcta::read_subpopu(string filename, vector<string> &subpopu, vector<string> &subpopu_name)
266 {
267     cout<<"Reading sub-population information from ["+filename+"]."<<endl;
268     ifstream ifstream_subpopu(filename.c_str());
269     if(!ifstream_subpopu) throw("Error: can not open the file ["+filename+"] to read.");
270 
271     vector<string> ID;
272     vector< vector<string> > subpopu_buf;
273     read_fac(ifstream_subpopu, ID, subpopu_buf);
274     update_id_map_kp(ID, _id_map, _keep);
275     cout<<"Sub-population information for "<<_keep.size()<<" individuals matched to the genotype data."<<endl;
276 
277     int i=0, j=0;
278     map<string, int> uni_id_map;
279     map<string, int>::iterator iter;
280     for(i=0; i<_keep.size(); i++) uni_id_map.insert(pair<string,int>(_fid[_keep[i]]+":"+_pid[_keep[i]], i));
281     subpopu.clear();
282     subpopu.resize(_keep.size());
283     for(i=0; i<ID.size(); i++){
284         iter=uni_id_map.find(ID[i]);
285         if(iter!=uni_id_map.end()) subpopu[iter->second]=subpopu_buf[i][0];
286     }
287 
288     subpopu_name.clear();
289     subpopu_name=subpopu;
290     stable_sort(subpopu_name.begin(), subpopu_name.end());
291     subpopu_name.erase(unique(subpopu_name.begin(), subpopu_name.end()), subpopu_name.end());
292  }
293 
std_XMat_subpopu(string subpopu_file,MatrixXf & X,eigenVector & sd_SNP,bool grm_xchr_flag,bool miss_with_mu,bool divid_by_std)294  void gcta::std_XMat_subpopu(string subpopu_file, MatrixXf &X, eigenVector &sd_SNP, bool grm_xchr_flag, bool miss_with_mu, bool divid_by_std)
295 {
296     vector<string> subpopu, subpopu_name;
297     read_subpopu(subpopu_file, subpopu, subpopu_name);
298 
299     eigenMatrix S;
300     coeff_mat(subpopu, S, "Error: too many subpopulations specified in the file ["+subpopu_file+"].", "Error: at least two sub-populations are required.");
301     int popu_num = subpopu_name.size();
302 
303     if (_mu.empty()) calcu_mu();
304 
305     unsigned long i = 0, j = 0, n = _keep.size(), m = _include.size();
306     sd_SNP.resize(m);
307     if (_dosage_flag) {
308         for (j = 0; j < m; j++)  sd_SNP[j] = (X.col(j) - VectorXf::Constant(n, _mu[_include[j]])).squaredNorm() / (n - 1.0);
309     }
310     else {
311         for (j = 0; j < m; j++) sd_SNP[j] = _mu[_include[j]]*(1.0 - 0.5 * _mu[_include[j]]);
312     }
313     if (divid_by_std) {
314         for (j = 0; j < m; j++) {
315             if (fabs(sd_SNP[j]) < 1.0e-50) sd_SNP[j] = 0.0;
316             else sd_SNP[j] = sqrt(1.0 / sd_SNP[j]);
317         }
318     }
319 
320     int popu = 0;
321     vector<eigenVector> sd_SNP_sub(popu_num), mu_SNP_sub(popu_num);
322     for(popu  = 0; popu < popu_num; popu++) {
323         (mu_SNP_sub[popu]).resize(m);
324         (sd_SNP_sub[popu]).resize(m);
325         #pragma omp parallel for private(i)
326         for (j = 0; j < m; j++){
327             double m_sub = 0.0;
328             (mu_SNP_sub[popu])(j) = 0.0;
329             for(i = 0; i < n; i++){
330                 if(X(i,j) < 1e5 && S(i,popu)>0.0){
331                     m_sub += 1.0;
332                     (mu_SNP_sub[popu])(j) += X(i,j);
333                 }
334             }
335             (mu_SNP_sub[popu])(j) /= m_sub;
336             if(divid_by_std){
337                 if(_dosage_flag){
338                     (sd_SNP_sub[popu])(j) = 0.0;
339                     for(i = 0; i < n; i++){
340                         if(X(i,j) < 1e5 && S(i,popu)>0.0) (sd_SNP_sub[popu])(j) += (X(i,j) - (mu_SNP_sub[popu])(j)) * (X(i,j) - (mu_SNP_sub[popu])(j));
341                     }
342                     (sd_SNP_sub[popu])(j) /= (m_sub - 1.0);
343                 }
344                 else (sd_SNP_sub[popu])(j) = (mu_SNP_sub[popu])(j)*(1.0 - 0.5 * (mu_SNP_sub[popu])(j));
345                 if ((sd_SNP_sub[popu])(j) < 1.0e-50) (sd_SNP_sub[popu])(j) = 0.0;
346                 else (sd_SNP_sub[popu])(j) = 1.0 / sqrt((sd_SNP_sub[popu])(j));
347             }
348         }
349     }
350 
351     for(popu  = 0; popu < popu_num; popu++) {
352         #pragma omp parallel for private(j)
353         for(i = 0; i < n; i++){
354             if(S(i,popu)>0.0){
355                 for (j = 0; j < m; j++){
356                     if(X(i,j) < 1e5){
357                         X(i,j) -=  (mu_SNP_sub[popu])(j);
358                         if(divid_by_std) X(i,j) *= (sd_SNP_sub[popu])(j);
359                     }
360                     else if (miss_with_mu) X(i,j) = 0.0;
361                 }
362             }
363         }
364     }
365 
366     if (!grm_xchr_flag) return;
367 
368     // for the X-chromosome
369     check_sex();
370     double f_buf = sqrt(0.5);
371 
372     #pragma omp parallel for private(j)
373     for (i = 0; i < n; i++) {
374         if (_sex[_keep[i]] == 1) {
375             for (j = 0; j < m; j++) {
376                 if (X(i,j) < 1e5) X(i,j) *= f_buf;
377                 else if (miss_with_mu) X(i,j) = 0.0;
378             }
379         }
380     }
381 }
382 
383