1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementations of functions to build a ERM
5  *
6  * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #include "gcta.h"
14 
read_efile(string efile)15 void gcta::read_efile(string efile)
16 {
17     ifstream einf;
18     einf.open(efile.c_str());
19     if (!einf.is_open()) throw ("Error: can not open the file [" + efile + "] to read.");
20     cout << "Reading gene expression / methylation data from [" + efile + "] ..." << endl;
21 
22     string str_buf="";
23     vector<string> vs_buf;
24     getline(einf, str_buf); // reading the header
25     int col_num = StrFunc::split_string(str_buf, vs_buf, " \t\n");
26     if(col_num < 3) throw ("Error: there needs be at least 3 columns in the file [" + efile + "].");
27     _probe_num = col_num - 2;
28     _probe_name.resize(_probe_num);
29     int i=0;
30     for(i=0; i<_probe_num; i++) _probe_name[i]=vs_buf[i+2];
31     _indi_num = 0;
32     while(getline(einf,str_buf)) _indi_num++;
33     einf.close();
34 
35     einf.open(efile.c_str());
36     getline(einf, str_buf);
37     i=0;
38     int j=0;
39     stringstream errmsg;
40     _fid.resize(_indi_num);
41     _pid.resize(_indi_num);
42     _probe_data.resize(_indi_num, _probe_num);
43     string id_buf="";
44     while (getline(einf, str_buf)) {
45         stringstream ss(str_buf);
46         if (!(ss >> id_buf)){ errmsg<<"Error: in line "<<i+2<<"."; throw(errmsg.str()); }
47         _fid[i]=id_buf;
48         if (!(ss >> id_buf)){ errmsg<<"Error: in line "<<i+2<<"."; throw(errmsg.str()); }
49         _pid[i]=id_buf;
50         for(j=0; j<_probe_num; j++){
51             if (!(ss >> id_buf)){ errmsg<<"Error: in line "<<i+2<<"."; throw(errmsg.str()); }
52             if(id_buf=="-9") _probe_data(i,j)=1e10;
53             else _probe_data(i,j)=atof(id_buf.c_str());
54         }
55         i++;
56     }
57     einf.close();
58     cout<<"Expression data for "<<_probe_num<<" probes of "<<_indi_num<<" individuals have been included from the file [" + efile + "]."<<endl;
59 
60     // Initialize _keep and _e_include
61     init_keep();
62     init_e_include();
63 }
64 
init_e_include()65 void gcta::init_e_include() {
66     _e_include.clear();
67     _e_include.resize(_probe_num);
68     _probe_name_map.clear();
69     int i = 0, size = 0;
70     for (i = 0; i < _probe_num; i++) {
71         _e_include[i] = i;
72         _probe_name_map.insert(pair<string, int>(_probe_name[i], i));
73         if (size == _probe_name_map.size()) throw ("Error: Duplicate probe name found: \"" + _probe_name[i] + "\".");
74         size = _probe_name_map.size();
75     }
76 }
77 
std_probe(vector<vector<bool>> & X_bool,bool divid_by_std)78 void gcta::std_probe(vector< vector<bool> > &X_bool, bool divid_by_std)
79 {
80     eigenMatrix X(_probe_data);
81     _probe_data.resize(_keep.size(), _e_include.size());
82 
83     int i=0, j=0;
84     #pragma omp parallel for private(j)
85     for(i=0; i<_keep.size(); i++){
86         for(j=0; j<_e_include.size(); j++) _probe_data(i,j)=X(_keep[i], _e_include[j]);
87     }
88     eigenVector mu(_e_include.size()), nonmiss(_e_include.size());
89 
90     X_bool.resize(_keep.size());
91     for(i=0; i<_keep.size(); i++) X_bool[i].resize(_e_include.size());
92     #pragma omp parallel for private(i)
93     for(j=0; j<_e_include.size(); j++){
94         mu(j)=0.0;
95         nonmiss(j)=0.0;
96         for(i=0; i<_keep.size(); i++){
97             if(_probe_data(i,j)<1e9){
98                 mu(j)+=_probe_data(i,j);
99                 nonmiss(j)+=1.0;
100                 X_bool[i][j] = true;
101             }
102             else X_bool[i][j] = false;
103         }
104         mu(j)/=nonmiss(j);
105     }
106 
107     #pragma omp parallel for private(j)
108     for(i=0; i<_keep.size(); i++){
109         for(j=0; j<_e_include.size(); j++){
110             if(_probe_data(i,j)<1e9) _probe_data(i,j) -= mu(j);
111             else _probe_data(i,j) = 0.0;
112         }
113     }
114 
115     if(divid_by_std){
116         eigenVector sd(_e_include.size());
117         #pragma omp parallel for
118         for(j=0; j<_e_include.size(); j++){
119             sd(j) = sqrt((_probe_data.col(j).dot(_probe_data.col(j))) / (nonmiss(j) - 1.0));
120         }
121 
122         #pragma omp parallel for private(j)
123         for(i=0; i<_keep.size(); i++){
124             for(j=0; j<_e_include.size(); j++) _probe_data(i,j) /= sd(j);
125         }
126     }
127 }
128 
std_probe_ind(vector<vector<bool>> & X_bool,bool divid_by_std)129 void gcta::std_probe_ind(vector< vector<bool> > &X_bool, bool divid_by_std)
130 {
131      int i = 0, j = 0, n = _keep.size(), m = _e_include.size();
132     eigenMatrix X(_probe_data);
133     _probe_data.resize(n, m);
134 
135     #pragma omp parallel for private(j)
136     for(i=0; i<n; i++){
137         for(j=0; j<m; j++) _probe_data(i,j)=X(_keep[i], _e_include[j]);
138     }
139     X.resize(0,0);
140 
141     eigenVector mu(n), nonmiss(n);
142 
143     X_bool.resize(n);
144     for(i=0; i<n; i++) X_bool[i].resize(m);
145     #pragma omp parallel for private(j)
146     for(i=0; i<n; i++){
147         mu(i)=0.0;
148         nonmiss(i)=0.0;
149         for(j=0; j<m; j++){
150             if(_probe_data(i,j)<1e9){
151                 mu(i)+=_probe_data(i,j);
152                 nonmiss(i)+=1.0;
153                 X_bool[i][j] = true;
154             }
155             else X_bool[i][j] = false;
156         }
157         mu(i)/=nonmiss(i);
158     }
159 
160     #pragma omp parallel for private(j)
161     for(i=0; i<n; i++){
162         for(j=0; j<m; j++){
163             if(_probe_data(i,j)<1e9) _probe_data(i,j) -= mu(i);
164             else _probe_data(i,j) = 0.0;
165         }
166     }
167 
168     if(divid_by_std){
169         eigenVector sd(n);
170         #pragma omp parallel for
171         for(i=0; i<n; i++){
172             sd(i) = sqrt((_probe_data.row(i).dot(_probe_data.row(i))) / (nonmiss(i) - 1.0));
173         }
174 
175         #pragma omp parallel for private(j)
176         for(i=0; i<n; i++){
177             for(j=0; j<m; j++) _probe_data(i,j) /= sd(i);
178         }
179     }
180 }
181 
make_erm(int erm_mtd,bool output_bin)182 void gcta::make_erm(int erm_mtd, bool output_bin)
183 {
184     int i = 0, j = 0, k = 0, n = _keep.size(), m = _e_include.size();
185 
186     cout << "Recoding gene expression / methylation data ..." << endl;
187     bool divid_by_std = false;
188     vector< vector<bool> > X_bool;
189     if(erm_mtd < 2){
190         if(erm_mtd == 0) divid_by_std = true;
191         else if(erm_mtd == 1) divid_by_std = false;
192         std_probe(X_bool, divid_by_std);
193     }
194     else std_probe_ind(X_bool, false);
195 
196     cout << "\nCalculating expression relationship matrix (ERM) ... " << endl;
197 
198     // count the number of missing genotypes
199     vector< vector<int> > miss_pos(n);
200     for (i = 0; i < n; i++) {
201         for (j = 0; j < m; j++) {
202             if (X_bool[i][j] == false) miss_pos[i].push_back(j);
203         }
204     }
205 
206     // Calculate A_N matrix
207     _grm_N.resize(n, n);
208     if(erm_mtd == 0){
209         #pragma omp parallel for private(j, k)
210         for (i = 0; i < n; i++) {
211             for (j = 0; j <= i; j++) {
212                 int miss_j = 0;
213                 for (k = 0; k < miss_pos[j].size(); k++) miss_j += (int) X_bool[i][miss_pos[j][k]];
214                 _grm_N(i,j) = m - miss_pos[i].size() - miss_j;
215             }
216         }
217     }
218     else if (erm_mtd > 0){
219         if (erm_mtd == 1){
220             eigenVector nonmiss(m);
221             #pragma omp parallel for private(i)
222             for (j = 0; j < m; j++) {
223                 nonmiss[j] = 0.0;
224                 for (i = 0; i < n; i++) {
225                     if (X_bool[i][j] == true) nonmiss[j] += 1.0;
226                 }
227             }
228             eigenVector var(m);
229             #pragma omp parallel for
230             for(j=0; j<m; j++){
231                 if((nonmiss(j) - 1.0) > 0.0) var(j) = (_probe_data.col(j).dot(_probe_data.col(j))) / (nonmiss(j) - 1.0);
232                 else var(j) = 0.0;
233 
234             }
235             double sum_var = var.sum();
236             #pragma omp parallel for private(j, k)
237             for (i = 0; i < n; i++) {
238                 double i_miss_sum_var = 0.0;
239                 for (k = 0; k < miss_pos[i].size(); k++) i_miss_sum_var += var[miss_pos[i][k]];
240                 for (j = 0; j <= i; j++) {
241                     double j_miss_sum_var = 0.0;
242                     for (k = 0; k < miss_pos[j].size(); k++){
243                         if (X_bool[i][miss_pos[j][k]] == true) j_miss_sum_var += var[miss_pos[j][k]];
244                     }
245                     _grm_N(i,j) = sum_var - i_miss_sum_var - j_miss_sum_var;
246                 }
247             }
248         }
249         else{
250             eigenVector ssx(n);
251             #pragma omp parallel for
252             for(i=0; i<n; i++) ssx(i) = _probe_data.row(i).dot(_probe_data.row(i));
253             #pragma omp parallel for private(j, k)
254             for (i = 0; i < n; i++) {
255                 for (j = 0; j <= i; j++) {
256                     double ssx_i = ssx(i);
257                     double ssx_j = ssx(j);
258                     for (k = 0; k < miss_pos[j].size(); k++){
259                         int l = miss_pos[j][k];
260                         if(X_bool[i][l] == true) ssx_i -= _probe_data(i,l) * _probe_data(i,l);
261                     }
262                     for (k = 0; k < miss_pos[i].size(); k++){
263                         int l = miss_pos[i][k];
264                         if(X_bool[j][l] == true) ssx_j -= _probe_data(j,l) * _probe_data(j,l);
265                     }
266                     _grm_N(i,j) = sqrt(ssx_i*ssx_j);
267                 }
268             }
269         }
270     }
271 
272     // Calculate A matrix
273     _grm = _probe_data * _probe_data.transpose();
274     #pragma omp parallel for private(j)
275     for (i = 0; i < n; i++) {
276         for (j = 0; j <= i; j++) {
277             if (_grm_N(i,j) > 0.0) _grm(i,j) /= _grm_N(i,j);
278             else _grm(i,j) = 0.0;
279         }
280     }
281     _grm = _grm.array() / _grm.diagonal().mean();
282 
283     // Output A_N and A
284     string out_buf = _out;
285     _out += ".E";
286     output_grm(output_bin);
287     _out = out_buf;
288 }
289