1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementations of functions to build a ERM
5 *
6 * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13 #include "gcta.h"
14
read_efile(string efile)15 void gcta::read_efile(string efile)
16 {
17 ifstream einf;
18 einf.open(efile.c_str());
19 if (!einf.is_open()) throw ("Error: can not open the file [" + efile + "] to read.");
20 cout << "Reading gene expression / methylation data from [" + efile + "] ..." << endl;
21
22 string str_buf="";
23 vector<string> vs_buf;
24 getline(einf, str_buf); // reading the header
25 int col_num = StrFunc::split_string(str_buf, vs_buf, " \t\n");
26 if(col_num < 3) throw ("Error: there needs be at least 3 columns in the file [" + efile + "].");
27 _probe_num = col_num - 2;
28 _probe_name.resize(_probe_num);
29 int i=0;
30 for(i=0; i<_probe_num; i++) _probe_name[i]=vs_buf[i+2];
31 _indi_num = 0;
32 while(getline(einf,str_buf)) _indi_num++;
33 einf.close();
34
35 einf.open(efile.c_str());
36 getline(einf, str_buf);
37 i=0;
38 int j=0;
39 stringstream errmsg;
40 _fid.resize(_indi_num);
41 _pid.resize(_indi_num);
42 _probe_data.resize(_indi_num, _probe_num);
43 string id_buf="";
44 while (getline(einf, str_buf)) {
45 stringstream ss(str_buf);
46 if (!(ss >> id_buf)){ errmsg<<"Error: in line "<<i+2<<"."; throw(errmsg.str()); }
47 _fid[i]=id_buf;
48 if (!(ss >> id_buf)){ errmsg<<"Error: in line "<<i+2<<"."; throw(errmsg.str()); }
49 _pid[i]=id_buf;
50 for(j=0; j<_probe_num; j++){
51 if (!(ss >> id_buf)){ errmsg<<"Error: in line "<<i+2<<"."; throw(errmsg.str()); }
52 if(id_buf=="-9") _probe_data(i,j)=1e10;
53 else _probe_data(i,j)=atof(id_buf.c_str());
54 }
55 i++;
56 }
57 einf.close();
58 cout<<"Expression data for "<<_probe_num<<" probes of "<<_indi_num<<" individuals have been included from the file [" + efile + "]."<<endl;
59
60 // Initialize _keep and _e_include
61 init_keep();
62 init_e_include();
63 }
64
init_e_include()65 void gcta::init_e_include() {
66 _e_include.clear();
67 _e_include.resize(_probe_num);
68 _probe_name_map.clear();
69 int i = 0, size = 0;
70 for (i = 0; i < _probe_num; i++) {
71 _e_include[i] = i;
72 _probe_name_map.insert(pair<string, int>(_probe_name[i], i));
73 if (size == _probe_name_map.size()) throw ("Error: Duplicate probe name found: \"" + _probe_name[i] + "\".");
74 size = _probe_name_map.size();
75 }
76 }
77
std_probe(vector<vector<bool>> & X_bool,bool divid_by_std)78 void gcta::std_probe(vector< vector<bool> > &X_bool, bool divid_by_std)
79 {
80 eigenMatrix X(_probe_data);
81 _probe_data.resize(_keep.size(), _e_include.size());
82
83 int i=0, j=0;
84 #pragma omp parallel for private(j)
85 for(i=0; i<_keep.size(); i++){
86 for(j=0; j<_e_include.size(); j++) _probe_data(i,j)=X(_keep[i], _e_include[j]);
87 }
88 eigenVector mu(_e_include.size()), nonmiss(_e_include.size());
89
90 X_bool.resize(_keep.size());
91 for(i=0; i<_keep.size(); i++) X_bool[i].resize(_e_include.size());
92 #pragma omp parallel for private(i)
93 for(j=0; j<_e_include.size(); j++){
94 mu(j)=0.0;
95 nonmiss(j)=0.0;
96 for(i=0; i<_keep.size(); i++){
97 if(_probe_data(i,j)<1e9){
98 mu(j)+=_probe_data(i,j);
99 nonmiss(j)+=1.0;
100 X_bool[i][j] = true;
101 }
102 else X_bool[i][j] = false;
103 }
104 mu(j)/=nonmiss(j);
105 }
106
107 #pragma omp parallel for private(j)
108 for(i=0; i<_keep.size(); i++){
109 for(j=0; j<_e_include.size(); j++){
110 if(_probe_data(i,j)<1e9) _probe_data(i,j) -= mu(j);
111 else _probe_data(i,j) = 0.0;
112 }
113 }
114
115 if(divid_by_std){
116 eigenVector sd(_e_include.size());
117 #pragma omp parallel for
118 for(j=0; j<_e_include.size(); j++){
119 sd(j) = sqrt((_probe_data.col(j).dot(_probe_data.col(j))) / (nonmiss(j) - 1.0));
120 }
121
122 #pragma omp parallel for private(j)
123 for(i=0; i<_keep.size(); i++){
124 for(j=0; j<_e_include.size(); j++) _probe_data(i,j) /= sd(j);
125 }
126 }
127 }
128
std_probe_ind(vector<vector<bool>> & X_bool,bool divid_by_std)129 void gcta::std_probe_ind(vector< vector<bool> > &X_bool, bool divid_by_std)
130 {
131 int i = 0, j = 0, n = _keep.size(), m = _e_include.size();
132 eigenMatrix X(_probe_data);
133 _probe_data.resize(n, m);
134
135 #pragma omp parallel for private(j)
136 for(i=0; i<n; i++){
137 for(j=0; j<m; j++) _probe_data(i,j)=X(_keep[i], _e_include[j]);
138 }
139 X.resize(0,0);
140
141 eigenVector mu(n), nonmiss(n);
142
143 X_bool.resize(n);
144 for(i=0; i<n; i++) X_bool[i].resize(m);
145 #pragma omp parallel for private(j)
146 for(i=0; i<n; i++){
147 mu(i)=0.0;
148 nonmiss(i)=0.0;
149 for(j=0; j<m; j++){
150 if(_probe_data(i,j)<1e9){
151 mu(i)+=_probe_data(i,j);
152 nonmiss(i)+=1.0;
153 X_bool[i][j] = true;
154 }
155 else X_bool[i][j] = false;
156 }
157 mu(i)/=nonmiss(i);
158 }
159
160 #pragma omp parallel for private(j)
161 for(i=0; i<n; i++){
162 for(j=0; j<m; j++){
163 if(_probe_data(i,j)<1e9) _probe_data(i,j) -= mu(i);
164 else _probe_data(i,j) = 0.0;
165 }
166 }
167
168 if(divid_by_std){
169 eigenVector sd(n);
170 #pragma omp parallel for
171 for(i=0; i<n; i++){
172 sd(i) = sqrt((_probe_data.row(i).dot(_probe_data.row(i))) / (nonmiss(i) - 1.0));
173 }
174
175 #pragma omp parallel for private(j)
176 for(i=0; i<n; i++){
177 for(j=0; j<m; j++) _probe_data(i,j) /= sd(i);
178 }
179 }
180 }
181
make_erm(int erm_mtd,bool output_bin)182 void gcta::make_erm(int erm_mtd, bool output_bin)
183 {
184 int i = 0, j = 0, k = 0, n = _keep.size(), m = _e_include.size();
185
186 cout << "Recoding gene expression / methylation data ..." << endl;
187 bool divid_by_std = false;
188 vector< vector<bool> > X_bool;
189 if(erm_mtd < 2){
190 if(erm_mtd == 0) divid_by_std = true;
191 else if(erm_mtd == 1) divid_by_std = false;
192 std_probe(X_bool, divid_by_std);
193 }
194 else std_probe_ind(X_bool, false);
195
196 cout << "\nCalculating expression relationship matrix (ERM) ... " << endl;
197
198 // count the number of missing genotypes
199 vector< vector<int> > miss_pos(n);
200 for (i = 0; i < n; i++) {
201 for (j = 0; j < m; j++) {
202 if (X_bool[i][j] == false) miss_pos[i].push_back(j);
203 }
204 }
205
206 // Calculate A_N matrix
207 _grm_N.resize(n, n);
208 if(erm_mtd == 0){
209 #pragma omp parallel for private(j, k)
210 for (i = 0; i < n; i++) {
211 for (j = 0; j <= i; j++) {
212 int miss_j = 0;
213 for (k = 0; k < miss_pos[j].size(); k++) miss_j += (int) X_bool[i][miss_pos[j][k]];
214 _grm_N(i,j) = m - miss_pos[i].size() - miss_j;
215 }
216 }
217 }
218 else if (erm_mtd > 0){
219 if (erm_mtd == 1){
220 eigenVector nonmiss(m);
221 #pragma omp parallel for private(i)
222 for (j = 0; j < m; j++) {
223 nonmiss[j] = 0.0;
224 for (i = 0; i < n; i++) {
225 if (X_bool[i][j] == true) nonmiss[j] += 1.0;
226 }
227 }
228 eigenVector var(m);
229 #pragma omp parallel for
230 for(j=0; j<m; j++){
231 if((nonmiss(j) - 1.0) > 0.0) var(j) = (_probe_data.col(j).dot(_probe_data.col(j))) / (nonmiss(j) - 1.0);
232 else var(j) = 0.0;
233
234 }
235 double sum_var = var.sum();
236 #pragma omp parallel for private(j, k)
237 for (i = 0; i < n; i++) {
238 double i_miss_sum_var = 0.0;
239 for (k = 0; k < miss_pos[i].size(); k++) i_miss_sum_var += var[miss_pos[i][k]];
240 for (j = 0; j <= i; j++) {
241 double j_miss_sum_var = 0.0;
242 for (k = 0; k < miss_pos[j].size(); k++){
243 if (X_bool[i][miss_pos[j][k]] == true) j_miss_sum_var += var[miss_pos[j][k]];
244 }
245 _grm_N(i,j) = sum_var - i_miss_sum_var - j_miss_sum_var;
246 }
247 }
248 }
249 else{
250 eigenVector ssx(n);
251 #pragma omp parallel for
252 for(i=0; i<n; i++) ssx(i) = _probe_data.row(i).dot(_probe_data.row(i));
253 #pragma omp parallel for private(j, k)
254 for (i = 0; i < n; i++) {
255 for (j = 0; j <= i; j++) {
256 double ssx_i = ssx(i);
257 double ssx_j = ssx(j);
258 for (k = 0; k < miss_pos[j].size(); k++){
259 int l = miss_pos[j][k];
260 if(X_bool[i][l] == true) ssx_i -= _probe_data(i,l) * _probe_data(i,l);
261 }
262 for (k = 0; k < miss_pos[i].size(); k++){
263 int l = miss_pos[i][k];
264 if(X_bool[j][l] == true) ssx_j -= _probe_data(j,l) * _probe_data(j,l);
265 }
266 _grm_N(i,j) = sqrt(ssx_i*ssx_j);
267 }
268 }
269 }
270 }
271
272 // Calculate A matrix
273 _grm = _probe_data * _probe_data.transpose();
274 #pragma omp parallel for private(j)
275 for (i = 0; i < n; i++) {
276 for (j = 0; j <= i; j++) {
277 if (_grm_N(i,j) > 0.0) _grm(i,j) /= _grm_N(i,j);
278 else _grm(i,j) = 0.0;
279 }
280 }
281 _grm = _grm.array() / _grm.diagonal().mean();
282
283 // Output A_N and A
284 string out_buf = _out;
285 _out += ".E";
286 output_grm(output_bin);
287 _out = out_buf;
288 }
289