1 /*
2 Genome-wide Efficient Mixed Model Association (GEMMA)
3 Copyright (C) 2011-2017, Xiang Zhou
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include "gsl/gsl_blas.h"
20 #include "gsl/gsl_linalg.h"
21 #include "gsl/gsl_matrix.h"
22 #include "gsl/gsl_vector.h"
23 #include <bitset>
24 #include <cmath>
25 #include <fstream>
26 #include <iomanip>
27 #include <iostream>
28 #include <sstream>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string>
32 #include <vector>
33
34 #include "gzstream.h"
35 #include "gemma_io.h"
36 #include "lapack.h"
37 #include "mathfunc.h"
38 #include "prdt.h"
39
40 using namespace std;
41
CopyFromParam(PARAM & cPar)42 void PRDT::CopyFromParam(PARAM &cPar) {
43 a_mode = cPar.a_mode;
44 d_pace = cPar.d_pace;
45
46 file_bfile = cPar.file_bfile;
47 file_geno = cPar.file_geno;
48 file_out = cPar.file_out;
49 path_out = cPar.path_out;
50
51 indicator_pheno = cPar.indicator_pheno;
52 indicator_cvt = cPar.indicator_cvt;
53 indicator_idv = cPar.indicator_idv;
54
55 snpInfo = cPar.snpInfo;
56 mapRS2est = cPar.mapRS2est;
57
58 time_eigen = 0;
59
60 n_ph = cPar.n_ph;
61 np_obs = cPar.np_obs;
62 np_miss = cPar.np_miss;
63 ns_total = cPar.ns_total;
64 ns_test = 0;
65
66 return;
67 }
68
CopyToParam(PARAM & cPar)69 void PRDT::CopyToParam(PARAM &cPar) {
70 cPar.ns_test = ns_test;
71 cPar.time_eigen = time_eigen;
72
73 return;
74 }
75
WriteFiles(gsl_vector * y_prdt)76 void PRDT::WriteFiles(gsl_vector *y_prdt) {
77 string file_str;
78 file_str = path_out + "/" + file_out;
79 file_str += ".";
80 file_str += "prdt";
81 file_str += ".txt";
82
83 ofstream outfile(file_str.c_str(), ofstream::out);
84 if (!outfile) {
85 cout << "error writing file: " << file_str.c_str() << endl;
86 return;
87 }
88
89 size_t ci_test = 0;
90 for (size_t i = 0; i < indicator_idv.size(); i++) {
91 if (indicator_idv[i] == 1) {
92 outfile << "NA" << endl;
93 } else {
94 outfile << gsl_vector_get(y_prdt, ci_test) << endl;
95 ci_test++;
96 }
97 }
98
99 outfile.close();
100 outfile.clear();
101 return;
102 }
103
WriteFiles(gsl_matrix * Y_full)104 void PRDT::WriteFiles(gsl_matrix *Y_full) {
105 string file_str;
106 file_str = path_out + "/" + file_out;
107 file_str += ".prdt.txt";
108
109 ofstream outfile(file_str.c_str(), ofstream::out);
110 if (!outfile) {
111 cout << "error writing file: " << file_str.c_str() << endl;
112 return;
113 }
114
115 size_t ci_test = 0;
116 for (size_t i = 0; i < indicator_cvt.size(); i++) {
117 if (indicator_cvt[i] == 0) {
118 outfile << "NA" << endl;
119 } else {
120 for (size_t j = 0; j < Y_full->size2; j++) {
121 outfile << gsl_matrix_get(Y_full, ci_test, j) << "\t";
122 }
123 outfile << endl;
124 ci_test++;
125 }
126 }
127
128 outfile.close();
129 outfile.clear();
130 return;
131 }
132
AddBV(gsl_matrix * G,const gsl_vector * u_hat,gsl_vector * y_prdt)133 void PRDT::AddBV(gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) {
134 size_t ni_test = u_hat->size, ni_total = G->size1;
135
136 gsl_matrix *Goo = gsl_matrix_alloc(ni_test, ni_test);
137 gsl_matrix *Gfo = gsl_matrix_alloc(ni_total - ni_test, ni_test);
138 gsl_matrix *U = gsl_matrix_alloc(ni_test, ni_test);
139 gsl_vector *eval = gsl_vector_alloc(ni_test);
140 gsl_vector *Utu = gsl_vector_alloc(ni_test);
141 gsl_vector *w = gsl_vector_alloc(ni_total);
142 gsl_permutation *pmt = gsl_permutation_alloc(ni_test);
143
144 // center matrix G based on indicator_idv
145 for (size_t i = 0; i < ni_total; i++) {
146 gsl_vector_set(w, i, indicator_idv[i]);
147 }
148 CenterMatrix(G, w);
149
150 // obtain Koo and Kfo
151 size_t o_i = 0, o_j = 0;
152 double d;
153 for (size_t i = 0; i < indicator_idv.size(); i++) {
154 o_j = 0;
155 for (size_t j = 0; j < indicator_idv.size(); j++) {
156 d = gsl_matrix_get(G, i, j);
157 if (indicator_idv[i] == 1 && indicator_idv[j] == 1) {
158 gsl_matrix_set(Goo, o_i, o_j, d);
159 }
160 if (indicator_idv[i] == 0 && indicator_idv[j] == 1) {
161 gsl_matrix_set(Gfo, i - o_i, o_j, d);
162 }
163 if (indicator_idv[j] == 1) {
164 o_j++;
165 }
166 }
167 if (indicator_idv[i] == 1) {
168 o_i++;
169 }
170 }
171
172 // matrix operations to get u_prdt
173 cout << "Start Eigen-Decomposition..." << endl;
174 clock_t time_start = clock();
175 EigenDecomp(Goo, U, eval, 0);
176 for (size_t i = 0; i < eval->size; i++) {
177 if (gsl_vector_get(eval, i) < 1e-10) {
178 gsl_vector_set(eval, i, 0);
179 }
180 }
181
182 time_eigen = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
183
184 gsl_blas_dgemv(CblasTrans, 1.0, U, u_hat, 0.0, Utu);
185 for (size_t i = 0; i < eval->size; i++) {
186 d = gsl_vector_get(eval, i);
187 if (d != 0) {
188 d = gsl_vector_get(Utu, i) / d;
189 gsl_vector_set(Utu, i, d);
190 }
191 }
192 gsl_blas_dgemv(CblasNoTrans, 1.0, U, Utu, 0.0, eval);
193 gsl_blas_dgemv(CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
194
195 // Free matrices.
196 gsl_matrix_free(Goo);
197 gsl_matrix_free(Gfo);
198 gsl_matrix_free(U);
199 gsl_vector_free(eval);
200 gsl_vector_free(Utu);
201 gsl_vector_free(w);
202 gsl_permutation_free(pmt);
203
204 return;
205 }
206
AnalyzeBimbam(gsl_vector * y_prdt)207 void PRDT::AnalyzeBimbam(gsl_vector *y_prdt) {
208 debug_msg("entering");
209 igzstream infile(file_geno.c_str(), igzstream::in);
210 if (!infile) {
211 cout << "error reading genotype file:" << file_geno << endl;
212 return;
213 }
214
215 string line;
216 char *ch_ptr;
217 string rs;
218
219 size_t n_miss, n_train_nomiss, c_phen;
220 double geno, x_mean, x_train_mean, effect_size;
221
222 gsl_vector *x = gsl_vector_alloc(y_prdt->size);
223 gsl_vector *x_miss = gsl_vector_alloc(y_prdt->size);
224
225 ns_test = 0;
226
227 // Start reading genotypes and analyze.
228 for (size_t t = 0; t < ns_total; ++t) {
229 safeGetline(infile, line).eof();
230 if (t % d_pace == 0 || t == (ns_total - 1)) {
231 ProgressBar("Reading SNPs ", t, ns_total - 1);
232 }
233
234 ch_ptr = strtok((char *)line.c_str(), " , \t");
235 rs = ch_ptr;
236 ch_ptr = strtok(NULL, " , \t");
237 ch_ptr = strtok(NULL, " , \t");
238
239 if (mapRS2est.count(rs) == 0) {
240 continue;
241 } else {
242 effect_size = mapRS2est[rs];
243 }
244
245 x_mean = 0.0;
246 c_phen = 0;
247 n_miss = 0;
248 x_train_mean = 0;
249 n_train_nomiss = 0;
250
251 gsl_vector_set_zero(x_miss);
252
253 for (size_t i = 0; i < indicator_idv.size(); ++i) {
254 ch_ptr = strtok(NULL, " , \t");
255 if (indicator_idv[i] == 1) {
256 if (strcmp(ch_ptr, "NA") != 0) {
257 geno = atof(ch_ptr);
258 x_train_mean += geno;
259 n_train_nomiss++;
260 }
261 } else {
262 if (strcmp(ch_ptr, "NA") == 0) {
263 gsl_vector_set(x_miss, c_phen, 0.0);
264 n_miss++;
265 } else {
266 geno = atof(ch_ptr);
267
268 gsl_vector_set(x, c_phen, geno);
269 gsl_vector_set(x_miss, c_phen, 1.0);
270 x_mean += geno;
271 }
272 c_phen++;
273 }
274 }
275
276 if (x->size == n_miss) {
277 cout << "snp " << rs << " has missing genotype for all "
278 << "individuals and will be ignored." << endl;
279 continue;
280 }
281
282 x_mean /= (double)(x->size - n_miss);
283 x_train_mean /= (double)(n_train_nomiss);
284
285 for (size_t i = 0; i < x->size; ++i) {
286 geno = gsl_vector_get(x, i);
287 if (gsl_vector_get(x_miss, i) == 0) {
288 gsl_vector_set(x, i, x_mean - x_train_mean);
289 } else {
290 gsl_vector_set(x, i, geno - x_train_mean);
291 }
292 }
293
294 gsl_vector_scale(x, effect_size);
295 gsl_vector_add(y_prdt, x);
296
297 ns_test++;
298 }
299 cout << endl;
300
301 gsl_vector_free(x);
302 gsl_vector_free(x_miss);
303
304 infile.close();
305 infile.clear();
306
307 return;
308 }
309
AnalyzePlink(gsl_vector * y_prdt)310 void PRDT::AnalyzePlink(gsl_vector *y_prdt) {
311 debug_msg("entering");
312 string file_bed = file_bfile + ".bed";
313 ifstream infile(file_bed.c_str(), ios::binary);
314 if (!infile) {
315 cout << "error reading bed file:" << file_bed << endl;
316 return;
317 }
318
319 char ch[1];
320 bitset<8> b;
321 string rs;
322
323 size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
324 double geno, x_mean, x_train_mean, effect_size;
325
326 gsl_vector *x = gsl_vector_alloc(y_prdt->size);
327
328 // Calculate n_bit and c, the number of bit for each SNP.
329 if (indicator_idv.size() % 4 == 0) {
330 n_bit = indicator_idv.size() / 4;
331 } else {
332 n_bit = indicator_idv.size() / 4 + 1;
333 }
334
335 // Print the first 3 magic numbers.
336 for (size_t i = 0; i < 3; ++i) {
337 infile.read(ch, 1);
338 b = ch[0];
339 }
340
341 ns_test = 0;
342
343 for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) {
344 if (t % d_pace == 0 || t == snpInfo.size() - 1) {
345 ProgressBar("Reading SNPs ", t, snpInfo.size() - 1);
346 }
347
348 rs = snpInfo[t].rs_number;
349
350 if (mapRS2est.count(rs) == 0) {
351 continue;
352 } else {
353 effect_size = mapRS2est[rs];
354 }
355
356 // n_bit, and 3 is the number of magic numbers.
357 infile.seekg(t * n_bit + 3);
358
359 // Read genotypes.
360 x_mean = 0.0;
361 n_miss = 0;
362 ci_total = 0;
363 ci_test = 0;
364 x_train_mean = 0;
365 n_train_nomiss = 0;
366 for (size_t i = 0; i < n_bit; ++i) {
367 infile.read(ch, 1);
368 b = ch[0];
369
370 // Minor allele homozygous: 2.0; major: 0.0.
371 for (size_t j = 0; j < 4; ++j) {
372 if ((i == (n_bit - 1)) && ci_total == indicator_idv.size()) {
373 break;
374 }
375 if (indicator_idv[ci_total] == 1) {
376 if (b[2 * j] == 0) {
377 if (b[2 * j + 1] == 0) {
378 x_train_mean += 2.0;
379 n_train_nomiss++;
380 } else {
381 x_train_mean += 1.0;
382 n_train_nomiss++;
383 }
384 } else {
385 if (b[2 * j + 1] == 1) {
386 n_train_nomiss++;
387 } else {
388 }
389 }
390 } else {
391 if (b[2 * j] == 0) {
392 if (b[2 * j + 1] == 0) {
393 gsl_vector_set(x, ci_test, 2);
394 x_mean += 2.0;
395 } else {
396 gsl_vector_set(x, ci_test, 1);
397 x_mean += 1.0;
398 }
399 } else {
400 if (b[2 * j + 1] == 1) {
401 gsl_vector_set(x, ci_test, 0);
402 } else {
403 gsl_vector_set(x, ci_test, -9);
404 n_miss++;
405 }
406 }
407 ci_test++;
408 }
409 ci_total++;
410 }
411 }
412
413 if (x->size == n_miss) {
414 cout << "snp " << rs << " has missing genotype for all "
415 << "individuals and will be ignored." << endl;
416 continue;
417 }
418
419 x_mean /= (double)(x->size - n_miss);
420 x_train_mean /= (double)(n_train_nomiss);
421
422 for (size_t i = 0; i < x->size; ++i) {
423 geno = gsl_vector_get(x, i);
424 if (geno == -9) {
425 gsl_vector_set(x, i, x_mean - x_train_mean);
426 } else {
427 gsl_vector_set(x, i, geno - x_train_mean);
428 }
429 }
430
431 gsl_vector_scale(x, effect_size);
432 gsl_vector_add(y_prdt, x);
433
434 ns_test++;
435 }
436 cout << endl;
437
438 gsl_vector_free(x);
439
440 infile.close();
441 infile.clear();
442
443 return;
444 }
445
446 // Predict missing phenotypes using ridge regression.
447 // Y_hat contains fixed effects
MvnormPrdt(const gsl_matrix * Y_hat,const gsl_matrix * H,gsl_matrix * Y_full)448 void PRDT::MvnormPrdt(const gsl_matrix *Y_hat, const gsl_matrix *H,
449 gsl_matrix *Y_full) {
450 gsl_vector *y_obs = gsl_vector_alloc(np_obs);
451 gsl_vector *y_miss = gsl_vector_alloc(np_miss);
452 gsl_matrix *H_oo = gsl_matrix_alloc(np_obs, np_obs);
453 gsl_matrix *H_mo = gsl_matrix_alloc(np_miss, np_obs);
454 gsl_vector *Hiy = gsl_vector_alloc(np_obs);
455
456 size_t c_obs1 = 0, c_obs2 = 0, c_miss1 = 0, c_miss2 = 0;
457
458 // Obtain H_oo, H_mo.
459 c_obs1 = 0;
460 c_miss1 = 0;
461 for (vector<int>::size_type i1 = 0; i1 < indicator_pheno.size(); ++i1) {
462 if (indicator_cvt[i1] == 0) {
463 continue;
464 }
465 for (vector<int>::size_type j1 = 0; j1 < n_ph; ++j1) {
466
467 c_obs2 = 0;
468 c_miss2 = 0;
469 for (vector<int>::size_type i2 = 0; i2 < indicator_pheno.size(); ++i2) {
470 if (indicator_cvt[i2] == 0) {
471 continue;
472 }
473 for (vector<int>::size_type j2 = 0; j2 < n_ph; j2++) {
474
475 if (indicator_pheno[i2][j2] == 1) {
476 if (indicator_pheno[i1][j1] == 1) {
477 gsl_matrix_set(
478 H_oo, c_obs1, c_obs2,
479 gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
480 } else {
481 gsl_matrix_set(
482 H_mo, c_miss1, c_obs2,
483 gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
484 }
485 c_obs2++;
486 } else {
487 c_miss2++;
488 }
489 }
490 }
491
492 if (indicator_pheno[i1][j1] == 1) {
493 c_obs1++;
494 } else {
495 c_miss1++;
496 }
497 }
498 }
499
500 // Do LU decomposition of H_oo.
501 int sig;
502 gsl_permutation *pmt = gsl_permutation_alloc(np_obs);
503 LUDecomp(H_oo, pmt, &sig);
504
505 // Obtain y_obs=y_full-y_hat.
506 // Add the fixed effects part to y_miss: y_miss=y_hat.
507 c_obs1 = 0;
508 c_miss1 = 0;
509 for (vector<int>::size_type i = 0; i < indicator_pheno.size(); ++i) {
510 if (indicator_cvt[i] == 0) {
511 continue;
512 }
513
514 for (vector<int>::size_type j = 0; j < n_ph; ++j) {
515 if (indicator_pheno[i][j] == 1) {
516 gsl_vector_set(y_obs, c_obs1, gsl_matrix_get(Y_full, i, j) -
517 gsl_matrix_get(Y_hat, i, j));
518 c_obs1++;
519 } else {
520 gsl_vector_set(y_miss, c_miss1, gsl_matrix_get(Y_hat, i, j));
521 c_miss1++;
522 }
523 }
524 }
525
526 LUSolve(H_oo, pmt, y_obs, Hiy);
527
528 gsl_blas_dgemv(CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
529
530 // Put back predicted y_miss to Y_full.
531 c_miss1 = 0;
532 for (vector<int>::size_type i = 0; i < indicator_pheno.size(); ++i) {
533 if (indicator_cvt[i] == 0) {
534 continue;
535 }
536
537 for (vector<int>::size_type j = 0; j < n_ph; ++j) {
538 if (indicator_pheno[i][j] == 0) {
539 gsl_matrix_set(Y_full, i, j, gsl_vector_get(y_miss, c_miss1));
540 c_miss1++;
541 }
542 }
543 }
544
545 // Free matrices.
546 gsl_vector_free(y_obs);
547 gsl_vector_free(y_miss);
548 gsl_matrix_free(H_oo);
549 gsl_matrix_free(H_mo);
550 gsl_vector_free(Hiy);
551
552 return;
553 }
554