1 /*
2  Genome-wide Efficient Mixed Model Association (GEMMA)
3  Copyright (C) 2011-2017, Xiang Zhou
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include "gsl/gsl_blas.h"
20 #include "gsl/gsl_linalg.h"
21 #include "gsl/gsl_matrix.h"
22 #include "gsl/gsl_vector.h"
23 #include <bitset>
24 #include <cmath>
25 #include <fstream>
26 #include <iomanip>
27 #include <iostream>
28 #include <sstream>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string>
32 #include <vector>
33 
34 #include "gzstream.h"
35 #include "gemma_io.h"
36 #include "lapack.h"
37 #include "mathfunc.h"
38 #include "prdt.h"
39 
40 using namespace std;
41 
CopyFromParam(PARAM & cPar)42 void PRDT::CopyFromParam(PARAM &cPar) {
43   a_mode = cPar.a_mode;
44   d_pace = cPar.d_pace;
45 
46   file_bfile = cPar.file_bfile;
47   file_geno = cPar.file_geno;
48   file_out = cPar.file_out;
49   path_out = cPar.path_out;
50 
51   indicator_pheno = cPar.indicator_pheno;
52   indicator_cvt = cPar.indicator_cvt;
53   indicator_idv = cPar.indicator_idv;
54 
55   snpInfo = cPar.snpInfo;
56   mapRS2est = cPar.mapRS2est;
57 
58   time_eigen = 0;
59 
60   n_ph = cPar.n_ph;
61   np_obs = cPar.np_obs;
62   np_miss = cPar.np_miss;
63   ns_total = cPar.ns_total;
64   ns_test = 0;
65 
66   return;
67 }
68 
CopyToParam(PARAM & cPar)69 void PRDT::CopyToParam(PARAM &cPar) {
70   cPar.ns_test = ns_test;
71   cPar.time_eigen = time_eigen;
72 
73   return;
74 }
75 
WriteFiles(gsl_vector * y_prdt)76 void PRDT::WriteFiles(gsl_vector *y_prdt) {
77   string file_str;
78   file_str = path_out + "/" + file_out;
79   file_str += ".";
80   file_str += "prdt";
81   file_str += ".txt";
82 
83   ofstream outfile(file_str.c_str(), ofstream::out);
84   if (!outfile) {
85     cout << "error writing file: " << file_str.c_str() << endl;
86     return;
87   }
88 
89   size_t ci_test = 0;
90   for (size_t i = 0; i < indicator_idv.size(); i++) {
91     if (indicator_idv[i] == 1) {
92       outfile << "NA" << endl;
93     } else {
94       outfile << gsl_vector_get(y_prdt, ci_test) << endl;
95       ci_test++;
96     }
97   }
98 
99   outfile.close();
100   outfile.clear();
101   return;
102 }
103 
WriteFiles(gsl_matrix * Y_full)104 void PRDT::WriteFiles(gsl_matrix *Y_full) {
105   string file_str;
106   file_str = path_out + "/" + file_out;
107   file_str += ".prdt.txt";
108 
109   ofstream outfile(file_str.c_str(), ofstream::out);
110   if (!outfile) {
111     cout << "error writing file: " << file_str.c_str() << endl;
112     return;
113   }
114 
115   size_t ci_test = 0;
116   for (size_t i = 0; i < indicator_cvt.size(); i++) {
117     if (indicator_cvt[i] == 0) {
118       outfile << "NA" << endl;
119     } else {
120       for (size_t j = 0; j < Y_full->size2; j++) {
121         outfile << gsl_matrix_get(Y_full, ci_test, j) << "\t";
122       }
123       outfile << endl;
124       ci_test++;
125     }
126   }
127 
128   outfile.close();
129   outfile.clear();
130   return;
131 }
132 
AddBV(gsl_matrix * G,const gsl_vector * u_hat,gsl_vector * y_prdt)133 void PRDT::AddBV(gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) {
134   size_t ni_test = u_hat->size, ni_total = G->size1;
135 
136   gsl_matrix *Goo = gsl_matrix_alloc(ni_test, ni_test);
137   gsl_matrix *Gfo = gsl_matrix_alloc(ni_total - ni_test, ni_test);
138   gsl_matrix *U = gsl_matrix_alloc(ni_test, ni_test);
139   gsl_vector *eval = gsl_vector_alloc(ni_test);
140   gsl_vector *Utu = gsl_vector_alloc(ni_test);
141   gsl_vector *w = gsl_vector_alloc(ni_total);
142   gsl_permutation *pmt = gsl_permutation_alloc(ni_test);
143 
144   // center matrix G based on indicator_idv
145   for (size_t i = 0; i < ni_total; i++) {
146     gsl_vector_set(w, i, indicator_idv[i]);
147   }
148   CenterMatrix(G, w);
149 
150   // obtain Koo and Kfo
151   size_t o_i = 0, o_j = 0;
152   double d;
153   for (size_t i = 0; i < indicator_idv.size(); i++) {
154     o_j = 0;
155     for (size_t j = 0; j < indicator_idv.size(); j++) {
156       d = gsl_matrix_get(G, i, j);
157       if (indicator_idv[i] == 1 && indicator_idv[j] == 1) {
158         gsl_matrix_set(Goo, o_i, o_j, d);
159       }
160       if (indicator_idv[i] == 0 && indicator_idv[j] == 1) {
161         gsl_matrix_set(Gfo, i - o_i, o_j, d);
162       }
163       if (indicator_idv[j] == 1) {
164         o_j++;
165       }
166     }
167     if (indicator_idv[i] == 1) {
168       o_i++;
169     }
170   }
171 
172   // matrix operations to get u_prdt
173   cout << "Start Eigen-Decomposition..." << endl;
174   clock_t time_start = clock();
175   EigenDecomp(Goo, U, eval, 0);
176   for (size_t i = 0; i < eval->size; i++) {
177     if (gsl_vector_get(eval, i) < 1e-10) {
178       gsl_vector_set(eval, i, 0);
179     }
180   }
181 
182   time_eigen = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
183 
184   gsl_blas_dgemv(CblasTrans, 1.0, U, u_hat, 0.0, Utu);
185   for (size_t i = 0; i < eval->size; i++) {
186     d = gsl_vector_get(eval, i);
187     if (d != 0) {
188       d = gsl_vector_get(Utu, i) / d;
189       gsl_vector_set(Utu, i, d);
190     }
191   }
192   gsl_blas_dgemv(CblasNoTrans, 1.0, U, Utu, 0.0, eval);
193   gsl_blas_dgemv(CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
194 
195   // Free matrices.
196   gsl_matrix_free(Goo);
197   gsl_matrix_free(Gfo);
198   gsl_matrix_free(U);
199   gsl_vector_free(eval);
200   gsl_vector_free(Utu);
201   gsl_vector_free(w);
202   gsl_permutation_free(pmt);
203 
204   return;
205 }
206 
AnalyzeBimbam(gsl_vector * y_prdt)207 void PRDT::AnalyzeBimbam(gsl_vector *y_prdt) {
208   debug_msg("entering");
209   igzstream infile(file_geno.c_str(), igzstream::in);
210   if (!infile) {
211     cout << "error reading genotype file:" << file_geno << endl;
212     return;
213   }
214 
215   string line;
216   char *ch_ptr;
217   string rs;
218 
219   size_t n_miss, n_train_nomiss, c_phen;
220   double geno, x_mean, x_train_mean, effect_size;
221 
222   gsl_vector *x = gsl_vector_alloc(y_prdt->size);
223   gsl_vector *x_miss = gsl_vector_alloc(y_prdt->size);
224 
225   ns_test = 0;
226 
227   // Start reading genotypes and analyze.
228   for (size_t t = 0; t < ns_total; ++t) {
229     safeGetline(infile, line).eof();
230     if (t % d_pace == 0 || t == (ns_total - 1)) {
231       ProgressBar("Reading SNPs  ", t, ns_total - 1);
232     }
233 
234     ch_ptr = strtok((char *)line.c_str(), " , \t");
235     rs = ch_ptr;
236     ch_ptr = strtok(NULL, " , \t");
237     ch_ptr = strtok(NULL, " , \t");
238 
239     if (mapRS2est.count(rs) == 0) {
240       continue;
241     } else {
242       effect_size = mapRS2est[rs];
243     }
244 
245     x_mean = 0.0;
246     c_phen = 0;
247     n_miss = 0;
248     x_train_mean = 0;
249     n_train_nomiss = 0;
250 
251     gsl_vector_set_zero(x_miss);
252 
253     for (size_t i = 0; i < indicator_idv.size(); ++i) {
254       ch_ptr = strtok(NULL, " , \t");
255       if (indicator_idv[i] == 1) {
256         if (strcmp(ch_ptr, "NA") != 0) {
257           geno = atof(ch_ptr);
258           x_train_mean += geno;
259           n_train_nomiss++;
260         }
261       } else {
262         if (strcmp(ch_ptr, "NA") == 0) {
263           gsl_vector_set(x_miss, c_phen, 0.0);
264           n_miss++;
265         } else {
266           geno = atof(ch_ptr);
267 
268           gsl_vector_set(x, c_phen, geno);
269           gsl_vector_set(x_miss, c_phen, 1.0);
270           x_mean += geno;
271         }
272         c_phen++;
273       }
274     }
275 
276     if (x->size == n_miss) {
277       cout << "snp " << rs << " has missing genotype for all "
278            << "individuals and will be ignored." << endl;
279       continue;
280     }
281 
282     x_mean /= (double)(x->size - n_miss);
283     x_train_mean /= (double)(n_train_nomiss);
284 
285     for (size_t i = 0; i < x->size; ++i) {
286       geno = gsl_vector_get(x, i);
287       if (gsl_vector_get(x_miss, i) == 0) {
288         gsl_vector_set(x, i, x_mean - x_train_mean);
289       } else {
290         gsl_vector_set(x, i, geno - x_train_mean);
291       }
292     }
293 
294     gsl_vector_scale(x, effect_size);
295     gsl_vector_add(y_prdt, x);
296 
297     ns_test++;
298   }
299   cout << endl;
300 
301   gsl_vector_free(x);
302   gsl_vector_free(x_miss);
303 
304   infile.close();
305   infile.clear();
306 
307   return;
308 }
309 
AnalyzePlink(gsl_vector * y_prdt)310 void PRDT::AnalyzePlink(gsl_vector *y_prdt) {
311   debug_msg("entering");
312   string file_bed = file_bfile + ".bed";
313   ifstream infile(file_bed.c_str(), ios::binary);
314   if (!infile) {
315     cout << "error reading bed file:" << file_bed << endl;
316     return;
317   }
318 
319   char ch[1];
320   bitset<8> b;
321   string rs;
322 
323   size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
324   double geno, x_mean, x_train_mean, effect_size;
325 
326   gsl_vector *x = gsl_vector_alloc(y_prdt->size);
327 
328   // Calculate n_bit and c, the number of bit for each SNP.
329   if (indicator_idv.size() % 4 == 0) {
330     n_bit = indicator_idv.size() / 4;
331   } else {
332     n_bit = indicator_idv.size() / 4 + 1;
333   }
334 
335   // Print the first 3 magic numbers.
336   for (size_t i = 0; i < 3; ++i) {
337     infile.read(ch, 1);
338     b = ch[0];
339   }
340 
341   ns_test = 0;
342 
343   for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) {
344     if (t % d_pace == 0 || t == snpInfo.size() - 1) {
345       ProgressBar("Reading SNPs  ", t, snpInfo.size() - 1);
346     }
347 
348     rs = snpInfo[t].rs_number;
349 
350     if (mapRS2est.count(rs) == 0) {
351       continue;
352     } else {
353       effect_size = mapRS2est[rs];
354     }
355 
356     // n_bit, and 3 is the number of magic numbers.
357     infile.seekg(t * n_bit + 3);
358 
359     // Read genotypes.
360     x_mean = 0.0;
361     n_miss = 0;
362     ci_total = 0;
363     ci_test = 0;
364     x_train_mean = 0;
365     n_train_nomiss = 0;
366     for (size_t i = 0; i < n_bit; ++i) {
367       infile.read(ch, 1);
368       b = ch[0];
369 
370       // Minor allele homozygous: 2.0; major: 0.0.
371       for (size_t j = 0; j < 4; ++j) {
372         if ((i == (n_bit - 1)) && ci_total == indicator_idv.size()) {
373           break;
374         }
375         if (indicator_idv[ci_total] == 1) {
376           if (b[2 * j] == 0) {
377             if (b[2 * j + 1] == 0) {
378               x_train_mean += 2.0;
379               n_train_nomiss++;
380             } else {
381               x_train_mean += 1.0;
382               n_train_nomiss++;
383             }
384           } else {
385             if (b[2 * j + 1] == 1) {
386               n_train_nomiss++;
387             } else {
388             }
389           }
390         } else {
391           if (b[2 * j] == 0) {
392             if (b[2 * j + 1] == 0) {
393               gsl_vector_set(x, ci_test, 2);
394               x_mean += 2.0;
395             } else {
396               gsl_vector_set(x, ci_test, 1);
397               x_mean += 1.0;
398             }
399           } else {
400             if (b[2 * j + 1] == 1) {
401               gsl_vector_set(x, ci_test, 0);
402             } else {
403               gsl_vector_set(x, ci_test, -9);
404               n_miss++;
405             }
406           }
407           ci_test++;
408         }
409         ci_total++;
410       }
411     }
412 
413     if (x->size == n_miss) {
414       cout << "snp " << rs << " has missing genotype for all "
415            << "individuals and will be ignored." << endl;
416       continue;
417     }
418 
419     x_mean /= (double)(x->size - n_miss);
420     x_train_mean /= (double)(n_train_nomiss);
421 
422     for (size_t i = 0; i < x->size; ++i) {
423       geno = gsl_vector_get(x, i);
424       if (geno == -9) {
425         gsl_vector_set(x, i, x_mean - x_train_mean);
426       } else {
427         gsl_vector_set(x, i, geno - x_train_mean);
428       }
429     }
430 
431     gsl_vector_scale(x, effect_size);
432     gsl_vector_add(y_prdt, x);
433 
434     ns_test++;
435   }
436   cout << endl;
437 
438   gsl_vector_free(x);
439 
440   infile.close();
441   infile.clear();
442 
443   return;
444 }
445 
446 // Predict missing phenotypes using ridge regression.
447 // Y_hat contains fixed effects
MvnormPrdt(const gsl_matrix * Y_hat,const gsl_matrix * H,gsl_matrix * Y_full)448 void PRDT::MvnormPrdt(const gsl_matrix *Y_hat, const gsl_matrix *H,
449                       gsl_matrix *Y_full) {
450   gsl_vector *y_obs = gsl_vector_alloc(np_obs);
451   gsl_vector *y_miss = gsl_vector_alloc(np_miss);
452   gsl_matrix *H_oo = gsl_matrix_alloc(np_obs, np_obs);
453   gsl_matrix *H_mo = gsl_matrix_alloc(np_miss, np_obs);
454   gsl_vector *Hiy = gsl_vector_alloc(np_obs);
455 
456   size_t c_obs1 = 0, c_obs2 = 0, c_miss1 = 0, c_miss2 = 0;
457 
458   // Obtain H_oo, H_mo.
459   c_obs1 = 0;
460   c_miss1 = 0;
461   for (vector<int>::size_type i1 = 0; i1 < indicator_pheno.size(); ++i1) {
462     if (indicator_cvt[i1] == 0) {
463       continue;
464     }
465     for (vector<int>::size_type j1 = 0; j1 < n_ph; ++j1) {
466 
467       c_obs2 = 0;
468       c_miss2 = 0;
469       for (vector<int>::size_type i2 = 0; i2 < indicator_pheno.size(); ++i2) {
470         if (indicator_cvt[i2] == 0) {
471           continue;
472         }
473         for (vector<int>::size_type j2 = 0; j2 < n_ph; j2++) {
474 
475           if (indicator_pheno[i2][j2] == 1) {
476             if (indicator_pheno[i1][j1] == 1) {
477               gsl_matrix_set(
478                   H_oo, c_obs1, c_obs2,
479                   gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
480             } else {
481               gsl_matrix_set(
482                   H_mo, c_miss1, c_obs2,
483                   gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
484             }
485             c_obs2++;
486           } else {
487             c_miss2++;
488           }
489         }
490       }
491 
492       if (indicator_pheno[i1][j1] == 1) {
493         c_obs1++;
494       } else {
495         c_miss1++;
496       }
497     }
498   }
499 
500   // Do LU decomposition of H_oo.
501   int sig;
502   gsl_permutation *pmt = gsl_permutation_alloc(np_obs);
503   LUDecomp(H_oo, pmt, &sig);
504 
505   // Obtain y_obs=y_full-y_hat.
506   // Add the fixed effects part to y_miss: y_miss=y_hat.
507   c_obs1 = 0;
508   c_miss1 = 0;
509   for (vector<int>::size_type i = 0; i < indicator_pheno.size(); ++i) {
510     if (indicator_cvt[i] == 0) {
511       continue;
512     }
513 
514     for (vector<int>::size_type j = 0; j < n_ph; ++j) {
515       if (indicator_pheno[i][j] == 1) {
516         gsl_vector_set(y_obs, c_obs1, gsl_matrix_get(Y_full, i, j) -
517                                           gsl_matrix_get(Y_hat, i, j));
518         c_obs1++;
519       } else {
520         gsl_vector_set(y_miss, c_miss1, gsl_matrix_get(Y_hat, i, j));
521         c_miss1++;
522       }
523     }
524   }
525 
526   LUSolve(H_oo, pmt, y_obs, Hiy);
527 
528   gsl_blas_dgemv(CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
529 
530   // Put back predicted y_miss to Y_full.
531   c_miss1 = 0;
532   for (vector<int>::size_type i = 0; i < indicator_pheno.size(); ++i) {
533     if (indicator_cvt[i] == 0) {
534       continue;
535     }
536 
537     for (vector<int>::size_type j = 0; j < n_ph; ++j) {
538       if (indicator_pheno[i][j] == 0) {
539         gsl_matrix_set(Y_full, i, j, gsl_vector_get(y_miss, c_miss1));
540         c_miss1++;
541       }
542     }
543   }
544 
545   // Free matrices.
546   gsl_vector_free(y_obs);
547   gsl_vector_free(y_miss);
548   gsl_matrix_free(H_oo);
549   gsl_matrix_free(H_mo);
550   gsl_vector_free(Hiy);
551 
552   return;
553 }
554