1 /*
2  Genome-wide Efficient Mixed Model Association (GEMMA)
3  Copyright (C) 2011-2017 Xiang Zhou
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <fstream>
20 #include <iostream>
21 #include <sstream>
22 
23 #include <assert.h>
24 #include <bitset>
25 #include <cmath>
26 #include <cstring>
27 #include <iomanip>
28 #include <iostream>
29 #include <stdio.h>
30 #include <stdlib.h>
31 
32 #include "gsl/gsl_blas.h"
33 #include "gsl/gsl_linalg.h"
34 #include "gsl/gsl_matrix.h"
35 #include "gsl/gsl_vector.h"
36 
37 #include "gsl/gsl_cdf.h"
38 #include "gsl/gsl_integration.h"
39 #include "gsl/gsl_min.h"
40 #include "gsl/gsl_roots.h"
41 
42 // #include "eigenlib.h"
43 #include "gzstream.h"
44 #include "lapack.h"
45 #include "lm.h"
46 
47 using namespace std;
48 
CopyFromParam(PARAM & cPar)49 void LM::CopyFromParam(PARAM &cPar) {
50   a_mode = cPar.a_mode;
51   d_pace = cPar.d_pace;
52 
53   file_bfile = cPar.file_bfile;
54   file_geno = cPar.file_geno;
55   file_out = cPar.file_out;
56   path_out = cPar.path_out;
57   file_gene = cPar.file_gene;
58 
59   time_opt = 0.0;
60 
61   ni_total = cPar.ni_total;
62   ns_total = cPar.ns_total;
63   ni_test = cPar.ni_test;
64   ns_test = cPar.ns_test;
65   n_cvt = cPar.n_cvt;
66 
67   ng_total = cPar.ng_total;
68   ng_test = 0;
69 
70   indicator_idv = cPar.indicator_idv;
71   indicator_snp = cPar.indicator_snp;
72   snpInfo = cPar.snpInfo;
73 
74   return;
75 }
76 
CopyToParam(PARAM & cPar)77 void LM::CopyToParam(PARAM &cPar) {
78   cPar.time_opt = time_opt;
79   cPar.ng_test = ng_test;
80   return;
81 }
82 
WriteFiles()83 void LM::WriteFiles() {
84   string file_str;
85   file_str = path_out + "/" + file_out;
86   file_str += ".assoc.txt";
87 
88   ofstream outfile(file_str.c_str(), ofstream::out);
89   if (!outfile) {
90     cout << "error writing file: " << file_str.c_str() << endl;
91     return;
92   }
93 
94   if (!file_gene.empty()) {
95     outfile << "geneID"
96             << "\t";
97 
98     if (a_mode == 51) {
99       outfile << "beta"
100               << "\t"
101               << "se"
102               << "\t"
103               << "p_wald" << endl;
104     } else if (a_mode == 52) {
105       outfile << "p_lrt" << endl;
106     } else if (a_mode == 53) {
107       outfile << "beta"
108               << "\t"
109               << "se"
110               << "\t"
111               << "p_score" << endl;
112     } else if (a_mode == 54) {
113       outfile << "beta"
114               << "\t"
115               << "se"
116               << "\t"
117               << "p_wald"
118               << "\t"
119               << "p_lrt"
120               << "\t"
121               << "p_score" << endl;
122     } else {
123     }
124 
125     for (vector<SUMSTAT>::size_type t = 0; t < sumStat.size(); ++t) {
126       outfile << snpInfo[t].rs_number << "\t";
127 
128       if (a_mode == 51) {
129         outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
130                 << sumStat[t].se << "\t" << sumStat[t].p_wald << endl;
131       } else if (a_mode == 52) {
132         outfile << scientific << setprecision(6) << "\t" << sumStat[t].p_lrt
133                 << endl;
134       } else if (a_mode == 53) {
135         outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
136                 << sumStat[t].se << "\t" << sumStat[t].p_score << endl;
137       } else if (a_mode == 54) {
138         outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
139                 << sumStat[t].se << "\t" << sumStat[t].p_wald << "\t"
140                 << sumStat[t].p_lrt << "\t" << sumStat[t].p_score << endl;
141       } else {
142       }
143     }
144   } else {
145     outfile << "chr"
146             << "\t"
147             << "rs"
148             << "\t"
149             << "ps"
150             << "\t"
151             << "n_mis"
152             << "\t"
153             << "n_obs"
154             << "\t"
155             << "allele1"
156             << "\t"
157             << "allele0"
158             << "\t"
159             << "af"
160             << "\t";
161 
162     if (a_mode == 51) {
163       outfile << "beta"
164               << "\t"
165               << "se"
166               << "\t"
167               << "p_wald" << endl;
168     } else if (a_mode == 52) {
169       outfile << "p_lrt" << endl;
170     } else if (a_mode == 53) {
171       outfile << "beta"
172               << "\t"
173               << "se"
174               << "\t"
175               << "p_score" << endl;
176     } else if (a_mode == 54) {
177       outfile << "beta"
178               << "\t"
179               << "se"
180               << "\t"
181               << "p_wald"
182               << "\t"
183               << "p_lrt"
184               << "\t"
185               << "p_score" << endl;
186     } else {
187     }
188 
189     size_t t = 0;
190     for (size_t i = 0; i < snpInfo.size(); ++i) {
191       if (indicator_snp[i] == 0) {
192         continue;
193       }
194 
195       outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
196               << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t"
197               << ni_test - snpInfo[i].n_miss << "\t" << snpInfo[i].a_minor
198               << "\t" << snpInfo[i].a_major << "\t" << fixed << setprecision(3)
199               << snpInfo[i].maf << "\t";
200 
201       if (a_mode == 51) {
202         outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
203                 << sumStat[t].se << "\t" << sumStat[t].p_wald << endl;
204       } else if (a_mode == 52) {
205         outfile << scientific << setprecision(6) << sumStat[t].p_lrt << endl;
206       } else if (a_mode == 53) {
207         outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
208                 << sumStat[t].se << "\t" << sumStat[t].p_score << endl;
209       } else if (a_mode == 54) {
210         outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
211                 << sumStat[t].se << "\t" << sumStat[t].p_wald << "\t"
212                 << sumStat[t].p_lrt << "\t" << sumStat[t].p_score << endl;
213       } else {
214       }
215       t++;
216     }
217   }
218 
219   outfile.close();
220   outfile.clear();
221   return;
222 }
223 
CalcvPv(const gsl_matrix * WtWi,const gsl_vector * Wty,const gsl_vector * Wtx,const gsl_vector * y,const gsl_vector * x,double & xPwy,double & xPwx)224 void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty,
225              const gsl_vector *Wtx, const gsl_vector *y, const gsl_vector *x,
226              double &xPwy, double &xPwx) {
227   size_t c_size = Wty->size;
228   double d;
229 
230   gsl_vector *WtWiWtx = gsl_vector_alloc(c_size);
231 
232   gsl_blas_ddot(x, x, &xPwx);
233   gsl_blas_ddot(x, y, &xPwy);
234   gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
235 
236   gsl_blas_ddot(WtWiWtx, Wtx, &d);
237   xPwx -= d;
238 
239   gsl_blas_ddot(WtWiWtx, Wty, &d);
240   xPwy -= d;
241 
242   gsl_vector_free(WtWiWtx);
243 
244   return;
245 }
246 
CalcvPv(const gsl_matrix * WtWi,const gsl_vector * Wty,const gsl_vector * y,double & yPwy)247 void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y,
248              double &yPwy) {
249   size_t c_size = Wty->size;
250   double d;
251 
252   gsl_vector *WtWiWty = gsl_vector_alloc(c_size);
253 
254   gsl_blas_ddot(y, y, &yPwy);
255   gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty);
256 
257   gsl_blas_ddot(WtWiWty, Wty, &d);
258   yPwy -= d;
259 
260   gsl_vector_free(WtWiWty);
261 
262   return;
263 }
264 
265 // Calculate p-values and beta/se in a linear model.
LmCalcP(const size_t test_mode,const double yPwy,const double xPwy,const double xPwx,const double df,const size_t n_size,double & beta,double & se,double & p_wald,double & p_lrt,double & p_score)266 void LmCalcP(const size_t test_mode, const double yPwy, const double xPwy,
267              const double xPwx, const double df, const size_t n_size,
268              double &beta, double &se, double &p_wald, double &p_lrt,
269              double &p_score) {
270   double yPxy = yPwy - xPwy * xPwy / xPwx;
271   double se_wald, se_score;
272 
273   beta = xPwy / xPwx;
274   se_wald = sqrt(yPxy / (df * xPwx));
275   se_score = sqrt(yPwy / ((double)n_size * xPwx));
276 
277   p_wald = gsl_cdf_fdist_Q(beta * beta / (se_wald * se_wald), 1.0, df);
278   p_score = gsl_cdf_fdist_Q(beta * beta / (se_score * se_score), 1.0, df);
279   p_lrt = gsl_cdf_chisq_Q((double)n_size * (log(yPwy) - log(yPxy)), 1);
280 
281   if (test_mode == 3) {
282     se = se_score;
283   } else {
284     se = se_wald;
285   }
286 
287   return;
288 }
289 
AnalyzeGene(const gsl_matrix * W,const gsl_vector * x)290 void LM::AnalyzeGene(const gsl_matrix *W, const gsl_vector *x) {
291   debug_msg("entering");
292   ifstream infile(file_gene.c_str(), ifstream::in);
293   if (!infile) {
294     cout << "error reading gene expression file:" << file_gene << endl;
295     return;
296   }
297 
298   clock_t time_start = clock();
299 
300   string line;
301   char *ch_ptr;
302 
303   double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0;
304   int c_phen;
305   string rs; // Gene id.
306   double d;
307 
308   // Calculate some basic quantities.
309   double yPwy, xPwy, xPwx;
310   double df = (double)W->size1 - (double)W->size2 - 1.0;
311 
312   gsl_vector *y = gsl_vector_alloc(W->size1);
313 
314   gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2);
315   gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2);
316   gsl_vector *Wty = gsl_vector_alloc(W->size2);
317   gsl_vector *Wtx = gsl_vector_alloc(W->size2);
318   gsl_permutation *pmt = gsl_permutation_alloc(W->size2);
319 
320   gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
321   int sig;
322   LUDecomp(WtW, pmt, &sig);
323   LUInvert(WtW, pmt, WtWi);
324 
325   gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
326   CalcvPv(WtWi, Wtx, x, xPwx);
327 
328   // Header.
329   getline(infile, line);
330 
331   for (size_t t = 0; t < ng_total; t++) {
332     getline(infile, line);
333     if (t % d_pace == 0 || t == ng_total - 1) {
334       ProgressBar("Performing Analysis", t, ng_total - 1);
335     }
336     ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
337     rs = ch_ptr;
338 
339     c_phen = 0;
340     for (size_t i = 0; i < indicator_idv.size(); ++i) {
341       ch_ptr = strtok_safe(NULL, " , \t");
342       if (indicator_idv[i] == 0) {
343         continue;
344       }
345 
346       d = atof(ch_ptr);
347       gsl_vector_set(y, c_phen, d);
348 
349       c_phen++;
350     }
351 
352     // Calculate statistics.
353     time_start = clock();
354 
355     gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
356     CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy);
357     LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald,
358             p_lrt, p_score);
359 
360     time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
361 
362     // Store summary data.
363     SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0 };
364     sumStat.push_back(SNPs);
365   }
366   cout << endl;
367 
368   gsl_vector_free(y);
369 
370   gsl_matrix_free(WtW);
371   gsl_matrix_free(WtWi);
372   gsl_vector_free(Wty);
373   gsl_vector_free(Wtx);
374   gsl_permutation_free(pmt);
375 
376   infile.close();
377   infile.clear();
378 
379   return;
380 }
381 
AnalyzeBimbam(const gsl_matrix * W,const gsl_vector * y)382 void LM::AnalyzeBimbam(const gsl_matrix *W, const gsl_vector *y) {
383   debug_msg("entering");
384   igzstream infile(file_geno.c_str(), igzstream::in);
385   if (!infile) {
386     cout << "error reading genotype file:" << file_geno << endl;
387     return;
388   }
389 
390   clock_t time_start = clock();
391 
392   string line;
393   char *ch_ptr;
394 
395   double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0;
396   int n_miss, c_phen;
397   double geno, x_mean;
398 
399   // Calculate some basic quantities.
400   double yPwy, xPwy, xPwx;
401   double df = (double)W->size1 - (double)W->size2 - 1.0;
402 
403   gsl_vector *x = gsl_vector_alloc(W->size1);
404   gsl_vector *x_miss = gsl_vector_alloc(W->size1);
405 
406   gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2);
407   gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2);
408   gsl_vector *Wty = gsl_vector_alloc(W->size2);
409   gsl_vector *Wtx = gsl_vector_alloc(W->size2);
410   gsl_permutation *pmt = gsl_permutation_alloc(W->size2);
411 
412   gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
413   int sig;
414   LUDecomp(WtW, pmt, &sig);
415   LUInvert(WtW, pmt, WtWi);
416 
417   gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
418   CalcvPv(WtWi, Wty, y, yPwy);
419 
420   // Start reading genotypes and analyze.
421   for (size_t t = 0; t < indicator_snp.size(); ++t) {
422     getline(infile, line);
423     if (t % d_pace == 0 || t == (ns_total - 1)) {
424       ProgressBar("Reading SNPs", t, ns_total - 1);
425     }
426     if (indicator_snp[t] == 0) {
427       continue;
428     }
429 
430     ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
431     ch_ptr = strtok_safe(NULL, " , \t");
432     ch_ptr = strtok_safe(NULL, " , \t");
433 
434     x_mean = 0.0;
435     c_phen = 0;
436     n_miss = 0;
437     gsl_vector_set_zero(x_miss);
438     for (size_t i = 0; i < ni_total; ++i) {
439       ch_ptr = strtok_safe(NULL, " , \t");
440       if (indicator_idv[i] == 0) {
441         continue;
442       }
443 
444       if (strcmp(ch_ptr, "NA") == 0) {
445         gsl_vector_set(x_miss, c_phen, 0.0);
446         n_miss++;
447       } else {
448         geno = atof(ch_ptr);
449 
450         gsl_vector_set(x, c_phen, geno);
451         gsl_vector_set(x_miss, c_phen, 1.0);
452         x_mean += geno;
453       }
454       c_phen++;
455     }
456 
457     x_mean /= (double)(ni_test - n_miss);
458 
459     for (size_t i = 0; i < ni_test; ++i) {
460       if (gsl_vector_get(x_miss, i) == 0) {
461         gsl_vector_set(x, i, x_mean);
462       }
463       geno = gsl_vector_get(x, i);
464     }
465 
466     // Calculate statistics.
467     time_start = clock();
468 
469     gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
470     CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
471     LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald,
472             p_lrt, p_score);
473 
474     time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
475 
476     // Store summary data.
477     SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0};
478     sumStat.push_back(SNPs);
479   }
480   cout << endl;
481 
482   gsl_vector_free(x);
483   gsl_vector_free(x_miss);
484 
485   gsl_matrix_free(WtW);
486   gsl_matrix_free(WtWi);
487   gsl_vector_free(Wty);
488   gsl_vector_free(Wtx);
489   gsl_permutation_free(pmt);
490 
491   infile.close();
492   infile.clear();
493 
494   return;
495 }
496 
AnalyzePlink(const gsl_matrix * W,const gsl_vector * y)497 void LM::AnalyzePlink(const gsl_matrix *W, const gsl_vector *y) {
498   debug_msg("entering");
499   string file_bed = file_bfile + ".bed";
500   ifstream infile(file_bed.c_str(), ios::binary);
501   if (!infile) {
502     cout << "error reading bed file:" << file_bed << endl;
503     return;
504   }
505 
506   clock_t time_start = clock();
507 
508   char ch[1];
509   bitset<8> b;
510 
511   double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0;
512   int n_bit, n_miss, ci_total, ci_test;
513   double geno, x_mean;
514 
515   // Calculate some basic quantities.
516   double yPwy, xPwy, xPwx;
517   double df = (double)W->size1 - (double)W->size2 - 1.0;
518 
519   gsl_vector *x = gsl_vector_alloc(W->size1);
520 
521   gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2);
522   gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2);
523   gsl_vector *Wty = gsl_vector_alloc(W->size2);
524   gsl_vector *Wtx = gsl_vector_alloc(W->size2);
525   gsl_permutation *pmt = gsl_permutation_alloc(W->size2);
526 
527   gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
528   int sig;
529   LUDecomp(WtW, pmt, &sig);
530   LUInvert(WtW, pmt, WtWi);
531 
532   gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
533   CalcvPv(WtWi, Wty, y, yPwy);
534 
535   // Calculate n_bit and c, the number of bit for each SNP.
536   if (ni_total % 4 == 0) {
537     n_bit = ni_total / 4;
538   } else {
539     n_bit = ni_total / 4 + 1;
540   }
541 
542   // Print the first three magic numbers.
543   for (int i = 0; i < 3; ++i) {
544     infile.read(ch, 1);
545     b = ch[0];
546   }
547 
548   for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) {
549     if (t % d_pace == 0 || t == snpInfo.size() - 1) {
550       ProgressBar("Reading SNPs", t, snpInfo.size() - 1);
551     }
552     if (indicator_snp[t] == 0) {
553       continue;
554     }
555 
556     // n_bit, and 3 is the number of magic numbers.
557     infile.seekg(t * n_bit + 3);
558 
559     // Read genotypes.
560     x_mean = 0.0;
561     n_miss = 0;
562     ci_total = 0;
563     ci_test = 0;
564     for (int i = 0; i < n_bit; ++i) {
565       infile.read(ch, 1);
566       b = ch[0];
567 
568       // Minor allele homozygous: 2.0; major: 0.0;
569       for (size_t j = 0; j < 4; ++j) {
570         if ((i == (n_bit - 1)) && ci_total == (int)ni_total) {
571           break;
572         }
573         if (indicator_idv[ci_total] == 0) {
574           ci_total++;
575           continue;
576         }
577 
578         if (b[2 * j] == 0) {
579           if (b[2 * j + 1] == 0) {
580             gsl_vector_set(x, ci_test, 2);
581             x_mean += 2.0;
582           } else {
583             gsl_vector_set(x, ci_test, 1);
584             x_mean += 1.0;
585           }
586         } else {
587           if (b[2 * j + 1] == 1) {
588             gsl_vector_set(x, ci_test, 0);
589           } else {
590             gsl_vector_set(x, ci_test, -9);
591             n_miss++;
592           }
593         }
594 
595         ci_total++;
596         ci_test++;
597       }
598     }
599 
600     x_mean /= (double)(ni_test - n_miss);
601 
602     for (size_t i = 0; i < ni_test; ++i) {
603       geno = gsl_vector_get(x, i);
604       if (geno == -9) {
605         gsl_vector_set(x, i, x_mean);
606         geno = x_mean;
607       }
608     }
609 
610     // Calculate statistics.
611     time_start = clock();
612 
613     gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
614     CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
615     LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald,
616             p_lrt, p_score);
617 
618     // store summary data
619     SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0};
620     sumStat.push_back(SNPs);
621 
622     time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
623   }
624   cout << endl;
625 
626   gsl_vector_free(x);
627 
628   gsl_matrix_free(WtW);
629   gsl_matrix_free(WtWi);
630   gsl_vector_free(Wty);
631   gsl_vector_free(Wtx);
632   gsl_permutation_free(pmt);
633 
634   infile.close();
635   infile.clear();
636 
637   return;
638 }
639 
640 // Make sure that both y and X are centered already.
MatrixCalcLmLR(const gsl_matrix * X,const gsl_vector * y,vector<pair<size_t,double>> & pos_loglr)641 void MatrixCalcLmLR(const gsl_matrix *X, const gsl_vector *y,
642                     vector<pair<size_t, double>> &pos_loglr) {
643   double yty, xty, xtx, log_lr;
644   gsl_blas_ddot(y, y, &yty);
645 
646   for (size_t i = 0; i < X->size2; ++i) {
647     gsl_vector_const_view X_col = gsl_matrix_const_column(X, i);
648     gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx);
649     gsl_blas_ddot(&X_col.vector, y, &xty);
650 
651     log_lr = 0.5 * (double)y->size * (log(yty) - log(yty - xty * xty / xtx));
652     pos_loglr.push_back(make_pair(i, log_lr));
653   }
654 
655   return;
656 }
657