1 /*
2 Genome-wide Efficient Mixed Model Association (GEMMA)
3 Copyright (C) 2011-2017 Xiang Zhou
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <fstream>
20 #include <iostream>
21 #include <sstream>
22
23 #include <assert.h>
24 #include <bitset>
25 #include <cmath>
26 #include <cstring>
27 #include <iomanip>
28 #include <iostream>
29 #include <stdio.h>
30 #include <stdlib.h>
31
32 #include "gsl/gsl_blas.h"
33 #include "gsl/gsl_linalg.h"
34 #include "gsl/gsl_matrix.h"
35 #include "gsl/gsl_vector.h"
36
37 #include "gsl/gsl_cdf.h"
38 #include "gsl/gsl_integration.h"
39 #include "gsl/gsl_min.h"
40 #include "gsl/gsl_roots.h"
41
42 // #include "eigenlib.h"
43 #include "gzstream.h"
44 #include "lapack.h"
45 #include "lm.h"
46
47 using namespace std;
48
CopyFromParam(PARAM & cPar)49 void LM::CopyFromParam(PARAM &cPar) {
50 a_mode = cPar.a_mode;
51 d_pace = cPar.d_pace;
52
53 file_bfile = cPar.file_bfile;
54 file_geno = cPar.file_geno;
55 file_out = cPar.file_out;
56 path_out = cPar.path_out;
57 file_gene = cPar.file_gene;
58
59 time_opt = 0.0;
60
61 ni_total = cPar.ni_total;
62 ns_total = cPar.ns_total;
63 ni_test = cPar.ni_test;
64 ns_test = cPar.ns_test;
65 n_cvt = cPar.n_cvt;
66
67 ng_total = cPar.ng_total;
68 ng_test = 0;
69
70 indicator_idv = cPar.indicator_idv;
71 indicator_snp = cPar.indicator_snp;
72 snpInfo = cPar.snpInfo;
73
74 return;
75 }
76
CopyToParam(PARAM & cPar)77 void LM::CopyToParam(PARAM &cPar) {
78 cPar.time_opt = time_opt;
79 cPar.ng_test = ng_test;
80 return;
81 }
82
WriteFiles()83 void LM::WriteFiles() {
84 string file_str;
85 file_str = path_out + "/" + file_out;
86 file_str += ".assoc.txt";
87
88 ofstream outfile(file_str.c_str(), ofstream::out);
89 if (!outfile) {
90 cout << "error writing file: " << file_str.c_str() << endl;
91 return;
92 }
93
94 if (!file_gene.empty()) {
95 outfile << "geneID"
96 << "\t";
97
98 if (a_mode == 51) {
99 outfile << "beta"
100 << "\t"
101 << "se"
102 << "\t"
103 << "p_wald" << endl;
104 } else if (a_mode == 52) {
105 outfile << "p_lrt" << endl;
106 } else if (a_mode == 53) {
107 outfile << "beta"
108 << "\t"
109 << "se"
110 << "\t"
111 << "p_score" << endl;
112 } else if (a_mode == 54) {
113 outfile << "beta"
114 << "\t"
115 << "se"
116 << "\t"
117 << "p_wald"
118 << "\t"
119 << "p_lrt"
120 << "\t"
121 << "p_score" << endl;
122 } else {
123 }
124
125 for (vector<SUMSTAT>::size_type t = 0; t < sumStat.size(); ++t) {
126 outfile << snpInfo[t].rs_number << "\t";
127
128 if (a_mode == 51) {
129 outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
130 << sumStat[t].se << "\t" << sumStat[t].p_wald << endl;
131 } else if (a_mode == 52) {
132 outfile << scientific << setprecision(6) << "\t" << sumStat[t].p_lrt
133 << endl;
134 } else if (a_mode == 53) {
135 outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
136 << sumStat[t].se << "\t" << sumStat[t].p_score << endl;
137 } else if (a_mode == 54) {
138 outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
139 << sumStat[t].se << "\t" << sumStat[t].p_wald << "\t"
140 << sumStat[t].p_lrt << "\t" << sumStat[t].p_score << endl;
141 } else {
142 }
143 }
144 } else {
145 outfile << "chr"
146 << "\t"
147 << "rs"
148 << "\t"
149 << "ps"
150 << "\t"
151 << "n_mis"
152 << "\t"
153 << "n_obs"
154 << "\t"
155 << "allele1"
156 << "\t"
157 << "allele0"
158 << "\t"
159 << "af"
160 << "\t";
161
162 if (a_mode == 51) {
163 outfile << "beta"
164 << "\t"
165 << "se"
166 << "\t"
167 << "p_wald" << endl;
168 } else if (a_mode == 52) {
169 outfile << "p_lrt" << endl;
170 } else if (a_mode == 53) {
171 outfile << "beta"
172 << "\t"
173 << "se"
174 << "\t"
175 << "p_score" << endl;
176 } else if (a_mode == 54) {
177 outfile << "beta"
178 << "\t"
179 << "se"
180 << "\t"
181 << "p_wald"
182 << "\t"
183 << "p_lrt"
184 << "\t"
185 << "p_score" << endl;
186 } else {
187 }
188
189 size_t t = 0;
190 for (size_t i = 0; i < snpInfo.size(); ++i) {
191 if (indicator_snp[i] == 0) {
192 continue;
193 }
194
195 outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
196 << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t"
197 << ni_test - snpInfo[i].n_miss << "\t" << snpInfo[i].a_minor
198 << "\t" << snpInfo[i].a_major << "\t" << fixed << setprecision(3)
199 << snpInfo[i].maf << "\t";
200
201 if (a_mode == 51) {
202 outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
203 << sumStat[t].se << "\t" << sumStat[t].p_wald << endl;
204 } else if (a_mode == 52) {
205 outfile << scientific << setprecision(6) << sumStat[t].p_lrt << endl;
206 } else if (a_mode == 53) {
207 outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
208 << sumStat[t].se << "\t" << sumStat[t].p_score << endl;
209 } else if (a_mode == 54) {
210 outfile << scientific << setprecision(6) << sumStat[t].beta << "\t"
211 << sumStat[t].se << "\t" << sumStat[t].p_wald << "\t"
212 << sumStat[t].p_lrt << "\t" << sumStat[t].p_score << endl;
213 } else {
214 }
215 t++;
216 }
217 }
218
219 outfile.close();
220 outfile.clear();
221 return;
222 }
223
CalcvPv(const gsl_matrix * WtWi,const gsl_vector * Wty,const gsl_vector * Wtx,const gsl_vector * y,const gsl_vector * x,double & xPwy,double & xPwx)224 void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty,
225 const gsl_vector *Wtx, const gsl_vector *y, const gsl_vector *x,
226 double &xPwy, double &xPwx) {
227 size_t c_size = Wty->size;
228 double d;
229
230 gsl_vector *WtWiWtx = gsl_vector_alloc(c_size);
231
232 gsl_blas_ddot(x, x, &xPwx);
233 gsl_blas_ddot(x, y, &xPwy);
234 gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
235
236 gsl_blas_ddot(WtWiWtx, Wtx, &d);
237 xPwx -= d;
238
239 gsl_blas_ddot(WtWiWtx, Wty, &d);
240 xPwy -= d;
241
242 gsl_vector_free(WtWiWtx);
243
244 return;
245 }
246
CalcvPv(const gsl_matrix * WtWi,const gsl_vector * Wty,const gsl_vector * y,double & yPwy)247 void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y,
248 double &yPwy) {
249 size_t c_size = Wty->size;
250 double d;
251
252 gsl_vector *WtWiWty = gsl_vector_alloc(c_size);
253
254 gsl_blas_ddot(y, y, &yPwy);
255 gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty);
256
257 gsl_blas_ddot(WtWiWty, Wty, &d);
258 yPwy -= d;
259
260 gsl_vector_free(WtWiWty);
261
262 return;
263 }
264
265 // Calculate p-values and beta/se in a linear model.
LmCalcP(const size_t test_mode,const double yPwy,const double xPwy,const double xPwx,const double df,const size_t n_size,double & beta,double & se,double & p_wald,double & p_lrt,double & p_score)266 void LmCalcP(const size_t test_mode, const double yPwy, const double xPwy,
267 const double xPwx, const double df, const size_t n_size,
268 double &beta, double &se, double &p_wald, double &p_lrt,
269 double &p_score) {
270 double yPxy = yPwy - xPwy * xPwy / xPwx;
271 double se_wald, se_score;
272
273 beta = xPwy / xPwx;
274 se_wald = sqrt(yPxy / (df * xPwx));
275 se_score = sqrt(yPwy / ((double)n_size * xPwx));
276
277 p_wald = gsl_cdf_fdist_Q(beta * beta / (se_wald * se_wald), 1.0, df);
278 p_score = gsl_cdf_fdist_Q(beta * beta / (se_score * se_score), 1.0, df);
279 p_lrt = gsl_cdf_chisq_Q((double)n_size * (log(yPwy) - log(yPxy)), 1);
280
281 if (test_mode == 3) {
282 se = se_score;
283 } else {
284 se = se_wald;
285 }
286
287 return;
288 }
289
AnalyzeGene(const gsl_matrix * W,const gsl_vector * x)290 void LM::AnalyzeGene(const gsl_matrix *W, const gsl_vector *x) {
291 debug_msg("entering");
292 ifstream infile(file_gene.c_str(), ifstream::in);
293 if (!infile) {
294 cout << "error reading gene expression file:" << file_gene << endl;
295 return;
296 }
297
298 clock_t time_start = clock();
299
300 string line;
301 char *ch_ptr;
302
303 double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0;
304 int c_phen;
305 string rs; // Gene id.
306 double d;
307
308 // Calculate some basic quantities.
309 double yPwy, xPwy, xPwx;
310 double df = (double)W->size1 - (double)W->size2 - 1.0;
311
312 gsl_vector *y = gsl_vector_alloc(W->size1);
313
314 gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2);
315 gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2);
316 gsl_vector *Wty = gsl_vector_alloc(W->size2);
317 gsl_vector *Wtx = gsl_vector_alloc(W->size2);
318 gsl_permutation *pmt = gsl_permutation_alloc(W->size2);
319
320 gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
321 int sig;
322 LUDecomp(WtW, pmt, &sig);
323 LUInvert(WtW, pmt, WtWi);
324
325 gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
326 CalcvPv(WtWi, Wtx, x, xPwx);
327
328 // Header.
329 getline(infile, line);
330
331 for (size_t t = 0; t < ng_total; t++) {
332 getline(infile, line);
333 if (t % d_pace == 0 || t == ng_total - 1) {
334 ProgressBar("Performing Analysis", t, ng_total - 1);
335 }
336 ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
337 rs = ch_ptr;
338
339 c_phen = 0;
340 for (size_t i = 0; i < indicator_idv.size(); ++i) {
341 ch_ptr = strtok_safe(NULL, " , \t");
342 if (indicator_idv[i] == 0) {
343 continue;
344 }
345
346 d = atof(ch_ptr);
347 gsl_vector_set(y, c_phen, d);
348
349 c_phen++;
350 }
351
352 // Calculate statistics.
353 time_start = clock();
354
355 gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
356 CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy);
357 LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald,
358 p_lrt, p_score);
359
360 time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
361
362 // Store summary data.
363 SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0 };
364 sumStat.push_back(SNPs);
365 }
366 cout << endl;
367
368 gsl_vector_free(y);
369
370 gsl_matrix_free(WtW);
371 gsl_matrix_free(WtWi);
372 gsl_vector_free(Wty);
373 gsl_vector_free(Wtx);
374 gsl_permutation_free(pmt);
375
376 infile.close();
377 infile.clear();
378
379 return;
380 }
381
AnalyzeBimbam(const gsl_matrix * W,const gsl_vector * y)382 void LM::AnalyzeBimbam(const gsl_matrix *W, const gsl_vector *y) {
383 debug_msg("entering");
384 igzstream infile(file_geno.c_str(), igzstream::in);
385 if (!infile) {
386 cout << "error reading genotype file:" << file_geno << endl;
387 return;
388 }
389
390 clock_t time_start = clock();
391
392 string line;
393 char *ch_ptr;
394
395 double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0;
396 int n_miss, c_phen;
397 double geno, x_mean;
398
399 // Calculate some basic quantities.
400 double yPwy, xPwy, xPwx;
401 double df = (double)W->size1 - (double)W->size2 - 1.0;
402
403 gsl_vector *x = gsl_vector_alloc(W->size1);
404 gsl_vector *x_miss = gsl_vector_alloc(W->size1);
405
406 gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2);
407 gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2);
408 gsl_vector *Wty = gsl_vector_alloc(W->size2);
409 gsl_vector *Wtx = gsl_vector_alloc(W->size2);
410 gsl_permutation *pmt = gsl_permutation_alloc(W->size2);
411
412 gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
413 int sig;
414 LUDecomp(WtW, pmt, &sig);
415 LUInvert(WtW, pmt, WtWi);
416
417 gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
418 CalcvPv(WtWi, Wty, y, yPwy);
419
420 // Start reading genotypes and analyze.
421 for (size_t t = 0; t < indicator_snp.size(); ++t) {
422 getline(infile, line);
423 if (t % d_pace == 0 || t == (ns_total - 1)) {
424 ProgressBar("Reading SNPs", t, ns_total - 1);
425 }
426 if (indicator_snp[t] == 0) {
427 continue;
428 }
429
430 ch_ptr = strtok_safe((char *)line.c_str(), " , \t");
431 ch_ptr = strtok_safe(NULL, " , \t");
432 ch_ptr = strtok_safe(NULL, " , \t");
433
434 x_mean = 0.0;
435 c_phen = 0;
436 n_miss = 0;
437 gsl_vector_set_zero(x_miss);
438 for (size_t i = 0; i < ni_total; ++i) {
439 ch_ptr = strtok_safe(NULL, " , \t");
440 if (indicator_idv[i] == 0) {
441 continue;
442 }
443
444 if (strcmp(ch_ptr, "NA") == 0) {
445 gsl_vector_set(x_miss, c_phen, 0.0);
446 n_miss++;
447 } else {
448 geno = atof(ch_ptr);
449
450 gsl_vector_set(x, c_phen, geno);
451 gsl_vector_set(x_miss, c_phen, 1.0);
452 x_mean += geno;
453 }
454 c_phen++;
455 }
456
457 x_mean /= (double)(ni_test - n_miss);
458
459 for (size_t i = 0; i < ni_test; ++i) {
460 if (gsl_vector_get(x_miss, i) == 0) {
461 gsl_vector_set(x, i, x_mean);
462 }
463 geno = gsl_vector_get(x, i);
464 }
465
466 // Calculate statistics.
467 time_start = clock();
468
469 gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
470 CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
471 LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald,
472 p_lrt, p_score);
473
474 time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
475
476 // Store summary data.
477 SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0};
478 sumStat.push_back(SNPs);
479 }
480 cout << endl;
481
482 gsl_vector_free(x);
483 gsl_vector_free(x_miss);
484
485 gsl_matrix_free(WtW);
486 gsl_matrix_free(WtWi);
487 gsl_vector_free(Wty);
488 gsl_vector_free(Wtx);
489 gsl_permutation_free(pmt);
490
491 infile.close();
492 infile.clear();
493
494 return;
495 }
496
AnalyzePlink(const gsl_matrix * W,const gsl_vector * y)497 void LM::AnalyzePlink(const gsl_matrix *W, const gsl_vector *y) {
498 debug_msg("entering");
499 string file_bed = file_bfile + ".bed";
500 ifstream infile(file_bed.c_str(), ios::binary);
501 if (!infile) {
502 cout << "error reading bed file:" << file_bed << endl;
503 return;
504 }
505
506 clock_t time_start = clock();
507
508 char ch[1];
509 bitset<8> b;
510
511 double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0;
512 int n_bit, n_miss, ci_total, ci_test;
513 double geno, x_mean;
514
515 // Calculate some basic quantities.
516 double yPwy, xPwy, xPwx;
517 double df = (double)W->size1 - (double)W->size2 - 1.0;
518
519 gsl_vector *x = gsl_vector_alloc(W->size1);
520
521 gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2);
522 gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2);
523 gsl_vector *Wty = gsl_vector_alloc(W->size2);
524 gsl_vector *Wtx = gsl_vector_alloc(W->size2);
525 gsl_permutation *pmt = gsl_permutation_alloc(W->size2);
526
527 gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
528 int sig;
529 LUDecomp(WtW, pmt, &sig);
530 LUInvert(WtW, pmt, WtWi);
531
532 gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
533 CalcvPv(WtWi, Wty, y, yPwy);
534
535 // Calculate n_bit and c, the number of bit for each SNP.
536 if (ni_total % 4 == 0) {
537 n_bit = ni_total / 4;
538 } else {
539 n_bit = ni_total / 4 + 1;
540 }
541
542 // Print the first three magic numbers.
543 for (int i = 0; i < 3; ++i) {
544 infile.read(ch, 1);
545 b = ch[0];
546 }
547
548 for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) {
549 if (t % d_pace == 0 || t == snpInfo.size() - 1) {
550 ProgressBar("Reading SNPs", t, snpInfo.size() - 1);
551 }
552 if (indicator_snp[t] == 0) {
553 continue;
554 }
555
556 // n_bit, and 3 is the number of magic numbers.
557 infile.seekg(t * n_bit + 3);
558
559 // Read genotypes.
560 x_mean = 0.0;
561 n_miss = 0;
562 ci_total = 0;
563 ci_test = 0;
564 for (int i = 0; i < n_bit; ++i) {
565 infile.read(ch, 1);
566 b = ch[0];
567
568 // Minor allele homozygous: 2.0; major: 0.0;
569 for (size_t j = 0; j < 4; ++j) {
570 if ((i == (n_bit - 1)) && ci_total == (int)ni_total) {
571 break;
572 }
573 if (indicator_idv[ci_total] == 0) {
574 ci_total++;
575 continue;
576 }
577
578 if (b[2 * j] == 0) {
579 if (b[2 * j + 1] == 0) {
580 gsl_vector_set(x, ci_test, 2);
581 x_mean += 2.0;
582 } else {
583 gsl_vector_set(x, ci_test, 1);
584 x_mean += 1.0;
585 }
586 } else {
587 if (b[2 * j + 1] == 1) {
588 gsl_vector_set(x, ci_test, 0);
589 } else {
590 gsl_vector_set(x, ci_test, -9);
591 n_miss++;
592 }
593 }
594
595 ci_total++;
596 ci_test++;
597 }
598 }
599
600 x_mean /= (double)(ni_test - n_miss);
601
602 for (size_t i = 0; i < ni_test; ++i) {
603 geno = gsl_vector_get(x, i);
604 if (geno == -9) {
605 gsl_vector_set(x, i, x_mean);
606 geno = x_mean;
607 }
608 }
609
610 // Calculate statistics.
611 time_start = clock();
612
613 gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
614 CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
615 LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald,
616 p_lrt, p_score);
617
618 // store summary data
619 SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0};
620 sumStat.push_back(SNPs);
621
622 time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
623 }
624 cout << endl;
625
626 gsl_vector_free(x);
627
628 gsl_matrix_free(WtW);
629 gsl_matrix_free(WtWi);
630 gsl_vector_free(Wty);
631 gsl_vector_free(Wtx);
632 gsl_permutation_free(pmt);
633
634 infile.close();
635 infile.clear();
636
637 return;
638 }
639
640 // Make sure that both y and X are centered already.
MatrixCalcLmLR(const gsl_matrix * X,const gsl_vector * y,vector<pair<size_t,double>> & pos_loglr)641 void MatrixCalcLmLR(const gsl_matrix *X, const gsl_vector *y,
642 vector<pair<size_t, double>> &pos_loglr) {
643 double yty, xty, xtx, log_lr;
644 gsl_blas_ddot(y, y, &yty);
645
646 for (size_t i = 0; i < X->size2; ++i) {
647 gsl_vector_const_view X_col = gsl_matrix_const_column(X, i);
648 gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx);
649 gsl_blas_ddot(&X_col.vector, y, &xty);
650
651 log_lr = 0.5 * (double)y->size * (log(yty) - log(yty - xty * xty / xtx));
652 pos_loglr.push_back(make_pair(i, log_lr));
653 }
654
655 return;
656 }
657