1 /*
2 Genome-wide Efficient Mixed Model Association (GEMMA)
3 Copyright (C) 2011-2017, Xiang Zhou
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <bitset>
20 #include <cmath>
21 #include <cstring>
22 #include <fstream>
23 #include <iomanip>
24 #include <iostream>
25 #include <map>
26 #include <set>
27 #include <sstream>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string>
31 #include <vector>
32
33 #include "gsl/gsl_blas.h"
34 #include "gsl/gsl_cdf.h"
35 #include "gsl/gsl_linalg.h"
36 #include "gsl/gsl_matrix.h"
37 #include "gsl/gsl_vector.h"
38
39 #include "gzstream.h"
40 #include "gemma_io.h"
41 #include "lapack.h"
42 #include "mathfunc.h"
43 #include "param.h"
44 #include "varcov.h"
45
46 using namespace std;
47
CopyFromParam(PARAM & cPar)48 void VARCOV::CopyFromParam(PARAM &cPar) {
49 d_pace = cPar.d_pace;
50
51 file_bfile = cPar.file_bfile;
52 file_geno = cPar.file_geno;
53 file_out = cPar.file_out;
54 path_out = cPar.path_out;
55
56 time_opt = 0.0;
57
58 window_cm = cPar.window_cm;
59 window_bp = cPar.window_bp;
60 window_ns = cPar.window_ns;
61
62 indicator_idv = cPar.indicator_idv;
63 indicator_snp = cPar.indicator_snp;
64 snpInfo = cPar.snpInfo;
65
66 return;
67 }
68
CopyToParam(PARAM & cPar)69 void VARCOV::CopyToParam(PARAM &cPar) {
70 cPar.time_opt = time_opt;
71 return;
72 }
73
WriteCov(const int flag,const vector<SNPINFO> & snpInfo_sub,const vector<vector<double>> & Cov_mat)74 void VARCOV::WriteCov(const int flag, const vector<SNPINFO> &snpInfo_sub,
75 const vector<vector<double>> &Cov_mat) {
76 string file_cov;
77 file_cov = path_out + "/" + file_out;
78 file_cov += ".cor.txt";
79
80 ofstream outfile;
81
82 if (flag == 0) {
83 outfile.open(file_cov.c_str(), ofstream::out);
84 if (!outfile) {
85 cout << "error writing file: " << file_cov << endl;
86 return;
87 }
88
89 outfile << "chr"
90 << "\t"
91 << "rs"
92 << "\t"
93 << "ps"
94 << "\t"
95 << "n_mis"
96 << "\t"
97 << "n_obs"
98 << "\t"
99 << "allele1"
100 << "\t"
101 << "allele0"
102 << "\t"
103 << "af"
104 << "\t"
105 << "window_size"
106 << "\t"
107 << "var"
108 << "\t"
109 << "cor" << endl;
110 } else {
111 outfile.open(file_cov.c_str(), ofstream::app);
112 if (!outfile) {
113 cout << "error writing file: " << file_cov << endl;
114 return;
115 }
116
117 for (size_t i = 0; i < Cov_mat.size(); i++) {
118 outfile << snpInfo_sub[i].chr << "\t" << snpInfo_sub[i].rs_number << "\t"
119 << snpInfo_sub[i].base_position << "\t" << snpInfo_sub[i].n_miss
120 << "\t" << snpInfo_sub[i].n_idv << "\t" << snpInfo_sub[i].a_minor
121 << "\t" << snpInfo_sub[i].a_major << "\t" << fixed
122 << setprecision(3) << snpInfo_sub[i].maf << "\t"
123 << Cov_mat[i].size() - 1 << "\t";
124 outfile << scientific << setprecision(6) << Cov_mat[i][0] << "\t";
125
126 if (Cov_mat[i].size() == 1) {
127 outfile << "NA";
128 } else {
129 for (size_t j = 1; j < Cov_mat[i].size(); j++) {
130 if (j == (Cov_mat[i].size() - 1)) {
131 outfile << Cov_mat[i][j];
132 } else {
133 outfile << Cov_mat[i][j] << ",";
134 }
135 }
136 }
137
138 outfile << endl;
139 }
140 }
141
142 outfile.close();
143 outfile.clear();
144 return;
145 }
146
CompareSNPinfo(const SNPINFO & snpInfo1,const SNPINFO & snpInfo2)147 bool CompareSNPinfo(const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) {
148 int c_chr = snpInfo1.chr.compare(snpInfo2.chr);
149 long int c_bp = snpInfo1.base_position - snpInfo2.base_position;
150
151 if (c_chr < 0) {
152 return true;
153 } else if (c_chr > 0) {
154 return false;
155 } else {
156 if (c_bp < 0) {
157 return true;
158 } else if (c_bp > 0) {
159 return false;
160 } else {
161 return true;
162 }
163 }
164 }
165
166 // Do not sort SNPs (because gzip files do not support random access)
167 // then calculate n_nb, the number of neighbours, for each SNP.
CalcNB(vector<SNPINFO> & snpInfo_sort)168 void VARCOV::CalcNB(vector<SNPINFO> &snpInfo_sort) {
169 size_t t2 = 0, n_nb = 0;
170 for (size_t t = 0; t < indicator_snp.size(); ++t) {
171 if (indicator_snp[t] == 0) {
172 continue;
173 }
174
175 if (snpInfo_sort[t].chr == "-9" ||
176 (snpInfo_sort[t].cM == -9 && window_cm != 0) ||
177 (snpInfo_sort[t].base_position == -9 && window_bp != 0)) {
178 snpInfo_sort[t].n_nb = 0;
179 continue;
180 }
181
182 if (t == indicator_snp.size() - 1) {
183 snpInfo_sort[t].n_nb = 0;
184 continue;
185 }
186
187 t2 = t + 1;
188 n_nb = 0;
189
190 while (t2 < indicator_snp.size() &&
191 snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
192 indicator_snp[t2] == 0) {
193 t2++;
194 }
195
196 while (t2 < indicator_snp.size() &&
197 snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
198 (snpInfo_sort[t2].cM - snpInfo_sort[t].cM < window_cm ||
199 window_cm == 0) &&
200 (snpInfo_sort[t2].base_position - snpInfo_sort[t].base_position <
201 (long int) window_bp ||
202 window_bp == 0) &&
203 (n_nb < window_ns || window_ns == 0)) {
204 t2++;
205 n_nb++;
206 while (t2 < indicator_snp.size() &&
207 snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
208 indicator_snp[t2] == 0) {
209 t2++;
210 }
211 }
212
213 snpInfo_sort[t].n_nb = n_nb;
214 }
215
216 return;
217 }
218
219 // Vector double is centered to have mean 0.
Calc_Cor(vector<vector<double>> & X_mat,vector<double> & cov_vec)220 void Calc_Cor(vector<vector<double>> &X_mat, vector<double> &cov_vec) {
221 cov_vec.clear();
222
223 double v1, v2, r;
224 vector<double> x_vec = X_mat[0];
225
226 lapack_ddot(x_vec, x_vec, v1);
227 cov_vec.push_back(v1 / (double)x_vec.size());
228
229 for (size_t i = 1; i < X_mat.size(); i++) {
230 lapack_ddot(X_mat[i], x_vec, r);
231 lapack_ddot(X_mat[i], X_mat[i], v2);
232 r /= sqrt(v1 * v2);
233
234 cov_vec.push_back(r);
235 }
236
237 return;
238 }
239
240 // Read the genotype file again, calculate r2 between pairs of SNPs
241 // within a window, output the file every 10K SNPs the output results
242 // are sorted based on chr and bp output format similar to assoc.txt
243 // files (remember n_miss is replaced by n_idv).
244 //
245 // r2 between the current SNP and every following SNPs within the
246 // window_size (which can vary if cM was used) read bimbam mean
247 // genotype file and calculate the covariance matrix for neighboring
248 // SNPs output values at 10000-SNP-interval.
AnalyzeBimbam()249 void VARCOV::AnalyzeBimbam() {
250 debug_msg("entering");
251 igzstream infile(file_geno.c_str(), igzstream::in);
252 if (!infile) {
253 cout << "error reading genotype file:" << file_geno << endl;
254 return;
255 }
256
257 // Calculate the number of right-hand-side neighbours for each SNP.
258 vector<SNPINFO> snpInfo_sub;
259 CalcNB(snpInfo);
260
261 size_t ni_test = 0;
262 for (size_t i = 0; i < indicator_idv.size(); i++) {
263 ni_test += indicator_idv[i];
264 }
265
266 gsl_vector *geno = gsl_vector_alloc(ni_test);
267 double geno_mean;
268
269 vector<double> x_vec, cov_vec;
270 vector<vector<double>> X_mat, Cov_mat;
271
272 for (size_t i = 0; i < ni_test; i++) {
273 x_vec.push_back(0);
274 }
275
276 WriteCov(0, snpInfo_sub, Cov_mat);
277
278 size_t t2 = 0, inc;
279 int n_nb = 0;
280
281 for (size_t t = 0; t < indicator_snp.size(); ++t) {
282 if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) {
283 ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1);
284 }
285 if (indicator_snp[t] == 0) {
286 continue;
287 }
288
289 if (X_mat.size() == 0) {
290 n_nb = snpInfo[t].n_nb + 1;
291 } else {
292 n_nb = snpInfo[t].n_nb - n_nb + 1;
293 }
294
295 for (int i = 0; i < n_nb; i++) {
296 if (X_mat.size() == 0) {
297 t2 = t;
298 }
299
300 // Read a line of the snp is filtered out.
301 inc = 0;
302 while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) {
303 t2++;
304 inc++;
305 }
306
307 Bimbam_ReadOneSNP(inc, indicator_idv, infile, geno, geno_mean);
308 gsl_vector_add_constant(geno, -1.0 * geno_mean);
309
310 for (size_t j = 0; j < geno->size; j++) {
311 x_vec[j] = gsl_vector_get(geno, j);
312 }
313 X_mat.push_back(x_vec);
314
315 t2++;
316 }
317
318 n_nb = snpInfo[t].n_nb;
319
320 Calc_Cor(X_mat, cov_vec);
321 Cov_mat.push_back(cov_vec);
322 snpInfo_sub.push_back(snpInfo[t]);
323
324 X_mat.erase(X_mat.begin());
325
326 // Write out var/cov values.
327 if (Cov_mat.size() == 10000) {
328 WriteCov(1, snpInfo_sub, Cov_mat);
329 Cov_mat.clear();
330 snpInfo_sub.clear();
331 }
332 }
333
334 if (Cov_mat.size() != 0) {
335 WriteCov(1, snpInfo_sub, Cov_mat);
336 Cov_mat.clear();
337 snpInfo_sub.clear();
338 }
339
340 gsl_vector_free(geno);
341
342 infile.close();
343 infile.clear();
344
345 return;
346 }
347
AnalyzePlink()348 void VARCOV::AnalyzePlink() {
349 debug_msg("entering");
350 string file_bed = file_bfile + ".bed";
351 ifstream infile(file_bed.c_str(), ios::binary);
352 if (!infile) {
353 cout << "error reading bed file:" << file_bed << endl;
354 return;
355 }
356
357 // Calculate the number of right-hand-side neighbours for each SNP.
358 vector<SNPINFO> snpInfo_sub;
359 CalcNB(snpInfo);
360
361 size_t ni_test = 0;
362 for (size_t i = 0; i < indicator_idv.size(); i++) {
363 ni_test += indicator_idv[i];
364 }
365
366 gsl_vector *geno = gsl_vector_alloc(ni_test);
367 double geno_mean;
368
369 vector<double> x_vec, cov_vec;
370 vector<vector<double>> X_mat, Cov_mat;
371
372 for (size_t i = 0; i < ni_test; i++) {
373 x_vec.push_back(0);
374 }
375
376 WriteCov(0, snpInfo_sub, Cov_mat);
377
378 size_t t2 = 0, inc;
379 int n_nb = 0;
380
381 for (size_t t = 0; t < indicator_snp.size(); ++t) {
382 if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) {
383 ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1);
384 }
385 if (indicator_snp[t] == 0) {
386 continue;
387 }
388
389 if (X_mat.size() == 0) {
390 n_nb = snpInfo[t].n_nb + 1;
391 } else {
392 n_nb = snpInfo[t].n_nb - n_nb + 1;
393 }
394
395 for (int i = 0; i < n_nb; i++) {
396 if (X_mat.size() == 0) {
397 t2 = t;
398 }
399
400 // Read a line if the SNP is filtered out.
401 inc = 0;
402 while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) {
403 t2++;
404 inc++;
405 }
406
407 Plink_ReadOneSNP(t2, indicator_idv, infile, geno, geno_mean);
408 gsl_vector_add_constant(geno, -1.0 * geno_mean);
409
410 for (size_t j = 0; j < geno->size; j++) {
411 x_vec[j] = gsl_vector_get(geno, j);
412 }
413 X_mat.push_back(x_vec);
414
415 t2++;
416 }
417
418 n_nb = snpInfo[t].n_nb;
419
420 Calc_Cor(X_mat, cov_vec);
421 Cov_mat.push_back(cov_vec);
422 snpInfo_sub.push_back(snpInfo[t]);
423
424 X_mat.erase(X_mat.begin());
425
426 // Write out var/cov values.
427 if (Cov_mat.size() == 10000) {
428 WriteCov(1, snpInfo_sub, Cov_mat);
429 Cov_mat.clear();
430 snpInfo_sub.clear();
431 }
432 }
433
434 if (Cov_mat.size() != 0) {
435 WriteCov(1, snpInfo_sub, Cov_mat);
436 Cov_mat.clear();
437 snpInfo_sub.clear();
438 }
439
440 gsl_vector_free(geno);
441
442 infile.close();
443 infile.clear();
444
445 return;
446 }
447