1 /*
2     Genome-wide Efficient Mixed Model Association (GEMMA)
3     Copyright (C) 2011-2017, Xiang Zhou
4 
5     This program is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <bitset>
20 #include <cmath>
21 #include <cstring>
22 #include <fstream>
23 #include <iomanip>
24 #include <iostream>
25 #include <map>
26 #include <set>
27 #include <sstream>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string>
31 #include <vector>
32 
33 #include "gsl/gsl_blas.h"
34 #include "gsl/gsl_cdf.h"
35 #include "gsl/gsl_linalg.h"
36 #include "gsl/gsl_matrix.h"
37 #include "gsl/gsl_vector.h"
38 
39 #include "gzstream.h"
40 #include "gemma_io.h"
41 #include "lapack.h"
42 #include "mathfunc.h"
43 #include "param.h"
44 #include "varcov.h"
45 
46 using namespace std;
47 
CopyFromParam(PARAM & cPar)48 void VARCOV::CopyFromParam(PARAM &cPar) {
49   d_pace = cPar.d_pace;
50 
51   file_bfile = cPar.file_bfile;
52   file_geno = cPar.file_geno;
53   file_out = cPar.file_out;
54   path_out = cPar.path_out;
55 
56   time_opt = 0.0;
57 
58   window_cm = cPar.window_cm;
59   window_bp = cPar.window_bp;
60   window_ns = cPar.window_ns;
61 
62   indicator_idv = cPar.indicator_idv;
63   indicator_snp = cPar.indicator_snp;
64   snpInfo = cPar.snpInfo;
65 
66   return;
67 }
68 
CopyToParam(PARAM & cPar)69 void VARCOV::CopyToParam(PARAM &cPar) {
70   cPar.time_opt = time_opt;
71   return;
72 }
73 
WriteCov(const int flag,const vector<SNPINFO> & snpInfo_sub,const vector<vector<double>> & Cov_mat)74 void VARCOV::WriteCov(const int flag, const vector<SNPINFO> &snpInfo_sub,
75                       const vector<vector<double>> &Cov_mat) {
76   string file_cov;
77   file_cov = path_out + "/" + file_out;
78   file_cov += ".cor.txt";
79 
80   ofstream outfile;
81 
82   if (flag == 0) {
83     outfile.open(file_cov.c_str(), ofstream::out);
84     if (!outfile) {
85       cout << "error writing file: " << file_cov << endl;
86       return;
87     }
88 
89     outfile << "chr"
90             << "\t"
91             << "rs"
92             << "\t"
93             << "ps"
94             << "\t"
95             << "n_mis"
96             << "\t"
97             << "n_obs"
98             << "\t"
99             << "allele1"
100             << "\t"
101             << "allele0"
102             << "\t"
103             << "af"
104             << "\t"
105             << "window_size"
106             << "\t"
107             << "var"
108             << "\t"
109             << "cor" << endl;
110   } else {
111     outfile.open(file_cov.c_str(), ofstream::app);
112     if (!outfile) {
113       cout << "error writing file: " << file_cov << endl;
114       return;
115     }
116 
117     for (size_t i = 0; i < Cov_mat.size(); i++) {
118       outfile << snpInfo_sub[i].chr << "\t" << snpInfo_sub[i].rs_number << "\t"
119               << snpInfo_sub[i].base_position << "\t" << snpInfo_sub[i].n_miss
120               << "\t" << snpInfo_sub[i].n_idv << "\t" << snpInfo_sub[i].a_minor
121               << "\t" << snpInfo_sub[i].a_major << "\t" << fixed
122               << setprecision(3) << snpInfo_sub[i].maf << "\t"
123               << Cov_mat[i].size() - 1 << "\t";
124       outfile << scientific << setprecision(6) << Cov_mat[i][0] << "\t";
125 
126       if (Cov_mat[i].size() == 1) {
127         outfile << "NA";
128       } else {
129         for (size_t j = 1; j < Cov_mat[i].size(); j++) {
130           if (j == (Cov_mat[i].size() - 1)) {
131             outfile << Cov_mat[i][j];
132           } else {
133             outfile << Cov_mat[i][j] << ",";
134           }
135         }
136       }
137 
138       outfile << endl;
139     }
140   }
141 
142   outfile.close();
143   outfile.clear();
144   return;
145 }
146 
CompareSNPinfo(const SNPINFO & snpInfo1,const SNPINFO & snpInfo2)147 bool CompareSNPinfo(const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) {
148   int c_chr = snpInfo1.chr.compare(snpInfo2.chr);
149   long int c_bp = snpInfo1.base_position - snpInfo2.base_position;
150 
151   if (c_chr < 0) {
152     return true;
153   } else if (c_chr > 0) {
154     return false;
155   } else {
156     if (c_bp < 0) {
157       return true;
158     } else if (c_bp > 0) {
159       return false;
160     } else {
161       return true;
162     }
163   }
164 }
165 
166 // Do not sort SNPs (because gzip files do not support random access)
167 // then calculate n_nb, the number of neighbours, for each SNP.
CalcNB(vector<SNPINFO> & snpInfo_sort)168 void VARCOV::CalcNB(vector<SNPINFO> &snpInfo_sort) {
169   size_t t2 = 0, n_nb = 0;
170   for (size_t t = 0; t < indicator_snp.size(); ++t) {
171     if (indicator_snp[t] == 0) {
172       continue;
173     }
174 
175     if (snpInfo_sort[t].chr == "-9" ||
176         (snpInfo_sort[t].cM == -9 && window_cm != 0) ||
177         (snpInfo_sort[t].base_position == -9 && window_bp != 0)) {
178       snpInfo_sort[t].n_nb = 0;
179       continue;
180     }
181 
182     if (t == indicator_snp.size() - 1) {
183       snpInfo_sort[t].n_nb = 0;
184       continue;
185     }
186 
187     t2 = t + 1;
188     n_nb = 0;
189 
190     while (t2 < indicator_snp.size() &&
191            snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
192            indicator_snp[t2] == 0) {
193       t2++;
194     }
195 
196     while (t2 < indicator_snp.size() &&
197            snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
198            (snpInfo_sort[t2].cM - snpInfo_sort[t].cM < window_cm ||
199             window_cm == 0) &&
200            (snpInfo_sort[t2].base_position - snpInfo_sort[t].base_position <
201             (long int) window_bp ||
202             window_bp == 0) &&
203            (n_nb < window_ns || window_ns == 0)) {
204       t2++;
205       n_nb++;
206       while (t2 < indicator_snp.size() &&
207              snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
208              indicator_snp[t2] == 0) {
209         t2++;
210       }
211     }
212 
213     snpInfo_sort[t].n_nb = n_nb;
214   }
215 
216   return;
217 }
218 
219 // Vector double is centered to have mean 0.
Calc_Cor(vector<vector<double>> & X_mat,vector<double> & cov_vec)220 void Calc_Cor(vector<vector<double>> &X_mat, vector<double> &cov_vec) {
221   cov_vec.clear();
222 
223   double v1, v2, r;
224   vector<double> x_vec = X_mat[0];
225 
226   lapack_ddot(x_vec, x_vec, v1);
227   cov_vec.push_back(v1 / (double)x_vec.size());
228 
229   for (size_t i = 1; i < X_mat.size(); i++) {
230     lapack_ddot(X_mat[i], x_vec, r);
231     lapack_ddot(X_mat[i], X_mat[i], v2);
232     r /= sqrt(v1 * v2);
233 
234     cov_vec.push_back(r);
235   }
236 
237   return;
238 }
239 
240 // Read the genotype file again, calculate r2 between pairs of SNPs
241 // within a window, output the file every 10K SNPs the output results
242 // are sorted based on chr and bp output format similar to assoc.txt
243 // files (remember n_miss is replaced by n_idv).
244 //
245 // r2 between the current SNP and every following SNPs within the
246 // window_size (which can vary if cM was used) read bimbam mean
247 // genotype file and calculate the covariance matrix for neighboring
248 // SNPs output values at 10000-SNP-interval.
AnalyzeBimbam()249 void VARCOV::AnalyzeBimbam() {
250   debug_msg("entering");
251   igzstream infile(file_geno.c_str(), igzstream::in);
252   if (!infile) {
253     cout << "error reading genotype file:" << file_geno << endl;
254     return;
255   }
256 
257   // Calculate the number of right-hand-side neighbours for each SNP.
258   vector<SNPINFO> snpInfo_sub;
259   CalcNB(snpInfo);
260 
261   size_t ni_test = 0;
262   for (size_t i = 0; i < indicator_idv.size(); i++) {
263     ni_test += indicator_idv[i];
264   }
265 
266   gsl_vector *geno = gsl_vector_alloc(ni_test);
267   double geno_mean;
268 
269   vector<double> x_vec, cov_vec;
270   vector<vector<double>> X_mat, Cov_mat;
271 
272   for (size_t i = 0; i < ni_test; i++) {
273     x_vec.push_back(0);
274   }
275 
276   WriteCov(0, snpInfo_sub, Cov_mat);
277 
278   size_t t2 = 0, inc;
279   int n_nb = 0;
280 
281   for (size_t t = 0; t < indicator_snp.size(); ++t) {
282     if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) {
283       ProgressBar("Reading SNPs  ", t, indicator_snp.size() - 1);
284     }
285     if (indicator_snp[t] == 0) {
286       continue;
287     }
288 
289     if (X_mat.size() == 0) {
290       n_nb = snpInfo[t].n_nb + 1;
291     } else {
292       n_nb = snpInfo[t].n_nb - n_nb + 1;
293     }
294 
295     for (int i = 0; i < n_nb; i++) {
296       if (X_mat.size() == 0) {
297         t2 = t;
298       }
299 
300       // Read a line of the snp is filtered out.
301       inc = 0;
302       while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) {
303         t2++;
304         inc++;
305       }
306 
307       Bimbam_ReadOneSNP(inc, indicator_idv, infile, geno, geno_mean);
308       gsl_vector_add_constant(geno, -1.0 * geno_mean);
309 
310       for (size_t j = 0; j < geno->size; j++) {
311         x_vec[j] = gsl_vector_get(geno, j);
312       }
313       X_mat.push_back(x_vec);
314 
315       t2++;
316     }
317 
318     n_nb = snpInfo[t].n_nb;
319 
320     Calc_Cor(X_mat, cov_vec);
321     Cov_mat.push_back(cov_vec);
322     snpInfo_sub.push_back(snpInfo[t]);
323 
324     X_mat.erase(X_mat.begin());
325 
326     // Write out var/cov values.
327     if (Cov_mat.size() == 10000) {
328       WriteCov(1, snpInfo_sub, Cov_mat);
329       Cov_mat.clear();
330       snpInfo_sub.clear();
331     }
332   }
333 
334   if (Cov_mat.size() != 0) {
335     WriteCov(1, snpInfo_sub, Cov_mat);
336     Cov_mat.clear();
337     snpInfo_sub.clear();
338   }
339 
340   gsl_vector_free(geno);
341 
342   infile.close();
343   infile.clear();
344 
345   return;
346 }
347 
AnalyzePlink()348 void VARCOV::AnalyzePlink() {
349   debug_msg("entering");
350   string file_bed = file_bfile + ".bed";
351   ifstream infile(file_bed.c_str(), ios::binary);
352   if (!infile) {
353     cout << "error reading bed file:" << file_bed << endl;
354     return;
355   }
356 
357   // Calculate the number of right-hand-side neighbours for each SNP.
358   vector<SNPINFO> snpInfo_sub;
359   CalcNB(snpInfo);
360 
361   size_t ni_test = 0;
362   for (size_t i = 0; i < indicator_idv.size(); i++) {
363     ni_test += indicator_idv[i];
364   }
365 
366   gsl_vector *geno = gsl_vector_alloc(ni_test);
367   double geno_mean;
368 
369   vector<double> x_vec, cov_vec;
370   vector<vector<double>> X_mat, Cov_mat;
371 
372   for (size_t i = 0; i < ni_test; i++) {
373     x_vec.push_back(0);
374   }
375 
376   WriteCov(0, snpInfo_sub, Cov_mat);
377 
378   size_t t2 = 0, inc;
379   int n_nb = 0;
380 
381   for (size_t t = 0; t < indicator_snp.size(); ++t) {
382     if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) {
383       ProgressBar("Reading SNPs  ", t, indicator_snp.size() - 1);
384     }
385     if (indicator_snp[t] == 0) {
386       continue;
387     }
388 
389     if (X_mat.size() == 0) {
390       n_nb = snpInfo[t].n_nb + 1;
391     } else {
392       n_nb = snpInfo[t].n_nb - n_nb + 1;
393     }
394 
395     for (int i = 0; i < n_nb; i++) {
396       if (X_mat.size() == 0) {
397         t2 = t;
398       }
399 
400       // Read a line if the SNP is filtered out.
401       inc = 0;
402       while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) {
403         t2++;
404         inc++;
405       }
406 
407       Plink_ReadOneSNP(t2, indicator_idv, infile, geno, geno_mean);
408       gsl_vector_add_constant(geno, -1.0 * geno_mean);
409 
410       for (size_t j = 0; j < geno->size; j++) {
411         x_vec[j] = gsl_vector_get(geno, j);
412       }
413       X_mat.push_back(x_vec);
414 
415       t2++;
416     }
417 
418     n_nb = snpInfo[t].n_nb;
419 
420     Calc_Cor(X_mat, cov_vec);
421     Cov_mat.push_back(cov_vec);
422     snpInfo_sub.push_back(snpInfo[t]);
423 
424     X_mat.erase(X_mat.begin());
425 
426     // Write out var/cov values.
427     if (Cov_mat.size() == 10000) {
428       WriteCov(1, snpInfo_sub, Cov_mat);
429       Cov_mat.clear();
430       snpInfo_sub.clear();
431     }
432   }
433 
434   if (Cov_mat.size() != 0) {
435     WriteCov(1, snpInfo_sub, Cov_mat);
436     Cov_mat.clear();
437     snpInfo_sub.clear();
438   }
439 
440   gsl_vector_free(geno);
441 
442   infile.close();
443   infile.clear();
444 
445   return;
446 }
447