1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementation of gene-based association test (GBAT) in GCTA
5 *
6 * 2013 by Jian Yang <jian.yang@uq.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13 #include "gcta.h"
14
sbat_read_snpAssoc(string snpAssoc_file,vector<string> & snp_name,vector<int> & snp_chr,vector<int> & snp_bp,vector<double> & snp_pval)15 void gcta::sbat_read_snpAssoc(string snpAssoc_file, vector<string> &snp_name, vector<int> &snp_chr, vector<int> &snp_bp, vector<double> &snp_pval)
16 {
17 ifstream in_snpAssoc(snpAssoc_file.c_str());
18 if (!in_snpAssoc) throw ("Error: can not open the file [" + snpAssoc_file + "] to read.");
19 cout << "\nReading SNP association results from [" + snpAssoc_file + "]." << endl;
20 string str_buf;
21 vector<string> vs_buf;
22 map<string, int>::iterator iter;
23 map<string, int> assoc_snp_map;
24 int line = 0;
25 while (getline(in_snpAssoc, str_buf)) {
26 if (StrFunc::split_string(str_buf, vs_buf, " \t") != 2) throw ("Error: in line \"" + str_buf + "\".");
27 iter = _snp_name_map.find(vs_buf[0]);
28 if (iter == _snp_name_map.end()) continue;
29 if(assoc_snp_map.find(vs_buf[0]) != assoc_snp_map.end()) continue;
30 else assoc_snp_map.insert(pair<string, int>(vs_buf[0], line));
31 snp_name.push_back(vs_buf[0]);
32 snp_pval.push_back(atof(vs_buf[1].c_str()));
33 line++;
34 }
35 in_snpAssoc.close();
36 snp_name.erase(unique(snp_name.begin(), snp_name.end()), snp_name.end());
37 cout << "Association p-values of " << snp_name.size() << " SNPs have been included." << endl;
38
39 update_id_map_kp(snp_name, _snp_name_map, _include);
40 vector<string> snp_name_buf(snp_name);
41 vector<double> snp_pval_buf(snp_pval);
42 snp_name.clear();
43 snp_pval.clear();
44 snp_name.resize(_include.size());
45 snp_pval.resize(_include.size());
46 int i = 0;
47 map<string, int> snp_name_buf_map;
48 for (i = 0; i < snp_name_buf.size(); i++) snp_name_buf_map.insert(pair<string,int>(snp_name_buf[i], i));
49 #pragma omp parallel for
50 for (i = 0; i < _include.size(); i++) {
51 map<string, int>::iterator iter = snp_name_buf_map.find(_snp_name[_include[i]]);
52 snp_name[i] = snp_name_buf[iter->second];
53 snp_pval[i] = snp_pval_buf[iter->second];
54 }
55 snp_chr.resize(_include.size());
56 snp_bp.resize(_include.size());
57 #pragma omp parallel for
58 for (i = 0; i < _include.size(); i++) {
59 snp_chr[i] = _chr[_include[i]];
60 snp_bp[i] = _bp[_include[i]];
61 }
62 if (_include.size() < 1) throw ("Error: no SNP is included in the analysis.");
63 else if (_chr[_include[0]] < 1) throw ("Error: chromosome information is missing.");
64 else if (_bp[_include[0]] < 1) throw ("Error: bp information is missing.");
65 }
66
sbat_read_geneAnno(string gAnno_file,vector<string> & gene_name,vector<int> & gene_chr,vector<int> & gene_bp1,vector<int> & gene_bp2)67 void gcta::sbat_read_geneAnno(string gAnno_file, vector<string> &gene_name, vector<int> &gene_chr, vector<int> &gene_bp1, vector<int> &gene_bp2) {
68 ifstream in_gAnno(gAnno_file.c_str());
69 if (!in_gAnno) throw ("Error: can not open the file [" + gAnno_file + "] to read.");
70 cout << "Reading physical positions of the genes from [" + gAnno_file + "]." << endl;
71 string str_buf;
72 vector<string> vs_buf;
73 while (getline(in_gAnno, str_buf)) {
74 if (StrFunc::split_string(str_buf, vs_buf) != 4) throw ("Error: in line \"" + str_buf + "\".");
75 gene_chr.push_back(atoi(vs_buf[0].c_str()));
76 gene_bp1.push_back(atoi(vs_buf[1].c_str()));
77 gene_bp2.push_back(atoi(vs_buf[2].c_str()));
78 gene_name.push_back(vs_buf[3]);
79 }
80 in_gAnno.close();
81 cout << "Physical positions of " << gene_name.size() << " genes have been include." << endl;
82 }
83
sbat_gene(string sAssoc_file,string gAnno_file,int wind,double sbat_ld_cutoff,bool sbat_write_snpset)84 void gcta::sbat_gene(string sAssoc_file, string gAnno_file, int wind, double sbat_ld_cutoff, bool sbat_write_snpset)
85 {
86 int i = 0, j = 0;
87 int snp_count;
88
89 // read SNP association results
90 vector<string> snp_name;
91 vector<int> snp_chr, snp_bp;
92 vector<double> snp_pval;
93 sbat_read_snpAssoc(sAssoc_file, snp_name, snp_chr, snp_bp, snp_pval);
94 vector<double> snp_chisq(snp_pval.size());
95 for (i = 0; i < snp_pval.size(); i++) snp_chisq[i] = StatFunc::qchisq(snp_pval[i], 1);
96
97 // get start and end of chr
98 int snp_num = snp_name.size();
99 map<int, string> chr_begin_snp, chr_end_snp;
100 chr_begin_snp.insert(pair<int, string>(snp_chr[0], snp_name[0]));
101 for (i = 1; i < snp_num; i++) {
102 if (snp_chr[i] != snp_chr[i - 1]) {
103 chr_begin_snp.insert(pair<int, string>(snp_chr[i], snp_name[i]));
104 chr_end_snp.insert(pair<int, string>(snp_chr[i - 1], snp_name[i - 1]));
105 }
106 }
107 chr_end_snp.insert(pair<int, string>(snp_chr[snp_num - 1], snp_name[snp_num - 1]));
108
109 // read gene list
110 vector<string> gene_name;
111 vector<int> gene_chr, gene_bp1, gene_bp2;
112 sbat_read_geneAnno(gAnno_file, gene_name, gene_chr, gene_bp1, gene_bp2);
113
114 // map genes to SNPs
115 cout << "Mapping the physical positions of genes to SNP data (gene bounaries: " << wind / 1000 << "Kb away from UTRs) ..." << endl;
116
117 int gene_num = gene_name.size();
118 vector<string> gene2snp_1(gene_num), gene2snp_2(gene_num);
119 vector<locus_bp>::iterator iter;
120 map<int, string>::iterator chr_iter;
121 vector<locus_bp> snp_vec;
122 for (i = 0; i < snp_num; i++) snp_vec.push_back(locus_bp(snp_name[i], snp_chr[i], snp_bp[i]));
123 #pragma omp parallel for private(iter, chr_iter)
124 for (i = 0; i < gene_num; i++) {
125 iter = find_if(snp_vec.begin(), snp_vec.end(), locus_bp(gene_name[i], gene_chr[i], gene_bp1[i] - wind));
126 if (iter != snp_vec.end()) gene2snp_1[i] = iter->locus_name;
127 else gene2snp_1[i] = "NA";
128 }
129 #pragma omp parallel for private(iter, chr_iter)
130 for (i = 0; i < gene_num; i++) {
131 if (gene2snp_1[i] == "NA") {
132 gene2snp_2[i] = "NA";
133 continue;
134 }
135 iter = find_if(snp_vec.begin(), snp_vec.end(), locus_bp(gene_name[i], gene_chr[i], gene_bp2[i] + wind));
136 if (iter != snp_vec.end()){
137 if (iter->bp == gene_bp2[i] + wind) gene2snp_2[i] = iter->locus_name;
138 else {
139 if(iter!=snp_vec.begin()){
140 iter--;
141 gene2snp_2[i] = iter->locus_name;
142 }
143 else gene2snp_2[i] = "NA";
144 }
145 }
146 else {
147 chr_iter = chr_end_snp.find(gene_chr[i]);
148 if (chr_iter == chr_end_snp.end()) gene2snp_2[i] = "NA";
149 else gene2snp_2[i] = chr_iter->second;
150 }
151 }
152 int mapped = 0;
153 for (i = 0; i < gene_num; i++) {
154 if (gene2snp_1[i] != "NA" && gene2snp_2[i] != "NA") mapped++;
155 }
156 if (mapped < 1) throw ("Error: no gene can be mapped to the SNP data. Please check the input data regarding chr and bp.");
157 else cout << mapped << " genes have been mapped to SNP data." << endl;
158
159 // run gene-based test
160 if (_mu.empty()) calcu_mu();
161 cout << "\nRunning fastBAT analysis for genes ..." << endl;
162 if (sbat_ld_cutoff < 1) cout << "Pruning SNPs with LD rsq cutoff = " << sbat_ld_cutoff*sbat_ld_cutoff << endl;
163 vector<double> gene_pval(gene_num), chisq_o(gene_num), min_snp_pval(gene_num);
164 vector<string> min_snp_name(gene_num);
165 vector<int> snp_num_in_gene(gene_num);
166 map<string, int>::iterator iter1, iter2;
167 map<string, int> snp_name_map;
168 string rgoodsnpfile = _out + ".gene.snpset";
169 ofstream rogoodsnp;
170 if (sbat_write_snpset) rogoodsnp.open(rgoodsnpfile.c_str());
171 for (i = 0; i < snp_name.size(); i++) snp_name_map.insert(pair<string,int>(snp_name[i], i));
172 for (i = 0; i < gene_num; i++) {
173 iter1 = snp_name_map.find(gene2snp_1[i]);
174 iter2 = snp_name_map.find(gene2snp_2[i]);
175 bool skip = false;
176 if (iter1 == snp_name_map.end() || iter2 == snp_name_map.end() || iter1->second >= iter2->second) skip = true;
177 snp_num_in_gene[i] = iter2->second - iter1->second + 1;
178 if(!skip && snp_num_in_gene[i] > 10000){
179 cout<<"Warning: Too many SNPs in the gene region ["<<gene_name[i]<<"]. Maximum limit is 10000. This gene is ignored in the analysis."<<endl;
180 skip = true;
181 }
182 if(skip){
183 gene_pval[i] = 2.0;
184 snp_num_in_gene[i] = 0;
185 continue;
186 }
187 chisq_o[i] = 0;
188 min_snp_pval[i]=2;
189 min_snp_name[i]="na";
190 for (j = iter1->second; j <= iter2->second; j++) {
191 if (min_snp_pval[i] > snp_pval[j]) {
192 min_snp_pval[i] = snp_pval[j];
193 min_snp_name[i] = snp_name[j]; //keep minimum value - regardless of whether SNP removed by LD pruning
194 }
195 chisq_o[i] += snp_chisq[j];
196 }
197 if(snp_num_in_gene[i] == 1) gene_pval[i] = StatFunc::pchisq(chisq_o[i], 1.0);
198 else {
199 vector<int> snp_indx;
200 for (j = iter1->second; j <= iter2->second; j++) snp_indx.push_back(j);
201 snp_count=snp_num_in_gene[i];
202 VectorXd eigenval;
203 vector<int> sub_indx;
204 sbat_calcu_lambda(snp_indx, eigenval, snp_count, sbat_ld_cutoff, sub_indx);
205 //recalculate chisq value from low correlation snp subset
206 if (sbat_ld_cutoff < 1) {
207 chisq_o[i] = 0;
208 for (j = 0; j < sub_indx.size(); j++) chisq_o[i] += snp_chisq[snp_indx[sub_indx[j]]];
209 }
210 snp_num_in_gene[i] = snp_count;
211 if (snp_count==1 && chisq_o[i] ==0) gene_pval[i] = 1;
212 else gene_pval[i] = StatFunc::pchisqsum(chisq_o[i], eigenval);
213
214 if (sbat_write_snpset) {
215 rogoodsnp << gene_name[i] << endl;
216 for (int k = 0; k < sub_indx.size(); k++) rogoodsnp << snp_name[(iter1->second)+sub_indx[k]] << endl;
217 rogoodsnp << "END" << endl << endl;
218 }
219
220 }
221
222 if((i + 1) % 100 == 0 || (i + 1) == gene_num) cout << i + 1 << " of " << gene_num << " genes.\r";
223 }
224
225 string filename = _out + ".gene.fastbat";
226 cout << "\nSaving the results of the fastBAT analysis to [" + filename + "] ..." << endl;
227 ofstream ofile(filename.c_str());
228 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
229 ofile << "Gene\tChr\tStart\tEnd\tNo.SNPs\tSNP_start\tSNP_end\tChisq(Obs)\tPvalue\tTopSNP.Pvalue\tTopSNP" << endl;
230 for (i = 0; i < gene_num; i++) {
231 if(gene_pval[i]>1.5) continue;
232 ofile << gene_name[i] << "\t" << gene_chr[i] << "\t" << gene_bp1[i] << "\t" << gene_bp2[i] << "\t";
233 ofile << snp_num_in_gene[i] << "\t" << gene2snp_1[i] << "\t" << gene2snp_2[i] << "\t" << chisq_o[i];
234 ofile << "\t" << gene_pval[i] << "\t" << min_snp_pval[i] << "\t" << min_snp_name[i] << endl;
235 //else ofile << "0\tNA\tNA\tNA\tNA" << endl;
236 }
237 ofile.close();
238 if (sbat_write_snpset) {
239 cout << "The SNP sets have been save in file [" << rgoodsnpfile << "]." << endl;
240 rogoodsnp.close();
241 }
242 }
243
sbat_read_snpset(string snpset_file,vector<string> & set_name,vector<vector<string>> & snpset)244 void gcta::sbat_read_snpset(string snpset_file, vector<string> &set_name, vector< vector<string> > &snpset)
245 {
246 ifstream in_snpset(snpset_file.c_str());
247 if (!in_snpset) throw ("Error: can not open the file [" + snpset_file + "] to read.");
248 cout << "\nReading SNP sets from [" + snpset_file + "]." << endl;
249 string str_buf;
250 vector<string> vs_buf, snpset_buf, snp_name;
251 int i = 0;
252 while (in_snpset>>str_buf) {
253 if(str_buf!="END" && str_buf!="end") vs_buf.push_back(str_buf);
254 else{
255 if(vs_buf.empty()) continue;
256 set_name.push_back(vs_buf[0]);
257 for(i = 1; i < vs_buf.size(); i++){
258 if (_snp_name_map.find(vs_buf[i]) != _snp_name_map.end()){
259 snpset_buf.push_back(vs_buf[i]);
260 snp_name.push_back(vs_buf[i]);
261 }
262 }
263 vs_buf.clear();
264 snpset.push_back(snpset_buf);
265 snpset_buf.clear();
266 }
267 }
268 in_snpset.close();
269 snp_name.erase(unique(snp_name.begin(), snp_name.end()), snp_name.end());
270 update_id_map_kp(snp_name, _snp_name_map, _include);
271 cout << snp_name.size() << " SNPs in " << snpset.size() << " sets have been included." << endl;
272 }
273
sbat(string sAssoc_file,string snpset_file,double sbat_ld_cutoff,bool sbat_write_snpset)274 void gcta::sbat(string sAssoc_file, string snpset_file, double sbat_ld_cutoff, bool sbat_write_snpset)
275 {
276 int i = 0, j = 0;
277 int snp_count;
278
279 // read SNP set file
280 vector<string> set_name;
281 vector< vector<string> > snpset;
282 sbat_read_snpset(snpset_file, set_name, snpset);
283 int set_num = set_name.size();
284
285 // read SNP association results
286 vector<string> snp_name;
287 vector<int> snp_chr, snp_bp;
288 vector<double> snp_pval;
289 sbat_read_snpAssoc(sAssoc_file, snp_name, snp_chr, snp_bp, snp_pval);
290 vector<double> snp_chisq(snp_pval.size());
291 for (i = 0; i < snp_pval.size(); i++) snp_chisq[i] = StatFunc::qchisq(snp_pval[i], 1);
292
293 // run gene-based test
294 if (_mu.empty()) calcu_mu();
295 cout << "\nRunning fastBAT analysis ..." << endl;
296 if (sbat_ld_cutoff < 1) cout << "Pruning snps with maximum ld cutoff " << sbat_ld_cutoff << endl;
297 vector<double> set_pval(set_num), chisq_o(set_num), min_snp_pval(set_num);
298 vector<string> min_snp_name(set_num);
299 vector<int> snp_num_in_set(set_num);
300 map<string, int>::iterator iter;
301 map<string, int> snp_name_map;
302
303 string rgoodsnpfile = _out + ".snpset";
304 ofstream rogoodsnp;
305 if (sbat_write_snpset) rogoodsnp.open(rgoodsnpfile.c_str());
306
307 for (i = 0; i < snp_name.size(); i++) snp_name_map.insert(pair<string,int>(snp_name[i], i));
308 for (i = 0; i < set_num; i++) {
309 bool skip = false;
310 if(snpset[i].size() < 1) skip = true;
311 vector<int> snp_indx;
312 for(j = 0; j < snpset[i].size(); j++){
313 iter = snp_name_map.find(snpset[i][j]);
314 if(iter!=snp_name_map.end()) snp_indx.push_back(iter->second);
315 }
316 snp_num_in_set[i] = snp_indx.size();
317 if(!skip && snp_num_in_set[i] > 20000){
318 cout<<"Warning: Too many SNPs in the set ["<<set_name[i]<<"]. Maximum limit is 20000. This gene is ignored in the analysis."<<endl;
319 skip = true;
320 }
321 if(skip){
322 set_pval[i] = 2.0;
323 snp_num_in_set[i] = 0;
324 continue;
325 }
326 chisq_o[i] = 0;
327 min_snp_pval[i]=2;
328 min_snp_name[i]="na";
329 for (j = 0; j < snp_indx.size(); j++) {
330 if (min_snp_pval[i] > snp_pval[snp_indx[j]]) {
331 min_snp_pval[i] = snp_pval[snp_indx[j]];
332 min_snp_name[i] = snp_name[snp_indx[j]];
333 }
334 chisq_o[i] += snp_chisq[snp_indx[j]];
335 }
336 if(snp_num_in_set[i] == 1) set_pval[i] = StatFunc::pchisq(chisq_o[i], 1.0);
337 else {
338 snp_count=snp_num_in_set[i];
339 VectorXd eigenval;
340 vector<int> sub_indx;
341 sbat_calcu_lambda(snp_indx, eigenval, snp_count, sbat_ld_cutoff, sub_indx);
342
343 //recalculate chisq value from low correlation snp subset
344 if (sbat_ld_cutoff < 1) {
345 chisq_o[i] = 0;
346 for (j = 0; j < sub_indx.size(); j++) chisq_o[i] += snp_chisq[snp_indx[sub_indx[j]]];
347 }
348 snp_num_in_set[i] = snp_count;
349 if (snp_count==1 && chisq_o[i] ==0) set_pval[i] = 1;
350 else set_pval[i] = StatFunc::pchisqsum(chisq_o[i], eigenval);
351
352 if (sbat_write_snpset) {
353 rogoodsnp << set_name[i] << endl;
354 for (int k = 0; k < sub_indx.size(); k++) rogoodsnp << snp_name[snp_indx[sub_indx[k]]] << endl;
355 rogoodsnp << "END" << endl << endl;
356 }
357
358 }
359
360 if((i + 1) % 100 == 0 || (i + 1) == set_num) cout << i + 1 << " of " << set_num << " sets.\r";
361 }
362
363 string filename = _out + ".fastbat";
364 cout << "\nSaving the results of the fastBAT analysis to [" + filename + "] ..." << endl;
365 ofstream ofile(filename.c_str());
366 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
367 ofile << "Set\tNo.SNPs\tChisq(Obs)\tPvalue\tTopSNP.Pvalue\tTopSNP" << endl;
368 for (i = 0; i < set_num; i++) {
369 if(set_pval[i]>1.5) continue;
370 ofile << set_name[i] << "\t" << snp_num_in_set[i] << "\t" << chisq_o[i] << "\t";
371 ofile << set_pval[i] << "\t" << min_snp_pval[i] << "\t" << min_snp_name[i] << endl;
372 }
373 ofile.close();
374 if (sbat_write_snpset) {
375 cout << "The SNP sets have been save in file [" << rgoodsnpfile << "]." << endl;
376 rogoodsnp.close();
377 }
378 }
379
sbat_seg(string sAssoc_file,int seg_size,double sbat_ld_cutoff,bool sbat_write_snpset)380 void gcta::sbat_seg(string sAssoc_file, int seg_size, double sbat_ld_cutoff, bool sbat_write_snpset)
381 {
382 int i = 0, j = 0;
383 int snp_count;
384
385 // read SNP association results
386 vector<string> snp_name;
387 vector<int> snp_chr, snp_bp;
388 vector<double> snp_pval;
389 sbat_read_snpAssoc(sAssoc_file, snp_name, snp_chr, snp_bp, snp_pval);
390 vector<double> snp_chisq(snp_pval.size());
391 for (i = 0; i < snp_pval.size(); i++) snp_chisq[i] = StatFunc::qchisq(snp_pval[i], 1);
392
393 // run gene-based test
394 if (_mu.empty()) calcu_mu();
395 cout << "\nRunning fastBAT analysis at genomic segments with a length of " << seg_size/1000 << "Kb ..." << endl;
396 if (sbat_ld_cutoff < 1) cout << "Pruning snps with maximum ld cutoff " << sbat_ld_cutoff << endl;
397 vector< vector<int> > snp_set_indx;
398 vector<int> set_chr, set_start_bp, set_end_bp;
399 get_sbat_seg_blk(seg_size, snp_set_indx, set_chr, set_start_bp, set_end_bp);
400 int set_num = snp_set_indx.size();
401 vector<double> set_pval(set_num), chisq_o(set_num), min_snp_pval(set_num);
402 vector<string> min_snp_name(set_num);
403 vector<int> snp_num_in_set(set_num);
404
405 string rgoodsnpfile = _out + ".seg.snpset";
406 ofstream rogoodsnp;
407 if (sbat_write_snpset) rogoodsnp.open(rgoodsnpfile.c_str());
408
409 for (i = 0; i < set_num; i++) {
410 bool skip = false;
411 vector<int> snp_indx = snp_set_indx[i];
412 if(snp_indx.size() < 1) skip = true;
413 snp_num_in_set[i] = snp_indx.size();
414 if(!skip && snp_num_in_set[i] > 20000){
415 cout<<"Warning: Too many SNPs in the set on [chr" << set_chr[i] << ":" << set_start_bp[i] << "-" << set_end_bp[i] << "]. Maximum limit is 20000. This gene is ignored in the analysis."<<endl;
416 skip = true;
417 }
418 if(skip){
419 set_pval[i] = 2.0;
420 snp_num_in_set[i] = 0;
421 continue;
422 }
423 chisq_o[i] = 0;
424 min_snp_pval[i]=2;
425 min_snp_name[i]="na";
426 for (j = 0; j < snp_indx.size(); j++) {
427 if (min_snp_pval[i] > snp_pval[snp_indx[j]]) {
428 min_snp_pval[i] = snp_pval[snp_indx[j]];
429 min_snp_name[i] = snp_name[snp_indx[j]];
430 }
431 chisq_o[i] += snp_chisq[snp_indx[j]];
432 }
433 if(snp_num_in_set[i] == 1) set_pval[i] = StatFunc::pchisq(chisq_o[i], 1.0);
434 else {
435 snp_count=snp_num_in_set[i];
436 VectorXd eigenval;
437 vector<int> sub_indx;
438 sbat_calcu_lambda(snp_indx, eigenval, snp_count, sbat_ld_cutoff, sub_indx);
439 //recalculate chisq value from low correlation snp subset
440 if (sbat_ld_cutoff < 1) {
441 chisq_o[i] = 0;
442 for (j = 0; j < sub_indx.size(); j++) chisq_o[i] += snp_chisq[snp_indx[sub_indx[j]]];
443 }
444 snp_num_in_set[i] = snp_count;
445 if (snp_count==1 && chisq_o[i] ==0) set_pval[i] = 1;
446 else set_pval[i] = StatFunc::pchisqsum(chisq_o[i], eigenval);
447
448 if (sbat_write_snpset) {
449 rogoodsnp << "SEG" << i << ":" << set_start_bp[i] << "-" << set_end_bp[i] << endl;
450 for (int k = 0; k < sub_indx.size(); k++) rogoodsnp << snp_name[snp_indx[sub_indx[k]]] << endl;
451 rogoodsnp << "END" << endl << endl;
452 }
453
454 }
455
456 if((i + 1) % 100 == 0 || (i + 1) == set_num) cout << i + 1 << " of " << set_num << " sets.\r";
457 }
458
459 string filename = _out + ".seg.fastbat";
460 cout << "\nSaving the results of the segment-based fastBAT analysis to [" + filename + "] ..." << endl;
461 ofstream ofile(filename.c_str());
462 if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
463 ofile << "Chr\tStart\tEnd\tNo.SNPs\tChisq(Obs)\tPvalue\tTopSNP.Pvalue\tTopSNP" << endl;
464 for (i = 0; i < set_num; i++) {
465 if(set_pval[i]>1.5) continue;
466 ofile << set_chr[i] << "\t" << set_start_bp[i] << "\t"<< set_end_bp[i] << "\t";
467 ofile << snp_num_in_set[i] << "\t" << chisq_o[i] << "\t" << set_pval[i] << "\t";
468 ofile << min_snp_pval[i] << "\t" << min_snp_name[i] << endl;
469
470 }
471 ofile.close();
472 if (sbat_write_snpset) {
473 cout << "The SNP sets have been save in file [" << rgoodsnpfile << "]." << endl;
474 rogoodsnp.close();
475 }
476 }
477
get_sbat_seg_blk(int seg_size,vector<vector<int>> & snp_set_indx,vector<int> & set_chr,vector<int> & set_start_bp,vector<int> & set_end_bp)478 void gcta::get_sbat_seg_blk(int seg_size, vector< vector<int> > &snp_set_indx, vector<int> &set_chr, vector<int> &set_start_bp, vector<int> &set_end_bp)
479 {
480 int i = 0, j = 0, k = 0, m = _include.size();
481
482 vector<int> brk_pnt;
483 brk_pnt.push_back(0);
484 for (i = 1, j = 0; i < m; i++) {
485 if (i == (m - 1)) brk_pnt.push_back(m - 1);
486 else if (_chr[_include[i]] != _chr[_include[brk_pnt[j]]]) {
487 brk_pnt.push_back(i - 1);
488 j++;
489 brk_pnt.push_back(i);
490 j++;
491 }
492 else if (_bp[_include[i]] - _bp[_include[brk_pnt[j]]] > seg_size) {
493 brk_pnt.push_back(i - 1);
494 j++;
495 brk_pnt.push_back(i);
496 j++;
497 }
498 }
499
500 snp_set_indx.clear();
501 set_chr.clear();
502 set_start_bp.clear();
503 set_end_bp.clear();
504 for (i = 0; i < brk_pnt.size() - 1; i++) {
505 int size = brk_pnt[i + 1] - brk_pnt[i] + 1;
506 if(size < 3 && (i%2 != 0)) continue;
507 vector<int> snp_indx(size);
508 for (j = brk_pnt[i], k = 0; j <= brk_pnt[i + 1]; j++, k++){
509 snp_indx[k] = j;
510 }
511 snp_set_indx.push_back(snp_indx);
512 set_chr.push_back(_chr[_include[brk_pnt[i]]]);
513 set_start_bp.push_back(_bp[_include[brk_pnt[i]]]);
514 set_end_bp.push_back(_bp[_include[brk_pnt[i + 1]]]);
515 }
516 }
517
sbat_calcu_lambda(vector<int> & snp_indx,VectorXd & eigenval,int & snp_count,double sbat_ld_cutoff,vector<int> & sub_indx)518 void gcta::sbat_calcu_lambda(vector<int> &snp_indx, VectorXd &eigenval, int &snp_count, double sbat_ld_cutoff, vector<int> &sub_indx)
519 {
520 int i = 0, j = 0, k = 0, n = _keep.size(), m = snp_indx.size();
521
522 MatrixXf X;
523 make_XMat_subset(X, snp_indx, false);
524 vector<int> rm_ID1;
525 double R_cutoff = sbat_ld_cutoff;
526 int qi = 0; //alternate index
527
528 VectorXd sumsq_x(m);
529 for (j = 0; j < m; j++) sumsq_x[j] = X.col(j).dot(X.col(j));
530
531 MatrixXf C = X.transpose() * X;
532 X.resize(0,0);
533 #pragma omp parallel for private(j)
534 for (i = 0; i < m; i++) {
535 for (j = 0; j < m; j++) {
536 double d_buf = sqrt(sumsq_x[i] * sumsq_x[j]);
537 if(d_buf>0.0) C(i,j) /= d_buf;
538 else C(i,j) = 0.0;
539 }
540 }
541
542 if (sbat_ld_cutoff < 1) rm_cor_sbat(C, R_cutoff, m, rm_ID1);
543 //Create new index
544 for (int i=0 ; i<m ; i++) {
545 if (rm_ID1.size() == 0) sub_indx.push_back(i);
546 else {
547 if (rm_ID1[qi] == i) qi++; //Skip removed snp
548 else sub_indx.push_back(i);
549 }
550 }
551 snp_count = sub_indx.size();
552 if (sub_indx.size() < C.size()) { //Build new matrix
553 MatrixXf D(sub_indx.size(),sub_indx.size());
554 for (i = 0 ; i < sub_indx.size() ; i++) {
555 for (j = 0 ; j < sub_indx.size() ; j++) {
556 D(i,j) = C(sub_indx[i],sub_indx[j]);
557 }
558 }
559 C = D;
560 }
561
562 SelfAdjointEigenSolver<MatrixXf> saes(C);
563 eigenval = saes.eigenvalues().cast<double>();
564 }
565
rm_cor_sbat(MatrixXf & R,double R_cutoff,int m,vector<int> & rm_ID1)566 void gcta::rm_cor_sbat(MatrixXf &R, double R_cutoff, int m, vector<int> &rm_ID1) {
567 //Modified version of rm_cor_indi from grm.cpp
568
569 int i = 0, j = 0, i_buf = 0;
570 vector<int> rm_ID2;
571
572 //float tmpr = 0;
573 for (i = 0; i < m; i++) {
574 for (j = 0; j < i; j++) {
575 if (fabs(R(i,j)) > R_cutoff ) {
576 rm_ID1.push_back(i);
577 rm_ID2.push_back(j);
578 }
579 }
580 }
581
582 // count the number of appearance of each "position" in the vector, which involves a few steps
583 vector<int> rm_uni_ID(rm_ID1);
584 rm_uni_ID.insert(rm_uni_ID.end(), rm_ID2.begin(), rm_ID2.end());
585 stable_sort(rm_uni_ID.begin(), rm_uni_ID.end());
586 rm_uni_ID.erase(unique(rm_uni_ID.begin(), rm_uni_ID.end()), rm_uni_ID.end());
587 map<int, int> rm_uni_ID_count;
588 for (i = 0; i < rm_uni_ID.size(); i++) {
589 i_buf = count(rm_ID1.begin(), rm_ID1.end(), rm_uni_ID[i]) + count(rm_ID2.begin(), rm_ID2.end(), rm_uni_ID[i]);
590 rm_uni_ID_count.insert(pair<int, int>(rm_uni_ID[i], i_buf));
591 }
592
593 // swapping
594 map<int, int>::iterator iter1, iter2;
595 for (i = 0; i < rm_ID1.size(); i++) {
596 iter1 = rm_uni_ID_count.find(rm_ID1[i]);
597 iter2 = rm_uni_ID_count.find(rm_ID2[i]);
598 if (iter1->second < iter2->second) {
599 i_buf = rm_ID1[i];
600 rm_ID1[i] = rm_ID2[i];
601 rm_ID2[i] = i_buf;
602 }
603 }
604 stable_sort(rm_ID1.begin(), rm_ID1.end());
605 rm_ID1.erase(unique(rm_ID1.begin(), rm_ID1.end()), rm_ID1.end());
606 }
607
608