1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementation of gene-based association test (GBAT) in GCTA
5  *
6  * 2013 by Jian Yang <jian.yang@uq.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #include "gcta.h"
14 
sbat_read_snpAssoc(string snpAssoc_file,vector<string> & snp_name,vector<int> & snp_chr,vector<int> & snp_bp,vector<double> & snp_pval)15 void gcta::sbat_read_snpAssoc(string snpAssoc_file, vector<string> &snp_name, vector<int> &snp_chr, vector<int> &snp_bp, vector<double> &snp_pval)
16 {
17     ifstream in_snpAssoc(snpAssoc_file.c_str());
18     if (!in_snpAssoc) throw ("Error: can not open the file [" + snpAssoc_file + "] to read.");
19     cout << "\nReading SNP association results from [" + snpAssoc_file + "]." << endl;
20     string str_buf;
21     vector<string> vs_buf;
22     map<string, int>::iterator iter;
23     map<string, int> assoc_snp_map;
24     int line = 0;
25     while (getline(in_snpAssoc, str_buf)) {
26         if (StrFunc::split_string(str_buf, vs_buf, " \t") != 2) throw ("Error: in line \"" + str_buf + "\".");
27         iter = _snp_name_map.find(vs_buf[0]);
28         if (iter == _snp_name_map.end()) continue;
29         if(assoc_snp_map.find(vs_buf[0]) != assoc_snp_map.end()) continue;
30         else assoc_snp_map.insert(pair<string, int>(vs_buf[0], line));
31         snp_name.push_back(vs_buf[0]);
32         snp_pval.push_back(atof(vs_buf[1].c_str()));
33         line++;
34     }
35     in_snpAssoc.close();
36     snp_name.erase(unique(snp_name.begin(), snp_name.end()), snp_name.end());
37     cout << "Association p-values of " << snp_name.size() << " SNPs have been included." << endl;
38 
39     update_id_map_kp(snp_name, _snp_name_map, _include);
40     vector<string> snp_name_buf(snp_name);
41     vector<double> snp_pval_buf(snp_pval);
42     snp_name.clear();
43     snp_pval.clear();
44     snp_name.resize(_include.size());
45     snp_pval.resize(_include.size());
46     int i = 0;
47     map<string, int> snp_name_buf_map;
48     for (i = 0; i < snp_name_buf.size(); i++) snp_name_buf_map.insert(pair<string,int>(snp_name_buf[i], i));
49     #pragma omp parallel for
50     for (i = 0; i < _include.size(); i++) {
51         map<string, int>::iterator iter = snp_name_buf_map.find(_snp_name[_include[i]]);
52         snp_name[i] = snp_name_buf[iter->second];
53         snp_pval[i] = snp_pval_buf[iter->second];
54     }
55     snp_chr.resize(_include.size());
56     snp_bp.resize(_include.size());
57     #pragma omp parallel for
58     for (i = 0; i < _include.size(); i++) {
59         snp_chr[i] = _chr[_include[i]];
60         snp_bp[i] = _bp[_include[i]];
61     }
62     if (_include.size() < 1) throw ("Error: no SNP is included in the analysis.");
63     else if (_chr[_include[0]] < 1) throw ("Error: chromosome information is missing.");
64     else if (_bp[_include[0]] < 1) throw ("Error: bp information is missing.");
65 }
66 
sbat_read_geneAnno(string gAnno_file,vector<string> & gene_name,vector<int> & gene_chr,vector<int> & gene_bp1,vector<int> & gene_bp2)67 void gcta::sbat_read_geneAnno(string gAnno_file, vector<string> &gene_name, vector<int> &gene_chr, vector<int> &gene_bp1, vector<int> &gene_bp2) {
68     ifstream in_gAnno(gAnno_file.c_str());
69     if (!in_gAnno) throw ("Error: can not open the file [" + gAnno_file + "] to read.");
70     cout << "Reading physical positions of the genes from [" + gAnno_file + "]." << endl;
71     string str_buf;
72     vector<string> vs_buf;
73     while (getline(in_gAnno, str_buf)) {
74         if (StrFunc::split_string(str_buf, vs_buf) != 4) throw ("Error: in line \"" + str_buf + "\".");
75         gene_chr.push_back(atoi(vs_buf[0].c_str()));
76         gene_bp1.push_back(atoi(vs_buf[1].c_str()));
77         gene_bp2.push_back(atoi(vs_buf[2].c_str()));
78         gene_name.push_back(vs_buf[3]);
79     }
80     in_gAnno.close();
81     cout << "Physical positions of " << gene_name.size() << " genes have been include." << endl;
82 }
83 
sbat_gene(string sAssoc_file,string gAnno_file,int wind,double sbat_ld_cutoff,bool sbat_write_snpset)84 void gcta::sbat_gene(string sAssoc_file, string gAnno_file, int wind, double sbat_ld_cutoff, bool sbat_write_snpset)
85 {
86     int i = 0, j = 0;
87     int snp_count;
88 
89     // read SNP association results
90     vector<string> snp_name;
91     vector<int> snp_chr, snp_bp;
92     vector<double> snp_pval;
93     sbat_read_snpAssoc(sAssoc_file, snp_name, snp_chr, snp_bp, snp_pval);
94     vector<double> snp_chisq(snp_pval.size());
95     for (i = 0; i < snp_pval.size(); i++) snp_chisq[i] = StatFunc::qchisq(snp_pval[i], 1);
96 
97     // get start and end of chr
98     int snp_num = snp_name.size();
99     map<int, string> chr_begin_snp, chr_end_snp;
100     chr_begin_snp.insert(pair<int, string>(snp_chr[0], snp_name[0]));
101     for (i = 1; i < snp_num; i++) {
102         if (snp_chr[i] != snp_chr[i - 1]) {
103             chr_begin_snp.insert(pair<int, string>(snp_chr[i], snp_name[i]));
104             chr_end_snp.insert(pair<int, string>(snp_chr[i - 1], snp_name[i - 1]));
105         }
106     }
107     chr_end_snp.insert(pair<int, string>(snp_chr[snp_num - 1], snp_name[snp_num - 1]));
108 
109     // read gene list
110     vector<string> gene_name;
111     vector<int> gene_chr, gene_bp1, gene_bp2;
112     sbat_read_geneAnno(gAnno_file, gene_name, gene_chr, gene_bp1, gene_bp2);
113 
114     // map genes to SNPs
115     cout << "Mapping the physical positions of genes to SNP data (gene bounaries: " << wind / 1000 << "Kb away from UTRs) ..." << endl;
116 
117     int gene_num = gene_name.size();
118     vector<string> gene2snp_1(gene_num), gene2snp_2(gene_num);
119     vector<locus_bp>::iterator iter;
120     map<int, string>::iterator chr_iter;
121     vector<locus_bp> snp_vec;
122     for (i = 0; i < snp_num; i++) snp_vec.push_back(locus_bp(snp_name[i], snp_chr[i], snp_bp[i]));
123     #pragma omp parallel for private(iter, chr_iter)
124     for (i = 0; i < gene_num; i++) {
125         iter = find_if(snp_vec.begin(), snp_vec.end(), locus_bp(gene_name[i], gene_chr[i], gene_bp1[i] - wind));
126         if (iter != snp_vec.end()) gene2snp_1[i] = iter->locus_name;
127         else gene2snp_1[i] = "NA";
128     }
129     #pragma omp parallel for private(iter, chr_iter)
130     for (i = 0; i < gene_num; i++) {
131         if (gene2snp_1[i] == "NA") {
132             gene2snp_2[i] = "NA";
133             continue;
134         }
135         iter = find_if(snp_vec.begin(), snp_vec.end(), locus_bp(gene_name[i], gene_chr[i], gene_bp2[i] + wind));
136         if (iter != snp_vec.end()){
137             if (iter->bp ==  gene_bp2[i] + wind) gene2snp_2[i] = iter->locus_name;
138             else {
139                 if(iter!=snp_vec.begin()){
140                     iter--;
141                     gene2snp_2[i] = iter->locus_name;
142                 }
143                 else gene2snp_2[i] = "NA";
144             }
145         }
146         else {
147             chr_iter = chr_end_snp.find(gene_chr[i]);
148             if (chr_iter == chr_end_snp.end()) gene2snp_2[i] = "NA";
149             else gene2snp_2[i] = chr_iter->second;
150         }
151     }
152     int mapped = 0;
153     for (i = 0; i < gene_num; i++) {
154         if (gene2snp_1[i] != "NA" && gene2snp_2[i] != "NA") mapped++;
155     }
156     if (mapped < 1) throw ("Error: no gene can be mapped to the SNP data. Please check the input data regarding chr and bp.");
157     else cout << mapped << " genes have been mapped to SNP data." << endl;
158 
159     // run gene-based test
160     if (_mu.empty()) calcu_mu();
161     cout << "\nRunning fastBAT analysis for genes ..." << endl;
162     if (sbat_ld_cutoff < 1) cout << "Pruning SNPs with LD rsq cutoff = " << sbat_ld_cutoff*sbat_ld_cutoff  << endl;
163     vector<double> gene_pval(gene_num), chisq_o(gene_num), min_snp_pval(gene_num);
164     vector<string> min_snp_name(gene_num);
165     vector<int> snp_num_in_gene(gene_num);
166     map<string, int>::iterator iter1, iter2;
167     map<string, int> snp_name_map;
168     string rgoodsnpfile = _out + ".gene.snpset";
169     ofstream rogoodsnp;
170     if (sbat_write_snpset) rogoodsnp.open(rgoodsnpfile.c_str());
171     for (i = 0; i < snp_name.size(); i++) snp_name_map.insert(pair<string,int>(snp_name[i], i));
172     for (i = 0; i < gene_num; i++) {
173         iter1 = snp_name_map.find(gene2snp_1[i]);
174         iter2 = snp_name_map.find(gene2snp_2[i]);
175         bool skip = false;
176         if (iter1 == snp_name_map.end() || iter2 == snp_name_map.end() || iter1->second >= iter2->second) skip = true;
177         snp_num_in_gene[i] = iter2->second - iter1->second + 1;
178         if(!skip && snp_num_in_gene[i] > 10000){
179             cout<<"Warning: Too many SNPs in the gene region ["<<gene_name[i]<<"]. Maximum limit is 10000. This gene is ignored in the analysis."<<endl;
180             skip = true;
181         }
182         if(skip){
183             gene_pval[i] = 2.0;
184             snp_num_in_gene[i] = 0;
185             continue;
186         }
187         chisq_o[i] = 0;
188         min_snp_pval[i]=2;
189         min_snp_name[i]="na";
190         for (j = iter1->second; j <= iter2->second; j++) {
191             if (min_snp_pval[i] > snp_pval[j]) {
192                 min_snp_pval[i] = snp_pval[j];
193                 min_snp_name[i] = snp_name[j]; //keep minimum value - regardless of whether SNP removed by LD pruning
194             }
195             chisq_o[i] += snp_chisq[j];
196         }
197         if(snp_num_in_gene[i] == 1) gene_pval[i] = StatFunc::pchisq(chisq_o[i], 1.0);
198         else {
199             vector<int> snp_indx;
200             for (j = iter1->second; j <= iter2->second; j++) snp_indx.push_back(j);
201             snp_count=snp_num_in_gene[i];
202             VectorXd eigenval;
203             vector<int> sub_indx;
204             sbat_calcu_lambda(snp_indx, eigenval, snp_count, sbat_ld_cutoff, sub_indx);
205             //recalculate chisq value from low correlation snp subset
206             if (sbat_ld_cutoff < 1) {
207                 chisq_o[i] = 0;
208                 for (j = 0; j < sub_indx.size(); j++) chisq_o[i] += snp_chisq[snp_indx[sub_indx[j]]];
209             }
210             snp_num_in_gene[i] = snp_count;
211             if (snp_count==1 && chisq_o[i] ==0) gene_pval[i] = 1;
212             else gene_pval[i] = StatFunc::pchisqsum(chisq_o[i], eigenval);
213 
214             if (sbat_write_snpset) {
215                 rogoodsnp << gene_name[i] << endl;
216                 for (int k = 0; k < sub_indx.size(); k++) rogoodsnp << snp_name[(iter1->second)+sub_indx[k]] << endl;
217                 rogoodsnp << "END" << endl << endl;
218             }
219 
220         }
221 
222         if((i + 1) % 100 == 0 || (i + 1) == gene_num) cout << i + 1 << " of " << gene_num << " genes.\r";
223     }
224 
225     string filename = _out + ".gene.fastbat";
226     cout << "\nSaving the results of the fastBAT analysis to [" + filename + "] ..." << endl;
227     ofstream ofile(filename.c_str());
228     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
229     ofile << "Gene\tChr\tStart\tEnd\tNo.SNPs\tSNP_start\tSNP_end\tChisq(Obs)\tPvalue\tTopSNP.Pvalue\tTopSNP" << endl;
230     for (i = 0; i < gene_num; i++) {
231         if(gene_pval[i]>1.5) continue;
232         ofile << gene_name[i] << "\t" << gene_chr[i] << "\t" << gene_bp1[i] << "\t" << gene_bp2[i] << "\t";
233         ofile << snp_num_in_gene[i] << "\t" << gene2snp_1[i] << "\t" << gene2snp_2[i] << "\t" << chisq_o[i];
234         ofile << "\t" << gene_pval[i] << "\t" << min_snp_pval[i] << "\t" << min_snp_name[i] << endl;
235         //else ofile << "0\tNA\tNA\tNA\tNA" << endl;
236     }
237     ofile.close();
238     if (sbat_write_snpset) {
239         cout << "The SNP sets have been save in file [" << rgoodsnpfile << "]." << endl;
240         rogoodsnp.close();
241     }
242 }
243 
sbat_read_snpset(string snpset_file,vector<string> & set_name,vector<vector<string>> & snpset)244 void gcta::sbat_read_snpset(string snpset_file, vector<string> &set_name, vector< vector<string> > &snpset)
245 {
246     ifstream in_snpset(snpset_file.c_str());
247     if (!in_snpset) throw ("Error: can not open the file [" + snpset_file + "] to read.");
248     cout << "\nReading SNP sets from [" + snpset_file + "]." << endl;
249     string str_buf;
250     vector<string> vs_buf, snpset_buf, snp_name;
251     int i = 0;
252     while (in_snpset>>str_buf) {
253         if(str_buf!="END" && str_buf!="end") vs_buf.push_back(str_buf);
254         else{
255             if(vs_buf.empty()) continue;
256             set_name.push_back(vs_buf[0]);
257             for(i = 1; i < vs_buf.size(); i++){
258                 if (_snp_name_map.find(vs_buf[i]) != _snp_name_map.end()){
259                     snpset_buf.push_back(vs_buf[i]);
260                     snp_name.push_back(vs_buf[i]);
261                 }
262             }
263             vs_buf.clear();
264             snpset.push_back(snpset_buf);
265             snpset_buf.clear();
266         }
267     }
268     in_snpset.close();
269     snp_name.erase(unique(snp_name.begin(), snp_name.end()), snp_name.end());
270     update_id_map_kp(snp_name, _snp_name_map, _include);
271     cout << snp_name.size() << " SNPs in " << snpset.size() << " sets have been included." << endl;
272 }
273 
sbat(string sAssoc_file,string snpset_file,double sbat_ld_cutoff,bool sbat_write_snpset)274 void gcta::sbat(string sAssoc_file, string snpset_file, double sbat_ld_cutoff, bool sbat_write_snpset)
275 {
276     int i = 0, j = 0;
277     int snp_count;
278 
279     // read SNP set file
280     vector<string> set_name;
281     vector< vector<string> > snpset;
282     sbat_read_snpset(snpset_file, set_name, snpset);
283     int set_num = set_name.size();
284 
285     // read SNP association results
286     vector<string> snp_name;
287     vector<int> snp_chr, snp_bp;
288     vector<double> snp_pval;
289     sbat_read_snpAssoc(sAssoc_file, snp_name, snp_chr, snp_bp, snp_pval);
290     vector<double> snp_chisq(snp_pval.size());
291     for (i = 0; i < snp_pval.size(); i++) snp_chisq[i] = StatFunc::qchisq(snp_pval[i], 1);
292 
293     // run gene-based test
294     if (_mu.empty()) calcu_mu();
295     cout << "\nRunning fastBAT analysis ..." << endl;
296     if (sbat_ld_cutoff < 1) cout << "Pruning snps with maximum ld cutoff " << sbat_ld_cutoff  << endl;
297     vector<double> set_pval(set_num), chisq_o(set_num), min_snp_pval(set_num);
298     vector<string> min_snp_name(set_num);
299     vector<int> snp_num_in_set(set_num);
300     map<string, int>::iterator iter;
301     map<string, int> snp_name_map;
302 
303     string rgoodsnpfile = _out + ".snpset";
304     ofstream rogoodsnp;
305     if (sbat_write_snpset) rogoodsnp.open(rgoodsnpfile.c_str());
306 
307     for (i = 0; i < snp_name.size(); i++) snp_name_map.insert(pair<string,int>(snp_name[i], i));
308     for (i = 0; i < set_num; i++) {
309         bool skip = false;
310         if(snpset[i].size() < 1) skip = true;
311         vector<int> snp_indx;
312         for(j = 0; j < snpset[i].size(); j++){
313             iter = snp_name_map.find(snpset[i][j]);
314             if(iter!=snp_name_map.end()) snp_indx.push_back(iter->second);
315         }
316         snp_num_in_set[i] = snp_indx.size();
317         if(!skip && snp_num_in_set[i] > 20000){
318             cout<<"Warning: Too many SNPs in the set ["<<set_name[i]<<"]. Maximum limit is 20000. This gene is ignored in the analysis."<<endl;
319             skip = true;
320         }
321         if(skip){
322             set_pval[i] = 2.0;
323             snp_num_in_set[i] = 0;
324             continue;
325         }
326         chisq_o[i] = 0;
327         min_snp_pval[i]=2;
328         min_snp_name[i]="na";
329         for (j = 0; j < snp_indx.size(); j++) {
330             if (min_snp_pval[i] > snp_pval[snp_indx[j]]) {
331                 min_snp_pval[i] = snp_pval[snp_indx[j]];
332                 min_snp_name[i] = snp_name[snp_indx[j]];
333             }
334             chisq_o[i] += snp_chisq[snp_indx[j]];
335         }
336         if(snp_num_in_set[i] == 1) set_pval[i] = StatFunc::pchisq(chisq_o[i], 1.0);
337         else {
338             snp_count=snp_num_in_set[i];
339             VectorXd eigenval;
340             vector<int> sub_indx;
341             sbat_calcu_lambda(snp_indx, eigenval, snp_count, sbat_ld_cutoff, sub_indx);
342 
343             //recalculate chisq value from low correlation snp subset
344             if (sbat_ld_cutoff < 1) {
345                 chisq_o[i] = 0;
346                 for (j = 0; j < sub_indx.size(); j++) chisq_o[i] += snp_chisq[snp_indx[sub_indx[j]]];
347             }
348             snp_num_in_set[i] = snp_count;
349             if (snp_count==1 && chisq_o[i] ==0) set_pval[i] = 1;
350             else set_pval[i] = StatFunc::pchisqsum(chisq_o[i], eigenval);
351 
352             if (sbat_write_snpset) {
353                 rogoodsnp << set_name[i] << endl;
354                 for (int k = 0; k < sub_indx.size(); k++) rogoodsnp << snp_name[snp_indx[sub_indx[k]]] << endl;
355                 rogoodsnp << "END" << endl << endl;
356             }
357 
358         }
359 
360         if((i + 1) % 100 == 0 || (i + 1) == set_num) cout << i + 1 << " of " << set_num << " sets.\r";
361     }
362 
363     string filename = _out + ".fastbat";
364     cout << "\nSaving the results of the fastBAT analysis to [" + filename + "] ..." << endl;
365     ofstream ofile(filename.c_str());
366     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
367     ofile << "Set\tNo.SNPs\tChisq(Obs)\tPvalue\tTopSNP.Pvalue\tTopSNP" << endl;
368     for (i = 0; i < set_num; i++) {
369         if(set_pval[i]>1.5) continue;
370         ofile << set_name[i] << "\t" << snp_num_in_set[i] << "\t" << chisq_o[i] << "\t";
371         ofile << set_pval[i] << "\t" << min_snp_pval[i] << "\t" << min_snp_name[i] << endl;
372     }
373     ofile.close();
374     if (sbat_write_snpset) {
375         cout << "The SNP sets have been save in file [" << rgoodsnpfile << "]." << endl;
376         rogoodsnp.close();
377     }
378 }
379 
sbat_seg(string sAssoc_file,int seg_size,double sbat_ld_cutoff,bool sbat_write_snpset)380 void gcta::sbat_seg(string sAssoc_file, int seg_size, double sbat_ld_cutoff, bool sbat_write_snpset)
381 {
382     int i = 0, j = 0;
383     int snp_count;
384 
385        // read SNP association results
386     vector<string> snp_name;
387     vector<int> snp_chr, snp_bp;
388     vector<double> snp_pval;
389     sbat_read_snpAssoc(sAssoc_file, snp_name, snp_chr, snp_bp, snp_pval);
390     vector<double> snp_chisq(snp_pval.size());
391     for (i = 0; i < snp_pval.size(); i++) snp_chisq[i] = StatFunc::qchisq(snp_pval[i], 1);
392 
393     // run gene-based test
394     if (_mu.empty()) calcu_mu();
395     cout << "\nRunning fastBAT analysis at genomic segments with a length of " << seg_size/1000 << "Kb ..." << endl;
396     if (sbat_ld_cutoff < 1) cout << "Pruning snps with maximum ld cutoff " << sbat_ld_cutoff  << endl;
397     vector< vector<int> > snp_set_indx;
398     vector<int> set_chr, set_start_bp, set_end_bp;
399     get_sbat_seg_blk(seg_size, snp_set_indx, set_chr, set_start_bp, set_end_bp);
400     int set_num = snp_set_indx.size();
401     vector<double> set_pval(set_num), chisq_o(set_num), min_snp_pval(set_num);
402     vector<string> min_snp_name(set_num);
403     vector<int> snp_num_in_set(set_num);
404 
405     string rgoodsnpfile = _out + ".seg.snpset";
406     ofstream rogoodsnp;
407     if (sbat_write_snpset) rogoodsnp.open(rgoodsnpfile.c_str());
408 
409     for (i = 0; i < set_num; i++) {
410         bool skip = false;
411         vector<int> snp_indx = snp_set_indx[i];
412         if(snp_indx.size() < 1) skip = true;
413         snp_num_in_set[i] = snp_indx.size();
414         if(!skip && snp_num_in_set[i] > 20000){
415             cout<<"Warning: Too many SNPs in the set on [chr" << set_chr[i] << ":" << set_start_bp[i] << "-" << set_end_bp[i] << "]. Maximum limit is 20000. This gene is ignored in the analysis."<<endl;
416             skip = true;
417         }
418         if(skip){
419             set_pval[i] = 2.0;
420             snp_num_in_set[i] = 0;
421             continue;
422         }
423         chisq_o[i] = 0;
424         min_snp_pval[i]=2;
425         min_snp_name[i]="na";
426         for (j = 0; j < snp_indx.size(); j++) {
427             if (min_snp_pval[i] > snp_pval[snp_indx[j]]) {
428                 min_snp_pval[i] = snp_pval[snp_indx[j]];
429                 min_snp_name[i] = snp_name[snp_indx[j]];
430             }
431             chisq_o[i] += snp_chisq[snp_indx[j]];
432         }
433         if(snp_num_in_set[i] == 1) set_pval[i] = StatFunc::pchisq(chisq_o[i], 1.0);
434         else {
435             snp_count=snp_num_in_set[i];
436             VectorXd eigenval;
437             vector<int> sub_indx;
438             sbat_calcu_lambda(snp_indx, eigenval, snp_count, sbat_ld_cutoff, sub_indx);
439             //recalculate chisq value from low correlation snp subset
440             if (sbat_ld_cutoff < 1) {
441                 chisq_o[i] = 0;
442                 for (j = 0; j < sub_indx.size(); j++) chisq_o[i] += snp_chisq[snp_indx[sub_indx[j]]];
443             }
444             snp_num_in_set[i] = snp_count;
445             if (snp_count==1 && chisq_o[i] ==0) set_pval[i] = 1;
446             else set_pval[i] = StatFunc::pchisqsum(chisq_o[i], eigenval);
447 
448            if (sbat_write_snpset) {
449                 rogoodsnp << "SEG" << i << ":" << set_start_bp[i] << "-" << set_end_bp[i] << endl;
450                 for (int k = 0; k < sub_indx.size(); k++) rogoodsnp << snp_name[snp_indx[sub_indx[k]]] << endl;
451                 rogoodsnp << "END" << endl << endl;
452             }
453 
454         }
455 
456         if((i + 1) % 100 == 0 || (i + 1) == set_num) cout << i + 1 << " of " << set_num << " sets.\r";
457     }
458 
459     string filename = _out + ".seg.fastbat";
460     cout << "\nSaving the results of the segment-based fastBAT analysis to [" + filename + "] ..." << endl;
461     ofstream ofile(filename.c_str());
462     if (!ofile) throw ("Can not open the file [" + filename + "] to write.");
463     ofile << "Chr\tStart\tEnd\tNo.SNPs\tChisq(Obs)\tPvalue\tTopSNP.Pvalue\tTopSNP" << endl;
464     for (i = 0; i < set_num; i++) {
465         if(set_pval[i]>1.5) continue;
466         ofile << set_chr[i] << "\t" << set_start_bp[i] << "\t"<< set_end_bp[i] << "\t";
467         ofile << snp_num_in_set[i] << "\t" << chisq_o[i] << "\t" << set_pval[i] << "\t";
468         ofile << min_snp_pval[i] << "\t" << min_snp_name[i] << endl;
469 
470     }
471     ofile.close();
472     if (sbat_write_snpset) {
473         cout << "The SNP sets have been save in file [" << rgoodsnpfile << "]." << endl;
474         rogoodsnp.close();
475     }
476 }
477 
get_sbat_seg_blk(int seg_size,vector<vector<int>> & snp_set_indx,vector<int> & set_chr,vector<int> & set_start_bp,vector<int> & set_end_bp)478 void gcta::get_sbat_seg_blk(int seg_size, vector< vector<int> > &snp_set_indx, vector<int> &set_chr, vector<int> &set_start_bp, vector<int> &set_end_bp)
479 {
480     int i = 0, j = 0, k = 0, m = _include.size();
481 
482     vector<int> brk_pnt;
483     brk_pnt.push_back(0);
484     for (i = 1, j = 0; i < m; i++) {
485         if (i == (m - 1)) brk_pnt.push_back(m - 1);
486         else if (_chr[_include[i]] != _chr[_include[brk_pnt[j]]]) {
487             brk_pnt.push_back(i - 1);
488             j++;
489             brk_pnt.push_back(i);
490             j++;
491         }
492         else if (_bp[_include[i]] - _bp[_include[brk_pnt[j]]] > seg_size) {
493             brk_pnt.push_back(i - 1);
494             j++;
495             brk_pnt.push_back(i);
496             j++;
497         }
498     }
499 
500     snp_set_indx.clear();
501     set_chr.clear();
502     set_start_bp.clear();
503     set_end_bp.clear();
504     for (i = 0; i < brk_pnt.size() - 1; i++) {
505         int size = brk_pnt[i + 1] - brk_pnt[i] + 1;
506         if(size < 3 && (i%2 != 0)) continue;
507         vector<int> snp_indx(size);
508         for (j = brk_pnt[i], k = 0; j <= brk_pnt[i + 1]; j++, k++){
509             snp_indx[k] = j;
510         }
511         snp_set_indx.push_back(snp_indx);
512         set_chr.push_back(_chr[_include[brk_pnt[i]]]);
513         set_start_bp.push_back(_bp[_include[brk_pnt[i]]]);
514         set_end_bp.push_back(_bp[_include[brk_pnt[i + 1]]]);
515     }
516 }
517 
sbat_calcu_lambda(vector<int> & snp_indx,VectorXd & eigenval,int & snp_count,double sbat_ld_cutoff,vector<int> & sub_indx)518 void gcta::sbat_calcu_lambda(vector<int> &snp_indx, VectorXd &eigenval, int &snp_count, double sbat_ld_cutoff, vector<int> &sub_indx)
519 {
520     int i = 0, j = 0, k = 0, n = _keep.size(), m = snp_indx.size();
521 
522     MatrixXf X;
523     make_XMat_subset(X, snp_indx, false);
524     vector<int> rm_ID1;
525     double R_cutoff = sbat_ld_cutoff;
526     int qi = 0; //alternate index
527 
528     VectorXd sumsq_x(m);
529     for (j = 0; j < m; j++) sumsq_x[j] = X.col(j).dot(X.col(j));
530 
531     MatrixXf C = X.transpose() * X;
532     X.resize(0,0);
533     #pragma omp parallel for private(j)
534     for (i = 0; i < m; i++) {
535         for (j = 0; j < m; j++) {
536             double d_buf = sqrt(sumsq_x[i] * sumsq_x[j]);
537             if(d_buf>0.0) C(i,j) /= d_buf;
538             else C(i,j) = 0.0;
539         }
540     }
541 
542     if (sbat_ld_cutoff < 1) rm_cor_sbat(C, R_cutoff, m, rm_ID1);
543         //Create new index
544         for (int i=0 ; i<m ; i++) {
545             if (rm_ID1.size() == 0) sub_indx.push_back(i);
546             else {
547                 if (rm_ID1[qi] == i) qi++; //Skip removed snp
548                 else sub_indx.push_back(i);
549             }
550         }
551         snp_count = sub_indx.size();
552         if (sub_indx.size() < C.size()) { //Build new matrix
553             MatrixXf D(sub_indx.size(),sub_indx.size());
554             for (i = 0 ; i < sub_indx.size() ; i++) {
555                for (j = 0 ; j < sub_indx.size() ; j++) {
556                    D(i,j) = C(sub_indx[i],sub_indx[j]);
557                }
558             }
559             C = D;
560         }
561 
562     SelfAdjointEigenSolver<MatrixXf> saes(C);
563     eigenval = saes.eigenvalues().cast<double>();
564 }
565 
rm_cor_sbat(MatrixXf & R,double R_cutoff,int m,vector<int> & rm_ID1)566 void gcta::rm_cor_sbat(MatrixXf &R, double R_cutoff, int m, vector<int> &rm_ID1) {
567     //Modified version of rm_cor_indi from grm.cpp
568 
569     int i = 0, j = 0, i_buf = 0;
570     vector<int> rm_ID2;
571 
572     //float tmpr = 0;
573     for (i = 0; i < m; i++) {
574         for (j = 0; j < i; j++) {
575             if (fabs(R(i,j)) > R_cutoff ) {
576                 rm_ID1.push_back(i);
577                 rm_ID2.push_back(j);
578             }
579         }
580     }
581 
582     // count the number of appearance of each "position" in the vector, which involves a few steps
583     vector<int> rm_uni_ID(rm_ID1);
584     rm_uni_ID.insert(rm_uni_ID.end(), rm_ID2.begin(), rm_ID2.end());
585     stable_sort(rm_uni_ID.begin(), rm_uni_ID.end());
586     rm_uni_ID.erase(unique(rm_uni_ID.begin(), rm_uni_ID.end()), rm_uni_ID.end());
587     map<int, int> rm_uni_ID_count;
588     for (i = 0; i < rm_uni_ID.size(); i++) {
589         i_buf = count(rm_ID1.begin(), rm_ID1.end(), rm_uni_ID[i]) + count(rm_ID2.begin(), rm_ID2.end(), rm_uni_ID[i]);
590         rm_uni_ID_count.insert(pair<int, int>(rm_uni_ID[i], i_buf));
591     }
592 
593     // swapping
594     map<int, int>::iterator iter1, iter2;
595     for (i = 0; i < rm_ID1.size(); i++) {
596         iter1 = rm_uni_ID_count.find(rm_ID1[i]);
597         iter2 = rm_uni_ID_count.find(rm_ID2[i]);
598         if (iter1->second < iter2->second) {
599             i_buf = rm_ID1[i];
600             rm_ID1[i] = rm_ID2[i];
601             rm_ID2[i] = i_buf;
602         }
603     }
604     stable_sort(rm_ID1.begin(), rm_ID1.end());
605     rm_ID1.erase(unique(rm_ID1.begin(), rm_ID1.end()), rm_ID1.end());
606 }
607 
608