1 /*
2  * GCTA: a tool for Genome-wide Complex Trait Analysis
3  *
4  * Implementations of functions for reading the raw genotype data
5  *
6  * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7  *
8  * This file is distributed under the GNU General Public
9  * License, Version 2.  Please see the file COPYING for more
10  * details
11  */
12 
13 #include "gcta.h"
14 
read_IRG_fnames(string snp_info_file,string fname_file,double GC_cutoff)15 void gcta::read_IRG_fnames(string snp_info_file, string fname_file, double GC_cutoff)
16 {
17     // read SNP summary information
18     cout<<"Reading summary information of the SNPs ..."<<endl;
19     ifstream i_snp_info(snp_info_file.c_str());
20     if(!i_snp_info) throw("Error: can not open the file ["+snp_info_file+"] to read.");
21     int i=0,j=0;
22     string str_buf;
23     vector<string> vs_buf;
24     getline(i_snp_info, str_buf);
25     while(getline(i_snp_info, str_buf)){
26         StrFunc::split_string(str_buf, vs_buf, "\t ,;\n");
27         j=0;
28         _snp_name.push_back(vs_buf[++j]);
29         _chr.push_back(atoi(vs_buf[++j].c_str()));
30         _bp.push_back(atoi(vs_buf[++j].c_str()));
31     }
32     i_snp_info.close();
33     _snp_num=_snp_name.size();
34     cout<<_snp_num<<" SNPs specified in ["+snp_info_file+"]."<<endl;
35 
36     // save PLINK map file
37     string map_file=_out+".map";
38     ofstream omap(map_file.c_str());
39     cout<<"Saving PLINK MAP file ..."<<endl;
40     if(!omap) throw("Error: can not open the file ["+map_file+"] to write.");
41     for(i=0; i<_snp_num; i++) omap<<_chr[i]<<"\t"<<_snp_name[i]<<"\t0\t"<<_bp[i]<<endl;
42     omap.close();
43     cout<<"PLINK MAP file has been saved in ["+map_file+"].\n"<<endl;
44 
45     // read the filenames of the raw genotype data files
46     ifstream i_fnames(fname_file.c_str());
47     if(!i_fnames) throw("Error: can not open the file ["+fname_file+"] to read.");
48     vector<string> fnames;
49     while(getline(i_fnames, str_buf)){
50         if(StrFunc::split_string(str_buf, vs_buf)==1) fnames.push_back(vs_buf[0]);
51     }
52     i_fnames.close();
53     _indi_num=fnames.size();
54     cout<<_indi_num<<" raw genotype data filenames specified in ["+fname_file+"]."<<endl;
55 
56     // read raw genotype file
57     _snp_1.resize(_snp_num);
58     _snp_2.resize(_snp_num);
59     for(i=0; i<_snp_num; i++){
60         _snp_1[i].reserve(_indi_num);
61         _snp_2[i].reserve(_indi_num);
62     }
63     cout<<"Reading the raw genotype files and saving the genotype data in PLINK PED format ..."<<endl;
64     cout<<"(SNP genotypes with GenCall rate < "<<GC_cutoff<<" are regarded as missing)"<<endl;
65     string ped_file=_out+".ped";
66     ofstream oped(ped_file.c_str());
67     if(!oped) throw("Error: can not open the file ["+ped_file+"] to read.");
68     for(i=0; i<_indi_num; i++){
69         read_one_IRG(oped, i, fnames[i], GC_cutoff);
70         cout<<i+1<<" of "<<_indi_num<<" files.\r";
71     }
72     cout<<"Genotype data for "<<_indi_num<<" individuals have been save in the file ["+ped_file+"]."<<endl;
73     oped.close();
74 }
75 
flip_allele(char a)76 char gcta::flip_allele(char a)
77 {
78     if(a=='A') return('T');
79     else if(a=='C') return('G');
80     else if(a=='G') return('C');
81     else if(a=='T') return('A');
82     else return('0');
83 }
84 
read_one_IRG(ofstream & oped,int ind,string IRG_fname,double GC_cutoff)85 void gcta::read_one_IRG(ofstream &oped, int ind, string IRG_fname, double GC_cutoff)
86 {
87     ifstream i_IRG(IRG_fname.c_str());
88     if(!i_IRG) throw("Error: can not open the file ["+IRG_fname+"] to read.");
89     char a1='0', a2='0';
90     int i=0, j=0, snp_num=0;
91     double GC=0.0;
92     string str_buf, fid, pid;
93     vector<string> vs_buf;
94     for(i=0; i<2; i++) getline(i_IRG, str_buf);
95     StrFunc::split_string(str_buf, vs_buf);
96     bool oldversion=false;
97     if(vs_buf[2]=="1.1.9") oldversion=true;
98     for(i=0; i<3; i++) getline(i_IRG, str_buf);
99     StrFunc::split_string(str_buf, vs_buf);
100     snp_num=atoi(vs_buf[2].c_str());
101     if(snp_num!=_snp_num) throw("Error: the number of SNPs specified in the summary file does not match that in the raw genotype file ["+IRG_fname+"].");
102     for(i=0; i<6; i++) getline(i_IRG, str_buf);
103     for(i=0; i<snp_num; i++){
104         getline(i_IRG, str_buf);
105         StrFunc::split_string(str_buf, vs_buf, "\t ,;\n");
106         if(vs_buf[0]!=_snp_name[i]) throw("Error: the SNP ["+vs_buf[0]+"] specified in the summary file does not match that in the raw genotype file ["+IRG_fname+"]. The order of the SNPs have been changed?");
107         pid=vs_buf[1];
108         if(oldversion) fid=pid;
109         else fid=vs_buf[2];
110         if(oldversion){
111             a1=vs_buf[2][0];
112             a2=vs_buf[3][0];
113             GC=atof(vs_buf[8].c_str());
114         }
115         else{
116             GC=atof(vs_buf[3].c_str());
117             a1=vs_buf[6][0];
118             a2=vs_buf[7][0];
119         }
120         if(GC<GC_cutoff) a1=a2='0';
121         if(i==0) oped<<fid<<" "<<pid<<" -9 -9 -9 -9 ";
122         oped<<a1<<" "<<a2<<" ";
123     }
124     oped<<endl;
125     i_IRG.close();
126 }
127