1 /*
2 * GCTA: a tool for Genome-wide Complex Trait Analysis
3 *
4 * Implementations of functions for reading the raw genotype data
5 *
6 * 2010 by Jian Yang <jian.yang@qimr.edu.au>
7 *
8 * This file is distributed under the GNU General Public
9 * License, Version 2. Please see the file COPYING for more
10 * details
11 */
12
13 #include "gcta.h"
14
read_IRG_fnames(string snp_info_file,string fname_file,double GC_cutoff)15 void gcta::read_IRG_fnames(string snp_info_file, string fname_file, double GC_cutoff)
16 {
17 // read SNP summary information
18 cout<<"Reading summary information of the SNPs ..."<<endl;
19 ifstream i_snp_info(snp_info_file.c_str());
20 if(!i_snp_info) throw("Error: can not open the file ["+snp_info_file+"] to read.");
21 int i=0,j=0;
22 string str_buf;
23 vector<string> vs_buf;
24 getline(i_snp_info, str_buf);
25 while(getline(i_snp_info, str_buf)){
26 StrFunc::split_string(str_buf, vs_buf, "\t ,;\n");
27 j=0;
28 _snp_name.push_back(vs_buf[++j]);
29 _chr.push_back(atoi(vs_buf[++j].c_str()));
30 _bp.push_back(atoi(vs_buf[++j].c_str()));
31 }
32 i_snp_info.close();
33 _snp_num=_snp_name.size();
34 cout<<_snp_num<<" SNPs specified in ["+snp_info_file+"]."<<endl;
35
36 // save PLINK map file
37 string map_file=_out+".map";
38 ofstream omap(map_file.c_str());
39 cout<<"Saving PLINK MAP file ..."<<endl;
40 if(!omap) throw("Error: can not open the file ["+map_file+"] to write.");
41 for(i=0; i<_snp_num; i++) omap<<_chr[i]<<"\t"<<_snp_name[i]<<"\t0\t"<<_bp[i]<<endl;
42 omap.close();
43 cout<<"PLINK MAP file has been saved in ["+map_file+"].\n"<<endl;
44
45 // read the filenames of the raw genotype data files
46 ifstream i_fnames(fname_file.c_str());
47 if(!i_fnames) throw("Error: can not open the file ["+fname_file+"] to read.");
48 vector<string> fnames;
49 while(getline(i_fnames, str_buf)){
50 if(StrFunc::split_string(str_buf, vs_buf)==1) fnames.push_back(vs_buf[0]);
51 }
52 i_fnames.close();
53 _indi_num=fnames.size();
54 cout<<_indi_num<<" raw genotype data filenames specified in ["+fname_file+"]."<<endl;
55
56 // read raw genotype file
57 _snp_1.resize(_snp_num);
58 _snp_2.resize(_snp_num);
59 for(i=0; i<_snp_num; i++){
60 _snp_1[i].reserve(_indi_num);
61 _snp_2[i].reserve(_indi_num);
62 }
63 cout<<"Reading the raw genotype files and saving the genotype data in PLINK PED format ..."<<endl;
64 cout<<"(SNP genotypes with GenCall rate < "<<GC_cutoff<<" are regarded as missing)"<<endl;
65 string ped_file=_out+".ped";
66 ofstream oped(ped_file.c_str());
67 if(!oped) throw("Error: can not open the file ["+ped_file+"] to read.");
68 for(i=0; i<_indi_num; i++){
69 read_one_IRG(oped, i, fnames[i], GC_cutoff);
70 cout<<i+1<<" of "<<_indi_num<<" files.\r";
71 }
72 cout<<"Genotype data for "<<_indi_num<<" individuals have been save in the file ["+ped_file+"]."<<endl;
73 oped.close();
74 }
75
flip_allele(char a)76 char gcta::flip_allele(char a)
77 {
78 if(a=='A') return('T');
79 else if(a=='C') return('G');
80 else if(a=='G') return('C');
81 else if(a=='T') return('A');
82 else return('0');
83 }
84
read_one_IRG(ofstream & oped,int ind,string IRG_fname,double GC_cutoff)85 void gcta::read_one_IRG(ofstream &oped, int ind, string IRG_fname, double GC_cutoff)
86 {
87 ifstream i_IRG(IRG_fname.c_str());
88 if(!i_IRG) throw("Error: can not open the file ["+IRG_fname+"] to read.");
89 char a1='0', a2='0';
90 int i=0, j=0, snp_num=0;
91 double GC=0.0;
92 string str_buf, fid, pid;
93 vector<string> vs_buf;
94 for(i=0; i<2; i++) getline(i_IRG, str_buf);
95 StrFunc::split_string(str_buf, vs_buf);
96 bool oldversion=false;
97 if(vs_buf[2]=="1.1.9") oldversion=true;
98 for(i=0; i<3; i++) getline(i_IRG, str_buf);
99 StrFunc::split_string(str_buf, vs_buf);
100 snp_num=atoi(vs_buf[2].c_str());
101 if(snp_num!=_snp_num) throw("Error: the number of SNPs specified in the summary file does not match that in the raw genotype file ["+IRG_fname+"].");
102 for(i=0; i<6; i++) getline(i_IRG, str_buf);
103 for(i=0; i<snp_num; i++){
104 getline(i_IRG, str_buf);
105 StrFunc::split_string(str_buf, vs_buf, "\t ,;\n");
106 if(vs_buf[0]!=_snp_name[i]) throw("Error: the SNP ["+vs_buf[0]+"] specified in the summary file does not match that in the raw genotype file ["+IRG_fname+"]. The order of the SNPs have been changed?");
107 pid=vs_buf[1];
108 if(oldversion) fid=pid;
109 else fid=vs_buf[2];
110 if(oldversion){
111 a1=vs_buf[2][0];
112 a2=vs_buf[3][0];
113 GC=atof(vs_buf[8].c_str());
114 }
115 else{
116 GC=atof(vs_buf[3].c_str());
117 a1=vs_buf[6][0];
118 a2=vs_buf[7][0];
119 }
120 if(GC<GC_cutoff) a1=a2='0';
121 if(i==0) oped<<fid<<" "<<pid<<" -9 -9 -9 -9 ";
122 oped<<a1<<" "<<a2<<" ";
123 }
124 oped<<endl;
125 i_IRG.close();
126 }
127