1 #include "Variant.h"
2 #include "split.h"
3 #include "cdflib.hpp"
4 #include "pdflib.hpp"
5 #include "var.hpp"
6
7 #include <string>
8 #include <iostream>
9 #include <math.h>
10 #include <cmath>
11 #include <stdlib.h>
12 #include <time.h>
13 #include <stdio.h>
14 #include <getopt.h>
15 #include "gpatInfo.hpp"
16
17 using namespace std;
18 using namespace vcflib;
19
20
printHelp(void)21 void printHelp(void){
22
23 cerr << endl << endl;
24 cerr << "INFO: help" << endl;
25 cerr << "INFO: description:" << endl;
26 cerr << " wcFst is Weir & Cockerham's Fst for two populations. Negative values are VALID, " << endl;
27 cerr << " they are sites which can be treated as zero Fst. For more information see Evolution, Vol. 38 N. 6 Nov 1984. " << endl;
28 cerr << " Specifically wcFst uses equations 1,2,3,4. " << endl << endl;
29
30 cerr << "Output : 3 columns : " << endl;
31 cerr << " 1. seqid " << endl;
32 cerr << " 2. position " << endl;
33 cerr << " 3. target allele frequency " << endl;
34 cerr << " 4. background allele frequency " << endl;
35 cerr << " 5. wcFst " << endl << endl;
36
37 cerr << "INFO: usage: wcFst --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1 --type PL " << endl;
38 cerr << endl;
39 cerr << "INFO: required: t,target -- argument: a zero based comma separated list of target individuals corrisponding to VCF columns " << endl;
40 cerr << "INFO: required: b,background -- argument: a zero based comma separated list of background individuals corrisponding to VCF columns " << endl;
41 cerr << "INFO: required: f,file -- argument: proper formatted VCF " << endl;
42 cerr << "INFO: required, y,type -- argument: genotype likelihood format; genotype : GT,GL,PL,GP " << endl;
43 cerr << "INFO: optional: r,region -- argument: a tabix compliant genomic range: seqid or seqid:start-end " << endl;
44 cerr << "INFO: optional: d,deltaaf -- argument: skip sites where the difference in allele frequencies is less than deltaaf, default is zero " << endl;
45
46 printVersion();
47 }
48
49
bound(double v)50 double bound(double v){
51 if(v <= 0.00001){
52 return 0.00001;
53 }
54 if(v >= 0.99999){
55 return 0.99999;
56 }
57 return v;
58 }
59
loadIndices(map<int,int> & index,string set)60 void loadIndices(map<int, int> & index, string set){
61
62 vector<string> indviduals = split(set, ",");
63
64 vector<string>::iterator it = indviduals.begin();
65
66 for(; it != indviduals.end(); it++){
67 index[ atoi( (*it).c_str() ) ] = 1;
68 }
69 }
70
71
main(int argc,char ** argv)72 int main(int argc, char** argv) {
73
74 // set the random seed for MCMC
75
76 srand((unsigned)time(NULL));
77
78 // the filename
79
80 string filename = "NA";
81
82 // set region to scaffold
83
84 string region = "NA";
85
86 // using vcflib; thanks to Erik Garrison
87
88 VariantCallFile variantFile;
89
90 // zero based index for the target and background indivudals
91
92 map<int, int> it, ib;
93
94 // deltaaf is the difference of allele frequency we bother to look at
95
96 string deltaaf ;
97 double daf = 0.00;
98
99 // genotype likelihood format
100
101 string type = "NA";
102
103
104 const struct option longopts[] =
105 {
106 {"version" , 0, 0, 'v'},
107 {"help" , 0, 0, 'h'},
108 {"file" , 1, 0, 'f'},
109 {"target" , 1, 0, 't'},
110 {"background", 1, 0, 'b'},
111 {"deltaaf" , 1, 0, 'd'},
112 {"type" , 1, 0, 'y'},
113 {"region" , 1, 0, 'r'},
114 {0,0,0,0}
115 };
116
117 int index;
118 int iarg=0;
119
120 while(iarg != -1)
121 {
122 iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chv", longopts, &index);
123
124 switch (iarg)
125 {
126 case 'h':
127 printHelp();
128 return 0;
129 case 'v':
130 printVersion();
131 return 0;
132 case 't':
133 loadIndices(it, optarg);
134 cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
135 cerr << "INFO: target ids: " << optarg << endl;
136 break;
137 case 'b':
138 loadIndices(ib, optarg);
139 cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
140 cerr << "INFO: background ids: " << optarg << endl;
141 break;
142 case 'f':
143 cerr << "INFO: file: " << optarg << endl;
144 filename = optarg;
145 break;
146 case 'd':
147 cerr << "INFO: only scoring sites where the allele frequency difference is greater than: " << optarg << endl;
148 deltaaf = optarg;
149 daf = atof(deltaaf.c_str());
150 break;
151 case 'y':
152 type = optarg;
153 cerr << "INFO: setting genotype likelihood format to: " << type << endl;
154 break;
155 case 'r':
156 cerr << "INFO: set seqid region to : " << optarg << endl;
157 region = optarg;
158 break;
159 default:
160 break;
161 }
162 }
163
164 if(filename == "NA"){
165 cerr << "FATAL: did not specify a required option: file" << endl;
166 printHelp();
167 exit(1);
168 }
169
170 variantFile.open(filename);
171
172 if(region != "NA"){
173 if(! variantFile.setRegion(region)){
174 cerr << "FATAL: unable to set region" << endl;
175 return 1;
176 }
177 }
178
179 if (!variantFile.is_open()) {
180 cerr << "FATAL: could not open VCF for reading" << endl;
181 printHelp();
182 return 1;
183 }
184
185 map<string, int> okayGenotypeLikelihoods;
186 okayGenotypeLikelihoods["PL"] = 1;
187 okayGenotypeLikelihoods["GL"] = 1;
188 okayGenotypeLikelihoods["GP"] = 1;
189 okayGenotypeLikelihoods["GT"] = 1;
190
191 if(type == "NA"){
192 cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
193 printHelp();
194 return 1;
195 }
196 if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
197 cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
198 printHelp();
199 return 1;
200 }
201
202 Variant var(variantFile);
203
204 vector<string> samples = variantFile.sampleNames;
205 int nsamples = samples.size();
206
207 while (variantFile.getNextVariant(var)) {
208
209 // biallelic sites naturally
210
211 if(var.alt.size() > 1){
212 continue;
213 }
214
215 vector < map< string, vector<string> > > target, background, total;
216
217 int index = 0;
218
219 for(int nsamp = 0; nsamp < nsamples; nsamp++){
220
221 map<string, vector<string> > sample = var.samples[ samples[nsamp]];
222
223 if(sample["GT"].front() != "./."){
224 if(it.find(index) != it.end() ){
225 target.push_back(sample);
226 }
227 if(ib.find(index) != ib.end()){
228 background.push_back(sample);
229 }
230 }
231 index += 1;
232 }
233
234
235 if(target.size() < 5 || background.size() < 5){
236 continue;
237 }
238
239 genotype * populationTarget ;
240 genotype * populationBackground ;
241
242 if(type == "PL"){
243 populationTarget = new pl();
244 populationBackground = new pl();
245 }
246 if(type == "GL"){
247 populationTarget = new gl();
248 populationBackground = new gl();
249 }
250 if(type == "GP"){
251 populationTarget = new gp();
252 populationBackground = new gp();
253 }
254 if(type == "GT"){
255 populationTarget = new gt();
256 populationBackground = new gt();
257 }
258
259 populationTarget->loadPop(target, var.sequenceName, var.position);
260 populationBackground->loadPop(background, var.sequenceName, var.position);
261
262 if(populationTarget->af == -1 || populationBackground->af == -1){
263 delete populationTarget;
264 delete populationBackground;
265 continue;
266 }
267 if(populationTarget->af == 1 && populationBackground->af == 1){
268 delete populationTarget;
269 delete populationBackground;
270 continue;
271 }
272 if(populationTarget->af == 0 && populationBackground->af == 0){
273 delete populationTarget;
274 delete populationBackground;
275 continue;
276 }
277
278 double afdiff = abs(populationTarget->af - populationBackground->af);
279
280 if(afdiff < daf){
281 delete populationTarget;
282 delete populationBackground;
283 continue;
284 }
285
286 // pg 1360 B.S Weir and C.C. Cockerham 1984
287 double nbar = ( populationTarget->ngeno / 2 ) + (populationBackground->ngeno / 2);
288 double rn = 2*nbar;
289
290 // special case of only two populations
291 double nc = rn ;
292 nc -= (pow(populationTarget->ngeno,2)/rn);
293 nc -= (pow(populationBackground->ngeno,2)/rn);
294 // average sample frequency
295 double pbar = (populationTarget->af + populationBackground->af) / 2;
296
297 // sample variance of allele A frequences over the population
298
299 double s2 = 0;
300 s2 += ((populationTarget->ngeno * pow(populationTarget->af - pbar, 2))/nbar);
301 s2 += ((populationBackground->ngeno * pow(populationBackground->af - pbar, 2))/nbar);
302
303 // average heterozygosity
304
305 double hbar = (populationTarget->hfrq + populationBackground->hfrq) / 2;
306
307 //global af var
308 double pvar = pbar * (1 - pbar);
309
310 // a, b, c
311
312 double avar1 = nbar / nc;
313 double avar2 = 1 / (nbar -1) ;
314 double avar3 = pvar - (0.5*s2) - (0.25*hbar);
315 double avar = avar1 * (s2 - (avar2 * avar3));
316
317 double bvar1 = nbar / (nbar - 1);
318 double bvar2 = pvar - (0.5*s2) - (((2*nbar -1)/(4*nbar))*hbar);
319 double bvar = bvar1 * bvar2;
320
321 double cvar = 0.5*hbar;
322
323 double fst = avar / (avar+bvar+cvar);
324
325 cout << var.sequenceName << "\t" << var.position << "\t" << populationTarget->af << "\t" << populationBackground->af << "\t" << fst << endl ;
326
327 delete populationTarget;
328 delete populationBackground;
329
330 }
331 return 0;
332 }
333