1 #include "Variant.h"
2 #include "split.h"
3 #include "cdflib.hpp"
4 #include "pdflib.hpp"
5 #include "var.hpp"
6 
7 #include <string>
8 #include <iostream>
9 #include <math.h>
10 #include <cmath>
11 #include <stdlib.h>
12 #include <time.h>
13 #include <stdio.h>
14 #include <getopt.h>
15 #include "gpatInfo.hpp"
16 
17 using namespace std;
18 using namespace vcflib;
19 
20 
printHelp(void)21 void printHelp(void){
22 
23   cerr << endl << endl;
24   cerr << "INFO: help" << endl;
25   cerr << "INFO: description:" << endl;
26   cerr << "      wcFst is Weir & Cockerham's Fst for two populations.  Negative values are VALID,  " << endl;
27   cerr << "      they are sites which can be treated as zero Fst. For more information see Evolution, Vol. 38 N. 6 Nov 1984. " << endl;
28   cerr << "      Specifically wcFst uses equations 1,2,3,4.                                                              " << endl << endl;
29 
30   cerr << "Output : 3 columns :                 "    << endl;
31   cerr << "     1. seqid                        "    << endl;
32   cerr << "     2. position                     "    << endl;
33   cerr << "     3. target allele frequency      "    << endl;
34   cerr << "     4. background allele frequency  "    << endl;
35   cerr << "     5. wcFst                        "    << endl  << endl;
36 
37   cerr << "INFO: usage:  wcFst --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1 --type PL                  " << endl;
38   cerr << endl;
39   cerr << "INFO: required: t,target     -- argument: a zero based comma separated list of target individuals corrisponding to VCF columns        " << endl;
40   cerr << "INFO: required: b,background -- argument: a zero based comma separated list of background individuals corrisponding to VCF columns    " << endl;
41   cerr << "INFO: required: f,file       -- argument: proper formatted VCF                                                                        " << endl;
42   cerr << "INFO: required, y,type       -- argument: genotype likelihood format; genotype : GT,GL,PL,GP                                             " << endl;
43   cerr << "INFO: optional: r,region     -- argument: a tabix compliant genomic range: seqid or seqid:start-end                                   " << endl;
44   cerr << "INFO: optional: d,deltaaf    -- argument: skip sites where the difference in allele frequencies is less than deltaaf, default is zero " << endl;
45 
46   printVersion();
47 }
48 
49 
bound(double v)50 double bound(double v){
51   if(v <= 0.00001){
52     return  0.00001;
53   }
54   if(v >= 0.99999){
55     return 0.99999;
56   }
57   return v;
58 }
59 
loadIndices(map<int,int> & index,string set)60 void loadIndices(map<int, int> & index, string set){
61 
62   vector<string>  indviduals = split(set, ",");
63 
64   vector<string>::iterator it = indviduals.begin();
65 
66   for(; it != indviduals.end(); it++){
67     index[ atoi( (*it).c_str() ) ] = 1;
68   }
69 }
70 
71 
main(int argc,char ** argv)72 int main(int argc, char** argv) {
73 
74   // set the random seed for MCMC
75 
76   srand((unsigned)time(NULL));
77 
78   // the filename
79 
80   string filename = "NA";
81 
82   // set region to scaffold
83 
84   string region = "NA";
85 
86   // using vcflib; thanks to Erik Garrison
87 
88   VariantCallFile variantFile;
89 
90   // zero based index for the target and background indivudals
91 
92   map<int, int> it, ib;
93 
94   // deltaaf is the difference of allele frequency we bother to look at
95 
96   string deltaaf ;
97   double daf  = 0.00;
98 
99   // genotype likelihood format
100 
101   string type = "NA";
102 
103 
104     const struct option longopts[] =
105       {
106 	{"version"   , 0, 0, 'v'},
107 	{"help"      , 0, 0, 'h'},
108         {"file"      , 1, 0, 'f'},
109 	{"target"    , 1, 0, 't'},
110 	{"background", 1, 0, 'b'},
111 	{"deltaaf"   , 1, 0, 'd'},
112 	{"type"      , 1, 0, 'y'},
113 	{"region"    , 1, 0, 'r'},
114 	{0,0,0,0}
115       };
116 
117     int index;
118     int iarg=0;
119 
120     while(iarg != -1)
121       {
122 	iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chv", longopts, &index);
123 
124 	switch (iarg)
125 	  {
126 	  case 'h':
127 	    printHelp();
128 	    return 0;
129 	  case 'v':
130 	    printVersion();
131 	    return 0;
132 	  case 't':
133 	    loadIndices(it, optarg);
134 	    cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
135 	    cerr << "INFO: target ids: " << optarg << endl;
136 	    break;
137 	  case 'b':
138 	    loadIndices(ib, optarg);
139 	    cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
140 	    cerr << "INFO: background ids: " << optarg << endl;
141 	    break;
142 	  case 'f':
143 	    cerr << "INFO: file: " << optarg  <<  endl;
144 	    filename = optarg;
145 	    break;
146 	  case 'd':
147 	    cerr << "INFO: only scoring sites where the allele frequency difference is greater than: " << optarg << endl;
148 	    deltaaf = optarg;
149 	    daf = atof(deltaaf.c_str());
150 	    break;
151 	  case 'y':
152 	    type = optarg;
153 	    cerr << "INFO: setting genotype likelihood format to: " << type << endl;
154 	    break;
155 	  case 'r':
156             cerr << "INFO: set seqid region to : " << optarg << endl;
157 	    region = optarg;
158 	    break;
159 	  default:
160 	    break;
161 	  }
162       }
163 
164     if(filename == "NA"){
165       cerr << "FATAL: did not specify a required option: file" << endl;
166       printHelp();
167       exit(1);
168     }
169 
170     variantFile.open(filename);
171 
172     if(region != "NA"){
173       if(! variantFile.setRegion(region)){
174 	cerr << "FATAL: unable to set region" << endl;
175 	return 1;
176       }
177     }
178 
179     if (!variantFile.is_open()) {
180       cerr << "FATAL: could not open VCF for reading" << endl;
181       printHelp();
182       return 1;
183     }
184 
185     map<string, int> okayGenotypeLikelihoods;
186     okayGenotypeLikelihoods["PL"] = 1;
187     okayGenotypeLikelihoods["GL"] = 1;
188     okayGenotypeLikelihoods["GP"] = 1;
189     okayGenotypeLikelihoods["GT"] = 1;
190 
191     if(type == "NA"){
192       cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
193       printHelp();
194       return 1;
195     }
196     if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
197       cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
198       printHelp();
199       return 1;
200     }
201 
202     Variant var(variantFile);
203 
204     vector<string> samples = variantFile.sampleNames;
205     int nsamples = samples.size();
206 
207     while (variantFile.getNextVariant(var)) {
208 
209 	// biallelic sites naturally
210 
211 	if(var.alt.size() > 1){
212 	  continue;
213 	}
214 
215 	vector < map< string, vector<string> > > target, background, total;
216 
217 	int index = 0;
218 
219 	for(int nsamp = 0; nsamp < nsamples; nsamp++){
220 
221           map<string, vector<string> > sample = var.samples[ samples[nsamp]];
222 
223 	    if(sample["GT"].front() != "./."){
224 	      if(it.find(index) != it.end() ){
225 		target.push_back(sample);
226 	      }
227 	      if(ib.find(index) != ib.end()){
228 		background.push_back(sample);
229 	      }
230 	    }
231 	    index += 1;
232 	}
233 
234 
235 	if(target.size() < 5 || background.size() < 5){
236 	  continue;
237 	}
238 
239 	genotype * populationTarget      ;
240 	genotype * populationBackground  ;
241 
242 	if(type == "PL"){
243 	  populationTarget      = new pl();
244 	  populationBackground  = new pl();
245 	}
246 	if(type == "GL"){
247 	  populationTarget     = new gl();
248 	  populationBackground = new gl();
249 	}
250 	if(type == "GP"){
251 	  populationTarget     = new gp();
252 	  populationBackground = new gp();
253 	}
254 	if(type == "GT"){
255           populationTarget     = new gt();
256           populationBackground = new gt();
257         }
258 
259 	populationTarget->loadPop(target, var.sequenceName, var.position);
260 	populationBackground->loadPop(background, var.sequenceName, var.position);
261 
262 	if(populationTarget->af == -1 || populationBackground->af == -1){
263 	  delete populationTarget;
264 	  delete populationBackground;
265 	  continue;
266 	}
267 	if(populationTarget->af == 1 &&  populationBackground->af == 1){
268 	  delete populationTarget;
269           delete populationBackground;
270 	  continue;
271 	}
272 	if(populationTarget->af == 0 &&  populationBackground->af == 0){
273 	  delete populationTarget;
274           delete populationBackground;
275 	  continue;
276 	}
277 
278 	double afdiff = abs(populationTarget->af - populationBackground->af);
279 
280         if(afdiff < daf){
281 	  delete populationTarget;
282           delete populationBackground;
283           continue;
284         }
285 
286 	// pg 1360 B.S Weir and C.C. Cockerham 1984
287 	double nbar = ( populationTarget->ngeno / 2 ) + (populationBackground->ngeno / 2);
288 	double rn   = 2*nbar;
289 
290 	// special case of only two populations
291 	double nc   =  rn ;
292 	nc -= (pow(populationTarget->ngeno,2)/rn);
293 	nc -= (pow(populationBackground->ngeno,2)/rn);
294 	// average sample frequency
295 	double pbar = (populationTarget->af + populationBackground->af) / 2;
296 
297 	// sample variance of allele A frequences over the population
298 
299 	double s2 = 0;
300 	s2 += ((populationTarget->ngeno * pow(populationTarget->af - pbar, 2))/nbar);
301 	s2 += ((populationBackground->ngeno * pow(populationBackground->af - pbar, 2))/nbar);
302 
303 	// average heterozygosity
304 
305 	double hbar = (populationTarget->hfrq + populationBackground->hfrq) / 2;
306 
307 	//global af var
308 	double pvar = pbar * (1 - pbar);
309 
310 	// a, b, c
311 
312 	double avar1 = nbar / nc;
313 	double avar2 = 1 / (nbar -1) ;
314 	double avar3 = pvar - (0.5*s2) - (0.25*hbar);
315 	double avar  = avar1 * (s2 - (avar2 * avar3));
316 
317 	double bvar1 = nbar / (nbar - 1);
318 	double bvar2 = pvar - (0.5*s2) - (((2*nbar -1)/(4*nbar))*hbar);
319 	double bvar  = bvar1 * bvar2;
320 
321 	double cvar = 0.5*hbar;
322 
323 	double fst = avar / (avar+bvar+cvar);
324 
325 	cout << var.sequenceName << "\t"  << var.position << "\t" << populationTarget->af << "\t" << populationBackground->af << "\t" << fst << endl ;
326 
327 	delete populationTarget;
328 	delete populationBackground;
329 
330     }
331     return 0;
332 }
333