1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10 #include "Variant.h"
11 #include "split.h"
12 #include "cdflib.hpp"
13 #include "pdflib.hpp"
14 #include "var.hpp"
15
16 #include <string>
17 #include <iostream>
18 #include <math.h>
19 #include <cmath>
20 #include <stdlib.h>
21 #include <time.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include "gpatInfo.hpp"
25
26 using namespace std;
27 using namespace vcflib;
28
29
printHelp(void)30 void printHelp(void){
31 cerr << endl << endl;
32 cerr << "INFO: help" << endl;
33 cerr << "INFO: description:" << endl;
34 cerr << " The sequenceDiversity program calculates two popular metrics of haplotype diversity: pi and " << endl;
35 cerr << " extended haplotype homozygoisty (eHH). Pi is calculated using the Nei and Li 1979 formulation. " << endl;
36 cerr << " eHH a convenient way to think about haplotype diversity. When eHH = 0 all haplotypes in the window " << endl;
37 cerr << " are unique and when eHH = 1 all haplotypes in the window are identical. " << endl;
38
39 cerr << endl;
40 cerr << "Output : 5 columns:" << endl;
41 cerr << " 1. seqid" << endl;
42 cerr << " 2. start of window" << endl;
43 cerr << " 3. end of window " << endl;
44 cerr << " 4. pi " << endl;
45 cerr << " 5. eHH " << endl;
46 cerr << endl << endl;
47 cerr << "INFO: usage: sequenceDiversity --target 0,1,2,3,4,5,6,7 --file my.vcf " << endl;
48 cerr << endl;
49 cerr << "INFO: required: t,target -- argument: a zero base comma separated list of target individuals corresponding to VCF columns " << endl;
50 cerr << "INFO: required: f,file -- argument: a properly formatted phased VCF file " << endl;
51 cerr << "INFO: required: y,type -- argument: type of genotype likelihood: PL, GL or GP " << endl;
52 cerr << "INFO: optional: a,af -- sites less than af are filtered out; default is 0 " << endl;
53 cerr << "INFO: optional: r,region -- argument: a tabix compliant region : \"seqid:0-100\" or \"seqid\" " << endl;
54 cerr << "INFO: optional: w,window -- argument: the number of SNPs per window; default is 20 " << endl;
55 cerr << endl << "Type: statistics" << endl << endl;
56 cerr << endl;
57
58 printVersion();
59
60 exit(1);
61 }
62
clearHaplotypes(string ** haplotypes,int ntarget)63 void clearHaplotypes(string **haplotypes, int ntarget){
64 for(int i= 0; i < ntarget; i++){
65 haplotypes[i][0].clear();
66 haplotypes[i][1].clear();
67 }
68 }
69
loadIndices(map<int,int> & index,string set)70 void loadIndices(map<int, int> & index, string set){
71
72 vector<string> indviduals = split(set, ",");
73 vector<string>::iterator it = indviduals.begin();
74
75 for(; it != indviduals.end(); it++){
76 index[ atoi( (*it).c_str() ) ] = 1;
77 }
78 }
79
pi(map<string,int> & hapWin,int nHaps,double * pi,double * eHH,int wlen)80 void pi(map<string, int> & hapWin, int nHaps, double * pi, double * eHH, int wlen){
81
82 double nchooseSum = 0;
83 // summing over all possible haplotypes
84 for(std::map<string, int>::iterator it = hapWin.begin();
85 it != hapWin.end(); it++){
86 nchooseSum += r8_choose(it->second, 2);
87 }
88
89 double piSum = 0;
90 // all unique pairwise
91 for(std::map<string, int>::iterator it = hapWin.begin();
92 it != hapWin.end(); it++){
93
94 // advancing it
95 std::map<string, int>::iterator iz = it;
96 iz++;
97 for(;iz != hapWin.end(); iz++){
98 // different bases
99 int ndiff = 0;
100 for(int i = 0; i < it->first.size();i++){
101 if(it->first[i] != iz->first[i]){
102 ndiff += 1;
103 }
104 }
105 double f1 = double(it->second)/double(nHaps);
106 double f2 = double(iz->second)/double(nHaps);
107 double perBaseDiff = double(ndiff)/double(wlen);
108
109 piSum += f1*f2*perBaseDiff;
110 }
111 }
112
113
114 *pi = piSum;
115 *eHH = nchooseSum / r8_choose(nHaps, 2);
116 }
117
118
119 //calc(haplotypes, nsamples, positions, targetAFS, backgroundAFS, external, derived, windowSize, target_h, background_h, currentSeqid)
calc(string ** haplotypes,int nhaps,vector<long int> pos,vector<double> tafs,vector<double> bafs,int external,int derived,int window,vector<int> & target,vector<int> & background,string seqid)120 void calc(string **haplotypes, int nhaps, vector<long int> pos, vector<double> tafs, vector<double> bafs, int external, int derived, int window, vector<int> & target, vector<int> & background, string seqid){
121
122 if(haplotypes[0][0].length() < (window-1) ){
123 return;
124 }
125
126 for(int snpA = 0; snpA < haplotypes[0][0].length() - window; snpA += 1){
127
128 map <string, int> targetHaplotypes;
129
130
131 for(int targetIndex = 0; targetIndex < target.size(); targetIndex++ ){
132
133 string haplotypeA;
134 string haplotypeB;
135
136 haplotypeA += haplotypes[target[targetIndex]][0].substr(snpA, window) ;
137 haplotypeB += haplotypes[target[targetIndex]][1].substr(snpA, window) ;
138
139 targetHaplotypes[haplotypeA]++;
140 targetHaplotypes[haplotypeB]++;
141
142 }
143
144 double piEst;
145 double eHH = 0;
146
147 // target haplotype are the counts of the unique haplotypes
148
149 int wlen = pos[snpA + window] - pos[snpA];
150
151 pi(targetHaplotypes, target.size()*2, &piEst, &eHH, wlen);
152
153 cout << seqid << "\t" << pos[snpA] << "\t" << pos[snpA + window] << "\t" << piEst << "\t" << eHH << endl;
154
155 }
156
157 }
158
loadPhased(string ** haplotypes,genotype * pop,int ntarget)159 void loadPhased(string **haplotypes, genotype * pop, int ntarget){
160
161 int indIndex = 0;
162
163 for(vector<string>::iterator ind = pop->gts.begin(); ind != pop->gts.end(); ind++){
164 string g = (*ind);
165 vector< string > gs = split(g, "|");
166 haplotypes[indIndex][0].append(gs[0]);
167 haplotypes[indIndex][1].append(gs[1]);
168 indIndex += 1;
169 }
170 }
171
main(int argc,char ** argv)172 int main(int argc, char** argv) {
173
174 // set the random seed for MCMC
175
176 srand((unsigned)time(NULL));
177
178 // the filename
179
180 string filename = "NA";
181
182 // set region to scaffold
183
184 string region = "NA";
185
186 // using vcflib; thanks to Erik Garrison
187
188 VariantCallFile variantFile;
189
190 // zero based index for the target and background indivudals
191
192 map<int, int> targetIndex, backgroundIndex;
193
194 // deltaaf is the difference of allele frequency we bother to look at
195
196 // ancestral state is set to zero by default
197
198
199 int counts = 0;
200
201 // phased
202
203 int phased = 0;
204
205 // use the background allele frequency
206
207 int external = 0;
208
209 // "11" vs "00"
210
211 int derived = 0;
212
213 int windowSize = 20;
214
215 // allele frequency to filter out
216 double af_filt = 0;
217
218 string type = "NA";
219
220 const struct option longopts[] =
221 {
222 {"version" , 0, 0, 'v'},
223 {"help" , 0, 0, 'h'},
224 {"file" , 1, 0, 'f'},
225 {"target" , 1, 0, 't'},
226 {"region" , 1, 0, 'r'},
227 {"type" , 1, 0, 'y'},
228 {"window" , 1, 0, 'w'},
229 {"external" , 1, 0, 'e'},
230 {"af" , 1, 0, 'a'},
231 {"derived" , 1, 0, 'd'},
232 {0,0,0,0}
233 };
234
235 int findex;
236 int iarg=0;
237
238 while(iarg != -1)
239 {
240 iarg = getopt_long(argc, argv, "a:w:y:r:t:b:f:edhv", longopts, &findex);
241
242 switch (iarg)
243 {
244 case 'h':
245 {
246 printHelp();
247 break;
248 }
249 case 'v':
250 {
251 printVersion();
252 break;
253 }
254 case 'y':
255 {
256 type = optarg;
257 break;
258 }
259 case 'a':
260 {
261 af_filt = atof(optarg);
262 cerr << "INFO: filtering out allele frequencies less than: " << af_filt << endl;
263 break;
264 }
265 case 't':
266 {
267 loadIndices(targetIndex, optarg);
268 cerr << "INFO: there are " << targetIndex.size() << " individuals in the target" << endl;
269 cerr << "INFO: target ids: " << optarg << endl;
270 break;
271 }
272 case 'f':
273 {
274 cerr << "INFO: file: " << optarg << endl;
275 filename = optarg;
276 break;
277 }
278 case 'r':
279 {
280 cerr << "INFO: set seqid region to : " << optarg << endl;
281 region = optarg;
282 break;
283 }
284 case 'e':
285 {
286 external = 1;
287 cerr << "INFO: using background group\'s allele frequecy" << endl;
288 break;
289 }
290 case 'd':
291 {
292 derived == 1;
293 cerr << "INFO: count haplotypes \"11\" rather than \"00\"" << endl;
294 break;
295 }
296 case 'w':
297 {
298 string win = optarg;
299 windowSize = atof( win.c_str() );
300 break;
301 }
302 default :
303 break;
304 }
305 }
306
307 map<string, int> okayGenotypeLikelihoods;
308 okayGenotypeLikelihoods["PL"] = 1;
309 okayGenotypeLikelihoods["GL"] = 1;
310 okayGenotypeLikelihoods["GP"] = 1;
311 okayGenotypeLikelihoods["GT"] = 1;
312
313 if(targetIndex.size() < 2){
314 cerr << endl;
315 cerr << "FATAL: failed to specify a target - or - too few individuals in the target" << endl;
316 printHelp();
317 return 1;
318 }
319
320 if(type == "NA"){
321 cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
322 printHelp();
323 return 1;
324 }
325 if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
326 cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
327 printHelp();
328 return 1;
329 }
330
331 if(filename == "NA"){
332 cerr << "FATAL: did not specify a file" << endl;
333 printHelp();
334 return(1);
335 }
336
337 cerr << "INFO: window size: " << windowSize << endl;
338
339 variantFile.open(filename);
340
341 if(region != "NA"){
342 variantFile.setRegion(region);
343 }
344
345 if (!variantFile.is_open()) {
346 return 1;
347 }
348
349 Variant var(variantFile);
350
351 vector<string> samples = variantFile.sampleNames;
352 int nsamples = samples.size();
353
354 vector<int> target_h, background_h;
355
356 int index, indexi = 0;
357
358 for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
359 string sampleName = (*samp);
360 if(targetIndex.find(index) != targetIndex.end() ){
361 target_h.push_back(indexi);
362 indexi++;
363 }
364 if(backgroundIndex.find(index) != backgroundIndex.end()){
365 background_h.push_back(indexi);
366 indexi++;
367 }
368 index++;
369 }
370
371 vector<long int> positions;
372 vector<double> targetAFS;
373 vector<double> backgroundAFS;
374
375 string **haplotypes = new string*[nsamples];
376 for (int i = 0; i < nsamples; i++) {
377 haplotypes[i] = new string[2];
378 }
379
380 string currentSeqid = "NA";
381
382 while (variantFile.getNextVariant(var)) {
383
384 if(!var.isPhased()){
385 cerr <<"FATAL: Found an unphased variant. All genotypes must be phased!" << endl;
386 printHelp();
387 return(1);
388 }
389 if(var.alt.size() > 1){
390 continue;
391 }
392 if(currentSeqid != var.sequenceName){
393 if(haplotypes[0][0].length() > windowSize){
394 calc(haplotypes, nsamples, positions, targetAFS, backgroundAFS, external, derived, windowSize, target_h, background_h, currentSeqid);
395 }
396 clearHaplotypes(haplotypes, nsamples);
397 positions.clear();
398 currentSeqid = var.sequenceName;
399 targetAFS.clear();
400 backgroundAFS.clear();
401 }
402
403
404 vector < map< string, vector<string> > > target, background, total;
405
406 int sindex = 0;
407
408 for(int nsamp = 0; nsamp < nsamples; nsamp++){
409
410 map<string, vector<string> > sample = var.samples[ samples[nsamp]];
411
412 if(targetIndex.find(sindex) != targetIndex.end() ){
413 target.push_back(sample);
414 total.push_back(sample);
415 }
416 if(backgroundIndex.find(sindex) != backgroundIndex.end()){
417 background.push_back(sample);
418 total.push_back(sample);
419 }
420 sindex += 1;
421 }
422
423 genotype * populationTarget ;
424 genotype * populationBackground;
425 genotype * populationTotal ;
426
427 if(type == "PL"){
428 populationTarget = new pl();
429 populationBackground = new pl();
430 populationTotal = new pl();
431 }
432 if(type == "GL"){
433 populationTarget = new gl();
434 populationBackground = new gl();
435 populationTotal = new gl();
436 }
437 if(type == "GP"){
438 populationTarget = new gp();
439 populationBackground = new gp();
440 populationTotal = new gp();
441 }
442 if(type == "GT"){
443 populationTarget = new gt();
444 populationBackground = new gt();
445 populationTotal = new gt();
446 }
447
448 populationTarget->loadPop(target, var.sequenceName, var.position);
449
450 populationBackground->loadPop(background, var.sequenceName, var.position);
451
452 populationTotal->loadPop(total, var.sequenceName, var.position);
453
454 if(populationTotal->af < af_filt){
455
456 delete populationTarget;
457 delete populationBackground;
458 delete populationTotal;
459 continue;
460 }
461
462 targetAFS.push_back(populationTarget->af);
463 backgroundAFS.push_back(populationBackground->af);
464 positions.push_back(var.position);
465 loadPhased(haplotypes, populationTotal, nsamples);
466
467 }
468
469 calc(haplotypes, nsamples, positions, targetAFS, backgroundAFS, external, derived, windowSize, target_h, background_h, currentSeqid);
470
471 return 0;
472 }
473