1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 #include "Variant.h"
11 #include "split.h"
12 #include "cdflib.hpp"
13 #include "pdflib.hpp"
14 #include "var.hpp"
15 
16 #include <string>
17 #include <iostream>
18 #include <math.h>
19 #include <cmath>
20 #include <stdlib.h>
21 #include <time.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include "gpatInfo.hpp"
25 
26 using namespace std;
27 using namespace vcflib;
28 
29 
printHelp(void)30 void printHelp(void){
31   cerr << endl << endl;
32   cerr << "INFO: help" << endl;
33   cerr << "INFO: description:" << endl;
34   cerr << "      The sequenceDiversity program calculates two popular metrics of  haplotype diversity: pi and                                  " << endl;
35   cerr << "      extended haplotype homozygoisty (eHH).  Pi is calculated using the Nei and Li 1979 formulation.                               " << endl;
36   cerr << "      eHH a convenient way to think about haplotype diversity.  When eHH = 0 all haplotypes in the window                           " << endl;
37   cerr << "      are unique and when eHH = 1 all haplotypes in the window are identical.                           " << endl;
38 
39   cerr << endl;
40   cerr << "Output : 5 columns:"           << endl;
41   cerr << "         1.  seqid"            << endl;
42   cerr << "         2.  start of window"  << endl;
43   cerr << "         3.  end of window  "  << endl;
44   cerr << "         4.  pi             "  << endl;
45   cerr << "         5.  eHH            "  << endl;
46   cerr << endl << endl;
47   cerr << "INFO: usage: sequenceDiversity --target 0,1,2,3,4,5,6,7 --file my.vcf                                                                      " << endl;
48   cerr << endl;
49   cerr << "INFO: required: t,target     -- argument: a zero base comma separated list of target individuals corresponding to VCF columns        " << endl;
50   cerr << "INFO: required: f,file       -- argument: a properly formatted phased VCF file                                                       " << endl;
51   cerr << "INFO: required: y,type       -- argument: type of genotype likelihood: PL, GL or GP                                                  " << endl;
52   cerr << "INFO: optional: a,af         -- sites less than af  are filtered out; default is 0                                          " << endl;
53   cerr << "INFO: optional: r,region     -- argument: a tabix compliant region : \"seqid:0-100\" or \"seqid\"                                    " << endl;
54   cerr << "INFO: optional: w,window     -- argument: the number of SNPs per window; default is 20                                               " << endl;
55   cerr << endl << "Type: statistics" << endl << endl;
56   cerr << endl;
57 
58   printVersion();
59 
60   exit(1);
61 }
62 
clearHaplotypes(string ** haplotypes,int ntarget)63 void clearHaplotypes(string **haplotypes, int ntarget){
64   for(int i= 0; i < ntarget; i++){
65     haplotypes[i][0].clear();
66     haplotypes[i][1].clear();
67   }
68 }
69 
loadIndices(map<int,int> & index,string set)70 void loadIndices(map<int, int> & index, string set){
71 
72   vector<string>  indviduals = split(set, ",");
73   vector<string>::iterator it = indviduals.begin();
74 
75   for(; it != indviduals.end(); it++){
76     index[ atoi( (*it).c_str() ) ] = 1;
77   }
78 }
79 
pi(map<string,int> & hapWin,int nHaps,double * pi,double * eHH,int wlen)80 void pi(map<string, int> & hapWin, int nHaps, double * pi, double * eHH, int wlen){
81 
82   double nchooseSum = 0;
83   // summing over all possible haplotypes
84   for(std::map<string, int>::iterator it = hapWin.begin();
85       it != hapWin.end(); it++){
86     nchooseSum += r8_choose(it->second, 2);
87   }
88 
89   double piSum = 0;
90   // all unique pairwise
91   for(std::map<string, int>::iterator it = hapWin.begin();
92       it != hapWin.end(); it++){
93 
94     // advancing it
95     std::map<string, int>::iterator iz = it;
96     iz++;
97     for(;iz != hapWin.end(); iz++){
98       // different bases
99       int ndiff = 0;
100       for(int i = 0; i < it->first.size();i++){
101 	if(it->first[i] != iz->first[i]){
102 	  ndiff += 1;
103 	}
104       }
105       double f1 = double(it->second)/double(nHaps);
106       double f2 = double(iz->second)/double(nHaps);
107       double perBaseDiff = double(ndiff)/double(wlen);
108 
109       piSum += f1*f2*perBaseDiff;
110     }
111   }
112 
113 
114   *pi  = piSum;
115   *eHH = nchooseSum / r8_choose(nHaps, 2);
116 }
117 
118 
119 //calc(haplotypes, nsamples, positions, targetAFS, backgroundAFS, external, derived, windowSize, target_h, background_h, currentSeqid)
calc(string ** haplotypes,int nhaps,vector<long int> pos,vector<double> tafs,vector<double> bafs,int external,int derived,int window,vector<int> & target,vector<int> & background,string seqid)120 void calc(string **haplotypes, int nhaps, vector<long int> pos, vector<double> tafs, vector<double> bafs, int external, int derived, int window,  vector<int> & target, vector<int> & background, string seqid){
121 
122   if(haplotypes[0][0].length() < (window-1) ){
123     return;
124   }
125 
126   for(int snpA = 0; snpA < haplotypes[0][0].length() - window; snpA += 1){
127 
128     map <string, int> targetHaplotypes;
129 
130 
131     for(int targetIndex = 0; targetIndex < target.size(); targetIndex++ ){
132 
133       string haplotypeA;
134       string haplotypeB;
135 
136       haplotypeA += haplotypes[target[targetIndex]][0].substr(snpA, window) ;
137       haplotypeB += haplotypes[target[targetIndex]][1].substr(snpA, window) ;
138 
139       targetHaplotypes[haplotypeA]++;
140       targetHaplotypes[haplotypeB]++;
141 
142     }
143 
144     double piEst;
145     double eHH = 0;
146 
147     // target haplotype are the counts of the unique haplotypes
148 
149     int wlen = pos[snpA + window] - pos[snpA];
150 
151     pi(targetHaplotypes, target.size()*2, &piEst, &eHH, wlen);
152 
153     cout << seqid << "\t" << pos[snpA] << "\t" << pos[snpA + window] << "\t" << piEst << "\t" << eHH << endl;
154 
155   }
156 
157 }
158 
loadPhased(string ** haplotypes,genotype * pop,int ntarget)159 void loadPhased(string **haplotypes, genotype * pop, int ntarget){
160 
161   int indIndex = 0;
162 
163   for(vector<string>::iterator ind = pop->gts.begin(); ind != pop->gts.end(); ind++){
164     string g = (*ind);
165     vector< string > gs = split(g, "|");
166     haplotypes[indIndex][0].append(gs[0]);
167     haplotypes[indIndex][1].append(gs[1]);
168     indIndex += 1;
169   }
170 }
171 
main(int argc,char ** argv)172 int main(int argc, char** argv) {
173 
174   // set the random seed for MCMC
175 
176   srand((unsigned)time(NULL));
177 
178   // the filename
179 
180   string filename = "NA";
181 
182   // set region to scaffold
183 
184   string region = "NA";
185 
186   // using vcflib; thanks to Erik Garrison
187 
188   VariantCallFile variantFile;
189 
190   // zero based index for the target and background indivudals
191 
192   map<int, int> targetIndex, backgroundIndex;
193 
194   // deltaaf is the difference of allele frequency we bother to look at
195 
196   // ancestral state is set to zero by default
197 
198 
199   int counts = 0;
200 
201   // phased
202 
203   int phased   = 0;
204 
205   // use the background allele frequency
206 
207   int external = 0;
208 
209   // "11" vs "00"
210 
211   int derived = 0;
212 
213   int windowSize = 20;
214 
215   // allele frequency to filter out
216   double af_filt = 0;
217 
218   string type = "NA";
219 
220     const struct option longopts[] =
221       {
222 	{"version"     , 0, 0, 'v'},
223 	{"help"        , 0, 0, 'h'},
224         {"file"        , 1, 0, 'f'},
225 	{"target"      , 1, 0, 't'},
226 	{"region"      , 1, 0, 'r'},
227 	{"type"        , 1, 0, 'y'},
228 	{"window"      , 1, 0, 'w'},
229 	{"external"    , 1, 0, 'e'},
230 	{"af"          , 1, 0, 'a'},
231 	{"derived"     , 1, 0, 'd'},
232 	{0,0,0,0}
233       };
234 
235     int findex;
236     int iarg=0;
237 
238     while(iarg != -1)
239       {
240 	iarg = getopt_long(argc, argv, "a:w:y:r:t:b:f:edhv", longopts, &findex);
241 
242 	switch (iarg)
243 	  {
244 	  case 'h':
245 	    {
246 	      printHelp();
247 	      break;
248 	    }
249 	  case 'v':
250 	    {
251 	      printVersion();
252 	      break;
253 	    }
254 	  case 'y':
255 	    {
256 	      type = optarg;
257 	      break;
258 	    }
259 	  case 'a':
260 	    {
261 	      af_filt = atof(optarg);
262 	      cerr << "INFO: filtering out allele frequencies less than: " << af_filt << endl;
263 	      break;
264 	    }
265 	  case 't':
266 	    {
267 	      loadIndices(targetIndex, optarg);
268 	      cerr << "INFO: there are " << targetIndex.size() << " individuals in the target" << endl;
269 	      cerr << "INFO: target ids: " << optarg << endl;
270 	      break;
271 	    }
272 	  case 'f':
273 	    {
274 	      cerr << "INFO: file: " << optarg  <<  endl;
275 	      filename = optarg;
276 	      break;
277 	    }
278 	  case 'r':
279 	    {
280 	      cerr << "INFO: set seqid region to : " << optarg << endl;
281 	      region = optarg;
282 	      break;
283 	    }
284 	  case 'e':
285 	    {
286 	      external = 1;
287 	      cerr << "INFO: using background group\'s allele frequecy" << endl;
288 	      break;
289 	    }
290 	  case 'd':
291 	    {
292 	      derived == 1;
293 	      cerr << "INFO: count haplotypes \"11\" rather than \"00\"" << endl;
294 	      break;
295 	    }
296 	  case 'w':
297 	    {
298 	      string win = optarg;
299 	      windowSize = atof( win.c_str() );
300 	      break;
301 	    }
302 	  default :
303 	    break;
304 	  }
305       }
306 
307     map<string, int> okayGenotypeLikelihoods;
308     okayGenotypeLikelihoods["PL"] = 1;
309     okayGenotypeLikelihoods["GL"] = 1;
310     okayGenotypeLikelihoods["GP"] = 1;
311     okayGenotypeLikelihoods["GT"] = 1;
312 
313     if(targetIndex.size() < 2){
314       cerr << endl;
315       cerr << "FATAL: failed to specify a target - or - too few individuals in the target" << endl;
316       printHelp();
317       return 1;
318     }
319 
320     if(type == "NA"){
321       cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
322       printHelp();
323       return 1;
324     }
325     if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
326       cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
327       printHelp();
328       return 1;
329     }
330 
331     if(filename == "NA"){
332       cerr << "FATAL: did not specify a file" << endl;
333       printHelp();
334       return(1);
335     }
336 
337     cerr << "INFO: window size: " << windowSize << endl;
338 
339     variantFile.open(filename);
340 
341    if(region != "NA"){
342      variantFile.setRegion(region);
343    }
344 
345     if (!variantFile.is_open()) {
346         return 1;
347     }
348 
349     Variant var(variantFile);
350 
351     vector<string> samples = variantFile.sampleNames;
352     int nsamples = samples.size();
353 
354     vector<int> target_h, background_h;
355 
356     int index, indexi = 0;
357 
358     for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
359       string sampleName = (*samp);
360       if(targetIndex.find(index) != targetIndex.end() ){
361 	target_h.push_back(indexi);
362 	indexi++;
363       }
364       if(backgroundIndex.find(index) != backgroundIndex.end()){
365 	background_h.push_back(indexi);
366 	indexi++;
367       }
368       index++;
369     }
370 
371     vector<long int> positions;
372     vector<double>   targetAFS;
373     vector<double>   backgroundAFS;
374 
375     string **haplotypes = new string*[nsamples];
376 	for (int i = 0; i < nsamples; i++) {
377 	  haplotypes[i] = new string[2];
378 	}
379 
380     string currentSeqid = "NA";
381 
382     while (variantFile.getNextVariant(var)) {
383 
384       if(!var.isPhased()){
385 	cerr <<"FATAL: Found an unphased variant. All genotypes must be phased!" << endl;
386 	printHelp();
387 	return(1);
388       }
389       if(var.alt.size() > 1){
390 	continue;
391       }
392       if(currentSeqid != var.sequenceName){
393 	if(haplotypes[0][0].length() > windowSize){
394 	  calc(haplotypes, nsamples, positions, targetAFS, backgroundAFS, external, derived, windowSize, target_h, background_h, currentSeqid);
395 	}
396 	clearHaplotypes(haplotypes, nsamples);
397 	positions.clear();
398 	currentSeqid = var.sequenceName;
399 	targetAFS.clear();
400 	backgroundAFS.clear();
401       }
402 
403 
404       vector < map< string, vector<string> > > target, background, total;
405 
406       int sindex = 0;
407 
408       for(int nsamp = 0; nsamp < nsamples; nsamp++){
409 
410 	map<string, vector<string> > sample = var.samples[ samples[nsamp]];
411 
412 	if(targetIndex.find(sindex) != targetIndex.end() ){
413 	  target.push_back(sample);
414 	  total.push_back(sample);
415 	}
416 	if(backgroundIndex.find(sindex) != backgroundIndex.end()){
417 	  background.push_back(sample);
418 	  total.push_back(sample);
419 	}
420 	sindex += 1;
421       }
422 
423       genotype * populationTarget    ;
424       genotype * populationBackground;
425       genotype * populationTotal     ;
426 
427       if(type == "PL"){
428 	populationTarget     = new pl();
429 	populationBackground = new pl();
430 	populationTotal      = new pl();
431       }
432       if(type == "GL"){
433 	populationTarget     = new gl();
434 	populationBackground = new gl();
435 	populationTotal      = new gl();
436       }
437       if(type == "GP"){
438 	populationTarget     = new gp();
439 	populationBackground = new gp();
440 	populationTotal      = new gp();
441       }
442       if(type == "GT"){
443         populationTarget     = new gt();
444         populationBackground = new gt();
445         populationTotal      = new gt();
446       }
447 
448       populationTarget->loadPop(target,         var.sequenceName, var.position);
449 
450       populationBackground->loadPop(background, var.sequenceName, var.position);
451 
452       populationTotal->loadPop(total,           var.sequenceName, var.position);
453 
454       if(populationTotal->af < af_filt){
455 
456 	delete populationTarget;
457 	delete populationBackground;
458 	delete populationTotal;
459 	continue;
460       }
461 
462       targetAFS.push_back(populationTarget->af);
463       backgroundAFS.push_back(populationBackground->af);
464       positions.push_back(var.position);
465       loadPhased(haplotypes, populationTotal, nsamples);
466 
467     }
468 
469     calc(haplotypes, nsamples, positions, targetAFS, backgroundAFS, external, derived, windowSize, target_h, background_h, currentSeqid);
470 
471     return 0;
472 }
473