1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 #include "Variant.h"
11 #include "split.h"
12 #include "cdflib.hpp"
13 #include "pdflib.hpp"
14 #include "var.hpp"
15 
16 #include <string>
17 #include <iostream>
18 #include <math.h>
19 #include <cmath>
20 #include <stdlib.h>
21 #include <time.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include "gpatInfo.hpp"
25 // maaas speed
26 
27 #if defined HAS_OPENMP
28 #include <omp.h>
29 // print lock
30 omp_lock_t lock;
31 #endif
32 
33 
34 struct opts{
35   int         threads             ;
36   std::string filename            ;
37   std::string mapFile             ;
38   std::string seqid               ;
39   std::string geneticMapFile      ;
40   std::string type                ;
41   std::string region              ;
42   std::map<int, double> geneticMap;
43   double      af                  ;
44 
45 }globalOpts;
46 
47 
48 using namespace std;
49 using namespace vcflib;
50 
printHelp(void)51 void printHelp(void){
52   cerr << R"(
53 iHS calculates the integrated haplotype score which measures the relative decay of extended haplotype homozygosity (EHH) for the reference and alternative alleles at a site (see: voight et al. 2006, Spiech & Hernandez 2014).
54 
55 Our code is highly concordant with both implementations mentioned. However, we do not set an upper limit to the allele frequency.  iHS can be run without a genetic map, in which case the change in EHH is integrated over a constant.  Human genetic maps for GRCh36 and GRCh37 (hg18 & hg19) can be found at: http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/ . iHS by default interpolates SNV positions to genetic position (you don't need a genetic position for every VCF entry in the map file).
56 
57 iHS analyses requires normalization by allele frequency.  It is important that iHS is calculated over large regions so that the normalization does not down weight real signals.  For genome-wide runs it is recommended to run slightly overlapping windows and throwing out values that fail integration (columns 7 & 8 in the output) and then removing duplicates by using the 'sort' and 'uniq' linux commands.  Normalization of the output is as simple as running 'normalize-iHS'.
58 
59 INFO: help
60 INFO: description:
61      iHS calculates the integrated ratio of haplotype decay between the reference and non-reference allele.
62 Output : 4 columns :
63      1. seqid
64      2. position
65      3. target allele frequency
66      4. integrated EHH (alternative)
67      5. integrated EHH (reference)
68      6. iHS ln(iEHHalt/iEHHref)
69      7. != 0 integration failure
70      8. != 0 integration failure
71 
72 Usage: iHS --target 0,1,2,3,4,5,6,7 --file my.phased.vcf  \
73            --region chr1:1-1000 > STDOUT 2> STDERR
74 
75 Params:
76        required: t,target  <STRING>  A zero base comma separated list of target
77                                      individuals corresponding to VCF columns
78        required: r,region  <STRING>  A tabix compliant genomic range
79                                      format: "seqid:start-end" or "seqid"
80        required: f,file    <STRING>  Proper formatted and phased VCF.
81        required: y,type    <STRING>  Genotype likelihood format: GT,PL,GL,GP
82        optional: a,af      <DOUBLE>  Alternative alleles with frquences less
83                                      than [0.05] are skipped.
84        optional: x,threads <INT>     Number of CPUS [1].
85        recommended: g,gen <STRING>   A PLINK formatted map file.
86 
87 )" << endl ;
88   cerr << endl << "Type: statistics" << endl << endl;
89   cerr << endl;
90 
91   printVersion();
92 
93   exit(1);
94 }
95 
96 
gDist(int start,int end,double * gd)97 bool gDist(int start, int end, double * gd){
98 
99   if(globalOpts.geneticMap.find(start) == globalOpts.geneticMap.end()){
100     return false;
101   }
102   if(globalOpts.geneticMap.find(end) == globalOpts.geneticMap.end()){
103     return false;
104   }
105   *gd = abs(globalOpts.geneticMap[start] - globalOpts.geneticMap[end]);
106   return true;
107 }
108 
loadGeneticMap(int start,int end)109 void loadGeneticMap(int start, int end){
110 
111   if(globalOpts.geneticMapFile.empty()){
112     std::cerr << "WARNING: No genetic map." << std::endl;
113     std::cerr << "WARNING: A constant genetic distance is being used: 0.001." << std::endl;
114     return;
115   }
116 
117   ifstream featureFile (globalOpts.geneticMapFile.c_str());
118 
119   string line;
120 
121   int lastpos      = 0;
122   double lastvalue = 0;
123 
124   if(featureFile.is_open()){
125 
126     while(getline(featureFile, line)){
127 
128       vector<string> region = split(line, "\t");
129 
130       if(region.front() != globalOpts.seqid){
131 	std::cerr << "WARNING: seqid MisMatch: " << region.front() << " " << globalOpts.seqid << std::endl;
132 	continue;
133       }
134 
135       int   pos = atoi(region[3].c_str()) ;
136       double cm = atof(region[2].c_str()) ;
137 
138       if(lastpos == 0 && start > pos){
139 	lastpos = pos;
140 	continue;
141       }
142 
143       int diff     = abs(pos - lastpos);
144       double vdiff = abs(lastvalue - cm );
145       double chunk = vdiff/double(diff);
146 
147       double running = lastvalue;
148 
149       for(int i = lastpos; i < pos; i++){
150 	globalOpts.geneticMap[i] = running;
151 	running += chunk;
152       }
153 
154       if(pos > end){
155 	break;
156       }
157 
158 
159       lastpos = pos;
160       lastvalue = cm;
161     }
162   }
163 
164   featureFile.close();
165 
166   if(globalOpts.geneticMap.size() < 1){
167     std::cerr << "FATAL: Problem loading genetic map" << std::endl;
168     exit(1);
169   }
170 }
171 
172 
clearHaplotypes(string ** haplotypes,int ntarget)173 void clearHaplotypes(string **haplotypes, int ntarget){
174   for(int i= 0; i < ntarget; i++){
175     haplotypes[i][0].clear();
176     haplotypes[i][1].clear();
177   }
178 }
179 
loadIndices(map<int,int> & index,string set)180 void loadIndices(map<int, int> & index, string set){
181 
182   vector<string>  indviduals = split(set, ",");
183   vector<string>::iterator it = indviduals.begin();
184 
185   for(; it != indviduals.end(); it++){
186     index[ atoi( (*it).c_str() ) ] = 1;
187   }
188 }
189 
countHaps(int nhaps,map<string,int> & targetH,string ** haplotypes,int start,int end)190 void countHaps(int nhaps, map<string, int> & targetH,
191 	       string **haplotypes, int start, int end){
192 
193   for(int i = 0; i < nhaps; i++){
194 
195     std::string h1 =  haplotypes[i][0].substr(start, (end - start)) ;
196     std::string h2 =  haplotypes[i][1].substr(start, (end - start)) ;
197 
198     if(targetH.find(h1)  == targetH.end()){
199       targetH[h1] = 1;
200     }
201     else{
202       targetH[h1]++;
203     }
204     if(targetH.find(h2)  == targetH.end()){
205       targetH[h2] = 1;
206     }
207     else{
208       targetH[h2]++;
209     }
210   }
211 }
212 
computeNs(map<string,int> & targetH,int start,int end,double * sumT,char ref,bool dir)213 void computeNs(map<string, int> & targetH, int start,
214 	       int end, double * sumT, char ref, bool dir){
215 
216   for( map<string, int>::iterator th = targetH.begin();
217        th != targetH.end(); th++){
218 
219     if(th->second < 2){
220       continue;
221     }
222 
223 
224     // end is extending ; check first base
225     if(dir){
226       if( th->first[0] == ref){
227 
228 	//	std::cerr << "count dat: " << th->first << " " << th->second << " " << ref << " " << dir << endl;
229 
230 
231 	*sumT += r8_choose(th->second, 2);
232       }
233     }
234 
235     // start is extending ; check last base
236     else{
237 
238       int last = th->first.size() -1;
239       if( th->first[last] == ref ){
240 	//	std::cerr << "count dat:" << th->first << " " << th->second << " " << ref << " " << dir << endl;
241 
242 
243       	*sumT += r8_choose(th->second, 2);
244       }
245     }
246   }
247 }
248 
calcEhh(string ** haplotypes,int start,int end,char ref,int nhaps,double * ehh,double div,bool dir)249 bool calcEhh(string **haplotypes, int start,
250 	     int end, char ref, int nhaps,
251 	     double * ehh, double  div, bool dir){
252 
253   double sum = 0 ;
254   map<string , int> refH;
255 
256   countHaps(nhaps, refH, haplotypes, start, end);
257   computeNs(refH, start, end, &sum, ref, dir   );
258 
259   double internalEHH = sum / (r8_choose(div, 2));
260 
261   if(internalEHH > 1){
262     std::cerr << "FATAL: internal error." << std::endl;
263     exit(1);
264   }
265 
266   *ehh = internalEHH;
267 
268   return true;
269 }
270 
integrate(string ** haplotypes,vector<long int> & pos,bool direction,int maxl,int snp,char ref,int nhaps,double * iHH,double denom)271 int integrate(string **haplotypes   ,
272 	      vector<long int> & pos,
273 	      bool         direction,
274 	      int               maxl,
275 	      int                snp,
276 	      char               ref,
277 	      int              nhaps,
278 	      double *           iHH,
279 	      double           denom ){
280 
281   double ehh = 1;
282 
283   int start = snp;
284   int end   = snp;
285 
286   // controls the substring madness
287   if(!direction){
288     start += 1;
289     end += 1;
290   }
291 
292   while(ehh > 0.05){
293     if(direction){
294       end += 1;
295     }
296     else{
297       start -= 1;
298     }
299     if(start < 0){
300       return 1;
301     }
302     if(end > maxl){
303       return 1;
304     }
305     double ehhRT = 0;
306     if(!calcEhh(haplotypes,
307 		start, end,
308 		ref, nhaps,
309 		&ehhRT, denom,
310 		direction)){
311       return 1;
312     }
313 
314     if(ehhRT <= 0.05){
315       return 0;
316     }
317 
318     double delta_gDist = 0.001;
319 
320     bool veryLongGap = false ;
321     double dist      =     0 ;
322 
323     if(direction){
324       gDist(pos[end-1], pos[end], &delta_gDist);
325       dist = abs(pos[end-1] - pos[end]);
326     }
327     else{
328       gDist(pos[start + 1], pos[start], &delta_gDist);
329       dist = abs(pos[end-1] - pos[end]);
330 
331     }
332 
333     if(dist > 10000){
334       return 1;
335     }
336     double correction = 1;
337     if(dist > 5000){
338       correction = 5000 / dist;
339     }
340 
341     *iHH += ((ehh + ehhRT)/2)*delta_gDist*correction;
342     ehh = ehhRT;
343 
344   }
345 
346   return 10;
347 }
348 
calc(string ** haplotypes,int nhaps,vector<double> & afs,vector<long int> & pos,vector<int> & target,vector<int> & background,string seqid)349 void calc(string **haplotypes, int nhaps,
350 	  vector<double> & afs, vector<long int> & pos,
351 	  vector<int> & target, vector<int> & background, string seqid){
352 
353   int maxl = haplotypes[0][0].length();
354 
355 #if defined HAS_OPENMP
356 #pragma omp parallel for schedule(dynamic, 20)
357 #endif
358 
359   for(int snp = 0; snp < maxl; snp++){
360 
361     double ihhR     = 0;
362     double ihhA     = 0;
363 
364     map<string , int> refH;
365 
366     countHaps(nhaps, refH, haplotypes, snp, snp+1);
367 
368 
369     double denomP1 = double(refH["0"]);
370     double denomP2 = double(refH["1"]);
371 
372     int refFail = 0;
373     int altFail = 0;
374 
375 
376     refFail += integrate(haplotypes, pos, true,  maxl, snp, '0', nhaps, &ihhR, denomP1);
377 
378     refFail += integrate(haplotypes, pos, false, maxl, snp, '0', nhaps, &ihhR,  denomP1);
379 
380     altFail += integrate(haplotypes, pos, true, maxl, snp,  '1', nhaps, &ihhA, denomP2);
381 
382     altFail += integrate(haplotypes, pos, false, maxl, snp,  '1', nhaps, &ihhA, denomP2);
383 
384     if(ihhR < 0.0001 || ihhA < 0.0001){
385       continue;
386     }
387 
388 #if defined HAS_OPENMP
389     omp_set_lock(&lock);
390 #endif
391     cout << seqid
392 	 << "\t" << pos[snp]
393 	 << "\t" << afs[snp]
394 	 << "\t" << ihhR
395 	 << "\t" << ihhA
396 	 << "\t" << log(ihhA/ihhR)
397 	 << "\t" << refFail
398 	 << "\t" << altFail << std::endl;
399 
400 #if defined HAS_OPENMP
401     omp_unset_lock(&lock);
402 #endif
403   }
404 }
405 
loadPhased(string ** haplotypes,genotype * pop,int ntarget)406 void loadPhased(string **haplotypes, genotype * pop, int ntarget){
407 
408   int indIndex = 0;
409 
410   for(vector<string>::iterator ind = pop->gts.begin(); ind != pop->gts.end(); ind++){
411     string g = (*ind);
412     vector< string > gs = split(g, "|");
413     haplotypes[indIndex][0].append(gs[0]);
414     haplotypes[indIndex][1].append(gs[1]);
415     indIndex += 1;
416   }
417 }
418 
main(int argc,char ** argv)419 int main(int argc, char** argv) {
420 
421   globalOpts.threads = 1   ;
422   globalOpts.af      = 0.05;
423 
424   // zero based index for the target and background indivudals
425 
426   map<int, int> it, ib;
427 
428     const struct option longopts[] =
429       {
430 	{"version"   , 0, 0, 'v'},
431 	{"help"      , 0, 0, 'h'},
432         {"file"      , 1, 0, 'f'},
433 	{"target"    , 1, 0, 't'},
434 	{"region"    , 1, 0, 'r'},
435 	{"gen"       , 1, 0, 'g'},
436 	{"type"      , 1, 0, 'y'},
437 	{"threads"   , 1, 0, 'x'},
438 	{"af"        , 1, 0, 'a'},
439 	{0,0,0,0}
440       };
441 
442     int findex;
443     int iarg=0;
444 
445     while(iarg != -1)
446       {
447 	iarg = getopt_long(argc, argv, "a:x:g:y:r:d:t:b:f:hv", longopts, &findex);
448 
449 	switch (iarg)
450 	  {
451 	  case 'a':
452 	    {
453 	      globalOpts.af = atof(optarg);
454 	      break;
455 	    }
456 	  case 'x':
457 	    {
458 	      globalOpts.threads = atoi(optarg);
459 	      break;
460 	    }
461 	  case 'g':
462 	    {
463 	      globalOpts.geneticMapFile = optarg;
464 	      break;
465 	    }
466 	  case 'h':
467 	    {
468 	      printHelp();
469 	      break;
470 	    }
471 	  case 'v':
472 	    {
473 	      printVersion();
474 	      break;
475 	    }
476 	  case 'y':
477 	    {
478 	      globalOpts.type = optarg;
479 	      break;
480 	    }
481 	  case 't':
482 	    {
483 	      loadIndices(it, optarg);
484 	      cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
485 	      cerr << "INFO: target ids: " << optarg << endl;
486 	      break;
487 	    }
488 	  case 'f':
489 	    {
490 	      cerr << "INFO: file: " << optarg  <<  endl;
491 	      globalOpts.filename = optarg;
492 	      break;
493 	    }
494 	  case 'r':
495 	    {
496 	      cerr << "INFO: set seqid region to : " << optarg << endl;
497 	      globalOpts.region = optarg;
498 	      break;
499 	    default:
500 	      break;
501 	    }
502 	  }
503       }
504 #if defined HAS_OPENMP
505   omp_set_num_threads(globalOpts.threads);
506 #endif
507     map<string, int> okayGenotypeLikelihoods;
508     okayGenotypeLikelihoods["PL"] = 1;
509     okayGenotypeLikelihoods["GL"] = 1;
510     okayGenotypeLikelihoods["GP"] = 1;
511     okayGenotypeLikelihoods["GT"] = 1;
512 
513 
514     // add an option for dumping
515 
516 //    for(std::map<int, double>::iterator gm = geneticMap.begin(); gm != geneticMap.end(); gm++){
517 //      cerr << "pos: " << gm->first << " cm: " << gm->second << endl;
518 //    }
519 
520     if(globalOpts.type.empty()){
521       cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
522       printHelp();
523       exit(1);
524     }
525     if(okayGenotypeLikelihoods.find(globalOpts.type) == okayGenotypeLikelihoods.end()){
526       cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
527       printHelp();
528       exit(1);
529     }
530 
531     if(globalOpts.filename.empty()){
532       cerr << "FATAL: did not specify a file" << endl;
533       printHelp();
534       exit(1);
535     }
536 
537     if(it.size() < 2){
538       cerr << "FATAL: target option is required -- or -- less than two individuals in target\n";
539       printHelp();
540       exit(1);
541     }
542 
543     // using vcflib; thanksErik
544 
545     VariantCallFile variantFile;
546 
547     variantFile.open(globalOpts.filename);
548 
549     if(globalOpts.region.empty()){
550       cerr << "FATAL: region required" << endl;
551       exit(1);
552     }
553     if(! variantFile.setRegion(globalOpts.region)){
554       cerr <<"WARNING: unable to set region" << endl;
555       exit(0);
556     }
557 
558     if (!variantFile.is_open()) {
559       exit(1);
560     }
561 
562     Variant var( variantFile );
563     vector<int> target_h, background_h;
564 
565     int index   = 0;
566     int indexi  = 0;
567 
568 
569     vector<string> samples = variantFile.sampleNames;
570     int nsamples = samples.size();
571 
572     for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
573 
574       string sampleName = (*samp);
575 
576       if(it.find(index) != it.end() ){
577 	target_h.push_back(indexi);
578 	indexi++;
579       }
580       index++;
581     }
582 
583 
584     vector<long int> positions;
585 
586     vector<double> afs;
587 
588     string **haplotypes = new string*[target_h.size()];
589     for (int i = 0; i < target_h.size(); i++) {
590       haplotypes[i] = new string[2];
591     }
592 
593 
594     while (variantFile.getNextVariant(var)) {
595 
596       globalOpts.seqid = var.sequenceName;
597 
598       if(!var.isPhased()){
599 	cerr << "FATAL: Found an unphased variant. All genotypes must be phased!" << endl;
600 	exit(1);
601       }
602 
603       if(var.alleles.size() > 2){
604 	continue;
605       }
606 
607       vector < map< string, vector<string> > > target, background, total;
608 
609       int sindex = 0;
610 
611       for(int nsamp = 0; nsamp < nsamples; nsamp++){
612 
613 	map<string, vector<string> > sample = var.samples[ samples[nsamp]];
614 
615 	if(it.find(sindex) != it.end() ){
616 	  target.push_back(sample);
617 	}
618 	sindex += 1;
619       }
620 
621       genotype * populationTarget    ;
622 
623       if(globalOpts.type == "PL"){
624 	populationTarget     = new pl();
625       }
626       if(globalOpts.type == "GL"){
627 	populationTarget     = new gl();
628       }
629       if(globalOpts.type == "GP"){
630 	populationTarget     = new gp();
631       }
632       if(globalOpts.type == "GT"){
633 	populationTarget     = new gt();
634       }
635 
636       populationTarget->loadPop(target, var.sequenceName, var.position);
637 
638       if(populationTarget->af <= globalOpts.af
639 	 || populationTarget->nref < 2
640 	 || populationTarget->nalt < 2){
641 	delete populationTarget;
642 	continue;
643       }
644       positions.push_back(var.position);
645       afs.push_back(populationTarget->af);
646       loadPhased(haplotypes, populationTarget, populationTarget->gts.size());
647 
648       populationTarget = NULL;
649       delete populationTarget;
650     }
651 
652     if(!globalOpts.geneticMapFile.empty()){
653       cerr << "INFO: loading genetics map" << endl;
654       loadGeneticMap(positions.front(), positions.back());
655       cerr << "INFO: finished loading genetics map" << endl;
656     }
657 
658     calc(haplotypes, target_h.size(), afs, positions,
659 	 target_h, background_h, globalOpts.seqid);
660     clearHaplotypes(haplotypes, target_h.size());
661 
662     exit(0);
663 
664 }
665