1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10 #include "Variant.h"
11 #include "split.h"
12 #include "cdflib.hpp"
13 #include "pdflib.hpp"
14 #include "var.hpp"
15
16 #include <string>
17 #include <iostream>
18 #include <math.h>
19 #include <cmath>
20 #include <stdlib.h>
21 #include <time.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include "gpatInfo.hpp"
25 // maaas speed
26
27 #if defined HAS_OPENMP
28 #include <omp.h>
29 // print lock
30 omp_lock_t lock;
31 #endif
32
33
34 struct opts{
35 int threads ;
36 std::string filename ;
37 std::string mapFile ;
38 std::string seqid ;
39 std::string geneticMapFile ;
40 std::string type ;
41 std::string region ;
42 std::map<int, double> geneticMap;
43 double af ;
44
45 }globalOpts;
46
47
48 using namespace std;
49 using namespace vcflib;
50
printHelp(void)51 void printHelp(void){
52 cerr << R"(
53 iHS calculates the integrated haplotype score which measures the relative decay of extended haplotype homozygosity (EHH) for the reference and alternative alleles at a site (see: voight et al. 2006, Spiech & Hernandez 2014).
54
55 Our code is highly concordant with both implementations mentioned. However, we do not set an upper limit to the allele frequency. iHS can be run without a genetic map, in which case the change in EHH is integrated over a constant. Human genetic maps for GRCh36 and GRCh37 (hg18 & hg19) can be found at: http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/ . iHS by default interpolates SNV positions to genetic position (you don't need a genetic position for every VCF entry in the map file).
56
57 iHS analyses requires normalization by allele frequency. It is important that iHS is calculated over large regions so that the normalization does not down weight real signals. For genome-wide runs it is recommended to run slightly overlapping windows and throwing out values that fail integration (columns 7 & 8 in the output) and then removing duplicates by using the 'sort' and 'uniq' linux commands. Normalization of the output is as simple as running 'normalize-iHS'.
58
59 INFO: help
60 INFO: description:
61 iHS calculates the integrated ratio of haplotype decay between the reference and non-reference allele.
62 Output : 4 columns :
63 1. seqid
64 2. position
65 3. target allele frequency
66 4. integrated EHH (alternative)
67 5. integrated EHH (reference)
68 6. iHS ln(iEHHalt/iEHHref)
69 7. != 0 integration failure
70 8. != 0 integration failure
71
72 Usage: iHS --target 0,1,2,3,4,5,6,7 --file my.phased.vcf \
73 --region chr1:1-1000 > STDOUT 2> STDERR
74
75 Params:
76 required: t,target <STRING> A zero base comma separated list of target
77 individuals corresponding to VCF columns
78 required: r,region <STRING> A tabix compliant genomic range
79 format: "seqid:start-end" or "seqid"
80 required: f,file <STRING> Proper formatted and phased VCF.
81 required: y,type <STRING> Genotype likelihood format: GT,PL,GL,GP
82 optional: a,af <DOUBLE> Alternative alleles with frquences less
83 than [0.05] are skipped.
84 optional: x,threads <INT> Number of CPUS [1].
85 recommended: g,gen <STRING> A PLINK formatted map file.
86
87 )" << endl ;
88 cerr << endl << "Type: statistics" << endl << endl;
89 cerr << endl;
90
91 printVersion();
92
93 exit(1);
94 }
95
96
gDist(int start,int end,double * gd)97 bool gDist(int start, int end, double * gd){
98
99 if(globalOpts.geneticMap.find(start) == globalOpts.geneticMap.end()){
100 return false;
101 }
102 if(globalOpts.geneticMap.find(end) == globalOpts.geneticMap.end()){
103 return false;
104 }
105 *gd = abs(globalOpts.geneticMap[start] - globalOpts.geneticMap[end]);
106 return true;
107 }
108
loadGeneticMap(int start,int end)109 void loadGeneticMap(int start, int end){
110
111 if(globalOpts.geneticMapFile.empty()){
112 std::cerr << "WARNING: No genetic map." << std::endl;
113 std::cerr << "WARNING: A constant genetic distance is being used: 0.001." << std::endl;
114 return;
115 }
116
117 ifstream featureFile (globalOpts.geneticMapFile.c_str());
118
119 string line;
120
121 int lastpos = 0;
122 double lastvalue = 0;
123
124 if(featureFile.is_open()){
125
126 while(getline(featureFile, line)){
127
128 vector<string> region = split(line, "\t");
129
130 if(region.front() != globalOpts.seqid){
131 std::cerr << "WARNING: seqid MisMatch: " << region.front() << " " << globalOpts.seqid << std::endl;
132 continue;
133 }
134
135 int pos = atoi(region[3].c_str()) ;
136 double cm = atof(region[2].c_str()) ;
137
138 if(lastpos == 0 && start > pos){
139 lastpos = pos;
140 continue;
141 }
142
143 int diff = abs(pos - lastpos);
144 double vdiff = abs(lastvalue - cm );
145 double chunk = vdiff/double(diff);
146
147 double running = lastvalue;
148
149 for(int i = lastpos; i < pos; i++){
150 globalOpts.geneticMap[i] = running;
151 running += chunk;
152 }
153
154 if(pos > end){
155 break;
156 }
157
158
159 lastpos = pos;
160 lastvalue = cm;
161 }
162 }
163
164 featureFile.close();
165
166 if(globalOpts.geneticMap.size() < 1){
167 std::cerr << "FATAL: Problem loading genetic map" << std::endl;
168 exit(1);
169 }
170 }
171
172
clearHaplotypes(string ** haplotypes,int ntarget)173 void clearHaplotypes(string **haplotypes, int ntarget){
174 for(int i= 0; i < ntarget; i++){
175 haplotypes[i][0].clear();
176 haplotypes[i][1].clear();
177 }
178 }
179
loadIndices(map<int,int> & index,string set)180 void loadIndices(map<int, int> & index, string set){
181
182 vector<string> indviduals = split(set, ",");
183 vector<string>::iterator it = indviduals.begin();
184
185 for(; it != indviduals.end(); it++){
186 index[ atoi( (*it).c_str() ) ] = 1;
187 }
188 }
189
countHaps(int nhaps,map<string,int> & targetH,string ** haplotypes,int start,int end)190 void countHaps(int nhaps, map<string, int> & targetH,
191 string **haplotypes, int start, int end){
192
193 for(int i = 0; i < nhaps; i++){
194
195 std::string h1 = haplotypes[i][0].substr(start, (end - start)) ;
196 std::string h2 = haplotypes[i][1].substr(start, (end - start)) ;
197
198 if(targetH.find(h1) == targetH.end()){
199 targetH[h1] = 1;
200 }
201 else{
202 targetH[h1]++;
203 }
204 if(targetH.find(h2) == targetH.end()){
205 targetH[h2] = 1;
206 }
207 else{
208 targetH[h2]++;
209 }
210 }
211 }
212
computeNs(map<string,int> & targetH,int start,int end,double * sumT,char ref,bool dir)213 void computeNs(map<string, int> & targetH, int start,
214 int end, double * sumT, char ref, bool dir){
215
216 for( map<string, int>::iterator th = targetH.begin();
217 th != targetH.end(); th++){
218
219 if(th->second < 2){
220 continue;
221 }
222
223
224 // end is extending ; check first base
225 if(dir){
226 if( th->first[0] == ref){
227
228 // std::cerr << "count dat: " << th->first << " " << th->second << " " << ref << " " << dir << endl;
229
230
231 *sumT += r8_choose(th->second, 2);
232 }
233 }
234
235 // start is extending ; check last base
236 else{
237
238 int last = th->first.size() -1;
239 if( th->first[last] == ref ){
240 // std::cerr << "count dat:" << th->first << " " << th->second << " " << ref << " " << dir << endl;
241
242
243 *sumT += r8_choose(th->second, 2);
244 }
245 }
246 }
247 }
248
calcEhh(string ** haplotypes,int start,int end,char ref,int nhaps,double * ehh,double div,bool dir)249 bool calcEhh(string **haplotypes, int start,
250 int end, char ref, int nhaps,
251 double * ehh, double div, bool dir){
252
253 double sum = 0 ;
254 map<string , int> refH;
255
256 countHaps(nhaps, refH, haplotypes, start, end);
257 computeNs(refH, start, end, &sum, ref, dir );
258
259 double internalEHH = sum / (r8_choose(div, 2));
260
261 if(internalEHH > 1){
262 std::cerr << "FATAL: internal error." << std::endl;
263 exit(1);
264 }
265
266 *ehh = internalEHH;
267
268 return true;
269 }
270
integrate(string ** haplotypes,vector<long int> & pos,bool direction,int maxl,int snp,char ref,int nhaps,double * iHH,double denom)271 int integrate(string **haplotypes ,
272 vector<long int> & pos,
273 bool direction,
274 int maxl,
275 int snp,
276 char ref,
277 int nhaps,
278 double * iHH,
279 double denom ){
280
281 double ehh = 1;
282
283 int start = snp;
284 int end = snp;
285
286 // controls the substring madness
287 if(!direction){
288 start += 1;
289 end += 1;
290 }
291
292 while(ehh > 0.05){
293 if(direction){
294 end += 1;
295 }
296 else{
297 start -= 1;
298 }
299 if(start < 0){
300 return 1;
301 }
302 if(end > maxl){
303 return 1;
304 }
305 double ehhRT = 0;
306 if(!calcEhh(haplotypes,
307 start, end,
308 ref, nhaps,
309 &ehhRT, denom,
310 direction)){
311 return 1;
312 }
313
314 if(ehhRT <= 0.05){
315 return 0;
316 }
317
318 double delta_gDist = 0.001;
319
320 bool veryLongGap = false ;
321 double dist = 0 ;
322
323 if(direction){
324 gDist(pos[end-1], pos[end], &delta_gDist);
325 dist = abs(pos[end-1] - pos[end]);
326 }
327 else{
328 gDist(pos[start + 1], pos[start], &delta_gDist);
329 dist = abs(pos[end-1] - pos[end]);
330
331 }
332
333 if(dist > 10000){
334 return 1;
335 }
336 double correction = 1;
337 if(dist > 5000){
338 correction = 5000 / dist;
339 }
340
341 *iHH += ((ehh + ehhRT)/2)*delta_gDist*correction;
342 ehh = ehhRT;
343
344 }
345
346 return 10;
347 }
348
calc(string ** haplotypes,int nhaps,vector<double> & afs,vector<long int> & pos,vector<int> & target,vector<int> & background,string seqid)349 void calc(string **haplotypes, int nhaps,
350 vector<double> & afs, vector<long int> & pos,
351 vector<int> & target, vector<int> & background, string seqid){
352
353 int maxl = haplotypes[0][0].length();
354
355 #if defined HAS_OPENMP
356 #pragma omp parallel for schedule(dynamic, 20)
357 #endif
358
359 for(int snp = 0; snp < maxl; snp++){
360
361 double ihhR = 0;
362 double ihhA = 0;
363
364 map<string , int> refH;
365
366 countHaps(nhaps, refH, haplotypes, snp, snp+1);
367
368
369 double denomP1 = double(refH["0"]);
370 double denomP2 = double(refH["1"]);
371
372 int refFail = 0;
373 int altFail = 0;
374
375
376 refFail += integrate(haplotypes, pos, true, maxl, snp, '0', nhaps, &ihhR, denomP1);
377
378 refFail += integrate(haplotypes, pos, false, maxl, snp, '0', nhaps, &ihhR, denomP1);
379
380 altFail += integrate(haplotypes, pos, true, maxl, snp, '1', nhaps, &ihhA, denomP2);
381
382 altFail += integrate(haplotypes, pos, false, maxl, snp, '1', nhaps, &ihhA, denomP2);
383
384 if(ihhR < 0.0001 || ihhA < 0.0001){
385 continue;
386 }
387
388 #if defined HAS_OPENMP
389 omp_set_lock(&lock);
390 #endif
391 cout << seqid
392 << "\t" << pos[snp]
393 << "\t" << afs[snp]
394 << "\t" << ihhR
395 << "\t" << ihhA
396 << "\t" << log(ihhA/ihhR)
397 << "\t" << refFail
398 << "\t" << altFail << std::endl;
399
400 #if defined HAS_OPENMP
401 omp_unset_lock(&lock);
402 #endif
403 }
404 }
405
loadPhased(string ** haplotypes,genotype * pop,int ntarget)406 void loadPhased(string **haplotypes, genotype * pop, int ntarget){
407
408 int indIndex = 0;
409
410 for(vector<string>::iterator ind = pop->gts.begin(); ind != pop->gts.end(); ind++){
411 string g = (*ind);
412 vector< string > gs = split(g, "|");
413 haplotypes[indIndex][0].append(gs[0]);
414 haplotypes[indIndex][1].append(gs[1]);
415 indIndex += 1;
416 }
417 }
418
main(int argc,char ** argv)419 int main(int argc, char** argv) {
420
421 globalOpts.threads = 1 ;
422 globalOpts.af = 0.05;
423
424 // zero based index for the target and background indivudals
425
426 map<int, int> it, ib;
427
428 const struct option longopts[] =
429 {
430 {"version" , 0, 0, 'v'},
431 {"help" , 0, 0, 'h'},
432 {"file" , 1, 0, 'f'},
433 {"target" , 1, 0, 't'},
434 {"region" , 1, 0, 'r'},
435 {"gen" , 1, 0, 'g'},
436 {"type" , 1, 0, 'y'},
437 {"threads" , 1, 0, 'x'},
438 {"af" , 1, 0, 'a'},
439 {0,0,0,0}
440 };
441
442 int findex;
443 int iarg=0;
444
445 while(iarg != -1)
446 {
447 iarg = getopt_long(argc, argv, "a:x:g:y:r:d:t:b:f:hv", longopts, &findex);
448
449 switch (iarg)
450 {
451 case 'a':
452 {
453 globalOpts.af = atof(optarg);
454 break;
455 }
456 case 'x':
457 {
458 globalOpts.threads = atoi(optarg);
459 break;
460 }
461 case 'g':
462 {
463 globalOpts.geneticMapFile = optarg;
464 break;
465 }
466 case 'h':
467 {
468 printHelp();
469 break;
470 }
471 case 'v':
472 {
473 printVersion();
474 break;
475 }
476 case 'y':
477 {
478 globalOpts.type = optarg;
479 break;
480 }
481 case 't':
482 {
483 loadIndices(it, optarg);
484 cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
485 cerr << "INFO: target ids: " << optarg << endl;
486 break;
487 }
488 case 'f':
489 {
490 cerr << "INFO: file: " << optarg << endl;
491 globalOpts.filename = optarg;
492 break;
493 }
494 case 'r':
495 {
496 cerr << "INFO: set seqid region to : " << optarg << endl;
497 globalOpts.region = optarg;
498 break;
499 default:
500 break;
501 }
502 }
503 }
504 #if defined HAS_OPENMP
505 omp_set_num_threads(globalOpts.threads);
506 #endif
507 map<string, int> okayGenotypeLikelihoods;
508 okayGenotypeLikelihoods["PL"] = 1;
509 okayGenotypeLikelihoods["GL"] = 1;
510 okayGenotypeLikelihoods["GP"] = 1;
511 okayGenotypeLikelihoods["GT"] = 1;
512
513
514 // add an option for dumping
515
516 // for(std::map<int, double>::iterator gm = geneticMap.begin(); gm != geneticMap.end(); gm++){
517 // cerr << "pos: " << gm->first << " cm: " << gm->second << endl;
518 // }
519
520 if(globalOpts.type.empty()){
521 cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
522 printHelp();
523 exit(1);
524 }
525 if(okayGenotypeLikelihoods.find(globalOpts.type) == okayGenotypeLikelihoods.end()){
526 cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
527 printHelp();
528 exit(1);
529 }
530
531 if(globalOpts.filename.empty()){
532 cerr << "FATAL: did not specify a file" << endl;
533 printHelp();
534 exit(1);
535 }
536
537 if(it.size() < 2){
538 cerr << "FATAL: target option is required -- or -- less than two individuals in target\n";
539 printHelp();
540 exit(1);
541 }
542
543 // using vcflib; thanksErik
544
545 VariantCallFile variantFile;
546
547 variantFile.open(globalOpts.filename);
548
549 if(globalOpts.region.empty()){
550 cerr << "FATAL: region required" << endl;
551 exit(1);
552 }
553 if(! variantFile.setRegion(globalOpts.region)){
554 cerr <<"WARNING: unable to set region" << endl;
555 exit(0);
556 }
557
558 if (!variantFile.is_open()) {
559 exit(1);
560 }
561
562 Variant var( variantFile );
563 vector<int> target_h, background_h;
564
565 int index = 0;
566 int indexi = 0;
567
568
569 vector<string> samples = variantFile.sampleNames;
570 int nsamples = samples.size();
571
572 for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
573
574 string sampleName = (*samp);
575
576 if(it.find(index) != it.end() ){
577 target_h.push_back(indexi);
578 indexi++;
579 }
580 index++;
581 }
582
583
584 vector<long int> positions;
585
586 vector<double> afs;
587
588 string **haplotypes = new string*[target_h.size()];
589 for (int i = 0; i < target_h.size(); i++) {
590 haplotypes[i] = new string[2];
591 }
592
593
594 while (variantFile.getNextVariant(var)) {
595
596 globalOpts.seqid = var.sequenceName;
597
598 if(!var.isPhased()){
599 cerr << "FATAL: Found an unphased variant. All genotypes must be phased!" << endl;
600 exit(1);
601 }
602
603 if(var.alleles.size() > 2){
604 continue;
605 }
606
607 vector < map< string, vector<string> > > target, background, total;
608
609 int sindex = 0;
610
611 for(int nsamp = 0; nsamp < nsamples; nsamp++){
612
613 map<string, vector<string> > sample = var.samples[ samples[nsamp]];
614
615 if(it.find(sindex) != it.end() ){
616 target.push_back(sample);
617 }
618 sindex += 1;
619 }
620
621 genotype * populationTarget ;
622
623 if(globalOpts.type == "PL"){
624 populationTarget = new pl();
625 }
626 if(globalOpts.type == "GL"){
627 populationTarget = new gl();
628 }
629 if(globalOpts.type == "GP"){
630 populationTarget = new gp();
631 }
632 if(globalOpts.type == "GT"){
633 populationTarget = new gt();
634 }
635
636 populationTarget->loadPop(target, var.sequenceName, var.position);
637
638 if(populationTarget->af <= globalOpts.af
639 || populationTarget->nref < 2
640 || populationTarget->nalt < 2){
641 delete populationTarget;
642 continue;
643 }
644 positions.push_back(var.position);
645 afs.push_back(populationTarget->af);
646 loadPhased(haplotypes, populationTarget, populationTarget->gts.size());
647
648 populationTarget = NULL;
649 delete populationTarget;
650 }
651
652 if(!globalOpts.geneticMapFile.empty()){
653 cerr << "INFO: loading genetics map" << endl;
654 loadGeneticMap(positions.front(), positions.back());
655 cerr << "INFO: finished loading genetics map" << endl;
656 }
657
658 calc(haplotypes, target_h.size(), afs, positions,
659 target_h, background_h, globalOpts.seqid);
660 clearHaplotypes(haplotypes, target_h.size());
661
662 exit(0);
663
664 }
665