1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 #include "Variant.h"
11 #include "split.h"
12 #include "cdflib.hpp"
13 #include "pdflib.hpp"
14 #include "var.hpp"
15 
16 #include <string>
17 #include <iostream>
18 #include <math.h>
19 #include <cmath>
20 #include <stdlib.h>
21 #include <time.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include "gpatInfo.hpp"
25 
26 using namespace std;
27 using namespace vcflib;
28 
29 struct indv{
30   int nhet  ;
31   int nhom  ;
32   int nalt  ;
33   int nocall;
34 };
35 
36 
37 
printHelp(void)38 void printHelp(void){
39 
40   cerr << endl << endl;
41   cerr << "INFO: help" << endl;
42   cerr << "INFO: description:" << endl;
43   cerr << "Generates a table of genotype counts. Summarizes genotype counts for bi-allelic SNVs and indel " << endl << endl;
44 
45   cerr << "INFO: output: table of genotype counts for each individual." << endl;
46 
47 
48   cerr << "INFO: usage:  genotypeSummmary --type PL --target 0,1,2,3,4,5,6,7 --file my.vcf --snp                                                               " << endl;
49   cerr << endl;
50   cerr << "INFO: required: t,target     -- a zero based comma separated list of target individuals corresponding to VCF columns        " << endl;
51   cerr << "INFO: required: f,file       -- proper formatted VCF                                                                        " << endl;
52   cerr << "INFO: required, y,type       -- genotype likelihood format; genotype : GL,PL,GP                                             " << endl;
53   cerr << "INFO: optional, r,region     -- a tabix compliant region : chr1:1-1000 or chr1                                              " << endl;
54   cerr << "INFO: optional, s,snp        -- Only count SNPs                                              " << endl;
55   cerr << "INFO: optional, a,ancestral  -- describe counts relative to the ancestral allele defined as AA in INFO" << endl;
56   cerr << endl << "Type: statistics" << endl << endl;
57 
58   printVersion();
59 }
60 
61 
bound(double v)62 double bound(double v){
63   if(v <= 0.00001){
64     return  0.00001;
65   }
66   if(v >= 0.99999){
67     return 0.99999;
68   }
69   return v;
70 }
71 
loadIndices(map<int,int> & index,string set)72 void loadIndices(map<int, int> & index, string set){
73 
74   vector<string>  indviduals = split(set, ",");
75 
76   vector<string>::iterator it = indviduals.begin();
77 
78   for(; it != indviduals.end(); it++){
79     index[ atoi( (*it).c_str() ) ] = 1;
80   }
81 }
82 
83 
main(int argc,char ** argv)84 int main(int argc, char** argv) {
85 
86   bool snp = false;
87 
88   // set the random seed for MCMC
89 
90   srand((unsigned)time(NULL));
91 
92   // the filename
93 
94   string filename;
95 
96   // open standardout
97 
98   // set region to scaffold
99 
100   string region = "NA";
101 
102   // using vcflib; thanks to Erik Garrison
103 
104  VariantCallFile variantFile;
105 
106   // zero based index for the target and background indivudals
107 
108   map<int, int> it, ib;
109 
110   // genotype likelihood format
111 
112   string type = "NA";
113 
114   // are we polarizing the counts relative to the ancestral allele?
115   bool use_ancestral_state = false;
116   set<char> allowed_ancestral_bases = { 'A', 'T', 'C', 'G' };
117 
118     const struct option longopts[] =
119       {
120 	{"version"   , 0, 0, 'v'},
121 	{"help"      , 0, 0, 'h'},
122         {"file"      , 1, 0, 'f'},
123 	{"target"    , 1, 0, 't'},
124 	{"region"    , 1, 0, 'r'},
125 	{"type"      , 1, 0, 'y'},
126 	{"snp"       , 0, 0, 's'},
127 	{"ancestral" , 0, 0, 'a'},
128 	{0,0,0,0}
129       };
130 
131     int index;
132     int iarg=0;
133 
134     while(iarg != -1)
135       {
136 	iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chvsa", longopts, &index);
137 
138 	switch (iarg)
139 	  {
140     case 'a':
141       {
142         use_ancestral_state = true;
143         break;
144       }
145 	  case 's':
146 	    {
147 	      snp = true;
148 	      break;
149 	    }
150 	  case 'h':
151 	    {
152 	      printHelp();
153 	      return 0;
154 	    }
155 	  case 'v':
156 	    {
157 	      printVersion();
158 	      return 0;
159 	    }
160 	  case 't':
161 	    {
162 	      loadIndices(it, optarg);
163 	      cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
164 	      cerr << "INFO: target ids: " << optarg << endl;
165 	      break;
166 	    }
167 	  case 'b':
168 	    {
169 	      loadIndices(ib, optarg);
170 	      cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
171 	      cerr << "INFO: background ids: " << optarg << endl;
172 	      break;
173 	    }
174 	  case 'f':
175 	    {
176 	      cerr << "INFO: file: " << optarg  <<  endl;
177 	      filename = optarg;
178 	      break;
179 	    }
180 	  case 'r':
181 	    {
182 	      cerr << "INFO: set seqid region to : " << optarg << endl;
183 	      region = optarg;
184 	      break;
185 	    }
186 	  case 'y':
187 	    {
188 	      type = optarg;
189 	      cerr << "INFO: set genotype likelihood to: " << type << endl;
190 	      break;
191 	    }
192 	  default:
193 	    break;
194 	  }
195 
196       }
197 
198     if(filename.empty()){
199       cerr << "FATAL: failed to specify a file" << endl;
200       printHelp();
201     }
202 
203     bool is_open;
204 
205     if (filename == "-") {
206 
207         is_open=variantFile.open(std::cin);
208 
209     } else {
210 
211     	is_open=variantFile.open(filename);
212 
213      }
214 
215     if (!is_open)  {
216           cerr << "FATAL: could not open file for reading" << endl;
217           printHelp();
218 
219     }
220 
221 
222     if(region != "NA"){
223       if(! variantFile.setRegion(region)){
224 	cerr <<"FATAL: unable to set region" << endl;
225 	return 1;
226       }
227     }
228 
229     if (!variantFile.is_open()) {
230       cerr << "FATAL: could not open VCF for reading" << endl;
231       printHelp();
232       return 1;
233     }
234 
235     map<string, int> okayGenotypeLikelihoods;
236     okayGenotypeLikelihoods["PL"] = 1;
237     okayGenotypeLikelihoods["GL"] = 1;
238     okayGenotypeLikelihoods["GP"] = 1;
239     okayGenotypeLikelihoods["GT"] = 1;
240 
241     if(type == "NA"){
242       cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
243       printHelp();
244       return 1;
245     }
246     if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
247       cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
248       printHelp();
249       return 1;
250     }
251 
252     Variant var(variantFile);
253 
254     vector<string> samples = variantFile.sampleNames;
255     int nsamples = samples.size();
256 
257     vector<indv *> countData;
258     vector<string > countDataSampleName;
259 
260     for ( map<int ,int>::iterator x=it.begin(); x!=it.end(); ++x) {
261 
262         countDataSampleName.push_back(samples[x->first] );
263     }
264 
265 
266     for(int i = 0; i < it.size(); i++){
267       indv * dip = new indv;
268 
269       dip->nhet   = 0;
270       dip->nhom   = 0;
271       dip->nalt   = 0;
272       dip->nocall = 0;
273 
274       countData.push_back(dip);
275 
276     }
277 
278 
279     while (variantFile.getNextVariant(var)) {
280 
281 	// biallelic sites naturally
282 
283 	if(var.alt.size() > 1){
284 	  continue;
285 	}
286 	if(snp){
287 	  bool hit =false;
288 
289 	  for(vector<string>::iterator it = var.alleles.begin(); it != var.alleles.end(); it++){
290 	    if((*it).size() > 1){
291 	      hit = true;
292 	    }
293 	  }
294 	  if(hit){
295 	    continue;
296 	  }
297 	}
298 
299   // decide if we can polarize the site if we are using the ancestral allele
300   bool ref_is_ancestral_allele = true;
301   if (use_ancestral_state) {
302     // we need the ancestral allele to decide what to do at this site
303     if (var.info.find("AA") == var.info.end()) continue;
304     string ancestral_allele = var.info["AA"].front();
305     // if we do not have a polarized site with only allowed bases in the ancestral allele, skip it
306     bool allowed = true;
307     for (string::iterator c = ancestral_allele.begin(); c != ancestral_allele.end(); ++c) {
308       if (!allowed_ancestral_bases.count(*c)) {
309         allowed = false;
310         break;
311       }
312     }
313     if (!allowed) continue;
314     ref_is_ancestral_allele = (ancestral_allele == var.ref);
315   }
316 
317 	vector < map< string, vector<string> > > target, background, total;
318 
319 	int index = 0;
320 
321 	for(int nsamp = 0; nsamp < nsamples; nsamp++){
322 
323 	    if(it.find(index) != it.end() ){
324 	        const map<string, vector<string> >& sample = var.samples[ samples[nsamp]];
325 		target.push_back(sample);
326 	    }
327 	    index += 1;
328 	}
329 
330 	genotype * populationTarget      ;
331 
332 	if(type == "PL"){
333 	  populationTarget     = new pl();
334 	}
335 	if(type == "GL"){
336 	  populationTarget     = new gl();
337 	}
338 	if(type == "GP"){
339 	  populationTarget     = new gp();
340 	}
341 	if(type == "GT"){
342           populationTarget     = new gt();
343 	}
344 
345 	populationTarget->loadPop(target, var.sequenceName, var.position);
346 
347 	for(int i = 0; i < populationTarget->genoIndex.size() ; i++){
348 	  if(populationTarget->genoIndex[i] == -1){
349 	    countData[i]->nocall += 1;
350 	  }
351 	  else if (populationTarget->genoIndex[i] == 0) {
352             if (!use_ancestral_state || ref_is_ancestral_allele) {
353 	      countData[i]->nhom += 1;
354             } else {
355 	      countData[i]->nalt += 1;
356             }
357 	  }
358 	  else if (populationTarget->genoIndex[i] == 1){
359 	    countData[i]->nhet += 1;
360 	  }
361 	  else if (populationTarget->genoIndex[i] == 2) {
362             if (!use_ancestral_state || ref_is_ancestral_allele) {
363 	      countData[i]->nalt += 1;
364             } else {
365 	      countData[i]->nhom += 1;
366             }
367 	  }
368 	  else{
369 	    std::cerr << "FATAL: unknown genotype index" << std::endl;
370 cerr << populationTarget->genoIndex[i] << endl;
371 cerr << var << endl;
372 	    exit(1);
373 	  }
374 	}
375 	delete populationTarget;
376 
377     }
378 
379     if (!use_ancestral_state) {
380         std::cout << "#sample-id\tn-nocall\tn-hom-ref\tn-het\tn-hom-alt" << std::endl;
381     } else {
382         std::cout << "#sample-id\tn-nocall\tn-hom-ancestral\tn-het\tn-hom-derived" << std::endl;
383     }
384     for(int i = 0; i < countData.size(); i++){
385         std::cout << countDataSampleName[i]
386                   << "\t" << countData[i]->nocall
387                   << "\t" << countData[i]->nhom
388                   << "\t" << countData[i]->nhet
389                   << "\t" << countData[i]->nalt
390                   << std::endl;
391     }
392 
393 
394     return 0;
395 }
396