1 #include "Variant.h"
2 #include "split.h"
3 #include "cdflib.hpp"
4 #include "pdflib.hpp"
5 #include "var.hpp"
6 
7 #include <string>
8 #include <iostream>
9 #include <math.h>
10 #include <cmath>
11 #include <stdlib.h>
12 #include <time.h>
13 #include <stdio.h>
14 #include <getopt.h>
15 #include "gpatInfo.hpp"
16 
17 using namespace std;
18 using namespace vcflib;
19 
20 struct indv{
21   int nhet  ;
22   int nhom  ;
23   int nalt  ;
24   int nocall;
25 };
26 
27 
28 
printHelp(void)29 void printHelp(void){
30 
31   cerr << endl << endl;
32   cerr << "INFO: help" << endl;
33   cerr << "INFO: description:" << endl;
34   cerr << "      Summarizes genotype counts for bi-allelic SNVs and indel " << endl;
35 
36   cerr << "INFO: output: table of genotype counts for each individual." << endl;
37 
38 
39   cerr << "INFO: usage:  genotypeSummmary --type PL --target 0,1,2,3,4,5,6,7 --file my.vcf --snp                                                               " << endl;
40   cerr << endl;
41   cerr << "INFO: required: t,target     -- a zero based comma separated list of target individuals corresponding to VCF columns        " << endl;
42   cerr << "INFO: required: f,file       -- proper formatted VCF                                                                        " << endl;
43   cerr << "INFO: required, y,type       -- genotype likelihood format; genotype : GL,PL,GP                                             " << endl;
44   cerr << "INFO: optional, r,region     -- a tabix compliant region : chr1:1-1000 or chr1                                              " << endl;
45   cerr << "INFO: optional, s,snp        -- Only count SNPs                                              " << endl;
46   cerr << "INFO: optional, a,ancestral  -- describe counts relative to the ancestral allele defined as AA in INFO" << endl;
47 
48   printVersion();
49 }
50 
51 
bound(double v)52 double bound(double v){
53   if(v <= 0.00001){
54     return  0.00001;
55   }
56   if(v >= 0.99999){
57     return 0.99999;
58   }
59   return v;
60 }
61 
loadIndices(map<int,int> & index,string set)62 void loadIndices(map<int, int> & index, string set){
63 
64   vector<string>  indviduals = split(set, ",");
65 
66   vector<string>::iterator it = indviduals.begin();
67 
68   for(; it != indviduals.end(); it++){
69     index[ atoi( (*it).c_str() ) ] = 1;
70   }
71 }
72 
73 
main(int argc,char ** argv)74 int main(int argc, char** argv) {
75 
76   bool snp = false;
77 
78   // set the random seed for MCMC
79 
80   srand((unsigned)time(NULL));
81 
82   // the filename
83 
84   string filename;
85 
86   // open standardout
87 
88   // set region to scaffold
89 
90   string region = "NA";
91 
92   // using vcflib; thanks to Erik Garrison
93 
94  VariantCallFile variantFile;
95 
96   // zero based index for the target and background indivudals
97 
98   map<int, int> it, ib;
99 
100   // genotype likelihood format
101 
102   string type = "NA";
103 
104   // are we polarizing the counts relative to the ancestral allele?
105   bool use_ancestral_state = false;
106   set<char> allowed_ancestral_bases = { 'A', 'T', 'C', 'G' };
107 
108     const struct option longopts[] =
109       {
110 	{"version"   , 0, 0, 'v'},
111 	{"help"      , 0, 0, 'h'},
112         {"file"      , 1, 0, 'f'},
113 	{"target"    , 1, 0, 't'},
114 	{"region"    , 1, 0, 'r'},
115 	{"type"      , 1, 0, 'y'},
116 	{"snp"       , 0, 0, 's'},
117 	{"ancestral" , 0, 0, 'a'},
118 	{0,0,0,0}
119       };
120 
121     int index;
122     int iarg=0;
123 
124     while(iarg != -1)
125       {
126 	iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chvsa", longopts, &index);
127 
128 	switch (iarg)
129 	  {
130     case 'a':
131       {
132         use_ancestral_state = true;
133         break;
134       }
135 	  case 's':
136 	    {
137 	      snp = true;
138 	      break;
139 	    }
140 	  case 'h':
141 	    {
142 	      printHelp();
143 	      return 0;
144 	    }
145 	  case 'v':
146 	    {
147 	      printVersion();
148 	      return 0;
149 	    }
150 	  case 't':
151 	    {
152 	      loadIndices(it, optarg);
153 	      cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
154 	      cerr << "INFO: target ids: " << optarg << endl;
155 	      break;
156 	    }
157 	  case 'b':
158 	    {
159 	      loadIndices(ib, optarg);
160 	      cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
161 	      cerr << "INFO: background ids: " << optarg << endl;
162 	      break;
163 	    }
164 	  case 'f':
165 	    {
166 	      cerr << "INFO: file: " << optarg  <<  endl;
167 	      filename = optarg;
168 	      break;
169 	    }
170 	  case 'r':
171 	    {
172 	      cerr << "INFO: set seqid region to : " << optarg << endl;
173 	      region = optarg;
174 	      break;
175 	    }
176 	  case 'y':
177 	    {
178 	      type = optarg;
179 	      cerr << "INFO: set genotype likelihood to: " << type << endl;
180 	      break;
181 	    }
182 	  default:
183 	    break;
184 	  }
185 
186       }
187 
188     if(filename.empty()){
189       cerr << "FATAL: failed to specify a file" << endl;
190       printHelp();
191     }
192 
193     bool is_open;
194 
195     if (filename == "-") {
196 
197         is_open=variantFile.open(std::cin);
198 
199     } else {
200 
201     	is_open=variantFile.open(filename);
202 
203      }
204 
205     if (!is_open)  {
206           cerr << "FATAL: could not open file for reading" << endl;
207           printHelp();
208 
209     }
210 
211 
212     if(region != "NA"){
213       if(! variantFile.setRegion(region)){
214 	cerr <<"FATAL: unable to set region" << endl;
215 	return 1;
216       }
217     }
218 
219     if (!variantFile.is_open()) {
220       cerr << "FATAL: could not open VCF for reading" << endl;
221       printHelp();
222       return 1;
223     }
224 
225     map<string, int> okayGenotypeLikelihoods;
226     okayGenotypeLikelihoods["PL"] = 1;
227     okayGenotypeLikelihoods["GL"] = 1;
228     okayGenotypeLikelihoods["GP"] = 1;
229     okayGenotypeLikelihoods["GT"] = 1;
230 
231     if(type == "NA"){
232       cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
233       printHelp();
234       return 1;
235     }
236     if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
237       cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
238       printHelp();
239       return 1;
240     }
241 
242     Variant var(variantFile);
243 
244     vector<string> samples = variantFile.sampleNames;
245     int nsamples = samples.size();
246 
247     vector<indv *> countData;
248     vector<string > countDataSampleName;
249 
250     for ( map<int ,int>::iterator x=it.begin(); x!=it.end(); ++x) {
251 
252         countDataSampleName.push_back(samples[x->first] );
253     }
254 
255 
256     for(int i = 0; i < it.size(); i++){
257       indv * dip = new indv;
258 
259       dip->nhet   = 0;
260       dip->nhom   = 0;
261       dip->nalt   = 0;
262       dip->nocall = 0;
263 
264       countData.push_back(dip);
265 
266     }
267 
268 
269     while (variantFile.getNextVariant(var)) {
270 
271 	// biallelic sites naturally
272 
273 	if(var.alt.size() > 1){
274 	  continue;
275 	}
276 	if(snp){
277 	  bool hit =false;
278 
279 	  for(vector<string>::iterator it = var.alleles.begin(); it != var.alleles.end(); it++){
280 	    if((*it).size() > 1){
281 	      hit = true;
282 	    }
283 	  }
284 	  if(hit){
285 	    continue;
286 	  }
287 	}
288 
289   // decide if we can polarize the site if we are using the ancestral allele
290   bool ref_is_ancestral_allele = true;
291   if (use_ancestral_state) {
292     // we need the ancestral allele to decide what to do at this site
293     if (var.info.find("AA") == var.info.end()) continue;
294     string ancestral_allele = var.info["AA"].front();
295     // if we do not have a polarized site with only allowed bases in the ancestral allele, skip it
296     bool allowed = true;
297     for (string::iterator c = ancestral_allele.begin(); c != ancestral_allele.end(); ++c) {
298       if (!allowed_ancestral_bases.count(*c)) {
299         allowed = false;
300         break;
301       }
302     }
303     if (!allowed) continue;
304     ref_is_ancestral_allele = (ancestral_allele == var.ref);
305   }
306 
307 	vector < map< string, vector<string> > > target, background, total;
308 
309 	int index = 0;
310 
311 	for(int nsamp = 0; nsamp < nsamples; nsamp++){
312 
313 	    if(it.find(index) != it.end() ){
314 	        const map<string, vector<string> >& sample = var.samples[ samples[nsamp]];
315 		target.push_back(sample);
316 	    }
317 	    index += 1;
318 	}
319 
320 	genotype * populationTarget      ;
321 
322 	if(type == "PL"){
323 	  populationTarget     = new pl();
324 	}
325 	if(type == "GL"){
326 	  populationTarget     = new gl();
327 	}
328 	if(type == "GP"){
329 	  populationTarget     = new gp();
330 	}
331 	if(type == "GT"){
332           populationTarget     = new gt();
333 	}
334 
335 	populationTarget->loadPop(target, var.sequenceName, var.position);
336 
337 	for(int i = 0; i < populationTarget->genoIndex.size() ; i++){
338 	  if(populationTarget->genoIndex[i] == -1){
339 	    countData[i]->nocall += 1;
340 	  }
341 	  else if (populationTarget->genoIndex[i] == 0) {
342             if (!use_ancestral_state || ref_is_ancestral_allele) {
343 	      countData[i]->nhom += 1;
344             } else {
345 	      countData[i]->nalt += 1;
346             }
347 	  }
348 	  else if (populationTarget->genoIndex[i] == 1){
349 	    countData[i]->nhet += 1;
350 	  }
351 	  else if (populationTarget->genoIndex[i] == 2) {
352             if (!use_ancestral_state || ref_is_ancestral_allele) {
353 	      countData[i]->nalt += 1;
354             } else {
355 	      countData[i]->nhom += 1;
356             }
357 	  }
358 	  else{
359 	    std::cerr << "FATAL: unknown genotype index" << std::endl;
360 cerr << populationTarget->genoIndex[i] << endl;
361 cerr << var << endl;
362 	    exit(1);
363 	  }
364 	}
365 	delete populationTarget;
366 
367     }
368 
369     if (!use_ancestral_state) {
370         std::cout << "#sample-id\tn-nocall\tn-hom-ref\tn-het\tn-hom-alt" << std::endl;
371     } else {
372         std::cout << "#sample-id\tn-nocall\tn-hom-ancestral\tn-het\tn-hom-derived" << std::endl;
373     }
374     for(int i = 0; i < countData.size(); i++){
375         std::cout << countDataSampleName[i]
376                   << "\t" << countData[i]->nocall
377                   << "\t" << countData[i]->nhom
378                   << "\t" << countData[i]->nhet
379                   << "\t" << countData[i]->nalt
380                   << std::endl;
381     }
382 
383 
384     return 0;
385 }
386