1 #include "Variant.h"
2 #include "split.h"
3 #include "cdflib.hpp"
4 #include "pdflib.hpp"
5 #include "var.hpp"
6
7 #include <string>
8 #include <iostream>
9 #include <math.h>
10 #include <cmath>
11 #include <stdlib.h>
12 #include <time.h>
13 #include <stdio.h>
14 #include <getopt.h>
15 #include "gpatInfo.hpp"
16
17 using namespace std;
18 using namespace vcflib;
19
20 struct indv{
21 int nhet ;
22 int nhom ;
23 int nalt ;
24 int nocall;
25 };
26
27
28
printHelp(void)29 void printHelp(void){
30
31 cerr << endl << endl;
32 cerr << "INFO: help" << endl;
33 cerr << "INFO: description:" << endl;
34 cerr << " Summarizes genotype counts for bi-allelic SNVs and indel " << endl;
35
36 cerr << "INFO: output: table of genotype counts for each individual." << endl;
37
38
39 cerr << "INFO: usage: genotypeSummmary --type PL --target 0,1,2,3,4,5,6,7 --file my.vcf --snp " << endl;
40 cerr << endl;
41 cerr << "INFO: required: t,target -- a zero based comma separated list of target individuals corresponding to VCF columns " << endl;
42 cerr << "INFO: required: f,file -- proper formatted VCF " << endl;
43 cerr << "INFO: required, y,type -- genotype likelihood format; genotype : GL,PL,GP " << endl;
44 cerr << "INFO: optional, r,region -- a tabix compliant region : chr1:1-1000 or chr1 " << endl;
45 cerr << "INFO: optional, s,snp -- Only count SNPs " << endl;
46 cerr << "INFO: optional, a,ancestral -- describe counts relative to the ancestral allele defined as AA in INFO" << endl;
47
48 printVersion();
49 }
50
51
bound(double v)52 double bound(double v){
53 if(v <= 0.00001){
54 return 0.00001;
55 }
56 if(v >= 0.99999){
57 return 0.99999;
58 }
59 return v;
60 }
61
loadIndices(map<int,int> & index,string set)62 void loadIndices(map<int, int> & index, string set){
63
64 vector<string> indviduals = split(set, ",");
65
66 vector<string>::iterator it = indviduals.begin();
67
68 for(; it != indviduals.end(); it++){
69 index[ atoi( (*it).c_str() ) ] = 1;
70 }
71 }
72
73
main(int argc,char ** argv)74 int main(int argc, char** argv) {
75
76 bool snp = false;
77
78 // set the random seed for MCMC
79
80 srand((unsigned)time(NULL));
81
82 // the filename
83
84 string filename;
85
86 // open standardout
87
88 // set region to scaffold
89
90 string region = "NA";
91
92 // using vcflib; thanks to Erik Garrison
93
94 VariantCallFile variantFile;
95
96 // zero based index for the target and background indivudals
97
98 map<int, int> it, ib;
99
100 // genotype likelihood format
101
102 string type = "NA";
103
104 // are we polarizing the counts relative to the ancestral allele?
105 bool use_ancestral_state = false;
106 set<char> allowed_ancestral_bases = { 'A', 'T', 'C', 'G' };
107
108 const struct option longopts[] =
109 {
110 {"version" , 0, 0, 'v'},
111 {"help" , 0, 0, 'h'},
112 {"file" , 1, 0, 'f'},
113 {"target" , 1, 0, 't'},
114 {"region" , 1, 0, 'r'},
115 {"type" , 1, 0, 'y'},
116 {"snp" , 0, 0, 's'},
117 {"ancestral" , 0, 0, 'a'},
118 {0,0,0,0}
119 };
120
121 int index;
122 int iarg=0;
123
124 while(iarg != -1)
125 {
126 iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chvsa", longopts, &index);
127
128 switch (iarg)
129 {
130 case 'a':
131 {
132 use_ancestral_state = true;
133 break;
134 }
135 case 's':
136 {
137 snp = true;
138 break;
139 }
140 case 'h':
141 {
142 printHelp();
143 return 0;
144 }
145 case 'v':
146 {
147 printVersion();
148 return 0;
149 }
150 case 't':
151 {
152 loadIndices(it, optarg);
153 cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
154 cerr << "INFO: target ids: " << optarg << endl;
155 break;
156 }
157 case 'b':
158 {
159 loadIndices(ib, optarg);
160 cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
161 cerr << "INFO: background ids: " << optarg << endl;
162 break;
163 }
164 case 'f':
165 {
166 cerr << "INFO: file: " << optarg << endl;
167 filename = optarg;
168 break;
169 }
170 case 'r':
171 {
172 cerr << "INFO: set seqid region to : " << optarg << endl;
173 region = optarg;
174 break;
175 }
176 case 'y':
177 {
178 type = optarg;
179 cerr << "INFO: set genotype likelihood to: " << type << endl;
180 break;
181 }
182 default:
183 break;
184 }
185
186 }
187
188 if(filename.empty()){
189 cerr << "FATAL: failed to specify a file" << endl;
190 printHelp();
191 }
192
193 bool is_open;
194
195 if (filename == "-") {
196
197 is_open=variantFile.open(std::cin);
198
199 } else {
200
201 is_open=variantFile.open(filename);
202
203 }
204
205 if (!is_open) {
206 cerr << "FATAL: could not open file for reading" << endl;
207 printHelp();
208
209 }
210
211
212 if(region != "NA"){
213 if(! variantFile.setRegion(region)){
214 cerr <<"FATAL: unable to set region" << endl;
215 return 1;
216 }
217 }
218
219 if (!variantFile.is_open()) {
220 cerr << "FATAL: could not open VCF for reading" << endl;
221 printHelp();
222 return 1;
223 }
224
225 map<string, int> okayGenotypeLikelihoods;
226 okayGenotypeLikelihoods["PL"] = 1;
227 okayGenotypeLikelihoods["GL"] = 1;
228 okayGenotypeLikelihoods["GP"] = 1;
229 okayGenotypeLikelihoods["GT"] = 1;
230
231 if(type == "NA"){
232 cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
233 printHelp();
234 return 1;
235 }
236 if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
237 cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
238 printHelp();
239 return 1;
240 }
241
242 Variant var(variantFile);
243
244 vector<string> samples = variantFile.sampleNames;
245 int nsamples = samples.size();
246
247 vector<indv *> countData;
248 vector<string > countDataSampleName;
249
250 for ( map<int ,int>::iterator x=it.begin(); x!=it.end(); ++x) {
251
252 countDataSampleName.push_back(samples[x->first] );
253 }
254
255
256 for(int i = 0; i < it.size(); i++){
257 indv * dip = new indv;
258
259 dip->nhet = 0;
260 dip->nhom = 0;
261 dip->nalt = 0;
262 dip->nocall = 0;
263
264 countData.push_back(dip);
265
266 }
267
268
269 while (variantFile.getNextVariant(var)) {
270
271 // biallelic sites naturally
272
273 if(var.alt.size() > 1){
274 continue;
275 }
276 if(snp){
277 bool hit =false;
278
279 for(vector<string>::iterator it = var.alleles.begin(); it != var.alleles.end(); it++){
280 if((*it).size() > 1){
281 hit = true;
282 }
283 }
284 if(hit){
285 continue;
286 }
287 }
288
289 // decide if we can polarize the site if we are using the ancestral allele
290 bool ref_is_ancestral_allele = true;
291 if (use_ancestral_state) {
292 // we need the ancestral allele to decide what to do at this site
293 if (var.info.find("AA") == var.info.end()) continue;
294 string ancestral_allele = var.info["AA"].front();
295 // if we do not have a polarized site with only allowed bases in the ancestral allele, skip it
296 bool allowed = true;
297 for (string::iterator c = ancestral_allele.begin(); c != ancestral_allele.end(); ++c) {
298 if (!allowed_ancestral_bases.count(*c)) {
299 allowed = false;
300 break;
301 }
302 }
303 if (!allowed) continue;
304 ref_is_ancestral_allele = (ancestral_allele == var.ref);
305 }
306
307 vector < map< string, vector<string> > > target, background, total;
308
309 int index = 0;
310
311 for(int nsamp = 0; nsamp < nsamples; nsamp++){
312
313 if(it.find(index) != it.end() ){
314 const map<string, vector<string> >& sample = var.samples[ samples[nsamp]];
315 target.push_back(sample);
316 }
317 index += 1;
318 }
319
320 genotype * populationTarget ;
321
322 if(type == "PL"){
323 populationTarget = new pl();
324 }
325 if(type == "GL"){
326 populationTarget = new gl();
327 }
328 if(type == "GP"){
329 populationTarget = new gp();
330 }
331 if(type == "GT"){
332 populationTarget = new gt();
333 }
334
335 populationTarget->loadPop(target, var.sequenceName, var.position);
336
337 for(int i = 0; i < populationTarget->genoIndex.size() ; i++){
338 if(populationTarget->genoIndex[i] == -1){
339 countData[i]->nocall += 1;
340 }
341 else if (populationTarget->genoIndex[i] == 0) {
342 if (!use_ancestral_state || ref_is_ancestral_allele) {
343 countData[i]->nhom += 1;
344 } else {
345 countData[i]->nalt += 1;
346 }
347 }
348 else if (populationTarget->genoIndex[i] == 1){
349 countData[i]->nhet += 1;
350 }
351 else if (populationTarget->genoIndex[i] == 2) {
352 if (!use_ancestral_state || ref_is_ancestral_allele) {
353 countData[i]->nalt += 1;
354 } else {
355 countData[i]->nhom += 1;
356 }
357 }
358 else{
359 std::cerr << "FATAL: unknown genotype index" << std::endl;
360 cerr << populationTarget->genoIndex[i] << endl;
361 cerr << var << endl;
362 exit(1);
363 }
364 }
365 delete populationTarget;
366
367 }
368
369 if (!use_ancestral_state) {
370 std::cout << "#sample-id\tn-nocall\tn-hom-ref\tn-het\tn-hom-alt" << std::endl;
371 } else {
372 std::cout << "#sample-id\tn-nocall\tn-hom-ancestral\tn-het\tn-hom-derived" << std::endl;
373 }
374 for(int i = 0; i < countData.size(); i++){
375 std::cout << countDataSampleName[i]
376 << "\t" << countData[i]->nocall
377 << "\t" << countData[i]->nhom
378 << "\t" << countData[i]->nhet
379 << "\t" << countData[i]->nalt
380 << std::endl;
381 }
382
383
384 return 0;
385 }
386