1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10 #include "Variant.h"
11 #include "split.h"
12 #include "cdflib.hpp"
13 #include "pdflib.hpp"
14 #include "var.hpp"
15
16 #include <string>
17 #include <iostream>
18 #include <math.h>
19 #include <cmath>
20 #include <stdlib.h>
21 #include <time.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include "gpatInfo.hpp"
25
26 using namespace std;
27 using namespace vcflib;
28
29 struct indv{
30 int nhet ;
31 int nhom ;
32 int nalt ;
33 int nocall;
34 };
35
36
37
printHelp(void)38 void printHelp(void){
39
40 cerr << endl << endl;
41 cerr << "INFO: help" << endl;
42 cerr << "INFO: description:" << endl;
43 cerr << "Generates a table of genotype counts. Summarizes genotype counts for bi-allelic SNVs and indel " << endl << endl;
44
45 cerr << "INFO: output: table of genotype counts for each individual." << endl;
46
47
48 cerr << "INFO: usage: genotypeSummmary --type PL --target 0,1,2,3,4,5,6,7 --file my.vcf --snp " << endl;
49 cerr << endl;
50 cerr << "INFO: required: t,target -- a zero based comma separated list of target individuals corresponding to VCF columns " << endl;
51 cerr << "INFO: required: f,file -- proper formatted VCF " << endl;
52 cerr << "INFO: required, y,type -- genotype likelihood format; genotype : GL,PL,GP " << endl;
53 cerr << "INFO: optional, r,region -- a tabix compliant region : chr1:1-1000 or chr1 " << endl;
54 cerr << "INFO: optional, s,snp -- Only count SNPs " << endl;
55 cerr << "INFO: optional, a,ancestral -- describe counts relative to the ancestral allele defined as AA in INFO" << endl;
56 cerr << endl << "Type: statistics" << endl << endl;
57
58 printVersion();
59 }
60
61
bound(double v)62 double bound(double v){
63 if(v <= 0.00001){
64 return 0.00001;
65 }
66 if(v >= 0.99999){
67 return 0.99999;
68 }
69 return v;
70 }
71
loadIndices(map<int,int> & index,string set)72 void loadIndices(map<int, int> & index, string set){
73
74 vector<string> indviduals = split(set, ",");
75
76 vector<string>::iterator it = indviduals.begin();
77
78 for(; it != indviduals.end(); it++){
79 index[ atoi( (*it).c_str() ) ] = 1;
80 }
81 }
82
83
main(int argc,char ** argv)84 int main(int argc, char** argv) {
85
86 bool snp = false;
87
88 // set the random seed for MCMC
89
90 srand((unsigned)time(NULL));
91
92 // the filename
93
94 string filename;
95
96 // open standardout
97
98 // set region to scaffold
99
100 string region = "NA";
101
102 // using vcflib; thanks to Erik Garrison
103
104 VariantCallFile variantFile;
105
106 // zero based index for the target and background indivudals
107
108 map<int, int> it, ib;
109
110 // genotype likelihood format
111
112 string type = "NA";
113
114 // are we polarizing the counts relative to the ancestral allele?
115 bool use_ancestral_state = false;
116 set<char> allowed_ancestral_bases = { 'A', 'T', 'C', 'G' };
117
118 const struct option longopts[] =
119 {
120 {"version" , 0, 0, 'v'},
121 {"help" , 0, 0, 'h'},
122 {"file" , 1, 0, 'f'},
123 {"target" , 1, 0, 't'},
124 {"region" , 1, 0, 'r'},
125 {"type" , 1, 0, 'y'},
126 {"snp" , 0, 0, 's'},
127 {"ancestral" , 0, 0, 'a'},
128 {0,0,0,0}
129 };
130
131 int index;
132 int iarg=0;
133
134 while(iarg != -1)
135 {
136 iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chvsa", longopts, &index);
137
138 switch (iarg)
139 {
140 case 'a':
141 {
142 use_ancestral_state = true;
143 break;
144 }
145 case 's':
146 {
147 snp = true;
148 break;
149 }
150 case 'h':
151 {
152 printHelp();
153 return 0;
154 }
155 case 'v':
156 {
157 printVersion();
158 return 0;
159 }
160 case 't':
161 {
162 loadIndices(it, optarg);
163 cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
164 cerr << "INFO: target ids: " << optarg << endl;
165 break;
166 }
167 case 'b':
168 {
169 loadIndices(ib, optarg);
170 cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
171 cerr << "INFO: background ids: " << optarg << endl;
172 break;
173 }
174 case 'f':
175 {
176 cerr << "INFO: file: " << optarg << endl;
177 filename = optarg;
178 break;
179 }
180 case 'r':
181 {
182 cerr << "INFO: set seqid region to : " << optarg << endl;
183 region = optarg;
184 break;
185 }
186 case 'y':
187 {
188 type = optarg;
189 cerr << "INFO: set genotype likelihood to: " << type << endl;
190 break;
191 }
192 default:
193 break;
194 }
195
196 }
197
198 if(filename.empty()){
199 cerr << "FATAL: failed to specify a file" << endl;
200 printHelp();
201 }
202
203 bool is_open;
204
205 if (filename == "-") {
206
207 is_open=variantFile.open(std::cin);
208
209 } else {
210
211 is_open=variantFile.open(filename);
212
213 }
214
215 if (!is_open) {
216 cerr << "FATAL: could not open file for reading" << endl;
217 printHelp();
218
219 }
220
221
222 if(region != "NA"){
223 if(! variantFile.setRegion(region)){
224 cerr <<"FATAL: unable to set region" << endl;
225 return 1;
226 }
227 }
228
229 if (!variantFile.is_open()) {
230 cerr << "FATAL: could not open VCF for reading" << endl;
231 printHelp();
232 return 1;
233 }
234
235 map<string, int> okayGenotypeLikelihoods;
236 okayGenotypeLikelihoods["PL"] = 1;
237 okayGenotypeLikelihoods["GL"] = 1;
238 okayGenotypeLikelihoods["GP"] = 1;
239 okayGenotypeLikelihoods["GT"] = 1;
240
241 if(type == "NA"){
242 cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
243 printHelp();
244 return 1;
245 }
246 if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
247 cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
248 printHelp();
249 return 1;
250 }
251
252 Variant var(variantFile);
253
254 vector<string> samples = variantFile.sampleNames;
255 int nsamples = samples.size();
256
257 vector<indv *> countData;
258 vector<string > countDataSampleName;
259
260 for ( map<int ,int>::iterator x=it.begin(); x!=it.end(); ++x) {
261
262 countDataSampleName.push_back(samples[x->first] );
263 }
264
265
266 for(int i = 0; i < it.size(); i++){
267 indv * dip = new indv;
268
269 dip->nhet = 0;
270 dip->nhom = 0;
271 dip->nalt = 0;
272 dip->nocall = 0;
273
274 countData.push_back(dip);
275
276 }
277
278
279 while (variantFile.getNextVariant(var)) {
280
281 // biallelic sites naturally
282
283 if(var.alt.size() > 1){
284 continue;
285 }
286 if(snp){
287 bool hit =false;
288
289 for(vector<string>::iterator it = var.alleles.begin(); it != var.alleles.end(); it++){
290 if((*it).size() > 1){
291 hit = true;
292 }
293 }
294 if(hit){
295 continue;
296 }
297 }
298
299 // decide if we can polarize the site if we are using the ancestral allele
300 bool ref_is_ancestral_allele = true;
301 if (use_ancestral_state) {
302 // we need the ancestral allele to decide what to do at this site
303 if (var.info.find("AA") == var.info.end()) continue;
304 string ancestral_allele = var.info["AA"].front();
305 // if we do not have a polarized site with only allowed bases in the ancestral allele, skip it
306 bool allowed = true;
307 for (string::iterator c = ancestral_allele.begin(); c != ancestral_allele.end(); ++c) {
308 if (!allowed_ancestral_bases.count(*c)) {
309 allowed = false;
310 break;
311 }
312 }
313 if (!allowed) continue;
314 ref_is_ancestral_allele = (ancestral_allele == var.ref);
315 }
316
317 vector < map< string, vector<string> > > target, background, total;
318
319 int index = 0;
320
321 for(int nsamp = 0; nsamp < nsamples; nsamp++){
322
323 if(it.find(index) != it.end() ){
324 const map<string, vector<string> >& sample = var.samples[ samples[nsamp]];
325 target.push_back(sample);
326 }
327 index += 1;
328 }
329
330 genotype * populationTarget ;
331
332 if(type == "PL"){
333 populationTarget = new pl();
334 }
335 if(type == "GL"){
336 populationTarget = new gl();
337 }
338 if(type == "GP"){
339 populationTarget = new gp();
340 }
341 if(type == "GT"){
342 populationTarget = new gt();
343 }
344
345 populationTarget->loadPop(target, var.sequenceName, var.position);
346
347 for(int i = 0; i < populationTarget->genoIndex.size() ; i++){
348 if(populationTarget->genoIndex[i] == -1){
349 countData[i]->nocall += 1;
350 }
351 else if (populationTarget->genoIndex[i] == 0) {
352 if (!use_ancestral_state || ref_is_ancestral_allele) {
353 countData[i]->nhom += 1;
354 } else {
355 countData[i]->nalt += 1;
356 }
357 }
358 else if (populationTarget->genoIndex[i] == 1){
359 countData[i]->nhet += 1;
360 }
361 else if (populationTarget->genoIndex[i] == 2) {
362 if (!use_ancestral_state || ref_is_ancestral_allele) {
363 countData[i]->nalt += 1;
364 } else {
365 countData[i]->nhom += 1;
366 }
367 }
368 else{
369 std::cerr << "FATAL: unknown genotype index" << std::endl;
370 cerr << populationTarget->genoIndex[i] << endl;
371 cerr << var << endl;
372 exit(1);
373 }
374 }
375 delete populationTarget;
376
377 }
378
379 if (!use_ancestral_state) {
380 std::cout << "#sample-id\tn-nocall\tn-hom-ref\tn-het\tn-hom-alt" << std::endl;
381 } else {
382 std::cout << "#sample-id\tn-nocall\tn-hom-ancestral\tn-het\tn-hom-derived" << std::endl;
383 }
384 for(int i = 0; i < countData.size(); i++){
385 std::cout << countDataSampleName[i]
386 << "\t" << countData[i]->nocall
387 << "\t" << countData[i]->nhom
388 << "\t" << countData[i]->nhet
389 << "\t" << countData[i]->nalt
390 << std::endl;
391 }
392
393
394 return 0;
395 }
396