1 #include "Variant.h"
2 #include "split.h"
3 #include "convert.h"
4 #include <getopt.h>
5
6 using namespace std;
7 using namespace vcflib;
8
isTransition(const string & ref,const string & alt)9 bool isTransition(const string& ref, const string& alt) {
10 if (((ref == "A" && alt == "G") || (ref == "G" && alt == "A")) ||
11 ((ref == "C" && alt == "T") || (ref == "T" && alt == "C"))) {
12 return true;
13 } else {
14 return false;
15 }
16 }
17
isDeamination(const string & ref,const string & alt)18 bool isDeamination(const string& ref, const string& alt) {
19 if ((ref == "G" && alt == "A") ||
20 (ref == "C" && alt == "T")) {
21 return true;
22 } else {
23 return false;
24 }
25 }
26
isAmination(const string & ref,const string & alt)27 bool isAmination(const string& ref, const string& alt) {
28 if ((ref == "A" && alt == "G") ||
29 (ref == "T" && alt == "C")) {
30 return true;
31 } else {
32 return false;
33 }
34 }
35
36 class AlleleStats {
37 public:
38 int transitions;
39 int transversions;
40 int deaminations;
41 int aminations;
42 int mismatches;
43 int insertedbases;
44 int insertions;
45 int deletedbases;
46 int deletions;
47 //AlleleStats(int ts, int tv, int da, int am, int mm)
AlleleStats(void)48 AlleleStats(void)
49 : transitions(0)
50 , transversions(0)
51 , deaminations(0)
52 , aminations(0)
53 , mismatches(0)
54 , insertions(0)
55 , insertedbases(0)
56 , deletions(0)
57 , deletedbases(0)
58 { }
59 };
60
printSummary(char ** argv)61 void printSummary(char** argv) {
62 cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
63 << endl
64 << " -r, --region specify a region on which to target the stats, requires a BGZF" << endl
65 << " compressed file which has been indexed with tabix. any number of" << endl
66 << " regions may be specified." << endl
67 << " -a, --add-info add the statistics intermediate information to the VCF file," << endl
68 << " writing out VCF records instead of summary statistics" << endl
69 << " -t, --add-type only add the type= field to the info (faster than -a)" << endl
70 << " -l, --no-length-frequency don't out the indel and mnp length-frequency spectra" << endl
71 << " -m, --match-score N match score for SW algorithm" << endl
72 << " -x, --mismatch-score N mismatch score for SW algorithm" << endl
73 << " -o, --gap-open-penalty N gap open penalty for SW algorithm" << endl
74 << " -e, --gap-extend-penalty N gap extension penalty for SW algorithm" << endl
75 << endl
76 << "Prints statistics about variants in the input VCF file." << endl;
77 }
78
79
main(int argc,char ** argv)80 int main(int argc, char** argv) {
81
82 vector<string> regions;
83 bool addTags = false;
84 bool addType = false;
85 bool lengthFrequency = true;
86
87 // constants for SmithWaterman algorithm
88 float matchScore = 10.0f;
89 float mismatchScore = -9.0f;
90 float gapOpenPenalty = 15.0f;
91 float gapExtendPenalty = 6.66f;
92
93 bool useReferenceAlignment = false;
94
95 int c;
96 while (true) {
97 static struct option long_options[] =
98 {
99 /* These options set a flag. */
100 //{"verbose", no_argument, &verbose_flag, 1},
101 {"help", no_argument, 0, 'h'},
102 {"region", required_argument, 0, 'r'},
103 {"add-info", no_argument, 0, 'a'},
104 {"add-type", no_argument, 0, 't'},
105 {"no-length-frequency", no_argument, 0, 'l'},
106 {"match-score", required_argument, 0, 'm'},
107 {"mismatch-score", required_argument, 0, 'x'},
108 {"gap-open-penalty", required_argument, 0, 'o'},
109 {"gap-extend-penalty", required_argument, 0, 'e'},
110 //{"length", no_argument, &printLength, true},
111 {0, 0, 0, 0}
112 };
113 /* getopt_long stores the option index here. */
114 int option_index = 0;
115
116 c = getopt_long (argc, argv, "hlatr:m:x:o:e:",
117 long_options, &option_index);
118
119 /* Detect the end of the options. */
120 if (c == -1)
121 break;
122
123 switch (c)
124 {
125 case 0:
126 /* If this option set a flag, do nothing else now. */
127 if (long_options[option_index].flag != 0)
128 break;
129 printf ("option %s", long_options[option_index].name);
130 if (optarg)
131 printf (" with arg %s", optarg);
132 printf ("\n");
133 break;
134
135 case 'h':
136 printSummary(argv);
137 exit(0);
138 break;
139
140 case 'r':
141 regions.push_back(optarg);
142 break;
143
144 case 'l':
145 lengthFrequency = false;
146 break;
147
148 case 'a':
149 addTags = true;
150 break;
151
152 case 't':
153 addType = true;
154 break;
155
156 case 'm':
157 matchScore = atof(optarg);
158 break;
159
160 case 'x':
161 mismatchScore = atof(optarg);
162 break;
163
164 case 'o':
165 gapOpenPenalty = atof(optarg);
166 break;
167
168 case 'e':
169 gapExtendPenalty = atof(optarg);
170 break;
171
172 default:
173 abort ();
174 }
175 }
176
177 VariantCallFile variantFile;
178 string inputFilename;
179 if (optind == argc - 1) {
180 inputFilename = argv[optind];
181 variantFile.open(inputFilename);
182 } else {
183 variantFile.open(std::cin);
184 }
185
186 if (!variantFile.is_open()) {
187 return 1;
188 }
189
190 if (addType && !addTags) {
191 variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
192 variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
193 cout << variantFile.header << endl;
194 }
195
196 if (addTags) {
197 variantFile.addHeaderLine("##INFO=<ID=transitions,Number=A,Type=Integer,Description=\"Total number of transitions in the alternate allele\">");
198 variantFile.addHeaderLine("##INFO=<ID=transversions,Number=A,Type=Integer,Description=\"Total number of transversions in the alternate allele\">");
199 variantFile.addHeaderLine("##INFO=<ID=deaminations,Number=A,Type=Integer,Description=\"Total number of deaminations in the alternate allele\">");
200 variantFile.addHeaderLine("##INFO=<ID=aminations,Number=A,Type=Integer,Description=\"Total number of aminations in the alternate allele\">");
201 variantFile.addHeaderLine("##INFO=<ID=mismatches,Number=A,Type=Integer,Description=\"Total number of mismatches in the alternate allele\">");
202 variantFile.addHeaderLine("##INFO=<ID=insertions,Number=A,Type=Integer,Description=\"Total number of inserted bases in the alternate allele\">");
203 variantFile.addHeaderLine("##INFO=<ID=deletions,Number=A,Type=Integer,Description=\"Total number of deleted bases in the alternate allele\">");
204 variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
205 variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
206 variantFile.addHeaderLine("##INFO=<ID=reflen,Number=1,Type=Integer,Description=\"The length of the reference allele\">");
207 variantFile.addHeaderLine("##INFO=<ID=altlen,Number=A,Type=Integer,Description=\"The length of the alternate allele\">");
208 cout << variantFile.header << endl;
209 }
210
211 Variant var(variantFile);
212
213 vector<string>::iterator regionItr = regions.begin();
214
215 int variantAlleles = 0;
216 int uniqueVariantAlleles = 0;
217 int variantSites = 0;
218 int snps = 0;
219 int transitions = 0;
220 int transversions = 0;
221 int deaminations = 0;
222 int aminations = 0;
223 int totalinsertions = 0;
224 int totaldeletions = 0;
225 int insertedbases = 0;
226 int deletedbases = 0;
227 int totalmnps = 0;
228 int totalcomplex = 0;
229 int mismatchbases = 0;
230 int mnpbases = 0;
231 int biallelics = 0;
232 int multiallelics = 0;
233 map<int, int> insertions;
234 map<int, int> deletions;
235 map<int, int> mnps;
236 map<int, int> complexsubs;
237
238 bool includePreviousBaseForIndels = false;
239 bool useMNPs = true;
240 bool useEntropy = false;
241
242 AlleleStats biallelicSNPs;
243
244 // todo, add biallelic snp dialog to output and ts/tv for snps and mnps
245
246 do {
247
248 if (!inputFilename.empty() && !regions.empty()) {
249 string regionStr = *regionItr++;
250 variantFile.setRegion(regionStr);
251 }
252
253 while (variantFile.getNextVariant(var)) {
254 ++variantSites;
255 if (var.alt.size() > 1) {
256 ++multiallelics;
257 } else {
258 ++biallelics;
259 }
260 map<string, vector<VariantAllele> > alternates
261 = var.parsedAlternates(includePreviousBaseForIndels,
262 useMNPs,
263 useEntropy,
264 matchScore,
265 mismatchScore,
266 gapOpenPenalty,
267 gapExtendPenalty);
268
269 map<VariantAllele, vector<string> > uniqueVariants;
270
271 vector<string> cigars;
272
273 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
274 string& alternate = *a;
275 if (addTags)
276 var.info["altlen"].push_back(convert(alternate.size()));
277 vector<VariantAllele>& vav = alternates[alternate];
278 if (vav.size() > 1) {
279 // check that there are actually multiple non-reference alleles
280 int nonRefAlleles = 0;
281 for (vector<VariantAllele>::iterator z = vav.begin(); z != vav.end(); ++z) {
282 if (z->ref != z->alt)
283 ++nonRefAlleles;
284 }
285 if (nonRefAlleles > 1)
286 ++totalcomplex;
287 }
288 for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
289 uniqueVariants[*v].push_back(alternate);
290 }
291
292 if (addTags || addType) {
293 string cigar;
294 pair<int, string> element;
295 for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
296 VariantAllele& va = *v;
297 if (va.ref != va.alt) {
298 if (element.second == "M") {
299 cigar += convert(element.first) + element.second;
300 element.second = ""; element.first = 0;
301 }
302 if (va.ref.size() == va.alt.size()) {
303 cigar += convert(va.ref.size()) + "X";
304 } else if (va.ref.size() > va.alt.size()) {
305 cigar += convert(va.ref.size() - va.alt.size()) + "D";
306 } else {
307 cigar += convert(va.alt.size() - va.ref.size()) + "I";
308 }
309 } else {
310 if (element.second == "M") {
311 element.first += va.ref.size();
312 } else {
313 element = make_pair(va.ref.size(), "M");
314 }
315 }
316 }
317 if (element.second == "M") {
318 cigar += convert(element.first) + element.second;
319 }
320 element.second = ""; element.first = 0;
321 cigars.push_back(cigar);
322 }
323 }
324
325 if (addTags) {
326 var.info["cigar"] = cigars;
327 var.info["reflen"].push_back(convert(var.ref.size()));
328 } else if (addType) {
329 var.info["cigar"] = cigars;
330 }
331
332 variantAlleles += var.alt.size();
333 map<string, AlleleStats> alleleStats;
334
335 for (map<VariantAllele, vector<string> >::iterator v = uniqueVariants.begin(); v != uniqueVariants.end(); ++v) {
336 const VariantAllele& va = v->first;
337 vector<string>& alternates = v->second;
338
339 if (!(addTags || addType)) { // don't add any tag information if we're not going to output it
340 alternates.clear();
341 }
342
343 if (va.ref != va.alt) {
344 ++uniqueVariantAlleles;
345 if (va.ref.size() == va.alt.size()) {
346 if (va.ref.size() == 1) {
347 ++snps;
348 ++mismatchbases;
349 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
350 ++alleleStats[*a].mismatches;
351 }
352 if (isTransition(va.ref, va.alt)) {
353 ++transitions;
354 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
355 ++alleleStats[*a].transitions;
356 }
357 } else {
358 ++transversions;
359 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
360 ++alleleStats[*a].transversions;
361 }
362 }
363 if (isAmination(va.ref, va.alt)) {
364 ++aminations;
365 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
366 ++alleleStats[*a].aminations;
367 }
368 }
369 if (isDeamination(va.ref, va.alt)) {
370 ++deaminations;
371 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
372 ++alleleStats[*a].deaminations;
373 }
374 }
375 } else {
376 ++totalmnps;
377 ++mnps[va.alt.size()]; // not entirely correct
378 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
379 alleleStats[*a].mismatches += va.alt.size();
380 }
381 string::const_iterator r = va.ref.begin();
382 for (string::const_iterator a = va.alt.begin(); a != va.alt.end(); ++a, ++r) {
383 string rstr = string(1, *r);
384 string astr = string(1, *a);
385 if (rstr == astr) {
386 continue;
387 }
388 if (isTransition(rstr, astr)) {
389 ++transitions;
390 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
391 ++alleleStats[*a].transitions;
392 }
393 } else {
394 ++transversions;
395 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
396 ++alleleStats[*a].transversions;
397 }
398 }
399 if (isAmination(rstr, astr)) {
400 ++aminations;
401 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
402 ++alleleStats[*a].aminations;
403 }
404 }
405 if (isDeamination(rstr, astr)) {
406 ++deaminations;
407 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
408 ++alleleStats[*a].deaminations;
409 }
410 }
411 ++mismatchbases;
412 ++mnpbases;
413 }
414 }
415 } else if (va.ref.size() > va.alt.size()) {
416 int diff = va.ref.size() - va.alt.size();
417 deletedbases += diff;
418 ++totaldeletions;
419 ++deletions[diff];
420 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
421 alleleStats[*a].deletedbases += diff;
422 alleleStats[*a].deletions += 1;
423 }
424 } else {
425 int diff = va.alt.size() - va.ref.size();
426 insertedbases += diff;
427 ++totalinsertions;
428 ++insertions[diff];
429 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
430 alleleStats[*a].insertedbases += diff;
431 alleleStats[*a].insertions += 1;
432 }
433 }
434 }
435 }
436 if (addTags || addType) {
437 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
438 string vartype;
439 if (alleleStats[*a].insertions + alleleStats[*a].deletions == 0) {
440 if (alleleStats[*a].mismatches == 1) {
441 vartype = "snp";
442 } else if (alleleStats[*a].mismatches > 1) {
443 vartype = "complex";
444 } else {
445 vartype = "ref";
446 }
447 } else if (alleleStats[*a].insertions + alleleStats[*a].deletions == 1) {
448 if (alleleStats[*a].insertions == 1) {
449 vartype = "ins";
450 } else {
451 vartype = "del";
452 }
453 } else {
454 vartype = "complex";
455 }
456 if (addTags) {
457 var.info["mismatches"].push_back(convert(alleleStats[*a].mismatches));
458 var.info["insertions"].push_back(convert(alleleStats[*a].insertions));
459 var.info["deletions"].push_back(convert(alleleStats[*a].deletions));
460 var.info["transitions"].push_back(convert(alleleStats[*a].transitions));
461 var.info["transversions"].push_back(convert(alleleStats[*a].transversions));
462 var.info["deaminations"].push_back(convert(alleleStats[*a].deaminations));
463 var.info["aminations"].push_back(convert(alleleStats[*a].aminations));
464 }
465 var.info["type"].push_back(vartype);
466 }
467 cout << var << endl;
468 }
469 // biallelic SNP case
470 if (var.alt.size() == 1 && var.ref.size() == 1 && var.alt.front().size() == 1) {
471 if (isTransition(var.ref, var.alt.front())) {
472 biallelicSNPs.transitions++;
473 } else {
474 biallelicSNPs.transversions++;
475 }
476 biallelicSNPs.mismatches++;
477 }
478 }
479
480 } while (regionItr != regions.end());
481
482
483 // find the maximum indel size
484 int maxindel = 0;
485 for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) {
486 if (i->first > maxindel) {
487 maxindel = i->first;
488 }
489 }
490 for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) {
491 if (i->first > maxindel) {
492 maxindel = i->first;
493 }
494 }
495
496 // and maximum mnp
497 int maxmnp = 0;
498 for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) {
499 if (i->first > maxmnp) {
500 maxmnp = i->first;
501 }
502 }
503
504 // now print the results
505
506 if (!addTags && !addType) {
507 cout << "total variant sites:\t" << variantSites << endl
508 << "of which " << biallelics << " (" << (double) biallelics / variantSites << ") are biallelic and "
509 << multiallelics << " (" << (double) multiallelics / variantSites << ") are multiallelic" << endl
510 << "total variant alleles:\t" << variantAlleles << endl
511 << "unique variant alleles:\t" << uniqueVariantAlleles << endl
512 << endl
513 << "snps:\t" << snps << endl
514 << "mnps:\t" << totalmnps << endl
515 << "indels:\t" << totalinsertions + totaldeletions << endl
516 << "complex:\t" << totalcomplex << endl
517 << endl
518 << "mismatches:\t" << mismatchbases << endl
519 << endl
520 << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl
521 << "deamination ratio:\t" << (double) deaminations / aminations << endl
522 << "biallelic snps:\t" << biallelicSNPs.mismatches << " @ "
523 << (double) biallelicSNPs.transitions / (double) biallelicSNPs.transversions << endl;
524
525 if (lengthFrequency) {
526 cout << endl
527 << "ins/del length frequency distribution" << endl
528 << "length\tins\tdel\tins/del" << endl;
529 for (int i = 1; i <= maxindel; ++i) {
530 int ins = insertions[i];
531 int del = deletions[i];
532 cout << i << "\t"
533 << (ins > 0 ? convert(ins) : "" ) << "\t"
534 << (del > 0 ? convert(del) : "") << "\t"
535 << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "")
536 << endl;
537 }
538 }
539
540 cout << endl
541 << "insertion alleles / deletion alleles:\t"
542 << (double) totalinsertions / (double) totaldeletions << endl
543 << "inserted bases / deleted bases:\t"
544 << (double) insertedbases / (double) deletedbases << endl
545 << endl;
546
547 if (lengthFrequency) {
548 cout << "mnp length frequency distribution" << endl
549 << "length\tcount" << endl;
550 for (int i = 2; i <= maxmnp; ++i) {
551 int mnp = mnps[i];
552 cout << i << "\t"
553 << (mnp > 0 ? convert(mnp) : "")
554 << endl;
555 }
556 }
557
558 cout << "total bases in mnps:\t" << mnpbases << endl;
559
560 /*
561 cout << "complex event frequency distribution" << endl
562 << "length\tcount" << endl;
563 for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) {
564 cout << i->first << "\t" << i->second << endl;
565 }
566 */
567 }
568
569 return 0;
570
571 }
572
573