1 #include "Variant.h"
2 #include "split.h"
3 #include "convert.h"
4 #include <getopt.h>
5 
6 using namespace std;
7 using namespace vcflib;
8 
isTransition(const string & ref,const string & alt)9 bool isTransition(const string& ref, const string& alt) {
10     if (((ref == "A" && alt == "G") || (ref == "G" && alt == "A")) ||
11         ((ref == "C" && alt == "T") || (ref == "T" && alt == "C"))) {
12         return true;
13     } else {
14         return false;
15     }
16 }
17 
isDeamination(const string & ref,const string & alt)18 bool isDeamination(const string& ref, const string& alt) {
19     if ((ref == "G" && alt == "A") ||
20         (ref == "C" && alt == "T")) {
21         return true;
22     } else {
23         return false;
24     }
25 }
26 
isAmination(const string & ref,const string & alt)27 bool isAmination(const string& ref, const string& alt) {
28     if ((ref == "A" && alt == "G") ||
29         (ref == "T" && alt == "C")) {
30         return true;
31     } else {
32         return false;
33     }
34 }
35 
36 class AlleleStats {
37 public:
38     int transitions;
39     int transversions;
40     int deaminations;
41     int aminations;
42     int mismatches;
43     int insertedbases;
44     int insertions;
45     int deletedbases;
46     int deletions;
47     //AlleleStats(int ts, int tv, int da, int am, int mm)
AlleleStats(void)48     AlleleStats(void)
49         : transitions(0)
50         , transversions(0)
51         , deaminations(0)
52         , aminations(0)
53         , mismatches(0)
54         , insertions(0)
55         , insertedbases(0)
56         , deletions(0)
57         , deletedbases(0)
58     { }
59 };
60 
printSummary(char ** argv)61 void printSummary(char** argv) {
62     cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
63          << endl
64          << "    -r, --region          specify a region on which to target the stats, requires a BGZF" << endl
65          << "                          compressed file which has been indexed with tabix.  any number of" << endl
66          << "                          regions may be specified." << endl
67          << "    -a, --add-info        add the statistics intermediate information to the VCF file," << endl
68          << "                          writing out VCF records instead of summary statistics" << endl
69          << "    -t, --add-type        only add the type= field to the info (faster than -a)" << endl
70          << "    -l, --no-length-frequency    don't out the indel and mnp length-frequency spectra" << endl
71          << "    -m, --match-score N          match score for SW algorithm" << endl
72          << "    -x, --mismatch-score N       mismatch score for SW algorithm" << endl
73          << "    -o, --gap-open-penalty N     gap open penalty for SW algorithm" << endl
74          << "    -e, --gap-extend-penalty N   gap extension penalty for SW algorithm" << endl
75          << endl
76          << "Prints statistics about variants in the input VCF file." << endl;
77 }
78 
79 
main(int argc,char ** argv)80 int main(int argc, char** argv) {
81 
82     vector<string> regions;
83     bool addTags = false;
84     bool addType = false;
85     bool lengthFrequency = true;
86 
87     // constants for SmithWaterman algorithm
88     float matchScore = 10.0f;
89     float mismatchScore = -9.0f;
90     float gapOpenPenalty = 15.0f;
91     float gapExtendPenalty = 6.66f;
92 
93     bool useReferenceAlignment = false;
94 
95     int c;
96     while (true) {
97         static struct option long_options[] =
98             {
99                 /* These options set a flag. */
100                 //{"verbose", no_argument,       &verbose_flag, 1},
101                 {"help", no_argument, 0, 'h'},
102                 {"region", required_argument, 0, 'r'},
103                 {"add-info", no_argument, 0, 'a'},
104                 {"add-type", no_argument, 0, 't'},
105                 {"no-length-frequency", no_argument, 0, 'l'},
106                 {"match-score", required_argument, 0, 'm'},
107                 {"mismatch-score", required_argument, 0, 'x'},
108                 {"gap-open-penalty", required_argument, 0, 'o'},
109                 {"gap-extend-penalty", required_argument, 0, 'e'},
110                 //{"length",  no_argument, &printLength, true},
111                 {0, 0, 0, 0}
112             };
113         /* getopt_long stores the option index here. */
114         int option_index = 0;
115 
116         c = getopt_long (argc, argv, "hlatr:m:x:o:e:",
117                          long_options, &option_index);
118 
119         /* Detect the end of the options. */
120         if (c == -1)
121             break;
122 
123         switch (c)
124         {
125         case 0:
126             /* If this option set a flag, do nothing else now. */
127             if (long_options[option_index].flag != 0)
128                 break;
129             printf ("option %s", long_options[option_index].name);
130             if (optarg)
131                 printf (" with arg %s", optarg);
132             printf ("\n");
133             break;
134 
135 	    case 'h':
136             printSummary(argv);
137             exit(0);
138             break;
139 
140 	    case 'r':
141             regions.push_back(optarg);
142             break;
143 
144 	    case 'l':
145             lengthFrequency = false;
146             break;
147 
148 	    case 'a':
149             addTags = true;
150             break;
151 
152 	    case 't':
153             addType = true;
154             break;
155 
156 	    case 'm':
157             matchScore = atof(optarg);
158 	        break;
159 
160 	    case 'x':
161             mismatchScore = atof(optarg);
162 	        break;
163 
164 	    case 'o':
165             gapOpenPenalty = atof(optarg);
166 	        break;
167 
168 	    case 'e':
169             gapExtendPenalty = atof(optarg);
170 	        break;
171 
172 	    default:
173             abort ();
174         }
175     }
176 
177     VariantCallFile variantFile;
178     string inputFilename;
179     if (optind == argc - 1) {
180         inputFilename = argv[optind];
181         variantFile.open(inputFilename);
182     } else {
183         variantFile.open(std::cin);
184     }
185 
186     if (!variantFile.is_open()) {
187         return 1;
188     }
189 
190     if (addType && !addTags) {
191         variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
192         variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
193         cout << variantFile.header << endl;
194     }
195 
196     if (addTags) {
197         variantFile.addHeaderLine("##INFO=<ID=transitions,Number=A,Type=Integer,Description=\"Total number of transitions in the alternate allele\">");
198         variantFile.addHeaderLine("##INFO=<ID=transversions,Number=A,Type=Integer,Description=\"Total number of transversions in the alternate allele\">");
199         variantFile.addHeaderLine("##INFO=<ID=deaminations,Number=A,Type=Integer,Description=\"Total number of deaminations in the alternate allele\">");
200         variantFile.addHeaderLine("##INFO=<ID=aminations,Number=A,Type=Integer,Description=\"Total number of aminations in the alternate allele\">");
201         variantFile.addHeaderLine("##INFO=<ID=mismatches,Number=A,Type=Integer,Description=\"Total number of mismatches in the alternate allele\">");
202         variantFile.addHeaderLine("##INFO=<ID=insertions,Number=A,Type=Integer,Description=\"Total number of inserted bases in the alternate allele\">");
203         variantFile.addHeaderLine("##INFO=<ID=deletions,Number=A,Type=Integer,Description=\"Total number of deleted bases in the alternate allele\">");
204         variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
205         variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
206         variantFile.addHeaderLine("##INFO=<ID=reflen,Number=1,Type=Integer,Description=\"The length of the reference allele\">");
207         variantFile.addHeaderLine("##INFO=<ID=altlen,Number=A,Type=Integer,Description=\"The length of the alternate allele\">");
208         cout << variantFile.header << endl;
209     }
210 
211     Variant var(variantFile);
212 
213     vector<string>::iterator regionItr = regions.begin();
214 
215     int variantAlleles = 0;
216     int uniqueVariantAlleles = 0;
217     int variantSites = 0;
218     int snps = 0;
219     int transitions = 0;
220     int transversions = 0;
221     int deaminations = 0;
222     int aminations = 0;
223     int totalinsertions = 0;
224     int totaldeletions = 0;
225     int insertedbases = 0;
226     int deletedbases = 0;
227     int totalmnps = 0;
228     int totalcomplex = 0;
229     int mismatchbases = 0;
230     int mnpbases = 0;
231     int biallelics = 0;
232     int multiallelics = 0;
233     map<int, int> insertions;
234     map<int, int> deletions;
235     map<int, int> mnps;
236     map<int, int> complexsubs;
237 
238     bool includePreviousBaseForIndels = false;
239     bool useMNPs = true;
240     bool useEntropy = false;
241 
242     AlleleStats biallelicSNPs;
243 
244     // todo, add biallelic snp dialog to output and ts/tv for snps and mnps
245 
246     do {
247 
248         if (!inputFilename.empty() && !regions.empty()) {
249             string regionStr = *regionItr++;
250             variantFile.setRegion(regionStr);
251         }
252 
253         while (variantFile.getNextVariant(var)) {
254             ++variantSites;
255             if (var.alt.size() > 1) {
256                 ++multiallelics;
257             } else {
258                 ++biallelics;
259             }
260             map<string, vector<VariantAllele> > alternates
261 	      = var.parsedAlternates(includePreviousBaseForIndels,
262 				     useMNPs,
263 				     useEntropy,
264 				     matchScore,
265 				     mismatchScore,
266 				     gapOpenPenalty,
267 				     gapExtendPenalty);
268 
269             map<VariantAllele, vector<string> > uniqueVariants;
270 
271             vector<string> cigars;
272 
273             for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
274                 string& alternate = *a;
275                 if (addTags)
276                     var.info["altlen"].push_back(convert(alternate.size()));
277                 vector<VariantAllele>& vav = alternates[alternate];
278                 if (vav.size() > 1) {
279                     // check that there are actually multiple non-reference alleles
280                     int nonRefAlleles = 0;
281                     for (vector<VariantAllele>::iterator z = vav.begin(); z != vav.end(); ++z) {
282                         if (z->ref != z->alt)
283                             ++nonRefAlleles;
284                     }
285                     if (nonRefAlleles > 1)
286                         ++totalcomplex;
287                 }
288                 for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
289                     uniqueVariants[*v].push_back(alternate);
290                 }
291 
292                 if (addTags || addType) {
293                     string cigar;
294                     pair<int, string> element;
295                     for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
296                         VariantAllele& va = *v;
297                         if (va.ref != va.alt) {
298                             if (element.second == "M") {
299                                 cigar += convert(element.first) + element.second;
300                                 element.second = ""; element.first = 0;
301                             }
302                             if (va.ref.size() == va.alt.size()) {
303                                 cigar += convert(va.ref.size()) + "X";
304                             } else if (va.ref.size() > va.alt.size()) {
305                                 cigar += convert(va.ref.size() - va.alt.size()) + "D";
306                             } else {
307                                 cigar += convert(va.alt.size() - va.ref.size()) + "I";
308                             }
309                         } else {
310                             if (element.second == "M") {
311                                 element.first += va.ref.size();
312                             } else {
313                                 element = make_pair(va.ref.size(), "M");
314                             }
315                         }
316                     }
317                     if (element.second == "M") {
318                         cigar += convert(element.first) + element.second;
319                     }
320                     element.second = ""; element.first = 0;
321                     cigars.push_back(cigar);
322                 }
323             }
324 
325             if (addTags) {
326                 var.info["cigar"] = cigars;
327                 var.info["reflen"].push_back(convert(var.ref.size()));
328             } else if (addType) {
329                 var.info["cigar"] = cigars;
330             }
331 
332             variantAlleles += var.alt.size();
333             map<string, AlleleStats> alleleStats;
334 
335             for (map<VariantAllele, vector<string> >::iterator v = uniqueVariants.begin(); v != uniqueVariants.end(); ++v) {
336                 const VariantAllele& va = v->first;
337                 vector<string>& alternates = v->second;
338 
339                 if (!(addTags || addType)) { // don't add any tag information if we're not going to output it
340                     alternates.clear();
341                 }
342 
343                 if (va.ref != va.alt) {
344                     ++uniqueVariantAlleles;
345                     if (va.ref.size() == va.alt.size()) {
346                         if (va.ref.size() == 1) {
347                             ++snps;
348                             ++mismatchbases;
349                             for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
350                                 ++alleleStats[*a].mismatches;
351                             }
352                             if (isTransition(va.ref, va.alt)) {
353                                 ++transitions;
354                                 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
355                                     ++alleleStats[*a].transitions;
356                                 }
357                             } else {
358                                 ++transversions;
359                                 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
360                                     ++alleleStats[*a].transversions;
361                                 }
362                             }
363                             if (isAmination(va.ref, va.alt)) {
364                                 ++aminations;
365                                 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
366                                     ++alleleStats[*a].aminations;
367                                 }
368                             }
369                             if (isDeamination(va.ref, va.alt)) {
370                                 ++deaminations;
371                                 for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
372                                     ++alleleStats[*a].deaminations;
373                                 }
374                             }
375                         } else {
376                             ++totalmnps;
377                             ++mnps[va.alt.size()]; // not entirely correct
378                             for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
379                                 alleleStats[*a].mismatches += va.alt.size();
380                             }
381                             string::const_iterator r = va.ref.begin();
382                             for (string::const_iterator a = va.alt.begin(); a != va.alt.end(); ++a, ++r) {
383                                 string rstr = string(1, *r);
384                                 string astr = string(1, *a);
385                                 if (rstr == astr) {
386                                     continue;
387                                 }
388                                 if (isTransition(rstr, astr)) {
389                                     ++transitions;
390                                     for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
391                                         ++alleleStats[*a].transitions;
392                                     }
393                                 } else {
394                                     ++transversions;
395                                     for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
396                                         ++alleleStats[*a].transversions;
397                                     }
398                                 }
399                                 if (isAmination(rstr, astr)) {
400                                     ++aminations;
401                                     for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
402                                         ++alleleStats[*a].aminations;
403                                     }
404                                 }
405                                 if (isDeamination(rstr, astr)) {
406                                     ++deaminations;
407                                     for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
408                                         ++alleleStats[*a].deaminations;
409                                     }
410                                 }
411                                 ++mismatchbases;
412                                 ++mnpbases;
413                             }
414                         }
415                     } else if (va.ref.size() > va.alt.size()) {
416                         int diff = va.ref.size() - va.alt.size();
417                         deletedbases += diff;
418                         ++totaldeletions;
419                         ++deletions[diff];
420                         for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
421                             alleleStats[*a].deletedbases += diff;
422                             alleleStats[*a].deletions += 1;
423                         }
424                     } else {
425                         int diff = va.alt.size() - va.ref.size();
426                         insertedbases += diff;
427                         ++totalinsertions;
428                         ++insertions[diff];
429                         for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
430                             alleleStats[*a].insertedbases += diff;
431                             alleleStats[*a].insertions += 1;
432                         }
433                     }
434                 }
435             }
436             if (addTags || addType) {
437                 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
438                     string vartype;
439                     if (alleleStats[*a].insertions + alleleStats[*a].deletions == 0) {
440                         if (alleleStats[*a].mismatches == 1) {
441                             vartype = "snp";
442                         } else if (alleleStats[*a].mismatches > 1) {
443                             vartype = "complex";
444                         } else {
445                             vartype = "ref";
446                         }
447                     } else if (alleleStats[*a].insertions + alleleStats[*a].deletions == 1) {
448                         if (alleleStats[*a].insertions == 1) {
449                             vartype = "ins";
450                         } else {
451                             vartype = "del";
452                         }
453                     } else {
454                         vartype = "complex";
455                     }
456                     if (addTags) {
457                         var.info["mismatches"].push_back(convert(alleleStats[*a].mismatches));
458                         var.info["insertions"].push_back(convert(alleleStats[*a].insertions));
459                         var.info["deletions"].push_back(convert(alleleStats[*a].deletions));
460                         var.info["transitions"].push_back(convert(alleleStats[*a].transitions));
461                         var.info["transversions"].push_back(convert(alleleStats[*a].transversions));
462                         var.info["deaminations"].push_back(convert(alleleStats[*a].deaminations));
463                         var.info["aminations"].push_back(convert(alleleStats[*a].aminations));
464                     }
465                     var.info["type"].push_back(vartype);
466                 }
467                 cout << var << endl;
468             }
469             // biallelic SNP case
470             if (var.alt.size() == 1 && var.ref.size() == 1 && var.alt.front().size() == 1) {
471                 if (isTransition(var.ref, var.alt.front())) {
472                     biallelicSNPs.transitions++;
473                 } else {
474                     biallelicSNPs.transversions++;
475                 }
476                 biallelicSNPs.mismatches++;
477             }
478         }
479 
480     } while (regionItr != regions.end());
481 
482 
483     // find the maximum indel size
484     int maxindel = 0;
485     for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) {
486         if (i->first > maxindel) {
487             maxindel = i->first;
488         }
489     }
490     for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) {
491         if (i->first > maxindel) {
492             maxindel = i->first;
493         }
494     }
495 
496     // and maximum mnp
497     int maxmnp = 0;
498     for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) {
499         if (i->first > maxmnp) {
500             maxmnp = i->first;
501         }
502     }
503 
504     // now print the results
505 
506     if (!addTags && !addType) {
507         cout << "total variant sites:\t" << variantSites << endl
508              << "of which " << biallelics << " (" << (double) biallelics / variantSites << ") are biallelic and "
509                             << multiallelics << " (" << (double) multiallelics / variantSites << ") are multiallelic" << endl
510              << "total variant alleles:\t" << variantAlleles << endl
511              << "unique variant alleles:\t" << uniqueVariantAlleles << endl
512              << endl
513              << "snps:\t" << snps << endl
514              << "mnps:\t" << totalmnps << endl
515              << "indels:\t" << totalinsertions + totaldeletions << endl
516              << "complex:\t" << totalcomplex << endl
517              << endl
518              << "mismatches:\t" << mismatchbases << endl
519              << endl
520              << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl
521              << "deamination ratio:\t" << (double) deaminations / aminations << endl
522              << "biallelic snps:\t" << biallelicSNPs.mismatches << " @ "
523              << (double) biallelicSNPs.transitions / (double) biallelicSNPs.transversions << endl;
524 
525         if (lengthFrequency) {
526             cout << endl
527                  << "ins/del length frequency distribution" << endl
528                  << "length\tins\tdel\tins/del" << endl;
529             for (int i = 1; i <= maxindel; ++i) {
530                 int ins = insertions[i];
531                 int del = deletions[i];
532                 cout << i << "\t"
533                      << (ins > 0 ? convert(ins) : "" ) << "\t"
534                      << (del > 0 ? convert(del) : "") << "\t"
535                      << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "")
536                      << endl;
537             }
538         }
539 
540         cout << endl
541              << "insertion alleles / deletion alleles:\t"
542              << (double) totalinsertions / (double) totaldeletions << endl
543              << "inserted bases / deleted bases:\t"
544              << (double) insertedbases / (double) deletedbases << endl
545              << endl;
546 
547         if (lengthFrequency) {
548             cout << "mnp length frequency distribution" << endl
549                  << "length\tcount" << endl;
550             for (int i = 2; i <= maxmnp; ++i) {
551                 int mnp = mnps[i];
552                 cout << i << "\t"
553                      << (mnp > 0 ? convert(mnp) : "")
554                      << endl;
555             }
556         }
557 
558         cout << "total bases in mnps:\t" << mnpbases << endl;
559 
560         /*
561           cout << "complex event frequency distribution" << endl
562           << "length\tcount" << endl;
563           for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) {
564           cout << i->first << "\t" << i->second << endl;
565           }
566         */
567     }
568 
569     return 0;
570 
571 }
572 
573