1 #include "Variant.h"
2 #include "split.h"
3 #include "Fasta.h"
4 #include <getopt.h>
5 #include <algorithm>
6 #include <numeric>
7
8 using namespace std;
9 using namespace vcflib;
10
printSummary(char ** argv)11 void printSummary(char** argv) {
12 cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
13 << endl
14 << "options:" << endl
15 << " -f, --field Add information about this field in samples to INFO column" << endl
16 << " -i, --info Store the computed statistic in this info field" << endl
17 << " -a, --average Take the mean of samples for field (default)" << endl
18 << " -m, --median Use the median" << endl
19 << " -n, --min Use the min" << endl
20 << " -x, --max Use the max" << endl
21 << endl
22 << "Take annotations given in the per-sample fields and add the mean, median, min, or max" << endl
23 << "to the site-level INFO." << endl
24 << endl;
25 exit(0);
26 }
27
median(vector<double> & v)28 double median(vector<double> &v)
29 {
30 size_t n = v.size() / 2;
31 nth_element(v.begin(), v.begin()+n, v.end());
32 return v[n];
33 }
34
mean(vector<double> & v)35 double mean(vector<double> &v)
36 {
37 double sum = accumulate(v.begin(), v.end(), 0.0);
38 return sum / v.size();
39 }
40
41 enum StatType { MEAN, MEDIAN, MIN, MAX };
42
main(int argc,char ** argv)43 int main(int argc, char** argv) {
44
45 int c;
46 string sampleField;
47 string infoField;
48 StatType statType = MEAN;
49
50 if (argc == 1)
51 printSummary(argv);
52
53 while (true) {
54 static struct option long_options[] =
55 {
56 /* These options set a flag. */
57 {"help", no_argument, 0, 'h'},
58 {"field", required_argument, 0, 'f'},
59 {"info", required_argument, 0, 'i'},
60 {"average", no_argument, 0, 'a'},
61 {"median", no_argument, 0, 'm'},
62 {"min", no_argument, 0, 'n'},
63 {"max", no_argument, 0, 'x'},
64 {0, 0, 0, 0}
65 };
66 /* getopt_long stores the option index here. */
67 int option_index = 0;
68
69 c = getopt_long (argc, argv, "hamnxf:i:",
70 long_options, &option_index);
71
72 /* Detect the end of the options. */
73 if (c == -1)
74 break;
75
76 switch (c)
77 {
78 case 0:
79 /* If this option set a flag, do nothing else now. */
80 if (long_options[option_index].flag != 0)
81 break;
82 printf ("option %s", long_options[option_index].name);
83 if (optarg)
84 printf (" with arg %s", optarg);
85 printf ("\n");
86 break;
87
88 case 'f':
89 sampleField = optarg;
90 break;
91
92 case 'i':
93 infoField = optarg;
94 break;
95
96 case 'a':
97 statType = MEAN;
98 break;
99
100 case 'm':
101 statType = MEDIAN;
102 break;
103
104 case 'n':
105 statType = MIN;
106 break;
107
108 case 'x':
109 statType = MAX;
110 break;
111
112 case 'h':
113 printSummary(argv);
114 exit(0);
115
116 case '?':
117 /* getopt_long already printed an error message. */
118 printSummary(argv);
119 exit(1);
120 break;
121
122 default:
123 abort ();
124 }
125 }
126
127 if (infoField.empty() || sampleField.empty()) {
128 cerr << "Error: both a sample field and an info field are required." << endl;
129 return 1;
130 }
131
132 VariantCallFile variantFile;
133 string inputFilename;
134 if (optind == argc - 1) {
135 inputFilename = argv[optind];
136 variantFile.open(inputFilename);
137 } else {
138 variantFile.open(std::cin);
139 }
140
141 if (!variantFile.is_open()) {
142 return 1;
143 }
144
145 string statTypeStr;
146
147 switch (statType) {
148 case MEAN:
149 statTypeStr = "mean";
150 break;
151 case MEDIAN:
152 statTypeStr = "median";
153 break;
154 case MIN:
155 statTypeStr = "min";
156 break;
157 case MAX:
158 statTypeStr = "max";
159 break;
160 default:
161 cerr << "Error: failure to convert stat type to string" << endl;
162 return 1;
163 break;
164 }
165
166 variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by"+statTypeStr+" of per-sample values of "+sampleField+" \">");
167
168 cout << variantFile.header << endl;
169
170 Variant var(variantFile);
171 while (variantFile.getNextVariant(var)) {
172 vector<double> vals;
173 for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
174 s != var.samples.end(); ++s) {
175 map<string, vector<string> >& sample = s->second;
176 if (sample.find(sampleField) != sample.end()) {
177 double val;
178 string& s = sample[sampleField].front();
179 if (sample[sampleField].size() > 1) {
180 cerr << "Error: cannot handle sample fields with multiple values" << endl;
181 return 1;
182 }
183 convert(s, val);
184 vals.push_back(val);
185 }
186 }
187
188 double result;
189 switch (statType) {
190 case MEAN:
191 result = mean(vals);
192 break;
193 case MEDIAN:
194 result = median(vals);
195 break;
196 case MIN:
197 result = *min_element(vals.begin(), vals.end());
198 break;
199 case MAX:
200 result = *max_element(vals.begin(), vals.end());
201 break;
202 default:
203 cerr << "Error: unrecognized StatType" << endl;
204 return 1;
205 break;
206 }
207
208 var.info[infoField].clear();
209 var.info[infoField].push_back(convert(result));
210
211 cout << var << endl;
212
213 }
214
215 return 0;
216
217 }
218
219