1 #include "Variant.h"
2 #include "split.h"
3 #include "Fasta.h"
4 #include "gpatInfo.hpp"
5 #include <getopt.h>
6 #include <algorithm>
7 #include <numeric>
8
9 using namespace std;
10 using namespace vcflib;
11
printSummary(char ** argv)12 void printSummary(char** argv) {
13 cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
14 << endl
15 << "options:" << endl
16 << " -f, --field Summarize this field in the INFO column" << endl
17 << " -i, --info Store the computed statistic in this info field" << endl
18 << " -a, --average Take the mean for field (default)" << endl
19 << " -m, --median Use the median" << endl
20 << " -n, --min Use the min" << endl
21 << " -x, --max Use the max" << endl
22 << " -h, --help Print this message" << endl
23 << " -v, --version Print version" << endl
24 << endl
25 << "Take annotations given in the per-sample fields and add the mean, median, min, or max" << endl
26 << "to the site-level INFO." << endl
27 << endl;
28 exit(0);
29 }
30
median(vector<double> & v)31 double median(vector<double> &v)
32 {
33 size_t n = v.size() / 2;
34 nth_element(v.begin(), v.begin()+n, v.end());
35 return v[n];
36 }
37
mean(vector<double> & v)38 double mean(vector<double> &v)
39 {
40 double sum = accumulate(v.begin(), v.end(), 0.0);
41 return sum / v.size();
42 }
43
44 enum StatType { MEAN, MEDIAN, MIN, MAX };
45
main(int argc,char ** argv)46 int main(int argc, char** argv) {
47
48 int c;
49 string sitewideField;
50 string infoField;
51 StatType statType = MEAN;
52
53 if (argc == 1)
54 printSummary(argv);
55
56 while (true) {
57 static struct option long_options[] =
58 {
59 /* These options set a flag. */
60 {"help", no_argument, 0, 'h'},
61 {"field", required_argument, 0, 'f'},
62 {"info", required_argument, 0, 'i'},
63 {"average", no_argument, 0, 'a'},
64 {"median", no_argument, 0, 'm'},
65 {"min", no_argument, 0, 'n'},
66 {"max", no_argument, 0, 'x'},
67 {"max", no_argument, 0, 'v'},
68 {0, 0, 0, 0}
69 };
70 /* getopt_long stores the option index here. */
71 int option_index = 0;
72
73 c = getopt_long (argc, argv, "hamnxfv:i:",
74 long_options, &option_index);
75
76 /* Detect the end of the options. */
77 if (c == -1)
78 break;
79
80 switch (c)
81 {
82 case 0:
83 /* If this option set a flag, do nothing else now. */
84 if (long_options[option_index].flag != 0)
85 break;
86 printf ("option %s", long_options[option_index].name);
87 if (optarg)
88 printf (" with arg %s", optarg);
89 printf ("\n");
90 break;
91
92 case 'v':
93 {
94 printBasicVersion();
95 exit(0);
96 }
97 case 'f':
98 {
99 sitewideField = optarg;
100 break;
101 }
102 case 'i':
103 {
104 infoField = optarg;
105 break;
106 }
107 case 'a':
108 {
109 statType = MEAN;
110 break;
111 }
112 case 'm':
113 {
114 statType = MEDIAN;
115 break;
116 }
117 case 'n':
118 {
119 statType = MIN;
120 break;
121 }
122 case 'x':
123 {
124 statType = MAX;
125 break;
126 }
127 case 'h':
128 {
129 printSummary(argv);
130 exit(0);
131 }
132 case '?':
133 /* getopt_long already printed an error message. */
134 printSummary(argv);
135 exit(1);
136 break;
137
138 default:
139 abort ();
140 }
141 }
142
143 if (infoField.empty() || sitewideField.empty()) {
144 cerr << "Error: both a sample field and an info field are required." << endl;
145 return 1;
146 }
147
148 VariantCallFile variantFile;
149 string inputFilename;
150 if (optind == argc - 1) {
151 inputFilename = argv[optind];
152 variantFile.open(inputFilename);
153 } else {
154 variantFile.open(std::cin);
155 }
156
157 if (!variantFile.is_open()) {
158 return 1;
159 }
160
161 string statTypeStr;
162
163 switch (statType) {
164 case MEAN:
165 statTypeStr = "mean";
166 break;
167 case MEDIAN:
168 statTypeStr = "median";
169 break;
170 case MIN:
171 statTypeStr = "min";
172 break;
173 case MAX:
174 statTypeStr = "max";
175 break;
176 default:
177 cerr << "Error: failure to convert stat type to string" << endl;
178 return 1;
179 break;
180 }
181
182 variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by"+statTypeStr+" of site-wide values of "+sitewideField+" \">");
183
184 cout << variantFile.header << endl;
185
186 Variant var(variantFile);
187 while (variantFile.getNextVariant(var)) {
188 vector<double> vals;
189 map<string, vector<string> >::iterator i = var.info.find(sitewideField);
190 if (i != var.info.end()) {
191 for (vector<string>::iterator s = i->second.begin(); s != i->second.end(); ++s) {
192 double d;
193 convert(*s, d);
194 vals.push_back(d);
195 }
196 }
197
198 double result;
199 switch (statType) {
200 case MEAN:
201 result = mean(vals);
202 break;
203 case MEDIAN:
204 result = median(vals);
205 break;
206 case MIN:
207 result = *min_element(vals.begin(), vals.end());
208 break;
209 case MAX:
210 result = *max_element(vals.begin(), vals.end());
211 break;
212 default:
213 cerr << "Error: unrecognized StatType" << endl;
214 return 1;
215 break;
216 }
217
218 var.info[infoField].clear();
219 var.info[infoField].push_back(convert(result));
220
221 cout << var << endl;
222
223 }
224
225 return 0;
226
227 }
228
229