1 #include "Variant.h"
2 #include "split.h"
3 #include "Fasta.h"
4 #include <getopt.h>
5 #include <algorithm>
6 #include <numeric>
7 
8 using namespace std;
9 using namespace vcflib;
10 
printSummary(char ** argv)11 void printSummary(char** argv) {
12     cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
13          << endl
14          << "options:" << endl
15          << "    -f, --field         Add information about this field in samples to INFO column" << endl
16          << "    -i, --info          Store the computed statistic in this info field" << endl
17          << "    -a, --average       Take the mean of samples for field (default)" << endl
18          << "    -m, --median        Use the median" << endl
19          << "    -n, --min           Use the min" << endl
20          << "    -x, --max           Use the max" << endl
21          << endl
22          << "Take annotations given in the per-sample fields and add the mean, median, min, or max" << endl
23          << "to the site-level INFO." << endl
24          << endl;
25     exit(0);
26 }
27 
median(vector<double> & v)28 double median(vector<double> &v)
29 {
30     size_t n = v.size() / 2;
31     nth_element(v.begin(), v.begin()+n, v.end());
32     return v[n];
33 }
34 
mean(vector<double> & v)35 double mean(vector<double> &v)
36 {
37     double sum = accumulate(v.begin(), v.end(), 0.0);
38     return sum / v.size();
39 }
40 
41 enum StatType { MEAN, MEDIAN, MIN, MAX };
42 
main(int argc,char ** argv)43 int main(int argc, char** argv) {
44 
45     int c;
46     string sampleField;
47     string infoField;
48     StatType statType = MEAN;
49 
50     if (argc == 1)
51         printSummary(argv);
52 
53     while (true) {
54         static struct option long_options[] =
55             {
56                 /* These options set a flag. */
57                 {"help", no_argument, 0, 'h'},
58                 {"field",  required_argument, 0, 'f'},
59                 {"info",  required_argument, 0, 'i'},
60                 {"average", no_argument, 0, 'a'},
61                 {"median", no_argument, 0, 'm'},
62                 {"min", no_argument, 0, 'n'},
63                 {"max", no_argument, 0, 'x'},
64                 {0, 0, 0, 0}
65             };
66         /* getopt_long stores the option index here. */
67         int option_index = 0;
68 
69         c = getopt_long (argc, argv, "hamnxf:i:",
70                          long_options, &option_index);
71 
72         /* Detect the end of the options. */
73         if (c == -1)
74             break;
75 
76         switch (c)
77         {
78         case 0:
79             /* If this option set a flag, do nothing else now. */
80             if (long_options[option_index].flag != 0)
81                 break;
82             printf ("option %s", long_options[option_index].name);
83             if (optarg)
84                 printf (" with arg %s", optarg);
85             printf ("\n");
86             break;
87 
88         case 'f':
89             sampleField = optarg;
90             break;
91 
92         case 'i':
93             infoField = optarg;
94             break;
95 
96         case 'a':
97             statType = MEAN;
98             break;
99 
100         case 'm':
101             statType = MEDIAN;
102             break;
103 
104         case 'n':
105             statType = MIN;
106             break;
107 
108         case 'x':
109             statType = MAX;
110             break;
111 
112         case 'h':
113             printSummary(argv);
114             exit(0);
115 
116         case '?':
117             /* getopt_long already printed an error message. */
118             printSummary(argv);
119             exit(1);
120             break;
121 
122         default:
123             abort ();
124         }
125     }
126 
127     if (infoField.empty() || sampleField.empty()) {
128         cerr << "Error: both a sample field and an info field are required." << endl;
129         return 1;
130     }
131 
132     VariantCallFile variantFile;
133     string inputFilename;
134     if (optind == argc - 1) {
135         inputFilename = argv[optind];
136         variantFile.open(inputFilename);
137     } else {
138         variantFile.open(std::cin);
139     }
140 
141     if (!variantFile.is_open()) {
142         return 1;
143     }
144 
145     string statTypeStr;
146 
147     switch (statType) {
148     case MEAN:
149         statTypeStr = "mean";
150         break;
151     case MEDIAN:
152         statTypeStr = "median";
153         break;
154     case MIN:
155         statTypeStr = "min";
156         break;
157     case MAX:
158         statTypeStr = "max";
159         break;
160     default:
161         cerr << "Error: failure to convert stat type to string" << endl;
162         return 1;
163         break;
164     }
165 
166     variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by"+statTypeStr+" of per-sample values of "+sampleField+" \">");
167 
168     cout << variantFile.header << endl;
169 
170     Variant var(variantFile);
171     while (variantFile.getNextVariant(var)) {
172         vector<double> vals;
173         for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
174              s != var.samples.end(); ++s) {
175             map<string, vector<string> >& sample = s->second;
176             if (sample.find(sampleField) != sample.end()) {
177                 double val;
178                 string& s = sample[sampleField].front();
179                 if (sample[sampleField].size() > 1) {
180                     cerr << "Error: cannot handle sample fields with multiple values" << endl;
181                     return 1;
182                 }
183                 convert(s, val);
184                 vals.push_back(val);
185             }
186         }
187 
188         double result;
189         switch (statType) {
190         case MEAN:
191             result = mean(vals);
192             break;
193         case MEDIAN:
194             result = median(vals);
195             break;
196         case MIN:
197             result = *min_element(vals.begin(), vals.end());
198             break;
199         case MAX:
200             result = *max_element(vals.begin(), vals.end());
201             break;
202         default:
203             cerr << "Error: unrecognized StatType" << endl;
204             return 1;
205             break;
206         }
207 
208         var.info[infoField].clear();
209         var.info[infoField].push_back(convert(result));
210 
211         cout << var << endl;
212 
213     }
214 
215     return 0;
216 
217 }
218 
219