1 #include "Variant.h"
2 #include "split.h"
3 #include "Fasta.h"
4 #include "gpatInfo.hpp"
5 #include <getopt.h>
6 #include <algorithm>
7 #include <numeric>
8 
9 using namespace std;
10 using namespace vcflib;
11 
printSummary(char ** argv)12 void printSummary(char** argv) {
13     cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
14          << endl
15          << "options:" << endl
16          << "    -f, --field         Summarize this field in the INFO column" << endl
17          << "    -i, --info          Store the computed statistic in this info field" << endl
18          << "    -a, --average       Take the mean for field (default)" << endl
19          << "    -m, --median        Use the median" << endl
20          << "    -n, --min           Use the min" << endl
21          << "    -x, --max           Use the max" << endl
22 	 << "    -h, --help          Print this message" << endl
23 	 << "    -v, --version       Print version" << endl
24          << endl
25          << "Take annotations given in the per-sample fields and add the mean, median, min, or max" << endl
26          << "to the site-level INFO." << endl
27          << endl;
28     exit(0);
29 }
30 
median(vector<double> & v)31 double median(vector<double> &v)
32 {
33     size_t n = v.size() / 2;
34     nth_element(v.begin(), v.begin()+n, v.end());
35     return v[n];
36 }
37 
mean(vector<double> & v)38 double mean(vector<double> &v)
39 {
40     double sum = accumulate(v.begin(), v.end(), 0.0);
41     return sum / v.size();
42 }
43 
44 enum StatType { MEAN, MEDIAN, MIN, MAX };
45 
main(int argc,char ** argv)46 int main(int argc, char** argv) {
47 
48     int c;
49     string sitewideField;
50     string infoField;
51     StatType statType = MEAN;
52 
53     if (argc == 1)
54         printSummary(argv);
55 
56     while (true) {
57         static struct option long_options[] =
58             {
59                 /* These options set a flag. */
60                 {"help", no_argument, 0, 'h'},
61                 {"field",  required_argument, 0, 'f'},
62                 {"info",  required_argument, 0, 'i'},
63                 {"average", no_argument, 0, 'a'},
64                 {"median", no_argument, 0, 'm'},
65                 {"min", no_argument, 0, 'n'},
66                 {"max", no_argument, 0, 'x'},
67 		{"max", no_argument, 0, 'v'},
68                 {0, 0, 0, 0}
69             };
70         /* getopt_long stores the option index here. */
71         int option_index = 0;
72 
73         c = getopt_long (argc, argv, "hamnxfv:i:",
74                          long_options, &option_index);
75 
76         /* Detect the end of the options. */
77         if (c == -1)
78             break;
79 
80         switch (c)
81         {
82         case 0:
83             /* If this option set a flag, do nothing else now. */
84             if (long_options[option_index].flag != 0)
85                 break;
86             printf ("option %s", long_options[option_index].name);
87             if (optarg)
88                 printf (" with arg %s", optarg);
89             printf ("\n");
90             break;
91 
92 	case 'v':
93 	  {
94 	    printBasicVersion();
95 	    exit(0);
96 	  }
97         case 'f':
98 	  {
99             sitewideField = optarg;
100             break;
101 	  }
102         case 'i':
103 	  {
104             infoField = optarg;
105             break;
106 	  }
107         case 'a':
108 	  {
109             statType = MEAN;
110             break;
111 	  }
112         case 'm':
113 	  {
114             statType = MEDIAN;
115             break;
116 	  }
117         case 'n':
118 	  {
119             statType = MIN;
120             break;
121 	  }
122         case 'x':
123 	  {
124             statType = MAX;
125             break;
126 	  }
127         case 'h':
128 	  {
129             printSummary(argv);
130             exit(0);
131 	  }
132         case '?':
133             /* getopt_long already printed an error message. */
134             printSummary(argv);
135             exit(1);
136             break;
137 
138         default:
139             abort ();
140         }
141     }
142 
143     if (infoField.empty() || sitewideField.empty()) {
144         cerr << "Error: both a sample field and an info field are required." << endl;
145         return 1;
146     }
147 
148     VariantCallFile variantFile;
149     string inputFilename;
150     if (optind == argc - 1) {
151         inputFilename = argv[optind];
152         variantFile.open(inputFilename);
153     } else {
154         variantFile.open(std::cin);
155     }
156 
157     if (!variantFile.is_open()) {
158         return 1;
159     }
160 
161     string statTypeStr;
162 
163     switch (statType) {
164     case MEAN:
165         statTypeStr = "mean";
166         break;
167     case MEDIAN:
168         statTypeStr = "median";
169         break;
170     case MIN:
171         statTypeStr = "min";
172         break;
173     case MAX:
174         statTypeStr = "max";
175         break;
176     default:
177         cerr << "Error: failure to convert stat type to string" << endl;
178         return 1;
179         break;
180     }
181 
182     variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by"+statTypeStr+" of site-wide values of "+sitewideField+" \">");
183 
184     cout << variantFile.header << endl;
185 
186     Variant var(variantFile);
187     while (variantFile.getNextVariant(var)) {
188         vector<double> vals;
189         map<string, vector<string> >::iterator i = var.info.find(sitewideField);
190         if (i != var.info.end()) {
191             for (vector<string>::iterator s = i->second.begin(); s != i->second.end(); ++s) {
192                 double d;
193                 convert(*s, d);
194                 vals.push_back(d);
195             }
196         }
197 
198         double result;
199         switch (statType) {
200         case MEAN:
201             result = mean(vals);
202             break;
203         case MEDIAN:
204             result = median(vals);
205             break;
206         case MIN:
207             result = *min_element(vals.begin(), vals.end());
208             break;
209         case MAX:
210             result = *max_element(vals.begin(), vals.end());
211             break;
212         default:
213             cerr << "Error: unrecognized StatType" << endl;
214             return 1;
215             break;
216         }
217 
218         var.info[infoField].clear();
219         var.info[infoField].push_back(convert(result));
220 
221         cout << var << endl;
222 
223     }
224 
225     return 0;
226 
227 }
228 
229