1 #include "Variant.h"
2 #include "split.h"
3 #include "convert.h"
4 #include <string>
5 #include <iostream>
6 #include <set>
7 #include <sys/time.h>
8 #include "fsom/fsom.h"
9 #include <getopt.h>
10 #include <cmath>
11
12 using namespace std;
13 using namespace vcflib;
14
mean(const vector<double> & data)15 double mean(const vector<double>& data) {
16 double total = 0;
17 for (vector<double>::const_iterator i = data.begin(); i != data.end(); ++i) {
18 total += *i;
19 }
20 return total/data.size();
21 }
22
median(vector<double> & data)23 double median(vector <double>& data) {
24 double median;
25 size_t size = data.size();
26 // ascending order
27 sort(data.begin(), data.end());
28 // get middle value
29 if (size % 2 == 0) {
30 median = (data[size/2-1] + data[size/2]) / 2;
31 } else {
32 median = data[size/2];
33 }
34 return median;
35 }
36
variance(const vector<double> & data,const double mean)37 double variance(const vector <double>& data, const double mean) {
38 double total = 0;
39 for (vector <double>::const_iterator i = data.begin(); i != data.end(); ++i) {
40 total += (*i - mean)*(*i - mean);
41 }
42 return total / (data.size());
43 }
44
standard_deviation(const vector<double> & data,const double mean)45 double standard_deviation(const vector <double>& data, const double mean) {
46 return sqrt(variance(data, mean));
47 }
48
49 struct Stats {
50 double mean;
51 double stdev;
StatsStats52 Stats(void) : mean(0), stdev(1) { }
53 };
54
load_som_metadata(string & som_metadata_file,int & x,int & y,vector<string> & fields,map<string,Stats> & stats)55 bool load_som_metadata(string& som_metadata_file, int& x, int& y, vector<string>& fields, map<string, Stats>& stats) {
56 ifstream in(som_metadata_file.c_str());
57 if (!in.is_open()) {
58 return false;
59 }
60 string linebuf;
61 getline(in, linebuf);
62 vector<string> xy = split(linebuf, "\t ");
63 convert(xy.front(), x);
64 convert(xy.back(), y);
65 while (getline(in, linebuf)) {
66 // format is: field_name, mean, stdev
67 vector<string> m = split(linebuf, "\t ");
68 fields.push_back(m[0]);
69 Stats& s = stats[m[0]];
70 convert(m[1], s.mean);
71 convert(m[2], s.stdev);
72 }
73 in.close();
74 return true;
75 }
76
save_som_metadata(string & som_metadata_file,int x,int y,vector<string> & fields,map<string,Stats> & stats)77 bool save_som_metadata(string& som_metadata_file, int x, int y, vector<string>& fields, map<string, Stats>& stats) {
78 ofstream out(som_metadata_file.c_str());
79 if (!out.is_open()) {
80 return false;
81 }
82 out << x << "\t" << y << endl;
83 for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f) {
84 Stats& s = stats[*f];
85 out << *f << "\t" << s.mean << "\t" << s.stdev << endl;
86 }
87 out.close();
88 return true;
89 }
90
normalize_inputs(vector<double> & record,vector<string> & fields,map<string,Stats> & stats)91 void normalize_inputs(vector<double>& record, vector<string>& fields, map<string, Stats>& stats) {
92 vector<double>::iterator r = record.begin();
93 for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f, ++r) {
94 Stats& s = stats[*f];
95 *r = (*r - s.mean) / s.stdev;
96 }
97 }
98
read_fields(Variant & var,int ai,vector<string> & fields,vector<double> & record)99 void read_fields(Variant& var, int ai, vector<string>& fields, vector<double>& record) {
100 double td;
101 vector<string>::iterator j = fields.begin();
102 for (; j != fields.end(); ++j) {
103 if (*j == "QUAL") { // special handling...
104 td = var.quality;
105 } else {
106 if (var.info.find(*j) == var.info.end()) {
107 td = 0;
108 } else {
109 if (var.vcf->infoCounts[*j] == 1) { // for non Allele-variant fields
110 convert(var.info[*j][0], td);
111 } else {
112 convert(var.info[*j][ai], td);
113 }
114 }
115 }
116 record.push_back(td);
117 }
118 }
119
120 struct SomPaint {
121 int true_count;
122 int false_count;
123 double prob_true;
SomPaintSomPaint124 SomPaint(void) : true_count(0), false_count(0), prob_true(0) { }
125 };
126
127 static unsigned long prev_uticks = 0;
128
get_uticks()129 static unsigned long get_uticks(){
130 struct timeval ts;
131 gettimeofday(&ts,0);
132 return ((ts.tv_sec * 1000000) + ts.tv_usec);
133 }
134
start_timer()135 static void start_timer(){
136 prev_uticks = get_uticks();
137 }
138
print_timing(const char * msg)139 static void print_timing( const char *msg ){
140 #define MS_DELTA (1000.0)
141 #define SS_DELTA (MS_DELTA * 1000.0)
142 #define MM_DELTA (SS_DELTA * 60.0)
143 #define HH_DELTA (MM_DELTA * 60.0)
144
145 double ticks = get_uticks() - prev_uticks;
146
147 if( ticks < MS_DELTA ){
148 fprintf(stderr, "%s\t : %lf us\n", msg, ticks );
149 }
150 else if( ticks < SS_DELTA ){
151 fprintf(stderr, "%s\t : %lf ms\n", msg, ticks / MS_DELTA );
152 }
153 else if( ticks < MM_DELTA ){
154 fprintf(stderr, "%s\t : %lf s\n", msg, ticks / SS_DELTA );
155 }
156 else if( ticks < HH_DELTA ){
157 fprintf(stderr, "%s\t : %lf m\n", msg, ticks / MM_DELTA );
158 }
159 else{
160 fprintf(stderr, "%s\t : %lf h\n", msg, ticks / HH_DELTA );
161 }
162
163 start_timer();
164 }
165
166
printSummary(char ** argv)167 void printSummary(char** argv) {
168 cerr << "usage: " << argv[0] << " [options] [vcf file]" << endl
169 << endl
170 << "training: " << endl
171 << " " << argv[0] << " -s output.som -x 20 -y 20 -f \"AF DP ABP\" training.vcf" << endl
172 << endl
173 << "application: " << endl
174 << " " << argv[0] << " -a output.som test.vcf >results.vcf" << endl
175 << endl
176 << argv[0] << "trains and/or applies a self-organizing map to the input VCF data" << endl
177 << "on stdin, adding two columns for the x and y coordinates of the winning" << endl
178 << "neuron in the network and an optional euclidean distance from a given" << endl
179 << "node (--center)." << endl
180 << endl
181 << "If a map is provided via --apply, it will be applied to input without" << endl
182 << "training. A .meta file describing network parameters and input parameter" << endl
183 << "distributions is used to automatically setup the network." << endl
184 << endl
185 << "options:" << endl
186 << endl
187 << " -h, --help this dialog" << endl
188 << endl
189 << "training:" << endl
190 << endl
191 << " -f, --fields \"FIELD ...\" INFO fields to provide to the SOM" << endl
192 << " -a, --apply FILE apply the saved map to input data to FILE" << endl
193 << " -s, --save FILE train on input data and save the map to FILE" << endl
194 << " -p, --print-training-results" << endl
195 << " print results of SOM on training input" << endl
196 << " (you can also just use --apply on the same input)" << endl
197 << " -x, --width X width in columns of the output array" << endl
198 << " -y, --height Y height in columns of the output array" << endl
199 << " -i, --iterations N number of training iterations or epochs" << endl
200 << " -d, --debug print timing information" << endl
201 << endl
202 << "recalibration:" << endl
203 << endl
204 << " -c, --center X,Y annotate with euclidean distance from center" << endl
205 << " -T, --paint-true VCF use VCF file to annotate true variants (multiple)" << endl
206 << " -F, --paint-false VCF use VCF file to annotate false variants (multiple)" << endl
207 << " -R, --paint-tag TAG provide estimated FDR% in TAG in variant INFO" << endl
208 << " -N, --false-negative replace FDR% (false detection) with FNR% (false negative)" << endl;
209
210 }
211
212
main(int argc,char ** argv)213 int main(int argc, char** argv) {
214
215 int width = 100;
216 int height = 100;
217 int num_dimensions = 2;
218 int iterations = 1000;
219 string som_file;
220 string som_metadata_file;
221 bool apply = false;
222 bool train = false;
223 bool apply_to_training_data = false; // print results against training data
224 bool debug = false;
225 vector<string> fields;
226 vector<string> centerv;
227 int centerx;
228 int centery;
229 string trueVCF;
230 string falseVCF;
231 bool normalize = true;
232
233 int c;
234
235 if (argc == 1) {
236 printSummary(argv);
237 exit(1);
238 }
239
240 while (true) {
241 static struct option long_options[] =
242 {
243 /* These options set a flag. */
244 //{"verbose", no_argument, &verbose_flag, 1},
245 {"help", no_argument, 0, 'h'},
246 {"iterations", required_argument, 0, 'i'},
247 {"width", required_argument, 0, 'x'},
248 {"height", required_argument, 0, 'y'},
249 {"apply", required_argument, 0, 'a'},
250 {"save", required_argument, 0, 's'},
251 {"fields", required_argument, 0, 'f'},
252 {"print-training-results", no_argument, 0, 'p'},
253 {"center", required_argument, 0, 'c'},
254 {"paint-true", required_argument, 0, 'T'},
255 {"paint-false", required_argument, 0, 'F'},
256 {"debug", no_argument, 0, 'd'},
257 {0, 0, 0, 0}
258 };
259 /* getopt_long stores the option index here. */
260 int option_index = 0;
261
262 c = getopt_long (argc, argv, "hpdi:x:y:a:s:f:c:T:F:",
263 long_options, &option_index);
264
265 if (c == -1)
266 break;
267
268 string field;
269
270 switch (c)
271 {
272
273 case 'x':
274 if (!convert(optarg, width)) {
275 cerr << "could not parse --width, -x" << endl;
276 exit(1);
277 }
278 break;
279
280 case 'y':
281 if (!convert(optarg, height)) {
282 cerr << "could not parse --height, -y" << endl;
283 exit(1);
284 }
285 break;
286
287 case 'i':
288 if (!convert(optarg, iterations)) {
289 cerr << "could not parse --iterations, -i" << endl;
290 exit(1);
291 }
292 break;
293
294 case 'p':
295 apply_to_training_data = true;
296 break;
297
298 case 'T':
299 trueVCF = optarg;
300 break;
301
302 case 'F':
303 falseVCF = optarg;
304 break;
305
306 case 'd':
307 debug = true;
308 break;
309
310 case 'a':
311 som_file = optarg;
312 apply = true;
313 break;
314
315 case 's':
316 som_file = optarg;
317 train = true;
318 break;
319
320 case 'f':
321 fields = split(string(optarg), ' ');
322 break;
323
324 case 'c':
325 centerv = split(string(optarg), ',');
326 convert(centerv.at(0), centerx);
327 convert(centerv.at(1), centery);
328 break;
329
330 case 'h':
331 printSummary(argv);
332 exit(0);
333 break;
334
335 default:
336 break;
337 }
338 }
339
340 size_t i, j;
341 som_network_t *net = NULL;
342 vector<string> inputs;
343 vector<vector<double> > data;
344 map<string, Stats> stats;
345
346 string line;
347 stringstream ss;
348
349 VariantCallFile variantFile;
350 bool usingstdin = false;
351 string inputFilename;
352 if (optind == argc - 1) {
353 inputFilename = argv[optind];
354 variantFile.open(inputFilename);
355 } else {
356 variantFile.open(std::cin);
357 usingstdin = true;
358 }
359
360 if (!variantFile.is_open()) {
361 cerr << "could not open VCF file" << endl;
362 return 1;
363 }
364
365 som_metadata_file = som_file + ".meta";
366
367 Variant var(variantFile);
368
369 variantFile.addHeaderLine("##INFO=<ID=SOMX,Number=A,Type=Integer,Description=\"X position of best neuron for variant in self-ordering map defined in " + som_file + "\">");
370 variantFile.addHeaderLine("##INFO=<ID=SOMY,Number=A,Type=Integer,Description=\"Y position of best neuron for variant in self-ordering map defined in " + som_file + "\">");
371 if (!centerv.empty()) {
372 variantFile.addHeaderLine("##INFO=<ID=SOMD,Number=A,Type=Float,Description=\"Euclidean distance from "
373 + convert(centerx) + "," + convert(centery) + " as defined by " + som_file + "\">");
374 }
375 if (!trueVCF.empty() && !falseVCF.empty()) {
376 variantFile.addHeaderLine("##INFO=<ID=SOMP,Number=A,Type=Float,Description=\"Estimated probability the variant is true using som "
377 + som_file + ", true variants from " + trueVCF + ", and false variants from " + falseVCF + "\">");
378 }
379
380 if (debug) start_timer();
381
382 vector<Variant> variants;
383 if (train) {
384 map<string, pair<double, double> > normalizationLimits;
385 while (variantFile.getNextVariant(var)) {
386 variants.push_back(var);
387 int ai = 0;
388 vector<string>::iterator a = var.alt.begin();
389 for ( ; a != var.alt.end(); ++a, ++ai) {
390 vector<double> record;
391 double td;
392 vector<string>::iterator j = fields.begin();
393 for (; j != fields.end(); ++j) {
394 if (*j == "QUAL") { // special handling...
395 td = var.quality;
396 } else {
397 if (var.info.find(*j) == var.info.end()) {
398 td = 0;
399 } else {
400 if (variantFile.infoCounts[*j] == 1) { // for non Allele-variant fields
401 convert(var.info[*j][0], td);
402 } else {
403 convert(var.info[*j][ai], td);
404 }
405 }
406 }
407 if (normalize) {
408 pair<double, double>& limits = normalizationLimits[*j];
409 if (td < limits.first) limits.first = td;
410 if (td > limits.second) limits.second = td;
411 }
412 record.push_back(td);
413 }
414 data.push_back(record);
415 }
416 }
417 // normalize inputs
418 if (normalize) {
419 // get normalization vector
420 // goal is normalization at 0, sd=1
421 int i = 0;
422 for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f, ++i) {
423 vector<double> fv;
424 for (vector<vector<double> >::iterator d = data.begin(); d != data.end(); ++d) {
425 fv.push_back(d->at(i));
426 }
427 Stats& s = stats[*f];
428 // get normalization constants
429 s.mean = mean(fv);
430 s.stdev = standard_deviation(fv, s.mean);
431 // normalize
432 for (vector<vector<double> >::iterator d = data.begin(); d != data.end(); ++d) {
433 double v = d->at(i);
434 d->at(i) = (v - s.mean) / s.stdev;
435 }
436 }
437 }
438 }
439
440 vector<double*> dataptrs (data.size());
441 for (unsigned i=0, e=dataptrs.size(); i<e; ++i) {
442 dataptrs[i] = &(data[i][0]); // assuming !thing[i].empty()
443 }
444
445 if (debug) print_timing( "Input Processing" );
446
447 if (apply) {
448 if (! (net = som_deserialize(som_file.c_str()))) {
449 cerr << "could not load SOM from " << som_file << endl;
450 return 1;
451 }
452 if (!fields.empty()) {
453 cerr << "fields specified, but a SOM is to be applied, and metadata should be stored at " << som_metadata_file << endl;
454 return 1;
455 }
456 if (!load_som_metadata(som_metadata_file, width, height, fields, stats)) {
457 cerr << "could not load SOM metadata from " << som_metadata_file << endl;
458 return 1;
459 }
460 } else {
461
462 net = som_network_new(data[0].size(), height, width);
463
464 if ( !net ) {
465 printf( "ERROR: som_network_new failed.\n" );
466 return 1;
467 }
468 }
469
470 if (debug) print_timing( "Network Creation" );
471
472 if (train) {
473 if (debug) cerr << "Training using " << data.size() << " input vectors" << endl;
474 som_init_weights ( net, &dataptrs[0], data.size() );
475 som_train ( net, &dataptrs[0], data.size(), iterations );
476 }
477
478 if (debug) print_timing( "Network Training" );
479
480 // open and calibrate using the true and false datasets
481
482 if (train && apply_to_training_data) {
483 // currently disabled
484 /*
485 cout << variantFile.header << endl;
486 vector<Variant>::iterator v = variants.begin(); int di = 0;
487 for ( ; v != variants.end() && di < data.size(); ++v) {
488 var.info["SOMX"].clear();
489 var.info["SOMY"].clear();
490 var.info["SOMP"].clear();
491 var.info["SOMD"].clear();
492 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++di) {
493 som_set_inputs ( net, dataptrs[di] );
494 size_t x=0, y=0;
495 som_get_best_neuron_coordinates ( net, &x, &y );
496 v->info["SOMX"].push_back(convert(x));
497 v->info["SOMY"].push_back(convert(y));
498 if (!centerv.empty()) {
499 float distance = sqrt(pow(abs((float)centerx - (float)x), 2)
500 + pow(abs((float)centery - (float)y), 2));
501 var.info["SOMD"].clear();
502 var.info["SOMD"].push_back(convert(distance));
503 }
504 }
505 cout << *v << endl;
506 }
507 */
508 } else if (apply) {
509
510 // if we have true and false sets, use them to "paint" the map
511 vector<vector<SomPaint> > paintedSOM;
512 paintedSOM.resize(width);
513 for (vector<vector<SomPaint> >::iterator t = paintedSOM.begin();
514 t != paintedSOM.end(); ++t) {
515 t->resize(height);
516 }
517
518 // handle trues
519 if (!trueVCF.empty()) {
520 VariantCallFile trueVariantFile;
521 trueVariantFile.open(trueVCF);
522 Variant v(trueVariantFile);
523 while (trueVariantFile.getNextVariant(v)) {
524 int ai = 0;
525 vector<string>::iterator a = v.alt.begin();
526 for ( ; a != v.alt.end(); ++a, ++ai) {
527 vector<double> record;
528 read_fields(v, ai, fields, record);
529 if (normalize) {
530 normalize_inputs(record, fields, stats);
531 }
532 som_set_inputs ( net, &record[0] );
533 size_t x=0, y=0;
534 som_get_best_neuron_coordinates ( net, &x, &y );
535 paintedSOM[x][y].true_count += 1;
536 }
537 }
538 }
539
540 // get falses
541 if (!falseVCF.empty()) {
542 VariantCallFile falseVariantFile;
543 falseVariantFile.open(falseVCF);
544 Variant v(falseVariantFile);
545 while (falseVariantFile.getNextVariant(v)) {
546 int ai = 0;
547 vector<string>::iterator a = v.alt.begin();
548 for ( ; a != v.alt.end(); ++a, ++ai) {
549 vector<double> record;
550 read_fields(v, ai, fields, record);
551 if (normalize) {
552 normalize_inputs(record, fields, stats);
553 }
554 som_set_inputs ( net, &record[0] );
555 size_t x=0, y=0;
556 som_get_best_neuron_coordinates ( net, &x, &y );
557 paintedSOM[x][y].false_count += 1;
558 }
559 }
560 }
561
562 // estimate probability of each node using true and false set
563 for (vector<vector<SomPaint> >::iterator t = paintedSOM.begin();
564 t != paintedSOM.end(); ++t) {
565 for (vector<SomPaint>::iterator p = t->begin(); p != t->end(); ++p) {
566 //cout << "count at node " << t - paintedSOM.begin() << "," << p - t->begin()
567 // << " is " << p->true_count << " true, " << p->false_count << " false" << endl;
568 if (p->true_count + p->false_count > 0) {
569 p->prob_true = (double) p->true_count / (double) (p->true_count + p->false_count);
570 } else {
571 // for nodes without training data, could we estimate from surrounding nodes?
572 // yes, TODO, but for now we can be conservative and say "0"
573 p->prob_true = 0;
574 }
575 }
576 }
577
578 cout << variantFile.header << endl;
579 while (variantFile.getNextVariant(var)) {
580 var.info["SOMX"].clear();
581 var.info["SOMY"].clear();
582 var.info["SOMP"].clear();
583 var.info["SOMD"].clear();
584 int ai = 0;
585 vector<string>::iterator a = var.alt.begin();
586 for ( ; a != var.alt.end(); ++a, ++ai) {
587 vector<double> record;
588 read_fields(var, ai, fields, record);
589 if (normalize) {
590 normalize_inputs(record, fields, stats);
591 }
592 som_set_inputs ( net, &record[0] );
593 size_t x=0, y=0;
594 som_get_best_neuron_coordinates ( net, &x, &y );
595 if (!trueVCF.empty() && !falseVCF.empty()) {
596 SomPaint& paint = paintedSOM[x][y];
597 var.info["SOMP"].push_back(convert(paint.prob_true));
598 }
599 var.info["SOMX"].push_back(convert(x));
600 var.info["SOMY"].push_back(convert(y));
601 if (!centerv.empty()) {
602 float distance = sqrt(pow(abs((float)centerx - (float)x), 2)
603 + pow(abs((float)centery - (float)y), 2));
604 var.info["SOMD"].push_back(convert(distance));
605 }
606 }
607 cout << var << endl;
608 }
609 }
610
611 if (debug) print_timing( "Input Recognition" );
612
613 if (train) {
614 if (!save_som_metadata(som_metadata_file, width, height, fields, stats)) {
615 cerr << "could not save metadata to " << som_metadata_file << endl;
616 }
617 som_serialize(net, som_file.c_str());
618 }
619
620 som_network_destroy ( net );
621
622 if (debug) print_timing( "Network Destruction" );
623
624 return 0;
625
626 }
627