1 #include "Variant.h"
2 #include "split.h"
3 #include "convert.h"
4 #include <string>
5 #include <iostream>
6 #include <set>
7 #include <sys/time.h>
8 #include "fsom/fsom.h"
9 #include <getopt.h>
10 #include <cmath>
11 
12 using namespace std;
13 using namespace vcflib;
14 
mean(const vector<double> & data)15 double mean(const vector<double>& data) {
16     double total = 0;
17     for (vector<double>::const_iterator i = data.begin(); i != data.end(); ++i) {
18         total += *i;
19     }
20     return total/data.size();
21 }
22 
median(vector<double> & data)23 double median(vector <double>& data) {
24     double median;
25     size_t size = data.size();
26     // ascending order
27     sort(data.begin(), data.end());
28     // get middle value
29     if (size % 2 == 0) {
30         median = (data[size/2-1] + data[size/2]) / 2;
31     } else {
32         median = data[size/2];
33     }
34     return median;
35 }
36 
variance(const vector<double> & data,const double mean)37 double variance(const vector <double>& data, const double mean) {
38     double total = 0;
39     for (vector <double>::const_iterator i = data.begin(); i != data.end(); ++i) {
40         total += (*i - mean)*(*i - mean);
41     }
42     return total / (data.size());
43 }
44 
standard_deviation(const vector<double> & data,const double mean)45 double standard_deviation(const vector <double>& data, const double mean) {
46     return sqrt(variance(data, mean));
47 }
48 
49 struct Stats {
50     double mean;
51     double stdev;
StatsStats52     Stats(void) : mean(0), stdev(1) { }
53 };
54 
load_som_metadata(string & som_metadata_file,int & x,int & y,vector<string> & fields,map<string,Stats> & stats)55 bool load_som_metadata(string& som_metadata_file, int& x, int& y, vector<string>& fields, map<string, Stats>& stats) {
56     ifstream in(som_metadata_file.c_str());
57     if (!in.is_open()) {
58         return false;
59     }
60     string linebuf;
61     getline(in, linebuf);
62     vector<string> xy = split(linebuf, "\t ");
63     convert(xy.front(), x);
64     convert(xy.back(), y);
65     while (getline(in, linebuf)) {
66         // format is: field_name, mean, stdev
67         vector<string> m = split(linebuf, "\t ");
68         fields.push_back(m[0]);
69         Stats& s = stats[m[0]];
70         convert(m[1], s.mean);
71         convert(m[2], s.stdev);
72     }
73     in.close();
74     return true;
75 }
76 
save_som_metadata(string & som_metadata_file,int x,int y,vector<string> & fields,map<string,Stats> & stats)77 bool save_som_metadata(string& som_metadata_file, int x, int y, vector<string>& fields, map<string, Stats>& stats) {
78     ofstream out(som_metadata_file.c_str());
79     if (!out.is_open()) {
80         return false;
81     }
82     out << x << "\t" << y << endl;
83     for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f) {
84         Stats& s = stats[*f];
85         out << *f << "\t" << s.mean << "\t" << s.stdev << endl;
86     }
87     out.close();
88     return true;
89 }
90 
normalize_inputs(vector<double> & record,vector<string> & fields,map<string,Stats> & stats)91 void normalize_inputs(vector<double>& record, vector<string>& fields, map<string, Stats>& stats) {
92     vector<double>::iterator r = record.begin();
93     for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f, ++r) {
94         Stats& s = stats[*f];
95         *r = (*r - s.mean) / s.stdev;
96     }
97 }
98 
read_fields(Variant & var,int ai,vector<string> & fields,vector<double> & record)99 void read_fields(Variant& var, int ai, vector<string>& fields, vector<double>& record) {
100     double td;
101     vector<string>::iterator j = fields.begin();
102     for (; j != fields.end(); ++j) {
103         if (*j == "QUAL") { // special handling...
104             td = var.quality;
105         } else {
106             if (var.info.find(*j) == var.info.end()) {
107                 td = 0;
108             } else {
109                 if (var.vcf->infoCounts[*j] == 1) { // for non Allele-variant fields
110                     convert(var.info[*j][0], td);
111                 } else {
112                     convert(var.info[*j][ai], td);
113                 }
114             }
115         }
116         record.push_back(td);
117     }
118 }
119 
120 struct SomPaint {
121     int true_count;
122     int false_count;
123     double prob_true;
SomPaintSomPaint124     SomPaint(void) : true_count(0), false_count(0), prob_true(0) { }
125 };
126 
127 static unsigned long prev_uticks = 0;
128 
get_uticks()129 static unsigned long get_uticks(){
130     struct timeval ts;
131     gettimeofday(&ts,0);
132     return ((ts.tv_sec * 1000000) + ts.tv_usec);
133 }
134 
start_timer()135 static void start_timer(){
136     prev_uticks = get_uticks();
137 }
138 
print_timing(const char * msg)139 static void print_timing( const char *msg ){
140 #define MS_DELTA (1000.0)
141 #define SS_DELTA (MS_DELTA * 1000.0)
142 #define MM_DELTA (SS_DELTA * 60.0)
143 #define HH_DELTA (MM_DELTA * 60.0)
144 
145     double ticks = get_uticks() - prev_uticks;
146 
147     if( ticks < MS_DELTA ){
148         fprintf(stderr, "%s\t : %lf us\n", msg, ticks );
149     }
150     else if( ticks < SS_DELTA ){
151         fprintf(stderr, "%s\t : %lf ms\n", msg, ticks / MS_DELTA );
152     }
153     else if( ticks < MM_DELTA ){
154         fprintf(stderr, "%s\t : %lf s\n", msg, ticks / SS_DELTA );
155     }
156     else if( ticks < HH_DELTA ){
157         fprintf(stderr, "%s\t : %lf m\n", msg, ticks / MM_DELTA );
158     }
159     else{
160         fprintf(stderr, "%s\t : %lf h\n", msg, ticks / HH_DELTA );
161     }
162 
163     start_timer();
164 }
165 
166 
printSummary(char ** argv)167 void printSummary(char** argv) {
168     cerr << "usage: " << argv[0] << " [options] [vcf file]" << endl
169          << endl
170          << "training: " << endl
171          << "    " << argv[0] << " -s output.som -x 20 -y 20 -f \"AF DP ABP\" training.vcf" << endl
172          << endl
173          << "application: " << endl
174          << "    " << argv[0] << " -a output.som test.vcf >results.vcf" << endl
175          << endl
176          << argv[0] << "trains and/or applies a self-organizing map to the input VCF data" << endl
177          << "on stdin, adding two columns for the x and y coordinates of the winning" << endl
178          << "neuron in the network and an optional euclidean distance from a given" << endl
179          << "node (--center)." << endl
180          << endl
181          << "If a map is provided via --apply, it will be applied to input without" << endl
182          << "training.  A .meta file describing network parameters and input parameter" << endl
183          << "distributions is used to automatically setup the network." << endl
184          << endl
185          << "options:" << endl
186          << endl
187          << "    -h, --help             this dialog" << endl
188          << endl
189          << "training:" << endl
190          << endl
191          << "    -f, --fields \"FIELD ...\"  INFO fields to provide to the SOM" << endl
192          << "    -a, --apply FILE       apply the saved map to input data to FILE" << endl
193          << "    -s, --save  FILE       train on input data and save the map to FILE" << endl
194          << "    -p, --print-training-results" << endl
195          << "                           print results of SOM on training input" << endl
196          << "                           (you can also just use --apply on the same input)" << endl
197          << "    -x, --width X          width in columns of the output array" << endl
198          << "    -y, --height Y         height in columns of the output array" << endl
199          << "    -i, --iterations N     number of training iterations or epochs" << endl
200          << "    -d, --debug            print timing information" << endl
201          << endl
202          << "recalibration:" << endl
203          << endl
204          << "    -c, --center X,Y       annotate with euclidean distance from center" << endl
205          << "    -T, --paint-true VCF   use VCF file to annotate true variants (multiple)" << endl
206          << "    -F, --paint-false VCF  use VCF file to annotate false variants (multiple)" << endl
207          << "    -R, --paint-tag TAG    provide estimated FDR% in TAG in variant INFO" << endl
208          << "    -N, --false-negative   replace FDR% (false detection) with FNR% (false negative)" << endl;
209 
210 }
211 
212 
main(int argc,char ** argv)213 int main(int argc, char** argv) {
214 
215     int width = 100;
216     int height = 100;
217     int num_dimensions = 2;
218     int iterations = 1000;
219     string som_file;
220     string som_metadata_file;
221     bool apply = false;
222     bool train = false;
223     bool apply_to_training_data = false; // print results against training data
224     bool debug = false;
225     vector<string> fields;
226     vector<string> centerv;
227     int centerx;
228     int centery;
229     string trueVCF;
230     string falseVCF;
231     bool normalize = true;
232 
233     int c;
234 
235     if (argc == 1) {
236         printSummary(argv);
237         exit(1);
238     }
239 
240     while (true) {
241         static struct option long_options[] =
242         {
243             /* These options set a flag. */
244             //{"verbose", no_argument,       &verbose_flag, 1},
245             {"help", no_argument, 0, 'h'},
246             {"iterations", required_argument, 0, 'i'},
247             {"width", required_argument, 0, 'x'},
248             {"height", required_argument, 0, 'y'},
249             {"apply", required_argument, 0, 'a'},
250             {"save", required_argument, 0, 's'},
251             {"fields", required_argument, 0, 'f'},
252             {"print-training-results", no_argument, 0, 'p'},
253             {"center", required_argument, 0, 'c'},
254             {"paint-true", required_argument, 0, 'T'},
255             {"paint-false", required_argument, 0, 'F'},
256             {"debug", no_argument, 0, 'd'},
257             {0, 0, 0, 0}
258         };
259         /* getopt_long stores the option index here. */
260         int option_index = 0;
261 
262         c = getopt_long (argc, argv, "hpdi:x:y:a:s:f:c:T:F:",
263                          long_options, &option_index);
264 
265         if (c == -1)
266             break;
267 
268         string field;
269 
270         switch (c)
271         {
272 
273             case 'x':
274                 if (!convert(optarg, width)) {
275                     cerr << "could not parse --width, -x" << endl;
276                     exit(1);
277                 }
278                 break;
279 
280             case 'y':
281                 if (!convert(optarg, height)) {
282                     cerr << "could not parse --height, -y" << endl;
283                     exit(1);
284                 }
285                 break;
286 
287             case 'i':
288                 if (!convert(optarg, iterations)) {
289                     cerr << "could not parse --iterations, -i" << endl;
290                     exit(1);
291                 }
292                 break;
293 
294             case 'p':
295                 apply_to_training_data = true;
296                 break;
297 
298             case 'T':
299                 trueVCF = optarg;
300                 break;
301 
302             case 'F':
303                 falseVCF = optarg;
304                 break;
305 
306             case 'd':
307                 debug = true;
308                 break;
309 
310             case 'a':
311                 som_file = optarg;
312                 apply = true;
313                 break;
314 
315             case 's':
316                 som_file = optarg;
317                 train = true;
318                 break;
319 
320             case 'f':
321                 fields = split(string(optarg), ' ');
322                 break;
323 
324             case 'c':
325                 centerv = split(string(optarg), ',');
326                 convert(centerv.at(0), centerx);
327                 convert(centerv.at(1), centery);
328                 break;
329 
330             case 'h':
331                 printSummary(argv);
332                 exit(0);
333                 break;
334 
335             default:
336                 break;
337         }
338     }
339 
340     size_t i, j;
341     som_network_t *net = NULL;
342     vector<string> inputs;
343     vector<vector<double> > data;
344     map<string, Stats> stats;
345 
346     string line;
347     stringstream ss;
348 
349     VariantCallFile variantFile;
350     bool usingstdin = false;
351     string inputFilename;
352     if (optind == argc - 1) {
353         inputFilename = argv[optind];
354         variantFile.open(inputFilename);
355     } else {
356         variantFile.open(std::cin);
357         usingstdin = true;
358     }
359 
360     if (!variantFile.is_open()) {
361         cerr << "could not open VCF file" << endl;
362         return 1;
363     }
364 
365     som_metadata_file = som_file + ".meta";
366 
367     Variant var(variantFile);
368 
369     variantFile.addHeaderLine("##INFO=<ID=SOMX,Number=A,Type=Integer,Description=\"X position of best neuron for variant in self-ordering map defined in " + som_file + "\">");
370     variantFile.addHeaderLine("##INFO=<ID=SOMY,Number=A,Type=Integer,Description=\"Y position of best neuron for variant in self-ordering map defined in " + som_file + "\">");
371     if (!centerv.empty()) {
372         variantFile.addHeaderLine("##INFO=<ID=SOMD,Number=A,Type=Float,Description=\"Euclidean distance from "
373                                   + convert(centerx) + "," + convert(centery) + " as defined by " + som_file + "\">");
374     }
375     if (!trueVCF.empty() && !falseVCF.empty()) {
376         variantFile.addHeaderLine("##INFO=<ID=SOMP,Number=A,Type=Float,Description=\"Estimated probability the variant is true using som "
377                                   + som_file + ", true variants from " + trueVCF + ", and false variants from " + falseVCF + "\">");
378     }
379 
380     if (debug) start_timer();
381 
382     vector<Variant> variants;
383     if (train) {
384         map<string, pair<double, double> > normalizationLimits;
385         while (variantFile.getNextVariant(var)) {
386             variants.push_back(var);
387             int ai = 0;
388             vector<string>::iterator a = var.alt.begin();
389             for ( ; a != var.alt.end(); ++a, ++ai) {
390                 vector<double> record;
391                 double td;
392                 vector<string>::iterator j = fields.begin();
393                 for (; j != fields.end(); ++j) {
394                     if (*j == "QUAL") { // special handling...
395                         td = var.quality;
396                     } else {
397                         if (var.info.find(*j) == var.info.end()) {
398                             td = 0;
399                         } else {
400                             if (variantFile.infoCounts[*j] == 1) { // for non Allele-variant fields
401                                 convert(var.info[*j][0], td);
402                             } else {
403                                 convert(var.info[*j][ai], td);
404                             }
405                         }
406                     }
407                     if (normalize) {
408                         pair<double, double>& limits = normalizationLimits[*j];
409                         if (td < limits.first) limits.first = td;
410                         if (td > limits.second) limits.second = td;
411                     }
412                     record.push_back(td);
413                 }
414                 data.push_back(record);
415             }
416         }
417         // normalize inputs
418         if (normalize) {
419             // get normalization vector
420             // goal is normalization at 0, sd=1
421             int i = 0;
422             for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f, ++i) {
423                 vector<double> fv;
424                 for (vector<vector<double> >::iterator d = data.begin(); d != data.end(); ++d) {
425                     fv.push_back(d->at(i));
426                 }
427                 Stats& s = stats[*f];
428                 // get normalization constants
429                 s.mean = mean(fv);
430                 s.stdev = standard_deviation(fv, s.mean);
431                 // normalize
432                 for (vector<vector<double> >::iterator d = data.begin(); d != data.end(); ++d) {
433                     double v = d->at(i);
434                     d->at(i) = (v - s.mean) / s.stdev;
435                 }
436             }
437         }
438     }
439 
440     vector<double*> dataptrs (data.size());
441     for (unsigned i=0, e=dataptrs.size(); i<e; ++i) {
442         dataptrs[i] = &(data[i][0]); // assuming !thing[i].empty()
443     }
444 
445     if (debug) print_timing( "Input Processing" );
446 
447     if (apply) {
448         if (! (net = som_deserialize(som_file.c_str()))) {
449             cerr << "could not load SOM from " << som_file << endl;
450             return 1;
451         }
452         if (!fields.empty()) {
453             cerr << "fields specified, but a SOM is to be applied, and metadata should be stored at " << som_metadata_file << endl;
454             return 1;
455         }
456         if (!load_som_metadata(som_metadata_file, width, height, fields, stats)) {
457             cerr << "could not load SOM metadata from " << som_metadata_file << endl;
458             return 1;
459         }
460     } else {
461 
462         net = som_network_new(data[0].size(), height, width);
463 
464         if ( !net )	{
465             printf( "ERROR: som_network_new failed.\n" );
466             return 1;
467         }
468     }
469 
470     if (debug) print_timing( "Network Creation" );
471 
472     if (train) {
473         if (debug) cerr << "Training using " << data.size() << " input vectors" << endl;
474         som_init_weights ( net, &dataptrs[0], data.size() );
475         som_train ( net, &dataptrs[0], data.size(), iterations );
476     }
477 
478     if (debug) print_timing( "Network Training" );
479 
480     // open and calibrate using the true and false datasets
481 
482     if (train && apply_to_training_data) {
483         // currently disabled
484         /*
485         cout << variantFile.header << endl;
486         vector<Variant>::iterator v = variants.begin(); int di = 0;
487         for ( ; v != variants.end() && di < data.size(); ++v) {
488             var.info["SOMX"].clear();
489             var.info["SOMY"].clear();
490             var.info["SOMP"].clear();
491             var.info["SOMD"].clear();
492             for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++di) {
493                 som_set_inputs ( net, dataptrs[di] );
494                 size_t x=0, y=0;
495                 som_get_best_neuron_coordinates ( net, &x, &y );
496                 v->info["SOMX"].push_back(convert(x));
497                 v->info["SOMY"].push_back(convert(y));
498                 if (!centerv.empty()) {
499                     float distance = sqrt(pow(abs((float)centerx - (float)x), 2)
500                                           + pow(abs((float)centery - (float)y), 2));
501                     var.info["SOMD"].clear();
502                     var.info["SOMD"].push_back(convert(distance));
503                 }
504             }
505             cout << *v << endl;
506         }
507         */
508     } else if (apply) {
509 
510         // if we have true and false sets, use them to "paint" the map
511         vector<vector<SomPaint> > paintedSOM;
512         paintedSOM.resize(width);
513         for (vector<vector<SomPaint> >::iterator t = paintedSOM.begin();
514              t != paintedSOM.end(); ++t) {
515             t->resize(height);
516         }
517 
518         // handle trues
519         if (!trueVCF.empty()) {
520             VariantCallFile trueVariantFile;
521             trueVariantFile.open(trueVCF);
522             Variant v(trueVariantFile);
523             while (trueVariantFile.getNextVariant(v)) {
524                 int ai = 0;
525                 vector<string>::iterator a = v.alt.begin();
526                 for ( ; a != v.alt.end(); ++a, ++ai) {
527                     vector<double> record;
528                     read_fields(v, ai, fields, record);
529                     if (normalize) {
530                         normalize_inputs(record, fields, stats);
531                     }
532                     som_set_inputs ( net, &record[0] );
533                     size_t x=0, y=0;
534                     som_get_best_neuron_coordinates ( net, &x, &y );
535                     paintedSOM[x][y].true_count += 1;
536                 }
537             }
538         }
539 
540         // get falses
541         if (!falseVCF.empty()) {
542             VariantCallFile falseVariantFile;
543             falseVariantFile.open(falseVCF);
544             Variant v(falseVariantFile);
545             while (falseVariantFile.getNextVariant(v)) {
546                 int ai = 0;
547                 vector<string>::iterator a = v.alt.begin();
548                 for ( ; a != v.alt.end(); ++a, ++ai) {
549                     vector<double> record;
550                     read_fields(v, ai, fields, record);
551                     if (normalize) {
552                         normalize_inputs(record, fields, stats);
553                     }
554                     som_set_inputs ( net, &record[0] );
555                     size_t x=0, y=0;
556                     som_get_best_neuron_coordinates ( net, &x, &y );
557                     paintedSOM[x][y].false_count += 1;
558                 }
559             }
560         }
561 
562         // estimate probability of each node using true and false set
563         for (vector<vector<SomPaint> >::iterator t = paintedSOM.begin();
564              t != paintedSOM.end(); ++t) {
565             for (vector<SomPaint>::iterator p = t->begin(); p != t->end(); ++p) {
566                 //cout << "count at node " << t - paintedSOM.begin() << "," << p - t->begin()
567                 //     << " is " << p->true_count << " true, " << p->false_count << " false" << endl;
568                 if (p->true_count + p->false_count > 0) {
569                     p->prob_true = (double) p->true_count / (double) (p->true_count + p->false_count);
570                 } else {
571                     // for nodes without training data, could we estimate from surrounding nodes?
572                     // yes, TODO, but for now we can be conservative and say "0"
573                     p->prob_true = 0;
574                 }
575             }
576         }
577 
578         cout << variantFile.header << endl;
579         while (variantFile.getNextVariant(var)) {
580             var.info["SOMX"].clear();
581             var.info["SOMY"].clear();
582             var.info["SOMP"].clear();
583             var.info["SOMD"].clear();
584             int ai = 0;
585             vector<string>::iterator a = var.alt.begin();
586             for ( ; a != var.alt.end(); ++a, ++ai) {
587                 vector<double> record;
588                 read_fields(var, ai, fields, record);
589                 if (normalize) {
590                     normalize_inputs(record, fields, stats);
591                 }
592                 som_set_inputs ( net, &record[0] );
593                 size_t x=0, y=0;
594                 som_get_best_neuron_coordinates ( net, &x, &y );
595                 if (!trueVCF.empty() && !falseVCF.empty()) {
596                     SomPaint& paint = paintedSOM[x][y];
597                     var.info["SOMP"].push_back(convert(paint.prob_true));
598                 }
599                 var.info["SOMX"].push_back(convert(x));
600                 var.info["SOMY"].push_back(convert(y));
601                 if (!centerv.empty()) {
602                     float distance = sqrt(pow(abs((float)centerx - (float)x), 2)
603                                           + pow(abs((float)centery - (float)y), 2));
604                     var.info["SOMD"].push_back(convert(distance));
605                 }
606             }
607             cout << var << endl;
608         }
609     }
610 
611     if (debug) print_timing( "Input Recognition" );
612 
613     if (train) {
614         if (!save_som_metadata(som_metadata_file, width, height, fields, stats)) {
615             cerr << "could not save metadata to " << som_metadata_file << endl;
616         }
617         som_serialize(net, som_file.c_str());
618     }
619 
620     som_network_destroy ( net );
621 
622     if (debug) print_timing( "Network Destruction" );
623 
624     return 0;
625 
626 }
627