#!/usr/bin/perl -w
#
# vim: sw=4 ts=4 expandtab smarttab
#
# vw-varinfo   Summarize features of a training-set using VW
#   Input:     A vw training set file
#   Output:    A list of features, their VW hash values, min/max
#              values, regressor weights, and distance from
#              the best constant.
#
# Algorithm:
#   1) Collect all variables and their ranges from training-set
#   2) Train with VW to determine regressor weights
#   3) Build a test-set with a single example including all variables
#   4) run VW with --audit on 3) to map variable names to hash values
#      and weights.
#   5) Output collected information about the input variables.
#
# NOTE: distance from the best constant predictor is not really
# variable 'importance', in the sense that it is not a reliable indicator
# of prediction performance in general. For example, if you have two
# equal co-occurring features, the sum of the weights is what matters
# and individual weight values are arbitrary.
#
# Distance represents the relative distance of the regressor weight
# from the weight of the 'zero' which is vw's best constant prediction.
#
# Merits of this distance:
#   1) It is a relative metric, making it easier to compare two features
#   2) Easier to interpret vs an arbitrary weight (max is set to 100%)
#   3) It is signed, so it shows which features are positively vs
#      negatively correlated with the target label.
#
# (c) 2012 - ariel faigon for vowpal-wabbit
# This software may be distributed under the same terms as vowpal-wabbit
#
use strict;
use warnings;

# Command-line flags (set by get_args, not Getopt): -v -V -O<metric> -K
our ($opt_v, $opt_V, $opt_O, $opt_K);

my $VW = 'vw';
# default vw options are more optimal now, don't try to make them so
my $VWARGS = '';

my ($TrainFile, $ModelFile, $RmodelFile, $ExampleFile, $AuditFile);
my (%FeatureMax, %FeatureMin);      # per-feature observed value range

my (%Feature2Hash, %Feature2Weight);

# NOTE: was declared 'my %NameSpace' while every use said %NameSpaces,
# which only "worked" as an undeclared package global (no 'use strict').
my %NameSpaces;     # hash of hashes: namespace => { key => val }...
my @Features;       # feature names list

my @QPairs = ();    # list of pairs ([a, b], [c, d] ...) for namespace pairing
my %Ignore;         # support for --ignore X
my %Keep;           # support for --keep X
my $DoKeep;         # flag for whether we need to use --keep or not
my $MultiClass = 0; # flag for multi-class (--oaa --csoaa --wap* --sequence?)
my %Labels;
my @Labels = (1);   # List of labels for super example generation

my %Label2FW;       # for multi-class: every label has feature->weight
my %Prediction;     # prediction of each isolated multi-class label

my @TmpFiles;

#
# v() & V(): verbose/debug prints for -v and -V
# With a single arg, acts like print; with more, acts like printf.
#
sub v(@) {
    return unless $opt_v;
    if (@_ == 1) {
        print STDERR @_;
    } else {
        printf STDERR @_;
    }
}

sub V(@) {
    return unless $opt_V;
    if (@_ == 1) {
        print STDERR @_;
    } else {
        printf STDERR @_;
    }
}

#
# usage([@message])
# Print an optional error message plus the usage text, then die.
#
sub usage(@) {
    print STDERR @_, "\n" if (@_);
    die "Usage: $0 [options] [vw-options] <training-set-file>
    Options:
        -v          verbose
        -V          more verbose
        -K          keep temporary files
        -O<which>   Use order/ranking metric <which>
                    Supported metrics:
                        ... not implemented yet ...

    vw-options:
        Note that all the above options do not clash with vw options
        All other options will be passed as-is to the vw training step.

    See the script source head comments for more details.
";
}

#
# get_args()
# Parse @ARGV: split our own flags (-v -V -K -O -P) from options that
# are passed through to vw, and determine the training-set file name
# (must be the last arg).  Also derives %Keep/%Ignore/@QPairs and the
# multi-class flag from the pass-through vw options.
#
sub get_args {
    $0 =~ s{.*/}{};

    if (-f $ARGV[-1]) {
        $TrainFile = pop(@ARGV);
    } else {
        usage("last command line arg must be a training-set file");
    }

    my @vw_opts_and_args = ();
    foreach my $arg (@ARGV) {
        if ($arg =~ /^-[vVKOP]+$/) {
            # These options are for us, not for vw
            $opt_v = 1 if ($arg =~ /v/);
            $opt_V = 1 if ($arg =~ /V/);
            $opt_K = 1 if ($arg =~ /K/);
            $opt_v = 1 if ($opt_V);
            if ($arg =~ /O/) {
                # -O takes a value: grab the word following it in @ARGV
                ($opt_O) = ("@ARGV" =~ /$arg\s+(\S+)\b/);
            }
            if ($arg =~ /P/) {
                usage("-P: option no longer supported.\n" .
                      "Please pass VW options directly.\n");
            }
            next;
        }
        if (-f $arg) {
            # A file arg that is not last: decide whether it is a stray
            # training-set (to be skipped) or the file-argument of the
            # preceding vw option.  Guard against an empty option list
            # (avoids an uninitialized-value warning on $list[-1]).
            my $skip_ts = 0;
            my $prev_opt = @vw_opts_and_args ? $vw_opts_and_args[-1] : '';
            if ($prev_opt =~ '-d|--data') {
                pop(@vw_opts_and_args);
                $skip_ts = 1;
                # These options have file-args following them
            } elsif ($prev_opt !~
                        /^(?:
                            -p
                            |--predictions
                            |-i
                            |--initial_regressor
                            |-f
                            |--final_regressor
                            |--feature_mask
                            |-r
                            |--raw_predictions
                            |--cache_file
                            |--pid_file
                            |--readable_model
                            |--output_feature_\S+
                        )$/x) {
                $skip_ts = 1;
            }
            if ($skip_ts) {
                warn "ignoring trainset: $arg in vw-args: train-set must be last arg\n";
                next;
            }
        }
        push(@vw_opts_and_args, $arg);
    }
    $opt_O = '' unless (defined $opt_O);
    usage("You must supply a training-set file")
        unless (defined $TrainFile);

    usage("training-set file: $TrainFile: $!")
        unless (-f $TrainFile);

    if (@vw_opts_and_args) {
        $VWARGS = "@vw_opts_and_args";
    }
    # Collect namespace-pairing (-q XY), --keep and --ignore requests
    while ($VWARGS =~ /-q\s*(\S)(\S)/g) {
        push(@QPairs, [$1, $2]);
    }
    while ($VWARGS =~ /--keep\s*(\S)/g) {
        $DoKeep = 1;
        $Keep{$1} = 1;
    }
    $Keep{''} = 1;      # to be consistent with vw no-namespace behavior
    while ($VWARGS =~ /--ignore\s*(\S)/g) {
        $Ignore{$1} = 1;
    }
    if ($VWARGS =~ /--(?:(?:cs)?oaa|wap|ect|sequence)/) {
        if ($VWARGS =~ /--(?:wap|ect)/) {
            # Please send a patch when/if you can figure these out
            die "$0: --wap, --ect multi-class is not supported - sorry\n";
        }
        $MultiClass = 1;
    }

    # Since we need to pass '-f model', ensure it doesn't clash
    # with the user explicitly passing these
    if ($VWARGS =~ /(?:(?:^|\s)(?:-f|--final_regressor)\s+(\S+))/) {
        $ModelFile = $1;
    } else {
        $ModelFile = "$TrainFile.model";
        push(@TmpFiles, $ModelFile);
    }

    $ExampleFile = "$TrainFile.examples";
    $AuditFile = "$TrainFile.audit";

    push(@TmpFiles, $ExampleFile, $AuditFile);
}

#
# cleanup()
# Remove temporary files, unless -K (keep) was requested.
#
sub cleanup {
    if ($opt_K) {
        v("keeping temporary files: @TmpFiles\n");
        return;
    }
    foreach my $tmpfile (@TmpFiles) {
        unlink($tmpfile);
    }
}


#
# symbolic full feature (name-space + feature-name) as used by --audit
#
sub feature_name(@) {
    join('^', @_);
}

#
# pair_features()
# Initialize %FeatureMin and %FeatureMax for all paired
# name-spaces based on the @QPairs list which was constructed
# from VW -q ... arguments.
#
# Respect all --ignore and --keep logic while doing so.
#
sub pair_features {
    my %paired_features;
    my @name_spaces = keys %NameSpaces;
    my (@matching_ns1, @matching_ns2);

    # -q arguments name namespaces by their first character only,
    # so match each pair-member against all namespaces' first chars.
    foreach my $pair_ref (@QPairs) {
        my ($x, $y) = @$pair_ref;
        foreach my $ns1 (@name_spaces) {
            my $ns1_ch = substr($ns1, 0, 1);
            if ($x eq $ns1_ch) {
                push(@matching_ns1, $ns1)
                    if (exists($Keep{$x}) or !exists($Ignore{$x}));
            }
        }
        foreach my $ns2 (@name_spaces) {
            my $ns2_ch = substr($ns2, 0, 1);
            if ($y eq $ns2_ch) {
                push(@matching_ns2, $ns2)
                    if (exists($Keep{$y}) or !exists($Ignore{$y}));
            }
        }
    }
    # Cross-product of all features in all matched namespace pairs
    foreach my $ns1 (@matching_ns1) {
        foreach my $ns2 (@matching_ns2) {
            my $nsref1 = $NameSpaces{$ns1};
            my $nsref2 = $NameSpaces{$ns2};
            foreach my $key1 (keys %$nsref1) {
                foreach my $key2 (keys %$nsref2) {
                    my $feature = feature_name($ns1, $key1, $ns2, $key2);
                    $FeatureMax{$feature} = 0;
                    $FeatureMin{$feature} = 0;
                }
            }
        }
    }
}

#
# parse_labels($label_part_of_example)
# Accumulate multi-class labels (and optional per-label weights)
# into %Labels; return the numerically-sorted list of labels seen so far.
#
sub parse_labels($) {
    my $labels = shift;
    $labels =~ s/\s+\S+$//;     # trim optional tag (touching the '|')
    while ($labels =~ /([^:\s]+):?(\S+)?/g) {
        # match labels and optional weights
        $Labels{$1} = (defined $2) ? $2 : 1;
    }
    sort {$a <=> $b} keys %Labels;
}

#
# read_features($trainingset_file)
# Read the training set & parse it, collect all name-spaces,
# feature-names, min/max values
#
sub read_features($) {
    my ($trainset) = @_;

    my $ts;
    if ($trainset =~ /\.gz$/) {
        open($ts, "gunzip -c $trainset|") || die "$0: gunzip -c $trainset|: $!\n";
    } else {
        # three-arg open: a trainset name starting with '>' or '|'
        # can no longer change the open mode
        open($ts, '<', $trainset) || die "$0: $trainset: $!\n";
    }
    while (<$ts>) {
        # -- examples loop
        next unless (/\S/);     # skip empty lines

        # -- grab anything following the 1st '|'
        my ($labels, $input_features) = ($_ =~ /^([^|]*)\|(.*)$/);
        die "$0: $trainset line $.: malformed example: missing '|'\n"
            unless (defined $input_features);

        if ($MultiClass) {
            @Labels = parse_labels($labels);
        }

        my @name_space_region = split('\|', $input_features);
        foreach my $nsr (@name_space_region) {
            # -- name-spaces loop (note: name-space may be ''):
            #    extract the name-space string, ignore (optional) :weight
            my ($ns) = ($nsr =~ /^([^:\s]+)(?:\:\S+)?/);
            $ns = '' unless ((defined $ns) && length($ns));

            my $ns_ch1 = substr($ns, 0, 1);

            next if (exists $Ignore{$ns_ch1});
            next if ($DoKeep && !exists $Keep{$ns_ch1});

            unless (exists $NameSpaces{$ns}) {
                $NameSpaces{$ns} = {};
            }
            my $nsref = $NameSpaces{$ns};

            # Trim (the optionally empty) name-space prefix,
            # including the optional :weight.
            # \Q...\E: namespace may contain regex metacharacters
            $nsr =~ s/^\Q$ns\E\S*\s*//;

            # Following the name-space: loop over feature+value pairs:
            foreach my $keyval (split(/\s+/, $nsr)) {
                # -- features loop
                my ($key, $val);
                if ($keyval =~ /:/) {   # explicit :value
                    ($key, $val) = ($`, $');
                } else {                # implicit value == 1
                    $key = $keyval;
                    $val = 1;
                }
                $nsref->{$key} = $val;

                my $f = feature_name($ns, $key);

                # -- record min, max per feature
                unless (exists $FeatureMax{$f}) {
                    $FeatureMax{$f} = 0;
                    $FeatureMin{$f} = 0;
                }
                if ($FeatureMax{$f} < $val) {
                    $FeatureMax{$f} = $val;
                }
                if ($FeatureMin{$f} > $val) {
                    $FeatureMin{$f} = $val;
                }
            }
        }
    }
    close $ts;

    # Add the -q pairs of features
    pair_features();

    # Add the Constant feature
    $FeatureMin{'Constant'} = $FeatureMax{'Constant'} = 0;
}

#
# do_train($trainset, $model)
# run the training stage to compute per feature weights
#
sub do_train($$) {
    my ($trainset, $model) = @_;
    my $cmd = "$VW --quiet $VWARGS";
    # Append trainset/model only if not already given via vw options
    unless ($cmd =~ /(?:\s(?:-d|--data)\s)/) {
        $cmd .= " $trainset";
    }
    unless ($cmd =~ /(?:\s(?:-f|--final_regressor))/) {
        $cmd .= " -f $model";
    }

    if ($opt_v) {
        # verbose: let vw print its progress
        $cmd =~ s/ --quiet / /;
    }

    v("training: %s\n", $cmd);
    system($cmd);
    die "$0: vw training failed (see details above)\n"
        unless ($? == 0);
}

#
# generate_one_example($filehandle, $label)
# Write one example line containing every known feature (weight 1)
# so the audit pass can map all features at once.
#
sub generate_one_example($$) {
    my ($fd, $label) = @_;

    if ($MultiClass) {
        printf $fd "%s:1", $label;
        # foreach $label2 (@Labels) {
        #    next if ($label eq $label2);
        #    printf $fd " %s:0", $label2;
        # }
    } else {
        # simple, non multi-class case
        print $fd $label;
    }
    # print all possible input features, with a weight of 1
    foreach my $ns (keys %NameSpaces) {
        my $nsref = $NameSpaces{$ns};
        printf $fd ' |%s', $ns;
        foreach my $key (sort keys %$nsref) {
            my $weight = 1;
            printf $fd ' %s:%s', $key, $weight;
        }
    }
    print $fd "\n";
}

#
# generate_examples($example_file)
# Write the all-features example file: one line per label
# (a single line with label=1 in the non multi-class case).
#
sub generate_examples($) {
    my ($example_file) = shift;
    open(my $fd, '>', $example_file) ||
        die "$0: can't write full_example file: '$example_file': $!\n";

    v("Labels: @Labels\n");
    foreach my $label (@Labels) {
        # One line per label:
        # multiclass deprecates to singleton: label=1
        generate_one_example($fd, $label);
    }
    close $fd;
}

my %SeenFeatureNames;
my $MCLabel;
my $MCLabelIndex = -1;

#
# audit_one_example($audit_stream)
# Consume one (prediction-line, audit-line) pair from the vw --audit
# output and record hash values and weights for every audited feature.
#
sub audit_one_example($) {
    my $audit_stream = shift;

    # skip the prediction line
    # we're only interested in the audit line
    my $prediction = <$audit_stream>;
    my $features_data = <$audit_stream>;

    my $weight_href;
    if ($MultiClass) {
        # examples were generated one per label, in @Labels order:
        # cycle through the labels in lock-step with the input
        if (++$MCLabelIndex >= @Labels) {
            $MCLabelIndex = 0;
        }
        $MCLabel = $Labels[$MCLabelIndex];
        $weight_href = $Label2FW{$MCLabel} = {};
        chomp($prediction);
        $Prediction{$MCLabel} = $prediction;
    }

    chomp($features_data);
    my @features_list = split(' ', $features_data);

    while (@features_list) {
        my $audited_item = shift @features_list;
        next unless ($audited_item);

        # Audited feature format: namespace^varname:142703:1:0.0435613 ...
        my (@fields) = split(':', $audited_item);

        my ($feature, $hashval, $value, $weight) = @fields[-4 .. -1];

        # Trim '@0' and similar suffixes.
        $weight =~ s/@.*$//;

        unless ($feature) {
            if ($MultiClass) {
                # per-class constant has no name in the audit output
                $feature = "Constant_$MCLabel";
                $FeatureMax{$feature} = 0;
                $FeatureMin{$feature} = 0;
            }
        }

        $SeenFeatureNames{$feature} = 1;
        $Feature2Hash{$feature} = $hashval;

        if ($MultiClass) {
            # v("audit_one_example: MC=$MultiClass Label=$MCLabel {$feature} = $weight\n");
            $weight_href->{$feature} = $weight;
        } else {
            $Feature2Weight{$feature} = $weight;
        }
        V("%s\t%s\t%s\t%s\n", $feature, $hashval, $value, $weight);
    }
}

#
# audit_features()
# read the output of vw -a (audit) on the all-feature example
# to extract hash values and weights
# Return the list of all feature-names
#
sub audit_features {
    generate_examples($ExampleFile);

    # Bug in vw multiclass, looks like we need to pass the multiclass
    # params to --audit even though they should be in the model
    my $vw_audit_args = "--quiet -t --audit -i $ModelFile -d $ExampleFile";
    my $vw_mcargs = '';
    if ($VWARGS =~ /--(?:(?:cs)?oaa|wap|sequence)(?:_ldf)?\s+\d+/) {
        $vw_mcargs = $&;
    }
    my $audit_cmd = "$VW $vw_mcargs $vw_audit_args";
    $audit_cmd .= "|tee $AuditFile" if ($opt_K);

    open(my $audit_stream, "$audit_cmd |") ||
        die "$0: can't run \$audit_cmd: '$audit_cmd |'\n";

    while (!eof($audit_stream)) {
        audit_one_example($audit_stream);
    }

    close $audit_stream;

    # Return the list of features actually seen in the audit
    sort keys %SeenFeatureNames;
}


#
# return 'score' metric for a given feature
#
sub score($$;$) {
    my ($class_label, $feature, $metric) = @_;

    my $f2w_hashref = $MultiClass
                        ? $Label2FW{$class_label}
                        : \%Feature2Weight;

    my $fweight = $f2w_hashref->{$feature};
    unless (defined $fweight) {
        warn "$0: BUG?: score($class_label, $feature): undef weight!\n";
        return undef;
    }

    # Support more metrics, are there any others that make sense?
    # if ($metric eq '...') ...

    $fweight;
}

#
# Find maximum feature-name length and max/min values
#
sub feature_flen_min_max($@) {
    my $w_href = shift @_;

    my ($max_flen, $min_weight, $max_weight) = (10, 0, 0);

    foreach my $f (@_) {
        my $w = $w_href->{$f};
        unless (defined $w) {
            # Should already be caught in score() above,
            # so warn only in verbose mode
            v("%s: feature_flen_min_max: %s: undefined weight\n", $0, $f);
            next;
        }
        my $flen = length($f);
        $max_flen = $flen if ($flen > $max_flen);

        $max_weight = $w if ($w > $max_weight);
        $min_weight = $w if ($w < $min_weight);
    }
    ($max_flen, $min_weight, $max_weight);
}

#
# Find min/max score and zero the score of the constant feature
#
sub feature_min_max_score($@) {
    my $class_label = shift @_;

    my ($min_score, $max_score) = (0, 0);
    my $score_href = {};        # feature->score

    foreach my $f (@_) {        # features loop
        if ($f =~ /^Constant/) {
            # constant feature is the reference point: score 0
            $score_href->{$f} = 0;
            next;
        }

        my $score = score($class_label, $f, $opt_O);
        next unless (defined $score);
        $score_href->{$f} = $score;

        $max_score = $score if ($score > $max_score);
        $min_score = $score if ($score < $min_score);
    }
    ($min_score, $max_score, $score_href);
}

#
# print_feature_report($max_flen, $max_distance_from_0,
#                      $class_label, $score_href, @features)
# Print the per-feature table (name, hash, min/max, weight, RelScore),
# sorted by descending score.
#
sub print_feature_report($$$$@) {
    my ($max_flen, $max_distance_from_0,
        $class_label, $score_href, @features) = @_;

    my $weight_href = $MultiClass
                        ? $Label2FW{$class_label}
                        : \%Feature2Weight;

    printf "%-*s\t%+10s %8s %8s %+9s %+10s\n",
            $max_flen, 'FeatureName', 'HashVal',
            'MinVal', 'MaxVal',
            'Weight', 'RelScore';

    # TODO? support different orders:
    #   by weight, by feature-range-normalized weights?
    foreach my $f (sort {
                        $score_href->{$b} <=> $score_href->{$a}
                   } @features) {

        my $score = $score_href->{$f};
        my $distance_from_0 = $score;
        # avoid division by zero when all scores are 0
        $max_distance_from_0 = 1e-10 if ($max_distance_from_0 == 0);
        my $normalized_score = $distance_from_0 / $max_distance_from_0;

        # FIXME: support different normalization schemes
        if ($opt_O =~ /^a/i) {
            $normalized_score = abs($normalized_score);
        }

        # On the fly generated (e.g. -q crossed features) don't have a min/max
        $FeatureMin{$f} = 0 unless (exists $FeatureMin{$f});
        $FeatureMax{$f} = 0 unless (exists $FeatureMax{$f});

        printf "%-*s\t%10u %8.2f %8.2f %+9.4f %9.2f%%\n",
                $max_flen, $f,
                $Feature2Hash{$f},
                $FeatureMin{$f},
                $FeatureMax{$f},
                $weight_href->{$f},
                (100.0 * $normalized_score);
    }
    print "\n" if ($MultiClass);
}


#
# summarize_features
# Output what we know about all features + relative score
#
sub summarize_features {
    # Per-class loop for multi-class,
    # only one loop for non multi-class
    foreach my $label (@Labels) {
        my @features = ($MultiClass)
                            ? (keys %{$Label2FW{$label}})
                            : @Features;

        my ($min_score, $max_score, $score_href) =
            feature_min_max_score($label, @features);
        my $score_range = $max_score - $min_score;

        my ($max_flen, $min_wt, $max_wt) =
            feature_flen_min_max($score_href, @features);
        my $range_weight = $max_wt - $min_wt;

        my $upper_range = abs($max_score);
        my $lower_range = abs($min_score);

        my $max_distance_from_0 = ($upper_range > $lower_range)
                                    ? $upper_range
                                    : $lower_range;

        printf "=== Class Label: %s\tPrediction: %s\n",
                $label, $Prediction{$label}
            if ($MultiClass);

        print_feature_report($max_flen, $max_distance_from_0,
                             $label, $score_href, @features);
    }
}

# -- main
get_args();
read_features($TrainFile);
do_train($TrainFile, $ModelFile);
@Features = audit_features();
summarize_features();
cleanup();