1#!/usr/bin/perl -w
2#
3# vim: sw=4 ts=4 expandtab smarttab
4#
5# vw-varinfo      Summarize features of a training-set using VW
6#   Input:          A vw training set file
7#   Output:         A list of features, their VW hash values, min/max
8#                   values, regressor weights, and distance from
9#                   the best constant.
10#
11# Algorithm:
12#   1)  Collect all variables and their ranges from training-set
13#   2)  Train with VW to determine regressor weights
14#   3)  Build a test-set with a single example including all variables
15#   4)  run VW with --audit on 3) to map variable names to hash values
16#       and weights.
17#   5)  Output collected information about the input variables.
18#
19# NOTE: distance from the best constant predictor is not really
20# variable 'importance', in the sense that it is not a reliable indicator
21# of prediction performance in general.  For example, if you have two
22# equal co-occurring features, the sum of the weights is what matters
23# and individual weight values are arbitrary.
24#
25# Distance represents the relative distance of the regressor weight
26# from the weight of the 'zero' which is vw's best constant prediction.
27#
28# Merits of this distance:
29#   1) It is a relative metric, making it easier to compare two features
30#   2) Easier to interpret vs an arbitrary weight (max is set to 100%)
31#   2) It is signed, so it shows which features are positively vs
32#      negatively correlated with the target label.
33#
34# (c) 2012 - ariel faigon for vowpal-wabbit
35# This software may be distributed under the same terms as vowpal-wabbit
36#
37# use Getopt::Std;
38use vars (qw($opt_v $opt_V $opt_O $opt_K));
39
40my $VW = 'vw';
41# default vw options are more optimal now, don't try to make them so
42my $VWARGS = '';
43
44my ($TrainFile, $ModelFile, $RmodelFile, $ExampleFile, $AuditFile);
45my (%FeatureMax, %FeatureMin);
46
47my (%Feature2Hash, %Feature2Weight);
48
49my %NameSpace;      # hash of hashes: namespace => { key => val }...
50my @Features;       # feature names list
51
52my @QPairs = ();    # list of pairs ([a, b], [c, d] ...) for namespace pairing
53my %Ignore;         # support for --ignore X
54my %Keep;           # support for --keep X
55my $DoKeep;         # flag for whether we need to use --keep or not
56my $MultiClass = 0; # flag for multi-class (--oaa --csoaa --wap* --sequence?)
57my %Labels;
58my @Labels = (1);   # List of labels for super example generation
59
60my %Label2FW;       # for multi-class: every label has feature->weight
61my %Prediction;     # prediction of each isolated multi-class label
62
63my @TmpFiles;
64
65#
66# v() & V():  verbose/debug prints for -v and -V
67#
68sub v(@) {
69    return unless $opt_v;
70    if (@_ == 1) {
71        print STDERR @_;
72    } else {
73        printf STDERR @_;
74    }
75}
76
77sub V(@) {
78    return unless $opt_V;
79    if (@_ == 1) {
80        print STDERR @_;
81    } else {
82        printf STDERR @_;
83    }
84}
85
86sub usage(@) {
87    print STDERR @_, "\n" if (@_);
88    die "Usage: $0 [options] [vw-options] <training-set-file>
89    Options:
90        -v          verbose
91        -V          more verbose
92        -K          keep temporary files
93        -O<which>   Use order/ranking metric <which>
94                    Supported metrics:
95                        ... not implemented yet ...
96
97    vw-options:
98        Note that all the above options do not clash with vw options
99        All other options will be passed as-is to the vw training step.
100
101    See the script source head comments for more details.
102";
103}
104
105sub get_args {
106    $0 =~ s{.*/}{};
107
108    if (-f $ARGV[-1]) {
109        $TrainFile = pop(@ARGV);
110    } else {
111        usage("last command line arg must be a training-set file");
112    }
113
114    my @vw_opts_and_args = ();
115    foreach my $arg (@ARGV) {
116        if ($arg =~ /^-[vVKOP]+$/) {
117            # These options are for us, not for vw
118            $opt_v = 1 if ($arg =~ /v/);
119            $opt_V = 1 if ($arg =~ /V/);
120            $opt_K = 1 if ($arg =~ /K/);
121            $opt_v = 1 if ($opt_V);
122            if ($arg =~ /O/) {
123                ($opt_O) = ("@ARGV" =~ /$arg\s+(\S+)\b/);
124            }
125            if ($arg =~ /P/) {
126                usage("-P: option no longer supported.\n" .
127                       "Please pass VW options directly.\n");
128            }
129            next;
130        }
131        if (-f $arg) {
132            my $skip_ts = 0;
133            if ($vw_opts_and_args[-1] =~ '-d|--data') {
134                pop(@vw_opts_and_args);
135                $skip_ts = 1;
136            # These options have file-args following them
137            } elsif ($vw_opts_and_args[-1] !~
138                        /^(?:
139                            -p
140                            |--predictions
141                            |-i
142                            |--initial_regressor
143                            |-f
144                            |--final_regressor
145                            |--feature_mask
146                            |-r
147                            |--raw_predictions
148                            |--cache_file
149                            |--pid_file
150                            |--readable_model
151                            |--output_feature_\S+
152                        )$/x) {
153                $skip_ts = 1;
154            }
155            if ($skip_ts) {
156                warn "ignoring trainset: $arg in vw-args: train-set must be last arg\n";
157                next;
158            }
159        }
160        push(@vw_opts_and_args, $arg);
161    }
162    $opt_O = '' unless (defined $opt_O);
163    usage("You must supply a training-set file")
164        unless (defined $TrainFile);
165
166    usage("training-set file: $TrainFile: $!")
167        unless (-f $TrainFile);
168
169    if (@vw_opts_and_args) {
170        $VWARGS = "@vw_opts_and_args";
171    }
172    while ($VWARGS =~ /-q\s*(\S)(\S)/g) {
173        push(@QPairs, [$1, $2]);
174    }
175    while ($VWARGS =~ /--keep\s*(\S)/g) {
176        $DoKeep = 1;
177        $Keep{$1} = 1;
178    }
179    $Keep{''} = 1;          # to be consistent with vw no-namespce behavior
180    while ($VWARGS =~ /--ignore\s*(\S)/g) {
181        $Ignore{$1} = 1;
182    }
183    if ($VWARGS =~ /--(?:(?:cs)?oaa|wap|ect|sequence)/) {
184        if ($VWARGS =~ /--(?:wap|ect)/) {
185            # Please send a patch when/if you can figure these out
186            die "$0: --wap, --ect multi-class is not supported - sorry\n";
187        }
188        $MultiClass = 1;
189    }
190
191    # Since we need to pass '-f model', ensure it doesn't clash
192    # with the user explicitly passing these
193    if ($VWARGS =~ /(?:(?:^|\s)(?:-f|--final_regressor)\s+(\S+))/) {
194        $ModelFile = $1;
195    } else {
196        $ModelFile = "$TrainFile.model";
197        push(@TmpFiles, $ModelFile);
198    }
199
200    $ExampleFile = "$TrainFile.examples";
201    $AuditFile = "$TrainFile.audit";
202
203    push(@TmpFiles, $ExampleFile, $AuditFile);
204}
205
206sub cleanup {
207    if ($opt_K) {
208        v("keeping temporary files: @TmpFiles\n");
209        return;
210    }
211    foreach my $tmpfile (@TmpFiles) {
212        unlink($tmpfile);
213    }
214}
215
216
217#
218# symbolic full feature (name-space + feature-name) as used by --audit
219#
220sub feature_name(@) {
221    join('^', @_);
222}
223
224#
225# pair_features()
226#   Initialize %FeatureMin and %FeatureMax for all paired
227#   name-spaces based on the @QPairs list which was constructed
228#   from VW -q ... arguments.
229#
230#   Respect all --ignore and --keep logic while doing so.
231#
232sub pair_features {
233    my %paired_features;
234    my @name_spaces = keys %NameSpaces;
235    my (@matching_ns1, @matching_ns2);
236
237    foreach my $pair_ref (@QPairs) {
238        my ($x, $y) = @$pair_ref;
239        foreach my $ns1 (@name_spaces) {
240            my $ns1_ch = substr($ns1,0,1);
241            if ($x eq $ns1_ch) {
242                push(@matching_ns1, $ns1)
243                    if (exists($Keep{$x}) or !exists($Ignore{$x}));
244            }
245        }
246        foreach my $ns2 (@name_spaces) {
247            my $ns2_ch = substr($ns2,0,1);
248            if ($y eq $ns2_ch) {
249                push(@matching_ns2, $ns2)
250                    if (exists($Keep{$y}) or !exists($Ignore{$y}));
251            }
252        }
253    }
254    foreach my $ns1 (@matching_ns1) {
255        foreach my $ns2 (@matching_ns2) {
256            my $nsref1 = $NameSpaces{$ns1};
257            my $nsref2 = $NameSpaces{$ns2};
258            foreach my $key1 (keys %$nsref1) {
259                foreach my $key2 (keys %$nsref2) {
260                    my $feature = feature_name($ns1, $key1, $ns2, $key2);
261                    $FeatureMax{$feature} = 0;
262                    $FeatureMin{$feature} = 0;
263                }
264            }
265        }
266    }
267}
268
269sub parse_labels($) {
270    my $labels = shift;
271    $labels =~ s/\s+\S+$//; # trim optional tag (touching the '|')
272    while ($labels =~ /([^:\s]+):?(\S+)?/g) {
273        # match labels and optional weights
274        $Labels{$1} = (defined $2) ? $2 : 1;
275    }
276    sort {$a <=> $b} keys %Labels;
277}
278
279#
280# read_features($trainingset_file)
281#   Read the training set & parse it, collect all name-spaces,
282#   feature-names, min/max values
283#
284sub read_features($) {
285    my ($trainset) = @_;
286
287    my $ts;
288    if ($trainset =~ /\.gz$/) {
289        open($ts, "gunzip -c $trainset|") || die "$0: gunzip -c $trainset|: $!\n";
290    } else {
291        open($ts, $trainset) || die "$0: $trainset: $!\n";
292    }
293    while (<$ts>) {
294        # -- examples loop
295        next unless (/\S/);     # skip empty lines
296
297        # -- grab anything following the 1st '|'
298        my ($labels, $input_features) = ($_ =~ /^([^|]*)\|(.*)$/);
299        die "$0: $trainset line $.: malformed example: missing '|'\n"
300            unless (defined $input_features);
301
302        if ($MultiClass) {
303            @Labels = parse_labels($labels);
304        }
305
306        my @name_space_region = split('\|', $input_features);
307        foreach my $nsr (@name_space_region) {
308            # -- name-spaces loop (note: name-space my be ''):
309            #    extract the name-space string, ignore (optional) :weight
310            my ($ns) = ($nsr =~ /^([^:\s]+)(?:\:\S+)?/);
311            $ns = '' unless ((defined $ns) && length($ns));
312
313            my $ns_ch1 = substr($ns, 0, 1);
314
315            next if (exists $Ignore{$ns_ch1});
316            next if ($DoKeep && !exists $Keep{$ns_ch1});
317
318            unless (exists $NameSpaces{$ns}) {
319                $NameSpaces{$ns} = {};
320            }
321            my $nsref = $NameSpaces{$ns};
322
323            # Trim (the optionally empty) name-space prefix,
324            # including the optional :weight
325            $nsr =~ s/^$ns\S*\s*//;
326
327            # Following the name-space: loop over feature+value pairs:
328            foreach my $keyval (split(/\s+/, $nsr)) {
329                # -- features loop
330                my ($key, $val);
331                if ($keyval =~ /:/) {       # explicit :value
332                    ($key, $val) = ($`, $');
333                } else {                    # implicit value == 1
334                    $key = $keyval;
335                    $val = 1;
336                }
337                $nsref->{$key} = $val;
338
339                my $f = feature_name($ns, $key);
340
341                # -- record min, max per feature
342                unless (exists $FeatureMax{$f}) {
343                    $FeatureMax{$f} = 0;
344                    $FeatureMin{$f} = 0;
345                }
346                if ($FeatureMax{$f} < $val) {
347                    $FeatureMax{$f} = $val;
348                }
349                if ($FeatureMin{$f} > $val) {
350                    $FeatureMin{$f} = $val;
351                }
352            }
353        }
354    }
355    close $ts;
356
357    # Add the -q pairs of features
358    pair_features();
359
360    # Add the Constant feature
361    $FeatureMin{'Constant'} = $FeatureMax{'Constant'} = 0;
362}
363
364#
365# do_train
366#   run the training stage to compute per feature weights
367#
368sub do_train($$) {
369    my ($trainset, $model) = @_;
370    my $cmd = "$VW --quiet $VWARGS";
371    unless ($cmd =~ /(?:\s(?:-d|--data)\s)/) {
372        $cmd .= " $trainset";
373    }
374    unless ($cmd =~ /(?:\s(?:-f|--final_regressor))/) {
375        $cmd .= " -f $model";
376    }
377
378    if ($opt_v) {
379        $cmd =~ s/ --quiet / /;
380    }
381
382    v("training: %s\n", $cmd);
383    system($cmd);
384    die "$0: vw training failed (see details above)\n"
385        unless ($? == 0);
386}
387
388sub generate_one_example($$) {
389    my ($fd, $label) = @_;
390
391    if ($MultiClass) {
392        printf $fd "%s:1", $label;
393        # foreach $label2 (@Labels) {
394        #    next if ($label eq $label2);
395        #    printf $fd " %s:0", $label2;
396        # }
397    } else {
398        # simple, non multi-class case
399        print $fd $label;
400    }
401    # print all possible input features, with a weight of 1
402    foreach my $ns (keys %NameSpaces) {
403        my $nsref = $NameSpaces{$ns};
404        printf $fd ' |%s', $ns;
405        foreach my $key (sort keys %$nsref) {
406            my $weight = 1;
407            printf $fd ' %s:%s', $key, $weight;
408        }
409    }
410    print $fd "\n";
411}
412
413sub generate_examples($) {
414    my ($example_file) = shift;
415    open(my $fd, ">$example_file") ||
416        die "$0: can't write full_example file: '$example_file': $!\n";
417
418    v("Labels: @Labels\n");
419    foreach my $label (@Labels) {
420        # One line per label:
421        # multiclass deprecates to singleton: label=1
422        generate_one_example($fd, $label);
423    }
424    close $fd;
425}
426
427my %SeenFeatureNames;
428my $MCLabel;
429my $MCLabelIndex = -1;
430
431sub audit_one_example($) {
432    my $audit_stream = shift;
433
434    # skip the prediction line
435    # we're only interested in the audit line
436    my $prediction = <$audit_stream>;
437    my $features_data = <$audit_stream>;
438
439    my $weight_href;
440    if ($MultiClass) {
441        if (++$MCLabelIndex >= @Labels) {
442            $MCLabelIndex =0;
443        }
444        $MCLabel = $Labels[$MCLabelIndex];
445        $weight_href = $Label2FW{$MCLabel} = {};
446        chomp($prediction);
447        $Prediction{$MCLabel} = $prediction;
448    }
449
450    chomp($features_data);
451    my @features_list = split(' ', $features_data);
452
453    while (@features_list) {
454        my $audited_item = shift @features_list;
455        next unless ($audited_item);
456
457        # Audited feature format:   namespace^varname:142703:1:0.0435613 ...
458        my (@fields) = split(':', $audited_item);
459
460        my ($feature, $hashval, $value, $weight) = @fields[-4 .. -1];
461
462        # Trim '@0' and similar suffixes.
463        $weight =~ s/@.*$//;
464
465        unless ($feature) {
466            if ($MultiClass) {
467                $feature = "Constant_$MCLabel";
468                $FeatureMax{$feature} = 0;
469                $FeatureMin{$feature} = 0;
470            }
471        }
472
473        $SeenFeatureNames{$feature} = 1;
474        $Feature2Hash{$feature} = $hashval;
475
476        if ($MultiClass) {
477            # v("audit_one_example: MC=$MultiClass Label=$MCLabel {$feature} = $weight\n");
478            $weight_href->{$feature} = $weight;
479        } else {
480            $Feature2Weight{$feature} = $weight;
481        }
482        V("%s\t%s\t%s\t%s\n", $feature, $hashval, $value, $weight);
483    }
484}
485
486#
487# audit_features()
488#   read the output of vw -a (audit) on the all-feature example
489#   to extract hash values and weights
490#   Return the list of all feature-names
491#
492sub audit_features {
493    generate_examples($ExampleFile);
494
495    # Bug in vw multiclass, looks like we need to pass the multiclass
496    # params to --audit even though they should be in the model
497    my $vw_audit_args = "--quiet -t --audit -i $ModelFile -d $ExampleFile";
498    my $vw_mcargs = '';
499    if (${VWARGS} =~ /--(?:(?:cs)?oaa|wap|sequence)(?:_ldf)?\s+\d+/) {
500        $vw_mcargs = $&;
501    }
502    my $audit_cmd = "$VW $vw_mcargs $vw_audit_args";
503    $audit_cmd .= "|tee $AuditFile" if ($opt_K);
504
505    open(my $audit_stream, "$audit_cmd |") ||
506        die "$0: can't run \$audit_cmd: '$audit_cmd |'\n";
507
508    while (!eof($audit_stream)) {
509        audit_one_example($audit_stream);
510    }
511
512    close $audit_stream;
513
514    # Return the list of features actually seen in the audit
515    sort keys %SeenFeatureNames;
516}
517
518
519#
520# return 'score' metric for a given feature
521#
522sub score($$;$) {
523    my ($class_label, $feature, $metric) = @_;
524
525    my $f2w_hashref = $MultiClass
526                        ? $Label2FW{$class_label}
527                        : \%Feature2Weight;
528
529    my $fweight = $f2w_hashref->{$feature};
530    unless (defined $fweight) {
531        warn "$0: BUG?: score($class_label, $feature): undef weight!\n";
532        return undef;
533    }
534
535    # Support more metrics, are there any others that make sense?
536    # if ($metric eq '...') ...
537
538    $fweight;
539}
540
541#
542# Find maximum feature-name length and max/min values
543#
544sub feature_flen_min_max($@) {
545    my $w_href = shift @_;
546
547    my ($max_flen, $min_weight, $max_weight) = (10, 0, 0);
548
549    foreach my $f (@_) {
550        my $w = $w_href->{$f};
551        unless (defined $w) {
552            # Should already be caught in score() above,
553            # so warn only in verbose mode
554            v("%s: feature_flen_min_max: %s: undefined weight\n", $0, $f);
555            next;
556        }
557        my $flen = length($f);
558        $max_flen = $flen if ($flen > $max_flen);
559
560        $max_weight = $w if ($w > $max_weight);
561        $min_weight = $w if ($w < $min_weight);
562    }
563    ($max_flen, $min_weight, $max_weight);
564}
565
566#
567# Find min/max score and zero the score of the constant feature
568#
569sub feature_min_max_score($@) {
570    my $class_label = shift @_;
571
572    my ($min_score, $max_score) = (0, 0);
573    my $score_href = {};    # feature->score
574
575    foreach my $f (@_) {            # features loop
576        if ($f =~ /^Constant/) {
577            my $constant_feature_name = $f;
578            $score_href->{$f} = 0;
579            next;
580        }
581
582        my $score = score($class_label, $f, $opt_O);
583        next unless (defined $score);
584        $score_href->{$f} = $score;
585
586        $max_score = $score if ($score > $max_score);
587        $min_score = $score if ($score < $min_score);
588    }
589    ($min_score, $max_score, $score_href);
590}
591
592sub print_feature_report($$$$@) {
593    my ($max_flen, $max_distance_from_0,
594        $class_label, $score_href, @features) = @_;
595
596    my $weight_href = $MultiClass
597                        ? $Label2FW{$class_label}
598                        : \%Feature2Weight;
599
600    printf "%-*s\t%+10s %8s %8s %+9s %+10s\n",
601                $max_flen, 'FeatureName', 'HashVal',
602                'MinVal', 'MaxVal',
603                'Weight', 'RelScore';
604
605    # TODO? support different orders:
606    # by weight, by feature-range-normalized weights?
607    foreach my $f (sort {
608                     $score_href->{$b} <=> $score_href->{$a}
609                  } @features) {
610
611        my $score = $score_href->{$f};
612        my $distance_from_0 = $score;
613        $max_distance_from_0 = 1e-10 if ($max_distance_from_0 == 0);
614        my $normalized_score = $distance_from_0 / $max_distance_from_0;
615
616        # FIXME: support different normalization schemes
617        if ($opt_O =~ /^a/i) {
618            $normalized_score = abs($normalized_score);
619        }
620
621        # On the fly generated (e.g. -q crossed features) don't have a min/max
622        $FeatureMin{$f} = 0 unless (exists $FeatureMin{$f});
623        $FeatureMax{$f} = 0 unless (exists $FeatureMax{$f});
624
625        printf "%-*s\t%10u %8.2f %8.2f %+9.4f %9.2f%%\n",
626                $max_flen, $f,
627                $Feature2Hash{$f},
628                $FeatureMin{$f},
629                $FeatureMax{$f},
630                $weight_href->{$f},
631                (100.0 * $normalized_score);
632    }
633    print "\n" if ($MultiClass);
634}
635
636
637#
638# summarize_features
639#   Output what we know about all features + relative score
640#
641sub summarize_features {
642    # Per-class loop for multi-class,
643    # only one loop for non multi-class
644    foreach my $label (@Labels) {
645        my @features = ($MultiClass) ? (keys %{$Label2FW{$label}}) : @Features;
646
647        my ($min_score, $max_score, $score_href) =
648                            feature_min_max_score($label, @features);
649        my $score_range = $max_score - $min_score;
650
651        my ($max_flen, $min_wt, $max_wt) =
652                            feature_flen_min_max($score_href, @features);
653        my $range_weight = $max_wt - $min_wt;
654
655        my $upper_range = abs($max_score);
656        my $lower_range = abs($min_score);
657
658        my $max_distance_from_0 = ($upper_range > $lower_range)
659                                        ? $upper_range
660                                        : $lower_range;
661
662        printf "=== Class Label: %s\tPrediction: %s\n",
663                $label, $Prediction{$label}
664                    if ($MultiClass);
665
666        print_feature_report($max_flen, $max_distance_from_0,
667                             $label, $score_href, @features);
668    }
669}
670
671# -- main
672get_args();
673read_features($TrainFile);
674do_train($TrainFile, $ModelFile);
675@Features = audit_features();
676summarize_features();
677cleanup();
678
679