1# WordNet::Similarity::vector_pairs.pm version 2.04
2# (Last updated $Id: vector_pairs.pm,v 1.11 2008/03/27 06:21:17 sidz1979 Exp $)
3#
4# Module to accept two WordNet synsets and to return a floating point
5# number that indicates how similar those two synsets are, using a
6# gloss vector overlap measure based on "context vectors" described by
7# Schütze (1998).
8#
9# Copyright (c) 2005,
10#
11# Ted Pedersen, University of Minnesota Duluth
12# tpederse at d.umn.edu
13#
14# Siddharth Patwardhan, University of Utah, Salt Lake City
15# sidd at cs.utah.edu
16#
17# Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
18# banerjee+ at cs.cmu.edu
19#
20# This program is free software; you can redistribute it and/or
21# modify it under the terms of the GNU General Public License
22# as published by the Free Software Foundation; either version 2
23# of the License, or (at your option) any later version.
24#
25# This program is distributed in the hope that it will be useful,
26# but WITHOUT ANY WARRANTY; without even the implied warranty of
27# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
28# GNU General Public License for more details.
29#
30# You should have received a copy of the GNU General Public License
31# along with this program; if not, write to
32#
33# The Free Software Foundation, Inc.,
34# 59 Temple Place - Suite 330,
35# Boston, MA  02111-1307, USA.
36#
37# ------------------------------------------------------------------
38
39package WordNet::Similarity::vector_pairs;
40
41=head1 NAME
42
43WordNet::Similarity::vector_pairs - module for computing semantic relatedness
44of word senses using second order co-occurrence vectors of glosses of the word
45senses.
46
47=head1 SYNOPSIS
48
49  use WordNet::Similarity::vector_pairs;
50
51  use WordNet::QueryData;
52
53  my $wn = WordNet::QueryData->new();
54
55  my $vector_pairs = WordNet::Similarity::vector_pairs->new($wn);
56
57  my $value = $vector_pairs->getRelatedness("car#n#1", "bus#n#2");
58
59  ($error, $errorString) = $vector_pairs->getError();
60
61  die "$errorString\n" if($error);
62
63  print "car (sense 1) <-> bus (sense 2) = $value\n";
64
65=head1 DESCRIPTION
66
67SchE<uuml>tze (1998) creates what he calls context vectors (second order
68co-occurrence vectors) of pieces of text for the purpose of Word Sense
69Discrimination. This idea is adopted by Patwardhan and Pedersen to represent
70the word senses by second-order co-occurrence vectors of their dictionary
71(WordNet) definitions. The relatedness of two senses is then computed as
72the cosine of their representative gloss vectors.
73
A concept is represented by its own gloss, as well as the glosses of the
neighboring senses as specified in the vector-pairs-relation.dat file. Each
gloss is converted into a second order vector by replacing the words in
the gloss with co-occurrence vectors for those words. The overall measure
of relatedness between two concepts is determined by taking the pairwise
cosines between these expanded glosses. If vector-pairs-relation.dat
consists of:
81
82 example-example
83 glos-glos
84 hypo-hypo
85
then three pairwise cosine measurements are made to determine the
relatedness of concepts A and B. The examples found in the glosses
of A and B are expanded and measured, then the glosses themselves are
expanded and measured, and then the hyponyms of A and B are expanded
and measured. Then, the values of these three pairwise measures are averaged
to create the overall relatedness score.
92
93=over
94
95=cut
96
97use strict;
98use WordNet::vectorFile;
99use WordNet::Similarity::GlossFinder;
100use File::Spec;
101use vars qw($VERSION @ISA);
102
# This measure is a subclass of the GlossFinder module.
@ISA     = ('WordNet::Similarity::GlossFinder');
$VERSION = '2.04';

# Register the measure-specific "vectordb" configuration option.
# NOTE(review): the meaning of the remaining arguments (0, "p", undef) is
# defined by WordNet::Similarity::addConfigOption -- confirm there.
WordNet::Similarity::addConfigOption('vectordb', 0, 'p', undef);
107
108=item $measure->initialize($file)
109
110Overrides the initialize method in the parent class (GlossFinder.pm). This method
111essentially initializes the measure for use.
112
113Parameters: $file -- configuration file.
114
115Returns: none.
116
117=cut 
118
# Initialization of the WordNet::Similarity::vector_pairs object... locates the
# default relation file and the word vector database, lets the super-class
# parse the config file, and loads the word vectors into the object.
# INPUT PARAMS  : $paramFile .. File containing the module specific params
#                               (passed through to the super-class).
# RETURN VALUES : (none). On failure $self->{error} is set to 2 and a
#                 message is appended to $self->{errorString}.
sub initialize
{
    my $self = shift;

    # Look for the default vector relation file, unless one is already set.
    if(!defined $self->{relationDefault})
    {
        my $relationFile = _findDataFile('vector-pairs-relation.dat', qr/RelationFile/);
        $self->{relationDefault} = $relationFile if(defined $relationFile);
    }

    # Call the initialize method of the super-class.
    $self->SUPER::initialize(@_);

    # Initialize the vector cache.
    # (Both entries start out undefined; the historical "= ()" assignment
    # had exactly this effect.)
    $self->{vCache} = undef;
    $self->{vCacheQ} = undef;
    $self->{vCacheSize} = 80;

    # Initialize the word vector database interface... an explicitly
    # configured file wins, otherwise fall back to an installed default.
    my $vectorDB = $self->{vectordb};
    if(!defined $vectorDB || $vectorDB eq "")
    {
        $vectorDB = _findDataFile('wordvectors.dat', qr/DOCUMENTCOUNT/);
    }

    # If database still not specified...
    if(!defined $vectorDB || $vectorDB eq "")
    {
        $self->{errorString} .= "\nError (WordNet::Similarity::vector_pairs->initialize()) - ";
        $self->{errorString} .= "Word Vector database file not specified. Use configuration file.";
        $self->{error} = 2;
        return;
    }

    # Get the documentCount, dimensions and vectors...
    my ($documentCount, $readDims, $readVectors) = WordNet::vectorFile->readVectors($vectorDB);
    if(!defined $documentCount || !defined $readDims || !defined $readVectors)
    {
        $self->{errorString} .= "\nError (WordNet::Similarity::vector_pairs->initialize()) - ";
        $self->{errorString} .= "Error reading the vector database file.";
        $self->{error} = 2;
        return;
    }

    # Load the word vector dimensions... the first whitespace-separated
    # field of each entry is used as the index of that dimension word.
    $self->{numberOfDimensions} = scalar(keys(%{$readDims}));
    foreach my $dimWord (keys %{$readDims})
    {
        my @prts = split(/\s+/, $readDims->{$dimWord});
        $self->{wordIndex}->{$dimWord} = $prts[0];
        $self->{indexWord}->[$prts[0]] = $dimWord;
    }

    # Set up the interface to the word vectors (skipping undefined entries)...
    foreach my $word (keys %{$readVectors})
    {
        my $vec = $readVectors->{$word};
        $self->{table}->{$word} = $vec if(defined $vec);
    }
}

# Searches the directories in @INC, in order, for WordNet/<name> and returns
# the path of the first existing, readable candidate whose first line (with
# all whitespace removed) matches the given header pattern.
# INPUT PARAMS  : $name          .. file name to look for under WordNet/
#                 $headerPattern .. compiled regex the header must match
# RETURN VALUES : path of the matching file, or undef if there is none.
sub _findDataFile
{
    my ($name, $headerPattern) = @_;

    foreach my $dir (@INC)
    {
        # JM 1-16-04  -- modified to use File::Spec
        my $file = File::Spec->catfile($dir, 'WordNet', $name);
        next if(!-e $file);

        # Three-argument open: the path is never interpreted as a mode.
        open(my $fh, '<', $file) or next;
        my $header = <$fh>;
        close($fh);

        # An empty file has no header line at all.
        next if(!defined $header);

        $header =~ s/\s+//g;
        return $file if($header =~ $headerPattern);
    }

    return;
}
247
248=item $measure->traceOptions()
249
250This method is internally called to determine the extra options
251specified by this measure (apart from the default options specified
252in the WordNet::Similarity base class).
253
254Parameters: none.
255
256Returns: none.
257
258=cut 
259
# Appends the measure-specific option (the configured vector database file,
# or an empty value when none was given) to the trace string, then lets the
# super-class add its own options.
sub traceOptions
{
    my $self = shift;

    my $dbFile = "";
    $dbFile = "$self->{vectordb}" if(defined $self->{vectordb});

    $self->{traceString} .= "vectorDB File :: " . $dbFile . "\n";
    $self->SUPER::traceOptions();
}
267
268=item $vector_pairs->getRelatedness
269
270Computes the relatedness of two word senses using the Vector Algorithm.
271
272Parameters: two word senses in "word#pos#sense" format.
273
274Returns: Unless a problem occurs, the return value is the relatedness
275score, which is greater-than or equal-to 0. If an error occurs,
276then the error level is set to non-zero and an error
277string is created (see the description of getError()).
278
279=cut
280
# Computes the relatedness of two word senses as the average, over all
# relation pairs returned by getSuperGlosses(), of the cosine between the
# gloss vectors built from the two "super glosses" of each pair.
# INPUT PARAMS  : $wps1, $wps2 .. the two word senses ("word#pos#sense").
# RETURN VALUES : the relatedness score, or undef on error (in which case
#                 $self->{error} / $self->{errorString} are set here or by
#                 parseWps()).
sub getRelatedness
{
    my $self = shift;
    my $wps1 = shift;
    my $wps2 = shift;
    my $wn = $self->{wn};
    my $wntools = $self->{wntools};
    my $class = ref $self || $self;

    # Check the existence of the WordNet::QueryData object.
    unless($wn)
    {
        $self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
        $self->{errorString} .= "A WordNet::QueryData object is required.";
        $self->{error} = 2;
        return undef;
    }

    # Check the existence of the WordNet::Tools object.
    unless($wntools)
    {
        $self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
        $self->{errorString} .= "A WordNet::Tools object is required.";
        $self->{error} = 2;
        return undef;
    }

    # Using validation code from parseWps() in a super-class
    my $ret = $self->parseWps($wps1, $wps2);
    ref $ret or return undef;

    # Initialize traces.
    $self->{traceString} = "";

    # Now check if the similarity value for these two synsets is in
    # fact in the cache... if so return the cached value.
    my $relatedness =
        $self->{doCache} ? $self->fetchFromCache ($wps1, $wps2) : undef;
    defined $relatedness and return $relatedness;

    # Now get down to really finding the relatedness of these two.
    # If traces are required, start them with the synset names.
    if($self->{trace})
    {
        $self->{traceString}  = "Synset 1: $wps1\n";
        $self->{traceString} .= "Synset 2: $wps2\n";
    }

    # initialize the score
    my $score = 0;

    # Get the gloss strings, one entry per relation pair.
    # NOTE(review): the per-relation weights are only used to count the
    # relation pairs; the weight values themselves do not enter the score.
    # Confirm whether that is intended before changing it.
    my ($firstStringArray, $secondStringArray, $weightsArray, $functionsStringArray) = $self->getSuperGlosses($wps1, $wps2);
    my $pairCount = scalar(@{$weightsArray});

    for(my $i = 0; $i < $pairCount; $i++)
    {
        my $firstString = $firstStringArray->[$i];
        my $secondString = $secondStringArray->[$i];
        my $functionsString = $functionsStringArray->[$i];

        # Preprocess both strings identically: drop apostrophes, collapse
        # everything except a-z/0-9 into single spaces, trim, and then
        # compoundify. The loop variable aliases the two strings, so they
        # are modified in place.
        foreach my $gloss ($firstString, $secondString)
        {
            $gloss =~ s/\'//g;
            $gloss =~ s/[^a-z0-9]+/ /g;
            $gloss =~ s/^\s+//;
            $gloss =~ s/\s+$//;
            $gloss = $wntools->compoundify($gloss);
        }

        # Build and normalize the gloss vector for each string...
        my ($vector1, $trace1, $magnitude1) = $self->_getVector($firstString);
        _norm($vector1, $magnitude1);

        my ($vector2, $trace2, $magnitude2) = $self->_getVector($secondString);
        _norm($vector2, $magnitude2);

        # ... the inner product of the normalized vectors is their cosine.
        my $functionsScore = _inner($vector1, $vector2);
        $score += $functionsScore;

        # Report the two strings and their word vectors in the trace.
        if($self->{trace})
        {
            $self->{traceString} .= "$functionsString: $functionsScore\n";
            $self->{traceString} .= "\nString: \"$firstString\"\n$trace1\n";
            $self->{traceString} .= "\nString: \"$secondString\"\n$trace2\n";
        }
    }

    # Average the score...
    $score /= $pairCount if($pairCount > 0);

    # that does all the scoring. Put in cache if doing cacheing. Then
    # return the score.
    $self->{doCache} and $self->storeToCache($wps1, $wps2, $score);

    return $score;
}
400
# Method to compute a context vector from a given body of text...
# Each distinct word of the text that has an entry in $self->{table} and is
# not in $self->{stopHash} contributes its stored word vector; the
# contributions are added up into one sparse vector.
# INPUT PARAMS  : $text .. whitespace separated words.
# RETURN VALUES : ($vector, $traces, $magnitude)
#                 $vector    .. hash ref mapping dimension to summed value
#                 $traces    .. trace text (empty unless tracing is on)
#                 $magnitude .. Euclidean length of $vector
#                 For undefined text: (empty hash ref, "", 0).
sub _getVector
{
    my $self = shift;
    my $text = shift;
    my $ret = {};
    my $localTraces = "";

    # Always hand back the same three-element shape, even with no text.
    return ($ret, $localTraces, 0) if(!defined $text);

    # [trace]
    if($self->{trace})
    {
        $localTraces .= "Word Vectors for: ";
    }
    # [/trace]

    # Collect the distinct words, leaving out tokens that match the
    # marker pattern below.
    # NOTE(review): presumably these are placeholder tokens inserted by the
    # gloss processing code -- confirm against GlossFinder.
    my %types;
    foreach my $word (split(' ', $text))
    {
        $types{$word} = 1 if($word !~ /[XGES]{3}\d{5}[XGES]{3}/);
    }

    my $fstFlag = 1;
    foreach my $word (keys %types)
    {
        next if(!defined $self->{table}->{$word});
        next if(defined $self->{stopHash}->{$word});

        # The stored vector is a flat "dimension value dimension value ..."
        # list. split(' ') ignores leading whitespace, so the pairs cannot
        # be shifted by an empty first field.
        my %pieces = split(' ', $self->{table}->{$word});

        # [trace]
        if($self->{trace})
        {
            $localTraces .= ", " if(!$fstFlag);
            $localTraces .= "$word";
            $fstFlag = 0;
        }
        # [/trace]

        foreach my $dim (keys %pieces)
        {
            $ret->{$dim} = ((defined $ret->{$dim})?($ret->{$dim}):0) + $pieces{$dim};
        }
    }

    # Magnitude = square root of the sum of squared components.
    my $mag = 0;
    foreach my $dim (keys %{$ret})
    {
        $mag += ($ret->{$dim} * $ret->{$dim});
    }

    return ($ret, $localTraces, sqrt($mag));
}
457
# Scales a sparse vector in place by dividing every component by the given
# magnitude. Nothing happens when the vector or the magnitude is undefined,
# or when the magnitude is zero.
# INPUT PARAMS  : $vec .. hash ref holding the vector components
#                 $mag .. magnitude to divide by
sub _norm
{
    my ($vec, $mag) = @_;

    return unless(defined $vec && defined $mag && $mag != 0);

    # values() yields aliases, so this updates the hash itself.
    $_ /= $mag foreach (values %{$vec});
}
473
# Inner product of two sparse vectors.
# Only dimensions defined in both vectors contribute. The vector that is
# iterated is the first one when it has strictly fewer entries, otherwise
# the second one.
# INPUT PARAMS  : $vec1, $vec2 .. hash refs holding the vector components
# RETURN VALUES : the inner product; 0 if either vector is undefined.
sub _inner
{
    my ($vec1, $vec2) = @_;

    return 0 unless(defined $vec1 && defined $vec2);

    my $count1 = scalar(keys(%{$vec1}));
    my $count2 = scalar(keys(%{$vec2}));

    # Walk one vector and look each of its dimensions up in the other.
    my ($walked, $probed) = ($count1 < $count2) ? ($vec1, $vec2) : ($vec2, $vec1);

    my $prod = 0;
    foreach my $dim (keys %{$walked})
    {
        next if(!defined $probed->{$dim});
        $prod += ($vec1->{$dim} * $vec2->{$dim});
    }

    return $prod;
}
506
5071;
508
509__END__
510
511=back
512
513=head2 Usage
514
515The semantic relatedness modules in this distribution are built as classes
516that define the following methods:
517
518    new()
519    getRelatedness()
520    getError()
521    getTraceString()
522
523See the WordNet::Similarity(3) documentation for details of these methods.
524
525=head3 Typical Usage Examples
526
527To create an object of the vector_pairs measure, we would have the following
528lines of code in the Perl program.
529
530  use WordNet::Similarity::vector_pairs;
531  $measure = WordNet::Similarity::vector_pairs->new($wn, '/home/sid/vector_pairs.conf');
532
533The reference of the initialized object is stored in the scalar variable
534'$measure'. '$wn' contains a WordNet::QueryData object that should have been
535created earlier in the program. The second parameter to the 'new' method is
536the path of the configuration file for the vector_pairs measure. If the 'new'
537method is unable to create the object, '$measure' would be undefined. This,
538as well as any other error/warning may be tested.
539
540  die "Unable to create object.\n" if(!defined $measure);
541  ($err, $errString) = $measure->getError();
542  die $errString."\n" if($err);
543
544To find the semantic relatedness of the first sense of the noun 'car' and
545the second sense of the noun 'bus' using the measure, we would write
546the following piece of code:
547
548  $relatedness = $measure->getRelatedness('car#n#1', 'bus#n#2');
549
550To get traces for the above computation:
551
552  print $measure->getTraceString();
553
554However, traces must be enabled using configuration files. By default
555traces are turned off.
556
557=head1 CONFIGURATION FILE
558
559The behavior of the measures of semantic relatedness can be controlled by
560using configuration files. These configuration files specify how certain
561parameters are initialized within the object. A configuration file may be
562specified as a parameter during the creation of an object using the new
563method. The configuration files must follow a fixed format.
564
565Every configuration file starts with the name of the module ON THE FIRST LINE
566of the file. For example, a configuration file for the vector_pairs module will have
567on the first line 'WordNet::Similarity::vector_pairs'. This is followed by the
568various parameters, each on a new line and having the form 'name::value'. The
569'value' of a parameter is optional (in case of boolean parameters). In case
570'value' is omitted, we would have just 'name::' on that line. Comments are
571supported in the configuration file. Anything following a '#' is ignored till
572the end of the line.
573
574The module parses the configuration file and recognizes the following
575parameters:
576
577=over
578
579=item trace
580
581The value of this parameter specifies the level of tracing that should
582be employed for generating the traces. This value
583is an integer equal to 0, 1, or 2. If the value is omitted, then the
584default value, 0, is used. A value of 0 switches tracing off. A value
585of 1 or 2 switches tracing on.  A value of 1 displays as
586traces only the gloss overlaps found. A value of 2 displays as traces all
587the text being compared.
588
589=item cache
590
591The value of this parameter specifies whether or not caching of the
592relatedness values should be performed.  This value is an
593integer equal to  0 or 1.  If the value is omitted, then the default
594value, 1, is used. A value of 0 switches caching 'off', and
595a value of 1 switches caching 'on'.
596
597=item maxCacheSize
598
599The value of this parameter indicates the size of the cache, used for
600storing the computed relatedness value. The specified value must be
601a non-negative integer.  If the value is omitted, then the default
602value, 5,000, is used. Setting maxCacheSize to zero has
603the same effect as setting cache to zero, but setting cache to zero is
604likely to be more efficient.  Caching and tracing at the same time can result
605in excessive memory usage because the trace strings are also cached.  If
606you intend to perform a large number of relatedness queries, then you
607might want to turn tracing off.
608
609=item relation
610
611The value of this parameter is the path to a file that contains a list of
612WordNet relations.  The path may be either an absolute path or a relative
613path.
614
615The vector_pairs module combines the glosses of synsets related to the target
616synsets by these relations and forms the gloss-vector from this combined
617gloss.
618
619WARNING: the format of the relation file is different for the vector_pairs and lesk
620measures.
621
622=item stop
623
The value of this parameter is the path of a file containing a list of stop
words that should be ignored in the glosses.  The path may be either an
absolute path or a relative path.
627
628=item stem
629
630The value of this parameter indicates whether or not stemming should be
631performed.  The value must be an integer equal to 0 or 1.  If the
632value is omitted, then the default value, 0, is used.
633A value of 1 switches 'on' stemming, and a value of 0 switches stemming
634'off'. When stemming is enabled, all the words of the
635glosses are stemmed before their vectors are created for the vector
636measure or their overlaps are compared for the lesk measure.
637
638=item vectordb
639
The value of this parameter is the path to a file
containing word vectors, i.e. co-occurrence vectors for all the words
in the WordNet glosses.  If this parameter is omitted, the module searches
the directories in @INC for a default WordNet/wordvectors.dat file. If no
usable vectors file is found, the vector_pairs measure reports an error
and will not run.
645
646=back
647
648=head1 RELATION FILE FORMAT
649
650The relation file starts with the string "RelationFile" on the first line
651of the file. Following this, on each consecutive line, a relation is specified
652in the form --
653
654 func(func(func... (func)...))-func(func(func... (func)...)) [weight]
655
656Where "func" can be any one of the following functions:
657
658 hype() = Hypernym of
659 hypo() = Hyponym of
660 holo() = Holonym of
661 mero() = Meronym of
662 attr() = Attribute of
663 also() = Also see
664 sim() = Similar
665 enta() = Entails
666 caus() = Causes
667 part() = Particle
668 pert() = Pertainym of
669 glos = gloss (without example)
670 example = example (from the gloss)
671 glosexample = gloss + example
672 syns = the synset of the concept
673
674Each of these specifies a WordNet relation. And the outermost function in the
675nesting can only be one of glos, example, glosexample or syns. The functions specify which
676glosses to use for forming the gloss vector of the synset. An optional weight can be
677specified to weigh the contribution of that relation in the overall score.
678
679For example,
680
681 glos(hype(hypo))-glosexample(hype) 0.5
682
means that the gloss of the hypernym of the hyponym of the first synset is used to
form the gloss vector of the first synset, and the gloss+example of the hypernym
of the second synset is used to form the gloss vector of the second synset. The values
in these vectors are weighted by 0.5. If one of "glos", "example", "glosexample" or "syns"
is not specified as the outermost function in the nesting, then "glosexample" is assumed
by default. This implies that
689
690 glosexample(hypo(also))-glosexample(hype)
691
692and
693
694 hypo(also)-hype
695
696are equivalent as far as the measure is concerned.
697
698=head1 SEE ALSO
699
700perl(1), WordNet::Similarity(3), WordNet::QueryData(3)
701
702http://www.cs.utah.edu/~sidd
703
704http://wordnet.princeton.edu
705
706http://www.ai.mit.edu/~jrennie/WordNet
707
708http://groups.yahoo.com/group/wn-similarity
709
710=head1 AUTHORS
711
712 Ted Pedersen, University of Minnesota, Duluth
713 tpederse at d.umn.edu
714
715 Siddharth Patwardhan, University of Utah, Salt Lake City
716 sidd at cs.utah.edu
717
718 Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
719 banerjee+ at cs.cmu.edu
720
721=head1 BUGS
722
723To report bugs, go to http://groups.yahoo.com/group/wn-similarity/ or
724send an e-mail to "S<tpederse at d.umn.edu>".
725
726=head1 COPYRIGHT AND LICENSE
727
728Copyright (c) 2005, Ted Pedersen, Siddharth Patwardhan and Satanjeev Banerjee
729
730This program is free software; you can redistribute it and/or
731modify it under the terms of the GNU General Public License
732as published by the Free Software Foundation; either version 2
733of the License, or (at your option) any later version.
734
735This program is distributed in the hope that it will be useful,
736but WITHOUT ANY WARRANTY; without even the implied warranty of
737MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
738GNU General Public License for more details.
739
740You should have received a copy of the GNU General Public License
741along with this program; if not, write to
742
743    The Free Software Foundation, Inc.,
744    59 Temple Place - Suite 330,
745    Boston, MA  02111-1307, USA.
746
747Note: a copy of the GNU General Public License is available on the web
748at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
749distribution as GPL.txt.
750
751=cut
752