1# WordNet::Similarity::vector.pm version 2.04
2# (Last updated $Id: vector.pm,v 1.24 2008/03/27 06:21:17 sidz1979 Exp $)
3#
4# Module accepts two WordNet synsets and returns a floating point
5# number that indicates how similar those two synsets are, using a
6# gloss vector overlap measure based on "context vectors" described by
7# Schütze (1998).
8#
9# Copyright (c) 2005,
10#
11# Ted Pedersen, University of Minnesota Duluth
12# tpederse at d.umn.edu
13#
14# Siddharth Patwardhan, University of Utah, Salt Lake City
15# sidd at cs.utah.edu
16#
17# Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
18# banerjee+ at cs.cmu.edu
19#
20# This program is free software; you can redistribute it and/or
21# modify it under the terms of the GNU General Public License
22# as published by the Free Software Foundation; either version 2
23# of the License, or (at your option) any later version.
24#
25# This program is distributed in the hope that it will be useful,
26# but WITHOUT ANY WARRANTY; without even the implied warranty of
27# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
28# GNU General Public License for more details.
29#
30# You should have received a copy of the GNU General Public License
31# along with this program; if not, write to
32#
33# The Free Software Foundation, Inc.,
34# 59 Temple Place - Suite 330,
35# Boston, MA  02111-1307, USA.
36#
37# ------------------------------------------------------------------
38
39package WordNet::Similarity::vector;
40
41=head1 NAME
42
43WordNet::Similarity::vector - Perl module for computing semantic relatedness
44of word senses using second order co-occurrence vectors of glosses of the word
45senses.
46
47=head1 SYNOPSIS
48
49  use WordNet::Similarity::vector;
50
51  use WordNet::QueryData;
52
53  my $wn = WordNet::QueryData->new();
54
55  my $vector = WordNet::Similarity::vector->new($wn);
56
57  my $value = $vector->getRelatedness("car#n#1", "bus#n#2");
58
59  ($error, $errorString) = $vector->getError();
60
61  die "$errorString\n" if($error);
62
63  print "car (sense 1) <-> bus (sense 2) = $value\n";
64
65=head1 DESCRIPTION
66
67SchE<uuml>tze (1998) creates what he calls context vectors (second order
68co-occurrence vectors) of pieces of text for the purpose of Word Sense
69Discrimination. This idea is adopted by Patwardhan and Pedersen to represent
70the word senses by second-order co-occurrence vectors of their dictionary
71(WordNet) definitions. The relatedness of two senses is then computed as
72the cosine of their representative gloss vectors.
73
74=over
75
76=cut
77
78use strict;
79use WordNet::get_wn_info;
80use WordNet::stem;
81use WordNet::vectorFile;
82use WordNet::Similarity;
83use File::Spec;
84use vars qw($VERSION @ISA);
85
# This measure derives from the generic WordNet::Similarity base class.
@ISA = ('WordNet::Similarity');

$VERSION = '2.04';

# Register the configuration options specific to this measure. Each entry
# is handed to WordNet::Similarity::addConfigOption() exactly as listed.
# NOTE(review): the meaning of the second, third and fourth arguments is
# defined by addConfigOption(), which is not visible here -- confirm there.
foreach my $optionSpec (
    ["relation", 0, "p", undef],
    ["vectordb", 0, "p", undef],
    ["stop",     0, "p", undef],
    ["stem",     0, "i", 0],
    ["textsize", 0, "i", "-1"],
)
{
    WordNet::Similarity::addConfigOption(@{$optionSpec});
}
95
96=item $vector->setPosList()
97
98This method is internally called to determine the parts of speech
99this measure is capable of dealing with.
100
101Parameters: none.
102
103Returns: none.
104
105=cut
106
sub setPosList
{
    my ($self) = @_;

    # Flag every part-of-speech key (n, v, a, r) as supported by this
    # measure; these are the same tags accepted in word#pos#sense input.
    $self->{$_} = 1 foreach qw(n v a r);

    return 1;
}
116
117=item $vector->initialize($file)
118
Overrides the initialize method in the parent class (WordNet::Similarity). This
method essentially initializes the measure for use.
121
122Parameters: $file -- configuration file.
123
124Returns: none.
125
126=cut 
127
# Initialization of the WordNet::Similarity::vector object. Runs the
# super-class initialization, loads the optional stop list, creates the
# WordNet::get_wn_info helper, reads the word-vector database and sets up
# the relations used to build the gloss text of a synset.
# INPUT PARAMS  : all arguments are forwarded unchanged to
#                 SUPER::initialize() (documented as the configuration file).
# RETURN VALUES : (none). Problems are recorded in $self->{error}
#                 (1 for warnings, 2 for errors) and $self->{errorString}.
sub initialize
{
    my $self = shift;
    my $wn = $self->{wn};
    my %stopHash = ();

    # Defaults for stemming and the stop list, set before the super-class
    # initialization runs.
    $self->{stem} = 0;
    $self->{stopHash} = {};

    # Call the initialize method of the super-class.
    $self->SUPER::initialize(@_);

    # Initialize the gloss-vector cache: vectors keyed by word sense, plus
    # a queue of keys (oldest first) used for eviction in getRelatedness().
    # Real empty containers are used instead of assigning an empty list,
    # which would only have stored undef.
    $self->{vCache} = {};
    $self->{vCacheQ} = [];
    $self->{vCacheSize} = 80;

    # Load the stop list.
    if(defined $self->{stop})
    {
        my $stopFile = $self->{stop};

        # Three-argument open: the path comes from the configuration, so it
        # must never be interpreted as an open mode or a command.
        if(open(my $stopHandle, '<', $stopFile))
        {
            while(my $line = <$stopHandle>)
            {
                $line =~ s/[\r\f\n]//g;
                $line =~ s/^\s+//;
                $line =~ s/\s+$//;
                # multi-word entries are stored in compound (underscore) form
                $line =~ s/\s+/_/g;
                $stopHash{$line} = 1;
                $self->{stopHash}->{$line} = 1;
            }
            close($stopHandle);
        }
        else
        {
            $self->{errorString} .= "\nWarning (WordNet::Similarity::vector->initialize()) - ";
            $self->{errorString} .= "Unable to open $stopFile.";
            $self->{error} = 1 if($self->{error} < 1);
        }
    }

    # so now we are ready to initialize the get_wn_info package with
    # the wordnet object, 0/1 depending on if stemming is required and
    # the stop hash
    my $gwi = WordNet::get_wn_info->new($wn, ($self->{stem} ? 1 : 0), %stopHash);
    $self->{gwi} = $gwi;

    # Locate the word vector database: the configured file if there is one,
    # otherwise the first installed default file in the expected format.
    my $vectorDB = $self->{vectordb};
    if(!defined $vectorDB || $vectorDB eq "")
    {
        $vectorDB = _findInstalledDataFile('wordvectors.dat', qr/DOCUMENTCOUNT/);
        $self->{vectordb} = $vectorDB if(defined $vectorDB);
    }
    if(!defined $vectorDB || $vectorDB eq "")
    {
        $self->{errorString} .= "\nError (WordNet::Similarity::vector->initialize()) - ";
        $self->{errorString} .= "No usable Word Vector database found. Use configuration file.";
        $self->{error} = 2;
        return;
    }

    # Get the documentCount, dimensions and vectors...
    my ($documentCount, $readDims, $readVectors) = WordNet::vectorFile->readVectors($vectorDB);
    if(!defined $documentCount || !defined $readDims || !defined $readVectors)
    {
        $self->{errorString} .= "\nError (WordNet::Similarity::vector->initialize()) - ";
        $self->{errorString} .= "Error reading the vector database file.";
        $self->{error} = 2;
        return;
    }

    # Load the word vector dimensions: the first whitespace-separated field
    # of each entry is used as the index of that word.
    $self->{numberOfDimensions} = scalar(keys(%{$readDims}));
    foreach my $key (keys %{$readDims})
    {
        my @prts = split(/\s+/, $readDims->{$key});
        $self->{wordIndex}->{$key} = $prts[0];
        $self->{indexWord}->[$prts[0]] = $key;
    }

    # Set up the interface to the word vectors...
    foreach my $key (keys %{$readVectors})
    {
        my $vec = $readVectors->{$key};
        $self->{table}->{$key} = $vec if(defined $vec);
    }

    # If relation file not specified... look for the default vector
    # relation file among the installed data files.
    if(!defined $self->{relation})
    {
        my $defaultRelation = _findInstalledDataFile('vector-relation.dat', qr/VectorRelationFile/);
        $self->{relation} = $defaultRelation if(defined $defaultRelation);
    }

    if(!(defined $self->{relation}))
    {
        # No relation file at all: use the gloss and example of the synset.
        $self->{weights}->[0] = 1;
        $self->{functions}->[0]->[0] = "glosexample";
    }
    else
    {
        # Load the relations data; the error has already been recorded on
        # failure.
        return if(!_parseVectorRelationFile($self, $self->{relation}));
    }

    $self->{textsize} = -1 if(!defined $self->{textsize});
    return;
}

# Searches every directory in @INC for WordNet/<fileName> and returns the
# path of the first readable candidate whose first line (with all
# whitespace removed) matches $headerPattern. Returns nothing if there is
# no such file.
sub _findInstalledDataFile
{
    my ($fileName, $headerPattern) = @_;

    foreach my $dir (@INC)
    {
        # JM 1-16-04  -- modified to use File::Spec
        my $candidate = File::Spec->catfile($dir, 'WordNet', $fileName);
        next if(!-e $candidate);
        next if(!open(my $handle, '<', $candidate));
        my $header = <$handle>;
        close($handle);

        # an empty file has no header line at all
        next if(!defined $header);
        $header =~ s/\s+//g;
        return $candidate if($header =~ $headerPattern);
    }

    return;
}

# Parses the relation file $relationFile into $self->{functions} (one list
# of get_wn_info method names per relation, innermost function first) and
# $self->{weights}. Returns 1 on success; on failure records an error
# (level 2) in $self and returns 0.
sub _parseVectorRelationFile
{
    my ($self, $relationFile) = @_;
    my $gwi = $self->{gwi};
    my $relHandle;

    # Records an error, releases the file handle and yields the failure value.
    my $fail = sub
    {
        my $message = shift;
        $self->{errorString} .= "\nError (WordNet::Similarity::vector->initialize()) - ";
        $self->{errorString} .= $message;
        $self->{error} = 2;
        close($relHandle) if(defined $relHandle);
        return 0;
    };

    if(!open($relHandle, '<', $relationFile))
    {
        undef $relHandle;
        return $fail->("Unable to open $relationFile.");
    }

    my $header = <$relHandle>;
    $header = "" if(!defined $header);
    $header =~ s/[\r\f\n]//g;
    $header =~ s/\s+//g;
    return $fail->("Bad file format ($relationFile).") if($header !~ /VectorRelationFile/);

    my $index = 0;
    $self->{functions} = [];
    $self->{weights} = [];
    while(my $relation = <$relHandle>)
    {
        $relation =~ s/[\r\f\n]//g;

        # remove leading/trailing spaces from the relation
        $relation =~ s/^\s+//;
        $relation =~ s/\s+$//;

        # A blank line (e.g. at the end of the file) carries no relation;
        # skip it instead of reporting an undefined function.
        next if($relation eq "");

        # now extract the weight if any. if no weight, assume 1
        if($relation =~ /(\S+)\s+(\S+)/)
        {
            my ($name, $weight) = ($1, $2);
            $relation = $name;
            $self->{weights}->[$index] = $weight;
        }
        else
        {
            $self->{weights}->[$index] = 1;
        }

        # Split the nested call "f(g(h))" into its function names. The
        # names are invoked as methods on the get_wn_info object below;
        # calling a method through a string name is legal under strict.
        $relation =~ s/[\s\)]//g;
        my @functionArray = split(/\(/, $relation);

        my $j = 0;
        my $innermost = $functionArray[$#functionArray];
        if(!($gwi->can($innermost)))
        {
            return $fail->("Undefined function ($functionArray[$#functionArray]) in relations file.");
        }
        $self->{functions}->[$index]->[$j++] = $innermost;

        # Walk outwards, checking that each function accepts what the
        # function nested inside it produces.
        # NOTE(review): when called with a true second argument the
        # get_wn_info methods appear to return (input type, output type)
        # rather than data -- confirm against WordNet::get_wn_info.
        my ($input, $output, $dummy);
        for(my $k = $#functionArray - 1; $k >= 0; $k--)
        {
            my $outer = $functionArray[$k];
            my $inner = $functionArray[$k + 1];
            if(!($gwi->can($outer)))
            {
                return $fail->("Undefined function ($functionArray[$k]) in relations file.");
            }

            ($input, $dummy) = $gwi->$outer($dummy, 1);
            ($dummy, $output) = $gwi->$inner($dummy, 1);

            if($input != $output)
            {
                return $fail->("Invalid function combination - ".$outer."(".$inner.").");
            }

            $self->{functions}->[$index]->[$j++] = $outer;
        }

        # if the output of the outermost function is synset array (1)
        # wrap a glosexample around it
        my $outermost = $functionArray[0];
        ($dummy, $output) = $gwi->$outermost($dummy, 1);
        if($output == 1)
        {
            $self->{functions}->[$index]->[$j++] = "glosexample";
        }

        $index++;
    }
    close($relHandle);

    return 1;
}
436
437=item $vector->traceOptions()
438
439This method is internally called to determine the extra options
440specified by this measure (apart from the default options specified
441in the WordNet::Similarity base class).
442
443Parameters: none.
444
445Returns: none.
446
447=cut 
448
# Appends the configuration options specific to this module to the trace
# string, one "name :: value" line per option.
sub traceOptions
{
    my ($self) = @_;

    # File-valued options are reported as an empty string when undefined.
    my @fileOptions = (
        ["relation File :: ", "relation"],
        ["vectorDB File :: ", "vectordb"],
        ["stop File :: ",     "stop"],
    );

    my $report = "";
    foreach my $option (@fileOptions)
    {
        my ($prefix, $key) = @{$option};
        my $shown = "";
        $shown = "$self->{$key}" if(defined $self->{$key});
        $report .= $prefix.$shown."\n";
    }
    $report .= "stem :: $self->{stem}\n";
    $report .= "textsize :: $self->{textsize}\n";

    $self->{traceString} .= $report;
}
458
459=item $vector->getRelatedness
460
461Computes the relatedness of two word senses using the Vector Algorithm.
462
463Parameters: two word senses in "word#pos#sense" format.
464
465Returns: Unless a problem occurs, the return value is the relatedness
466score, which is greater-than or equal-to 0. If an error occurs,
467then the error level is set to non-zero and an error
468string is created (see the description of getError()).
469
470=cut
471
# Computes the relatedness of two word senses as the inner product of
# their normalized gloss vectors.
# INPUT PARAMS  : $wps1, $wps2 .. the two word senses in word#pos#sense
#                                 format.
# RETURN VALUES : the score, or undef when the object is not fully set up
#                 or the input is missing/malformed (in which case
#                 $self->{error} and $self->{errorString} are updated).
sub getRelatedness
{
    my $self = shift;
    my $wps1 = shift;
    my $wps2 = shift;
    my $wn = $self->{wn};
    my $wntools = $self->{wntools};
    my $gwi = $self->{gwi};

    # Check the existence of the WordNet::QueryData object.
    if(!$wn)
    {
        $self->{errorString} .= "\nError (WordNet::Similarity::vector->getRelatedness()) - ";
        $self->{errorString} .= "A WordNet::QueryData object is required.";
        $self->{error} = 2;
        return undef;
    }

    # Check the existence of the WordNet::Tools object.
    if(!$wntools)
    {
        $self->{errorString} .= "\nError (WordNet::Similarity::vector->getRelatedness()) - ";
        $self->{errorString} .= "A WordNet::Tools object is required.";
        $self->{error} = 2;
        return undef;
    }

    # Initialize traces.
    $self->{traceString} = "" if($self->{trace});

    # Undefined input cannot go unpunished.
    if(!$wps1 || !$wps2)
    {
        $self->{errorString} .= "\nWarning (WordNet::Similarity::vector->getRelatedness()) - Undefined input values.";
        $self->{error} = 1 if($self->{error} < 1);
        return undef;
    }

    # Security check -- are the input strings in the correct format (word#pos#sense).
    foreach my $wps ($wps1, $wps2)
    {
        next if($wps =~ /^\S+\#([nvar])\#\d+$/);
        $self->{errorString} .= "\nWarning (WordNet::Similarity::vector->getRelatedness()) - ";
        $self->{errorString} .= "Input not in word\#pos\#sense format.";
        $self->{error} = ($self->{error} < 1) ? 1 : $self->{error};
        return undef;
    }

    # Now check if the similarity value for these two synsets is in
    # fact in the cache... if so return the cached value.
    my $relatedness = $self->{doCache} ? $self->fetchFromCache($wps1, $wps2) : undef;
    defined $relatedness and return $relatedness;

    # Are the gloss vectors present in the cache...
    if(defined $self->{vCache}->{$wps1} && defined $self->{vCache}->{$wps2})
    {
        if($self->{trace})
        {
            # ah so we do need SOME traces! put in the synset names.
            $self->{traceString} .= "Synset 1: $wps1 (Gloss Vector found in Cache)\n";
            $self->{traceString} .= "Synset 2: $wps2 (Gloss Vector found in Cache)\n";
        }
        my $cachedScore = _inner($self->{vCache}->{$wps1}, $self->{vCache}->{$wps2});

        # that does all the scoring. Put in cache if doing cacheing. Then
        # return the score.
        $self->{doCache} and $self->storeToCache($wps1, $wps2, $cachedScore);
        return $cachedScore;
    }

    # Go thru the functions array and collect, for each synset, the text
    # produced by every configured relation.
    my $firstString = "";
    my $secondString = "";
    my $i = 0;
    while(defined $self->{functions}->[$i])
    {
        my $chain = $self->{functions}->[$i];

        $firstString .= _applyRelationChain($gwi, $chain, $wps1);
        $secondString .= _applyRelationChain($gwi, $chain, $wps2);

        # report the functions of this relation in the trace.
        if($self->{trace})
        {
            my $functionsString = "Functions: ";
            foreach my $fn (@{$chain})
            {
                last if(!defined $fn);
                $functionsString .= $fn." ";
            }
            $self->{traceString} .= "$functionsString\n";
        }

        $i++;
    }

    # Preprocess...
    $firstString = _cleanGlossText($wntools, $firstString);
    $secondString = _cleanGlossText($wntools, $secondString);

    # Get vectors... score...
    my $vector1 = _lookupGlossVector($self, $wps1, "Synset 1", $firstString);
    my $vector2 = _lookupGlossVector($self, $wps2, "Synset 2", $secondString);
    my $score = _inner($vector1, $vector2);

    # that does all the scoring. Put in cache if doing cacheing. Then
    # return the score.
    $self->{doCache} and $self->storeToCache($wps1, $wps2, $score);

    return $score;
}

# Applies one relation -- a list of get_wn_info method names, innermost
# function first -- to the word sense $wps, feeding the output of each
# function to the next, and returns the first element of the final result.
sub _applyRelationChain
{
    my ($gwi, $chain, $wps) = @_;

    # the synset starts out in a "set" of itself; the set may grow as the
    # functions are applied (some relations are one to many).
    my %synsetSet = ($wps => 1);
    my @arguments = (\%synsetSet);

    foreach my $fn (@{$chain})
    {
        last if(!defined $fn);

        # calling a method through a string name is legal under strict
        @arguments = $gwi->$fn(@arguments);
    }

    return $arguments[0];
}

# Strips apostrophes, turns every run of characters other than a-z and
# 0-9 into a single space, trims the ends and passes the result through
# the compoundify() method of the WordNet::Tools object.
sub _cleanGlossText
{
    my ($wntools, $text) = @_;

    $text =~ s/\'//g;
    $text =~ s/[^a-z0-9]+/ /g;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    return $wntools->compoundify($text);
}

# Returns the normalized gloss vector of the word sense $wps, taking it
# from the vector cache when possible and otherwise building it from
# $text and caching it. $label ("Synset 1"/"Synset 2") is only used for
# the trace output.
sub _lookupGlossVector
{
    my ($self, $wps, $label, $text) = @_;

    # ah so we do need SOME traces! put in the synset names.
    $self->{traceString} .= "$label: $wps" if($self->{trace});

    if(defined $self->{vCache}->{$wps})
    {
        $self->{traceString} .= " (Gloss vector found in cache)\n" if($self->{trace});
        return $self->{vCache}->{$wps};
    }

    my ($vector, $vectorTrace, $magnitude) = $self->_getVector($text);
    $self->{traceString} .= "\nString: \"$text\"\n$vectorTrace\n" if($self->{trace});
    _norm($vector, $magnitude);

    # Remember the vector and evict the oldest entries once the cache
    # holds more than vCacheSize vectors.
    $self->{vCache}->{$wps} = $vector;
    push(@{$self->{vCacheQ}}, $wps);
    while(scalar(@{$self->{vCacheQ}}) > $self->{vCacheSize})
    {
        my $evicted = shift(@{$self->{vCacheQ}});
        delete $self->{vCache}->{$evicted};
    }

    return $vector;
}
711
712
# Method to compute a context vector from a given body of text: the sum
# of the stored word vectors of the distinct words of the text that have
# an entry in $self->{table} and are not in $self->{stopHash}.
# INPUT PARAMS  : $text .. the whitespace-separated words.
# RETURN VALUES : (vector hash reference, trace text, vector magnitude,
#                 number of words counted). For undefined text an empty
#                 vector, an empty trace, 0 and 0 are returned.
sub _getVector
{
    my $self = shift;
    my $text = shift;
    my $ret = {};
    return ($ret, "", 0, 0) if(!defined $text);

    # [trace]
    my $localTraces = $self->{trace} ? "Word Vectors for: " : "";
    # [/trace]

    # Collect the distinct words. split(' ') also discards leading
    # whitespace, so the text never contributes an empty "word".
    my %types;
    my $count = 0;
    foreach my $word (split(' ', $text))
    {
        # tokens matching this marker pattern are not words
        # (presumably delimiters inserted when the gloss text was
        # assembled -- TODO confirm against WordNet::get_wn_info)
        next if($word =~ /[XGES]{3}\d{5}[XGES]{3}/);

        $types{$word} = 1;
        $count++;

        # NOTE(review): the limit is tested after the word has been
        # counted, so up to textsize + 1 words are kept -- confirm that
        # this is intended.
        last if($self->{textsize} >= 0 && $count > $self->{textsize});
    }

    my $fstFlag = 1;
    foreach my $word (keys %types)
    {
        next if(!defined $self->{table}->{$word} || defined $self->{stopHash}->{$word});

        # A stored vector is a flat list of "index value" pairs. split(' ')
        # ignores leading whitespace; with split(/\s+/) a leading blank
        # produced an empty first field that shifted every pair by one.
        my %pieces = split(' ', $self->{table}->{$word});

        # [trace]
        if($self->{trace})
        {
            $localTraces .= ", " if(!$fstFlag);
            $localTraces .= "$word";
            $fstFlag = 0;
        }
        # [/trace]

        foreach my $kk (keys %pieces)
        {
            $ret->{$kk} += $pieces{$kk};
        }
    }

    # magnitude = square root of the sum of the squared components
    my $mag = 0;
    foreach my $kk (keys %{$ret})
    {
        $mag += ($ret->{$kk} * $ret->{$kk});
    }

    return ($ret, $localTraces, sqrt($mag), $count);
}
775
# Normalizes the sparse vector in place: every component of the hash
# referenced by the first argument is divided by the magnitude given as
# the second argument.
sub _norm
{
    my ($vec, $mag) = @_;

    # Leave the vector alone when either argument is missing or when the
    # magnitude is zero (dividing would be an error).
    return if(!defined $vec);
    return if(!defined $mag);
    return if($mag == 0);

    # values() yields aliases, so the hash itself is updated.
    $_ /= $mag foreach (values %{$vec});

    return;
}
791
# Inner product of two sparse vectors (hash references mapping a
# dimension to its value). Returns 0 when either vector is undefined.
sub _inner
{
    my ($vec1, $vec2) = @_;

    return 0 if(!(defined $vec1 && defined $vec2));

    # Walk the vector with fewer entries and look each key up in the
    # other one; when both have the same size the second vector is walked.
    my ($walked, $other) = (scalar(keys(%{$vec1})) < scalar(keys(%{$vec2})))
        ? ($vec1, $vec2)
        : ($vec2, $vec1);

    my $prod = 0;
    foreach my $key (keys %{$walked})
    {
        # only dimensions with a defined value in the other vector count
        next if(!defined $other->{$key});
        $prod += ($walked->{$key} * $other->{$key});
    }

    return $prod;
}
824
8251;
826
827__END__
828
829=back
830
831=head2 Usage
832
833The semantic relatedness modules in this distribution are built as classes
834that define the following methods:
835
836    new()
837    getRelatedness()
838    getError()
839    getTraceString()
840
841See the WordNet::Similarity(3) documentation for details of these methods.
842
843=head3 Typical Usage Examples
844
845To create an object of the vector measure, we would have the following
846lines of code in the Perl program.
847
848  use WordNet::Similarity::vector;
849  $measure = WordNet::Similarity::vector->new($wn, '/home/sid/vector.conf');
850
851The reference of the initialized object is stored in the scalar variable
852'$measure'. '$wn' contains a WordNet::QueryData object that should have been
853created earlier in the program. The second parameter to the 'new' method is
854the path of the configuration file for the vector measure. If the 'new'
855method is unable to create the object, '$measure' would be undefined. This,
856as well as any other error/warning may be tested.
857
858  die "Unable to create object.\n" if(!defined $measure);
859  ($err, $errString) = $measure->getError();
860  die $errString."\n" if($err);
861
862To find the semantic relatedness of the first sense of the noun 'car' and
863the second sense of the noun 'bus' using the measure, we would write
864the following piece of code:
865
866  $relatedness = $measure->getRelatedness('car#n#1', 'bus#n#2');
867
868To get traces for the above computation:
869
870  print $measure->getTraceString();
871
872However, traces must be enabled using configuration files. By default
873traces are turned off.
874
875=head1 CONFIGURATION FILE
876
877The behavior of the measures of semantic relatedness can be controlled by
878using configuration files. These configuration files specify how certain
879parameters are initialized within the object. A configuration file may be
880specified as a parameter during the creation of an object using the new
881method. The configuration files must follow a fixed format.
882
883Every configuration file starts with the name of the module ON THE FIRST LINE
884of the file. For example, a configuration file for the vector module will have
885on the first line 'WordNet::Similarity::vector'. This is followed by the
886various parameters, each on a new line and having the form 'name::value'. The
887'value' of a parameter is optional (in case of boolean parameters). In case
888'value' is omitted, we would have just 'name::' on that line. Comments are
889supported in the configuration file. Anything following a '#' is ignored till
890the end of the line.
891
892The module parses the configuration file and recognizes the following
893parameters:
894
895=over
896
897=item trace
898
899The value of this parameter specifies the level of tracing that should
900be employed for generating the traces. This value
901is an integer equal to 0, 1, or 2. If the value is omitted, then the
902default value, 0, is used. A value of 0 switches tracing off. A value
903of 1 or 2 switches tracing on.  A value of 1 displays as
904traces only the gloss overlaps found. A value of 2 displays as traces all
905the text being compared.
906
907=item cache
908
909The value of this parameter specifies whether or not caching of the
910relatedness values should be performed.  This value is an
911integer equal to  0 or 1.  If the value is omitted, then the default
912value, 1, is used. A value of 0 switches caching 'off', and
913a value of 1 switches caching 'on'.
914
915=item maxCacheSize
916
917The value of this parameter indicates the size of the cache, used for
918storing the computed relatedness value. The specified value must be
919a non-negative integer.  If the value is omitted, then the default
920value, 5,000, is used. Setting maxCacheSize to zero has
921the same effect as setting cache to zero, but setting cache to zero is
922likely to be more efficient.  Caching and tracing at the same time can result
923in excessive memory usage because the trace strings are also cached.  If
924you intend to perform a large number of relatedness queries, then you
925might want to turn tracing off.
926
927=item relation
928
929The value of this parameter is the path to a file that contains a list of
930WordNet relations.  The path may be either an absolute path or a relative
931path.
932
933The vector module combines the glosses of synsets related to the target
934synsets by these relations and forms the gloss-vector from this combined
935gloss.
936
937WARNING: the format of the relation file is different for the vector and lesk
938measures.
939
940=item stop
941
The value of this parameter is the path of a file containing a list of stop
words that should be ignored in the glosses.  The path may be either an
absolute path or a relative path.
945
946=item stem
947
948The value of this parameter indicates whether or not stemming should be
949performed.  The value must be an integer equal to 0 or 1.  If the
950value is omitted, then the default value, 0, is used.
951A value of 1 switches 'on' stemming, and a value of 0 switches stemming
952'off'. When stemming is enabled, all the words of the
953glosses are stemmed before their vectors are created for the vector
954measure or their overlaps are compared for the lesk measure.
955
956=item vectordb
957
The value of this parameter is the path to a file
containing word vectors, i.e. co-occurrence vectors for all the words
in the WordNet glosses.  If this parameter is omitted, the module searches
the directories in @INC for a default vectors file (WordNet/wordvectors.dat).
The vector measure will not run if no usable vectors file is found.
963
964=back
965
966=head1 RELATION FILE FORMAT
967
968The relation file starts with the string "VectorRelationFile" on the first line
969of the file. Following this, on each consecutive line, a relation is specified
970in the form --
971
972 func(func(func... (func)...)) [weight]
973
974Where "func" can be any one of the following functions:
975
976 hype() = Hypernym of
977 hypo() = Hyponym of
978 holo() = Holonym of
979 mero() = Meronym of
980 attr() = Attribute of
981 also() = Also see
982 sim() = Similar
983 enta() = Entails
984 caus() = Causes
985 part() = Particle
986 pert() = Pertainym of
987 glos = gloss (without example)
988 example = example (from the gloss)
989 glosexample = gloss + example
990 syns = the synset of the concept
991
992Each of these specifies a WordNet relation. And the outermost function in the
993nesting can only be one of glos, example, glosexample or syns. The functions specify which
994glosses to use for forming the gloss vector of the synset. An optional weight can be
995specified to weigh the contribution of that relation in the overall score.
996
997For example,
998
999 glos(hype(hypo)) 0.5
1000
1001means that the gloss of the hypernym of the hyponym of the synset is used to
1002form the gloss vector of the synset, and the values in this vector are
1003weighted by 0.5. If one of "glos", "example", "glosexample" or "syns" is not
1004specified as the outermost function in the nesting,
1005then "glosexample" is assumed by default. This implies that
1006
1007 glosexample(hypo(also))
1008
1009and
1010
1011 hypo(also)
1012
1013are equivalent as far as the measure is concerned.
1014
1015=head1 SEE ALSO
1016
1017perl(1), WordNet::Similarity(3), WordNet::QueryData(3)
1018
1019http://www.cs.utah.edu/~sidd
1020
1021http://wordnet.princeton.edu
1022
1023http://www.ai.mit.edu/~jrennie/WordNet
1024
1025http://groups.yahoo.com/group/wn-similarity
1026
1027=head1 AUTHORS
1028
1029 Ted Pedersen, University of Minnesota, Duluth
1030 tpederse at d.umn.edu
1031
1032 Siddharth Patwardhan, University of Utah, Salt Lake City
1033 sidd at cs.utah.edu
1034
1035 Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
1036 banerjee+ at cs.cmu.edu
1037
1038=head1 BUGS
1039
1040To report bugs, go to http://groups.yahoo.com/group/wn-similarity/ or
1041send an e-mail to "S<tpederse at d.umn.edu>".
1042
1043=head1 COPYRIGHT AND LICENSE
1044
1045Copyright (c) 2005, Ted Pedersen, Siddharth Patwardhan and Satanjeev Banerjee
1046
1047This program is free software; you can redistribute it and/or
1048modify it under the terms of the GNU General Public License
1049as published by the Free Software Foundation; either version 2
1050of the License, or (at your option) any later version.
1051
1052This program is distributed in the hope that it will be useful,
1053but WITHOUT ANY WARRANTY; without even the implied warranty of
1054MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1055GNU General Public License for more details.
1056
1057You should have received a copy of the GNU General Public License
1058along with this program; if not, write to
1059
1060    The Free Software Foundation, Inc.,
1061    59 Temple Place - Suite 330,
1062    Boston, MA  02111-1307, USA.
1063
1064Note: a copy of the GNU General Public License is available on the web
1065at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
1066distribution as GPL.txt.
1067
1068=cut
1069