1# WordNet::Similarity::vector.pm version 2.04 2# (Last updated $Id: vector.pm,v 1.24 2008/03/27 06:21:17 sidz1979 Exp $) 3# 4# Module accepts two WordNet synsets and returns a floating point 5# number that indicates how similar those two synsets are, using a 6# gloss vector overlap measure based on "context vectors" described by 7# Schütze (1998). 8# 9# Copyright (c) 2005, 10# 11# Ted Pedersen, University of Minnesota Duluth 12# tpederse at d.umn.edu 13# 14# Siddharth Patwardhan, University of Utah, Salt Lake City 15# sidd at cs.utah.edu 16# 17# Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh 18# banerjee+ at cs.cmu.edu 19# 20# This program is free software; you can redistribute it and/or 21# modify it under the terms of the GNU General Public License 22# as published by the Free Software Foundation; either version 2 23# of the License, or (at your option) any later version. 24# 25# This program is distributed in the hope that it will be useful, 26# but WITHOUT ANY WARRANTY; without even the implied warranty of 27# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 28# GNU General Public License for more details. 29# 30# You should have received a copy of the GNU General Public License 31# along with this program; if not, write to 32# 33# The Free Software Foundation, Inc., 34# 59 Temple Place - Suite 330, 35# Boston, MA 02111-1307, USA. 36# 37# ------------------------------------------------------------------ 38 39package WordNet::Similarity::vector; 40 41=head1 NAME 42 43WordNet::Similarity::vector - Perl module for computing semantic relatedness 44of word senses using second order co-occurrence vectors of glosses of the word 45senses. 

=head1 SYNOPSIS

  use WordNet::Similarity::vector;

  use WordNet::QueryData;

  my $wn = WordNet::QueryData->new();

  my $vector = WordNet::Similarity::vector->new($wn);

  my $value = $vector->getRelatedness("car#n#1", "bus#n#2");

  ($error, $errorString) = $vector->getError();

  die "$errorString\n" if($error);

  print "car (sense 1) <-> bus (sense 2) = $value\n";

=head1 DESCRIPTION

SchE<uuml>tze (1998) creates what he calls context vectors (second order
co-occurrence vectors) of pieces of text for the purpose of Word Sense
Discrimination. This idea is adopted by Patwardhan and Pedersen to represent
the word senses by second-order co-occurrence vectors of their dictionary
(WordNet) definitions. The relatedness of two senses is then computed as
the cosine of their representative gloss vectors.

=over

=cut

use strict;
use WordNet::get_wn_info;
use WordNet::stem;
use WordNet::vectorFile;
use WordNet::Similarity;
use File::Spec;
use vars qw($VERSION @ISA);

@ISA = qw(WordNet::Similarity);

$VERSION = '2.04';

WordNet::Similarity::addConfigOption("relation", 0, "p", undef);
WordNet::Similarity::addConfigOption("vectordb", 0, "p", undef);
WordNet::Similarity::addConfigOption("stop", 0, "p", undef);
WordNet::Similarity::addConfigOption("stem", 0, "i", 0);
WordNet::Similarity::addConfigOption("textsize", 0, "i", "-1");

=item $vector->setPosList()

This method is internally called to determine the parts of speech
this measure is capable of dealing with. It flags each of the
part-of-speech tags n, v, a and r on the object.

Parameters: none.

Returns: 1 (always).

=cut

sub setPosList
{
  my ($self) = @_;

  # Mark every part-of-speech tag accepted by this measure.
  $self->{$_} = 1 for qw(n v a r);

  return 1;
}

=item $vector->initialize($file)

Overrides the initialize method in the parent class (WordNet::Similarity). This method
essentially initializes the measure for use.

Parameters: $file -- configuration file.

Returns: none.

=cut

# Initialization of the WordNet::Similarity::vector object... parses the config file and sets up
# global variables, or sets them to default values.
# INPUT PARAMS  : $paramFile .. File containing the module specific params.
# RETURN VALUES : (none)
#
# In order, this method:
#   1. resets {stem} and {stopHash} and delegates to the super-class initialize;
#   2. creates the gloss-vector cache ({vCache}, {vCacheQ}, {vCacheSize});
#   3. loads the stop list named by {stop}, if any (failure is a level-1 warning);
#   4. builds the WordNet::get_wn_info helper and stores it in {gwi};
#   5. locates the word-vector file ({vectordb}, or the first WordNet/wordvectors.dat
#      under @INC whose first line contains DOCUMENTCOUNT) and loads it into
#      {numberOfDimensions}, {wordIndex}, {indexWord} and {table};
#   6. locates the relation file ({relation}, or the first WordNet/vector-relation.dat
#      under @INC whose first line contains VectorRelationFile) and loads
#      {functions} and {weights} from it; without a relation file the single
#      relation "glosexample" with weight 1 is used;
#   7. defaults {textsize} to -1.
# Every fatal problem sets {error} to 2, appends to {errorString} and returns early.
sub initialize
{
  my $self = shift;
  my $wn = $self->{wn};
  my %stopHash = ();

  # Stemming? Compounds? StopWords?
  # These are reset before the super-class runs; presumably the super-class
  # overwrites {stem} from the configuration file -- TODO confirm.
  $self->{stem} = 0;
  $self->{stopHash} = {};

  # Call the initialize method of the super-class.
  $self->SUPER::initialize(@_);

  # Initialize the vector cache: {vCache} maps a word#pos#sense string to its
  # normalized gloss vector, {vCacheQ} lists the cached keys in insertion order.
  $self->{vCache} = {};
  $self->{vCacheQ} = [];
  $self->{vCacheSize} = 80;

  # Load the stop list.
  if(defined $self->{stop})
  {
    my $stopFile = $self->{stop};

    # Three-argument open: the path comes from the configuration file and must
    # never be interpreted as an open mode or a pipe.
    if(open(my $stopFh, '<', $stopFile))
    {
      while(my $line = <$stopFh>)
      {
        $line =~ s/[\r\f\n]//g;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        $line =~ s/\s+/_/g;

        # A blank line is not a stop word.
        next if($line eq "");

        $stopHash{$line} = 1;
        $self->{stopHash}->{$line} = 1;
      }
      close($stopFh);
    }
    else
    {
      $self->{errorString} .= "\nWarning (WordNet::Similarity::vector->initialize()) - ";
      $self->{errorString} .= "Unable to open $stopFile.";
      $self->{error} = 1 if($self->{error} < 1);
    }
  }

  # so now we are ready to initialize the get_wn_info package with
  # the wordnet object, 0/1 depending on if stemming is required and
  # the stop hash
  my $gwi = WordNet::get_wn_info->new($wn, ($self->{stem} ? 1 : 0), %stopHash);
  $self->{gwi} = $gwi;

  # Initialize the word vector database interface...
  my $vectorDB = $self->{vectordb};
  if(!defined $vectorDB || $vectorDB eq "")
  {
    # Nothing configured: fall back to a default data file installed under @INC.
    $vectorDB = _findInstalledDataFile('wordvectors.dat', qr/DOCUMENTCOUNT/);
    $self->{vectordb} = $vectorDB if(defined $vectorDB);
  }
  if(!defined $vectorDB || $vectorDB eq "")
  {
    $self->{errorString} .= "\nError (WordNet::Similarity::vector->initialize()) - ";
    $self->{errorString} .= "No usable Word Vector database found. Use configuration file.";
    $self->{error} = 2;
    return;
  }

  # Get the documentCount, dimensions and vectors...
  my ($documentCount, $readDims, $readVectors) = WordNet::vectorFile->readVectors($vectorDB);
  if(!defined $documentCount || !defined $readDims || !defined $readVectors)
  {
    $self->{errorString} .= "\nError (WordNet::Similarity::vector->initialize()) - ";
    $self->{errorString} .= "Error reading the vector database file.";
    $self->{error} = 2;
    return;
  }

  # Load the word vector dimensions...
  # The first whitespace-separated field of each dimension entry is used as
  # the index of that word.
  $self->{numberOfDimensions} = scalar(keys(%{$readDims}));
  foreach my $word (keys %{$readDims})
  {
    my @parts = split(/\s+/, $readDims->{$word});
    $self->{wordIndex}->{$word} = $parts[0];
    $self->{indexWord}->[$parts[0]] = $word;
  }

  # Set up the interface to the word vectors...
  foreach my $word (keys %{$readVectors})
  {
    my $vec = $readVectors->{$word};
    $self->{table}->{$word} = $vec if(defined $vec);
  }

  # If relation file not specified... manually add the relations to
  # be used... Look for the default vector relation file...
  if(!defined $self->{relation})
  {
    my $relationFile = _findInstalledDataFile('vector-relation.dat', qr/VectorRelationFile/);
    $self->{relation} = $relationFile if(defined $relationFile);
  }
  if(!defined $self->{relation})
  {
    $self->{weights}->[0] = 1;
    $self->{functions}->[0]->[0] = "glosexample";
  }
  else
  {
    # Load the relations data; the helper has already recorded the error.
    return if(!$self->_loadRelationFile($gwi, $self->{relation}));
  }

  $self->{textsize} = -1 if(!defined $self->{textsize});
  return;
}

# Searches @INC for an installed default data file.
# INPUT PARAMS  : $fileName    .. file name expected under a 'WordNet' directory.
#                 $headerRegex .. pattern the first line must match once all
#                                 whitespace has been removed from it.
# RETURN VALUES : path of the first matching file (in @INC order), or
#                 nothing if no readable file has the expected header.
sub _findInstalledDataFile
{
  my ($fileName, $headerRegex) = @_;

  foreach my $dir (@INC)
  {
    # @INC may contain hooks (references) as well as directory names.
    next if(ref $dir);

    # JM 1-16-04 -- modified to use File::Spec
    my $candidate = File::Spec->catfile($dir, 'WordNet', $fileName);
    next if(!-e $candidate);
    next if(!open(my $fh, '<', $candidate));

    my $header = <$fh>;
    close($fh);

    # An empty file has no header line at all.
    next if(!defined $header);

    # If there are multiple possibilities, get the one in the correct format.
    $header =~ s/\s+//g;
    return $candidate if($header =~ $headerRegex);
  }

  return;
}

# Parses a relation file into $self->{functions} and $self->{weights}.
# INPUT PARAMS  : $gwi          .. the WordNet::get_wn_info object whose methods
#                                  implement the relation functions.
#                 $relationFile .. path of the relation file.
# RETURN VALUES : 1 on success; 0 on failure, in which case {error} has been
#                 set to 2 and a message appended to {errorString}.
#
# Each non-blank line after the header has the form
#   func(func(... (func)...)) [weight]
# and becomes one entry of {functions}: the function names ordered from the
# innermost to the outermost call, i.e. in the order they must be applied.
sub _loadRelationFile
{
  my ($self, $gwi, $relationFile) = @_;

  # Records a fatal (level 2) error and yields the failure value.
  my $fail = sub
  {
    my ($message) = @_;
    $self->{errorString} .= "\nError (WordNet::Similarity::vector->initialize()) - ";
    $self->{errorString} .= $message;
    $self->{error} = 2;
    return 0;
  };

  my $fh;
  if(!open($fh, '<', $relationFile))
  {
    return $fail->("Unable to open $relationFile.");
  }

  my $header = <$fh>;
  $header = "" if(!defined $header);
  $header =~ s/[\r\f\n]//g;
  $header =~ s/\s+//g;
  if($header !~ /VectorRelationFile/)
  {
    close($fh);
    return $fail->("Bad file format ($relationFile).");
  }

  my $index = 0;
  $self->{functions} = [];
  $self->{weights} = [];
  while(my $relation = <$fh>)
  {
    $relation =~ s/[\r\f\n]//g;

    # remove leading/trailing spaces from the relation
    $relation =~ s/^\s+//;
    $relation =~ s/\s+$//;

    # A blank line (for instance at the end of the file) carries no relation;
    # without this it would be reported as an undefined function.
    next if($relation eq "");

    # now extract the weight if any. if no weight, assume 1
    my $weight = 1;
    if($relation =~ /(\S+)\s+(\S+)/)
    {
      ($relation, $weight) = ($1, $2);
    }
    $self->{weights}->[$index] = $weight;

    # "a(b(c))" becomes ("a", "b", "c"): outermost first, innermost last.
    $relation =~ s/[\s\)]//g;
    my @functionArray = split(/\(/, $relation);

    my $innermost = @functionArray ? $functionArray[-1] : "";
    if(!$gwi->can($innermost))
    {
      close($fh);
      return $fail->("Undefined function ($innermost) in relations file.");
    }
    my @chain = ($innermost);

    # Walk outwards, checking that every function exists and that it can
    # consume what the function nested inside it produces.
    # NOTE(review): called with a true second argument, a get_wn_info
    # function appears to return its (input type, output type) codes
    # instead of performing a lookup -- confirm against WordNet::get_wn_info.
    my ($input, $output, $dummy);
    for(my $k = $#functionArray - 1; $k >= 0; $k--)
    {
      my $outer = $functionArray[$k];
      my $inner = $functionArray[$k + 1];
      if(!$gwi->can($outer))
      {
        close($fh);
        return $fail->("Undefined function ($outer) in relations file.");
      }

      ($input, $dummy) = $gwi->$outer($dummy, 1);
      ($dummy, $output) = $gwi->$inner($dummy, 1);

      if($input != $output)
      {
        close($fh);
        return $fail->("Invalid function combination - ${outer}(${inner}).");
      }

      push(@chain, $outer);
    }

    # if the output of the outermost function is synset array (1)
    # wrap a glosexample around it
    my $outermost = $functionArray[0];
    ($dummy, $output) = $gwi->$outermost($dummy, 1);
    push(@chain, "glosexample") if($output == 1);

    $self->{functions}->[$index] = \@chain;
    $index++;
  }
  close($fh);

  return 1;
}

=item $vector->traceOptions()

This method is internally called to determine the extra options
specified by this measure (apart from the default options specified
in the WordNet::Similarity base class).

Parameters: none.

Returns: none.

=cut

# show all config options specific to this module
# (appends one "name :: value" line per option to $self->{traceString};
# an undefined file option is shown as an empty value)
sub traceOptions {
  my $self = shift;
  $self->{traceString} .= "relation File :: ".((defined $self->{relation})?"$self->{relation}":"")."\n";
  $self->{traceString} .= "vectorDB File :: ".((defined $self->{vectordb})?"$self->{vectordb}":"")."\n";
  $self->{traceString} .= "stop File :: ".((defined $self->{stop})?"$self->{stop}":"")."\n";
  $self->{traceString} .= "stem :: $self->{stem}\n";
  $self->{traceString} .= "textsize :: $self->{textsize}\n";
}

=item $vector->getRelatedness

Computes the relatedness of two word senses using the Vector Algorithm.

Parameters: two word senses in "word#pos#sense" format.

Returns: Unless a problem occurs, the return value is the relatedness
score, which is greater-than or equal-to 0. If an error occurs,
then the error level is set to non-zero and an error
string is created (see the description of getError()).

=cut

# Computes the relatedness of two word senses.
# INPUT PARAMS  : $wps1, $wps2 .. the two senses, each in word#pos#sense format.
# RETURN VALUES : the inner product of the two normalized gloss vectors, or
#                 undef after recording a warning/error in {error} and
#                 {errorString}.
#
# The score is looked up in the relatedness cache first (when {doCache} is
# set), then computed from the gloss vectors, which are themselves cached in
# {vCache}.
# NOTE(review): $self->{weights} is not consulted anywhere in this method, so
# the per-relation weights of the relation file do not affect the score.
sub getRelatedness
{
  my $self = shift;
  my $wps1 = shift;
  my $wps2 = shift;
  my $wn = $self->{wn};
  my $wntools = $self->{wntools};

  # Check the existence of the WordNet::QueryData object.
  if(!$wn)
  {
    $self->{errorString} .= "\nError (WordNet::Similarity::vector->getRelatedness()) - ";
    $self->{errorString} .= "A WordNet::QueryData object is required.";
    $self->{error} = 2;
    return undef;
  }

  # Check the existence of the WordNet::Tools object.
  if(!$wntools)
  {
    $self->{errorString} .= "\nError (WordNet::Similarity::vector->getRelatedness()) - ";
    $self->{errorString} .= "A WordNet::Tools object is required.";
    $self->{error} = 2;
    return undef;
  }

  # Initialize traces.
  $self->{traceString} = "" if($self->{trace});

  # Undefined input cannot go unpunished.
  if(!$wps1 || !$wps2)
  {
    $self->{errorString} .= "\nWarning (WordNet::Similarity::vector->getRelatedness()) - Undefined input values.";
    $self->{error} = 1 if($self->{error} < 1);
    return undef;
  }

  # Security check -- are the input strings in the correct format (word#pos#sense).
  foreach my $wps ($wps1, $wps2)
  {
    next if($wps =~ /^\S+\#([nvar])\#\d+$/);
    $self->{errorString} .= "\nWarning (WordNet::Similarity::vector->getRelatedness()) - ";
    $self->{errorString} .= "Input not in word\#pos\#sense format.";
    $self->{error} = ($self->{error} < 1) ? 1 : $self->{error};
    return undef;
  }

  # Now check if the similarity value for these two synsets is in
  # fact in the cache... if so return the cached value.
  my $relatedness = $self->{doCache} ? $self->fetchFromCache($wps1, $wps2) : undef;
  defined $relatedness and return $relatedness;

  # Are the gloss vectors present in the cache...
  if(defined $self->{vCache}->{$wps1} && defined $self->{vCache}->{$wps2})
  {
    if($self->{trace})
    {
      # ah so we do need SOME traces! put in the synset names.
      $self->{traceString} .= "Synset 1: $wps1 (Gloss Vector found in Cache)\n";
      $self->{traceString} .= "Synset 2: $wps2 (Gloss Vector found in Cache)\n";
    }
    my $score = _inner($self->{vCache}->{$wps1}, $self->{vCache}->{$wps2});

    # that does all the scoring. Put in cache if doing cacheing. Then
    # return the score.
    $self->{doCache} and $self->storeToCache($wps1, $wps2, $score);
    return $score;
  }

  # see if any traces reqd. if so, list the functions of every relation
  # before the per-synset traces.
  if($self->{trace})
  {
    my $i = 0;
    while(defined $self->{functions}->[$i])
    {
      my $functionsString = "Functions: ";
      my $j = 0;
      while(defined $self->{functions}->[$i]->[$j])
      {
        $functionsString .= ($self->{functions}->[$i]->[$j])." ";
        $j++;
      }
      $self->{traceString} .= "$functionsString\n";
      $i++;
    }
  }

  # Get vectors... score...
  # Each vector comes from the cache when possible; otherwise the gloss text
  # is assembled and turned into a normalized vector.
  $self->{traceString} .= "Synset 1: $wps1" if($self->{trace});
  my $vector1 = $self->_glossVectorFor($wps1);

  $self->{traceString} .= "Synset 2: $wps2" if($self->{trace});
  my $vector2 = $self->_glossVectorFor($wps2);

  my $score = _inner($vector1, $vector2);

  # that does all the scoring. Put in cache if doing cacheing. Then
  # return the score.
  $self->{doCache} and $self->storeToCache($wps1, $wps2, $score);

  return $score;
}

# Builds the preprocessed gloss text of one word sense.
# INPUT PARAMS  : $wps .. the sense in word#pos#sense format.
# RETURN VALUES : the text produced by applying every relation in
#                 {functions} to the sense, concatenated, cleaned up and
#                 passed through the WordNet::Tools compoundify method.
sub _glossText
{
  my ($self, $wps) = @_;
  my $gwi = $self->{gwi};
  my $text = "";

  my $i = 0;
  while(defined $self->{functions}->[$i])
  {
    # we put the synset in a "set" of itself. This set may increase in
    # size as the functions are applied (since some relations have a one
    # to many mapping).
    my %synsets = ($wps => 1);
    my @arguments = (\%synsets);

    # apply the functions to the arguments, passing the output of
    # the inner functions to the inputs of the outer ones
    my $j = 0;
    while(defined $self->{functions}->[$i]->[$j])
    {
      my $fn = $self->{functions}->[$i]->[$j];
      @arguments = $gwi->$fn(@arguments);
      $j++;
    }

    # finally we should have one cute little string!
    $text .= $arguments[0];
    $i++;
  }

  # Preprocess...
  $text =~ s/\'//g;
  $text =~ s/[^a-z0-9]+/ /g;
  $text =~ s/^\s+//;
  $text =~ s/\s+$//;
  my $compounded = $self->{wntools}->compoundify($text);

  return $compounded;
}

# Returns the normalized gloss vector of one word sense.
# INPUT PARAMS  : $wps .. the sense in word#pos#sense format.
# RETURN VALUES : reference to the sparse vector (hash).
#
# A vector found in {vCache} is returned as is. Otherwise it is computed,
# normalized and cached, and the oldest cached entries are dropped while more
# than {vCacheSize} keys are queued in {vCacheQ}. When tracing, the outcome is
# appended to {traceString}.
sub _glossVectorFor
{
  my ($self, $wps) = @_;

  if(defined $self->{vCache}->{$wps})
  {
    $self->{traceString} .= " (Gloss vector found in cache)\n" if($self->{trace});
    return $self->{vCache}->{$wps};
  }

  my $text = $self->_glossText($wps);
  my ($vector, $vectorTrace, $magnitude) = $self->_getVector($text);
  $self->{traceString} .= "\nString: \"$text\"\n$vectorTrace\n" if($self->{trace});
  _norm($vector, $magnitude);

  $self->{vCache}->{$wps} = $vector;
  push(@{$self->{vCacheQ}}, $wps);
  while(scalar(@{$self->{vCacheQ}}) > $self->{vCacheSize})
  {
    my $evicted = shift(@{$self->{vCacheQ}});
    delete $self->{vCache}->{$evicted};
  }

  return $vector;
}


# Method to compute a context vector from a given body of text...
# INPUT PARAMS  : $text .. whitespace-separated words.
# RETURN VALUES : ($vector, $trace, $magnitude, $count) where
#                 $vector    .. reference to a sparse vector (hash of
#                               index => summed value),
#                 $trace     .. "Word Vectors for: w1, w2, ..." when tracing is
#                               on, otherwise the empty string,
#                 $magnitude .. Euclidean length of $vector,
#                 $count     .. number of words taken from $text.
#
# Words matching the [XGES]{3}\d{5}[XGES]{3} marker pattern are skipped. When
# $self->{textsize} is zero or more, at most that many words are taken; an
# undefined or negative value means no limit. Each distinct word that has an
# entry in $self->{table} and is not in $self->{stopHash} contributes its
# stored vector, a string of alternating "index value" fields.
sub _getVector
{
  my $self = shift;
  my $text = shift;
  my $ret = {};
  return ($ret, "", 0, 0) if(!defined $text);

  my $limit = $self->{textsize};
  $limit = -1 if(!defined $limit);

  # Collect the distinct words to use. split(' ') ignores leading
  # whitespace, so no empty "word" is ever counted.
  my %types;
  my $count = 0;
  foreach my $word (split(' ', $text))
  {
    next if($word =~ /[XGES]{3}\d{5}[XGES]{3}/);

    # Test the limit before taking the word, so that exactly textsize
    # words are used (and none at all when textsize is 0).
    last if($limit >= 0 && $count >= $limit);

    $types{$word} = 1;
    $count++;
  }

  # Sum the stored vectors of the usable words.
  my @usedWords;
  foreach my $word (keys %types)
  {
    next if(!defined $self->{table}->{$word});
    next if(defined $self->{stopHash}->{$word});

    # split(' ') keeps the index/value pairs aligned even when the stored
    # string starts with whitespace.
    my %pieces = split(' ', $self->{table}->{$word});
    push(@usedWords, $word);

    foreach my $index (keys %pieces)
    {
      $ret->{$index} = ((defined $ret->{$index}) ? $ret->{$index} : 0) + $pieces{$index};
    }
  }

  # [trace]
  my $localTraces = "";
  if($self->{trace})
  {
    $localTraces = "Word Vectors for: ".join(", ", @usedWords);
  }
  # [/trace]

  my $sumOfSquares = 0;
  foreach my $index (keys %{$ret})
  {
    $sumOfSquares += ($ret->{$index} * $ret->{$index});
  }

  return ($ret, $localTraces, sqrt($sumOfSquares), $count);
}

# Normalizes the sparse vector.
# INPUT PARAMS  : $vec .. reference to the sparse vector (hash), modified in place.
#                 $mag .. its magnitude.
# RETURN VALUES : (none)
# Nothing is done when either argument is undefined or the magnitude is
# zero, which avoids a division by zero for an empty vector.
sub _norm
{
  my ($vec, $mag) = @_;

  return if(!defined $vec || !defined $mag || $mag == 0);

  foreach my $key (keys %{$vec})
  {
    $vec->{$key} /= $mag;
  }
  return;
}

# Inner product of two sparse vectors.
# INPUT PARAMS  : $vec1, $vec2 .. references to sparse vectors (hashes).
# RETURN VALUES : sum of $vec1->{k} * $vec2->{k} over the keys whose value is
#                 defined in both vectors; 0 when either argument is undefined.
sub _inner
{
  my ($vec1, $vec2) = @_;

  return 0 if(!defined $vec1 || !defined $vec2);

  # Walk the keys of the vector with strictly fewer entries (the second
  # vector on a tie) and probe the other one.
  my ($walked, $probed) = (scalar(keys(%{$vec1})) < scalar(keys(%{$vec2})))
                          ? ($vec1, $vec2)
                          : ($vec2, $vec1);

  my $prod = 0;
  foreach my $key (keys %{$walked})
  {
    next if(!defined $probed->{$key});
    $prod += ($vec1->{$key} * $vec2->{$key});
  }

  return $prod;
}

1;

__END__

=back

=head2 Usage

The semantic relatedness modules in this distribution are built as classes
that define the following methods:

  new()
  getRelatedness()
  getError()
  getTraceString()

See the WordNet::Similarity(3) documentation for details of these methods.

=head3 Typical Usage Examples

To create an object of the vector measure, we would have the following
lines of code in the Perl program.

  use WordNet::Similarity::vector;
  $measure = WordNet::Similarity::vector->new($wn, '/home/sid/vector.conf');

The reference of the initialized object is stored in the scalar variable
'$measure'. '$wn' contains a WordNet::QueryData object that should have been
created earlier in the program. The second parameter to the 'new' method is
the path of the configuration file for the vector measure. If the 'new'
method is unable to create the object, '$measure' would be undefined. This,
as well as any other error/warning may be tested.
857 858 die "Unable to create object.\n" if(!defined $measure); 859 ($err, $errString) = $measure->getError(); 860 die $errString."\n" if($err); 861 862To find the semantic relatedness of the first sense of the noun 'car' and 863the second sense of the noun 'bus' using the measure, we would write 864the following piece of code: 865 866 $relatedness = $measure->getRelatedness('car#n#1', 'bus#n#2'); 867 868To get traces for the above computation: 869 870 print $measure->getTraceString(); 871 872However, traces must be enabled using configuration files. By default 873traces are turned off. 874 875=head1 CONFIGURATION FILE 876 877The behavior of the measures of semantic relatedness can be controlled by 878using configuration files. These configuration files specify how certain 879parameters are initialized within the object. A configuration file may be 880specified as a parameter during the creation of an object using the new 881method. The configuration files must follow a fixed format. 882 883Every configuration file starts with the name of the module ON THE FIRST LINE 884of the file. For example, a configuration file for the vector module will have 885on the first line 'WordNet::Similarity::vector'. This is followed by the 886various parameters, each on a new line and having the form 'name::value'. The 887'value' of a parameter is optional (in case of boolean parameters). In case 888'value' is omitted, we would have just 'name::' on that line. Comments are 889supported in the configuration file. Anything following a '#' is ignored till 890the end of the line. 891 892The module parses the configuration file and recognizes the following 893parameters: 894 895=over 896 897=item trace 898 899The value of this parameter specifies the level of tracing that should 900be employed for generating the traces. This value 901is an integer equal to 0, 1, or 2. If the value is omitted, then the 902default value, 0, is used. A value of 0 switches tracing off. 
A value
of 1 or 2 switches tracing on. A value of 1 displays as
traces only the gloss overlaps found. A value of 2 displays as traces all
the text being compared.

=item cache

The value of this parameter specifies whether or not caching of the
relatedness values should be performed. This value is an
integer equal to 0 or 1. If the value is omitted, then the default
value, 1, is used. A value of 0 switches caching 'off', and
a value of 1 switches caching 'on'.

=item maxCacheSize

The value of this parameter indicates the size of the cache, used for
storing the computed relatedness value. The specified value must be
a non-negative integer. If the value is omitted, then the default
value, 5,000, is used. Setting maxCacheSize to zero has
the same effect as setting cache to zero, but setting cache to zero is
likely to be more efficient. Caching and tracing at the same time can result
in excessive memory usage because the trace strings are also cached. If
you intend to perform a large number of relatedness queries, then you
might want to turn tracing off.

=item relation

The value of this parameter is the path to a file that contains a list of
WordNet relations. The path may be either an absolute path or a relative
path.

The vector module combines the glosses of synsets related to the target
synsets by these relations and forms the gloss-vector from this combined
gloss.

WARNING: the format of the relation file is different for the vector and lesk
measures.

=item stop

The value of this parameter is the path of a file containing a list of stop
words that should be ignored in the glosses. The path may be either an
absolute path or a relative path.

=item stem

The value of this parameter indicates whether or not stemming should be
performed. The value must be an integer equal to 0 or 1.
If the
value is omitted, then the default value, 0, is used.
A value of 1 switches 'on' stemming, and a value of 0 switches stemming
'off'. When stemming is enabled, all the words of the
glosses are stemmed before their vectors are created for the vector
measure or their overlaps are compared for the lesk measure.

=item vectordb

The value of this parameter is the path to a file
containing word vectors, i.e. co-occurrence vectors for all the words
in the WordNet glosses. If this parameter is omitted, the module looks
for a default file named WordNet/wordvectors.dat in the directories
listed in @INC. The vector measure will not run unless a usable vectors
file is either specified in a configuration file or found there.

=back

=head1 RELATION FILE FORMAT

The relation file starts with the string "VectorRelationFile" on the first line
of the file. Following this, on each consecutive line, a relation is specified
in the form --

 func(func(func... (func)...)) [weight]

Where "func" can be any one of the following functions:

 hype() = Hypernym of
 hypo() = Hyponym of
 holo() = Holonym of
 mero() = Meronym of
 attr() = Attribute of
 also() = Also see
 sim() = Similar
 enta() = Entails
 caus() = Causes
 part() = Particle
 pert() = Pertainym of
 glos = gloss (without example)
 example = example (from the gloss)
 glosexample = gloss + example
 syns = the synset of the concept

Each of these specifies a WordNet relation. And the outermost function in the
nesting can only be one of glos, example, glosexample or syns. The functions specify which
glosses to use for forming the gloss vector of the synset. An optional weight can be
specified to weigh the contribution of that relation in the overall score.

For example,

 glos(hype(hypo)) 0.5

means that the gloss of the hypernym of the hyponym of the synset is used to
form the gloss vector of the synset, and the values in this vector are
weighted by 0.5.
If one of "glos", "example", "glosexample" or "syns" is not 1004specified as the outermost function in the nesting, 1005then "glosexample" is assumed by default. This implies that 1006 1007 glosexample(hypo(also)) 1008 1009and 1010 1011 hypo(also) 1012 1013are equivalent as far as the measure is concerned. 1014 1015=head1 SEE ALSO 1016 1017perl(1), WordNet::Similarity(3), WordNet::QueryData(3) 1018 1019http://www.cs.utah.edu/~sidd 1020 1021http://wordnet.princeton.edu 1022 1023http://www.ai.mit.edu/~jrennie/WordNet 1024 1025http://groups.yahoo.com/group/wn-similarity 1026 1027=head1 AUTHORS 1028 1029 Ted Pedersen, University of Minnesota, Duluth 1030 tpederse at d.umn.edu 1031 1032 Siddharth Patwardhan, University of Utah, Salt Lake City 1033 sidd at cs.utah.edu 1034 1035 Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh 1036 banerjee+ at cs.cmu.edu 1037 1038=head1 BUGS 1039 1040To report bugs, go to http://groups.yahoo.com/group/wn-similarity/ or 1041send an e-mail to "S<tpederse at d.umn.edu>". 1042 1043=head1 COPYRIGHT AND LICENSE 1044 1045Copyright (c) 2005, Ted Pedersen, Siddharth Patwardhan and Satanjeev Banerjee 1046 1047This program is free software; you can redistribute it and/or 1048modify it under the terms of the GNU General Public License 1049as published by the Free Software Foundation; either version 2 1050of the License, or (at your option) any later version. 1051 1052This program is distributed in the hope that it will be useful, 1053but WITHOUT ANY WARRANTY; without even the implied warranty of 1054MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1055GNU General Public License for more details. 1056 1057You should have received a copy of the GNU General Public License 1058along with this program; if not, write to 1059 1060 The Free Software Foundation, Inc., 1061 59 Temple Place - Suite 330, 1062 Boston, MA 02111-1307, USA. 
1063 1064Note: a copy of the GNU General Public License is available on the web 1065at L<http://www.gnu.org/licenses/gpl.txt> and is included in this 1066distribution as GPL.txt. 1067 1068=cut 1069