#! /usr/bin/perl -w
#
# semCorFreq.pl version 2.05
# (Last updated $Id: semCorFreq.pl,v 1.13 2008/05/30 23:12:44 sidz1979 Exp $)
#
# -----------------------------------------------------------------
#
# Reads the WordNet sense index, the cntlist frequency file and the
# noun/verb data files, sums the tagged-sense counts per synset offset,
# propagates the counts up the hierarchies with
# WordNet::Similarity::FrequencyCounter and writes the result to the
# file named by --outfile.

# Include other packages
use strict;
use WordNet::QueryData;
use WordNet::Tools;
use Getopt::Long;
use WordNet::Similarity::FrequencyCounter;

# Global Variable declaration.
my %offsetMnem;    # "<offset><n|v>" => [ sense keys found in the sense index ]
my %mnemFreq;      # sense key       => count read from cntlist
my %offsetFreq;    # "n"|"v"         => { offset => summed frequency }

# Get Command-Line options.
our ($opt_help, $opt_version, $opt_wnpath, $opt_outfile, $opt_smooth);
unless(GetOptions("help", "version", "wnpath=s", "outfile=s", "smooth=s"))
{
    # GetOptions has already reported the offending option on STDERR.
    showUsage();
    print "Type 'semCorFreq.pl --help' for detailed help.\n";
    exit 1;
}

# Check if help has been requested ... If so ... display help.
if(defined $opt_help)
{
    showHelp();
    exit;
}

# Check if version number has been requested ... If so ... display version.
if(defined $opt_version)
{
    showVersion();
    exit;
}

# Check if path to WordNet Data files has been provided ... If so ... save it.
# These are only fallbacks: they are replaced further down by
# $wn->dataPath() whenever QueryData provides that method.
my ($wnPCPath, $wnUnixPath);
if(defined $opt_wnpath)
{
    $wnPCPath   = $opt_wnpath;
    $wnUnixPath = $opt_wnpath;
}
elsif(defined $ENV{WNSEARCHDIR})
{
    $wnPCPath   = $ENV{WNSEARCHDIR};
    $wnUnixPath = $ENV{WNSEARCHDIR};
}
elsif(defined $ENV{WNHOME})
{
    # WNHOME is the installation root; the database lives in its "dict"
    # sub-directory on both platforms.
    $wnPCPath   = $ENV{WNHOME} . "\\dict";
    $wnUnixPath = $ENV{WNHOME} . "/dict";
}
else
{
    $wnPCPath   = "C:\\Program Files\\WordNet\\3.0\\dict";
    $wnUnixPath = "/usr/local/share/WordNet";
}

# Output file must be specified
unless(defined $opt_outfile)
{
    showUsage();
    print "Type 'semCorFreq.pl --help' for detailed help.\n";
    exit;
}

# Initialize POS Map (ss_type digit used in sense keys => POS letter).
my %posMap;
$posMap{"1"} = "n";
$posMap{"2"} = "v";

# Get a WordNet::QueryData object...
print STDERR "Loading WordNet ... ";
my $wn = (defined $opt_wnpath)
    ? WordNet::QueryData->new($opt_wnpath)
    : WordNet::QueryData->new();
die "Unable to create WordNet::QueryData object.\n" if(!$wn);
$wnPCPath = $wnUnixPath = $wn->dataPath() if($wn->can('dataPath'));
my $wntools = WordNet::Tools->new($wn);
die "Unable to create WordNet::Tools object.\n" if(!$wntools);
print STDERR "done.\n";

# Loading the Sense Indices.
print STDERR "Loading sense indices ... ";
my $idxFh = openFirstReadable("sense index file",
                              $wnUnixPath."/index.sense",
                              $wnPCPath."\\sense.idx");
while(my $entry = <$idxFh>)
{
    chomp $entry;
    my @field = split / +/, $entry;

    # Need at least "sense_key offset"; skip blank or truncated lines.
    next unless(defined $field[0] && defined $field[1]);

    # Only noun (1) and verb (2) sense keys are of interest.
    if($field[0] =~ /%([12]):/)
    {
        my $pos = $posMap{$1};
        (my $offset = $field[1]) =~ s/^0*//;
        push @{$offsetMnem{$offset.$pos}}, $field[0];
    }
}
close($idxFh);
print STDERR "done.\n";

# Loading the frequency counts from 'cntlist'.
# Two layouts exist:
#   cntlist     : tag_cnt sense_key sense_number
#   cntlist.rev : sense_key sense_number tag_cnt
# Both are accepted; the position of the sense key tells them apart.
print STDERR "Loading cntlist ... ";
my $cntFh = openFirstReadable("cntlist",
                              $wnUnixPath."/cntlist",
                              $wnUnixPath."/cntlist.rev",
                              $wnPCPath."\\cntlist");
while(my $entry = <$cntFh>)
{
    chomp $entry;
    my @field = split / /, $entry;
    if(defined $field[1] && $field[1] =~ /%[12]:/)
    {
        # cntlist layout
        $mnemFreq{$field[1]} = $field[0];
    }
    elsif(defined $field[2] && $field[0] =~ /%[12]:/)
    {
        # cntlist.rev layout
        $mnemFreq{$field[0]} = $field[2];
    }
}
close($cntFh);
print STDERR "done.\n";

# Mapping the frequency counts to offsets...
print STDERR "Mapping offsets to frequencies ... ";

# Smoothing history: add-1 introduced 06/22/2002, removed 09/13/2002,
# made optional 05/01/2003.  Any scheme other than 'ADD1' is reported
# after the loop and otherwise ignored.
my $addOne        = (defined $opt_smooth && $opt_smooth eq 'ADD1') ? 1 : 0;
my $unknownSmooth = (defined $opt_smooth && !$addOne) ? 1 : 0;

foreach my $tPos ("noun", "verb")
{
    my $xPos   = substr($tPos, 0, 1);    # "n" or "v"
    my $dataFh = openFirstReadable("data file",
                                   $wnUnixPath."/data.$tPos",
                                   $wnPCPath."\\$tPos.dat");
    while(my $line = <$dataFh>)
    {
        # Header lines of the data files start with two spaces.
        next if(substr($line, 0, 2) eq "  ");

        # Every synset line starts with its offset; skip anything else
        # instead of reusing a stale capture.
        next unless($line =~ /^([0-9]+)\s+/);
        (my $offset = $1) =~ s/^0*//;

        # Sum the counts of every sense key that maps to this synset.
        my $freq = 0;
        if(exists $offsetMnem{$offset.$xPos})
        {
            foreach my $mnem (@{$offsetMnem{$offset.$xPos}})
            {
                $freq += $mnemFreq{$mnem} if($mnemFreq{$mnem});
            }
        }
        $offsetFreq{$xPos}{$offset} = $freq + $addOne;
    }
    close($dataFh);
}
print STDERR "done.\n";
print "Unknown smoothing scheme '$opt_smooth'.\nContinuing without smoothing.\n" if($unknownSmooth);

# Removing unwanted data structures...
print STDERR "Cleaning junk from memory ... ";
undef %offsetMnem;
undef %mnemFreq;
print STDERR "done.\n";

# Determine the topmost nodes of all hierarchies...
print STDERR "Determining topmost nodes of all hierarchies ... ";
my $topHash = WordNet::Similarity::FrequencyCounter::createTopHash($wn);
print STDERR "done.\n";

# Propagate the frequencies up...
print STDERR "Propagating frequencies up through WordNet ... ";
my $newFreq = WordNet::Similarity::FrequencyCounter::propagateFrequency(\%offsetFreq, $wn, $topHash);
print STDERR "done.\n";

# Write out the information content file...
# Layout: a "wnver::<hash>" header, then one "<offset><pos> <freq>[ ROOT]"
# line per synset, nouns first, each POS sorted by numeric offset.
print STDERR "Writing infocontent file ... ";
open(my $outFh, '>', $opt_outfile)
    || die "Unable to open data file '$opt_outfile' for writing: $!\n";
print $outFh "wnver::".$wntools->hashCode()."\n";
foreach my $pos ("n", "v")
{
    foreach my $offset (sort {$a <=> $b} keys %{$newFreq->{$pos}})
    {
        print $outFh $offset.$pos." ".$newFreq->{$pos}->{$offset};
        print $outFh " ROOT" if($topHash->{$pos}->{$offset});
        print $outFh "\n";
    }
}
# Buffered write errors only surface at close time.
close($outFh) || die "Unable to finish writing '$opt_outfile': $!\n";
print STDERR "done.\n";
print STDERR "Wrote file '$opt_outfile'.\n";

# ---------------------- Subroutines start here -------------------------
# Subroutine to open the first readable file among several candidates.
# Arguments: a short description used in the error message, followed by
# the candidate paths in order of preference.
# Returns a read-only lexical filehandle; dies listing every attempted
# path and its error when none can be opened.
sub openFirstReadable
{
    my ($what, @candidates) = @_;
    my @failures;
    foreach my $path (@candidates)
    {
        my $fh;
        return $fh if(open($fh, '<', $path));
        push @failures, "  $path: $!";
    }
    die "Unable to open $what.\n" . join("\n", @failures) . "\n";
}

# Subroutine to display Usage
sub showUsage
{
    print "Usage: semCorFreq.pl [{ --outfile FILE [--wnpath PATH] [--smooth SCHEME] | --help | --version }]\n";
}

# Subroutine to show detailed help.
sub showHelp
{
    showUsage();
    print "\nA helper tool Perl program for WordNet::Similarity.\n";
    print "This program is used to generate the frequency count data\n";
    print "files which are used by the Jiang Conrath, Resnik and Lin\n";
    print "measures to calculate the information content of synsets in\n";
    print "WordNet.\n";
    print "\nOptions:\n";
    print "--outfile Name of the output file (FILE) to write out the\n";
    print " information content data to.\n";
    print "--wnpath Option to specify the path to the WordNet data\n";
    print " files as PATH.\n";
    print "--smooth Specifies the smoothing to be used on the\n";
    print " probabilities computed. SCHEME specifies the type\n";
    print " of smoothing to perform. It is a string, which can be\n";
    print " only be 'ADD1' as of now. Other smoothing schemes\n";
    print " will be added in future releases.\n";
    print "--help Displays this help screen.\n";
    print "--version Displays version information.\n";
}

# Subroutine to display version information.
sub showVersion
{
    print "semCorFreq.pl version 2.05\n";
    print "Copyright (c) 2005-2008, Ted Pedersen and Siddharth Patwardhan.\n";
}

1;

__END__

=head1 NAME

semCorFreq.pl - Compute Information Content from SemCor sense-tagged corpus

=head1 SYNOPSIS

 semCorFreq.pl [{ --outfile FILE [--wnpath PATH] [--smooth SCHEME]
                | --help | --version }]

=head1 DESCRIPTION

This program is used to generate the default information
content file (ic-semcor.dat) that is used by
WordNet::Similarity in the Jiang Conrath, Resnik and Lin
measures.

It uses the cntlist file as provided by WordNet as the
source of frequency counts. These are derived from sense tagged
corpora which include a portion of the Brown Corpus and
the Red Badge of Courage. This collection of sense tagged
text is referred to as SemCor, and is not distributed by
WordNet any longer. Also, note that the cntlist file is
no longer officially supported by WordNet, so the information
provided therein may not be reliable.

=for comment
NOTE(review): the paragraph below says this program ignores sense tags
and counts raw word forms, yet the code reads sense-keyed counts from
cntlist. It looks like text meant for a raw-text counting variant;
confirm and correct.

The SemCor data we use comes from Rada Mihalcea, who has
mapped SemCor from its original version to each successive
version of WordNet. In this program we ignore the
SenseTags and simply treat SemCor as raw text. This is
to allow for the comparison of the effect of counting
from sense tags (as done in L<semCorFreq.pl>) versus
raw or plain word forms (as done here).


=head1 OPTIONS

B<--outfile>=I<filename>

    The name of a file to which output should be written

B<--wnpath>=I<path>

    Location of the WordNet data files (e.g.,
    /usr/local/WordNet-3.0/dict)

B<--smooth>=I<SCHEME>

    Smoothing that should be used on the probabilities computed.
    SCHEME can only be ADD1 at this time

B<--help>

    Show a help message

B<--version>

    Display version information

=head1 BUGS

Report to WordNet::Similarity mailing list :
L<http://groups.yahoo.com/group/wn-similarity>

=head1 SEE ALSO

L<WordNet::Similarity>

SemCor Download (from Rada Mihalcea):
L<http://www.cs.unt.edu/~rada/downloads.html#semcor>

WordNet home page :
L<http://wordnet.princeton.edu>

WordNet::Similarity home page :
L<http://wn-similarity.sourceforge.net>

=head1 AUTHORS

 Ted Pedersen, University of Minnesota, Duluth
 tpederse at d.umn.edu

 Siddharth Patwardhan, University of Utah, Salt Lake City
 sidd at cs.utah.edu

=head1 COPYRIGHT

Copyright (c) 2005-2008, Ted Pedersen and Siddharth Patwardhan

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

=cut