1#! /usr/bin/perl -w
2#
3# semCorFreq.pl version 2.05
4# (Last updated $Id: semCorFreq.pl,v 1.13 2008/05/30 23:12:44 sidz1979 Exp $)
5#
6# -----------------------------------------------------------------
7
8# Include other packages
9use strict;
10use WordNet::QueryData;
11use WordNet::Tools;
12use Getopt::Long;
13use WordNet::Similarity::FrequencyCounter;
14
# Global lookup tables shared by the loading and mapping steps below.
my %offsetMnem = ();    # "<synset offset><pos letter>" => list of sense keys
my %mnemFreq   = ();    # sense key => count read from the cntlist file
my %offsetFreq = ();    # pos letter => { synset offset => frequency }
19
# Get Command-Line options.
# Getopt::Long stores every recognized option in the matching $opt_*
# package variable, which is why these are declared with 'our'.
our ($opt_help, $opt_version, $opt_wnpath, $opt_outfile, $opt_smooth);

# GetOptions returns false when the command line holds an unknown or
# malformed option. Stop in that case instead of carrying on with the
# option (for example a mistyped --smooth) silently dropped.
unless(GetOptions("help", "version", "wnpath=s", "outfile=s", "smooth=s"))
{
  showUsage();
  print "Type 'semCorFreq.pl --help' for detailed help.\n";
  exit 1;
}

# Check if help has been requested ... If so ... display help.
if(defined $opt_help)
{
  showHelp();
  exit;
}

# Check if version number has been requested ... If so ... display version.
if(defined $opt_version)
{
  showVersion();
  exit;
}
37
# Work out where the WordNet data files live. Two spellings of the
# directory are kept because the loaders below try a Unix-style file name
# first and a PC-style one second.
# Precedence: --wnpath, then WNSEARCHDIR, then WNHOME, then built-in defaults.
my ($wnPCPath, $wnUnixPath);
my $explicitDir = (defined $opt_wnpath) ? $opt_wnpath : $ENV{WNSEARCHDIR};
if(defined $explicitDir)
{
  # An explicitly named directory is used verbatim for both spellings.
  $wnUnixPath = $explicitDir;
  $wnPCPath = $explicitDir;
}
elsif(defined $ENV{WNHOME})
{
  # NOTE(review): only the PC spelling gets a "dict" subdirectory appended;
  # the Unix spelling is WNHOME itself -- confirm this is intended.
  $wnUnixPath = $ENV{WNHOME} . "";
  $wnPCPath = $ENV{WNHOME} . "\\dict";
}
else
{
  # Nothing configured: fall back to the hard-coded install locations.
  $wnUnixPath = "/usr/local/share/WordNet";
  $wnPCPath = "C:\\Program Files\\WordNet\\3.0\\dict";
}
60
# An output file is mandatory: without --outfile, print the usage line
# plus a pointer to --help and stop.
if(!defined $opt_outfile)
{
  showUsage();
  print "Type 'semCorFreq.pl --help' for detailed help.\n";
  exit;
}
68
# Initialize POS Map: the digit that follows '%' in a sense index entry
# is translated to a one-letter part of speech.
my %posMap = ("1" => "n", "2" => "v");

# Get a WordNet::QueryData object, built from --wnpath when it was given
# and from the module's own defaults otherwise.
print STDERR "Loading WordNet ... ";
my $wn;
if(defined $opt_wnpath)
{
  $wn = WordNet::QueryData->new($opt_wnpath);
}
else
{
  $wn = WordNet::QueryData->new();
}
unless($wn)
{
  die "Unable to create WordNet::QueryData object.\n";
}

# If the QueryData object can report its data directory, that directory
# replaces both path spellings chosen earlier.
if($wn->can('dataPath'))
{
  $wnUnixPath = $wn->dataPath();
  $wnPCPath = $wnUnixPath;
}

# WordNet::Tools object; used later for the "wnver::" line of the output.
my $wntools = WordNet::Tools->new($wn);
unless($wntools)
{
  die "Unable to create WordNet::Tools object.\n";
}
print STDERR "done.\n";
82
# Loading the Sense Indices.
# Fills %offsetMnem: "<synset offset><pos letter>" => list of the sense
# index entries (first field of each line) that point at that synset.
# Only entries whose part-of-speech digit is 1 or 2 are kept.
print STDERR "Loading sense indices ... ";
my $idxHandle;

# Three-argument open on a lexical handle: the directory comes from the
# command line or the environment, so none of its characters may be
# interpreted as an open mode. The Unix file name is tried first, then
# the PC one.
open($idxHandle, '<', $wnUnixPath."/index.sense")
  || open($idxHandle, '<', $wnPCPath."\\sense.idx")
  || die "Unable to open sense index file.\n";
while(my $entry = <$idxHandle>)
{
  chomp $entry;
  my @line = split / +/, $entry;

  # Blank or truncated lines carry no offset; skip them instead of
  # matching against undefined fields.
  next if(@line < 2);
  if($line[0] =~ /%([12]):/)
  {
    my $posHere = $1;

    # Offsets are stored without leading zeros so they compare equal to
    # the offsets taken from the data files later on.
    $line[1] =~ s/^0*//;
    push @{$offsetMnem{$line[1].$posMap{$posHere}}}, $line[0];
  }
}
close($idxHandle);
print STDERR "done.\n";
99
# Loading the frequency counts from 'cntlist'.
# Fills %mnemFreq: sense key => tag count, for sense keys whose
# part-of-speech digit is 1 or 2.
print STDERR "Loading cntlist ... ";
my $cntHandle;

# Three-argument open on a lexical handle so that no character of the
# directory name can be taken as an open mode.
open($cntHandle, '<', $wnUnixPath."/cntlist.rev")
  || open($cntHandle, '<', $wnPCPath."\\cntlist")
  || die "Unable to open cntlist.\n";
while(my $entry = <$cntHandle>)
{
  chomp $entry;
  my @line = split ' ', $entry;
  next if(@line < 2);

  # The two files tried above order their fields differently:
  #   cntlist.rev :  sense_key sense_number tag_cnt
  #   cntlist     :  tag_cnt sense_key sense_number
  # Reading cntlist.rev with the cntlist layout matches nothing and leaves
  # every frequency at zero, so the layout is detected per line from the
  # position of the sense key.
  my ($senseKey, $tagCount);
  if($line[0] =~ /%[12]:/)
  {
    ($senseKey, $tagCount) = @line[0, 2];
  }
  elsif($line[1] =~ /%[12]:/)
  {
    ($tagCount, $senseKey) = @line[0, 1];
  }

  # Neither layout matched (other part of speech) or the count is missing.
  next unless(defined $senseKey && defined $tagCount);
  $mnemFreq{$senseKey} = $tagCount;
}
close($cntHandle);
print STDERR "done.\n";
114
# Mapping the frequency counts to offsets...
# For every synset offset found in the noun and verb data files, store in
# %offsetFreq{pos}{offset} the sum of the counts of all sense keys that
# point at it (zero when there are none), plus one under --smooth ADD1.
print STDERR "Mapping offsets to frequencies ... ";
my $unknownSmooth = 0;
foreach my $tPos ("noun", "verb")
{
  # One-letter part of speech ("n" or "v"), used as key suffix in
  # %offsetMnem and as first-level key in %offsetFreq.
  my $xPos = substr($tPos, 0, 1);

  # Lexical handle and three-argument open: the directory text is never
  # read as an open mode, and the special DATA handle is left alone.
  my $dataHandle;
  open($dataHandle, '<', $wnUnixPath."/data.$tPos")
    || open($dataHandle, '<', $wnPCPath."\\$tPos.dat")
    || die "Unable to open data file.\n";
  while(my $line = <$dataHandle>)
  {
    # Lines that begin with two spaces are not synset records.
    next if "  " eq substr $line, 0, 2;

    # A record starts with its numeric offset. Skip anything else: after a
    # failed match $1 would still hold a stale capture.
    next unless($line =~ /^([0-9]+)\s+/);
    my $offset = $1;
    $offset =~ s/^0*//;

    # Add up the counts of every sense key attached to this synset.
    my $frequency = 0;
    if(exists $offsetMnem{$offset.$xPos})
    {
      foreach my $mnem (@{$offsetMnem{$offset.$xPos}})
      {
        $frequency += $mnemFreq{$mnem} if($mnemFreq{$mnem});
      }
    }

    # Optional add-1 smoothing: each synset gets one extra count. Any other
    # scheme name is remembered and reported once after the loop.
    if(defined $opt_smooth)
    {
      if($opt_smooth eq 'ADD1')
      {
        $frequency++;
      }
      else
      {
        $unknownSmooth = 1;
      }
    }
    $offsetFreq{$xPos}{$offset} = $frequency;
  }
  close($dataHandle);
}
print STDERR "done.\n";
print "Unknown smoothing scheme '$opt_smooth'.\nContinuing without smoothing.\n" if($unknownSmooth);
186
# The sense-key tables are not read again after the mapping step, so
# empty them before the propagation work starts.
print STDERR "Cleaning junk from memory ... ";
undef %$_ for (\%offsetMnem, \%mnemFreq);
print STDERR "done.\n";

# Ask the FrequencyCounter module for the topmost nodes of all hierarchies.
# The result is consulted below as $topHash->{pos}->{offset}.
print STDERR "Determining topmost nodes of all hierarchies ... ";
my $topHash
  = WordNet::Similarity::FrequencyCounter::createTopHash($wn);
print STDERR "done.\n";

# Propagate the per-synset frequencies up through WordNet. The result is
# read below as $newFreq->{pos}->{offset}.
print STDERR "Propagating frequencies up through WordNet ... ";
my $newFreq
  = WordNet::Similarity::FrequencyCounter::propagateFrequency(
      \%offsetFreq, $wn, $topHash);
print STDERR "done.\n";
202
# Write out the information content file...
# Layout: one "wnver::<hash code>" line, then one line per noun offset and
# one per verb offset in ascending numeric order, each of the form
# "<offset><pos> <frequency>" with " ROOT" appended for offsets that are
# flagged in $topHash.
print STDERR "Writing infocontent file ... ";

# Three-argument open on a lexical handle: the file name comes straight
# from --outfile, and none of its characters may be read as an open mode.
my $outHandle;
open($outHandle, '>', $opt_outfile) || die "Unable to open data file for writing.\n";
print {$outHandle} "wnver::".$wntools->hashCode()."\n";
foreach my $pos ("n", "v")
{
  foreach my $offset (sort {$a <=> $b} keys %{$newFreq->{$pos}})
  {
    print {$outHandle} $offset.$pos." ".$newFreq->{$pos}->{$offset};
    print {$outHandle} " ROOT" if($topHash->{$pos}->{$offset});
    print {$outHandle} "\n";
  }
}

# Buffered write errors (for example a full disk) only surface at close.
close($outHandle) || die "Unable to finish writing '$opt_outfile': $!\n";
print STDERR "done.\n";
print STDERR "Wrote file '$opt_outfile'.\n";
222
223# ---------------------- Subroutines start here -------------------------
# Subroutine to display Usage
# Prints the one-line command synopsis to standard output.
sub showUsage
{
  my $synopsis = "[{ --outfile FILE [--wnpath PATH] [--smooth SCHEME] | --help | --version }]";
  print "Usage: semCorFreq.pl $synopsis\n";
}
229
# Subroutine to show detailed help.
# Prints the usage line followed by a description of the program and of
# every command-line option, all on standard output.
sub showHelp
{
  showUsage();

  # Non-interpolating heredoc: the text below is emitted exactly as written.
  print <<'END_HELP';

A helper tool Perl program for WordNet::Similarity.
This program is used to generate the frequency count data
files which are used by the Jiang Conrath, Resnik and Lin
measures to calculate the information content of synsets in
WordNet.

Options:
--outfile     Name of the output file (FILE) to write out the
              information content data to.
--wnpath      Option to specify the path to the WordNet data
              files as PATH.
--smooth      Specifies the smoothing to be used on the
              probabilities computed. SCHEME specifies the type
              of smoothing to perform. It is a string, which can be
              only be 'ADD1' as of now. Other smoothing schemes
              will be added in future releases.
--help        Displays this help screen.
--version     Displays version information.
END_HELP
}
252
# Subroutine to display version information.
# Prints the program version and the copyright line to standard output.
sub showVersion
{
  print "semCorFreq.pl version 2.05\n",
        "Copyright (c) 2005-2008, Ted Pedersen and Siddharth Patwardhan.\n";
}
259
2601;
261
262__END__
263
264=head1 NAME
265
266semCorFreq.pl - Compute Information Content from SemCor sense-tagged corpus
267
268=head1 SYNOPSIS
269
270 semCorFreq.pl [{ --outfile FILE [--wnpath PATH] [--smooth SCHEME]
271	| --help | --version }]
272
273=head1 DESCRIPTION
274
275This program is used to generate the default information
276content file (ic-semcor.dat) that is used by
277WordNet::Similarity in the Jiang Conrath, Resnik and Lin
278measures.
279
280It uses the cntlist file as provided by WordNet as the
281source of frequency counts. These are derived from sense tagged
282corpora which include a portion of the Brown Corpus and
the Red Badge of Courage. This collection of sense tagged
284text is referred to as SemCor, and is not distributed by
285WordNet any longer. Also, note that the cntlist file is
286no longer officially supported by WordNet, so the information
287provided therein may not be reliable.
288
289The SemCor data we use comes from Rada Mihalcea, who has
290mapped SemCor from its original version to each successive
291version of WordNet. In this program we ignore the
292SenseTags and simply treat SemCor as raw text. This is
293to allow for the comparison of the effect of counting
294from sense tags (as done in L<semCorFreq.pl>) versus
295raw or plain word forms (as done here).
296
297=head1 OPTIONS
298
299B<--outfile>=I<filename>
300
301    The name of a file to which output should be written
302
303B<--wnpath>=I<path>
304
305    Location of the WordNet data files (e.g.,
306    /usr/local/WordNet-3.0/dict)
307
308B<--smooth>=I<SCHEME>
309
    Smoothing that should be used on the probabilities computed.  SCHEME can
311    only be ADD1 at this time
312
313B<--help>
314
315    Show a help message
316
317B<--version>
318
319    Display version information
320
321=head1 BUGS
322
323Report to WordNet::Similarity mailing list :
324 L<http://groups.yahoo.com/group/wn-similarity>
325
326=head1 SEE ALSO
327
328L<WordNet::Similarity>
329
330SemCor Download (from Rada Mihalcea):
331 L<http://www.cs.unt.edu/~rada/downloads.html#semcor>
332
333WordNet home page :
334 L<http://wordnet.princeton.edu>
335
336WordNet::Similarity home page :
337 L<http://wn-similarity.sourceforge.net>
338
339=head1 AUTHORS
340
341 Ted Pedersen, University of Minnesota, Duluth
342 tpederse at d.umn.edu
343
344 Siddharth Patwardhan, University of Utah, Salt Lake City
345 sidd at cs.utah.edu
346
347=head1 COPYRIGHT
348
349Copyright (c) 2005-2008, Ted Pedersen and Siddharth Patwardhan
350
351This program is free software; you can redistribute it and/or
352modify it under the terms of the GNU General Public License
353as published by the Free Software Foundation; either version 2
354of the License, or (at your option) any later version.
355This program is distributed in the hope that it will be useful,
356but WITHOUT ANY WARRANTY; without even the implied warranty of
357MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
358GNU General Public License for more details.
359
360You should have received a copy of the GNU General Public License
361along with this program; if not, write to the Free Software
362Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
363
364=cut
365
366