1# Before `make install' is performed this script should be runnable with
2# `make test'. After `make install' it should work as `perl test.pl'
3
4#########################
5
6# change 'tests => 1' to 'tests => last_test_to_print';
7use Test::More tests => 12;
8use Lingua::EN::Tagger;
9
10ok('Lingua::EN::Tagger', 'module compiled'); # If we made it this far, we're ok.
11
12#########################
13
14# Insert your test code below, the Test module is use()ed here so read
15# its man page ( perldoc Test ) for help writing this test script.
16
17
18
19######################################
20# Start by creating the parser object
21# (without the stemmer)
22######################################
23ok( $parser = Lingua::EN::Tagger->new( stem => 0, weight_noun_phrases => 0, longest_noun_phrase => 15 ), 'creating parser object' );
24$tagged = $parser->add_tags( penn() );
25
26ok( %words = $parser->get_words( penn() ), 'get_words() method' );
27$accuracy = compute_accuracy( \%words, np_benchmark() );
28is( $accuracy, '100', "accuracy of np extraction ($accuracy%)" );
29
30##############################################
31# Test the extraction of maximal noun phrases
32##############################################
33ok( %max_noun_phrases = $parser->get_max_noun_phrases( $tagged ), 'extract MNPs' );
34$accuracy = compute_accuracy( \%max_noun_phrases, mnp_benchmark() );
35is( $accuracy, '100', "accuracy of mnp extraction ($accuracy%)" );
36
37
38##############################################
39# Test the extraction of all noun phrases
40##############################################
41ok( %noun_phrases = $parser->get_noun_phrases( $tagged ), 'extract noun phrases' );
42$accuracy = compute_accuracy( \%noun_phrases, np_benchmark() );
43is( $accuracy, '100', "accuracy of np extraction ($accuracy%)" );
44
45##############################################
46# Test the extraction of all nouns
47##############################################
48ok( %nouns = $parser->get_nouns( $tagged ), 'extract nouns' );
49$accuracy = compute_accuracy( \%nouns, noun_benchmark() );
50is( $accuracy, '100', "accuracy of noun extraction ($accuracy%)" );
51
52
53##############################################
54# Test the extraction of proper nouns
55##############################################
56ok( %nnp = $parser->get_proper_nouns( $tagged ), 'extract proper nouns' );
57$accuracy = compute_accuracy( \%nnp, nnp_benchmark() );
58is( $accuracy, '100', "accuracy of nnp extraction ($accuracy%)" );
59
60
61sub compute_accuracy {
62        ( $hash_ref, $benchmark ) = @_;
63        ( $errors, $i ) = ( 0 )x2;
64        foreach( keys %{ $hash_ref } ){
65                $i++;
66                unless( defined $benchmark->{$_} ){
67                        # warn "$_ not in benchmark\n";
68                        $errors++,
69                        next;
70                }
71                $i++;
72                unless ( $hash_ref->{$_} == $benchmark->{$_} ){
73                        # warn $hash_ref->{$_}." != ".$benchmark->{$_}." (benchmark)\n";
74                        $errors++;
75                }
76        }
77        foreach( keys %{ $benchmark } ){
78                $i++;
79                unless( defined $hash_ref->{$_} ){
80                        # warn "$_ not defined in extraction\n";
81                        $errors++;
82                }
83        }
84        return sprintf( "%d", 100 * ( 1 - $errors / $i ) );
85}
86
87sub mnp_benchmark {
88        $hash_ref = { 'lisa raines' => 1,
89                        'lawyer' => 1,
90                        'director of government relations for the industrial biotechnical association' => 1,
91                        'judge' => 1,
92                        'patent law' => 1,
93                        'concerns of research-based industries' => 1,
94                        'judge newman' => 1,
95                        'former patent lawyer' => 1,
96                        'dissent' => 1,
97                        'court' => 1,
98                        'motion for a rehearing of the case by the full court' => 1,
99                        'panel' => 1,
100                        'judicial legislation' => 1,
101                        'important high-technological industry' => 1,
102                        'regard' => 1,
103                        'consequences for research' => 1,
104                        'innovation' => 1,
105                        'public interest' => 1,
106                        'ms. raines' => 1,
107                        'judgement' => 1,
108                        'concern that the absence of patent lawyers on the court' => 1
109                };
110        return $hash_ref;
111}
112
113sub noun_benchmark {
114        $hash_ref = { 'lisa' => 1,
115                        'raines' => 2,
116                        'lawyer' => 2,
117                        'director' => 1,
118                        'relations' => 1,
119                        'government' => 1,
120                        'association' => 1,
121                        'judge' => 2,
122                        'patent' => 3,
123                        'law' => 1,
124                        'concerns' => 1,
125                        'industries' => 1,
126                        'newman' => 1,
127                        'dissent' => 1,
128                        'court' => 3,
129                        'motion' => 1,
130                        'rehearing' => 1,
131                        'case' => 1,
132                        'panel' => 1,
133                        'legislation' => 1,
134                        'industry' => 1,
135                        'regard' => 1,
136                        'consequences' => 1,
137                        'research' => 1,
138                        'innovation' => 1,
139                        'interest' => 1,
140                        'ms.' => 1,
141                        'judgement' => 1,
142                        'concern' => 1,
143                        'industrial' => 1,
144                        'biotechnical' => 1,
145                        'absence' => 1,
146                        'lawyers' => 1
147                };
148        return $hash_ref;
149}
150
151sub np_benchmark {
152        $hash_ref = { 'lisa' => 1,
153                        'raines' => 2,
154                        'lawyer' => 2,
155                        'director' => 1,
156                        'relations' => 1,
157                        'government' => 1,
158                        'association' => 1,
159                        'judge' => 2,
160                        'patent' => 3,
161                        'law' => 1,
162                        'concerns' => 1,
163                        'industries' => 1,
164                        'newman' => 1,
165                        'dissent' => 1,
166                        'court' => 3,
167                        'motion' => 1,
168                        'rehearing' => 1,
169                        'case' => 1,
170                        'panel' => 1,
171                        'legislation' => 1,
172                        'industry' => 1,
173                        'regard' => 1,
174                        'consequences' => 1,
175                        'research' => 1,
176                        'innovation' => 1,
177                        'interest' => 1,
178                        'ms.' => 1,
179                        'judgement' => 1,
180                        'concern' => 1,
181                        'industrial' => 1,
182                        'biotechnical' => 1,
183                        'absence' => 1,
184                        'lawyers' => 1,
185                        'lisa raines' => 1,
186                        'director of government relations for the industrial biotechnical association' => 1,
187                        'patent law' => 1,
188                        'concerns of research-based industries' => 1,
189                        'judge newman' => 1,
190                        'former patent lawyer' => 1,
191                        'motion for a rehearing of the case by the full court' => 1,
192                        'judicial legislation' => 1,
193                        'important high-technological industry' => 1,
194                        'consequences for research' => 1,
195                        'public interest' => 1,
196                        'ms. raines' => 1,
197                        'concern that the absence of patent lawyers on the court' => 1,
198                        'government relations' => 1,
199                        'industrial biotechnical association' => 1,
200                        'biotechnical association' => 1,
201                        'research-based industries' => 1,
202                        'patent lawyer' => 1,
203                        'full court' => 1,
204                        'high-technological industry' => 1,
205                        'patent lawyers' => 1
206                };
207        return $hash_ref;
208
209}
210sub nnp_benchmark {
211	$hash_ref = { 'lisa raines' => 1,
212			'industrial biotechnical association' => 1,
213			'judge newman' => 1,
214			'ms. raines' => 1
215		};
216	return $hash_ref;
217}
218
219sub words_benchmark {
220}
221
222#       Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
223
224
225
226
227###############################################
228# Words that mostly don't occur in the lexicon
229###############################################
230sub jibberish {
231        return "Nils occludes the 5 corybantic sciolists from fressing upon the
232        northeast-oriented perambulations of the yabbering doyenne";
233}
234
235
236##########################################################
237# Hyphenated words that mostly don't occur in the lexicon
238##########################################################
239sub hyphen {
240        # brother-in-law not in lexicon, sister-in-law is
241        return "The brother-in-law. The sister-in-law. A strategy of tit-for-tat among
242        middle-eastern states.";
243}
244
245
246
247####################################################
248# Test the tagger against an actual tagged corpus
249####################################################
250sub penn {
251        return <<PENN
252        Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
253PENN
254}
255
256
257