# Before `make install' is performed this script should be runnable with
# `make test'. After `make install' it should work as `perl test.pl'

#########################

use strict;
use warnings;

# The plan is fixed at 12: one compile check, one constructor check, and an
# "extracted something" + "matches the benchmark" pair for each of the five
# extraction methods exercised below.
use Test::More tests => 12;
use Lingua::EN::Tagger;

pass('module compiled');    # If we made it this far, we're ok.

#########################

######################################
# Start by creating the parser object
# (without the stemmer)
######################################
my $parser = Lingua::EN::Tagger->new(
    stem                => 0,
    weight_noun_phrases => 0,
    longest_noun_phrase => 15,
);
ok( $parser, 'creating parser object' );
my $tagged = $parser->add_tags( penn() );

# get_words() is handed the raw text; the extractors below are handed the
# tagged text produced by add_tags().
my %words = $parser->get_words( penn() );
ok( scalar( keys %words ), 'get_words() method' );
my $words_accuracy = compute_accuracy( \%words, np_benchmark() );
is( $words_accuracy, '100', "accuracy of np extraction ($words_accuracy%)" );

##############################################
# Test the extraction of maximal noun phrases
##############################################
my %max_noun_phrases = $parser->get_max_noun_phrases($tagged);
ok( scalar( keys %max_noun_phrases ), 'extract MNPs' );
my $mnp_accuracy = compute_accuracy( \%max_noun_phrases, mnp_benchmark() );
is( $mnp_accuracy, '100', "accuracy of mnp extraction ($mnp_accuracy%)" );

##############################################
# Test the extraction of all noun phrases
##############################################
my %noun_phrases = $parser->get_noun_phrases($tagged);
ok( scalar( keys %noun_phrases ), 'extract noun phrases' );
my $np_accuracy = compute_accuracy( \%noun_phrases, np_benchmark() );
is( $np_accuracy, '100', "accuracy of np extraction ($np_accuracy%)" );

##############################################
# Test the extraction of all nouns
##############################################
my %nouns = $parser->get_nouns($tagged);
ok( scalar( keys %nouns ), 'extract nouns' );
my $noun_accuracy = compute_accuracy( \%nouns, noun_benchmark() );
is( $noun_accuracy, '100', "accuracy of noun extraction ($noun_accuracy%)" );

##############################################
# Test the extraction of proper nouns
##############################################
my %nnp = $parser->get_proper_nouns($tagged);
ok( scalar( keys %nnp ), 'extract proper nouns' );
my $nnp_accuracy = compute_accuracy( \%nnp, nnp_benchmark() );
is( $nnp_accuracy, '100', "accuracy of nnp extraction ($nnp_accuracy%)" );

# compute_accuracy( \%extracted, \%benchmark )
#
# Scores an extraction result against a benchmark and returns the score as
# an integer percentage string (truncated, not rounded), e.g. '100'.
#
# Checks performed, each counted once in the denominator:
#   * every extracted key must be defined in the benchmark;
#   * when it is, its value must be numerically equal to the benchmark value
#     (a second check, so matching keys weigh more than unknown ones);
#   * every benchmark key must be defined in the extraction.
#
# When both hashes are empty there is nothing to check; '100' is returned
# instead of dying with a division by zero.
sub compute_accuracy {
    my ( $extracted, $benchmark ) = @_;
    my ( $errors, $checks ) = ( 0, 0 );

    for my $term ( keys %{$extracted} ) {
        $checks++;
        unless ( defined $benchmark->{$term} ) {

            # Unknown term: one error, and no value comparison is possible.
            $errors++;
            next;
        }
        $checks++;
        $errors++ unless $extracted->{$term} == $benchmark->{$term};
    }

    for my $term ( keys %{$benchmark} ) {
        $checks++;
        $errors++ unless defined $extracted->{$term};
    }

    return '100' if $checks == 0;
    return sprintf( '%d', 100 * ( 1 - $errors / $checks ) );
}

# Expected maximal noun phrases (phrase => count) for the penn() passage.
sub mnp_benchmark {
    return {
        'lisa raines' => 1,
        'lawyer'      => 1,
        'director of government relations for the industrial biotechnical association' => 1,
        'judge'                                 => 1,
        'patent law'                            => 1,
        'concerns of research-based industries' => 1,
        'judge newman'                          => 1,
        'former patent lawyer'                  => 1,
        'dissent'                               => 1,
        'court'                                 => 1,
        'motion for a rehearing of the case by the full court' => 1,
        'panel'                                 => 1,
        'judicial legislation'                  => 1,
        'important high-technological industry' => 1,
        'regard'                                => 1,
        'consequences for research'             => 1,
        'innovation'                            => 1,
        'public interest'                       => 1,
        'ms. raines'                            => 1,
        'judgement'                             => 1,
        'concern that the absence of patent lawyers on the court' => 1,
    };
}

# Expected single-word nouns (noun => count) for the penn() passage.
sub noun_benchmark {
    return {
        'lisa'         => 1,
        'raines'       => 2,
        'lawyer'       => 2,
        'director'     => 1,
        'relations'    => 1,
        'government'   => 1,
        'association'  => 1,
        'judge'        => 2,
        'patent'       => 3,
        'law'          => 1,
        'concerns'     => 1,
        'industries'   => 1,
        'newman'       => 1,
        'dissent'      => 1,
        'court'        => 3,
        'motion'       => 1,
        'rehearing'    => 1,
        'case'         => 1,
        'panel'        => 1,
        'legislation'  => 1,
        'industry'     => 1,
        'regard'       => 1,
        'consequences' => 1,
        'research'     => 1,
        'innovation'   => 1,
        'interest'     => 1,
        'ms.'          => 1,
        'judgement'    => 1,
        'concern'      => 1,
        'industrial'   => 1,
        'biotechnical' => 1,
        'absence'      => 1,
        'lawyers'      => 1,
    };
}

# Expected noun phrases of every length (phrase => count) for the penn()
# passage: all the single nouns from noun_benchmark() plus the multi-word
# phrases listed here.
sub np_benchmark {
    my %expected = (
        %{ noun_benchmark() },
        'lisa raines' => 1,
        'director of government relations for the industrial biotechnical association' => 1,
        'patent law'                            => 1,
        'concerns of research-based industries' => 1,
        'judge newman'                          => 1,
        'former patent lawyer'                  => 1,
        'motion for a rehearing of the case by the full court' => 1,
        'judicial legislation'                  => 1,
        'important high-technological industry' => 1,
        'consequences for research'             => 1,
        'public interest'                       => 1,
        'ms. raines'                            => 1,
        'concern that the absence of patent lawyers on the court' => 1,
        'government relations'                  => 1,
        'industrial biotechnical association'   => 1,
        'biotechnical association'              => 1,
        'research-based industries'             => 1,
        'patent lawyer'                         => 1,
        'full court'                            => 1,
        'high-technological industry'           => 1,
        'patent lawyers'                        => 1,
    );
    return \%expected;
}

# Expected proper nouns (phrase => count) for the penn() passage.
sub nnp_benchmark {
    return {
        'lisa raines'                         => 1,
        'industrial biotechnical association' => 1,
        'judge newman'                        => 1,
        'ms. raines'                          => 1,
    };
}

# Placeholder: no benchmark is defined here, and nothing in the visible part
# of this script calls it.
sub words_benchmark {
    return;
}

# Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
###############################################
# Words that mostly don't occur in the lexicon
###############################################
# Returns a fixed two-line sentence (the string contains an embedded newline
# followed by indentation).
sub jibberish {
    return "Nils occludes the 5 corybantic sciolists from fressing upon the
        northeast-oriented perambulations of the yabbering doyenne";
}


##########################################################
# Hyphenated words that mostly don't occur in the lexicon
##########################################################
# Returns a fixed two-line text (the string contains an embedded newline
# followed by indentation).
sub hyphen {
    # brother-in-law not in lexicon, sister-in-law is
    return "The brother-in-law. The sister-in-law. A strategy of tit-for-tat among
        middle-eastern states.";
}



####################################################
# Test the tagger against an actual tagged corpus
####################################################
# Returns the sample passage as a single newline-terminated line. The
# here-doc is single-quoted so that nothing in the passage is ever
# interpolated.
sub penn {
    return <<'PENN';
 Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
PENN
}