1package Lingua::Stem::Ru; 2$Lingua::Stem::Ru::VERSION = '0.04'; 3use 5.006; 4use strict; 5use warnings; 6use Exporter 5.57 'import'; 7use Carp; 8 9our @EXPORT = (); 10our @EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching); 11our %EXPORT_TAGS = (); 12 13my $Stem_Caching = 0; 14my $Stem_Cache = {}; 15 16my $VOWEL = qr/���������/; 17my $PERFECTIVEGROUND = qr/((��|����|������|��|����|������)|((?<=[��])(�|���|�����)))$/; 18my $REFLEXIVE = qr/(�[��])$/; 19my $ADJECTIVE = qr/(��|��|��|��|���|���|��|��|��|��|��|��|��|��|���|���|���|���|��|��|��|��|��|��|��|��)$/; 20my $PARTICIPLE = qr/((���|���|���)|((?<=[��])(��|��|��|��|�)))$/; 21my $VERB = qr/((���|���|���|����|����|���|���|���|��|��|��|��|��|��|��|���|���|���|��|���|���|��|��|���|���|���|���|��|�)|((?<=[��])(��|��|���|���|��|�|�|��|�|��|��|��|��|��|��|���|���)))$/; 22my $NOUN = qr/(�|��|��|��|��|�|����|���|���|��|��|�|���|��|��|��|�|���|��|���|��|��|��|�|�|��|���|��|�|�|��|��|�|��|��|�)$/; 23my $RVRE = qr/^(.*?[$VOWEL])(.*)$/; 24my $DERIVATIONAL = qr/[^$VOWEL][$VOWEL]+[^$VOWEL]+[$VOWEL].*(?<=�)���?$/; 25 26sub stem { 27 return [] if ($#_ == -1); 28 my $parm_ref; 29 if (ref $_[0]) { 30 $parm_ref = shift; 31 } else { 32 $parm_ref = { @_ }; 33 } 34 35 my $words = []; 36 my $locale = 'ru'; 37 my $exceptions = {}; 38 foreach (keys %$parm_ref) { 39 my $key = lc ($_); 40 if ($key eq '-words') { 41 @$words = @{$parm_ref->{$key}}; 42 } elsif ($key eq '-exceptions') { 43 $exceptions = $parm_ref->{$key}; 44 } elsif ($key eq '-locale') { 45 $locale = $parm_ref->{$key}; 46 } else { 47 croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n"); 48 } 49 } 50 51 local( $_ ); 52 foreach (@$words) { 53 # Flatten case 54 $_ = lc $_; 55 56 # Check against exceptions list 57 if (exists $exceptions->{$_}) { 58 $_ = $exceptions->{$_}; 59 next; 60 } 61 62 # Check against cache of stemmed words 63 my $original_word = $_; 64 if ($Stem_Caching && exists $Stem_Cache->{$original_word}) { 65 $_ = $Stem_Cache->{$original_word}; 66 next; 67 } 68 69 $_ = stem_word($_); 70 71 $Stem_Cache->{$original_word} = $_ if $Stem_Caching; 72 } 73 $Stem_Cache = {} if ($Stem_Caching < 2); 74 75 return $words; 76} 77 78sub stem_word { 79 my $word = lc shift; 80 81 # Check against cache of stemmed words 82 if ($Stem_Caching && exists $Stem_Cache->{$word}) { 83 return $Stem_Cache->{$word}; 84 } 85 86 my ($start, $RV) = $word =~ /$RVRE/; 87 return $word unless $RV; 88 89 # Step 1 90 unless ($RV =~ s/$PERFECTIVEGROUND//) { 91 $RV =~ s/$REFLEXIVE//; 92 93 if ($RV =~ s/$ADJECTIVE//) { 94 $RV =~ s/$PARTICIPLE//; 95 } else { 96 $RV =~ s/$NOUN// unless $RV =~ s/$VERB//; 97 } 98 } 99 100 # Step 2 101 $RV =~ s/�$//; 102 103 # Step 3 104 $RV =~ s/����?$// if $RV =~ /$DERIVATIONAL/; 105 106 # Step 4 107 unless ($RV =~ s/�$//) { 108 $RV =~ s/����?//; 109 $RV =~ s/��$/�/; 110 } 111 112 return $start.$RV; 113} 114 115sub stem_caching { 116 my $parm_ref; 117 if (ref $_[0]) { 118 $parm_ref = shift; 119 } else { 120 $parm_ref = { @_ }; 121 } 122 my $caching_level = $parm_ref->{-level}; 123 if (defined $caching_level) { 124 if ($caching_level !~ m/^[012]$/) { 125 croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value"); 126 } 127 $Stem_Caching = $caching_level; 128 } 129 return $Stem_Caching; 130} 131 132sub clear_stem_cache { 133 $Stem_Cache = {}; 134} 135 136 1371; 138__END__ 139 140=head1 NAME 141 142Lingua::Stem::Ru - Porter's stemming algorithm for Russian (KOI8-R only) 143 144=head1 SYNOPSIS 145 146 use Lingua::Stem::Ru; 147 my $stems = Lingua::Stem::Ru::stem({ -words => $word_list_reference, 148 -locale => 'ru', 149 -exceptions => $exceptions_hash, 150 }); 151 152 my $stem = Lingua::Stem::Ru::stem_word( $word ); 153 154=head1 DESCRIPTION 155 156This module applies the Porter Stemming Algorithm to its parameters, 157returning the stemmed words. 158 159The algorithm is implemented exactly as described in: 160 161 http://snowball.tartarus.org/algorithms/russian/stemmer.html 162 163The code is carefully crafted to work in conjunction with the L<Lingua::Stem> 164module by Benjamin Franz. This stemmer is also based 165on the work of Aldo Capini, see L<Lingua::Stem::It>. 166 167=head1 METHODS 168 169=over 4 170 171=item stem({ -words => \@words, -locale => 'ru', -exceptions => \%exceptions }); 172 173Stems a list of passed words. Returns an anonymous list reference to the stemmed 174words. 175 176Example: 177 178 my $stemmed_words = Lingua::Stem::Ru::stem({ -words => \@words, 179 -locale => 'ru', 180 -exceptions => \%exceptions, 181 }); 182 183=item stem_word( $word ); 184 185Stems a single word and returns the stem directly. 186 187Example: 188 189 my $stem = Lingua::Stem::Ru::stem_word( $word ); 190 191=item stem_caching({ -level => 0|1|2 }); 192 193Sets the level of stem caching. 194 195'0' means 'no caching'. This is the default level. 196 197'1' means 'cache per run'. This caches stemming results during a single 198 call to 'stem'. 199 200'2' means 'cache indefinitely'. This caches stemming results until 201 either the process exits or the 'clear_stem_cache' method is called. 202 203=item clear_stem_cache; 204 205Clears the cache of stemmed words 206 207=back 208 209=cut 210 211=head2 EXPORT 212 213None by default. 214 215=head1 AUTHOR 216 217Aleksandr Guidrevitch <pillgrim@mail.ru> 218 219=head1 REPOSITORY 220 221L<https://github.com/neilb/Lingua-Stem-Ru> 222 223=head1 SEE ALSO 224 225=over 226 227=item L<Lingua::Stem> 228 229provides an interface for some other pure Perl stemmers available 230on CPAN, including L<Lingua::Stem::Ru> 231 232=item L<Lingua::Stem::Snowball> 233 234=item L<Lingua::Stem::Any> 235 236=back 237 238=head1 COPYRIGHT AND LICENSE 239 240Copyright (C) 2003 by Aldo Calpini <dada@perl.it> 241 242Copyright (C) 2004 by Aleksandr Guidrevitch <pillgrim@mail.ru> 243 244This is free software; you can redistribute it and/or modify it under 245the same terms as the Perl 5 programming language system itself. 246 247=cut 248