1package Lingua::Stem::Ru;
2$Lingua::Stem::Ru::VERSION = '0.04';
3use 5.006;
4use strict;
5use warnings;
6use Exporter 5.57 'import';
7use Carp;
8
9our @EXPORT      = ();
10our @EXPORT_OK   = qw (stem stem_word clear_stem_cache stem_caching);
11our %EXPORT_TAGS = ();
12
13my $Stem_Caching  = 0;
14my $Stem_Cache    = {};
15
16my $VOWEL        = qr/���������/;
17my $PERFECTIVEGROUND = qr/((��|����|������|��|����|������)|((?<=[��])(�|���|�����)))$/;
18my $REFLEXIVE    = qr/(�[��])$/;
19my $ADJECTIVE    = qr/(��|��|��|��|���|���|��|��|��|��|��|��|��|��|���|���|���|���|��|��|��|��|��|��|��|��)$/;
20my $PARTICIPLE   = qr/((���|���|���)|((?<=[��])(��|��|��|��|�)))$/;
21my $VERB         = qr/((���|���|���|����|����|���|���|���|��|��|��|��|��|��|��|���|���|���|��|���|���|��|��|���|���|���|���|��|�)|((?<=[��])(��|��|���|���|��|�|�|��|�|��|��|��|��|��|��|���|���)))$/;
22my $NOUN         = qr/(�|��|��|��|��|�|����|���|���|��|��|�|���|��|��|��|�|���|��|���|��|��|��|�|�|��|���|��|�|�|��|��|�|��|��|�)$/;
23my $RVRE         = qr/^(.*?[$VOWEL])(.*)$/;
24my $DERIVATIONAL = qr/[^$VOWEL][$VOWEL]+[^$VOWEL]+[$VOWEL].*(?<=�)���?$/;
25
26sub stem {
27    return [] if ($#_ == -1);
28    my $parm_ref;
29    if (ref $_[0]) {
30        $parm_ref = shift;
31    } else {
32        $parm_ref = { @_ };
33    }
34
35    my $words      = [];
36    my $locale     = 'ru';
37    my $exceptions = {};
38    foreach (keys %$parm_ref) {
39        my $key = lc ($_);
40        if ($key eq '-words') {
41            @$words = @{$parm_ref->{$key}};
42        } elsif ($key eq '-exceptions') {
43            $exceptions = $parm_ref->{$key};
44        } elsif ($key eq '-locale') {
45            $locale = $parm_ref->{$key};
46        } else {
47            croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
48        }
49    }
50
51    local( $_ );
52    foreach (@$words) {
53        # Flatten case
54        $_ = lc $_;
55
56        # Check against exceptions list
57        if (exists $exceptions->{$_}) {
58            $_ = $exceptions->{$_};
59            next;
60        }
61
62        # Check against cache of stemmed words
63        my $original_word = $_;
64        if ($Stem_Caching && exists $Stem_Cache->{$original_word}) {
65            $_ = $Stem_Cache->{$original_word};
66            next;
67        }
68
69        $_ = stem_word($_);
70
71        $Stem_Cache->{$original_word} = $_ if $Stem_Caching;
72    }
73    $Stem_Cache = {} if ($Stem_Caching < 2);
74
75    return $words;
76}
77
78sub stem_word {
79    my $word = lc shift;
80
81    # Check against cache of stemmed words
82    if ($Stem_Caching && exists $Stem_Cache->{$word}) {
83        return $Stem_Cache->{$word};
84    }
85
86     my ($start, $RV) = $word =~ /$RVRE/;
87     return $word unless $RV;
88
89     # Step 1
90     unless ($RV =~ s/$PERFECTIVEGROUND//) {
91         $RV =~ s/$REFLEXIVE//;
92
93         if ($RV =~ s/$ADJECTIVE//) {
94             $RV =~ s/$PARTICIPLE//;
95         } else {
96             $RV =~ s/$NOUN// unless $RV =~ s/$VERB//;
97         }
98     }
99
100     # Step 2
101     $RV =~ s/�$//;
102
103     # Step 3
104     $RV =~ s/����?$// if $RV =~ /$DERIVATIONAL/;
105
106     # Step 4
107     unless ($RV =~ s/�$//) {
108         $RV =~ s/����?//;
109         $RV =~ s/��$/�/;
110     }
111
112     return $start.$RV;
113}
114
115sub stem_caching {
116    my $parm_ref;
117    if (ref $_[0]) {
118        $parm_ref = shift;
119    } else {
120        $parm_ref = { @_ };
121    }
122    my $caching_level = $parm_ref->{-level};
123    if (defined $caching_level) {
124        if ($caching_level !~ m/^[012]$/) {
125            croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
126        }
127        $Stem_Caching = $caching_level;
128    }
129    return $Stem_Caching;
130}
131
132sub clear_stem_cache {
133    $Stem_Cache = {};
134}
135
136
1371;
138__END__
139
140=head1 NAME
141
142Lingua::Stem::Ru - Porter's stemming algorithm for Russian (KOI8-R only)
143
144=head1 SYNOPSIS
145
146    use Lingua::Stem::Ru;
147    my $stems = Lingua::Stem::Ru::stem({ -words => $word_list_reference,
148                                         -locale => 'ru',
149                                         -exceptions => $exceptions_hash,
150                                      });
151
152    my $stem = Lingua::Stem::Ru::stem_word( $word );
153
154=head1 DESCRIPTION
155
156This module applies the Porter Stemming Algorithm to its parameters,
157returning the stemmed words.
158
159The algorithm is implemented exactly as described in:
160
161    http://snowball.tartarus.org/algorithms/russian/stemmer.html
162
163The code is carefully crafted to work in conjunction with the L<Lingua::Stem>
164module by Benjamin Franz. This stemmer is also based
165on the work of Aldo Capini, see L<Lingua::Stem::It>.
166
167=head1 METHODS
168
169=over 4
170
171=item stem({ -words => \@words, -locale => 'ru', -exceptions => \%exceptions });
172
173Stems a list of passed words. Returns an anonymous list reference to the stemmed
174words.
175
176Example:
177
178  my $stemmed_words = Lingua::Stem::Ru::stem({ -words => \@words,
179                                              -locale => 'ru',
180                                          -exceptions => \%exceptions,
181                          });
182
183=item stem_word( $word );
184
185Stems a single word and returns the stem directly.
186
187Example:
188
189  my $stem = Lingua::Stem::Ru::stem_word( $word );
190
191=item stem_caching({ -level => 0|1|2 });
192
193Sets the level of stem caching.
194
195'0' means 'no caching'. This is the default level.
196
197'1' means 'cache per run'. This caches stemming results during a single
198    call to 'stem'.
199
200'2' means 'cache indefinitely'. This caches stemming results until
201    either the process exits or the 'clear_stem_cache' method is called.
202
203=item clear_stem_cache;
204
205Clears the cache of stemmed words
206
207=back
208
209=cut
210
211=head2 EXPORT
212
213None by default.
214
215=head1 AUTHOR
216
217Aleksandr Guidrevitch <pillgrim@mail.ru>
218
219=head1 REPOSITORY
220
221L<https://github.com/neilb/Lingua-Stem-Ru>
222
223=head1 SEE ALSO
224
225=over
226
227=item L<Lingua::Stem>
228
229provides an interface for some other pure Perl stemmers available
230on CPAN, including L<Lingua::Stem::Ru>
231
232=item L<Lingua::Stem::Snowball>
233
234=item L<Lingua::Stem::Any>
235
236=back
237
238=head1 COPYRIGHT AND LICENSE
239
240Copyright (C) 2003 by Aldo Calpini <dada@perl.it>
241
242Copyright (C) 2004 by Aleksandr Guidrevitch <pillgrim@mail.ru>
243
244This is free software; you can redistribute it and/or modify it under
245the same terms as the Perl 5 programming language system itself.
246
247=cut
248