1package Lingua::Stem::Snowball::No;
2use strict;
3use bytes;
4# $Id: No.pm,v 1.1 2007/05/07 11:35:26 ask Exp $
5# $Source: /opt/CVS/NoSnowball/lib/Lingua/Stem/Snowball/No.pm,v $
6# $Author: ask $
7# $HeadURL$
8# $Revision: 1.1 $
9# $Date: 2007/05/07 11:35:26 $
10# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
11# Lingua::Stem::Snowball::No - Norwegian stemmer
12# :: based upon the norwegian stemmer algorithm at snowball.tartarus.org
13#	 by Martin Porter.
14# (c) 2001-2007 Ask Solem Hoel <ask@0x61736b.net>
15#
16#   This program is free software; you can redistribute it and/or modify
17#   it under the terms of the GNU General Public License version 2,
18#   *NOT* "earlier versions", as published by the Free Software Foundation.
19#
20#   This program is distributed in the hope that it will be useful,
21#   but WITHOUT ANY WARRANTY; without even the implied warranty of
22#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23#   GNU General Public License for more details.
24#
25#   You should have received a copy of the GNU General Public License
26#   along with this program; if not, write to the Free Software
27#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
28# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
29#####
30
31
32use vars qw($VERSION);
33$VERSION = 1.2;
34
35my %cache = ( );
36
37# special characters
38my $ae = "\xE6";
39my $ao = "\xE5";
40my $oe = "\xF8";
41
42# delete the s if a "s ending" is preceeded by one
43# of these characters.
44my $s_ending = "bcdfghjklmnoprtvyz";
45
46# norwegian vowels.
47my $vowels = "aeiouy$ae$ao$oe";
48
49# ####
50# the endings in step 1
51# XXX: these must be sorted by length
52# to save time we've done it already, you can do it like this:
53#	my $bylength = sub {
54#		length $a <=> length $b;
55#	}
56#	@endings = reverse sort $bylength @endings;
57my @endings = qw/
58	hetenes hetens hetene endes heter heten enes edes ende erte
59	ande ast het ets ers ert ens ene ane ede et es as er ar en
60	e a s
61/;
62
63# the endings in step 2
64# XXX: these must be sorted by length, like @endings in step 1.
65my @endings2 = qw/
66	hetslov slov elov elig eleg els lig eig lov leg ig
67/;
68
69sub new {
70	my $class = shift;
71	my %arg = @_;
72	my $self = { };
73
74	bless $self, $class;
75	if ($arg{use_cache}) {
76		$self->use_cache(1);
77	}
78
79	return $self;
80}
81
82sub use_cache {
83	my ($self, $use_cache) = @_;
84	if ($use_cache) {
85		$self->{USE_CACHE} = 1;
86	}
87	return $self->{USE_CACHE};
88}
89
90sub stem {
91	my ($self, $word) = @_;
92    no warnings;
93	$word = lc $word;
94	$word =~ y/\xC6/\xE6/;
95	$word =~ y/\xD8/\xF8/;
96	$word =~ y/\xC5/\xE5/;
97	my $orig_word;
98
99	if ($self->use_cache( )) {
100		$orig_word = $word;
101		my $cached_word = $cache{$word};
102		return $cached_word if $cached_word;
103	}
104
105	my ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
106	return $word unless $lslen >= 3;
107
108	# ### STEP 1
109	# only need to refresh wlen each time we change the word.
110	foreach my $ending (@endings)  {
111		my $endinglen = length $ending; # do this once.
112
113		# only continue if the word has this ending at all.
114		if(substr($rs, $rslen - $endinglen, $rslen) eq $ending) {
115			# replace erte and ert with er
116			if($ending eq 'erte' || $ending eq 'ert') { # c)
117					$word = substr($word, 0, $wlen - $endinglen);
118					$word .= "er";
119					($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
120					last;
121			}
122			elsif($ending eq 's') { # b)
123				# check if it has a valid "s ending"...
124				my $valid_s_ending = 0;
125				if($rslen == 1) {
126					my $wmr1 = substr($word, 0, $wlen - $rslen);
127					if($wmr1 =~ /[$s_ending]$/o) {
128						$valid_s_ending = 1;
129					}
130				}
131				else {
132					if(substr($rs, $rslen - 2, $rslen - 1) =~ /[$s_ending]/o) {
133						$valid_s_ending = 1;
134					}
135				}
136				if($valid_s_ending) {
137					# ...delete the last character (which is a s)
138					$word = substr($word, 0, $wlen - 1);
139					($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
140					last;
141				}
142			}
143			else { # a)
144				# delete the ending.
145				$word = substr($word, 0, $wlen - $endinglen);
146				($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
147				last;
148			}
149		}
150	}
151	return $word unless $lslen >= 3;
152
153	# ### STEP 2
154	my $ending = substr($rs, $rslen - 2, $rslen);
155	if($ending eq 'dt' || $ending eq 'vt') {
156		$word = substr($word, 0, $wlen - 1);
157		($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
158	}
159	return $word unless $lslen >= 3;
160
161	# ### STEP 3
162	foreach my $ending (@endings2) {
163		if($rs =~ /\Q$ending\E$/) {
164			$word = substr($word, 0, $wlen - length($ending));
165			last;
166		}
167	}
168
169	if($self->use_cache()) {
170		$cache{$orig_word} = $word;
171	}
172
173	return $word;
174}
175
176sub getsides {
177    my $word = shift;
178    no warnings;
179    my $wlen = length $word;
180
181    my($ls, $rs) = (undef, undef); # left side and right side.
182
183    # ###
184    # find the first vowel with a non-vowel after it.
185    my($found_vowel, $nonv_position, $curpos) = (-1, -1, 0);
186    foreach(split//, $word) {
187        if($found_vowel> 0) {
188			if(/[^$vowels]/o) {
189				if($curpos > 0) {
190				$nonv_position = $curpos + 1;
191				last;
192				}
193			}
194        }
195        if(/[$vowels]/o) {
196            $found_vowel = 1;
197        }
198        $curpos++;
199    }
200
201	# got nothing: return false
202	return undef if $nonv_position < 0;
203
204    # ###
205    # length of the left side must be atleast 3 chars.
206    my $leftlen = $wlen - ($wlen - $nonv_position);
207    if($leftlen < 3) {
208        $ls = substr($word, 0, 3);
209        $rs = substr($word, 3, $wlen);
210    }
211    else {
212        $ls = substr($word, 0, $leftlen);
213        $rs = substr($word, $nonv_position, $wlen);
214    }
215    return($ls, $rs, $wlen, length $ls, length $rs);
216}
217
2181;
219
220__END__
221
222=head1 NAME
223
224Lingua::Stem::Snowball::No - Porters stemming algorithm for Norwegian
225
226=head1 VERSION
227
228This document describes version 1.1.
229
230=head1 SYNOPSIS
231
232  use Lingua::Stem::Snowball::No
233  my $stemmer = new Lingua::Stem::Snowball::No (use_cache => 1);
234
235  foreach my $word (@words) {
236	my $stemmed = $stemmer->stem($word);
237	print $stemmed, "\n";
238  }
239
240=head1 DESCRIPTION
241
242The stem function takes a scalar as a parameter and stems the word
243according to Martin Porters Norwegian stemming algorithm,
244which can be found at the Snowball website: L<http://snowball.tartarus.org/>.
245
246It also supports caching if you pass the use_cache option when constructing
247a new L:S:S:N object.
248
249=head2 EXPORT
250
251Lingua::Stem::Snowball::No has nothing to export.
252
253=head1 AUTHOR
254
255Ask Solem Hoel, E<lt>ask@0x61736b.netE<gt>
256
257=head1 SEE ALSO
258
259L<perl>. L<Lingua::Stem::Snowball>. L<Lingua::Stem>. L<http://snowball.tartarus.org>.
260L<Lingua::Stem::Snowball> L<Lingua::Stem::Snowball::Se> L<Lingua::Stem::Snowball::Da>
261
262=head1 LICENSE AND COPYRIGHT
263
264Copyright (c), 2007 Ask Solem C<< ask@0x61736b.net >>.
265
266All rights reserved.
267
268This library is free software; you can redistribute it and/or modify
269it under the same terms as Perl itself, either Perl version 5.8.6 or,
270at your option, any later version of Perl 5 you may have available.
271
272=head1 DISCLAIMER OF WARRANTY
273
274BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE
275SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE
276STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE
277SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED,
278INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
279FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND
280PERFORMANCE OF THE SOFTWARE IS WITH YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE,
281YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR, OR CORRECTION.
282
283IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY
284COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE
285SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE TO YOU FOR DAMAGES,
286INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING
287OUT OF THE USE OR INABILITY TO USE THE SOFTWARE (INCLUDING BUT NOT LIMITED TO
288LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR
289THIRD PARTIES OR A FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER
290SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
291POSSIBILITY OF SUCH DAMAGES.
292
293=cut
294~
295
296