1package Lingua::Stem::Snowball::No; 2use strict; 3use bytes; 4# $Id: No.pm,v 1.1 2007/05/07 11:35:26 ask Exp $ 5# $Source: /opt/CVS/NoSnowball/lib/Lingua/Stem/Snowball/No.pm,v $ 6# $Author: ask $ 7# $HeadURL$ 8# $Revision: 1.1 $ 9# $Date: 2007/05/07 11:35:26 $ 10# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 11# Lingua::Stem::Snowball::No - Norwegian stemmer 12# :: based upon the norwegian stemmer algorithm at snowball.tartarus.org 13# by Martin Porter. 14# (c) 2001-2007 Ask Solem Hoel <ask@0x61736b.net> 15# 16# This program is free software; you can redistribute it and/or modify 17# it under the terms of the GNU General Public License version 2, 18# *NOT* "earlier versions", as published by the Free Software Foundation. 19# 20# This program is distributed in the hope that it will be useful, 21# but WITHOUT ANY WARRANTY; without even the implied warranty of 22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23# GNU General Public License for more details. 24# 25# You should have received a copy of the GNU General Public License 26# along with this program; if not, write to the Free Software 27# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 28# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 29##### 30 31 32use vars qw($VERSION); 33$VERSION = 1.2; 34 35my %cache = ( ); 36 37# special characters 38my $ae = "\xE6"; 39my $ao = "\xE5"; 40my $oe = "\xF8"; 41 42# delete the s if a "s ending" is preceeded by one 43# of these characters. 44my $s_ending = "bcdfghjklmnoprtvyz"; 45 46# norwegian vowels. 47my $vowels = "aeiouy$ae$ao$oe"; 48 49# #### 50# the endings in step 1 51# XXX: these must be sorted by length 52# to save time we've done it already, you can do it like this: 53# my $bylength = sub { 54# length $a <=> length $b; 55# } 56# @endings = reverse sort $bylength @endings; 57my @endings = qw/ 58 hetenes hetens hetene endes heter heten enes edes ende erte 59 ande ast het ets ers ert ens ene ane ede et es as er ar en 60 e a s 61/; 62 63# the endings in step 2 64# XXX: these must be sorted by length, like @endings in step 1. 65my @endings2 = qw/ 66 hetslov slov elov elig eleg els lig eig lov leg ig 67/; 68 69sub new { 70 my $class = shift; 71 my %arg = @_; 72 my $self = { }; 73 74 bless $self, $class; 75 if ($arg{use_cache}) { 76 $self->use_cache(1); 77 } 78 79 return $self; 80} 81 82sub use_cache { 83 my ($self, $use_cache) = @_; 84 if ($use_cache) { 85 $self->{USE_CACHE} = 1; 86 } 87 return $self->{USE_CACHE}; 88} 89 90sub stem { 91 my ($self, $word) = @_; 92 no warnings; 93 $word = lc $word; 94 $word =~ y/\xC6/\xE6/; 95 $word =~ y/\xD8/\xF8/; 96 $word =~ y/\xC5/\xE5/; 97 my $orig_word; 98 99 if ($self->use_cache( )) { 100 $orig_word = $word; 101 my $cached_word = $cache{$word}; 102 return $cached_word if $cached_word; 103 } 104 105 my ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word); 106 return $word unless $lslen >= 3; 107 108 # ### STEP 1 109 # only need to refresh wlen each time we change the word. 110 foreach my $ending (@endings) { 111 my $endinglen = length $ending; # do this once. 112 113 # only continue if the word has this ending at all. 114 if(substr($rs, $rslen - $endinglen, $rslen) eq $ending) { 115 # replace erte and ert with er 116 if($ending eq 'erte' || $ending eq 'ert') { # c) 117 $word = substr($word, 0, $wlen - $endinglen); 118 $word .= "er"; 119 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word); 120 last; 121 } 122 elsif($ending eq 's') { # b) 123 # check if it has a valid "s ending"... 124 my $valid_s_ending = 0; 125 if($rslen == 1) { 126 my $wmr1 = substr($word, 0, $wlen - $rslen); 127 if($wmr1 =~ /[$s_ending]$/o) { 128 $valid_s_ending = 1; 129 } 130 } 131 else { 132 if(substr($rs, $rslen - 2, $rslen - 1) =~ /[$s_ending]/o) { 133 $valid_s_ending = 1; 134 } 135 } 136 if($valid_s_ending) { 137 # ...delete the last character (which is a s) 138 $word = substr($word, 0, $wlen - 1); 139 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word); 140 last; 141 } 142 } 143 else { # a) 144 # delete the ending. 145 $word = substr($word, 0, $wlen - $endinglen); 146 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word); 147 last; 148 } 149 } 150 } 151 return $word unless $lslen >= 3; 152 153 # ### STEP 2 154 my $ending = substr($rs, $rslen - 2, $rslen); 155 if($ending eq 'dt' || $ending eq 'vt') { 156 $word = substr($word, 0, $wlen - 1); 157 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word); 158 } 159 return $word unless $lslen >= 3; 160 161 # ### STEP 3 162 foreach my $ending (@endings2) { 163 if($rs =~ /\Q$ending\E$/) { 164 $word = substr($word, 0, $wlen - length($ending)); 165 last; 166 } 167 } 168 169 if($self->use_cache()) { 170 $cache{$orig_word} = $word; 171 } 172 173 return $word; 174} 175 176sub getsides { 177 my $word = shift; 178 no warnings; 179 my $wlen = length $word; 180 181 my($ls, $rs) = (undef, undef); # left side and right side. 182 183 # ### 184 # find the first vowel with a non-vowel after it. 185 my($found_vowel, $nonv_position, $curpos) = (-1, -1, 0); 186 foreach(split//, $word) { 187 if($found_vowel> 0) { 188 if(/[^$vowels]/o) { 189 if($curpos > 0) { 190 $nonv_position = $curpos + 1; 191 last; 192 } 193 } 194 } 195 if(/[$vowels]/o) { 196 $found_vowel = 1; 197 } 198 $curpos++; 199 } 200 201 # got nothing: return false 202 return undef if $nonv_position < 0; 203 204 # ### 205 # length of the left side must be atleast 3 chars. 206 my $leftlen = $wlen - ($wlen - $nonv_position); 207 if($leftlen < 3) { 208 $ls = substr($word, 0, 3); 209 $rs = substr($word, 3, $wlen); 210 } 211 else { 212 $ls = substr($word, 0, $leftlen); 213 $rs = substr($word, $nonv_position, $wlen); 214 } 215 return($ls, $rs, $wlen, length $ls, length $rs); 216} 217 2181; 219 220__END__ 221 222=head1 NAME 223 224Lingua::Stem::Snowball::No - Porters stemming algorithm for Norwegian 225 226=head1 VERSION 227 228This document describes version 1.1. 229 230=head1 SYNOPSIS 231 232 use Lingua::Stem::Snowball::No 233 my $stemmer = new Lingua::Stem::Snowball::No (use_cache => 1); 234 235 foreach my $word (@words) { 236 my $stemmed = $stemmer->stem($word); 237 print $stemmed, "\n"; 238 } 239 240=head1 DESCRIPTION 241 242The stem function takes a scalar as a parameter and stems the word 243according to Martin Porters Norwegian stemming algorithm, 244which can be found at the Snowball website: L<http://snowball.tartarus.org/>. 245 246It also supports caching if you pass the use_cache option when constructing 247a new L:S:S:N object. 248 249=head2 EXPORT 250 251Lingua::Stem::Snowball::No has nothing to export. 252 253=head1 AUTHOR 254 255Ask Solem Hoel, E<lt>ask@0x61736b.netE<gt> 256 257=head1 SEE ALSO 258 259L<perl>. L<Lingua::Stem::Snowball>. L<Lingua::Stem>. L<http://snowball.tartarus.org>. 260L<Lingua::Stem::Snowball> L<Lingua::Stem::Snowball::Se> L<Lingua::Stem::Snowball::Da> 261 262=head1 LICENSE AND COPYRIGHT 263 264Copyright (c), 2007 Ask Solem C<< ask@0x61736b.net >>. 265 266All rights reserved. 267 268This library is free software; you can redistribute it and/or modify 269it under the same terms as Perl itself, either Perl version 5.8.6 or, 270at your option, any later version of Perl 5 you may have available. 271 272=head1 DISCLAIMER OF WARRANTY 273 274BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE 275SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE 276STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE 277SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, 278INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 279FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND 280PERFORMANCE OF THE SOFTWARE IS WITH YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, 281YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR, OR CORRECTION. 282 283IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY 284COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE 285SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE TO YOU FOR DAMAGES, 286INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING 287OUT OF THE USE OR INABILITY TO USE THE SOFTWARE (INCLUDING BUT NOT LIMITED TO 288LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR 289THIRD PARTIES OR A FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER 290SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 291POSSIBILITY OF SUCH DAMAGES. 292 293=cut 294~ 295 296