package Lingua::GL::Stemmer;
$Lingua::GL::Stemmer::VERSION = '0.02';
use 5.006;
use strict;
use warnings;

# Accented letters used by the rule tables, written as single ISO-8859-1
# code points so the source file itself stays pure ASCII.
my $aa = "\xe1";    # a with acute
my $ea = "\xe9";    # e with acute
my $ia = "\xed";    # i with acute
my $oa = "\xf3";    # o with acute
my $ua = "\xfa";    # u with acute
my $at = "\xe3";    # a with tilde
my $ot = "\xf5";    # o with tilde
my $nt = "\xf1";    # n with tilde
my $ac = "\xe2";    # a with circumflex
my $ec = "\xea";    # e with circumflex
my $cc = "\xe7";    # c with cedilla

# Suffix rule tables.  Except for {accent}, every entry has the form
#
#     suffix => [ minimum_stem_length, replacement ]
#
# meaning: if the word ends in "suffix" and at least minimum_stem_length
# characters precede it, the suffix is replaced by "replacement".
my %rule;

# Plural reduction.
$rule{plural} = {
    "ns"      => [ 1, "n" ],
    "${ot}es" => [ 3, "${ot}n" ],
    "${at}es" => [ 1, "${at}o" ],
    "ais"     => [ 1, "al" ],
    "${ea}is" => [ 2, "el" ],
    "eis"     => [ 2, "el" ],
    "${oa}is" => [ 2, "ol" ],
    "ois"     => [ 2, "ol" ],
    "${ia}s"  => [ 2, "il" ],
    "les"     => [ 2, "l" ],
    "res"     => [ 3, "r" ],
    "s"       => [ 2, "" ],
};

# Feminine reduction.
$rule{femin} = {
    "ona"       => [ 3, "${oa}n" ],
    "oa"        => [ 3, "${oa}n" ],
    "ora"       => [ 3, "or" ],
    "na"        => [ 4, "no" ],
    "inha"      => [ 3, "inho" ],
    "i${nt}a"   => [ 3, "i${nt}o" ],
    "esa"       => [ 3, "${ea}s" ],
    "osa"       => [ 3, "oso" ],
    "${ia}aca"  => [ 3, "${ia}aco" ],
    "ica"       => [ 3, "ico" ],
    "ada"       => [ 3, "ado" ],
    "ida"       => [ 3, "ido" ],
    "${ia}da"   => [ 3, "ido" ],
    "ana"       => [ 2, "${aa}n" ],
    "${aa}ria"  => [ 3, "${aa}rio" ],
    "ima"       => [ 3, "imo" ],
    "iva"       => [ 3, "ivo" ],
    "eira"      => [ 3, "eiro" ],
    "${at}"     => [ 2, "${at}o" ],
    "${aa}"     => [ 2, "${at}n" ],
};

# Augmentative / diminutive / superlative reduction.
$rule{augment} = {
    "d${ia}ssimo"    => [ 5, '' ],
    "d${ia}simo"     => [ 5, '' ],
    "abil${ia}ssimo" => [ 5, '' ],
    "abil${ia}simo"  => [ 5, '' ],
    "${ia}ssimo"     => [ 3, '' ],
    "${ia}simo"      => [ 3, '' ],
    "${ea}simo"      => [ 3, '' ],
    "${ea}sima"      => [ 3, '' ],
    "${ea}rrimo"     => [ 4, '' ],
    "${ea}rrima"     => [ 4, '' ],
    "zinho"          => [ 2, '' ],
    "ci${nt}o"       => [ 2, '' ],
    "a${cc}o"        => [ 4, '' ],
    "a${cc}a"        => [ 4, '' ],
    "azo"            => [ 4, '' ],
    "aza"            => [ 4, '' ],
    "ad${at}o"       => [ 4, '' ],
    "acho"           => [ 2, '' ],
    "acha"           => [ 2, '' ],
    "adinho"         => [ 3, '' ],
    "adi${nt}o"      => [ 3, '' ],
    "alh${aa}m"      => [ 4, '' ],
    "alh${at}o"      => [ 4, '' ],
    "all${aa}n"      => [ 4, '' ],
    "allo"           => [ 4, '' ],
    "alla"           => [ 4, '' ],
    "z${at}o"        => [ 2, '' ],
    "z${oa}n"        => [ 2, '' ],
    "zom"            => [ 2, '' ],
    "${aa}n"         => [ 4, '' ],
    "${oa}n"         => [ 3, '' ],
    "${at}o"         => [ 3, '' ],
    "arra"           => [ 3, '' ],
    "astro"          => [ 3, '' ],
    "${aa}zio"       => [ 3, '' ],
    "echo"           => [ 3, '' ],
    "echa"           => [ 3, '' ],
    "edela"          => [ 3, '' ],
    "ela"            => [ 4, '' ],
    "elo"            => [ 4, '' ],
    "eta"            => [ 3, '' ],
    "ete"            => [ 3, '' ],
    "ica"            => [ 3, '' ],
    "id${at}o"       => [ 3, '' ],
    "quinho"         => [ 4, "c" ],
    "qui${nt}o"      => [ 4, "c" ],
    "uinho"          => [ 4, '' ],
    "ui${nt}o"       => [ 4, '' ],
    "inho"           => [ 3, '' ],
    "i${nt}o"        => [ 3, '' ],
    "ito"            => [ 3, '' ],
    "ocho"           => [ 4, '' ],
    "ocha"           => [ 4, '' ],
    "oide"           => [ 3, '' ],
    "ola"            => [ 3, '' ],
    "olo"            => [ 3, '' ],
    "ote"            => [ 3, '' ],
    "ota"            => [ 3, '' ],
    "u${cc}a"        => [ 4, '' ],
    "ucha"           => [ 3, '' ],
    "ucho"           => [ 3, '' ],
    "uco"            => [ 4, '' ],
    "uza"            => [ 4, '' ],
    "uxa"            => [ 3, '' ],
};

# Noun suffix reduction.
$rule{noun} = {
    "abilidade"    => [ 5, "" ],
    "${aa}bel"     => [ 2, "" ],
    "able"         => [ 2, "" ],
    "aci"          => [ 3, "" ],
    "a${cc}"       => [ 3, "" ],
    "adeiro"       => [ 3, "" ],
    "ador"         => [ 3, "" ],
    "ado"          => [ 2, "" ],
    "agem"         => [ 3, "" ],
    "age"          => [ 3, "" ],
    "alismo"       => [ 4, "" ],
    "al${ia}stico" => [ 3, "" ],
    "alista"       => [ 5, "" ],
    "alizado"      => [ 4, "" ],
    "alizaci"      => [ 5, "" ],
    "aliza${cc}"   => [ 5, "" ],
    "alizaz"       => [ 5, "" ],
    "al"           => [ 4, "" ],
    "ancia"        => [ 4, "" ],
    "${aa}ncia"    => [ 4, "" ],
    "${ac}ncia"    => [ 4, "" ],
    "ano"          => [ 4, "" ],
    "ante"         => [ 2, "" ],
    "ario"         => [ 3, "" ],
    "${aa}rio"     => [ 3, "" ],
    "${aa}stico"   => [ 4, "" ],
    "ativo"        => [ 4, "" ],
    "atizado"      => [ 4, "" ],
    "atizaci"      => [ 4, "" ],
    "atiza${cc}"   => [ 4, "" ],
    "atizaz"       => [ 4, "" ],
    "atoria"       => [ 5, "" ],
    "at${oa}ria"   => [ 5, "" ],
    "atorio"       => [ 3, "" ],
    "at${oa}rio"   => [ 3, "" ],
    "${aa}utico"   => [ 4, "" ],
    "ico"          => [ 4, "" ],
    "auta"         => [ 5, "" ],
    "${aa}vel"     => [ 2, "" ],
    "axe"          => [ 3, "" ],
    "az"           => [ 3, "" ],
    "bel"          => [ 5, "" ],
    "bil"          => [ 0, "vel" ],
    "ble"          => [ 5, "" ],
    "cionista"     => [ 5, "" ],
    "edeiro"       => [ 3, "" ],
    "eiro"         => [ 3, "" ],
    "edouro"       => [ 3, "" ],
    "edor"         => [ 3, "" ],
    "dor"          => [ 2, "" ],
    "encialista"   => [ 4, "" ],
    "encial"       => [ 5, "" ],
    "${ec}ncia"    => [ 3, "" ],
    "encia"        => [ 3, "" ],
    "${ea}ncia"    => [ 3, "" ],
    "ense"         => [ 3, "" ],
    "ente"         => [ 4, "" ],
    "erio"         => [ 6, "" ],
    "${ea}rio"     => [ 6, "" ],
    "esco"         => [ 4, "" ],
    "${ec}utico"   => [ 4, "" ],
    "${ea}utico"   => [ 4, "" ],
    "eza"          => [ 3, "" ],
    "ez"           => [ 4, "" ],
    "${ia}aco"     => [ 3, "" ],
    "ial"          => [ 3, "" ],
    "iamento"      => [ 4, "" ],
    "amento"       => [ 3, "" ],
    "imento"       => [ 3, "" ],
    "emento"       => [ 3, "" ],
    "mento"        => [ 6, "" ],
    "${ia}bel"     => [ 5, "" ],
    "ible"         => [ 5, "" ],
    "icionista"    => [ 4, "" ],
    "iza${cc}"     => [ 5, "" ],
    "izaci"        => [ 5, "" ],
    "izaz"         => [ 5, "" ],
    "ice"          => [ 4, "" ],
    "ici"          => [ 3, "" ],
    "i${cc}"       => [ 3, "" ],
    "iz"           => [ 3, "" ],
    "idade"        => [ 4, "" ],
    "ideiro"       => [ 3, "" ],
    "ideira"       => [ 3, "" ],
    "ido"          => [ 3, "" ],
    "idor"         => [ 4, "" ],
    "inal"         => [ 3, "" ],
    "ional"        => [ 4, "" ],
    "ionar"        => [ 5, "" ],
    "ionista"      => [ 5, "" ],
    "ismo"         => [ 3, "" ],
    "ista"         => [ 3, "" ],
    "${ia}vel"     => [ 5, "" ],
    "ividade"      => [ 5, "" ],
    "ivo"          => [ 4, "" ],
    "izado"        => [ 5, "" ],
    "or"           => [ 3, "" ],
    "oria"         => [ 3, "" ],
    "or${ia}a"     => [ 4, "" ],
    "oso"          => [ 3, "" ],
    "queiro"       => [ 3, "c" ],
    "quice"        => [ 4, "c" ],
    "rio"          => [ 5, "" ],
    "sor"          => [ 2, "" ],
    "tico"         => [ 3, "" ],
    "tivo"         => [ 4, "" ],
    "tizado"       => [ 4, "" ],
    "tiza${cc}"    => [ 5, "" ],
    "tizaci"       => [ 5, "" ],
    "tizaz"        => [ 5, "" ],
    "tor"          => [ 5, "" ],
    "ual"          => [ 3, "" ],
    "uoso"         => [ 3, "" ],
    "ura"          => [ 4, "" ],
    "vel"          => [ 5, "" ],
};

# Verb suffix reduction.
$rule{verb} = {
    "aba"          => [ 2, "" ],
    "abade"        => [ 2, "" ],
    "${aa}bade"    => [ 2, "" ],
    "abamo"        => [ 2, "" ],
    "${aa}bamo"    => [ 2, "" ],
    "aban"         => [ 2, "" ],
    "ache"         => [ 2, "" ],
    "ade"          => [ 2, "" ],
    "ai"           => [ 2, "" ],
    "am"           => [ 2, "" ],
    "amo"          => [ 2, "" ],
    "an"           => [ 2, "" ],
    "ando"         => [ 2, "" ],
    "ar"           => [ 2, "" ],
    "ara"          => [ 2, "" ],
    "ar${aa}"      => [ 2, "" ],
    "arade"        => [ 2, "" ],
    "${aa}rade"    => [ 2, "" ],
    "aram"         => [ 2, "" ],
    "ar${aa}m"     => [ 2, "" ],
    "aramo"        => [ 2, "" ],
    "${aa}ramo"    => [ 2, "" ],
    "ar${aa}n"     => [ 2, "" ],
    "ar${at}o"     => [ 2, "" ],
    "arde"         => [ 2, "" ],
    "are"          => [ 2, "" ],
    "arei"         => [ 2, "" ],
    "${aa}rei"     => [ 2, "" ],
    "arem"         => [ 2, "" ],
    "aremo"        => [ 2, "" ],
    "aria"         => [ 2, "" ],
    "ar${ia}a"     => [ 2, "" ],
    "ariade"       => [ 2, "" ],
    "ar${ia}ade"   => [ 2, "" ],
    "ariam"        => [ 2, "" ],
    "ariamo"       => [ 2, "" ],
    "ar${ia}amo"   => [ 2, "" ],
    "ar${ia}ei"    => [ 2, "" ],
    "armo"         => [ 2, "" ],
    "${aa}rom"     => [ 2, "" ],
    "aron"         => [ 2, "" ],
    "ase"          => [ 2, "" ],
    "asede"        => [ 2, "" ],
    "${aa}sede"    => [ 2, "" ],
    "asemo"        => [ 2, "" ],
    "${aa}semo"    => [ 2, "" ],
    "asen"         => [ 2, "" ],
    "asse"         => [ 2, "" ],
    "${aa}ssei"    => [ 2, "" ],
    "assem"        => [ 2, "" ],
    "${aa}ssemo"   => [ 2, "" ],
    "aste"         => [ 2, "" ],
    "ava"          => [ 2, "" ],
    "avam"         => [ 2, "" ],
    "${aa}vamo"    => [ 2, "" ],
    "avan"         => [ 2, "" ],
    "${aa}vei"     => [ 2, "" ],
    "ear"          => [ 4, "" ],
    "ede"          => [ 1, "" ],
    "ei"           => [ 3, "" ],
    "em"           => [ 2, "" ],
    "emo"          => [ 2, "" ],
    "en"           => [ 2, "" ],
    "endo"         => [ 1, "" ],
    "eou"          => [ 5, "" ],
    "er"           => [ 1, "" ],
    "era"          => [ 1, "" ],
    "er${aa}"      => [ 1, "" ],
    "erade"        => [ 1, "" ],
    "${ea}rade"    => [ 1, "" ],
    "eram"         => [ 1, "" ],
    "er${aa}m"     => [ 1, "" ],
    "eramo"        => [ 1, "" ],
    "${ea}ramo"    => [ 1, "" ],
    "${ec}ramo"    => [ 1, "" ],
    "er${aa}n"     => [ 1, "" ],
    "er${at}o"     => [ 1, "" ],
    "erde"         => [ 1, "" ],
    "ere"          => [ 1, "" ],
    "erei"         => [ 1, "" ],
    "${ec}rei"     => [ 1, "" ],
    "erem"         => [ 1, "" ],
    "eremo"        => [ 1, "" ],
    "eria"         => [ 1, "" ],
    "er${ia}a"     => [ 1, "" ],
    "eriade"       => [ 1, "" ],
    "er${ia}ade"   => [ 1, "" ],
    "eriam"        => [ 1, "" ],
    "eriamo"       => [ 1, "" ],
    "er${ia}amo"   => [ 1, "" ],
    "erian"        => [ 1, "" ],
    "er${ia}an"    => [ 1, "" ],
    "er${ia}ei"    => [ 1, "" ],
    "ermo"         => [ 1, "" ],
    "${ec}rom"     => [ 1, "" ],
    "eron"         => [ 1, "" ],
    "ese"          => [ 1, "" ],
    "esedes"       => [ 1, "" ],
    "${ea}sedes"   => [ 1, "" ],
    "esemo"        => [ 1, "" ],
    "${ea}semo"    => [ 1, "" ],
    "esen"         => [ 1, "" ],
    "esse"         => [ 1, "" ],
    "${ec}ssede"   => [ 1, "" ],
    "${ec}ssei"    => [ 1, "" ],
    "essem"        => [ 1, "" ],
    "${ec}ssemo"   => [ 1, "" ],
    "este"         => [ 1, "" ],
    "eu"           => [ 1, "" ],
    "guem"         => [ 1, "g" ],
    "i"            => [ 1, "" ],
    "ia"           => [ 1, "" ],
    "${ia}a"       => [ 1, "" ],
    "iade"         => [ 1, "" ],
    "${ia}ade"     => [ 1, "" ],
    "iam"          => [ 1, "" ],
    "iamo"         => [ 1, "" ],
    "${ia}amo"     => [ 1, "" ],
    "ian"          => [ 1, "" ],
    "${ia}an"      => [ 1, "" ],
    "iava"         => [ 1, "" ],
    "iche"         => [ 1, "" ],
    "ide"          => [ 1, "" ],
    "${ia}do"      => [ 3, "" ],
    "${ia}ei"      => [ 1, "" ],
    "im"           => [ 1, "" ],
    "imo"          => [ 3, "" ],
    "in"           => [ 3, "" ],
    "indo"         => [ 3, "" ],
    "iona"         => [ 3, "" ],
    "ir"           => [ 3, "" ],
    "ira"          => [ 3, "" ],
    "ir${aa}"      => [ 3, "" ],
    "irade"        => [ 3, "" ],
    "${ia}rade"    => [ 3, "" ],
    "iram"         => [ 3, "" ],
    "ir${aa}m"     => [ 3, "" ],
    "${ia}ram"     => [ 3, "" ],
    "iramo"        => [ 3, "" ],
    "${ia}ramo"    => [ 3, "" ],
    "ir${aa}n"     => [ 3, "" ],
    "ir${at}o"     => [ 2, "" ],
    "irde"         => [ 2, "" ],
    "ire"          => [ 3, "" ],
    "irei"         => [ 3, "" ],
    "irem"         => [ 3, "" ],
    "iremo"        => [ 3, "" ],
    "iria"         => [ 3, "" ],
    "ir${ia}a"     => [ 3, "" ],
    "iriade"       => [ 3, "" ],
    "ir${ia}ade"   => [ 3, "" ],
    "iriam"        => [ 3, "" ],
    "iriamo"       => [ 3, "" ],
    "ir${ia}amo"   => [ 3, "" ],
    "irian"        => [ 3, "" ],
    "ir${ia}an"    => [ 3, "" ],
    "ir${ia}ei"    => [ 3, "" ],
    "irmo"         => [ 3, "" ],
    "${ia}rom"     => [ 3, "" ],
    "iron"         => [ 3, "" ],
    "ise"          => [ 3, "" ],
    "isede"        => [ 3, "" ],
    "${ia}sede"    => [ 3, "" ],
    "isemo"        => [ 3, "" ],
    "${ia}semo"    => [ 3, "" ],
    "isen"         => [ 3, "" ],
    "isse"         => [ 3, "" ],
    "${ia}ssede"   => [ 3, "" ],
    "${ia}ssei"    => [ 3, "" ],
    "issem"        => [ 3, "" ],
    "${ia}ssemo"   => [ 3, "" ],
    "iste"         => [ 4, "" ],
    "itar"         => [ 5, "" ],
    "iu"           => [ 3, "" ],
    "izar"         => [ 3, "" ],
    "omo"          => [ 3, "" ],
    "ondo"         => [ 3, "" ],
    "ou"           => [ 3, "" ],
    "tizar"        => [ 4, "" ],
    "uei"          => [ 3, "" ],
    "u${ia}a"      => [ 5, "u" ],
};

# Accent removal: accented letter => plain ASCII letter.
$rule{accent} = {
    $aa => 'a',
    $ea => 'e',
    $ia => 'i',
    $oa => 'o',
    $ua => 'u',
    $at => 'a',
    $ot => 'o',
    $ec => 'e',
    $cc => 'c',
    $nt => 'n',
};

# Final vowel removal.
$rule{vowel} = {
    "bil" => [ 2, "vel" ],
    "gue" => [ 2, "g" ],
    "a"   => [ 3, "" ],
    "e"   => [ 3, "" ],
    "o"   => [ 3, "" ],
};

# Compile every suffix table once, at load time, into a list of
# [ pattern, replacement ] pairs ordered longest suffix first, so that the
# most specific suffix wins.  Two different suffixes of equal length can
# never both match the end of the same word, so the secondary string
# comparison only serves to make the order reproducible.
my %suffix_rules;
for my $set (qw(plural femin augment noun verb vowel)) {
    my $table = $rule{$set};
    my @ordered =
        sort { length($b) <=> length($a) or $a cmp $b } keys %{$table};

    my @compiled;
    for my $suffix (@ordered) {
        my ($min_stem, $replacement) = @{ $table->{$suffix} };
        push @compiled,
            [ qr/^(.{$min_stem,})${suffix}$/, $replacement ];
    }
    $suffix_rules{$set} = \@compiled;
}

# strip($cmd, $word)
#
# Applies one reduction step to $word and returns the result; $word itself
# is not modified.
#
#   $cmd eq 'accent'  replaces every accented letter listed in
#                     $rule{accent} by its unaccented counterpart.
#   $cmd eq 'adv'     removes a trailing "mente" when at least four
#                     characters precede it.
#   any other $cmd    names a suffix table (plural, femin, augment, noun,
#                     verb, vowel); the longest matching suffix whose
#                     minimum stem length is satisfied is replaced, and
#                     only that one rule is applied.  An unknown table
#                     name leaves the word unchanged.
sub strip {
    my ($cmd, $word) = @_;

    if ($cmd eq 'accent') {
        for my $accented (keys %{ $rule{accent} }) {
            $word =~ s/$accented/$rule{accent}{$accented}/g;
        }
    }
    elsif ($cmd eq 'adv') {
        # Anchored at the end: "mente" is only an adverb suffix there, and
        # must not be cut out of the middle of a longer word.
        $word =~ s/(.{4,})mente$/$1/;
    }
    else {
        for my $entry (@{ $suffix_rules{$cmd} || [] }) {
            my ($pattern, $replacement) = @{$entry};

            # First (i.e. longest) applicable suffix wins.
            last if $word =~ s/$pattern/$1$replacement/;
        }
    }

    return $word;
}

# stem(@words) / stem(\@words)
#
# Stems each word by running the reduction steps in a fixed order:
# plural (only for words ending in "s"), feminine (only for words ending
# in "a"), then augment, adv, noun, verb, vowel and accent.
#
# Returns the list of stems in list context and a reference to that list
# in scalar context.  The input words are not modified.
sub stem {
    my @words = ref($_[0]) ? @{ $_[0] } : @_;

    my @stems;
    for my $original (@words) {
        my $word = $original;

        $word = strip('plural', $word) if $word =~ /s$/;
        $word = strip('femin',  $word) if $word =~ /a$/;

        for my $step (qw(augment adv noun verb vowel accent)) {
            $word = strip($step, $word);
        }

        push @stems, $word;
    }

    return wantarray ? @stems : \@stems;
}

1;
__END__

=head1 NAME

Lingua::GL::Stemmer - Galician Stemmer

=head1 SYNOPSIS

    use Lingua::GL::Stemmer;

    Lingua::GL::Stemmer::stem(\@words);

    # or

    Lingua::GL::Stemmer::stem(@words);

=head1 DESCRIPTION

Galician is an endangered language spoken in the northwest region of Spain.
Galician is morphologically similar to Portuguese, but its phonetics differ
greatly. Because of this morphological similarity, the Portuguese stemming
algorithm can be adapted to stem Galician texts.

See L<Lingua::PT::Stemmer> for a sketch of the stemming algorithm, and
L<http://bvg.udc.es/recursos_lingua/stemming.html> for stemming rules.

=head1 FUNCTIONS

Nothing is exported; call the functions by their fully qualified names.

=head2 stem

    my @stems = Lingua::GL::Stemmer::stem(@words);
    my $stems = Lingua::GL::Stemmer::stem(\@words);

Takes either a list of words or a reference to an array of words. Returns
the stems as a list in list context and as an array reference in scalar
context. The input is left untouched.

The rule tables are written in lower case using ISO-8859-1 code points for
the accented letters, and no case folding is performed, so words should be
passed in lower case.

=head2 strip

    my $reduced = Lingua::GL::Stemmer::strip($step, $word);

Applies a single reduction step to C<$word> and returns the result. C<$step>
is one of C<plural>, C<femin>, C<augment>, C<adv>, C<noun>, C<verb>,
C<vowel> or C<accent>.

=head1 SEE ALSO

L<Lingua::PT::Stemmer>

Stemming rules
L<http://bvg.udc.es/recursos_lingua/stemming.html>

=head1 COPYRIGHT

xern E<lt>xern@cpan.orgE<gt>

This module is free software; you can redistribute it or modify it under the
same terms as Perl itself.

=cut