1package Lingua::GL::Stemmer;
2$Lingua::GL::Stemmer::VERSION = '0.02';
3use 5.006;
4use strict;
5use warnings;
# Accented Latin-1 letters referenced by the rule tables below.
# Naming scheme: <letter>a = acute, <letter>t = tilde, <letter>c = circumflex;
# $cc is c-cedilla.  Each variable holds a single character.
my ( $aa, $ea, $ia, $oa, $ua ) = map { chr $_ } ( 0xe1, 0xe9, 0xed, 0xf3, 0xfa );
my ( $at, $ot, $nt )           = map { chr $_ } ( 0xe3, 0xf5, 0xf1 );
my ( $ac, $ec )                = map { chr $_ } ( 0xe2, 0xea );
my $cc                         = chr 0xe7;

# Step name => rule table; filled in by the assignments that follow.
my %rule;
18
# Plural reduction.  Each entry maps a word ending to
# [ minimum number of characters that must precede it, replacement ].
$rule{plural} = {
    'ns'      => [ 1, 'n' ],
    "${ot}es" => [ 3, "${ot}n" ],
    "${at}es" => [ 1, "${at}o" ],
    'ais'     => [ 1, 'al' ],
    "${ea}is" => [ 2, 'el' ],
    'eis'     => [ 2, 'el' ],
    "${oa}is" => [ 2, 'ol' ],
    'ois'     => [ 2, 'ol' ],
    "${ia}s"  => [ 2, 'il' ],
    'les'     => [ 2, 'l' ],
    'res'     => [ 3, 'r' ],
    's'       => [ 2, '' ],
};
33
# Feminine reduction: feminine ending => [ minimum stem length, masculine ending ].
$rule{femin} = {
    'ona'       => [ 3, "${oa}n" ],
    'oa'        => [ 3, "${oa}n" ],
    'ora'       => [ 3, 'or' ],
    'na'        => [ 4, 'no' ],
    'inha'      => [ 3, 'inho' ],
    "i${nt}a"   => [ 3, "i${nt}o" ],
    'esa'       => [ 3, "${ea}s" ],
    'osa'       => [ 3, 'oso' ],
    "${ia}aca"  => [ 3, "${ia}aco" ],
    'ica'       => [ 3, 'ico' ],
    'ada'       => [ 3, 'ado' ],
    'ida'       => [ 3, 'ido' ],
    "${ia}da"   => [ 3, 'ido' ],
    'ana'       => [ 2, "${aa}n" ],
    "${aa}ria"  => [ 3, "${aa}rio" ],
    'ima'       => [ 3, 'imo' ],
    'iva'       => [ 3, 'ivo' ],
    'eira'      => [ 3, 'eiro' ],
    "${at}"     => [ 2, "${at}o" ],

    # NOTE(review): a-acute is rewritten to a-tilde + "n" here, while the
    # 'ana' rule above produces a-acute + "n" -- confirm which is intended.
    "${aa}"     => [ 2, "${at}n" ],
};
56
# Augmentative / diminutive / superlative reduction:
# ending => [ minimum stem length, replacement ].
$rule{augment} = {
    # superlatives
    "d${ia}ssimo"    => [ 5, '' ],
    "d${ia}simo"     => [ 5, '' ],
    "abil${ia}ssimo" => [ 5, '' ],
    "abil${ia}simo"  => [ 5, '' ],
    "${ia}ssimo"     => [ 3, '' ],
    "${ia}simo"      => [ 3, '' ],
    "${ea}simo"      => [ 3, '' ],
    "${ea}sima"      => [ 3, '' ],
    "${ea}rrimo"     => [ 4, '' ],
    "${ea}rrima"     => [ 4, '' ],

    # endings starting with "z", "c" or "a"
    'zinho'          => [ 2, '' ],
    "ci${nt}o"       => [ 2, '' ],
    "a${cc}o"        => [ 4, '' ],
    "a${cc}a"        => [ 4, '' ],
    'azo'            => [ 4, '' ],
    'aza'            => [ 4, '' ],
    "ad${at}o"       => [ 4, '' ],
    'acho'           => [ 2, '' ],
    'acha'           => [ 2, '' ],
    'adinho'         => [ 3, '' ],
    "adi${nt}o"      => [ 3, '' ],
    "alh${aa}m"      => [ 4, '' ],
    "alh${at}o"      => [ 4, '' ],
    "all${aa}n"      => [ 4, '' ],
    'allo'           => [ 4, '' ],
    'alla'           => [ 4, '' ],
    "z${at}o"        => [ 2, '' ],
    "z${oa}n"        => [ 2, '' ],
    'zom'            => [ 2, '' ],
    "${aa}n"         => [ 4, '' ],
    "${oa}n"         => [ 3, '' ],
    "${at}o"         => [ 3, '' ],
    'arra'           => [ 3, '' ],
    'astro'          => [ 3, '' ],
    "${aa}zio"       => [ 3, '' ],

    # endings starting with "e"
    'echo'           => [ 3, '' ],
    'echa'           => [ 3, '' ],
    'edela'          => [ 3, '' ],
    'ela'            => [ 4, '' ],
    'elo'            => [ 4, '' ],
    'eta'            => [ 3, '' ],
    'ete'            => [ 3, '' ],

    # endings starting with "i" or "qu"/"u" + "i"
    'ica'            => [ 3, '' ],
    "id${at}o"       => [ 3, '' ],
    'quinho'         => [ 4, 'c' ],
    "qui${nt}o"      => [ 4, 'c' ],
    'uinho'          => [ 4, '' ],
    "ui${nt}o"       => [ 4, '' ],
    'inho'           => [ 3, '' ],
    "i${nt}o"        => [ 3, '' ],
    'ito'            => [ 3, '' ],

    # endings starting with "o"
    'ocho'           => [ 4, '' ],
    'ocha'           => [ 4, '' ],
    'oide'           => [ 3, '' ],
    'ola'            => [ 3, '' ],
    'olo'            => [ 3, '' ],
    'ote'            => [ 3, '' ],
    'ota'            => [ 3, '' ],

    # endings starting with "u"
    "u${cc}a"        => [ 4, '' ],
    'ucha'           => [ 3, '' ],
    'ucho'           => [ 3, '' ],
    'uco'            => [ 4, '' ],
    'uza'            => [ 4, '' ],
    'uxa'            => [ 3, '' ],
};
123
124
# Noun / adjective suffix reduction:
# suffix => [ minimum stem length, replacement ].
$rule{noun} = {
    # suffixes starting with "a"
    'abilidade'    => [ 5, '' ],
    "${aa}bel"     => [ 2, '' ],
    'able'         => [ 2, '' ],
    'aci'          => [ 3, '' ],
    "a${cc}"       => [ 3, '' ],
    'adeiro'       => [ 3, '' ],
    'ador'         => [ 3, '' ],
    'ado'          => [ 2, '' ],
    'agem'         => [ 3, '' ],
    'age'          => [ 3, '' ],
    'alismo'       => [ 4, '' ],
    "al${ia}stico" => [ 3, '' ],
    'alista'       => [ 5, '' ],
    'alizado'      => [ 4, '' ],
    'alizaci'      => [ 5, '' ],
    "aliza${cc}"   => [ 5, '' ],
    'alizaz'       => [ 5, '' ],
    'al'           => [ 4, '' ],
    'ancia'        => [ 4, '' ],
    "${aa}ncia"    => [ 4, '' ],
    "${ac}ncia"    => [ 4, '' ],
    'ano'          => [ 4, '' ],
    'ante'         => [ 2, '' ],
    'ario'         => [ 3, '' ],
    "${aa}rio"     => [ 3, '' ],
    "${aa}stico"   => [ 4, '' ],
    'ativo'        => [ 4, '' ],
    'atizado'      => [ 4, '' ],
    'atizaci'      => [ 4, '' ],
    "atiza${cc}"   => [ 4, '' ],
    'atizaz'       => [ 4, '' ],
    'atoria'       => [ 5, '' ],
    "at${oa}ria"   => [ 5, '' ],
    'atorio'       => [ 3, '' ],
    "at${oa}rio"   => [ 3, '' ],
    "${aa}utico"   => [ 4, '' ],
    'ico'          => [ 4, '' ],
    'auta'         => [ 5, '' ],
    "${aa}vel"     => [ 2, '' ],
    'axe'          => [ 3, '' ],
    'az'           => [ 3, '' ],

    # suffixes starting with "b", "c", "d" or "e"
    'bel'          => [ 5, '' ],
    'bil'          => [ 0, 'vel' ],
    'ble'          => [ 5, '' ],
    'cionista'     => [ 5, '' ],
    'edeiro'       => [ 3, '' ],
    'eiro'         => [ 3, '' ],
    'edouro'       => [ 3, '' ],
    'edor'         => [ 3, '' ],
    'dor'          => [ 2, '' ],
    'encialista'   => [ 4, '' ],
    'encial'       => [ 5, '' ],
    "${ec}ncia"    => [ 3, '' ],
    'encia'        => [ 3, '' ],
    "${ea}ncia"    => [ 3, '' ],
    'ense'         => [ 3, '' ],
    'ente'         => [ 4, '' ],
    'erio'         => [ 6, '' ],
    "${ea}rio"     => [ 6, '' ],
    'esco'         => [ 4, '' ],
    "${ec}utico"   => [ 4, '' ],
    "${ea}utico"   => [ 4, '' ],
    'eza'          => [ 3, '' ],
    'ez'           => [ 4, '' ],

    # suffixes starting with "i" (plus the "-mento" family)
    "${ia}aco"     => [ 3, '' ],
    'ial'          => [ 3, '' ],
    'iamento'      => [ 4, '' ],
    'amento'       => [ 3, '' ],
    'imento'       => [ 3, '' ],
    'emento'       => [ 3, '' ],
    'mento'        => [ 6, '' ],
    "${ia}bel"     => [ 5, '' ],
    'ible'         => [ 5, '' ],
    'icionista'    => [ 4, '' ],
    "iza${cc}"     => [ 5, '' ],
    'izaci'        => [ 5, '' ],
    'izaz'         => [ 5, '' ],
    'ice'          => [ 4, '' ],
    'ici'          => [ 3, '' ],
    "i${cc}"       => [ 3, '' ],
    'iz'           => [ 3, '' ],
    'idade'        => [ 4, '' ],
    'ideiro'       => [ 3, '' ],
    'ideira'       => [ 3, '' ],
    'ido'          => [ 3, '' ],
    'idor'         => [ 4, '' ],
    'inal'         => [ 3, '' ],
    'ional'        => [ 4, '' ],
    'ionar'        => [ 5, '' ],
    'ionista'      => [ 5, '' ],
    'ismo'         => [ 3, '' ],
    'ista'         => [ 3, '' ],
    "${ia}vel"     => [ 5, '' ],
    'ividade'      => [ 5, '' ],
    'ivo'          => [ 4, '' ],
    'izado'        => [ 5, '' ],

    # remaining suffixes
    'or'           => [ 3, '' ],
    'oria'         => [ 3, '' ],
    "or${ia}a"     => [ 4, '' ],
    'oso'          => [ 3, '' ],
    'queiro'       => [ 3, 'c' ],
    'quice'        => [ 4, 'c' ],
    'rio'          => [ 5, '' ],
    'sor'          => [ 2, '' ],
    'tico'         => [ 3, '' ],
    'tivo'         => [ 4, '' ],
    'tizado'       => [ 4, '' ],
    "tiza${cc}"    => [ 5, '' ],
    'tizaci'       => [ 5, '' ],
    'tizaz'        => [ 5, '' ],
    'tor'          => [ 5, '' ],
    'ual'          => [ 3, '' ],
    'uoso'         => [ 3, '' ],
    'ura'          => [ 4, '' ],
    'vel'          => [ 5, '' ],
};
242
243
# Verb ending reduction: ending => [ minimum stem length, replacement ].
# Endings are listed without a final "s" because stem() runs the plural
# step (which removes that "s") before this table is consulted.
$rule{verb} = {
    # endings starting with "a"
    'aba'         => [ 2, '' ],
    'abade'       => [ 2, '' ],
    "${aa}bade"   => [ 2, '' ],
    'abamo'       => [ 2, '' ],
    "${aa}bamo"   => [ 2, '' ],
    'aban'        => [ 2, '' ],
    'ache'        => [ 2, '' ],
    'ade'         => [ 2, '' ],
    'ai'          => [ 2, '' ],
    'am'          => [ 2, '' ],
    'amo'         => [ 2, '' ],
    'an'          => [ 2, '' ],
    'ando'        => [ 2, '' ],
    'ar'          => [ 2, '' ],
    'ara'         => [ 2, '' ],
    "ar${aa}"     => [ 2, '' ],
    'arade'       => [ 2, '' ],
    "${aa}rade"   => [ 2, '' ],
    'aram'        => [ 2, '' ],
    "ar${aa}m"    => [ 2, '' ],
    'aramo'       => [ 2, '' ],
    "${aa}ramo"   => [ 2, '' ],
    "ar${aa}n"    => [ 2, '' ],
    "ar${at}o"    => [ 2, '' ],
    'arde'        => [ 2, '' ],
    'are'         => [ 2, '' ],
    'arei'        => [ 2, '' ],
    "${aa}rei"    => [ 2, '' ],
    'arem'        => [ 2, '' ],
    'aremo'       => [ 2, '' ],
    'aria'        => [ 2, '' ],
    "ar${ia}a"    => [ 2, '' ],
    'ariade'      => [ 2, '' ],
    "ar${ia}ade"  => [ 2, '' ],
    'ariam'       => [ 2, '' ],
    'ariamo'      => [ 2, '' ],
    "ar${ia}amo"  => [ 2, '' ],
    "ar${ia}ei"   => [ 2, '' ],
    'armo'        => [ 2, '' ],
    "${aa}rom"    => [ 2, '' ],
    'aron'        => [ 2, '' ],
    'ase'         => [ 2, '' ],
    'asede'       => [ 2, '' ],
    "${aa}sede"   => [ 2, '' ],
    'asemo'       => [ 2, '' ],
    "${aa}semo"   => [ 2, '' ],
    'asen'        => [ 2, '' ],
    'asse'        => [ 2, '' ],
    "${aa}ssei"   => [ 2, '' ],
    'assem'       => [ 2, '' ],
    "${aa}ssemo"  => [ 2, '' ],
    'aste'        => [ 2, '' ],
    'ava'         => [ 2, '' ],
    'avam'        => [ 2, '' ],
    "${aa}vamo"   => [ 2, '' ],
    'avan'        => [ 2, '' ],
    "${aa}vei"    => [ 2, '' ],

    # endings starting with "e"
    'ear'         => [ 4, '' ],
    'ede'         => [ 1, '' ],
    'ei'          => [ 3, '' ],
    'em'          => [ 2, '' ],
    'emo'         => [ 2, '' ],
    'en'          => [ 2, '' ],
    'endo'        => [ 1, '' ],
    'eou'         => [ 5, '' ],
    'er'          => [ 1, '' ],
    'era'         => [ 1, '' ],
    "er${aa}"     => [ 1, '' ],
    'erade'       => [ 1, '' ],
    "${ea}rade"   => [ 1, '' ],
    'eram'        => [ 1, '' ],
    "er${aa}m"    => [ 1, '' ],
    'eramo'       => [ 1, '' ],
    "${ea}ramo"   => [ 1, '' ],
    "${ec}ramo"   => [ 1, '' ],
    "er${aa}n"    => [ 1, '' ],
    "er${at}o"    => [ 1, '' ],
    'erde'        => [ 1, '' ],
    'ere'         => [ 1, '' ],
    'erei'        => [ 1, '' ],
    "${ec}rei"    => [ 1, '' ],
    'erem'        => [ 1, '' ],
    'eremo'       => [ 1, '' ],
    'eria'        => [ 1, '' ],
    "er${ia}a"    => [ 1, '' ],
    'eriade'      => [ 1, '' ],
    "er${ia}ade"  => [ 1, '' ],
    'eriam'       => [ 1, '' ],
    'eriamo'      => [ 1, '' ],
    "er${ia}amo"  => [ 1, '' ],
    'erian'       => [ 1, '' ],
    "er${ia}an"   => [ 1, '' ],
    "er${ia}ei"   => [ 1, '' ],
    'ermo'        => [ 1, '' ],
    "${ec}rom"    => [ 1, '' ],
    'eron'        => [ 1, '' ],
    'ese'         => [ 1, '' ],

    # The plural step has already removed the final "s" by the time stem()
    # reaches this table, so the "-sede" forms are the ones that can match
    # there (compare 'asede' / 'isede').  The "-sedes" spellings are kept
    # for callers that invoke strip('verb', ...) directly.
    'esede'       => [ 1, '' ],
    "${ea}sede"   => [ 1, '' ],
    'esedes'      => [ 1, '' ],
    "${ea}sedes"  => [ 1, '' ],

    'esemo'       => [ 1, '' ],
    "${ea}semo"   => [ 1, '' ],
    'esen'        => [ 1, '' ],
    'esse'        => [ 1, '' ],
    "${ec}ssede"  => [ 1, '' ],
    "${ec}ssei"   => [ 1, '' ],
    'essem'       => [ 1, '' ],
    "${ec}ssemo"  => [ 1, '' ],
    'este'        => [ 1, '' ],
    'eu'          => [ 1, '' ],

    # "gu" before "e" goes back to plain "g"
    'guem'        => [ 1, 'g' ],

    # endings starting with "i"
    'i'           => [ 1, '' ],
    'ia'          => [ 1, '' ],
    "${ia}a"      => [ 1, '' ],
    'iade'        => [ 1, '' ],
    "${ia}ade"    => [ 1, '' ],
    'iam'         => [ 1, '' ],
    'iamo'        => [ 1, '' ],
    "${ia}amo"    => [ 1, '' ],
    'ian'         => [ 1, '' ],
    "${ia}an"     => [ 1, '' ],
    'iava'        => [ 1, '' ],
    'iche'        => [ 1, '' ],
    'ide'         => [ 1, '' ],
    "${ia}do"     => [ 3, '' ],
    "${ia}ei"     => [ 1, '' ],
    'im'          => [ 1, '' ],
    'imo'         => [ 3, '' ],
    'in'          => [ 3, '' ],
    'indo'        => [ 3, '' ],
    'iona'        => [ 3, '' ],
    'ir'          => [ 3, '' ],
    'ira'         => [ 3, '' ],
    "ir${aa}"     => [ 3, '' ],
    'irade'       => [ 3, '' ],
    "${ia}rade"   => [ 3, '' ],
    'iram'        => [ 3, '' ],
    "ir${aa}m"    => [ 3, '' ],
    "${ia}ram"    => [ 3, '' ],
    'iramo'       => [ 3, '' ],
    "${ia}ramo"   => [ 3, '' ],
    "ir${aa}n"    => [ 3, '' ],
    "ir${at}o"    => [ 2, '' ],
    'irde'        => [ 2, '' ],
    'ire'         => [ 3, '' ],
    'irei'        => [ 3, '' ],
    'irem'        => [ 3, '' ],
    'iremo'       => [ 3, '' ],
    'iria'        => [ 3, '' ],
    "ir${ia}a"    => [ 3, '' ],
    'iriade'      => [ 3, '' ],
    "ir${ia}ade"  => [ 3, '' ],
    'iriam'       => [ 3, '' ],
    'iriamo'      => [ 3, '' ],
    "ir${ia}amo"  => [ 3, '' ],
    'irian'       => [ 3, '' ],
    "ir${ia}an"   => [ 3, '' ],
    "ir${ia}ei"   => [ 3, '' ],
    'irmo'        => [ 3, '' ],
    "${ia}rom"    => [ 3, '' ],
    'iron'        => [ 3, '' ],
    'ise'         => [ 3, '' ],
    'isede'       => [ 3, '' ],
    "${ia}sede"   => [ 3, '' ],
    'isemo'       => [ 3, '' ],
    "${ia}semo"   => [ 3, '' ],
    'isen'        => [ 3, '' ],
    'isse'        => [ 3, '' ],
    "${ia}ssede"  => [ 3, '' ],
    "${ia}ssei"   => [ 3, '' ],
    'issem'       => [ 3, '' ],
    "${ia}ssemo"  => [ 3, '' ],
    'iste'        => [ 4, '' ],
    'itar'        => [ 5, '' ],
    'iu'          => [ 3, '' ],
    'izar'        => [ 3, '' ],

    # remaining endings
    'omo'         => [ 3, '' ],
    'ondo'        => [ 3, '' ],
    'ou'          => [ 3, '' ],
    'tizar'       => [ 4, '' ],
    'uei'         => [ 3, '' ],
    "u${ia}a"     => [ 5, 'u' ],
};
428
# Accent folding used by the 'accent' step: every key is a single accented
# Latin-1 character and the value is the plain ASCII letter that replaces it.
$rule{accent} = {
    # acute
    $aa => 'a',
    $ea => 'e',
    $ia => 'i',
    $oa => 'o',
    $ua => 'u',

    # tilde
    $at => 'a',
    $ot => 'o',
    $nt => 'n',

    # circumflex; a-circumflex is used by the noun rules, so it has to be
    # folded here as well, like e-circumflex
    $ac => 'a',
    $ec => 'e',

    # cedilla
    $cc => 'c',
};
441
# Final clean-up step: ending => [ minimum stem length, replacement ].
# A trailing thematic vowel (a, e, o) is dropped when at least three
# characters precede it.
$rule{vowel} = {
    'bil' => [ 2, 'vel' ],
    'gue' => [ 2, 'g' ],
    ( map { ( $_ => [ 3, '' ] ) } qw(a e o) ),
};
449
# strip($cmd, $word)
#
# Apply a single stemming step to $word and return the result; the
# argument itself is never modified.
#
#   $cmd  - step name: 'accent', 'adv', or the name of a suffix table in
#           %rule ('plural', 'femin', 'augment', 'noun', 'verb', 'vowel').
#   $word - the word to reduce.
#
# Suffix tables map SUFFIX => [ MIN_STEM_LENGTH, REPLACEMENT ].  Suffixes
# are tried longest first and only the first one that matches is applied.
# A step name that has no table returns the word unchanged.
#
# The ($$) prototype is kept so that existing call sites parse as before.
sub strip($$) {
    my ( $cmd, $word ) = @_;

    if ( $cmd eq 'accent' ) {
        # Replace every accented character by its plain counterpart.
        my $plain_for = $rule{accent} || {};
        for my $accented ( keys %{$plain_for} ) {
            $word =~ s/\Q$accented\E/$plain_for->{$accented}/g;
        }
        return $word;
    }

    if ( $cmd eq 'adv' ) {
        # "-mente" is a suffix: it is removed only at the end of the word
        # (and only when at least four characters precede it), never from
        # the middle of a word.
        $word =~ s/(.{4,})mente$/$1/;
        return $word;
    }

    my $suffix_rules = $rule{$cmd} || {};
    for my $suffix ( sort { length $b <=> length $a } keys %{$suffix_rules} ) {
        my ( $min_stem, $replacement ) = @{ $suffix_rules->{$suffix} };

        # The suffix is matched literally, anchored at the end of the word,
        # with at least $min_stem characters in front of it.
        my $pattern = '^(.{' . $min_stem . ',})' . quotemeta($suffix) . '$';
        last if $word =~ s/$pattern/$1$replacement/;
    }
    return $word;
}
471
472
# stem(@words) or stem(\@words)
#
# Stem every word and return the stems in the same order: a list in list
# context, an array reference otherwise.  When the first argument is a
# reference it is dereferenced as the array of words; otherwise all
# arguments are taken as the words.  The input is not modified.
sub stem {
    my $words = ref( $_[0] ) ? $_[0] : \@_;

    # Steps applied unconditionally, in this order, after the optional
    # plural and feminine reductions.
    my @pipeline = qw/augment adv noun verb vowel accent/;

    my @stems = map {
        my $current = $_;

        # Plural and feminine reduction only apply to words with the
        # matching final letter.
        if ( $current =~ /s$/ ) {
            $current = strip( 'plural', $current );
        }
        if ( $current =~ /a$/ ) {
            $current = strip( 'femin', $current );
        }

        for my $step (@pipeline) {
            $current = strip( $step, $current );
        }
        $current;
    } @{$words};

    return @stems if wantarray;
    return \@stems;
}
486
4871;
488__END__
489# Below is stub documentation for your module. You better edit it!
490
491=head1 NAME
492
493Lingua::GL::Stemmer - Galician Stemmer
494
495=head1 SYNOPSIS
496
497  use Lingua::GL::Stemmer;
498
499  Lingua::GL::Stemmer::stem(\@words);
500
501  # or
502
503  Lingua::GL::Stemmer::stem(@words);
504
505=head1 DESCRIPTION
506
Galician is an endangered language spoken in the northwest region of Spain. Galician is morphologically similar to Portuguese, but its phonetics differ greatly. Because of this morphological similarity, the Portuguese stemming algorithm can be adapted to stem Galician texts.
508
509See L<Lingua::PT::Stemmer> for a sketch of the stemming algorithm, and L<http://bvg.udc.es/recursos_lingua/stemming.html> for stemming rules.
510
511=head1 SEE ALSO
512
513L<Lingua::PT::Stemmer>
514
515Stemming rules
516L<http://bvg.udc.es/recursos_lingua/stemming.html>
517
518=head1 COPYRIGHT
519
520xern E<lt>xern@cpan.orgE<gt>
521
522This module is free software; you can redistribute it or modify it under the same terms as Perl itself.
523
524=cut
525