1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6 
7 #include "ContribInc.h"
8 #include "BrazilianStemmer.h"
9 #include "MiscUtils.h"
10 #include "UnicodeUtils.h"
11 #include "StringUtils.h"
12 
13 namespace Lucene {
14 
~BrazilianStemmer()15 BrazilianStemmer::~BrazilianStemmer() {
16 }
17 
// Stems a single term and returns the result.
//
// The term is first normalised into CT by createCT().  An empty string is
// returned when CT fails isIndexable(), and CT is returned unchanged when it
// fails isStemmable().  Otherwise the regions R1, R2 (R1 of R1) and RV are
// derived from CT, TERM is set to "<term>;<CT>", and the stripping steps run:
// step1, then step2 only if step1 changed nothing, then step3 when something
// was changed or step4 when nothing was, and finally step5.
String BrazilianStemmer::stem(const String& term) {
    createCT(term);

    if (!isIndexable(CT)) {
        return L"";
    }
    if (!isStemmable(CT)) {
        return CT;
    }

    // All regions are computed from the normalised term, before any step runs.
    R1 = getR1(CT);
    R2 = getR1(R1);
    RV = getRV(CT);
    TERM = term + L";" + CT;

    // Short-circuit evaluation: step2 is attempted only when step1 removed nothing.
    const bool changed = step1() || step2();

    if (changed) {
        step3();
    } else {
        step4();
    }

    step5();

    return CT;
}
49 
isStemmable(const String & term)50 bool BrazilianStemmer::isStemmable(const String& term) {
51     for (int32_t c = 0; c < (int32_t)term.length(); ++c) {
52         // Discard terms that contain non-letter characters.
53         if (!UnicodeUtil::isAlpha(term[c])) {
54             return false;
55         }
56     }
57     return true;
58 }
59 
isIndexable(const String & term)60 bool BrazilianStemmer::isIndexable(const String& term) {
61     return (term.length() < 30) && (term.length() > 2);
62 }
63 
isVowel(wchar_t value)64 bool BrazilianStemmer::isVowel(wchar_t value) {
65     return (value == L'a' || value == L'e' || value == L'i' || value == L'o' || value == L'u');
66 }
67 
// Computes the R1 region of value: the text that follows the first non-vowel
// found after the first vowel.  Both searches stop before the last character,
// so an empty string is returned when either one does not succeed before
// reaching it (and also when value itself is empty).
String BrazilianStemmer::getR1(const String& value) {
    if (value.empty()) {
        return L"";
    }

    const int32_t last = (int32_t)value.length() - 1;
    int32_t pos = 0;

    // skip ahead to the first vowel
    while (pos < last && !isVowel(value[pos])) {
        ++pos;
    }
    if (pos >= last) {
        return L"";
    }

    // skip ahead to the first non-vowel after it
    while (pos < last && isVowel(value[pos])) {
        ++pos;
    }
    if (pos >= last) {
        return L"";
    }

    return value.substr(pos + 1);
}
99 
// Computes the RV region of value.  The rules are tried in this order and the
// first one that yields a region wins:
//   1. second letter is not a vowel: the text after the next vowel (searched
//      from the third letter on);
//   2. first two letters are vowels: the text after the next non-vowel
//      (searched from the third letter on);
//   3. otherwise: the text after the third letter, if the value has more than
//      three characters.
// The searches in rules 1 and 2 stop before the last character.  An empty
// string is returned when no rule applies or value is empty.
String BrazilianStemmer::getRV(const String& value) {
    if (value.empty()) {
        return L"";
    }

    const int32_t last = (int32_t)value.length() - 1;

    // Rule 1: consonant in second position -> region after the following vowel.
    if (last > 0 && !isVowel(value[1])) {
        int32_t pos = 2;
        while (pos < last && !isVowel(value[pos])) {
            ++pos;
        }
        if (pos < last) {
            return value.substr(pos + 1);
        }
    }

    // Rule 2: two leading vowels -> region after the following consonant.
    if (last > 1 && isVowel(value[0]) && isVowel(value[1])) {
        int32_t pos = 2;
        while (pos < last && isVowel(value[pos])) {
            ++pos;
        }
        if (pos < last) {
            return value.substr(pos + 1);
        }
    }

    // Rule 3: fall back to everything after the third letter.
    return last > 2 ? value.substr(3) : String(L"");
}
145 
// Normalises a term before stemming: the value is lower-cased with
// StringUtils::toLower and the accented letters listed below are replaced by
// their unaccented base letter.  Every other character is copied unchanged.
// Returns an empty string for an empty value.
//
// The folding is applied to the lower-cased text, so upper-case input is
// normalised as well (isVowel and the suffix tables only know lower-case
// letters).
String BrazilianStemmer::changeTerm(const String& value) {
    if (value.empty()) {
        return L"";
    }

    const String lowerValue(StringUtils::toLower(value));
    String r;
    r.reserve(lowerValue.length());

    for (String::size_type j = 0; j < lowerValue.length(); ++j) {
        const wchar_t ch = lowerValue[j];
        switch (ch) {
        case 0x00e1: // a with acute
        case 0x00e2: // a with circumflex
        case 0x00e3: // a with tilde
            r += L'a';
            break;
        case 0x00e9: // e with acute
        case 0x00ea: // e with circumflex
            r += L'e';
            break;
        case 0x00ed: // i with acute
            r += L'i';
            break;
        case 0x00f3: // o with acute
        case 0x00f4: // o with circumflex
        case 0x00f5: // o with tilde
            r += L'o';
            break;
        case 0x00fa: // u with acute
        case 0x00fc: // u with diaeresis
            r += L'u';
            break;
        case 0x00e7: // c with cedilla
            r += L'c';
            break;
        case 0x00f1: // n with tilde
            r += L'n';
            break;
        default:
            r += ch;
            break;
        }
    }

    return r;
}
189 
checkSuffix(const String & value,const String & suffix)190 bool BrazilianStemmer::checkSuffix(const String& value, const String& suffix) {
191     if (value.empty() || suffix.empty()) {
192         return false;
193     }
194     if (suffix.length() > value.length()) {
195         return false;
196     }
197     return (value.substr(value.length() - suffix.length()) == suffix);
198 }
199 
// Returns value with its trailing toReplace swapped for changeTo.  value is
// returned unchanged when any argument is empty or when removeSuffix() leaves
// it untouched (i.e. value does not end with toReplace).
String BrazilianStemmer::replaceSuffix(const String& value, const String& toReplace, const String& changeTo) {
    const bool missingArgument = value.empty() || toReplace.empty() || changeTo.empty();
    if (missingArgument) {
        return value;
    }

    const String stripped(removeSuffix(value, toReplace));
    if (stripped == value) {
        // nothing was removed, so there is nothing to replace
        return value;
    }
    return stripped + changeTo;
}
213 
// Returns value without its trailing toRemove.  value is returned unchanged
// when either argument is empty or value does not end with toRemove.
String BrazilianStemmer::removeSuffix(const String& value, const String& toRemove) {
    const bool removable = !value.empty() && !toRemove.empty() && checkSuffix(value, toRemove);
    if (!removable) {
        return value;
    }
    const String::size_type keep = value.length() - toRemove.length();
    return value.substr(0, keep);
}
220 
suffixPreceded(const String & value,const String & suffix,const String & preceded)221 bool BrazilianStemmer::suffixPreceded(const String& value, const String& suffix, const String& preceded) {
222     if (value.empty() || suffix.empty() || preceded.empty() || !checkSuffix(value, suffix)) {
223         return false;
224     }
225     return checkSuffix(removeSuffix(value, suffix), preceded);
226 }
227 
createCT(const String & term)228 void BrazilianStemmer::createCT(const String& term) {
229     CT = changeTerm(term);
230 
231     if (CT.length() < 2) {
232         return;
233     }
234 
235     // if the first character is ... , remove it
236     if (CT[0] == L'"' || CT[0] == L'\'' || CT[0] == L'-' || CT[0] == L',' ||
237             CT[0] == L';' || CT[0] == L'.' || CT[0] == L'?' || CT[0] == L'!') {
238         CT = CT.substr(1);
239     }
240 
241     if (CT.length() < 2) {
242         return;
243     }
244 
245     // if the last character is ... , remove it
246     if (CT[CT.length() - 1] == L'-' || CT[CT.length() - 1] == L',' || CT[CT.length() - 1] == L';' ||
247             CT[CT.length() - 1] == L'.' || CT[CT.length() - 1] == L'?' || CT[CT.length() - 1] == L'!' ||
248             CT[CT.length() - 1] == L'\'' || CT[CT.length() - 1] == L'"') {
249         CT = CT.substr(0, CT.length() - 1);
250     }
251 }
252 
step1()253 bool BrazilianStemmer::step1() {
254     if (CT.empty()) {
255         return false;
256     }
257 
258     // suffix length = 7
259     if (checkSuffix(CT, L"uciones") && checkSuffix(R2, L"uciones")) {
260         CT = replaceSuffix(CT, L"uciones", L"u");
261         return true;
262     }
263 
264     // suffix length = 6
265     if (CT.length() >= 6) {
266         if (checkSuffix(CT, L"imentos") && checkSuffix(R2, L"imentos")) {
267             CT = removeSuffix(CT, L"imentos");
268             return true;
269         }
270         if (checkSuffix(CT, L"amentos") && checkSuffix(R2, L"amentos")) {
271             CT = removeSuffix(CT, L"amentos");
272             return true;
273         }
274         if (checkSuffix(CT, L"adores") && checkSuffix(R2, L"adores")) {
275             CT = removeSuffix(CT, L"adores");
276             return true;
277         }
278         if (checkSuffix(CT, L"adoras") && checkSuffix(R2, L"adoras")) {
279             CT = removeSuffix(CT, L"adoras");
280             return true;
281         }
282         if (checkSuffix(CT, L"logias") && checkSuffix(R2, L"logias")) {
283             replaceSuffix(CT, L"logias", L"log");
284             return true;
285         }
286         if (checkSuffix(CT, L"encias") && checkSuffix(R2, L"encias")) {
287             CT = replaceSuffix(CT, L"encias", L"ente");
288             return true;
289         }
290         if (checkSuffix(CT, L"amente") && checkSuffix(R1, L"amente")) {
291             CT = removeSuffix(CT, L"amente");
292             return true;
293         }
294         if (checkSuffix(CT, L"idades") && checkSuffix(R2, L"idades")) {
295             CT = removeSuffix(CT, L"idades");
296             return true;
297         }
298     }
299 
300     // suffix length = 5
301     if (CT.length() >= 5) {
302         if (checkSuffix(CT, L"acoes") && checkSuffix(R2, L"acoes")) {
303             CT = removeSuffix(CT, L"acoes");
304             return true;
305         }
306         if (checkSuffix(CT, L"imento") && checkSuffix(R2, L"imento")) {
307             CT = removeSuffix(CT, L"imento");
308             return true;
309         }
310         if (checkSuffix(CT, L"amento") && checkSuffix(R2, L"amento")) {
311             CT = removeSuffix(CT, L"amento");
312             return true;
313         }
314         if (checkSuffix(CT, L"adora") && checkSuffix(R2, L"adora")) {
315             CT = removeSuffix(CT, L"adora");
316             return true;
317         }
318         if (checkSuffix(CT, L"ismos") && checkSuffix(R2, L"ismos")) {
319             CT = removeSuffix(CT, L"ismos");
320             return true;
321         }
322         if (checkSuffix(CT, L"istas") && checkSuffix(R2, L"istas")) {
323             CT = removeSuffix(CT, L"istas");
324             return true;
325         }
326         if (checkSuffix(CT, L"logia") && checkSuffix(R2, L"logia")) {
327             CT = replaceSuffix(CT, L"logia", L"log");
328             return true;
329         }
330         if (checkSuffix(CT, L"ucion") && checkSuffix(R2, L"ucion")) {
331             CT = replaceSuffix(CT, L"ucion", L"u");
332             return true;
333         }
334         if (checkSuffix(CT, L"encia") && checkSuffix(R2, L"encia")) {
335             CT = replaceSuffix(CT, L"encia", L"ente");
336             return true;
337         }
338         if (checkSuffix(CT, L"mente") && checkSuffix(R2, L"mente")) {
339             CT = removeSuffix(CT, L"mente");
340             return true;
341         }
342         if (checkSuffix(CT, L"idade") && checkSuffix(R2, L"idade")) {
343             CT = removeSuffix(CT, L"idade");
344             return true;
345         }
346     }
347 
348     // suffix length = 4
349     if (CT.length() >= 4) {
350         if (checkSuffix(CT, L"acao") && checkSuffix(R2, L"acao")) {
351             CT = removeSuffix(CT, L"acao");
352             return true;
353         }
354         if (checkSuffix(CT, L"ezas") && checkSuffix(R2, L"ezas")) {
355             CT = removeSuffix(CT, L"ezas");
356             return true;
357         }
358         if (checkSuffix(CT, L"icos") && checkSuffix(R2, L"icos")) {
359             CT = removeSuffix(CT, L"icos");
360             return true;
361         }
362         if (checkSuffix(CT, L"icas") && checkSuffix(R2, L"icas")) {
363             CT = removeSuffix(CT, L"icas");
364             return true;
365         }
366         if (checkSuffix(CT, L"ismo") && checkSuffix(R2, L"ismo")) {
367             CT = removeSuffix(CT, L"ismo");
368             return true;
369         }
370         if (checkSuffix(CT, L"avel") && checkSuffix(R2, L"avel")) {
371             CT = removeSuffix(CT, L"avel");
372             return true;
373         }
374         if (checkSuffix(CT, L"ivel") && checkSuffix(R2, L"ivel")) {
375             CT = removeSuffix(CT, L"ivel");
376             return true;
377         }
378         if (checkSuffix(CT, L"ista") && checkSuffix(R2, L"ista")) {
379             CT = removeSuffix(CT, L"ista");
380             return true;
381         }
382         if (checkSuffix(CT, L"osos") && checkSuffix(R2, L"osos")) {
383             CT = removeSuffix(CT, L"osos");
384             return true;
385         }
386         if (checkSuffix(CT, L"osas") && checkSuffix(R2, L"osas")) {
387             CT = removeSuffix(CT, L"osas");
388             return true;
389         }
390         if (checkSuffix(CT, L"ador") && checkSuffix(R2, L"ador")) {
391             CT = removeSuffix(CT, L"ador");
392             return true;
393         }
394         if (checkSuffix(CT, L"ivas") && checkSuffix(R2, L"ivas")) {
395             CT = removeSuffix(CT, L"ivas");
396             return true;
397         }
398         if (checkSuffix(CT, L"ivos") && checkSuffix(R2, L"ivos")) {
399             CT = removeSuffix(CT, L"ivos");
400             return true;
401         }
402         if (checkSuffix(CT, L"iras") && checkSuffix(RV, L"iras") && suffixPreceded(CT, L"iras", L"e")) {
403             CT = replaceSuffix(CT, L"iras", L"ir");
404             return true;
405         }
406     }
407 
408     // suffix length = 3
409     if (CT.length() >= 3) {
410         if (checkSuffix(CT, L"eza") && checkSuffix(R2, L"eza")) {
411             CT = removeSuffix(CT, L"eza");
412             return true;
413         }
414         if (checkSuffix(CT, L"ico") && checkSuffix(R2, L"ico")) {
415             CT = removeSuffix(CT, L"ico");
416             return true;
417         }
418         if (checkSuffix(CT, L"ica") && checkSuffix(R2, L"ica")) {
419             CT = removeSuffix(CT, L"ica");
420             return true;
421         }
422         if (checkSuffix(CT, L"oso") && checkSuffix(R2, L"oso")) {
423             CT = removeSuffix(CT, L"oso");
424             return true;
425         }
426         if (checkSuffix(CT, L"osa") && checkSuffix(R2, L"osa")) {
427             CT = removeSuffix(CT, L"osa");
428             return true;
429         }
430         if (checkSuffix(CT, L"iva") && checkSuffix(R2, L"iva")) {
431             CT = removeSuffix(CT, L"iva");
432             return true;
433         }
434         if (checkSuffix(CT, L"ivo") && checkSuffix(R2, L"ivo")) {
435             CT = removeSuffix(CT, L"ivo");
436             return true;
437         }
438         if (checkSuffix(CT, L"ira") && checkSuffix(RV, L"ira") && suffixPreceded(CT, L"ira", L"e")) {
439             CT = replaceSuffix(CT, L"ira", L"ir");
440             return true;
441         }
442     }
443 
444     // no ending was removed by step1
445     return false;
446 }
447 
step2()448 bool BrazilianStemmer::step2() {
449     if (RV.empty()) {
450         return false;
451     }
452 
453     // suffix lenght = 7
454     if (RV.length() >= 7) {
455         if (checkSuffix(RV, L"issemos")) {
456             CT = removeSuffix(CT, L"issemos");
457             return true;
458         }
459         if (checkSuffix(RV, L"essemos")) {
460             CT = removeSuffix(CT, L"essemos");
461             return true;
462         }
463         if (checkSuffix(RV, L"assemos")) {
464             CT = removeSuffix(CT, L"assemos");
465             return true;
466         }
467         if (checkSuffix(RV, L"ariamos")) {
468             CT = removeSuffix(CT, L"ariamos");
469             return true;
470         }
471         if (checkSuffix(RV, L"eriamos")) {
472             CT = removeSuffix(CT, L"eriamos");
473             return true;
474         }
475         if (checkSuffix(RV, L"iriamos")) {
476             CT = removeSuffix(CT, L"iriamos");
477             return true;
478         }
479     }
480 
481     // suffix length = 6
482     if (RV.length() >= 6) {
483         if (checkSuffix(RV, L"iremos")) {
484             CT = removeSuffix(CT, L"iremos");
485             return true;
486         }
487         if (checkSuffix(RV, L"eremos")) {
488             CT = removeSuffix(CT, L"eremos");
489             return true;
490         }
491         if (checkSuffix(RV, L"aremos")) {
492             CT = removeSuffix(CT, L"aremos");
493             return true;
494         }
495         if (checkSuffix(RV, L"avamos")) {
496             CT = removeSuffix(CT, L"avamos");
497             return true;
498         }
499         if (checkSuffix(RV, L"iramos")) {
500             CT = removeSuffix(CT, L"iramos");
501             return true;
502         }
503         if (checkSuffix(RV, L"eramos")) {
504             CT = removeSuffix(CT, L"eramos");
505             return true;
506         }
507         if (checkSuffix(RV, L"aramos")) {
508             CT = removeSuffix(CT, L"aramos");
509             return true;
510         }
511         if (checkSuffix(RV, L"asseis")) {
512             CT = removeSuffix(CT, L"asseis");
513             return true;
514         }
515         if (checkSuffix(RV, L"esseis")) {
516             CT = removeSuffix(CT, L"esseis");
517             return true;
518         }
519         if (checkSuffix(RV, L"isseis")) {
520             CT = removeSuffix(CT, L"isseis");
521             return true;
522         }
523         if (checkSuffix(RV, L"arieis")) {
524             CT = removeSuffix(CT, L"arieis");
525             return true;
526         }
527         if (checkSuffix(RV, L"erieis")) {
528             CT = removeSuffix(CT, L"erieis");
529             return true;
530         }
531         if (checkSuffix(RV, L"irieis")) {
532             CT = removeSuffix(CT, L"irieis");
533             return true;
534         }
535     }
536 
537     // suffix length = 5
538     if (RV.length() >= 5) {
539         if (checkSuffix(RV, L"irmos")) {
540             CT = removeSuffix(CT, L"irmos");
541             return true;
542         }
543         if (checkSuffix(RV, L"iamos")) {
544             CT = removeSuffix(CT, L"iamos");
545             return true;
546         }
547         if (checkSuffix(RV, L"armos")) {
548             CT = removeSuffix(CT, L"armos");
549             return true;
550         }
551         if (checkSuffix(RV, L"ermos")) {
552             CT = removeSuffix(CT, L"ermos");
553             return true;
554         }
555         if (checkSuffix(RV, L"areis")) {
556             CT = removeSuffix(CT, L"areis");
557             return true;
558         }
559         if (checkSuffix(RV, L"ereis")) {
560             CT = removeSuffix(CT, L"ereis");
561             return true;
562         }
563         if (checkSuffix(RV, L"ireis")) {
564             CT = removeSuffix(CT, L"ireis");
565             return true;
566         }
567         if (checkSuffix(RV, L"asses")) {
568             CT = removeSuffix(CT, L"asses");
569             return true;
570         }
571         if (checkSuffix(RV, L"esses")) {
572             CT = removeSuffix(CT, L"esses");
573             return true;
574         }
575         if (checkSuffix(RV, L"isses")) {
576             CT = removeSuffix(CT, L"isses");
577             return true;
578         }
579         if (checkSuffix(RV, L"astes")) {
580             CT = removeSuffix(CT, L"astes");
581             return true;
582         }
583         if (checkSuffix(RV, L"assem")) {
584             CT = removeSuffix(CT, L"assem");
585             return true;
586         }
587         if (checkSuffix(RV, L"essem")) {
588             CT = removeSuffix(CT, L"essem");
589             return true;
590         }
591         if (checkSuffix(RV, L"issem")) {
592             CT = removeSuffix(CT, L"issem");
593             return true;
594         }
595         if (checkSuffix(RV, L"ardes")) {
596             CT = removeSuffix(CT, L"ardes");
597             return true;
598         }
599         if (checkSuffix(RV, L"erdes")) {
600             CT = removeSuffix(CT, L"erdes");
601             return true;
602         }
603         if (checkSuffix(RV, L"irdes")) {
604             CT = removeSuffix(CT, L"irdes");
605             return true;
606         }
607         if (checkSuffix(RV, L"ariam")) {
608             CT = removeSuffix(CT, L"ariam");
609             return true;
610         }
611         if (checkSuffix(RV, L"eriam")) {
612             CT = removeSuffix(CT, L"eriam");
613             return true;
614         }
615         if (checkSuffix(RV, L"iriam")) {
616             CT = removeSuffix(CT, L"iriam");
617             return true;
618         }
619         if (checkSuffix(RV, L"arias")) {
620             CT = removeSuffix(CT, L"arias");
621             return true;
622         }
623         if (checkSuffix(RV, L"erias")) {
624             CT = removeSuffix(CT, L"erias");
625             return true;
626         }
627         if (checkSuffix(RV, L"irias")) {
628             CT = removeSuffix(CT, L"irias");
629             return true;
630         }
631         if (checkSuffix(RV, L"estes")) {
632             CT = removeSuffix(CT, L"estes");
633             return true;
634         }
635         if (checkSuffix(RV, L"istes")) {
636             CT = removeSuffix(CT, L"istes");
637             return true;
638         }
639         if (checkSuffix(RV, L"areis")) {
640             CT = removeSuffix(CT, L"areis");
641             return true;
642         }
643         if (checkSuffix(RV, L"aveis")) {
644             CT = removeSuffix(CT, L"aveis");
645             return true;
646         }
647     }
648 
649     // suffix length = 4
650     if (RV.length() >= 4) {
651         if (checkSuffix(RV, L"aria")) {
652             CT = removeSuffix(CT, L"aria");
653             return true;
654         }
655         if (checkSuffix(RV, L"eria")) {
656             CT = removeSuffix(CT, L"eria");
657             return true;
658         }
659         if (checkSuffix(RV, L"iria")) {
660             CT = removeSuffix(CT, L"iria");
661             return true;
662         }
663         if (checkSuffix(RV, L"asse")) {
664             CT = removeSuffix(CT, L"asse");
665             return true;
666         }
667         if (checkSuffix(RV, L"esse")) {
668             CT = removeSuffix(CT, L"esse");
669             return true;
670         }
671         if (checkSuffix(RV, L"isse")) {
672             CT = removeSuffix(CT, L"isse");
673             return true;
674         }
675         if (checkSuffix(RV, L"aste")) {
676             CT = removeSuffix(CT, L"aste");
677             return true;
678         }
679         if (checkSuffix(RV, L"este")) {
680             CT = removeSuffix(CT, L"este");
681             return true;
682         }
683         if (checkSuffix(RV, L"iste")) {
684             CT = removeSuffix(CT, L"iste");
685             return true;
686         }
687         if (checkSuffix(RV, L"arei")) {
688             CT = removeSuffix(CT, L"arei");
689             return true;
690         }
691         if (checkSuffix(RV, L"erei")) {
692             CT = removeSuffix(CT, L"erei");
693             return true;
694         }
695         if (checkSuffix(RV, L"irei")) {
696             CT = removeSuffix(CT, L"irei");
697             return true;
698         }
699         if (checkSuffix(RV, L"aram")) {
700             CT = removeSuffix(CT, L"aram");
701             return true;
702         }
703         if (checkSuffix(RV, L"eram")) {
704             CT = removeSuffix(CT, L"eram");
705             return true;
706         }
707         if (checkSuffix(RV, L"iram")) {
708             CT = removeSuffix(CT, L"iram");
709             return true;
710         }
711         if (checkSuffix(RV, L"avam")) {
712             CT = removeSuffix(CT, L"avam");
713             return true;
714         }
715         if (checkSuffix(RV, L"arem")) {
716             CT = removeSuffix(CT, L"arem");
717             return true;
718         }
719         if (checkSuffix(RV, L"erem")) {
720             CT = removeSuffix(CT, L"erem");
721             return true;
722         }
723         if (checkSuffix(RV, L"irem")) {
724             CT = removeSuffix(CT, L"irem");
725             return true;
726         }
727         if (checkSuffix(RV, L"ando")) {
728             CT = removeSuffix(CT, L"ando");
729             return true;
730         }
731         if (checkSuffix(RV, L"endo")) {
732             CT = removeSuffix(CT, L"endo");
733             return true;
734         }
735         if (checkSuffix(RV, L"indo")) {
736             CT = removeSuffix(CT, L"indo");
737             return true;
738         }
739         if (checkSuffix(RV, L"arao")) {
740             CT = removeSuffix(CT, L"arao");
741             return true;
742         }
743         if (checkSuffix(RV, L"erao")) {
744             CT = removeSuffix(CT, L"erao");
745             return true;
746         }
747         if (checkSuffix(RV, L"irao")) {
748             CT = removeSuffix(CT, L"irao");
749             return true;
750         }
751         if (checkSuffix(RV, L"adas")) {
752             CT = removeSuffix(CT, L"adas");
753             return true;
754         }
755         if (checkSuffix(RV, L"idas")) {
756             CT = removeSuffix(CT, L"idas");
757             return true;
758         }
759         if (checkSuffix(RV, L"aras")) {
760             CT = removeSuffix(CT, L"aras");
761             return true;
762         }
763         if (checkSuffix(RV, L"eras")) {
764             CT = removeSuffix(CT, L"eras");
765             return true;
766         }
767         if (checkSuffix(RV, L"iras")) {
768             CT = removeSuffix(CT, L"iras");
769             return true;
770         }
771         if (checkSuffix(RV, L"avas")) {
772             CT = removeSuffix(CT, L"avas");
773             return true;
774         }
775         if (checkSuffix(RV, L"ares")) {
776             CT = removeSuffix(CT, L"ares");
777             return true;
778         }
779         if (checkSuffix(RV, L"eres")) {
780             CT = removeSuffix(CT, L"eres");
781             return true;
782         }
783         if (checkSuffix(RV, L"ires")) {
784             CT = removeSuffix(CT, L"ires");
785             return true;
786         }
787         if (checkSuffix(RV, L"ados")) {
788             CT = removeSuffix(CT, L"ados");
789             return true;
790         }
791         if (checkSuffix(RV, L"idos")) {
792             CT = removeSuffix(CT, L"idos");
793             return true;
794         }
795         if (checkSuffix(RV, L"amos")) {
796             CT = removeSuffix(CT, L"amos");
797             return true;
798         }
799         if (checkSuffix(RV, L"emos")) {
800             CT = removeSuffix(CT, L"emos");
801             return true;
802         }
803         if (checkSuffix(RV, L"imos")) {
804             CT = removeSuffix(CT, L"imos");
805             return true;
806         }
807         if (checkSuffix(RV, L"iras")) {
808             CT = removeSuffix(CT, L"iras");
809             return true;
810         }
811         if (checkSuffix(RV, L"ieis")) {
812             CT = removeSuffix(CT, L"ieis");
813             return true;
814         }
815     }
816 
817     // suffix length = 3
818     if (RV.length() >= 3) {
819         if (checkSuffix(RV, L"ada")) {
820             CT = removeSuffix(CT, L"ada");
821             return true;
822         }
823         if (checkSuffix(RV, L"ida")) {
824             CT = removeSuffix(CT, L"ida");
825             return true;
826         }
827         if (checkSuffix(RV, L"ara")) {
828             CT = removeSuffix(CT, L"ara");
829             return true;
830         }
831         if (checkSuffix(RV, L"era")) {
832             CT = removeSuffix(CT, L"era");
833             return true;
834         }
835         if (checkSuffix(RV, L"ira")) {
836             CT = removeSuffix(CT, L"ava");
837             return true;
838         }
839         if (checkSuffix(RV, L"iam")) {
840             CT = removeSuffix(CT, L"iam");
841             return true;
842         }
843         if (checkSuffix(RV, L"ado")) {
844             CT = removeSuffix(CT, L"ado");
845             return true;
846         }
847         if (checkSuffix(RV, L"ido")) {
848             CT = removeSuffix(CT, L"ido");
849             return true;
850         }
851         if (checkSuffix(RV, L"ias")) {
852             CT = removeSuffix(CT, L"ias");
853             return true;
854         }
855         if (checkSuffix(RV, L"ais")) {
856             CT = removeSuffix(CT, L"ais");
857             return true;
858         }
859         if (checkSuffix(RV, L"eis")) {
860             CT = removeSuffix(CT, L"eis");
861             return true;
862         }
863         if (checkSuffix(RV, L"ira")) {
864             CT = removeSuffix(CT, L"ira");
865             return true;
866         }
867         if (checkSuffix(RV, L"ear")) {
868             CT = removeSuffix(CT, L"ear");
869             return true;
870         }
871     }
872 
873     // suffix length = 2
874     if (RV.length() >= 2) {
875         if (checkSuffix(RV, L"ia")) {
876             CT = removeSuffix(CT, L"ia");
877             return true;
878         }
879         if (checkSuffix(RV, L"ei")) {
880             CT = removeSuffix(CT, L"ei");
881             return true;
882         }
883         if (checkSuffix(RV, L"am")) {
884             CT = removeSuffix(CT, L"am");
885             return true;
886         }
887         if (checkSuffix(RV, L"em")) {
888             CT = removeSuffix(CT, L"em");
889             return true;
890         }
891         if (checkSuffix(RV, L"ar")) {
892             CT = removeSuffix(CT, L"ar");
893             return true;
894         }
895         if (checkSuffix(RV, L"er")) {
896             CT = removeSuffix(CT, L"er");
897             return true;
898         }
899         if (checkSuffix(RV, L"ir")) {
900             CT = removeSuffix(CT, L"ir");
901             return true;
902         }
903         if (checkSuffix(RV, L"as")) {
904             CT = removeSuffix(CT, L"as");
905             return true;
906         }
907         if (checkSuffix(RV, L"es")) {
908             CT = removeSuffix(CT, L"es");
909             return true;
910         }
911         if (checkSuffix(RV, L"is")) {
912             CT = removeSuffix(CT, L"is");
913             return true;
914         }
915         if (checkSuffix(RV, L"eu")) {
916             CT = removeSuffix(CT, L"eu");
917             return true;
918         }
919         if (checkSuffix(RV, L"iu")) {
920             CT = removeSuffix(CT, L"iu");
921             return true;
922         }
923         if (checkSuffix(RV, L"iu")) {
924             CT = removeSuffix(CT, L"iu");
925             return true;
926         }
927         if (checkSuffix(RV, L"ou")) {
928             CT = removeSuffix(CT, L"ou");
929             return true;
930         }
931     }
932 
933     // no ending was removed by step2
934     return false;
935 }
936 
step3()937 void BrazilianStemmer::step3() {
938     if (RV.empty()) {
939         return;
940     }
941 
942     if (checkSuffix(RV, L"i") && suffixPreceded(RV, L"i", L"c")) {
943         CT = removeSuffix(CT, L"i");
944     }
945 }
946 
step4()947 void BrazilianStemmer::step4() {
948     if (RV.empty()) {
949         return;
950     }
951 
952     if (checkSuffix(RV, L"os")) {
953         CT = removeSuffix(CT, L"os");
954         return;
955     }
956     if (checkSuffix(RV, L"a")) {
957         CT = removeSuffix(CT, L"a");
958         return;
959     }
960     if (checkSuffix(RV, L"i")) {
961         CT = removeSuffix(CT, L"i");
962         return;
963     }
964     if (checkSuffix(RV, L"o")) {
965         CT = removeSuffix(CT, L"o");
966         return;
967     }
968 }
969 
step5()970 void BrazilianStemmer::step5() {
971     if (RV.empty()) {
972         return;
973     }
974 
975     if (checkSuffix(RV, L"e")) {
976         if (suffixPreceded(RV, L"e", L"gu")) {
977             CT = removeSuffix(CT, L"e");
978             CT = removeSuffix(CT, L"u");
979             return;
980         }
981 
982         if (suffixPreceded(RV, L"e", L"ci")) {
983             CT = removeSuffix(CT, L"e");
984             CT = removeSuffix(CT, L"i");
985             return;
986         }
987 
988         CT = removeSuffix(CT, L"e");
989         return;
990     }
991 }
992 
993 }
994