1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6
7 #include "ContribInc.h"
8 #include "BrazilianStemmer.h"
9 #include "MiscUtils.h"
10 #include "UnicodeUtils.h"
11 #include "StringUtils.h"
12
13 namespace Lucene {
14
// Destructor. The body is intentionally empty: no explicit cleanup is performed here.
BrazilianStemmer::~BrazilianStemmer() {
}
17
// Stems a single term.
//
// The term is first normalised into CT by createCT().  An empty string is returned when CT
// fails isIndexable(), and CT is returned as-is when it fails isStemmable().  Otherwise the
// regions R1, R2 and RV are derived from CT, TERM records "<term>;<CT>", and the stemming
// steps are run over CT, which is then returned.
String BrazilianStemmer::stem(const String& term) {
    createCT(term);

    if (!isIndexable(CT)) {
        return L"";
    }

    if (!isStemmable(CT)) {
        return CT;
    }

    // R2 is obtained by applying the R1 rule to R1 itself.
    R1 = getR1(CT);
    R2 = getR1(R1);
    RV = getRV(CT);
    TERM = term + L";" + CT;

    // Short-circuit evaluation: step2 is only attempted when step1 removed nothing.
    const bool suffixRemoved = step1() || step2();

    // step3 follows a successful removal, step4 is the fallback when nothing was removed.
    if (suffixRemoved) {
        step3();
    } else {
        step4();
    }

    // step5 always runs last.
    step5();

    return CT;
}
49
isStemmable(const String & term)50 bool BrazilianStemmer::isStemmable(const String& term) {
51 for (int32_t c = 0; c < (int32_t)term.length(); ++c) {
52 // Discard terms that contain non-letter characters.
53 if (!UnicodeUtil::isAlpha(term[c])) {
54 return false;
55 }
56 }
57 return true;
58 }
59
isIndexable(const String & term)60 bool BrazilianStemmer::isIndexable(const String& term) {
61 return (term.length() < 30) && (term.length() > 2);
62 }
63
isVowel(wchar_t value)64 bool BrazilianStemmer::isVowel(wchar_t value) {
65 return (value == L'a' || value == L'e' || value == L'i' || value == L'o' || value == L'u');
66 }
67
// Computes the R1 region of a value: the text that follows the first non-vowel found after
// the first vowel.  The scans never examine the final character, and an empty string is
// returned when either scan runs out of characters (or the value itself is empty).
String BrazilianStemmer::getR1(const String& value) {
    if (value.empty()) {
        return L"";
    }

    const int32_t lastIndex = (int32_t)(value.length() - 1);
    int32_t pos = 0;

    // advance to the first vowel
    while (pos < lastIndex && !isVowel(value[pos])) {
        ++pos;
    }
    if (pos >= lastIndex) {
        return L"";
    }

    // advance to the first non-vowel after it
    while (pos < lastIndex && isVowel(value[pos])) {
        ++pos;
    }
    if (pos >= lastIndex) {
        return L"";
    }

    return value.substr(pos + 1);
}
99
// Computes the RV region of a value.  Three rules are tried in order:
//   1. second letter is a consonant  -> region after the next vowel (searched from index 2);
//   2. first two letters are vowels  -> region after the next consonant (searched from index 2);
//   3. otherwise                     -> region after the third letter.
// A rule whose search does not find a match before the final character falls through to the
// next rule.  An empty string is returned when no rule yields a region.
String BrazilianStemmer::getRV(const String& value) {
    if (value.empty()) {
        return L"";
    }

    const int32_t lastIndex = (int32_t)(value.length() - 1);

    // Rule 1: consonant in second position, look for the following vowel.
    if (lastIndex > 0 && !isVowel(value[1])) {
        int32_t pos = 2;
        while (pos < lastIndex && !isVowel(value[pos])) {
            ++pos;
        }
        if (pos < lastIndex) {
            return value.substr(pos + 1);
        }
    }

    // Rule 2: two leading vowels, look for the following consonant.
    if (lastIndex > 1 && isVowel(value[0]) && isVowel(value[1])) {
        int32_t pos = 2;
        while (pos < lastIndex && isVowel(value[pos])) {
            ++pos;
        }
        if (pos < lastIndex) {
            return value.substr(pos + 1);
        }
    }

    // Rule 3: everything after the third letter, provided the value has more than three letters.
    if (lastIndex <= 2) {
        return L"";
    }
    return value.substr(3);
}
145
// Normalises a term: lower-cases it with StringUtils::toLower() and then replaces accented
// characters by their plain counterparts:
//   U+00E1 U+00E2 U+00E3 -> a      U+00E9 U+00EA -> e      U+00ED -> i
//   U+00F3 U+00F4 U+00F5 -> o      U+00FA U+00FC -> u      U+00E7 -> c      U+00F1 -> n
// Every other character is copied through unchanged.  An empty value yields an empty string.
String BrazilianStemmer::changeTerm(const String& value) {
    if (value.empty()) {
        return L"";
    }

    // The replacement loop must read the lower-cased text, not the raw input; otherwise the
    // lower-casing has no effect on the result.
    const String lowerValue(StringUtils::toLower(value));

    String r;
    r.reserve(lowerValue.length());

    for (String::size_type j = 0; j < lowerValue.length(); ++j) {
        const wchar_t ch = lowerValue[j];
        switch (ch) {
        case 0x00e1:
        case 0x00e2:
        case 0x00e3:
            r += L'a';
            break;
        case 0x00e9:
        case 0x00ea:
            r += L'e';
            break;
        case 0x00ed:
            r += L'i';
            break;
        case 0x00f3:
        case 0x00f4:
        case 0x00f5:
            r += L'o';
            break;
        case 0x00fa:
        case 0x00fc:
            r += L'u';
            break;
        case 0x00e7:
            r += L'c';
            break;
        case 0x00f1:
            r += L'n';
            break;
        default:
            r += ch;
            break;
        }
    }

    return r;
}
189
checkSuffix(const String & value,const String & suffix)190 bool BrazilianStemmer::checkSuffix(const String& value, const String& suffix) {
191 if (value.empty() || suffix.empty()) {
192 return false;
193 }
194 if (suffix.length() > value.length()) {
195 return false;
196 }
197 return (value.substr(value.length() - suffix.length()) == suffix);
198 }
199
// Replaces a trailing toReplace in value with changeTo.  The value is returned unchanged when
// any argument is empty or when value does not end with toReplace.
String BrazilianStemmer::replaceSuffix(const String& value, const String& toReplace, const String& changeTo) {
    const bool anyEmpty = value.empty() || toReplace.empty() || changeTo.empty();
    if (anyEmpty) {
        return value;
    }

    // removeSuffix() hands back the input untouched when the suffix is absent.
    const String stripped(removeSuffix(value, toReplace));
    if (stripped == value) {
        return value;
    }
    return stripped + changeTo;
}
213
// Returns value with a trailing toRemove cut off.  The value is returned unchanged when
// either argument is empty or when value does not end with toRemove.
String BrazilianStemmer::removeSuffix(const String& value, const String& toRemove) {
    const bool removable = !value.empty() && !toRemove.empty() && checkSuffix(value, toRemove);
    if (!removable) {
        return value;
    }

    const String::size_type keep = value.length() - toRemove.length();
    return value.substr(0, keep);
}
220
suffixPreceded(const String & value,const String & suffix,const String & preceded)221 bool BrazilianStemmer::suffixPreceded(const String& value, const String& suffix, const String& preceded) {
222 if (value.empty() || suffix.empty() || preceded.empty() || !checkSuffix(value, suffix)) {
223 return false;
224 }
225 return checkSuffix(removeSuffix(value, suffix), preceded);
226 }
227
createCT(const String & term)228 void BrazilianStemmer::createCT(const String& term) {
229 CT = changeTerm(term);
230
231 if (CT.length() < 2) {
232 return;
233 }
234
235 // if the first character is ... , remove it
236 if (CT[0] == L'"' || CT[0] == L'\'' || CT[0] == L'-' || CT[0] == L',' ||
237 CT[0] == L';' || CT[0] == L'.' || CT[0] == L'?' || CT[0] == L'!') {
238 CT = CT.substr(1);
239 }
240
241 if (CT.length() < 2) {
242 return;
243 }
244
245 // if the last character is ... , remove it
246 if (CT[CT.length() - 1] == L'-' || CT[CT.length() - 1] == L',' || CT[CT.length() - 1] == L';' ||
247 CT[CT.length() - 1] == L'.' || CT[CT.length() - 1] == L'?' || CT[CT.length() - 1] == L'!' ||
248 CT[CT.length() - 1] == L'\'' || CT[CT.length() - 1] == L'"') {
249 CT = CT.substr(0, CT.length() - 1);
250 }
251 }
252
step1()253 bool BrazilianStemmer::step1() {
254 if (CT.empty()) {
255 return false;
256 }
257
258 // suffix length = 7
259 if (checkSuffix(CT, L"uciones") && checkSuffix(R2, L"uciones")) {
260 CT = replaceSuffix(CT, L"uciones", L"u");
261 return true;
262 }
263
264 // suffix length = 6
265 if (CT.length() >= 6) {
266 if (checkSuffix(CT, L"imentos") && checkSuffix(R2, L"imentos")) {
267 CT = removeSuffix(CT, L"imentos");
268 return true;
269 }
270 if (checkSuffix(CT, L"amentos") && checkSuffix(R2, L"amentos")) {
271 CT = removeSuffix(CT, L"amentos");
272 return true;
273 }
274 if (checkSuffix(CT, L"adores") && checkSuffix(R2, L"adores")) {
275 CT = removeSuffix(CT, L"adores");
276 return true;
277 }
278 if (checkSuffix(CT, L"adoras") && checkSuffix(R2, L"adoras")) {
279 CT = removeSuffix(CT, L"adoras");
280 return true;
281 }
282 if (checkSuffix(CT, L"logias") && checkSuffix(R2, L"logias")) {
283 replaceSuffix(CT, L"logias", L"log");
284 return true;
285 }
286 if (checkSuffix(CT, L"encias") && checkSuffix(R2, L"encias")) {
287 CT = replaceSuffix(CT, L"encias", L"ente");
288 return true;
289 }
290 if (checkSuffix(CT, L"amente") && checkSuffix(R1, L"amente")) {
291 CT = removeSuffix(CT, L"amente");
292 return true;
293 }
294 if (checkSuffix(CT, L"idades") && checkSuffix(R2, L"idades")) {
295 CT = removeSuffix(CT, L"idades");
296 return true;
297 }
298 }
299
300 // suffix length = 5
301 if (CT.length() >= 5) {
302 if (checkSuffix(CT, L"acoes") && checkSuffix(R2, L"acoes")) {
303 CT = removeSuffix(CT, L"acoes");
304 return true;
305 }
306 if (checkSuffix(CT, L"imento") && checkSuffix(R2, L"imento")) {
307 CT = removeSuffix(CT, L"imento");
308 return true;
309 }
310 if (checkSuffix(CT, L"amento") && checkSuffix(R2, L"amento")) {
311 CT = removeSuffix(CT, L"amento");
312 return true;
313 }
314 if (checkSuffix(CT, L"adora") && checkSuffix(R2, L"adora")) {
315 CT = removeSuffix(CT, L"adora");
316 return true;
317 }
318 if (checkSuffix(CT, L"ismos") && checkSuffix(R2, L"ismos")) {
319 CT = removeSuffix(CT, L"ismos");
320 return true;
321 }
322 if (checkSuffix(CT, L"istas") && checkSuffix(R2, L"istas")) {
323 CT = removeSuffix(CT, L"istas");
324 return true;
325 }
326 if (checkSuffix(CT, L"logia") && checkSuffix(R2, L"logia")) {
327 CT = replaceSuffix(CT, L"logia", L"log");
328 return true;
329 }
330 if (checkSuffix(CT, L"ucion") && checkSuffix(R2, L"ucion")) {
331 CT = replaceSuffix(CT, L"ucion", L"u");
332 return true;
333 }
334 if (checkSuffix(CT, L"encia") && checkSuffix(R2, L"encia")) {
335 CT = replaceSuffix(CT, L"encia", L"ente");
336 return true;
337 }
338 if (checkSuffix(CT, L"mente") && checkSuffix(R2, L"mente")) {
339 CT = removeSuffix(CT, L"mente");
340 return true;
341 }
342 if (checkSuffix(CT, L"idade") && checkSuffix(R2, L"idade")) {
343 CT = removeSuffix(CT, L"idade");
344 return true;
345 }
346 }
347
348 // suffix length = 4
349 if (CT.length() >= 4) {
350 if (checkSuffix(CT, L"acao") && checkSuffix(R2, L"acao")) {
351 CT = removeSuffix(CT, L"acao");
352 return true;
353 }
354 if (checkSuffix(CT, L"ezas") && checkSuffix(R2, L"ezas")) {
355 CT = removeSuffix(CT, L"ezas");
356 return true;
357 }
358 if (checkSuffix(CT, L"icos") && checkSuffix(R2, L"icos")) {
359 CT = removeSuffix(CT, L"icos");
360 return true;
361 }
362 if (checkSuffix(CT, L"icas") && checkSuffix(R2, L"icas")) {
363 CT = removeSuffix(CT, L"icas");
364 return true;
365 }
366 if (checkSuffix(CT, L"ismo") && checkSuffix(R2, L"ismo")) {
367 CT = removeSuffix(CT, L"ismo");
368 return true;
369 }
370 if (checkSuffix(CT, L"avel") && checkSuffix(R2, L"avel")) {
371 CT = removeSuffix(CT, L"avel");
372 return true;
373 }
374 if (checkSuffix(CT, L"ivel") && checkSuffix(R2, L"ivel")) {
375 CT = removeSuffix(CT, L"ivel");
376 return true;
377 }
378 if (checkSuffix(CT, L"ista") && checkSuffix(R2, L"ista")) {
379 CT = removeSuffix(CT, L"ista");
380 return true;
381 }
382 if (checkSuffix(CT, L"osos") && checkSuffix(R2, L"osos")) {
383 CT = removeSuffix(CT, L"osos");
384 return true;
385 }
386 if (checkSuffix(CT, L"osas") && checkSuffix(R2, L"osas")) {
387 CT = removeSuffix(CT, L"osas");
388 return true;
389 }
390 if (checkSuffix(CT, L"ador") && checkSuffix(R2, L"ador")) {
391 CT = removeSuffix(CT, L"ador");
392 return true;
393 }
394 if (checkSuffix(CT, L"ivas") && checkSuffix(R2, L"ivas")) {
395 CT = removeSuffix(CT, L"ivas");
396 return true;
397 }
398 if (checkSuffix(CT, L"ivos") && checkSuffix(R2, L"ivos")) {
399 CT = removeSuffix(CT, L"ivos");
400 return true;
401 }
402 if (checkSuffix(CT, L"iras") && checkSuffix(RV, L"iras") && suffixPreceded(CT, L"iras", L"e")) {
403 CT = replaceSuffix(CT, L"iras", L"ir");
404 return true;
405 }
406 }
407
408 // suffix length = 3
409 if (CT.length() >= 3) {
410 if (checkSuffix(CT, L"eza") && checkSuffix(R2, L"eza")) {
411 CT = removeSuffix(CT, L"eza");
412 return true;
413 }
414 if (checkSuffix(CT, L"ico") && checkSuffix(R2, L"ico")) {
415 CT = removeSuffix(CT, L"ico");
416 return true;
417 }
418 if (checkSuffix(CT, L"ica") && checkSuffix(R2, L"ica")) {
419 CT = removeSuffix(CT, L"ica");
420 return true;
421 }
422 if (checkSuffix(CT, L"oso") && checkSuffix(R2, L"oso")) {
423 CT = removeSuffix(CT, L"oso");
424 return true;
425 }
426 if (checkSuffix(CT, L"osa") && checkSuffix(R2, L"osa")) {
427 CT = removeSuffix(CT, L"osa");
428 return true;
429 }
430 if (checkSuffix(CT, L"iva") && checkSuffix(R2, L"iva")) {
431 CT = removeSuffix(CT, L"iva");
432 return true;
433 }
434 if (checkSuffix(CT, L"ivo") && checkSuffix(R2, L"ivo")) {
435 CT = removeSuffix(CT, L"ivo");
436 return true;
437 }
438 if (checkSuffix(CT, L"ira") && checkSuffix(RV, L"ira") && suffixPreceded(CT, L"ira", L"e")) {
439 CT = replaceSuffix(CT, L"ira", L"ir");
440 return true;
441 }
442 }
443
444 // no ending was removed by step1
445 return false;
446 }
447
step2()448 bool BrazilianStemmer::step2() {
449 if (RV.empty()) {
450 return false;
451 }
452
453 // suffix lenght = 7
454 if (RV.length() >= 7) {
455 if (checkSuffix(RV, L"issemos")) {
456 CT = removeSuffix(CT, L"issemos");
457 return true;
458 }
459 if (checkSuffix(RV, L"essemos")) {
460 CT = removeSuffix(CT, L"essemos");
461 return true;
462 }
463 if (checkSuffix(RV, L"assemos")) {
464 CT = removeSuffix(CT, L"assemos");
465 return true;
466 }
467 if (checkSuffix(RV, L"ariamos")) {
468 CT = removeSuffix(CT, L"ariamos");
469 return true;
470 }
471 if (checkSuffix(RV, L"eriamos")) {
472 CT = removeSuffix(CT, L"eriamos");
473 return true;
474 }
475 if (checkSuffix(RV, L"iriamos")) {
476 CT = removeSuffix(CT, L"iriamos");
477 return true;
478 }
479 }
480
481 // suffix length = 6
482 if (RV.length() >= 6) {
483 if (checkSuffix(RV, L"iremos")) {
484 CT = removeSuffix(CT, L"iremos");
485 return true;
486 }
487 if (checkSuffix(RV, L"eremos")) {
488 CT = removeSuffix(CT, L"eremos");
489 return true;
490 }
491 if (checkSuffix(RV, L"aremos")) {
492 CT = removeSuffix(CT, L"aremos");
493 return true;
494 }
495 if (checkSuffix(RV, L"avamos")) {
496 CT = removeSuffix(CT, L"avamos");
497 return true;
498 }
499 if (checkSuffix(RV, L"iramos")) {
500 CT = removeSuffix(CT, L"iramos");
501 return true;
502 }
503 if (checkSuffix(RV, L"eramos")) {
504 CT = removeSuffix(CT, L"eramos");
505 return true;
506 }
507 if (checkSuffix(RV, L"aramos")) {
508 CT = removeSuffix(CT, L"aramos");
509 return true;
510 }
511 if (checkSuffix(RV, L"asseis")) {
512 CT = removeSuffix(CT, L"asseis");
513 return true;
514 }
515 if (checkSuffix(RV, L"esseis")) {
516 CT = removeSuffix(CT, L"esseis");
517 return true;
518 }
519 if (checkSuffix(RV, L"isseis")) {
520 CT = removeSuffix(CT, L"isseis");
521 return true;
522 }
523 if (checkSuffix(RV, L"arieis")) {
524 CT = removeSuffix(CT, L"arieis");
525 return true;
526 }
527 if (checkSuffix(RV, L"erieis")) {
528 CT = removeSuffix(CT, L"erieis");
529 return true;
530 }
531 if (checkSuffix(RV, L"irieis")) {
532 CT = removeSuffix(CT, L"irieis");
533 return true;
534 }
535 }
536
537 // suffix length = 5
538 if (RV.length() >= 5) {
539 if (checkSuffix(RV, L"irmos")) {
540 CT = removeSuffix(CT, L"irmos");
541 return true;
542 }
543 if (checkSuffix(RV, L"iamos")) {
544 CT = removeSuffix(CT, L"iamos");
545 return true;
546 }
547 if (checkSuffix(RV, L"armos")) {
548 CT = removeSuffix(CT, L"armos");
549 return true;
550 }
551 if (checkSuffix(RV, L"ermos")) {
552 CT = removeSuffix(CT, L"ermos");
553 return true;
554 }
555 if (checkSuffix(RV, L"areis")) {
556 CT = removeSuffix(CT, L"areis");
557 return true;
558 }
559 if (checkSuffix(RV, L"ereis")) {
560 CT = removeSuffix(CT, L"ereis");
561 return true;
562 }
563 if (checkSuffix(RV, L"ireis")) {
564 CT = removeSuffix(CT, L"ireis");
565 return true;
566 }
567 if (checkSuffix(RV, L"asses")) {
568 CT = removeSuffix(CT, L"asses");
569 return true;
570 }
571 if (checkSuffix(RV, L"esses")) {
572 CT = removeSuffix(CT, L"esses");
573 return true;
574 }
575 if (checkSuffix(RV, L"isses")) {
576 CT = removeSuffix(CT, L"isses");
577 return true;
578 }
579 if (checkSuffix(RV, L"astes")) {
580 CT = removeSuffix(CT, L"astes");
581 return true;
582 }
583 if (checkSuffix(RV, L"assem")) {
584 CT = removeSuffix(CT, L"assem");
585 return true;
586 }
587 if (checkSuffix(RV, L"essem")) {
588 CT = removeSuffix(CT, L"essem");
589 return true;
590 }
591 if (checkSuffix(RV, L"issem")) {
592 CT = removeSuffix(CT, L"issem");
593 return true;
594 }
595 if (checkSuffix(RV, L"ardes")) {
596 CT = removeSuffix(CT, L"ardes");
597 return true;
598 }
599 if (checkSuffix(RV, L"erdes")) {
600 CT = removeSuffix(CT, L"erdes");
601 return true;
602 }
603 if (checkSuffix(RV, L"irdes")) {
604 CT = removeSuffix(CT, L"irdes");
605 return true;
606 }
607 if (checkSuffix(RV, L"ariam")) {
608 CT = removeSuffix(CT, L"ariam");
609 return true;
610 }
611 if (checkSuffix(RV, L"eriam")) {
612 CT = removeSuffix(CT, L"eriam");
613 return true;
614 }
615 if (checkSuffix(RV, L"iriam")) {
616 CT = removeSuffix(CT, L"iriam");
617 return true;
618 }
619 if (checkSuffix(RV, L"arias")) {
620 CT = removeSuffix(CT, L"arias");
621 return true;
622 }
623 if (checkSuffix(RV, L"erias")) {
624 CT = removeSuffix(CT, L"erias");
625 return true;
626 }
627 if (checkSuffix(RV, L"irias")) {
628 CT = removeSuffix(CT, L"irias");
629 return true;
630 }
631 if (checkSuffix(RV, L"estes")) {
632 CT = removeSuffix(CT, L"estes");
633 return true;
634 }
635 if (checkSuffix(RV, L"istes")) {
636 CT = removeSuffix(CT, L"istes");
637 return true;
638 }
639 if (checkSuffix(RV, L"areis")) {
640 CT = removeSuffix(CT, L"areis");
641 return true;
642 }
643 if (checkSuffix(RV, L"aveis")) {
644 CT = removeSuffix(CT, L"aveis");
645 return true;
646 }
647 }
648
649 // suffix length = 4
650 if (RV.length() >= 4) {
651 if (checkSuffix(RV, L"aria")) {
652 CT = removeSuffix(CT, L"aria");
653 return true;
654 }
655 if (checkSuffix(RV, L"eria")) {
656 CT = removeSuffix(CT, L"eria");
657 return true;
658 }
659 if (checkSuffix(RV, L"iria")) {
660 CT = removeSuffix(CT, L"iria");
661 return true;
662 }
663 if (checkSuffix(RV, L"asse")) {
664 CT = removeSuffix(CT, L"asse");
665 return true;
666 }
667 if (checkSuffix(RV, L"esse")) {
668 CT = removeSuffix(CT, L"esse");
669 return true;
670 }
671 if (checkSuffix(RV, L"isse")) {
672 CT = removeSuffix(CT, L"isse");
673 return true;
674 }
675 if (checkSuffix(RV, L"aste")) {
676 CT = removeSuffix(CT, L"aste");
677 return true;
678 }
679 if (checkSuffix(RV, L"este")) {
680 CT = removeSuffix(CT, L"este");
681 return true;
682 }
683 if (checkSuffix(RV, L"iste")) {
684 CT = removeSuffix(CT, L"iste");
685 return true;
686 }
687 if (checkSuffix(RV, L"arei")) {
688 CT = removeSuffix(CT, L"arei");
689 return true;
690 }
691 if (checkSuffix(RV, L"erei")) {
692 CT = removeSuffix(CT, L"erei");
693 return true;
694 }
695 if (checkSuffix(RV, L"irei")) {
696 CT = removeSuffix(CT, L"irei");
697 return true;
698 }
699 if (checkSuffix(RV, L"aram")) {
700 CT = removeSuffix(CT, L"aram");
701 return true;
702 }
703 if (checkSuffix(RV, L"eram")) {
704 CT = removeSuffix(CT, L"eram");
705 return true;
706 }
707 if (checkSuffix(RV, L"iram")) {
708 CT = removeSuffix(CT, L"iram");
709 return true;
710 }
711 if (checkSuffix(RV, L"avam")) {
712 CT = removeSuffix(CT, L"avam");
713 return true;
714 }
715 if (checkSuffix(RV, L"arem")) {
716 CT = removeSuffix(CT, L"arem");
717 return true;
718 }
719 if (checkSuffix(RV, L"erem")) {
720 CT = removeSuffix(CT, L"erem");
721 return true;
722 }
723 if (checkSuffix(RV, L"irem")) {
724 CT = removeSuffix(CT, L"irem");
725 return true;
726 }
727 if (checkSuffix(RV, L"ando")) {
728 CT = removeSuffix(CT, L"ando");
729 return true;
730 }
731 if (checkSuffix(RV, L"endo")) {
732 CT = removeSuffix(CT, L"endo");
733 return true;
734 }
735 if (checkSuffix(RV, L"indo")) {
736 CT = removeSuffix(CT, L"indo");
737 return true;
738 }
739 if (checkSuffix(RV, L"arao")) {
740 CT = removeSuffix(CT, L"arao");
741 return true;
742 }
743 if (checkSuffix(RV, L"erao")) {
744 CT = removeSuffix(CT, L"erao");
745 return true;
746 }
747 if (checkSuffix(RV, L"irao")) {
748 CT = removeSuffix(CT, L"irao");
749 return true;
750 }
751 if (checkSuffix(RV, L"adas")) {
752 CT = removeSuffix(CT, L"adas");
753 return true;
754 }
755 if (checkSuffix(RV, L"idas")) {
756 CT = removeSuffix(CT, L"idas");
757 return true;
758 }
759 if (checkSuffix(RV, L"aras")) {
760 CT = removeSuffix(CT, L"aras");
761 return true;
762 }
763 if (checkSuffix(RV, L"eras")) {
764 CT = removeSuffix(CT, L"eras");
765 return true;
766 }
767 if (checkSuffix(RV, L"iras")) {
768 CT = removeSuffix(CT, L"iras");
769 return true;
770 }
771 if (checkSuffix(RV, L"avas")) {
772 CT = removeSuffix(CT, L"avas");
773 return true;
774 }
775 if (checkSuffix(RV, L"ares")) {
776 CT = removeSuffix(CT, L"ares");
777 return true;
778 }
779 if (checkSuffix(RV, L"eres")) {
780 CT = removeSuffix(CT, L"eres");
781 return true;
782 }
783 if (checkSuffix(RV, L"ires")) {
784 CT = removeSuffix(CT, L"ires");
785 return true;
786 }
787 if (checkSuffix(RV, L"ados")) {
788 CT = removeSuffix(CT, L"ados");
789 return true;
790 }
791 if (checkSuffix(RV, L"idos")) {
792 CT = removeSuffix(CT, L"idos");
793 return true;
794 }
795 if (checkSuffix(RV, L"amos")) {
796 CT = removeSuffix(CT, L"amos");
797 return true;
798 }
799 if (checkSuffix(RV, L"emos")) {
800 CT = removeSuffix(CT, L"emos");
801 return true;
802 }
803 if (checkSuffix(RV, L"imos")) {
804 CT = removeSuffix(CT, L"imos");
805 return true;
806 }
807 if (checkSuffix(RV, L"iras")) {
808 CT = removeSuffix(CT, L"iras");
809 return true;
810 }
811 if (checkSuffix(RV, L"ieis")) {
812 CT = removeSuffix(CT, L"ieis");
813 return true;
814 }
815 }
816
817 // suffix length = 3
818 if (RV.length() >= 3) {
819 if (checkSuffix(RV, L"ada")) {
820 CT = removeSuffix(CT, L"ada");
821 return true;
822 }
823 if (checkSuffix(RV, L"ida")) {
824 CT = removeSuffix(CT, L"ida");
825 return true;
826 }
827 if (checkSuffix(RV, L"ara")) {
828 CT = removeSuffix(CT, L"ara");
829 return true;
830 }
831 if (checkSuffix(RV, L"era")) {
832 CT = removeSuffix(CT, L"era");
833 return true;
834 }
835 if (checkSuffix(RV, L"ira")) {
836 CT = removeSuffix(CT, L"ava");
837 return true;
838 }
839 if (checkSuffix(RV, L"iam")) {
840 CT = removeSuffix(CT, L"iam");
841 return true;
842 }
843 if (checkSuffix(RV, L"ado")) {
844 CT = removeSuffix(CT, L"ado");
845 return true;
846 }
847 if (checkSuffix(RV, L"ido")) {
848 CT = removeSuffix(CT, L"ido");
849 return true;
850 }
851 if (checkSuffix(RV, L"ias")) {
852 CT = removeSuffix(CT, L"ias");
853 return true;
854 }
855 if (checkSuffix(RV, L"ais")) {
856 CT = removeSuffix(CT, L"ais");
857 return true;
858 }
859 if (checkSuffix(RV, L"eis")) {
860 CT = removeSuffix(CT, L"eis");
861 return true;
862 }
863 if (checkSuffix(RV, L"ira")) {
864 CT = removeSuffix(CT, L"ira");
865 return true;
866 }
867 if (checkSuffix(RV, L"ear")) {
868 CT = removeSuffix(CT, L"ear");
869 return true;
870 }
871 }
872
873 // suffix length = 2
874 if (RV.length() >= 2) {
875 if (checkSuffix(RV, L"ia")) {
876 CT = removeSuffix(CT, L"ia");
877 return true;
878 }
879 if (checkSuffix(RV, L"ei")) {
880 CT = removeSuffix(CT, L"ei");
881 return true;
882 }
883 if (checkSuffix(RV, L"am")) {
884 CT = removeSuffix(CT, L"am");
885 return true;
886 }
887 if (checkSuffix(RV, L"em")) {
888 CT = removeSuffix(CT, L"em");
889 return true;
890 }
891 if (checkSuffix(RV, L"ar")) {
892 CT = removeSuffix(CT, L"ar");
893 return true;
894 }
895 if (checkSuffix(RV, L"er")) {
896 CT = removeSuffix(CT, L"er");
897 return true;
898 }
899 if (checkSuffix(RV, L"ir")) {
900 CT = removeSuffix(CT, L"ir");
901 return true;
902 }
903 if (checkSuffix(RV, L"as")) {
904 CT = removeSuffix(CT, L"as");
905 return true;
906 }
907 if (checkSuffix(RV, L"es")) {
908 CT = removeSuffix(CT, L"es");
909 return true;
910 }
911 if (checkSuffix(RV, L"is")) {
912 CT = removeSuffix(CT, L"is");
913 return true;
914 }
915 if (checkSuffix(RV, L"eu")) {
916 CT = removeSuffix(CT, L"eu");
917 return true;
918 }
919 if (checkSuffix(RV, L"iu")) {
920 CT = removeSuffix(CT, L"iu");
921 return true;
922 }
923 if (checkSuffix(RV, L"iu")) {
924 CT = removeSuffix(CT, L"iu");
925 return true;
926 }
927 if (checkSuffix(RV, L"ou")) {
928 CT = removeSuffix(CT, L"ou");
929 return true;
930 }
931 }
932
933 // no ending was removed by step2
934 return false;
935 }
936
step3()937 void BrazilianStemmer::step3() {
938 if (RV.empty()) {
939 return;
940 }
941
942 if (checkSuffix(RV, L"i") && suffixPreceded(RV, L"i", L"c")) {
943 CT = removeSuffix(CT, L"i");
944 }
945 }
946
step4()947 void BrazilianStemmer::step4() {
948 if (RV.empty()) {
949 return;
950 }
951
952 if (checkSuffix(RV, L"os")) {
953 CT = removeSuffix(CT, L"os");
954 return;
955 }
956 if (checkSuffix(RV, L"a")) {
957 CT = removeSuffix(CT, L"a");
958 return;
959 }
960 if (checkSuffix(RV, L"i")) {
961 CT = removeSuffix(CT, L"i");
962 return;
963 }
964 if (checkSuffix(RV, L"o")) {
965 CT = removeSuffix(CT, L"o");
966 return;
967 }
968 }
969
step5()970 void BrazilianStemmer::step5() {
971 if (RV.empty()) {
972 return;
973 }
974
975 if (checkSuffix(RV, L"e")) {
976 if (suffixPreceded(RV, L"e", L"gu")) {
977 CT = removeSuffix(CT, L"e");
978 CT = removeSuffix(CT, L"u");
979 return;
980 }
981
982 if (suffixPreceded(RV, L"e", L"ci")) {
983 CT = removeSuffix(CT, L"e");
984 CT = removeSuffix(CT, L"i");
985 return;
986 }
987
988 CT = removeSuffix(CT, L"e");
989 return;
990 }
991 }
992
993 }
994