1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6
7 #include "ContribInc.h"
8 #include <boost/algorithm/string.hpp>
9 #include "GermanStemmer.h"
10 #include "MiscUtils.h"
11 #include "UnicodeUtils.h"
12 #include "StringUtils.h"
13
14 namespace Lucene {
15
GermanStemmer()16 GermanStemmer::GermanStemmer() {
17 substCount = 0;
18 }
19
~GermanStemmer()20 GermanStemmer::~GermanStemmer() {
21 }
22
// Stems the given term and returns the result.
//
// The term is lower-cased into the working buffer first. If the buffer is
// not stemmable (see isStemmable()) the lower-cased term is returned as is;
// otherwise the stemming passes are applied to the buffer in a fixed order.
String GermanStemmer::stem(const String& term) {
    // Use lowercase for medium stemming.
    buffer = StringUtils::toLower(term);

    if (isStemmable()) {
        // Order matters: placeholders are introduced first, suffixes are
        // stripped on the masked form, and placeholders are expanded again
        // before the "gege" particle handling.
        substitute();
        strip();
        optimize();
        resubstitute();
        removeParticleDenotion();
    }

    return buffer;
}
39
isStemmable()40 bool GermanStemmer::isStemmable() {
41 for (int32_t c = 0; c < (int32_t)buffer.length(); ++c) {
42 if (!UnicodeUtil::isAlpha(buffer[c])) {
43 return false;
44 }
45 }
46 return true;
47 }
48
strip()49 void GermanStemmer::strip() {
50 bool doMore = true;
51 while (doMore && buffer.length() > 3) {
52 if (buffer.length() + substCount > 5 && boost::ends_with(buffer, L"nd")) {
53 buffer.resize(buffer.length() - 2);
54 } else if (buffer.length() + substCount > 4 && boost::ends_with(buffer, L"em")) {
55 buffer.resize(buffer.length() - 2);
56 } else if (buffer.length() + substCount > 4 && boost::ends_with(buffer, L"er")) {
57 buffer.resize(buffer.length() - 2);
58 } else if (buffer[buffer.length() - 1] == L'e') {
59 buffer.resize(buffer.length() - 1);
60 } else if (buffer[buffer.length() - 1] == L's') {
61 buffer.resize(buffer.length() - 1);
62 } else if (buffer[buffer.length() - 1] == L'n') {
63 buffer.resize(buffer.length() - 1);
64 }
65 // "t" occurs only as suffix of verbs.
66 else if (buffer[buffer.length() - 1] == L't') {
67 buffer.resize(buffer.length() - 1);
68 } else {
69 doMore = false;
70 }
71 }
72 }
73
optimize()74 void GermanStemmer::optimize() {
75 // Additional step for female plurals of professions and inhabitants.
76 if (buffer.length() > 5 && boost::ends_with(buffer, L"erin*")) {
77 buffer.resize(buffer.length() - 1);
78 strip();
79 }
80
81 // Additional step for irregular plural nouns like "Matrizen -> Matrix".
82 if (buffer[buffer.length() - 1] == L'z') {
83 buffer[buffer.length() - 1] = L'x';
84 }
85 }
86
removeParticleDenotion()87 void GermanStemmer::removeParticleDenotion() {
88 if (buffer.length() > 4) {
89 for (int32_t c = 0; c < (int32_t)buffer.length() - 3; ++c) {
90 if (buffer.substr(c, 4) == L"gege") {
91 buffer.erase(c, 2);
92 return;
93 }
94 }
95 }
96 }
97
substitute()98 void GermanStemmer::substitute() {
99 substCount = 0;
100 for (int32_t c = 0; c < (int32_t)buffer.length(); ++c) {
101 // Replace the second char of a pair of the equal characters with an asterisk
102 if (c > 0 && buffer[c] == buffer[c - 1]) {
103 buffer[c] = L'*';
104 }
105 // Substitute Umlauts.
106 else if (buffer[c] == L'\x00e4') {
107 buffer[c] = L'a';
108 } else if (buffer[c] == L'\x00f6') {
109 buffer[c] = L'o';
110 } else if (buffer[c] == L'\x00fc') {
111 buffer[c] = L'u';
112 }
113 // Fix bug so that '�' at the end of a word is replaced.
114 else if (buffer[c] == L'\x00df') {
115 buffer[c] = L's';
116 buffer.insert(c + 1, 1, L's');
117 ++substCount;
118 }
119 // Take care that at least one character is left left side from the current one
120 if (c < (int32_t)buffer.length() - 1) {
121 // Masking several common character combinations with an token
122 if (c < (int32_t)buffer.length() - 2 && buffer[c] == L's' && buffer[c + 1] == L'c' && buffer[c + 2] == L'h') {
123 buffer[c] = L'$';
124 buffer.erase(c + 1, 2);
125 substCount += 2;
126 } else if (buffer[c] == L'c' && buffer[c + 1] == L'h') {
127 buffer[c] = L'\x00a7';
128 buffer.erase(c + 1, 1);
129 ++substCount;
130 } else if (buffer[c] == L'e' && buffer[c + 1] == L'i') {
131 buffer[c] = L'%';
132 buffer.erase(c + 1, 1);
133 ++substCount;
134 } else if (buffer[c] == L'i' && buffer[c + 1] == L'e') {
135 buffer[c] = L'&';
136 buffer.erase(c + 1, 1);
137 ++substCount;
138 } else if (buffer[c] == L'i' && buffer[c + 1] == L'g') {
139 buffer[c] = L'#';
140 buffer.erase(c + 1, 1);
141 ++substCount;
142 } else if (buffer[c] == L's' && buffer[c + 1] == L't') {
143 buffer[c] = L'!';
144 buffer.erase(c + 1, 1);
145 ++substCount;
146 }
147 }
148 }
149 }
150
resubstitute()151 void GermanStemmer::resubstitute() {
152 for (int32_t c = 0; c < (int32_t)buffer.length(); ++c) {
153 if (buffer[c] == L'*') {
154 buffer[c] = buffer[c - 1];
155 } else if (buffer[c] == L'$') {
156 buffer[c] = L's';
157 buffer.insert(c + 1, L"ch");
158 } else if (buffer[c] == L'\x00a7') {
159 buffer[c] = L'c';
160 buffer.insert(c + 1, 1, L'h');
161 } else if (buffer[c] == L'%') {
162 buffer[c] = L'e';
163 buffer.insert(c + 1, 1, L'i');
164 } else if (buffer[c] == L'&') {
165 buffer[c] = L'i';
166 buffer.insert(c + 1, 1, L'e');
167 } else if (buffer[c] == L'#') {
168 buffer[c] = L'i';
169 buffer.insert(c + 1, 1, L'g');
170 } else if (buffer[c] == L'!') {
171 buffer[c] = L's';
172 buffer.insert(c + 1, 1, L't');
173 }
174 }
175 }
176
177 }
178