1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 #include "CLucene/_ApiHeader.h"
8 #include "CLucene/util/StringBuffer.h"
9 #include "GermanStemmer.h"
10
11 CL_NS_USE(util)
CL_NS_USE2(analysis,de)12 CL_NS_USE2(analysis,de)
13
14 GermanStemmer::GermanStemmer() :
15 sb() {
16 }
17
/**
 * Stems the given term.
 *
 * @param term   the term to stem
 * @param length number of characters of term to use; 0 means "measure the
 *               term with _tcslen()"
 * @return the buffer released by sb.giveBuffer(); terms that are not
 *         stemmable (see isStemmable()) are returned as copied, without
 *         modification.
 *         NOTE(review): presumably the caller owns the returned buffer —
 *         confirm against StringBuffer::giveBuffer().
 */
TCHAR* GermanStemmer::stem(const TCHAR* term, size_t length) {
    // A zero length asks us to determine the term's length ourselves.
    const size_t termLength = (length == 0) ? _tcslen(term) : length;

    // Start every run from a fresh copy of the term in the working buffer.
    sb.clear();
    sb.append(term, termLength);

    // Only stemmable terms go through the stemming pipeline; the order of
    // the steps matters (substitute() must run before strip(), and
    // resubstitute() must undo the masking before the result is returned).
    if (isStemmable(sb.getBuffer(), sb.length())) {
        substitute(sb);
        strip(sb);
        optimize(sb);
        resubstitute(sb);
        removeParticleDenotion(sb);
    }

    return sb.giveBuffer();
}
39
isStemmable(const TCHAR * term,size_t length) const40 bool GermanStemmer::isStemmable(const TCHAR* term, size_t length) const {
41 if (length <= 0) {
42 length = _tcslen(term);
43 }
44 for (size_t c = 0; c < length; c++) {
45 if (_istalpha(term[c]) == 0)
46 return false;
47 }
48 return true;
49 }
50
strip(StringBuffer & buffer)51 void GermanStemmer::strip(StringBuffer& buffer)
52 {
53 bool doMore = true;
54 while ( doMore && buffer.length() > 3 ) {
55 if ( ( buffer.length() + substCount > 5 ) &&
56 buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("nd"), 2 ) )
57 {
58 buffer.deleteChars( buffer.length() - 2, buffer.length() );
59 }
60 else if ( ( buffer.length() + substCount > 4 ) &&
61 buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("em"), 2 ) ) {
62 buffer.deleteChars( buffer.length() - 2, buffer.length() );
63 }
64 else if ( ( buffer.length() + substCount > 4 ) &&
65 buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("er"), 2 ) ) {
66 buffer.deleteChars( buffer.length() - 2, buffer.length() );
67 }
68 else if ( buffer.charAt( buffer.length() - 1 ) == _T('e') ) {
69 buffer.deleteCharAt( buffer.length() - 1 );
70 }
71 else if ( buffer.charAt( buffer.length() - 1 ) == _T('s') ) {
72 buffer.deleteCharAt( buffer.length() - 1 );
73 }
74 else if ( buffer.charAt( buffer.length() - 1 ) == _T('n') ) {
75 buffer.deleteCharAt( buffer.length() - 1 );
76 }
77 // "t" occurs only as suffix of verbs.
78 else if ( buffer.charAt( buffer.length() - 1 ) == _T('t') ) {
79 buffer.deleteCharAt( buffer.length() - 1 );
80 }
81 else {
82 doMore = false;
83 }
84 }
85 }
86
optimize(StringBuffer & buffer)87 void GermanStemmer::optimize(StringBuffer& buffer) {
88 // Additional step for female plurals of professions and inhabitants.
89 if ( buffer.length() > 5 && buffer.substringEquals( buffer.length() - 5, buffer.length(), _T("erin*"), 5 ) ) {
90 buffer.deleteCharAt( buffer.length() -1 );
91 strip( buffer );
92 }
93 // Additional step for irregular plural nouns like "Matrizen -> Matrix".
94 if ( buffer.charAt( buffer.length() - 1 ) == ( _T('z') ) ) {
95 buffer.setCharAt( buffer.length() - 1, _T('x') );
96 }
97 }
98
removeParticleDenotion(StringBuffer & buffer)99 void GermanStemmer::removeParticleDenotion(StringBuffer& buffer) {
100 if ( buffer.length() > 4 ) {
101 for ( size_t c = 0; c < buffer.length() - 3; c++ ) {
102 if ( buffer.substringEquals( c, c + 4, _T("gege"), 4 ) ) {
103 buffer.deleteChars( c, c + 2 );
104 return;
105 }
106 }
107 }
108 }
109
substitute(StringBuffer & buffer)110 void GermanStemmer::substitute(StringBuffer& buffer) {
111 substCount = 0;
112
113 for ( size_t i = 0; i < buffer.length(); i++ ) {
114 #ifdef _UCS2
115 TCHAR c = buffer.charAt(i);
116 #else
117 unsigned char c = buffer.charAt(i);
118 #endif
119 // Replace the second char of a pair of the equal characters with an asterisk
120 if ( i > 0 && c == buffer.charAt ( i - 1 ) ) {
121 buffer.setCharAt( i, _T('*') );
122 }
123 // Substitute Umlauts.
124 else if ( c == 0xe4 ) {
125 buffer.setCharAt( i, _T('a') );
126 }
127 else if ( c == 0xf6 ) {
128 buffer.setCharAt( i, _T('o') );
129 }
130 else if ( c == 0xfc ) {
131 buffer.setCharAt( i, _T('u') );
132 }
133 // Fix bug so that 'ß' at the end of a word is replaced.
134 else if ( c == 0xdf ) {
135 buffer.setCharAt( i, _T('s') );
136 buffer.insert( i + 1, _T('s') );
137 substCount++;
138 }
139 // Take care that at least one character is left left side from the current one
140 if ( i < buffer.length() - 1 ) {
141 // Masking several common character combinations with an token
142 if ( ( i < buffer.length() - 2 ) && c == _T('s') &&
143 buffer.charAt( i + 1 ) == _T('c') && buffer.charAt( i + 2 ) == _T('h') )
144 {
145 buffer.setCharAt( i, _T('$') );
146 buffer.deleteChars( i + 1, i + 3 );
147 substCount += 2;
148 }
149 else if ( c == _T('c') && buffer.charAt( i + 1 ) == _T('h') ) {
150 buffer.setCharAt( i, 0xa7 ); // section sign in UTF-16
151 buffer.deleteCharAt( i + 1 );
152 substCount++;
153 }
154 else if ( c == _T('e') && buffer.charAt( i + 1 ) == _T('i') ) {
155 buffer.setCharAt( i, _T('%') );
156 buffer.deleteCharAt( i + 1 );
157 substCount++;
158 }
159 else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('e') ) {
160 buffer.setCharAt( i, _T('&') );
161 buffer.deleteCharAt( i + 1 );
162 substCount++;
163 }
164 else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('g') ) {
165 buffer.setCharAt( i, _T('#') );
166 buffer.deleteCharAt( i + 1 );
167 substCount++;
168 }
169 else if ( c == _T('s') && buffer.charAt( i + 1 ) == _T('t') ) {
170 buffer.setCharAt( i, _T('!') );
171 buffer.deleteCharAt( i + 1 );
172 substCount++;
173 }
174 }
175 }
176 }
177
resubstitute(StringBuffer & buffer)178 void GermanStemmer::resubstitute(StringBuffer& buffer) {
179 for ( size_t i = 0; i < buffer.length(); i++ ) {
180 #ifdef _UCS2
181 TCHAR c = buffer.charAt(i);
182 #else
183 unsigned char c = buffer.charAt(i);
184 #endif
185 if ( c == _T('*') ) {
186 buffer.setCharAt( i, buffer.charAt( i - 1 ) );
187 }
188 else if ( c == _T('$') ) {
189 buffer.setCharAt( i, 's' );
190 buffer.insert( i + 1, _T("ch"), 2 );
191 }
192 else if ( c == 0xa7 ) { // section sign in UTF-16
193 buffer.setCharAt( i, _T('c') );
194 buffer.insert( i + 1, _T('h') );
195 }
196 else if ( c == _T('%') ) {
197 buffer.setCharAt( i, _T('e') );
198 buffer.insert( i + 1, _T('i') );
199 }
200 else if ( c == _T('&') ) {
201 buffer.setCharAt( i, _T('i') );
202 buffer.insert( i + 1, _T('e') );
203 }
204 else if ( c == _T('#') ) {
205 buffer.setCharAt( i, _T('i') );
206 buffer.insert( i + 1, _T('g') );
207 }
208 else if ( c == _T('!') ) {
209 buffer.setCharAt( i, _T('s') );
210 buffer.insert( i + 1, _T('t') );
211 }
212 }
213 }
214