1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 #include "CLucene/_ApiHeader.h"
8 #include "CLucene/util/StringBuffer.h"
9 #include "GermanStemmer.h"
10 
11 CL_NS_USE(util)
CL_NS_USE2(analysis,de)12 CL_NS_USE2(analysis,de)
13 
14     GermanStemmer::GermanStemmer() :
15       sb() {
16     }
17 
/**
 * Stems a single term using the internal working buffer sb.
 *
 * @param term   characters of the term to stem
 * @param length number of characters of term to use; when 0, the length is
 *               taken from _tcslen(term)
 * @return whatever sb.giveBuffer() returns: the unchanged copy of the term
 *         when isStemmable() rejects it, otherwise the result of the
 *         stemming passes. NOTE(review): presumably the caller takes
 *         ownership of this buffer - confirm against StringBuffer.
 */
TCHAR* GermanStemmer::stem(const TCHAR* term, size_t length) {
  // length is unsigned, so "not positive" can only mean zero.
  const size_t termLength = ( length == 0 ) ? _tcslen(term) : length;

  // Start from a fresh working copy of the term.
  sb.clear();
  sb.append(term, termLength);

  if (isStemmable(sb.getBuffer(), sb.length())) {
    // The passes operate on the shared buffer and run in this fixed order.
    substitute(sb);
    strip(sb);
    optimize(sb);
    resubstitute(sb);
    removeParticleDenotion(sb);
  }

  return sb.giveBuffer();
}
39 
isStemmable(const TCHAR * term,size_t length) const40     bool GermanStemmer::isStemmable(const TCHAR* term, size_t length) const {
41       if (length <= 0) {
42         length = _tcslen(term);
43       }
44       for (size_t c = 0; c < length; c++) {
45         if (_istalpha(term[c]) == 0)
46           return false;
47       }
48       return true;
49     }
50 
strip(StringBuffer & buffer)51     void GermanStemmer::strip(StringBuffer& buffer)
52     {
53       bool doMore = true;
54       while ( doMore && buffer.length() > 3 ) {
55         if ( ( buffer.length() + substCount > 5 ) &&
56           buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("nd"), 2 ) )
57         {
58           buffer.deleteChars( buffer.length() - 2, buffer.length() );
59         }
60         else if ( ( buffer.length() + substCount > 4 ) &&
61           buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("em"), 2 ) ) {
62             buffer.deleteChars( buffer.length() - 2, buffer.length() );
63         }
64         else if ( ( buffer.length() + substCount > 4 ) &&
65           buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("er"), 2 ) ) {
66             buffer.deleteChars( buffer.length() - 2, buffer.length() );
67         }
68         else if ( buffer.charAt( buffer.length() - 1 ) == _T('e') ) {
69           buffer.deleteCharAt( buffer.length() - 1 );
70         }
71         else if ( buffer.charAt( buffer.length() - 1 ) == _T('s') ) {
72           buffer.deleteCharAt( buffer.length() - 1 );
73         }
74         else if ( buffer.charAt( buffer.length() - 1 ) == _T('n') ) {
75           buffer.deleteCharAt( buffer.length() - 1 );
76         }
77         // "t" occurs only as suffix of verbs.
78         else if ( buffer.charAt( buffer.length() - 1 ) == _T('t') ) {
79           buffer.deleteCharAt( buffer.length() - 1 );
80         }
81         else {
82           doMore = false;
83         }
84       }
85     }
86 
optimize(StringBuffer & buffer)87     void GermanStemmer::optimize(StringBuffer& buffer) {
88       // Additional step for female plurals of professions and inhabitants.
89       if ( buffer.length() > 5 && buffer.substringEquals( buffer.length() - 5, buffer.length(), _T("erin*"), 5 ) ) {
90         buffer.deleteCharAt( buffer.length() -1 );
91         strip( buffer );
92       }
93       // Additional step for irregular plural nouns like "Matrizen -> Matrix".
94       if ( buffer.charAt( buffer.length() - 1 ) == ( _T('z') ) ) {
95         buffer.setCharAt( buffer.length() - 1, _T('x') );
96       }
97     }
98 
removeParticleDenotion(StringBuffer & buffer)99     void GermanStemmer::removeParticleDenotion(StringBuffer& buffer) {
100       if ( buffer.length() > 4 ) {
101         for ( size_t c = 0; c < buffer.length() - 3; c++ ) {
102           if ( buffer.substringEquals( c, c + 4, _T("gege"), 4 ) ) {
103             buffer.deleteChars( c, c + 2 );
104             return;
105           }
106         }
107       }
108     }
109 
substitute(StringBuffer & buffer)110     void GermanStemmer::substitute(StringBuffer& buffer) {
111       substCount = 0;
112 
113       for ( size_t i = 0; i < buffer.length(); i++ ) {
114 #ifdef _UCS2
115         TCHAR c = buffer.charAt(i);
116 #else
117         unsigned char c = buffer.charAt(i);
118 #endif
119         // Replace the second char of a pair of the equal characters with an asterisk
120         if ( i > 0 && c == buffer.charAt ( i - 1 )  ) {
121           buffer.setCharAt( i, _T('*') );
122         }
123         // Substitute Umlauts.
124         else if ( c  == 0xe4 ) {
125           buffer.setCharAt( i, _T('a') );
126         }
127         else if ( c == 0xf6 ) {
128           buffer.setCharAt( i, _T('o') );
129         }
130         else if ( c == 0xfc ) {
131           buffer.setCharAt( i, _T('u') );
132         }
133         // Fix bug so that 'ß' at the end of a word is replaced.
134         else if ( c == 0xdf ) {
135             buffer.setCharAt( i, _T('s') );
136             buffer.insert( i + 1, _T('s') );
137             substCount++;
138         }
139         // Take care that at least one character is left left side from the current one
140         if ( i < buffer.length() - 1 ) {
141           // Masking several common character combinations with an token
142           if ( ( i < buffer.length() - 2 ) && c == _T('s') &&
143             buffer.charAt( i + 1 ) == _T('c') && buffer.charAt( i + 2 ) == _T('h') )
144           {
145             buffer.setCharAt( i, _T('$') );
146             buffer.deleteChars( i + 1, i + 3 );
147             substCount += 2;
148           }
149           else if ( c == _T('c') && buffer.charAt( i + 1 ) == _T('h') ) {
150             buffer.setCharAt( i, 0xa7 ); // section sign in UTF-16
151             buffer.deleteCharAt( i + 1 );
152             substCount++;
153           }
154           else if ( c == _T('e') && buffer.charAt( i + 1 ) == _T('i') ) {
155             buffer.setCharAt( i, _T('%') );
156             buffer.deleteCharAt( i + 1 );
157             substCount++;
158           }
159           else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('e') ) {
160             buffer.setCharAt( i, _T('&') );
161             buffer.deleteCharAt( i + 1 );
162             substCount++;
163           }
164           else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('g') ) {
165             buffer.setCharAt( i, _T('#') );
166             buffer.deleteCharAt( i + 1 );
167             substCount++;
168           }
169           else if ( c == _T('s') && buffer.charAt( i + 1 ) == _T('t') ) {
170             buffer.setCharAt( i, _T('!') );
171             buffer.deleteCharAt( i + 1 );
172             substCount++;
173           }
174         }
175       }
176     }
177 
resubstitute(StringBuffer & buffer)178     void GermanStemmer::resubstitute(StringBuffer& buffer) {
179       for ( size_t i = 0; i < buffer.length(); i++ ) {
180 #ifdef _UCS2
181         TCHAR c = buffer.charAt(i);
182 #else
183         unsigned char c = buffer.charAt(i);
184 #endif
185         if ( c == _T('*') ) {
186           buffer.setCharAt( i, buffer.charAt( i - 1 ) );
187         }
188         else if ( c == _T('$') ) {
189           buffer.setCharAt( i, 's' );
190           buffer.insert( i + 1, _T("ch"), 2 );
191         }
192         else if ( c == 0xa7 ) { // section sign in UTF-16
193           buffer.setCharAt( i, _T('c') );
194           buffer.insert( i + 1, _T('h') );
195         }
196         else if ( c == _T('%') ) {
197           buffer.setCharAt( i, _T('e') );
198           buffer.insert( i + 1, _T('i') );
199         }
200         else if ( c == _T('&') ) {
201           buffer.setCharAt( i, _T('i') );
202           buffer.insert( i + 1, _T('e') );
203         }
204         else if ( c == _T('#') ) {
205           buffer.setCharAt( i, _T('i') );
206           buffer.insert( i + 1, _T('g') );
207         }
208         else if ( c == _T('!') ) {
209           buffer.setCharAt( i, _T('s') );
210           buffer.insert( i + 1, _T('t') );
211         }
212       }
213     }
214