1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6 
7 #ifndef GERMANSTEMMER_H
8 #define GERMANSTEMMER_H
9 
10 #include "LuceneContrib.h"
11 #include "LuceneObject.h"
12 
13 namespace Lucene {
14 
15 /// A stemmer for German words.
16 ///
17 /// The algorithm is based on the report "A Fast and Simple Stemming Algorithm for German Words" by Jörg
18 /// Caumanns (joerg.caumanns at isst.fhg.de).
19 class LPPCONTRIBAPI GermanStemmer : public LuceneObject {
20 public:
21     GermanStemmer();
22     virtual ~GermanStemmer();
23 
24     LUCENE_CLASS(GermanStemmer);
25 
26 protected:
27     /// Buffer for the terms while stemming them.
28     String buffer;
29 
30     /// Amount of characters that are removed with substitute() while stemming.
31     int32_t substCount;
32 
33 public:
34     /// Stems the given term to a unique discriminator.
35     ///
36     /// @param term The term that should be stemmed.
37     /// @return Discriminator for term.
38     String stem(const String& term);
39 
40 protected:
41     /// Checks if a term could be stemmed.
42     /// @return true if, and only if, the given term consists in letters.
43     bool isStemmable();
44 
45     /// Suffix stripping (stemming) on the current term. The stripping is reduced to the seven "base"
46     /// suffixes "e", "s", "n", "t", "em", "er" and * "nd", from which all regular suffixes are build
47     /// of. The simplification causes some overstemming, and way more irregular stems, but still
48     /// provides unique.
49     /// Discriminators in the most of those cases.
50     /// The algorithm is context free, except of the length restrictions.
51     void strip();
52 
53     /// Does some optimizations on the term. This optimisations are contextual.
54     void optimize();
55 
56     /// Removes a particle denotion ("ge") from a term.
57     void removeParticleDenotion();
58 
59     /// Do some substitutions for the term to reduce overstemming:
60     ///
61     /// - Substitute Umlauts with their corresponding vowel: ��� -> aou, "�" is substituted by "ss"
62     /// - Substitute a second char of a pair of equal characters with an asterisk: ?? -> ?*
63     /// - Substitute some common character combinations with a token: sch/ch/ei/ie/ig/st -> $/�/%/&/#/!
64     void substitute();
65 
66     /// Undoes the changes made by substitute(). That are character pairs and character combinations.
67     /// Umlauts will remain as their corresponding vowel, as "�" remains as "ss".
68     void resubstitute();
69 };
70 
71 }
72 
73 #endif
74