1 ///////////////////////////////////////////////////////////////////////////// 2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved. 3 // Distributable under the terms of either the Apache License (Version 2.0) 4 // or the GNU Lesser General Public License. 5 ///////////////////////////////////////////////////////////////////////////// 6 7 #ifndef GERMANSTEMMER_H 8 #define GERMANSTEMMER_H 9 10 #include "LuceneContrib.h" 11 #include "LuceneObject.h" 12 13 namespace Lucene { 14 15 /// A stemmer for German words. 16 /// 17 /// The algorithm is based on the report "A Fast and Simple Stemming Algorithm for German Words" by Jörg 18 /// Caumanns (joerg.caumanns at isst.fhg.de). 19 class LPPCONTRIBAPI GermanStemmer : public LuceneObject { 20 public: 21 GermanStemmer(); 22 virtual ~GermanStemmer(); 23 24 LUCENE_CLASS(GermanStemmer); 25 26 protected: 27 /// Buffer for the terms while stemming them. 28 String buffer; 29 30 /// Amount of characters that are removed with substitute() while stemming. 31 int32_t substCount; 32 33 public: 34 /// Stems the given term to a unique discriminator. 35 /// 36 /// @param term The term that should be stemmed. 37 /// @return Discriminator for term. 38 String stem(const String& term); 39 40 protected: 41 /// Checks if a term could be stemmed. 42 /// @return true if, and only if, the given term consists in letters. 43 bool isStemmable(); 44 45 /// Suffix stripping (stemming) on the current term. The stripping is reduced to the seven "base" 46 /// suffixes "e", "s", "n", "t", "em", "er" and * "nd", from which all regular suffixes are build 47 /// of. The simplification causes some overstemming, and way more irregular stems, but still 48 /// provides unique. 49 /// Discriminators in the most of those cases. 50 /// The algorithm is context free, except of the length restrictions. 51 void strip(); 52 53 /// Does some optimizations on the term. This optimisations are contextual. 54 void optimize(); 55 56 /// Removes a particle denotion ("ge") from a term. 57 void removeParticleDenotion(); 58 59 /// Do some substitutions for the term to reduce overstemming: 60 /// 61 /// - Substitute Umlauts with their corresponding vowel: ��� -> aou, "�" is substituted by "ss" 62 /// - Substitute a second char of a pair of equal characters with an asterisk: ?? -> ?* 63 /// - Substitute some common character combinations with a token: sch/ch/ei/ie/ig/st -> $/�/%/&/#/! 64 void substitute(); 65 66 /// Undoes the changes made by substitute(). That are character pairs and character combinations. 67 /// Umlauts will remain as their corresponding vowel, as "�" remains as "ss". 68 void resubstitute(); 69 }; 70 71 } 72 73 #endif 74