1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 #include "CLucene/_ApiHeader.h"
8 #include "CLucene/util/CLStreams.h"
9 #include "CLucene/analysis/Analyzers.h"
10 #include "CLucene/analysis/standard/StandardTokenizer.h"
11 #include "CLucene/analysis/standard/StandardFilter.h"
12 #include "CLucene/util/StringBuffer.h"
13 #include "GermanAnalyzer.h"
14 #include "GermanStemmer.h"
15 #include "GermanStemFilter.h"
16 
17 CL_NS_USE(analysis)
18 CL_NS_USE2(analysis,de)
19 CL_NS_USE2(analysis,standard)
20 
21   const TCHAR GermanAnalyzer_DASZ[] = { 0x64, 0x61, 0xdf };
22   const TCHAR GermanAnalyzer_FUER[] = { 0x66, 0xfc, 0x72 };
23   const TCHAR* GermanAnalyzer_GERMAN_STOP_WORDS[] = {
24     _T("einer"), _T("eine"), _T("eines"), _T("einem"), _T("einen"),
25     _T("der"), _T("die"), _T("das"), _T("dass"), GermanAnalyzer_DASZ,
26     _T("du"), _T("er"), _T("sie"), _T("es"),
27     _T("was"), _T("wer"), _T("wie"), _T("wir"),
28     _T("und"), _T("oder"), _T("ohne"), _T("mit"),
29     _T("am"), _T("im"),_T("in"), _T("aus"), _T("auf"),
30     _T("ist"), _T("sein"), _T("war"), _T("wird"),
31     _T("ihr"), _T("ihre"), _T("ihres"),
32     _T("als"), GermanAnalyzer_FUER, _T("von"), _T("mit"),
33     _T("dich"), _T("dir"), _T("mich"), _T("mir"),
34     _T("mein"), _T("sein"), _T("kein"),
35     _T("durch"), _T("wegen"), _T("wird")
36   };
37 
38   CL_NS(util)::ConstValueArray<const TCHAR*> GermanAnalyzer::GERMAN_STOP_WORDS( GermanAnalyzer_GERMAN_STOP_WORDS, 48 );
39 
40   class GermanAnalyzer::SavedStreams : public TokenStream {
41   public:
42       StandardTokenizer* tokenStream;
43       TokenStream* filteredTokenStream;
44 
SavedStreams()45       SavedStreams():tokenStream(NULL), filteredTokenStream(NULL)
46       {
47       }
48 
close()49       void close(){}
next(Token * token)50       Token* next(Token* token) {return NULL;}
51   };
52 
GermanAnalyzer()53   GermanAnalyzer::GermanAnalyzer() {
54     exclusionSet = NULL;
55     stopSet = _CLNEW CLTCSetList;
56     StopFilter::fillStopTable(stopSet, GERMAN_STOP_WORDS.values);
57   }
58 
GermanAnalyzer(const TCHAR ** stopwords)59   GermanAnalyzer::GermanAnalyzer(const TCHAR** stopwords) {
60     exclusionSet = NULL;
61     stopSet = _CLNEW CLTCSetList;
62     StopFilter::fillStopTable(stopSet, stopwords);
63   }
64 
GermanAnalyzer(CL_NS (analysis)::CLTCSetList * stopwords)65   GermanAnalyzer::GermanAnalyzer(CL_NS(analysis)::CLTCSetList* stopwords) {
66     exclusionSet = NULL;
67     stopSet = stopwords;
68   }
69 
GermanAnalyzer(const char * stopwordsFile,const char * enc)70   GermanAnalyzer::GermanAnalyzer(const char* stopwordsFile, const char* enc) {
71     exclusionSet = NULL;
72     stopSet = WordlistLoader::getWordSet(stopwordsFile, enc);
73   }
74 
GermanAnalyzer(CL_NS (util)::Reader * stopwordsReader,const bool deleteReader)75   GermanAnalyzer::GermanAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool deleteReader) {
76     exclusionSet = NULL;
77     stopSet = WordlistLoader::getWordSet(stopwordsReader, NULL, deleteReader);
78   }
79 
~GermanAnalyzer()80   GermanAnalyzer::~GermanAnalyzer() {
81     _CLLDELETE(stopSet);
82     _CLLDELETE(exclusionSet);
83   }
84 
setStemExclusionTable(const TCHAR ** exclusionlist)85   void GermanAnalyzer::setStemExclusionTable(const TCHAR** exclusionlist) {
86     if (exclusionSet != NULL) {
87       exclusionSet->clear();
88     } else {
89       exclusionSet = _CLNEW CLTCSetList;
90     }
91 
92     CL_NS(analysis)::StopFilter::fillStopTable(exclusionSet, exclusionlist);
93   }
94 
setStemExclusionTable(CL_NS (analysis)::CLTCSetList * exclusionlist)95   void GermanAnalyzer::setStemExclusionTable(CL_NS(analysis)::CLTCSetList* exclusionlist) {
96     if (exclusionSet != exclusionlist) {
97       _CLLDELETE(exclusionSet);
98       exclusionSet = exclusionlist;
99     }
100   }
101 
setStemExclusionTable(const char * exclusionlistFile,const char * enc)102   void GermanAnalyzer::setStemExclusionTable(const char* exclusionlistFile, const char* enc) {
103     exclusionSet = WordlistLoader::getWordSet(exclusionlistFile, enc, exclusionSet);
104   }
105 
setStemExclusionTable(CL_NS (util)::Reader * exclusionlistReader,const bool deleteReader)106   void GermanAnalyzer::setStemExclusionTable(CL_NS(util)::Reader* exclusionlistReader, const bool deleteReader) {
107     exclusionSet = WordlistLoader::getWordSet(exclusionlistReader, exclusionSet, deleteReader);
108   }
109 
tokenStream(const TCHAR * fieldName,CL_NS (util)::Reader * reader)110   TokenStream* GermanAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
111     TokenStream* result;
112     CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
113 
114     if ( bufferedReader == NULL )
115       result = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true );
116     else
117       result = _CLNEW StandardTokenizer(bufferedReader);
118 
119     result = _CLNEW StandardFilter(result, true);
120     result = _CLNEW LowerCaseFilter(result, true);
121     result = _CLNEW StopFilter(result, true, stopSet);
122     result = _CLNEW GermanStemFilter(result, true, exclusionSet);
123 
124     return result;
125   }
126 
reusableTokenStream(const TCHAR * fieldName,CL_NS (util)::Reader * reader)127   TokenStream* GermanAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader)
128   {
129     SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
130 
131     if (streams == NULL) {
132       streams = _CLNEW SavedStreams();
133       CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
134 
135       if ( bufferedReader == NULL )
136         streams->tokenStream = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true );
137       else
138         streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
139 
140       streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
141       streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
142       streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
143       streams->filteredTokenStream = _CLNEW GermanStemFilter(streams->filteredTokenStream, true, exclusionSet);
144       setPreviousTokenStream(streams);
145     } else
146       streams->tokenStream->reset(reader);
147 
148     return streams->filteredTokenStream;
149   }
150