1 /*------------------------------------------------------------------------------ 2 * Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team 3 * 4 * Distributable under the terms of either the Apache License (Version 2.0) or 5 * the GNU Lesser General Public License, as specified in the COPYING file. 6 ------------------------------------------------------------------------------*/ 7 #include "CLucene/_ApiHeader.h" 8 #include "CLucene/util/CLStreams.h" 9 #include "CLucene/analysis/Analyzers.h" 10 #include "CLucene/analysis/standard/StandardTokenizer.h" 11 #include "CLucene/analysis/standard/StandardFilter.h" 12 #include "CLucene/util/StringBuffer.h" 13 #include "GermanAnalyzer.h" 14 #include "GermanStemmer.h" 15 #include "GermanStemFilter.h" 16 17 CL_NS_USE(analysis) 18 CL_NS_USE2(analysis,de) 19 CL_NS_USE2(analysis,standard) 20 21 const TCHAR GermanAnalyzer_DASZ[] = { 0x64, 0x61, 0xdf }; 22 const TCHAR GermanAnalyzer_FUER[] = { 0x66, 0xfc, 0x72 }; 23 const TCHAR* GermanAnalyzer_GERMAN_STOP_WORDS[] = { 24 _T("einer"), _T("eine"), _T("eines"), _T("einem"), _T("einen"), 25 _T("der"), _T("die"), _T("das"), _T("dass"), GermanAnalyzer_DASZ, 26 _T("du"), _T("er"), _T("sie"), _T("es"), 27 _T("was"), _T("wer"), _T("wie"), _T("wir"), 28 _T("und"), _T("oder"), _T("ohne"), _T("mit"), 29 _T("am"), _T("im"),_T("in"), _T("aus"), _T("auf"), 30 _T("ist"), _T("sein"), _T("war"), _T("wird"), 31 _T("ihr"), _T("ihre"), _T("ihres"), 32 _T("als"), GermanAnalyzer_FUER, _T("von"), _T("mit"), 33 _T("dich"), _T("dir"), _T("mich"), _T("mir"), 34 _T("mein"), _T("sein"), _T("kein"), 35 _T("durch"), _T("wegen"), _T("wird") 36 }; 37 38 CL_NS(util)::ConstValueArray<const TCHAR*> GermanAnalyzer::GERMAN_STOP_WORDS( GermanAnalyzer_GERMAN_STOP_WORDS, 48 ); 39 40 class GermanAnalyzer::SavedStreams : public TokenStream { 41 public: 42 StandardTokenizer* tokenStream; 43 TokenStream* filteredTokenStream; 44 SavedStreams()45 SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) 46 { 47 } 48 close()49 void close(){} next(Token * token)50 Token* next(Token* token) {return NULL;} 51 }; 52 GermanAnalyzer()53 GermanAnalyzer::GermanAnalyzer() { 54 exclusionSet = NULL; 55 stopSet = _CLNEW CLTCSetList; 56 StopFilter::fillStopTable(stopSet, GERMAN_STOP_WORDS.values); 57 } 58 GermanAnalyzer(const TCHAR ** stopwords)59 GermanAnalyzer::GermanAnalyzer(const TCHAR** stopwords) { 60 exclusionSet = NULL; 61 stopSet = _CLNEW CLTCSetList; 62 StopFilter::fillStopTable(stopSet, stopwords); 63 } 64 GermanAnalyzer(CL_NS (analysis)::CLTCSetList * stopwords)65 GermanAnalyzer::GermanAnalyzer(CL_NS(analysis)::CLTCSetList* stopwords) { 66 exclusionSet = NULL; 67 stopSet = stopwords; 68 } 69 GermanAnalyzer(const char * stopwordsFile,const char * enc)70 GermanAnalyzer::GermanAnalyzer(const char* stopwordsFile, const char* enc) { 71 exclusionSet = NULL; 72 stopSet = WordlistLoader::getWordSet(stopwordsFile, enc); 73 } 74 GermanAnalyzer(CL_NS (util)::Reader * stopwordsReader,const bool deleteReader)75 GermanAnalyzer::GermanAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool deleteReader) { 76 exclusionSet = NULL; 77 stopSet = WordlistLoader::getWordSet(stopwordsReader, NULL, deleteReader); 78 } 79 ~GermanAnalyzer()80 GermanAnalyzer::~GermanAnalyzer() { 81 _CLLDELETE(stopSet); 82 _CLLDELETE(exclusionSet); 83 } 84 setStemExclusionTable(const TCHAR ** exclusionlist)85 void GermanAnalyzer::setStemExclusionTable(const TCHAR** exclusionlist) { 86 if (exclusionSet != NULL) { 87 exclusionSet->clear(); 88 } else { 89 exclusionSet = _CLNEW CLTCSetList; 90 } 91 92 CL_NS(analysis)::StopFilter::fillStopTable(exclusionSet, exclusionlist); 93 } 94 setStemExclusionTable(CL_NS (analysis)::CLTCSetList * exclusionlist)95 void GermanAnalyzer::setStemExclusionTable(CL_NS(analysis)::CLTCSetList* exclusionlist) { 96 if (exclusionSet != exclusionlist) { 97 _CLLDELETE(exclusionSet); 98 exclusionSet = exclusionlist; 99 } 100 } 101 setStemExclusionTable(const char * exclusionlistFile,const char * enc)102 void GermanAnalyzer::setStemExclusionTable(const char* exclusionlistFile, const char* enc) { 103 exclusionSet = WordlistLoader::getWordSet(exclusionlistFile, enc, exclusionSet); 104 } 105 setStemExclusionTable(CL_NS (util)::Reader * exclusionlistReader,const bool deleteReader)106 void GermanAnalyzer::setStemExclusionTable(CL_NS(util)::Reader* exclusionlistReader, const bool deleteReader) { 107 exclusionSet = WordlistLoader::getWordSet(exclusionlistReader, exclusionSet, deleteReader); 108 } 109 tokenStream(const TCHAR * fieldName,CL_NS (util)::Reader * reader)110 TokenStream* GermanAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) { 111 TokenStream* result; 112 CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader(); 113 114 if ( bufferedReader == NULL ) 115 result = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true ); 116 else 117 result = _CLNEW StandardTokenizer(bufferedReader); 118 119 result = _CLNEW StandardFilter(result, true); 120 result = _CLNEW LowerCaseFilter(result, true); 121 result = _CLNEW StopFilter(result, true, stopSet); 122 result = _CLNEW GermanStemFilter(result, true, exclusionSet); 123 124 return result; 125 } 126 reusableTokenStream(const TCHAR * fieldName,CL_NS (util)::Reader * reader)127 TokenStream* GermanAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) 128 { 129 SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream()); 130 131 if (streams == NULL) { 132 streams = _CLNEW SavedStreams(); 133 CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader(); 134 135 if ( bufferedReader == NULL ) 136 streams->tokenStream = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true ); 137 else 138 streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader); 139 140 streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true); 141 streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true); 142 streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet); 143 streams->filteredTokenStream = _CLNEW GermanStemFilter(streams->filteredTokenStream, true, exclusionSet); 144 setPreviousTokenStream(streams); 145 } else 146 streams->tokenStream->reset(reader); 147 148 return streams->filteredTokenStream; 149 } 150