1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6
7 #include "LuceneInc.h"
8 #include "StandardAnalyzer.h"
9 #include "_StandardAnalyzer.h"
10 #include "StandardTokenizer.h"
11 #include "StandardFilter.h"
12 #include "LowerCaseFilter.h"
13 #include "StopAnalyzer.h"
14 #include "StopFilter.h"
15 #include "WordlistLoader.h"
16
17 namespace Lucene {
18
19 /// Construct an analyzer with the given stop words.
20 const int32_t StandardAnalyzer::DEFAULT_MAX_TOKEN_LENGTH = 255;
21
StandardAnalyzer(LuceneVersion::Version matchVersion)22 StandardAnalyzer::StandardAnalyzer(LuceneVersion::Version matchVersion) {
23 ConstructAnalyser(matchVersion, StopAnalyzer::ENGLISH_STOP_WORDS_SET());
24 }
25
StandardAnalyzer(LuceneVersion::Version matchVersion,HashSet<String> stopWords)26 StandardAnalyzer::StandardAnalyzer(LuceneVersion::Version matchVersion, HashSet<String> stopWords) {
27 ConstructAnalyser(matchVersion, stopWords);
28 }
29
StandardAnalyzer(LuceneVersion::Version matchVersion,const String & stopwords)30 StandardAnalyzer::StandardAnalyzer(LuceneVersion::Version matchVersion, const String& stopwords) {
31 ConstructAnalyser(matchVersion, WordlistLoader::getWordSet(stopwords));
32 }
33
StandardAnalyzer(LuceneVersion::Version matchVersion,const ReaderPtr & stopwords)34 StandardAnalyzer::StandardAnalyzer(LuceneVersion::Version matchVersion, const ReaderPtr& stopwords) {
35 ConstructAnalyser(matchVersion, WordlistLoader::getWordSet(stopwords));
36 }
37
~StandardAnalyzer()38 StandardAnalyzer::~StandardAnalyzer() {
39 }
40
ConstructAnalyser(LuceneVersion::Version matchVersion,HashSet<String> stopWords)41 void StandardAnalyzer::ConstructAnalyser(LuceneVersion::Version matchVersion, HashSet<String> stopWords) {
42 stopSet = stopWords;
43 enableStopPositionIncrements = StopFilter::getEnablePositionIncrementsVersionDefault(matchVersion);
44 replaceInvalidAcronym = LuceneVersion::onOrAfter(matchVersion, LuceneVersion::LUCENE_24);
45 this->matchVersion = matchVersion;
46 this->maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
47 }
48
tokenStream(const String & fieldName,const ReaderPtr & reader)49 TokenStreamPtr StandardAnalyzer::tokenStream(const String& fieldName, const ReaderPtr& reader) {
50 StandardTokenizerPtr tokenStream(newLucene<StandardTokenizer>(matchVersion, reader));
51 tokenStream->setMaxTokenLength(maxTokenLength);
52 TokenStreamPtr result(newLucene<StandardFilter>(tokenStream));
53 result = newLucene<LowerCaseFilter>(result);
54 result = newLucene<StopFilter>(enableStopPositionIncrements, result, stopSet);
55 return result;
56 }
57
setMaxTokenLength(int32_t length)58 void StandardAnalyzer::setMaxTokenLength(int32_t length) {
59 maxTokenLength = length;
60 }
61
getMaxTokenLength()62 int32_t StandardAnalyzer::getMaxTokenLength() {
63 return maxTokenLength;
64 }
65
reusableTokenStream(const String & fieldName,const ReaderPtr & reader)66 TokenStreamPtr StandardAnalyzer::reusableTokenStream(const String& fieldName, const ReaderPtr& reader) {
67 StandardAnalyzerSavedStreamsPtr streams = boost::dynamic_pointer_cast<StandardAnalyzerSavedStreams>(getPreviousTokenStream());
68 if (!streams) {
69 streams = newLucene<StandardAnalyzerSavedStreams>();
70 setPreviousTokenStream(streams);
71 streams->tokenStream = newLucene<StandardTokenizer>(matchVersion, reader);
72 streams->filteredTokenStream = newLucene<StandardFilter>(streams->tokenStream);
73 streams->filteredTokenStream = newLucene<LowerCaseFilter>(streams->filteredTokenStream);
74 streams->filteredTokenStream = newLucene<StopFilter>(enableStopPositionIncrements, streams->filteredTokenStream, stopSet);
75 } else {
76 streams->tokenStream->reset(reader);
77 }
78 streams->tokenStream->setMaxTokenLength(maxTokenLength);
79
80 streams->tokenStream->setReplaceInvalidAcronym(replaceInvalidAcronym);
81
82 return streams->filteredTokenStream;
83 }
84
~StandardAnalyzerSavedStreams()85 StandardAnalyzerSavedStreams::~StandardAnalyzerSavedStreams() {
86 }
87
88 }
89