1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6 
7 #include "LuceneInc.h"
8 #include "StandardAnalyzer.h"
9 #include "_StandardAnalyzer.h"
10 #include "StandardTokenizer.h"
11 #include "StandardFilter.h"
12 #include "LowerCaseFilter.h"
13 #include "StopAnalyzer.h"
14 #include "StopFilter.h"
15 #include "WordlistLoader.h"
16 
17 namespace Lucene {
18 
/// Default value for the maximum token length; assigned to maxTokenLength when the analyzer is constructed.
20 const int32_t StandardAnalyzer::DEFAULT_MAX_TOKEN_LENGTH = 255;
21 
StandardAnalyzer(LuceneVersion::Version matchVersion)22 StandardAnalyzer::StandardAnalyzer(LuceneVersion::Version matchVersion) {
23     ConstructAnalyser(matchVersion, StopAnalyzer::ENGLISH_STOP_WORDS_SET());
24 }
25 
StandardAnalyzer(LuceneVersion::Version matchVersion,HashSet<String> stopWords)26 StandardAnalyzer::StandardAnalyzer(LuceneVersion::Version matchVersion, HashSet<String> stopWords) {
27     ConstructAnalyser(matchVersion, stopWords);
28 }
29 
StandardAnalyzer(LuceneVersion::Version matchVersion,const String & stopwords)30 StandardAnalyzer::StandardAnalyzer(LuceneVersion::Version matchVersion, const String& stopwords) {
31     ConstructAnalyser(matchVersion, WordlistLoader::getWordSet(stopwords));
32 }
33 
StandardAnalyzer(LuceneVersion::Version matchVersion,const ReaderPtr & stopwords)34 StandardAnalyzer::StandardAnalyzer(LuceneVersion::Version matchVersion, const ReaderPtr& stopwords) {
35     ConstructAnalyser(matchVersion, WordlistLoader::getWordSet(stopwords));
36 }
37 
~StandardAnalyzer()38 StandardAnalyzer::~StandardAnalyzer() {
39 }
40 
ConstructAnalyser(LuceneVersion::Version matchVersion,HashSet<String> stopWords)41 void StandardAnalyzer::ConstructAnalyser(LuceneVersion::Version matchVersion, HashSet<String> stopWords) {
42     stopSet = stopWords;
43     enableStopPositionIncrements = StopFilter::getEnablePositionIncrementsVersionDefault(matchVersion);
44     replaceInvalidAcronym = LuceneVersion::onOrAfter(matchVersion, LuceneVersion::LUCENE_24);
45     this->matchVersion = matchVersion;
46     this->maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
47 }
48 
tokenStream(const String & fieldName,const ReaderPtr & reader)49 TokenStreamPtr StandardAnalyzer::tokenStream(const String& fieldName, const ReaderPtr& reader) {
50     StandardTokenizerPtr tokenStream(newLucene<StandardTokenizer>(matchVersion, reader));
51     tokenStream->setMaxTokenLength(maxTokenLength);
52     TokenStreamPtr result(newLucene<StandardFilter>(tokenStream));
53     result = newLucene<LowerCaseFilter>(result);
54     result = newLucene<StopFilter>(enableStopPositionIncrements, result, stopSet);
55     return result;
56 }
57 
setMaxTokenLength(int32_t length)58 void StandardAnalyzer::setMaxTokenLength(int32_t length) {
59     maxTokenLength = length;
60 }
61 
getMaxTokenLength()62 int32_t StandardAnalyzer::getMaxTokenLength() {
63     return maxTokenLength;
64 }
65 
reusableTokenStream(const String & fieldName,const ReaderPtr & reader)66 TokenStreamPtr StandardAnalyzer::reusableTokenStream(const String& fieldName, const ReaderPtr& reader) {
67     StandardAnalyzerSavedStreamsPtr streams = boost::dynamic_pointer_cast<StandardAnalyzerSavedStreams>(getPreviousTokenStream());
68     if (!streams) {
69         streams = newLucene<StandardAnalyzerSavedStreams>();
70         setPreviousTokenStream(streams);
71         streams->tokenStream = newLucene<StandardTokenizer>(matchVersion, reader);
72         streams->filteredTokenStream = newLucene<StandardFilter>(streams->tokenStream);
73         streams->filteredTokenStream = newLucene<LowerCaseFilter>(streams->filteredTokenStream);
74         streams->filteredTokenStream = newLucene<StopFilter>(enableStopPositionIncrements, streams->filteredTokenStream, stopSet);
75     } else {
76         streams->tokenStream->reset(reader);
77     }
78     streams->tokenStream->setMaxTokenLength(maxTokenLength);
79 
80     streams->tokenStream->setReplaceInvalidAcronym(replaceInvalidAcronym);
81 
82     return streams->filteredTokenStream;
83 }
84 
~StandardAnalyzerSavedStreams()85 StandardAnalyzerSavedStreams::~StandardAnalyzerSavedStreams() {
86 }
87 
88 }
89