1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections;
20 using System.Collections.Generic;
21 using Lucene.Net.Analysis;
22 using Lucene.Net.Util;
23 using Version = Lucene.Net.Util.Version;
24 
25 namespace Lucene.Net.Analysis.Standard
26 {
27 
28 	/// <summary> Filters <see cref="StandardTokenizer" /> with <see cref="StandardFilter" />,
29 	/// <see cref="LowerCaseFilter" /> and <see cref="StopFilter" />, using a list of English stop
30 	/// words.
31 	///
32 	/// <a name="version"/>
33 	/// <p/>
34 	/// You must specify the required <see cref="Version" /> compatibility when creating
35 	/// StandardAnalyzer:
36 	/// <list type="bullet">
37 	/// <item>As of 2.9, StopFilter preserves position increments</item>
38 	/// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
39 	/// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1608</a>)</item>
40 	/// </list>
41 	/// </summary>
42 	public class StandardAnalyzer : Analyzer
43 	{
44 		private ISet<string> stopSet;
45 
46 		/// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
47         /// See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">https://issues.apache.org/jira/browse/LUCENE-1068</a>
48 		/// </summary>
49 		private bool replaceInvalidAcronym, enableStopPositionIncrements;
50 
51 		/// <summary>An unmodifiable set containing some common English words that are usually not
52 		/// useful for searching.
53 		/// </summary>
54 		public static readonly ISet<string> STOP_WORDS_SET;
55 		private Version matchVersion;
56 
57 		/// <summary>Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET" />).
58 		/// </summary>
59 		/// <param name="matchVersion">Lucene version to match see <see cref="Version">above</see></param>
StandardAnalyzer(Version matchVersion)60 		public StandardAnalyzer(Version matchVersion)
61             : this(matchVersion, STOP_WORDS_SET)
62 		{ }
63 
64 		/// <summary>Builds an analyzer with the given stop words.</summary>
65         /// <param name="matchVersion">Lucene version to match See <see cref="Version">above</see> />
66 		///
67 		/// </param>
68 		/// <param name="stopWords">stop words
69 		/// </param>
StandardAnalyzer(Version matchVersion, ISet<string> stopWords)70 		public StandardAnalyzer(Version matchVersion, ISet<string> stopWords)
71 		{
72 			stopSet = stopWords;
73             SetOverridesTokenStreamMethod<StandardAnalyzer>();
74             enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
75             replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
76             this.matchVersion = matchVersion;
77 		}
78 
79 		/// <summary>Builds an analyzer with the stop words from the given file.</summary>
80 		/// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)">
81 		/// </seealso>
82         /// <param name="matchVersion">Lucene version to match See <see cref="Version">above</see> />
83 		///
84 		/// </param>
85 		/// <param name="stopwords">File to read stop words from
86 		/// </param>
StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)87 		public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
88             : this (matchVersion, WordlistLoader.GetWordSet(stopwords))
89 		{
90 		}
91 
92 		/// <summary>Builds an analyzer with the stop words from the given reader.</summary>
93         /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)">
94 		/// </seealso>
95         /// <param name="matchVersion">Lucene version to match See <see cref="Version">above</see> />
96 		///
97 		/// </param>
98 		/// <param name="stopwords">Reader to read stop words from
99 		/// </param>
StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)100 		public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
101             : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
102 		{ }
103 
104 		/// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" />
105 		///, a <see cref="LowerCaseFilter" /> and a <see cref="StopFilter" />.
106 		/// </summary>
TokenStream(System.String fieldName, System.IO.TextReader reader)107 		public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
108 		{
109 			StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
110 			tokenStream.MaxTokenLength = maxTokenLength;
111 			TokenStream result = new StandardFilter(tokenStream);
112 			result = new LowerCaseFilter(result);
113 			result = new StopFilter(enableStopPositionIncrements, result, stopSet);
114 			return result;
115 		}
116 
117 		private sealed class SavedStreams
118 		{
119 			internal StandardTokenizer tokenStream;
120 			internal TokenStream filteredTokenStream;
121 		}
122 
123 		/// <summary>Default maximum allowed token length </summary>
124 		public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
125 
126 		private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
127 
128 	    /// <summary> Set maximum allowed token length.  If a token is seen
129 	    /// that exceeds this length then it is discarded.  This
130 	    /// setting only takes effect the next time tokenStream or
131 	    /// reusableTokenStream is called.
132 	    /// </summary>
133 	    public virtual int MaxTokenLength
134 	    {
135 	        get { return maxTokenLength; }
136 	        set { maxTokenLength = value; }
137 	    }
138 
ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)139 	    public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
140 		{
141 			if (overridesTokenStreamMethod)
142 			{
143 				// LUCENE-1678: force fallback to tokenStream() if we
144 				// have been subclassed and that subclass overrides
145 				// tokenStream but not reusableTokenStream
146 				return TokenStream(fieldName, reader);
147 			}
148 			SavedStreams streams = (SavedStreams) PreviousTokenStream;
149 			if (streams == null)
150 			{
151 				streams = new SavedStreams();
152 				PreviousTokenStream = streams;
153 				streams.tokenStream = new StandardTokenizer(matchVersion, reader);
154 				streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
155 				streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
156 			    streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
157                                                              streams.filteredTokenStream, stopSet);
158 			}
159 			else
160 			{
161 				streams.tokenStream.Reset(reader);
162 			}
163 			streams.tokenStream.MaxTokenLength = maxTokenLength;
164 
165 			streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);
166 
167 			return streams.filteredTokenStream;
168 		}
StandardAnalyzer()169 		static StandardAnalyzer()
170 		{
171 			STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
172 		}
173 	}
174 }