/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections;
using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Util;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Standard
{

    /// <summary> Filters <see cref="StandardTokenizer" /> with <see cref="StandardFilter" />,
    /// <see cref="LowerCaseFilter" /> and <see cref="StopFilter" />, using a list of English stop
    /// words.
    ///
    /// <a name="version"/>
    /// <p/>
    /// You must specify the required <see cref="Version" /> compatibility when creating
    /// StandardAnalyzer:
    /// <list type="bullet">
    /// <item>As of 2.9, StopFilter preserves position increments</item>
    /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
    /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
    /// </list>
    /// </summary>
    public class StandardAnalyzer : Analyzer
    {
        /// <summary>Stop words handed to every <see cref="StopFilter" /> this analyzer builds.
        /// Holds the very instance passed to the constructor (it is not copied).
        /// </summary>
        private readonly ISet<string> stopSet;

        /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
        /// True when the match version is on or after <c>Version.LUCENE_24</c>.
        /// See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">https://issues.apache.org/jira/browse/LUCENE-1068</a>
        /// </summary>
        private readonly bool replaceInvalidAcronym;

        /// <summary>Position-increment flag passed to <see cref="StopFilter" />; obtained from
        /// <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> for the match version.
        /// </summary>
        private readonly bool enableStopPositionIncrements;

        /// <summary>An unmodifiable set containing some common English words that are usually not
        /// useful for searching.
        /// </summary>
        public static readonly ISet<string> STOP_WORDS_SET;

        /// <summary>Compatibility version given at construction; forwarded to each
        /// <see cref="StandardTokenizer" /> this analyzer creates.
        /// </summary>
        private readonly Version matchVersion;

        /// <summary>Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET" />).
        /// </summary>
        /// <param name="matchVersion">Lucene version to match; see <see cref="Version">above</see></param>
        public StandardAnalyzer(Version matchVersion)
            : this(matchVersion, STOP_WORDS_SET)
        {
        }

        /// <summary>Builds an analyzer with the given stop words.</summary>
        /// <param name="matchVersion">Lucene version to match; see <see cref="Version">above</see></param>
        /// <param name="stopWords">stop words; the set is stored as-is, not copied</param>
        public StandardAnalyzer(Version matchVersion, ISet<string> stopWords)
        {
            stopSet = stopWords;
            SetOverridesTokenStreamMethod<StandardAnalyzer>();
            // Both flags below are derived once from the requested compatibility version.
            enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
            replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
            this.matchVersion = matchVersion;
        }

        /// <summary>Builds an analyzer with the stop words from the given file.</summary>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)" />
        /// <param name="matchVersion">Lucene version to match; see <see cref="Version">above</see></param>
        /// <param name="stopwords">File to read stop words from</param>
        public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)" />
        /// <param name="matchVersion">Lucene version to match; see <see cref="Version">above</see></param>
        /// <param name="stopwords">Reader to read stop words from</param>
        public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" />,
        /// a <see cref="LowerCaseFilter" /> and a <see cref="StopFilter" />.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not used by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The outermost filter of a newly built chain (a <see cref="StopFilter" />).</returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            // NOTE(review): unlike ReusableTokenStream, SetReplaceInvalidAcronym is not called here;
            // presumably the StandardTokenizer constructor derives it from matchVersion - confirm.
            StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
            tokenStream.MaxTokenLength = maxTokenLength;
            TokenStream result = new StandardFilter(tokenStream);
            result = new LowerCaseFilter(result);
            result = new StopFilter(enableStopPositionIncrements, result, stopSet);
            return result;
        }

        /// <summary>Holder stored in <c>PreviousTokenStream</c> so that a tokenizer and the filter
        /// chain built on top of it can be handed out again by <c>ReusableTokenStream</c>.
        /// </summary>
        private sealed class SavedStreams
        {
            /// <summary>Source tokenizer; reset with each new reader.</summary>
            internal StandardTokenizer tokenStream;

            /// <summary>Outermost filter wrapped around <see cref="tokenStream" />.</summary>
            internal TokenStream filteredTokenStream;
        }

        /// <summary>Default maximum allowed token length </summary>
        public const int DEFAULT_MAX_TOKEN_LENGTH = 255;

        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        /// <summary> Set maximum allowed token length.  If a token is seen
        /// that exceeds this length then it is discarded.  This
        /// setting only takes effect the next time <c>TokenStream</c> or
        /// <c>ReusableTokenStream</c> is called.
        /// </summary>
        public virtual int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { maxTokenLength = value; }
        }

        /// <summary>Returns a token stream for <paramref name="reader" />, reusing the tokenizer and
        /// filter chain saved in <c>PreviousTokenStream</c> when one exists.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; only forwarded to
        /// <c>TokenStream</c> on the fallback path.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The saved filter chain, or a fresh chain from <c>TokenStream</c> when
        /// <c>overridesTokenStreamMethod</c> is set.</returns>
        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to tokenStream() if we
                // have been subclassed and that subclass overrides
                // tokenStream but not reusableTokenStream
                return TokenStream(fieldName, reader);
            }

            SavedStreams streams = (SavedStreams) PreviousTokenStream;
            if (streams == null)
            {
                // First use: build the chain once and remember it for later calls.
                streams = new SavedStreams();
                PreviousTokenStream = streams;
                streams.tokenStream = new StandardTokenizer(matchVersion, reader);
                streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
                streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
                streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
                                                             streams.filteredTokenStream, stopSet);
            }
            else
            {
                // Chain already exists: just point the tokenizer at the new input.
                streams.tokenStream.Reset(reader);
            }

            // Re-applied on every call so a changed MaxTokenLength is picked up by a reused tokenizer.
            streams.tokenStream.MaxTokenLength = maxTokenLength;

            streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);

            return streams.filteredTokenStream;
        }

        /// <summary>Initializes <see cref="STOP_WORDS_SET" /> from
        /// <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c>.
        /// </summary>
        static StandardAnalyzer()
        {
            STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
        }
    }
}