1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Runtime.InteropServices;
21 using IndexReader = Lucene.Net.Index.IndexReader;
22 using Term = Lucene.Net.Index.Term;
23 using QueryParser = Lucene.Net.QueryParsers.QueryParser;
24 using ToStringUtils = Lucene.Net.Util.ToStringUtils;
25 
26 namespace Lucene.Net.Search
27 {
28 
29 	/// <summary> An abstract <see cref="Query" /> that matches documents
30 	/// containing a subset of terms provided by a <see cref="FilteredTermEnum" />
31 	/// enumeration.
32 	///
33 	/// <p/>This query cannot be used directly; you must subclass
34 	/// it and define <see cref="GetEnum" /> to provide a <see cref="FilteredTermEnum" />
35 	/// that iterates through the terms to be
36 	/// matched.
37 	///
	/// <p/><b>NOTE</b>: if <see cref="RewriteMethod" /> is either
	/// <see cref="CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE" /> or
	/// <see cref="SCORING_BOOLEAN_QUERY_REWRITE" />, you may encounter a
	/// <see cref="BooleanQuery.TooManyClauses" /> exception during
	/// searching, which happens when the number of terms to be
	/// searched exceeds <see cref="BooleanQuery.MaxClauseCount" />.
	/// Setting <see cref="RewriteMethod" /> to
	/// <see cref="CONSTANT_SCORE_FILTER_REWRITE" /> prevents this.
47 	///
	/// <p/>The recommended rewrite method is
	/// <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />: it doesn't spend CPU
	/// computing unhelpful scores, and it tries to pick the most
	/// performant rewrite method given the query.
52 	///
53 	/// Note that <see cref="QueryParser" /> produces
54 	/// MultiTermQueries using <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />
55 	/// by default.
56 	/// </summary>
57 	[Serializable]
58 	public abstract class MultiTermQuery:Query
59 	{
60 		[Serializable]
61 		public class AnonymousClassConstantScoreAutoRewrite:ConstantScoreAutoRewrite
62 		{
63 		    public override int TermCountCutoff
64 		    {
65 		        set { throw new System.NotSupportedException("Please create a private instance"); }
66 		    }
67 
68 		    public override double DocCountPercent
69 		    {
70 		        set { throw new System.NotSupportedException("Please create a private instance"); }
71 		    }
72 
73 		    // Make sure we are still a singleton even after deserializing
ReadResolve()74 			protected internal virtual System.Object ReadResolve()
75 			{
76 				return Lucene.Net.Search.MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
77 			}
78 		}
		/// <summary>Rewrite strategy that <c>Rewrite(IndexReader)</c> delegates to;
		/// starts out as <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />.
		/// </summary>
		protected internal RewriteMethod internalRewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
		/// <summary>Running term counter: increased by <c>IncTotalNumberOfTerms</c>,
		/// reset by <c>ClearTotalNumberOfTerms</c>.  Marked non-serialized.
		/// </summary>
		[NonSerialized]
		internal int numberOfTerms = 0;
82 
83 	    [Serializable]
84 		private sealed class ConstantScoreFilterRewrite:RewriteMethod
85 		{
Rewrite(IndexReader reader, MultiTermQuery query)86 			public override Query Rewrite(IndexReader reader, MultiTermQuery query)
87 			{
88 				Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
89 				result.Boost = query.Boost;
90 				return result;
91 			}
92 
93 			// Make sure we are still a singleton even after deserializing
ReadResolve()94 			internal System.Object ReadResolve()
95 			{
96 				return Lucene.Net.Search.MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
97 			}
98 		}
99 
		/// <summary>A rewrite method that first creates a private Filter,
		/// by visiting each term in sequence and marking all docs
		/// for that term.  Matching documents are assigned a
		/// constant score equal to the query's boost.
		///
		/// <p/> This method is faster than the BooleanQuery
		/// rewrite methods when the number of matched terms or
		/// matched documents is non-trivial. Also, it will never
		/// hit an errant <see cref="BooleanQuery.TooManyClauses" />
		/// exception.
		/// </summary>
		/// <seealso cref="RewriteMethod" />
		public static readonly RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite();
115 
116 		[Serializable]
117 		private class ScoringBooleanQueryRewrite:RewriteMethod
118 		{
Rewrite(IndexReader reader, MultiTermQuery query)119 			public override Query Rewrite(IndexReader reader, MultiTermQuery query)
120 			{
121 
122 				FilteredTermEnum enumerator = query.GetEnum(reader);
123 				BooleanQuery result = new BooleanQuery(true);
124 				int count = 0;
125 				try
126 				{
127 					do
128 					{
129 						Term t = enumerator.Term;
130 						if (t != null)
131 						{
132 							TermQuery tq = new TermQuery(t); // found a match
133 							tq.Boost = query.Boost * enumerator.Difference(); // set the boost
134 							result.Add(tq, Occur.SHOULD); // add to query
135 							count++;
136 						}
137 					}
138 					while (enumerator.Next());
139 				}
140 				finally
141 				{
142 					enumerator.Close();
143 				}
144 				query.IncTotalNumberOfTerms(count);
145 				return result;
146 			}
147 
148 			// Make sure we are still a singleton even after deserializing
ReadResolve()149 			protected internal virtual System.Object ReadResolve()
150 			{
151 				return Lucene.Net.Search.MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
152 			}
153 		}
154 
		/// <summary>A rewrite method that first translates each term into an
		/// <see cref="Occur.SHOULD" /> clause in a
		/// BooleanQuery, and keeps the scores as computed by the
		/// query.  Note that typically such scores are
		/// meaningless to the user, and require non-trivial CPU
		/// to compute, so it's almost always better to use
		/// <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" /> instead.
		///
		/// <p/><b>NOTE</b>: This rewrite method will hit <see cref="BooleanQuery.TooManyClauses" />
		/// if the number of terms
		/// exceeds <see cref="BooleanQuery.MaxClauseCount" />.
		/// </summary>
		/// <seealso cref="RewriteMethod" />
		public static readonly RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
171 
172 		[Serializable]
173 		private class ConstantScoreBooleanQueryRewrite:ScoringBooleanQueryRewrite
174 		{
Rewrite(IndexReader reader, MultiTermQuery query)175 			public override Query Rewrite(IndexReader reader, MultiTermQuery query)
176 			{
177 				// strip the scores off
178 				Query result = new ConstantScoreQuery(new QueryWrapperFilter(base.Rewrite(reader, query)));
179 				result.Boost = query.Boost;
180 				return result;
181 			}
182 
183 			// Make sure we are still a singleton even after deserializing
ReadResolve()184 			protected internal override System.Object ReadResolve()
185 			{
186 				return Lucene.Net.Search.MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
187 			}
188 		}
189 
		/// <summary>Like <see cref="SCORING_BOOLEAN_QUERY_REWRITE" /> except
		/// scores are not computed.  Instead, each matching
		/// document receives a constant score equal to the
		/// query's boost.
		///
		/// <p/><b>NOTE</b>: This rewrite method will hit <see cref="BooleanQuery.TooManyClauses" />
		/// if the number of terms
		/// exceeds <see cref="BooleanQuery.MaxClauseCount" />.
		/// </summary>
		/// <seealso cref="RewriteMethod" />
		public static readonly RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new ConstantScoreBooleanQueryRewrite();
203 
204 
205 		/// <summary>A rewrite method that tries to pick the best
206 		/// constant-score rewrite method based on term and
207 		/// document counts from the query.  If both the number of
208 		/// terms and documents is small enough, then <see cref="CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE" />
209 		/// is used.
210 		/// Otherwise, <see cref="CONSTANT_SCORE_FILTER_REWRITE" /> is
211 		/// used.
212 		/// </summary>
213 		[Serializable]
214 		public class ConstantScoreAutoRewrite:RewriteMethod
215 		{
ConstantScoreAutoRewrite()216 			public ConstantScoreAutoRewrite()
217 			{
218 				InitBlock();
219 			}
InitBlock()220 			private void  InitBlock()
221 			{
222 				termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
223 				docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
224 			}
225 
226 			// Defaults derived from rough tests with a 20.0 million
227 			// doc Wikipedia index.  With more than 350 terms in the
228 			// query, the filter method is fastest:
229 			public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
230 
231 			// If the query will hit more than 1 in 1000 of the docs
232 			// in the index (0.1%), the filter method is fastest:
233 			public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
234 
235 			private int termCountCutoff;
236 			private double docCountPercent;
237 
238 		    /// <summary>If the number of terms in this query is equal to or
239 		    /// larger than this setting then <see cref="CONSTANT_SCORE_FILTER_REWRITE" />
240 		    /// is used.
241 		    /// </summary>
242 		    public virtual int TermCountCutoff
243 		    {
244 		        get { return termCountCutoff; }
245 		        set { termCountCutoff = value; }
246 		    }
247 
248 		    /// <summary>If the number of documents to be visited in the
249 		    /// postings exceeds this specified percentage of the
250 		    /// MaxDoc for the index, then <see cref="CONSTANT_SCORE_FILTER_REWRITE" />
251 		    /// is used.
252 		    /// </summary>
253 		    /// <value> 0.0 to 100.0 </value>
254 		    public virtual double DocCountPercent
255 		    {
256 		        get { return docCountPercent; }
257 		        set { docCountPercent = value; }
258 		    }
259 
Rewrite(IndexReader reader, MultiTermQuery query)260 		    public override Query Rewrite(IndexReader reader, MultiTermQuery query)
261 			{
262 				// Get the enum and start visiting terms.  If we
263 				// exhaust the enum before hitting either of the
264 				// cutoffs, we use ConstantBooleanQueryRewrite; else,
265 				// ConstantFilterRewrite:
266 				ICollection<Term> pendingTerms = new List<Term>();
267 				int docCountCutoff = (int) ((docCountPercent / 100.0) * reader.MaxDoc);
268 				int termCountLimit = System.Math.Min(BooleanQuery.MaxClauseCount, termCountCutoff);
269 				int docVisitCount = 0;
270 
271 				FilteredTermEnum enumerator = query.GetEnum(reader);
272 				try
273 				{
274 					while (true)
275 					{
276 						Term t = enumerator.Term;
277 						if (t != null)
278 						{
279 							pendingTerms.Add(t);
280 							// Loading the TermInfo from the terms dict here
281 							// should not be costly, because 1) the
282 							// query/filter will load the TermInfo when it
283 							// runs, and 2) the terms dict has a cache:
284 							docVisitCount += reader.DocFreq(t);
285 						}
286 
287 						if (pendingTerms.Count >= termCountLimit || docVisitCount >= docCountCutoff)
288 						{
289 							// Too many terms -- make a filter.
290 							Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
291 							result.Boost = query.Boost;
292 							return result;
293 						}
294 						else if (!enumerator.Next())
295 						{
296 							// Enumeration is done, and we hit a small
297 							// enough number of terms & docs -- just make a
298 							// BooleanQuery, now
299 							BooleanQuery bq = new BooleanQuery(true);
300 							foreach(Term term in pendingTerms)
301 							{
302 								TermQuery tq = new TermQuery(term);
303 								bq.Add(tq, Occur.SHOULD);
304 							}
305 							// Strip scores
306 							Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
307 							result.Boost = query.Boost;
308 							query.IncTotalNumberOfTerms(pendingTerms.Count);
309 							return result;
310 						}
311 					}
312 				}
313 				finally
314 				{
315 					enumerator.Close();
316 				}
317 			}
318 
GetHashCode()319 			public override int GetHashCode()
320 			{
321 				int prime = 1279;
322 				return (int) (prime * termCountCutoff + BitConverter.DoubleToInt64Bits(docCountPercent));
323 			}
324 
Equals(System.Object obj)325 			public  override bool Equals(System.Object obj)
326 			{
327 				if (this == obj)
328 					return true;
329 				if (obj == null)
330 					return false;
331 				if (GetType() != obj.GetType())
332 					return false;
333 
334 				ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
335 				if (other.termCountCutoff != termCountCutoff)
336 				{
337 					return false;
338 				}
339 
340 				if (BitConverter.DoubleToInt64Bits(other.docCountPercent) != BitConverter.DoubleToInt64Bits(docCountPercent))
341 				{
342 					return false;
343 				}
344 
345 				return true;
346 			}
347 		}
348 
		/// <summary>Read-only default instance of <see cref="ConstantScoreAutoRewrite" />,
		/// with <see cref="ConstantScoreAutoRewrite.TermCountCutoff" /> set to
		/// <see cref="ConstantScoreAutoRewrite.DEFAULT_TERM_COUNT_CUTOFF" />
		/// and <see cref="ConstantScoreAutoRewrite.DocCountPercent" /> set to
		/// <see cref="ConstantScoreAutoRewrite.DEFAULT_DOC_COUNT_PERCENT" />.
		/// Note that you cannot alter the configuration of this
		/// instance; you'll need to create a private instance
		/// instead.
		/// </summary>
		public static readonly RewriteMethod CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
363 
		/// <summary> Constructs a query matching terms that cannot be represented with a single
		/// Term.  The constructor body itself is empty.
		/// </summary>
		protected MultiTermQuery()
		{
		}
370 
		/// <summary>Construct the enumeration to be used, expanding the pattern term. </summary>
		/// <param name="reader">The index reader the rewrite methods pass in when they enumerate terms.</param>
		/// <returns>The enumeration; the rewrite methods in this file close it when they are done.</returns>
		protected internal abstract FilteredTermEnum GetEnum(IndexReader reader);
373 
374 	    /// <summary> Expert: Return the number of unique terms visited during execution of the query.
375 	    /// If there are many of them, you may consider using another query type
376 	    /// or optimize your total term count in index.
377 	    /// <p/>This method is not thread safe, be sure to only call it when no query is running!
378 	    /// If you re-use the same query instance for another
379 	    /// search, be sure to first reset the term counter
380 	    /// with <see cref="ClearTotalNumberOfTerms" />.
381 	    /// <p/>On optimized indexes / no MultiReaders, you get the correct number of
382 	    /// unique terms for the whole index. Use this number to compare different queries.
383 	    /// For non-optimized indexes this number can also be achived in
384 	    /// non-constant-score mode. In constant-score mode you get the total number of
385 	    /// terms seeked for all segments / sub-readers.
386 	    /// </summary>
387 	    /// <seealso cref="ClearTotalNumberOfTerms">
388 	    /// </seealso>
389 	    public virtual int TotalNumberOfTerms
390 	    {
391 	        get { return numberOfTerms; }
392 	    }
393 
394 	    /// <summary> Expert: Resets the counting of unique terms.
395 		/// Do this before executing the query/filter.
396 		/// </summary>
397 		/// <seealso cref="TotalNumberOfTerms">
398 		/// </seealso>
ClearTotalNumberOfTerms()399 		public virtual void  ClearTotalNumberOfTerms()
400 		{
401 			numberOfTerms = 0;
402 		}
403 
IncTotalNumberOfTerms(int inc)404 		protected internal virtual void  IncTotalNumberOfTerms(int inc)
405 		{
406 			numberOfTerms += inc;
407 		}
408 
Rewrite(IndexReader reader)409 		public override Query Rewrite(IndexReader reader)
410 		{
411 			return internalRewriteMethod.Rewrite(reader, this);
412 		}
413 
414 	    /// <summary> Sets the rewrite method to be used when executing the
415 	    /// query.  You can use one of the four core methods, or
416 	    /// implement your own subclass of <see cref="Search.RewriteMethod" />.
417 	    /// </summary>
418 	    public virtual RewriteMethod RewriteMethod
419 	    {
420             get { return internalRewriteMethod; }
421 	        set { internalRewriteMethod = value; }
422 	    }
423 
424 	    //@Override
GetHashCode()425 		public override int GetHashCode()
426 		{
427 			int prime = 31;
428 			int result = 1;
429 			result = prime * result + System.Convert.ToInt32(Boost);
430 			result = prime * result;
431 			result += internalRewriteMethod.GetHashCode();
432 			return result;
433 		}
434 
435 		//@Override
Equals(System.Object obj)436 		public  override bool Equals(System.Object obj)
437 		{
438 			if (this == obj)
439 				return true;
440 			if (obj == null)
441 				return false;
442 			if (GetType() != obj.GetType())
443 				return false;
444 			MultiTermQuery other = (MultiTermQuery) obj;
445 			if (System.Convert.ToInt32(Boost) != System.Convert.ToInt32(other.Boost))
446 				return false;
447 			if (!internalRewriteMethod.Equals(other.internalRewriteMethod))
448 			{
449 				return false;
450 			}
451 			return true;
452 		}
		// Static constructor: creates the shared read-only auto-rewrite instance.
		// The instance field internalRewriteMethod reads this value in its initializer.
		static MultiTermQuery()
		{
			CONSTANT_SCORE_AUTO_REWRITE_DEFAULT = new AnonymousClassConstantScoreAutoRewrite();
		}
457 	}
458 
    /// <summary>Abstract class that defines how the query is rewritten. </summary>
    [Serializable]
    public abstract class RewriteMethod
    {
        /// <summary>Produces the query that should be executed in place of <paramref name="query" />.</summary>
        /// <param name="reader">Index reader the rewrite may consult.</param>
        /// <param name="query">The multi-term query to rewrite.</param>
        /// <returns>The rewritten query.</returns>
        public abstract Query Rewrite(IndexReader reader, MultiTermQuery query);
    }
465 }