/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
using QueryParser = Lucene.Net.QueryParsers.QueryParser;
using ToStringUtils = Lucene.Net.Util.ToStringUtils;

namespace Lucene.Net.Search
{

    /// <summary> An abstract <see cref="Query" /> that matches documents
    /// containing a subset of terms provided by a <see cref="FilteredTermEnum" />
    /// enumeration.
    ///
    /// <p/>This query cannot be used directly; you must subclass
    /// it and define <see cref="GetEnum" /> to provide a <see cref="FilteredTermEnum" />
    /// that iterates through the terms to be matched.
    ///
    /// <p/><b>NOTE</b>: if <see cref="RewriteMethod" /> is either
    /// <see cref="CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE" /> or
    /// <see cref="SCORING_BOOLEAN_QUERY_REWRITE" />, you may encounter a
    /// <see cref="BooleanQuery.TooManyClauses" /> exception during
    /// searching, which happens when the number of terms to be
    /// searched exceeds <see cref="BooleanQuery.MaxClauseCount" />.
    /// Setting <see cref="RewriteMethod" /> to
    /// <see cref="CONSTANT_SCORE_FILTER_REWRITE" /> prevents this.
    ///
    /// <p/>The recommended rewrite method is
    /// <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />: it doesn't spend CPU
    /// computing unhelpful scores, and it tries to pick the most
    /// performant rewrite method given the query.
    ///
    /// Note that <see cref="QueryParser" /> produces
    /// MultiTermQueries using <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />
    /// by default.
    /// </summary>
    [Serializable]
    public abstract class MultiTermQuery : Query
    {
        /// <summary>Read-only flavour of <see cref="ConstantScoreAutoRewrite" /> used for the
        /// shared <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" /> instance: both
        /// setters throw so the shared instance cannot be reconfigured.
        /// </summary>
        [Serializable]
        public class AnonymousClassConstantScoreAutoRewrite : ConstantScoreAutoRewrite
        {
            /// <summary>Setting the cutoff is not supported on the shared instance.</summary>
            /// <exception cref="System.NotSupportedException">Always thrown by the setter.</exception>
            public override int TermCountCutoff
            {
                set { throw new System.NotSupportedException("Please create a private instance"); }
            }

            /// <summary>Setting the percentage is not supported on the shared instance.</summary>
            /// <exception cref="System.NotSupportedException">Always thrown by the setter.</exception>
            public override double DocCountPercent
            {
                set { throw new System.NotSupportedException("Please create a private instance"); }
            }

            // Make sure we are still a singleton even after deserializing
            // NOTE(review): this mirrors the Java readResolve hook; confirm that the
            // serializer in use actually invokes it.
            protected internal virtual System.Object ReadResolve()
            {
                return Lucene.Net.Search.MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
            }
        }

        /// <summary>The rewrite strategy applied by <see cref="Rewrite(IndexReader)" />.</summary>
        protected internal RewriteMethod internalRewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;

        /// <summary>Running count of terms reported through <see cref="IncTotalNumberOfTerms" />.</summary>
        [NonSerialized]
        internal int numberOfTerms = 0;

        /// <summary>Rewrites the query into a <see cref="ConstantScoreQuery" /> wrapping a
        /// <see cref="MultiTermQueryWrapperFilter{T}" />, carrying over the query's boost.
        /// </summary>
        [Serializable]
        private sealed class ConstantScoreFilterRewrite : RewriteMethod
        {
            public override Query Rewrite(IndexReader reader, MultiTermQuery query)
            {
                Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
                result.Boost = query.Boost;
                return result;
            }

            // Make sure we are still a singleton even after deserializing
            internal System.Object ReadResolve()
            {
                return Lucene.Net.Search.MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
            }
        }

        /// <summary>A rewrite method that first creates a private Filter,
        /// by visiting each term in sequence and marking all docs
        /// for that term.  Matching documents are assigned a
        /// constant score equal to the query's boost.
        ///
        /// <p/> This method is faster than the BooleanQuery
        /// rewrite methods when the number of matched terms or
        /// matched documents is non-trivial.  Also, it will never
        /// hit an errant <see cref="BooleanQuery.TooManyClauses" />
        /// exception.
        ///
        /// </summary>
        /// <seealso cref="RewriteMethod">
        /// </seealso>
        public static readonly RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite();

        /// <summary>Rewrites the query into a <see cref="BooleanQuery" /> holding one
        /// <see cref="Occur.SHOULD" /> <see cref="TermQuery" /> per enumerated term.
        /// </summary>
        [Serializable]
        private class ScoringBooleanQueryRewrite : RewriteMethod
        {
            public override Query Rewrite(IndexReader reader, MultiTermQuery query)
            {

                FilteredTermEnum enumerator = query.GetEnum(reader);
                BooleanQuery result = new BooleanQuery(true);
                int count = 0;
                try
                {
                    // The enumerator is inspected before the first Next() call, so the
                    // term it is positioned on after construction is included.
                    do
                    {
                        Term t = enumerator.Term;
                        if (t != null)
                        {
                            TermQuery tq = new TermQuery(t); // found a match
                            tq.Boost = query.Boost * enumerator.Difference(); // set the boost
                            result.Add(tq, Occur.SHOULD); // add to query
                            count++;
                        }
                    }
                    while (enumerator.Next());
                }
                finally
                {
                    // Always release the enumerator, even if adding a clause throws.
                    enumerator.Close();
                }
                query.IncTotalNumberOfTerms(count);
                return result;
            }

            // Make sure we are still a singleton even after deserializing
            protected internal virtual System.Object ReadResolve()
            {
                return Lucene.Net.Search.MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
            }
        }

        /// <summary>A rewrite method that first translates each term into
        /// <see cref="Occur.SHOULD" /> clause in a
        /// BooleanQuery, and keeps the scores as computed by the
        /// query.  Note that typically such scores are
        /// meaningless to the user, and require non-trivial CPU
        /// to compute, so it's almost always better to use
        /// <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" /> instead.
        ///
        /// <p/><b>NOTE</b>: This rewrite method will hit
        /// <see cref="BooleanQuery.TooManyClauses" /> if the number of terms
        /// exceeds <see cref="BooleanQuery.MaxClauseCount" />.
        ///
        /// </summary>
        /// <seealso cref="RewriteMethod">
        /// </seealso>
        public static readonly RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();

        /// <summary>Same expansion as <see cref="ScoringBooleanQueryRewrite" />, but the resulting
        /// BooleanQuery is wrapped so that scores are replaced by the query's boost.
        /// </summary>
        [Serializable]
        private class ConstantScoreBooleanQueryRewrite : ScoringBooleanQueryRewrite
        {
            public override Query Rewrite(IndexReader reader, MultiTermQuery query)
            {
                // strip the scores off
                Query result = new ConstantScoreQuery(new QueryWrapperFilter(base.Rewrite(reader, query)));
                result.Boost = query.Boost;
                return result;
            }

            // Make sure we are still a singleton even after deserializing
            protected internal override System.Object ReadResolve()
            {
                return Lucene.Net.Search.MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
            }
        }

        /// <summary>Like <see cref="SCORING_BOOLEAN_QUERY_REWRITE" /> except
        /// scores are not computed.  Instead, each matching
        /// document receives a constant score equal to the
        /// query's boost.
        ///
        /// <p/><b>NOTE</b>: This rewrite method will hit
        /// <see cref="BooleanQuery.TooManyClauses" /> if the number of terms
        /// exceeds <see cref="BooleanQuery.MaxClauseCount" />.
        ///
        /// </summary>
        /// <seealso cref="RewriteMethod">
        /// </seealso>
        public static readonly RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new ConstantScoreBooleanQueryRewrite();


        /// <summary>A rewrite method that tries to pick the best
        /// constant-score rewrite method based on term and
        /// document counts from the query.  If both the number of
        /// terms and documents is small enough, then
        /// <see cref="CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE" /> is used.
        /// Otherwise, <see cref="CONSTANT_SCORE_FILTER_REWRITE" /> is
        /// used.
        /// </summary>
        [Serializable]
        public class ConstantScoreAutoRewrite : RewriteMethod
        {
            /// <summary>Creates an instance configured with the current default cutoffs.</summary>
            public ConstantScoreAutoRewrite()
            {
                InitBlock();
            }

            /// <summary>Copies the static defaults into this instance's cutoff fields.</summary>
            private void InitBlock()
            {
                termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
                docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
            }

            // Defaults derived from rough tests with a 20.0 million
            // doc Wikipedia index.  With more than 350 terms in the
            // query, the filter method is fastest:
            public static int DEFAULT_TERM_COUNT_CUTOFF = 350;

            // If the query will hit more than 1 in 1000 of the docs
            // in the index (0.1%), the filter method is fastest:
            public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;

            private int termCountCutoff;
            private double docCountPercent;

            /// <summary>If the number of terms in this query is equal to or
            /// larger than this setting then <see cref="CONSTANT_SCORE_FILTER_REWRITE" />
            /// is used.
            /// </summary>
            public virtual int TermCountCutoff
            {
                get { return termCountCutoff; }
                set { termCountCutoff = value; }
            }

            /// <summary>If the number of documents to be visited in the
            /// postings exceeds this specified percentage of the
            /// MaxDoc for the index, then <see cref="CONSTANT_SCORE_FILTER_REWRITE" />
            /// is used.
            /// </summary>
            /// <value> 0.0 to 100.0 </value>
            public virtual double DocCountPercent
            {
                get { return docCountPercent; }
                set { docCountPercent = value; }
            }

            /// <summary>Enumerates the query's terms and chooses between a constant-score
            /// BooleanQuery and a constant-score filter, depending on whether the
            /// term or document cutoff is reached before the enumeration ends.
            /// </summary>
            /// <param name="reader">Index reader used for MaxDoc and per-term document frequencies.</param>
            /// <param name="query">The multi-term query being rewritten.</param>
            /// <returns>A <see cref="ConstantScoreQuery" /> carrying the query's boost.</returns>
            public override Query Rewrite(IndexReader reader, MultiTermQuery query)
            {
                // Get the enum and start visiting terms.  If we
                // exhaust the enum before hitting either of the
                // cutoffs, we use ConstantBooleanQueryRewrite; else,
                // ConstantFilterRewrite:
                ICollection<Term> pendingTerms = new List<Term>();
                int docCountCutoff = (int) ((docCountPercent / 100.0) * reader.MaxDoc);
                int termCountLimit = System.Math.Min(BooleanQuery.MaxClauseCount, termCountCutoff);
                int docVisitCount = 0;

                FilteredTermEnum enumerator = query.GetEnum(reader);
                try
                {
                    while (true)
                    {
                        Term t = enumerator.Term;
                        if (t != null)
                        {
                            pendingTerms.Add(t);
                            // Loading the TermInfo from the terms dict here
                            // should not be costly, because 1) the
                            // query/filter will load the TermInfo when it
                            // runs, and 2) the terms dict has a cache:
                            docVisitCount += reader.DocFreq(t);
                        }

                        if (pendingTerms.Count >= termCountLimit || docVisitCount >= docCountCutoff)
                        {
                            // Too many terms -- make a filter.
                            Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
                            result.Boost = query.Boost;
                            return result;
                        }
                        else if (!enumerator.Next())
                        {
                            // Enumeration is done, and we hit a small
                            // enough number of terms & docs -- just make a
                            // BooleanQuery, now
                            BooleanQuery bq = new BooleanQuery(true);
                            foreach (Term term in pendingTerms)
                            {
                                TermQuery tq = new TermQuery(term);
                                bq.Add(tq, Occur.SHOULD);
                            }
                            // Strip scores
                            Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
                            result.Boost = query.Boost;
                            query.IncTotalNumberOfTerms(pendingTerms.Count);
                            return result;
                        }
                    }
                }
                finally
                {
                    // Runs on both return paths above as well as on exceptions.
                    enumerator.Close();
                }
            }

            /// <summary>Hash derived from both cutoffs, consistent with <see cref="Equals(System.Object)" />.</summary>
            public override int GetHashCode()
            {
                int prime = 1279;
                // Hash arithmetic is expected to wrap; make that explicit so a build
                // with overflow checking enabled cannot throw here.
                unchecked
                {
                    return (int) (prime * termCountCutoff + BitConverter.DoubleToInt64Bits(docCountPercent));
                }
            }

            /// <summary>Two instances are equal when they have the same runtime type, the same
            /// term-count cutoff and bit-identical document-count percentages.
            /// </summary>
            public override bool Equals(System.Object obj)
            {
                if (ReferenceEquals(this, obj))
                    return true;
                if (obj == null)
                    return false;
                if (GetType() != obj.GetType())
                    return false;

                ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
                if (other.termCountCutoff != termCountCutoff)
                {
                    return false;
                }

                if (BitConverter.DoubleToInt64Bits(other.docCountPercent) != BitConverter.DoubleToInt64Bits(docCountPercent))
                {
                    return false;
                }

                return true;
            }
        }

        /// <summary>Read-only default instance of <see cref="ConstantScoreAutoRewrite" />,
        /// with <see cref="ConstantScoreAutoRewrite.TermCountCutoff" /> set to
        /// <see cref="ConstantScoreAutoRewrite.DEFAULT_TERM_COUNT_CUTOFF" />
        /// and <see cref="ConstantScoreAutoRewrite.DocCountPercent" /> set to
        /// <see cref="ConstantScoreAutoRewrite.DEFAULT_DOC_COUNT_PERCENT" />.
        /// Note that you cannot alter the configuration of this
        /// instance; you'll need to create a private instance
        /// instead.
        /// </summary>
        public static readonly RewriteMethod CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;

        /// <summary> Constructs a query matching terms that cannot be represented with a single
        /// Term.
        /// </summary>
        protected MultiTermQuery()
        {
        }

        /// <summary>Construct the enumeration to be used, expanding the pattern term. </summary>
        protected internal abstract FilteredTermEnum GetEnum(IndexReader reader);

        /// <summary> Expert: Return the number of unique terms visited during execution of the query.
        /// If there are many of them, you may consider using another query type
        /// or optimize your total term count in index.
        /// <p/>This method is not thread safe, be sure to only call it when no query is running!
        /// If you re-use the same query instance for another
        /// search, be sure to first reset the term counter
        /// with <see cref="ClearTotalNumberOfTerms" />.
        /// <p/>On optimized indexes / no MultiReaders, you get the correct number of
        /// unique terms for the whole index. Use this number to compare different queries.
        /// For non-optimized indexes this number can also be achieved in
        /// non-constant-score mode. In constant-score mode you get the total number of
        /// terms seeked for all segments / sub-readers.
        /// </summary>
        /// <seealso cref="ClearTotalNumberOfTerms">
        /// </seealso>
        public virtual int TotalNumberOfTerms
        {
            get { return numberOfTerms; }
        }

        /// <summary> Expert: Resets the counting of unique terms.
        /// Do this before executing the query/filter.
        /// </summary>
        /// <seealso cref="TotalNumberOfTerms">
        /// </seealso>
        public virtual void ClearTotalNumberOfTerms()
        {
            numberOfTerms = 0;
        }

        /// <summary>Adds <paramref name="inc" /> to the running term counter.</summary>
        /// <param name="inc">Number of terms to add to <see cref="TotalNumberOfTerms" />.</param>
        protected internal virtual void IncTotalNumberOfTerms(int inc)
        {
            numberOfTerms += inc;
        }

        /// <summary>Delegates rewriting to the configured <see cref="RewriteMethod" />.</summary>
        public override Query Rewrite(IndexReader reader)
        {
            return internalRewriteMethod.Rewrite(reader, this);
        }

        /// <summary> Sets the rewrite method to be used when executing the
        /// query.  You can use one of the four core methods, or
        /// implement your own subclass of <see cref="Search.RewriteMethod" />.
        /// </summary>
        public virtual RewriteMethod RewriteMethod
        {
            get { return internalRewriteMethod; }
            set { internalRewriteMethod = value; }
        }

        /// <summary>Returns the IEEE-754 bit pattern of <paramref name="value" /> as an int,
        /// with every NaN mapped to a single canonical pattern so that NaN boosts
        /// hash and compare consistently.
        /// </summary>
        private static int FloatToInt32Bits(float value)
        {
            if (float.IsNaN(value))
            {
                return 0x7fc00000;
            }
            return BitConverter.ToInt32(BitConverter.GetBytes(value), 0);
        }

        /// <summary>Hash built from the boost's bit pattern and the rewrite method,
        /// consistent with <see cref="Equals(System.Object)" />.
        /// </summary>
        public override int GetHashCode()
        {
            int prime = 31;
            // Use the exact bit pattern of the boost rather than Convert.ToInt32, which
            // rounds to the nearest integer (collapsing e.g. 1.0 and 1.4 to the same hash)
            // and throws OverflowException for values outside the Int32 range.
            unchecked
            {
                int result = 1;
                result = prime * result + FloatToInt32Bits(Boost);
                result = prime * result;
                result += internalRewriteMethod.GetHashCode();
                return result;
            }
        }

        /// <summary>Two queries are equal when they have the same runtime type, bit-identical
        /// boosts and equal rewrite methods.
        /// </summary>
        public override bool Equals(System.Object obj)
        {
            if (ReferenceEquals(this, obj))
                return true;
            if (obj == null)
                return false;
            if (GetType() != obj.GetType())
                return false;
            MultiTermQuery other = (MultiTermQuery) obj;
            // Compare exact bit patterns: rounding the boosts to integers would make
            // queries with different fractional boosts compare equal.
            if (FloatToInt32Bits(Boost) != FloatToInt32Bits(other.Boost))
                return false;
            if (!internalRewriteMethod.Equals(other.internalRewriteMethod))
            {
                return false;
            }
            return true;
        }

        static MultiTermQuery()
        {
            CONSTANT_SCORE_AUTO_REWRITE_DEFAULT = new AnonymousClassConstantScoreAutoRewrite();
        }
    }

    /// <summary>Abstract class that defines how the query is rewritten. </summary>
    [Serializable]
    public abstract class RewriteMethod
    {
        /// <summary>Produces the query to execute in place of <paramref name="query" />.</summary>
        /// <param name="reader">Index reader the rewrite is performed against.</param>
        /// <param name="query">The multi-term query to rewrite.</param>
        public abstract Query Rewrite(IndexReader reader, MultiTermQuery query);
    }
}