/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
// Distributable under the terms of either the Apache License (Version 2.0)
// or the GNU Lesser General Public License.
/////////////////////////////////////////////////////////////////////////////

#ifndef MULTITERMQUERY_H
#define MULTITERMQUERY_H

#include "Query.h"

namespace Lucene {

/// An abstract {@link Query} that matches documents containing a subset of terms provided by a {@link
/// FilteredTermEnum} enumeration.
///
/// This query cannot be used directly; you must subclass it and define {@link #getEnum} to provide a
/// {@link FilteredTermEnum} that iterates through the terms to be matched.
///
/// NOTE: if {@link #setRewriteMethod} is either {@link #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} or {@link
/// #SCORING_BOOLEAN_QUERY_REWRITE}, you may encounter a {@link BooleanQuery.TooManyClauses} exception
/// during searching, which happens when the number of terms to be searched exceeds {@link
/// BooleanQuery#getMaxClauseCount()}.  Setting {@link #setRewriteMethod} to {@link
/// #CONSTANT_SCORE_FILTER_REWRITE} prevents this.
///
/// The recommended rewrite method is {@link #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}: it doesn't spend CPU
/// computing unhelpful scores, and it tries to pick the most performant rewrite method given the query.
///
/// Note that {@link QueryParser} produces MultiTermQueries using {@link #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
/// by default.
class LPPAPI MultiTermQuery : public Query {
public:
    MultiTermQuery();
    virtual ~MultiTermQuery();

    LUCENE_CLASS(MultiTermQuery);

protected:
    /// Rewrite strategy of this query.
    /// NOTE(review): presumably the value read and written by {@link #getRewriteMethod} and {@link
    /// #setRewriteMethod}; the definitions are not part of this header - confirm in the implementation.
    RewriteMethodPtr rewriteMethod;

    /// Term counter.
    /// NOTE(review): presumably the value behind {@link #getTotalNumberOfTerms}, {@link
    /// #clearTotalNumberOfTerms} and {@link #incTotalNumberOfTerms} - confirm in the implementation.
    int32_t numberOfTerms;

public:
    /// A rewrite method that first creates a private Filter, by visiting each term in sequence and marking
    /// all docs for that term.  Matching documents are assigned a constant score equal to the query's boost.
    ///
    /// This method is faster than the BooleanQuery rewrite methods when the number of matched terms or matched
    /// documents is non-trivial.  Also, it will never hit an errant TooManyClauses exception.
    ///
    /// @see #setRewriteMethod
    static RewriteMethodPtr CONSTANT_SCORE_FILTER_REWRITE();

    /// A rewrite method that first translates each term into a {@link BooleanClause.Occur#SHOULD} clause in a
    /// BooleanQuery, and keeps the scores as computed by the query.  Note that typically such scores are
    /// meaningless to the user, and require non-trivial CPU to compute, so it's almost always better to use
    /// {@link #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead.
    ///
    /// NOTE: This rewrite method will hit {@link BooleanQuery.TooManyClauses} if the number of terms exceeds
    /// {@link BooleanQuery#getMaxClauseCount}.
    ///
    /// @see #setRewriteMethod
    static RewriteMethodPtr SCORING_BOOLEAN_QUERY_REWRITE();

    /// Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except scores are not computed.  Instead, each matching
    /// document receives a constant score equal to the query's boost.
    ///
    /// NOTE: This rewrite method will hit TooManyClauses if the number of terms exceeds {@link
    /// BooleanQuery#getMaxClauseCount}.
    ///
    /// @see #setRewriteMethod
    static RewriteMethodPtr CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE();

    /// Read-only default instance of {@link ConstantScoreAutoRewrite}, with {@link
    /// ConstantScoreAutoRewrite#setTermCountCutoff} set to {@link ConstantScoreAutoRewrite#DEFAULT_TERM_COUNT_CUTOFF}
    /// and {@link ConstantScoreAutoRewrite#setDocCountPercent} set to {@link
    /// ConstantScoreAutoRewrite#DEFAULT_DOC_COUNT_PERCENT}.  Note that you cannot alter the configuration of
    /// this instance; you'll need to create a private instance instead.
    static RewriteMethodPtr CONSTANT_SCORE_AUTO_REWRITE_DEFAULT();

    /// Return the number of unique terms visited during execution of the query.  If there are many of them,
    /// you may consider using another query type or optimize your total term count in index.
    ///
    /// This method is not thread safe, be sure to only call it when no query is running!  If you re-use the
    /// same query instance for another search, be sure to first reset the term counter with {@link
    /// #clearTotalNumberOfTerms}.
    ///
    /// On optimized indexes / no MultiReaders, you get the correct number of unique terms for the whole index.
    /// Use this number to compare different queries.  For non-optimized indexes this number can also be achieved
    /// in non-constant-score mode.  In constant-score mode you get the total number of terms sought for all
    /// segments / sub-readers.
    /// @see #clearTotalNumberOfTerms
    int32_t getTotalNumberOfTerms();

    /// Resets the counting of unique terms.  Do this before executing the query/filter.
    /// @see #getTotalNumberOfTerms
    void clearTotalNumberOfTerms();

    /// Rewrites this query for the given reader.
    /// NOTE(review): presumably delegates to the configured rewrite method (see {@link #setRewriteMethod});
    /// the definition is not visible in this header - confirm in the implementation.
    /// @param reader Index reader the query is rewritten against.
    /// @return The rewritten query.
    virtual QueryPtr rewrite(const IndexReaderPtr& reader);

    /// Returns the rewrite method used when executing the query.
    /// @see #setRewriteMethod
    virtual RewriteMethodPtr getRewriteMethod();

    /// Sets the rewrite method to be used when executing the query.  You can use one of the four core methods
    /// ({@link #CONSTANT_SCORE_FILTER_REWRITE}, {@link #SCORING_BOOLEAN_QUERY_REWRITE}, {@link
    /// #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE}, {@link #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}), or implement
    /// your own subclass of {@link RewriteMethod}.
    /// @param method Rewrite method to use.
    virtual void setRewriteMethod(const RewriteMethodPtr& method);

    /// Object-protocol members (copy, hash, equality).
    /// NOTE(review): presumably these override the counterparts inherited through {@link Query}; the base
    /// declarations are not visible in this header - confirm against Query.h.
    /// @param other Optional target object for clone (defaults to an empty pointer); for equals, the
    /// object to compare with.
    virtual LuceneObjectPtr clone(const LuceneObjectPtr& other = LuceneObjectPtr());
    virtual int32_t hashCode();
    virtual bool equals(const LuceneObjectPtr& other);

protected:
    /// Construct the enumeration to be used, expanding the pattern term.  Pure virtual: every concrete
    /// subclass must provide it.
    /// @param reader Index reader whose terms are enumerated.
    virtual FilteredTermEnumPtr getEnum(const IndexReaderPtr& reader) = 0;

    /// Adjusts the term counter by the given amount.
    /// NOTE(review): presumably adds inc to the value reported by {@link #getTotalNumberOfTerms} - confirm
    /// in the implementation.
    /// @param inc Amount to add.
    void incTotalNumberOfTerms(int32_t inc);

    // Friends are granted access to the protected members of this class.
    // NOTE(review): presumably needed for getEnum / incTotalNumberOfTerms - confirm at the use sites.
    friend class MultiTermQueryWrapperFilter;
    friend class ScoringBooleanQueryRewrite;
    friend class ConstantScoreAutoRewrite;
};

/// Abstract class that defines how the query is rewritten.
class LPPAPI RewriteMethod : public LuceneObject {
public:
    virtual ~RewriteMethod();
    LUCENE_CLASS(RewriteMethod);

public:
    /// Rewrites the given multi-term query for the given reader.  Pure virtual: every concrete rewrite
    /// method must provide it.
    /// @param reader Index reader the query is rewritten against.
    /// @param query The multi-term query to rewrite.
    /// @return The rewritten query.
    virtual QueryPtr rewrite(const IndexReaderPtr& reader, const MultiTermQueryPtr& query) = 0;
};

/// A rewrite method that tries to pick the best constant-score rewrite method based on term and document
/// counts from the query.  If both the number of terms and the number of documents are small enough, then
/// {@link #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} is used.  Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE}
/// is used.
class LPPAPI ConstantScoreAutoRewrite : public RewriteMethod {
public:
    ConstantScoreAutoRewrite();
    virtual ~ConstantScoreAutoRewrite();

    LUCENE_CLASS(ConstantScoreAutoRewrite);

public:
    // Defaults derived from rough tests with a 20.0 million doc Wikipedia index.  With more than 350 terms
    // in the query, the filter method is fastest.  (The value itself is defined outside this header.)
    static const int32_t DEFAULT_TERM_COUNT_CUTOFF;

    // If the query will hit more than 1 in 1000 of the docs in the index (0.1%), the filter method is
    // fastest.  (The value itself is defined outside this header.)
    static const double DEFAULT_DOC_COUNT_PERCENT;

protected:
    /// Term-count threshold.
    /// NOTE(review): presumably the value accessed through {@link #setTermCountCutoff} / {@link
    /// #getTermCountCutoff} - confirm in the implementation.
    int32_t termCountCutoff;

    /// Document-count threshold, as a percentage.
    /// NOTE(review): presumably the value accessed through {@link #setDocCountPercent} / {@link
    /// #getDocCountPercent} - confirm in the implementation.
    double docCountPercent;

public:
    /// If the number of terms in this query is equal to or larger than this setting then {@link
    /// #CONSTANT_SCORE_FILTER_REWRITE} is used.
    /// @param count Term-count cutoff.
    virtual void setTermCountCutoff(int32_t count);

    /// Returns the term-count cutoff.
    /// @see #setTermCountCutoff
    virtual int32_t getTermCountCutoff();

    /// If the number of documents to be visited in the postings exceeds this specified percentage of the
    /// maxDoc() for the index, then {@link #CONSTANT_SCORE_FILTER_REWRITE} is used.
    /// @param percent 0.0 to 100.0
    virtual void setDocCountPercent(double percent);

    /// Returns the document-count percentage.
    /// @see #setDocCountPercent
    virtual double getDocCountPercent();

    /// Rewrites the given query for the given reader, choosing between the boolean-query and filter
    /// strategies as described in the class comment.
    /// @param reader Index reader the query is rewritten against.
    /// @param query The multi-term query to rewrite.
    /// @return The rewritten query.
    virtual QueryPtr rewrite(const IndexReaderPtr& reader, const MultiTermQueryPtr& query);

    /// Hash and equality for this rewrite method.
    /// NOTE(review): presumably derived from the two cutoff settings - confirm in the implementation.
    virtual int32_t hashCode();
    virtual bool equals(const LuceneObjectPtr& other);
};

}

#endif