1 /** @file omenquireinternal.h
2  * @brief Internals
3  */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2001,2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2016 Olly Betts
7  * Copyright 2009 Lemur Consulting Ltd
8  * Copyright 2011 Action Without Borders
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public License as
12  * published by the Free Software Foundation; either version 2 of the
13  * License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
23  * USA
24  */
25 
26 #ifndef OM_HGUARD_OMENQUIREINTERNAL_H
27 #define OM_HGUARD_OMENQUIREINTERNAL_H
28 
29 #include "xapian/database.h"
30 #include "xapian/document.h"
31 #include "xapian/enquire.h"
32 #include "xapian/query.h"
33 #include "xapian/keymaker.h"
34 
35 #include <algorithm>
36 #include <cmath>
37 #include <map>
38 #include <set>
39 #include <unordered_map>
40 
41 #include "weight/weightinternal.h"
42 
43 using namespace std;
44 
45 class OmExpand;
46 class MultiMatch;
47 
48 namespace Xapian {
49 
50 class TermIterator;
51 
52 namespace Internal {
53 
54 /** An item resulting from a query.
55  *  This item contains the document id, and the weight calculated for
56  *  the document.
57  */
58 class MSetItem {
59     public:
MSetItem(double wt_,Xapian::docid did_)60 	MSetItem(double wt_, Xapian::docid did_)
61 		: wt(wt_), did(did_), collapse_count(0) {}
62 
MSetItem(double wt_,Xapian::docid did_,const string & key_)63 	MSetItem(double wt_, Xapian::docid did_, const string &key_)
64 		: wt(wt_), did(did_), collapse_key(key_), collapse_count(0) {}
65 
MSetItem(double wt_,Xapian::docid did_,const string & key_,Xapian::doccount collapse_count_)66 	MSetItem(double wt_, Xapian::docid did_, const string &key_,
67 		 Xapian::doccount collapse_count_)
68 		: wt(wt_), did(did_), collapse_key(key_),
69 		  collapse_count(collapse_count_) {}
70 
swap(MSetItem & o)71 	void swap(MSetItem & o) {
72 	    std::swap(wt, o.wt);
73 	    std::swap(did, o.did);
74 	    std::swap(collapse_key, o.collapse_key);
75 	    std::swap(collapse_count, o.collapse_count);
76 	    std::swap(sort_key, o.sort_key);
77 	}
78 
79 	/** Weight calculated. */
80 	double wt;
81 
82 	/** Document id. */
83 	Xapian::docid did;
84 
85 	/** Value which was used to collapse upon.
86 	 *
87 	 *  If the collapse option is not being used, this will always
88 	 *  have a null value.
89 	 *
90 	 *  If the collapse option is in use, this will contain the collapse
91 	 *  key's value for this particular item.  If the key is not present
92 	 *  for this item, the value will be a null string.  Only one instance
93 	 *  of each key value (apart from the null string) will be present in
94 	 *  the items in the returned Xapian::MSet.
95 	 */
96 	string collapse_key;
97 
98 	/** Count of collapses done on collapse_key so far
99 	 *
100 	 * This is normally 0, and goes up for each collapse done
101 	 * It is not necessarily an indication of how many collapses
102 	 * might be done if an exhaustive match was done
103 	 */
104 	Xapian::doccount collapse_count;
105 
106 	/** Used when sorting by value. */
107 	string sort_key;
108 
109 	/// Return a string describing this object.
110 	string get_description() const;
111 };
112 
113 }
114 
115 /** Internals of enquire system.
116  *  This allows the implementation of Xapian::Enquire to be hidden and reference
117  *  counted.
118  */
119 class Enquire::Internal : public Xapian::Internal::intrusive_base {
120     friend class MSet::Internal;
121     private:
122 	/// The database which this enquire object uses.
123 	const Xapian::Database db;
124 
125 	/// The user's query.
126 	Query query;
127 
128 	/// The query length.
129 	termcount qlen;
130 
131 	/// Copy not allowed
132 	Internal(const Internal &);
133 	/// Assignment not allowed
134 	void operator=(const Internal &);
135 
136     public:
137 	typedef enum { REL, VAL, VAL_REL, REL_VAL } sort_setting;
138 
139 	Xapian::valueno collapse_key;
140 
141 	Xapian::doccount collapse_max;
142 
143 	Xapian::Enquire::docid_order order;
144 
145 	int percent_cutoff;
146 
147 	double weight_cutoff;
148 
149 	Xapian::valueno sort_key;
150 	sort_setting sort_by;
151 	bool sort_value_forward;
152 
153 	Xapian::Internal::opt_intrusive_ptr<KeyMaker> sorter;
154 
155 	double time_limit;
156 
157 	/** The weight to use for this query.
158 	 *
159 	 *  This is mutable so that the default BM25Weight object can be
160 	 *  created lazily when first required.
161 	 */
162 	mutable Weight * weight;
163 
164 	/// The weighting scheme to use for query expansion.
165 	std::string eweightname;
166 
167 	/// The parameter required for TradWeight query expansion.
168 	double expand_k;
169 
170 	vector<Xapian::Internal::opt_intrusive_ptr<MatchSpy>> spies;
171 
172 	explicit Internal(const Xapian::Database &databases);
173 	~Internal();
174 
175 	/** Request a document from the database.
176 	 */
177 	void request_doc(const Xapian::Internal::MSetItem &item) const;
178 
179 	/** Read a previously requested document from the database.
180 	 */
181 	Xapian::Document read_doc(const Xapian::Internal::MSetItem &item) const;
182 
183 	Xapian::Document get_document(const Xapian::Internal::MSetItem &item) const;
184 
185 	void set_query(const Query & query_, termcount qlen_);
186 	const Query & get_query() const;
187 	MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems,
188 		      Xapian::doccount check_at_least,
189 		      const RSet *omrset,
190 		      const MatchDecider *mdecider) const;
191 
192 	ESet get_eset(Xapian::termcount maxitems, const RSet & omrset, int flags,
193 		      const ExpandDecider *edecider, double min_wt) const;
194 
195 	TermIterator get_matching_terms(Xapian::docid did) const;
196 	TermIterator get_matching_terms(const Xapian::MSetIterator &it) const;
197 
198 	Xapian::doccount get_termfreq(const string &tname) const;
199 
200 	string get_description() const;
201 };
202 
203 class MSet::Internal : public Xapian::Internal::intrusive_base {
204     public:
205 	/// Factor to multiply weights by to convert them to percentages.
206 	double percent_factor;
207 
208     private:
209 	/** The set of documents which have been requested but not yet
210 	 *  collected.
211 	 */
212 	mutable set<Xapian::doccount> requested_docs;
213 
214 	/// Cache of documents, indexed by MSet index.
215 	mutable map<Xapian::doccount, Xapian::Document> indexeddocs;
216 
217 	/// Read and cache the documents so far requested.
218 	void read_docs() const;
219 
220 	/// Copy not allowed
221 	Internal(const Internal &);
222 	/// Assignment not allowed
223 	void operator=(const Internal &);
224 
225 	mutable std::unordered_map<std::string, double> snippet_bg_relevance;
226 
227     public:
228 	/// Xapian::Enquire reference, for getting documents.
229 	Xapian::Internal::intrusive_ptr<const Enquire::Internal> enquire;
230 
231 	/** Provides the term frequency and weight for each term in the query. */
232 	Xapian::Weight::Internal * stats;
233 
234 	/// A list of items comprising the (selected part of the) MSet.
235 	vector<Xapian::Internal::MSetItem> items;
236 
237 	/// Rank of first item in MSet.
238 	Xapian::doccount firstitem;
239 
240 	Xapian::doccount matches_lower_bound;
241 
242 	Xapian::doccount matches_estimated;
243 
244 	Xapian::doccount matches_upper_bound;
245 
246 	Xapian::doccount uncollapsed_lower_bound;
247 
248 	Xapian::doccount uncollapsed_estimated;
249 
250 	Xapian::doccount uncollapsed_upper_bound;
251 
252 	double max_possible;
253 
254 	double max_attained;
255 
Internal()256 	Internal()
257 		: percent_factor(0),
258 		  stats(NULL),
259 		  firstitem(0),
260 		  matches_lower_bound(0),
261 		  matches_estimated(0),
262 		  matches_upper_bound(0),
263 		  uncollapsed_lower_bound(0),
264 		  uncollapsed_estimated(0),
265 		  uncollapsed_upper_bound(0),
266 		  max_possible(0),
267 		  max_attained(0) {}
268 
269 	/// Note: destroys parameter items.
Internal(Xapian::doccount firstitem_,Xapian::doccount matches_upper_bound_,Xapian::doccount matches_lower_bound_,Xapian::doccount matches_estimated_,Xapian::doccount uncollapsed_upper_bound_,Xapian::doccount uncollapsed_lower_bound_,Xapian::doccount uncollapsed_estimated_,double max_possible_,double max_attained_,vector<Xapian::Internal::MSetItem> & items_,double percent_factor_)270 	Internal(Xapian::doccount firstitem_,
271 	     Xapian::doccount matches_upper_bound_,
272 	     Xapian::doccount matches_lower_bound_,
273 	     Xapian::doccount matches_estimated_,
274 	     Xapian::doccount uncollapsed_upper_bound_,
275 	     Xapian::doccount uncollapsed_lower_bound_,
276 	     Xapian::doccount uncollapsed_estimated_,
277 	     double max_possible_,
278 	     double max_attained_,
279 	     vector<Xapian::Internal::MSetItem> &items_,
280 	     double percent_factor_)
281 		: percent_factor(percent_factor_),
282 		  stats(NULL),
283 		  firstitem(firstitem_),
284 		  matches_lower_bound(matches_lower_bound_),
285 		  matches_estimated(matches_estimated_),
286 		  matches_upper_bound(matches_upper_bound_),
287 		  uncollapsed_lower_bound(uncollapsed_lower_bound_),
288 		  uncollapsed_estimated(uncollapsed_estimated_),
289 		  uncollapsed_upper_bound(uncollapsed_upper_bound_),
290 		  max_possible(max_possible_),
291 		  max_attained(max_attained_) {
292 	    std::swap(items, items_);
293 	}
294 
~Internal()295 	~Internal() { delete stats; }
296 
297 	/// get a document by index in MSet, via the cache.
298 	Xapian::Document get_doc_by_index(Xapian::doccount index) const;
299 
300 	/// Converts a weight to a percentage weight
301 	int convert_to_percent_internal(double wt) const;
302 
303 	std::string snippet(const std::string & text, size_t length,
304 			    const Xapian::Stem & stemmer,
305 			    unsigned flags,
306 			    const std::string & hi_start,
307 			    const std::string & hi_end,
308 			    const std::string & omit) const;
309 
310 	/// Return a string describing this object.
311 	string get_description() const;
312 
313 	/** Fetch items specified into the document cache.
314 	 */
315 	void fetch_items(Xapian::doccount first, Xapian::doccount last) const;
316 };
317 
318 class RSet::Internal : public Xapian::Internal::intrusive_base {
319     friend class Xapian::RSet;
320 
321     private:
322 	/// Items in the relevance set.
323 	set<Xapian::docid> items;
324 
325     public:
get_items()326 	const set<Xapian::docid> & get_items() const { return items; }
327 
328 	/// Return a string describing this object.
329 	string get_description() const;
330 };
331 
332 }
333 
334 #endif // OM_HGUARD_OMENQUIREINTERNAL_H
335