1 /** @file
2 * @brief N-way OR postlist with wt=max(wt_i)
3 */
4 /* Copyright (C) 2007,2009,2010,2011,2012,2013,2014 Olly Betts
5 * Copyright (C) 2009 Lemur Consulting Ltd
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #include <config.h>
23
24 #include "maxpostlist.h"
25
26 #include "debuglog.h"
27 #include "omassert.h"
28
29 using namespace std;
30
~MaxPostList()31 MaxPostList::~MaxPostList()
32 {
33 if (plist) {
34 for (size_t i = 0; i < n_kids; ++i) {
35 delete plist[i];
36 }
37 delete [] plist;
38 }
39 }
40
41 Xapian::doccount
get_termfreq_min() const42 MaxPostList::get_termfreq_min() const
43 {
44 Xapian::doccount res = plist[0]->get_termfreq_min();
45 for (size_t i = 1; i < n_kids; ++i) {
46 res = max(res, plist[i]->get_termfreq_min());
47 }
48 return res;
49 }
50
51 Xapian::doccount
get_termfreq_max() const52 MaxPostList::get_termfreq_max() const
53 {
54 Xapian::doccount res = plist[0]->get_termfreq_max();
55 for (size_t i = 1; i < n_kids; ++i) {
56 Xapian::doccount c = plist[i]->get_termfreq_max();
57 if (db_size - res <= c)
58 return db_size;
59 res += c;
60 }
61 return res;
62 }
63
64 Xapian::doccount
get_termfreq_est() const65 MaxPostList::get_termfreq_est() const
66 {
67 if (rare(db_size == 0))
68 return 0;
69
70 // We calculate the estimate assuming independence. The simplest
71 // way to calculate this seems to be a series of (n_kids - 1) pairwise
72 // calculations, which gives the same answer regardless of the order.
73 double scale = 1.0 / db_size;
74 double P_est = plist[0]->get_termfreq_est() * scale;
75 for (size_t i = 1; i < n_kids; ++i) {
76 double P_i = plist[i]->get_termfreq_est() * scale;
77 P_est += P_i - P_est * P_i;
78 }
79 return static_cast<Xapian::doccount>(P_est * db_size + 0.5);
80 }
81
82 TermFreqs
get_termfreq_est_using_stats(const Xapian::Weight::Internal & stats) const83 MaxPostList::get_termfreq_est_using_stats(
84 const Xapian::Weight::Internal & stats) const
85 {
86 // We calculate the estimate assuming independence. The simplest
87 // way to calculate this seems to be a series of (n_kids - 1) pairwise
88 // calculations, which gives the same answer regardless of the order.
89 TermFreqs freqs(plist[0]->get_termfreq_est_using_stats(stats));
90
91 // Our caller should have ensured this.
92 Assert(stats.collection_size);
93 double scale = 1.0 / stats.collection_size;
94 double P_est = freqs.termfreq * scale;
95 double rtf_scale = 0.0;
96 if (stats.rset_size != 0) {
97 rtf_scale = 1.0 / stats.rset_size;
98 }
99 double Pr_est = freqs.reltermfreq * rtf_scale;
100 // If total_length is 0, cf must always be 0 so cf_scale is irrelevant.
101 double cf_scale = 0.0;
102 if (usual(stats.total_length != 0)) {
103 cf_scale = 1.0 / stats.total_length;
104 }
105 double Pc_est = freqs.collfreq * cf_scale;
106
107 for (size_t i = 1; i < n_kids; ++i) {
108 freqs = plist[i]->get_termfreq_est_using_stats(stats);
109 double P_i = freqs.termfreq * scale;
110 P_est += P_i - P_est * P_i;
111 double Pc_i = freqs.collfreq * cf_scale;
112 Pc_est += Pc_i - Pc_est * Pc_i;
113 // If the rset is empty, Pr_est should be 0 already, so leave
114 // it alone.
115 if (stats.rset_size != 0) {
116 double Pr_i = freqs.reltermfreq * rtf_scale;
117 Pr_est += Pr_i - Pr_est * Pr_i;
118 }
119 }
120 return TermFreqs(Xapian::doccount(P_est * stats.collection_size + 0.5),
121 Xapian::doccount(Pr_est * stats.rset_size + 0.5),
122 Xapian::termcount(Pc_est * stats.total_length + 0.5));
123 }
124
125 double
get_maxweight() const126 MaxPostList::get_maxweight() const
127 {
128 return max_cached;
129 }
130
131 Xapian::docid
get_docid() const132 MaxPostList::get_docid() const
133 {
134 return did;
135 }
136
137 Xapian::termcount
get_doclength() const138 MaxPostList::get_doclength() const
139 {
140 Assert(did);
141 Xapian::termcount doclength = 0;
142 bool doclength_set = false;
143 for (size_t i = 0; i < n_kids; ++i) {
144 if (plist[i]->get_docid() == did) {
145 if (doclength_set) {
146 AssertEq(doclength, plist[i]->get_doclength());
147 } else {
148 doclength = plist[i]->get_doclength();
149 doclength_set = true;
150 }
151 }
152 }
153 Assert(doclength_set);
154 return doclength;
155 }
156
157 Xapian::termcount
get_unique_terms() const158 MaxPostList::get_unique_terms() const
159 {
160 Assert(did);
161 Xapian::termcount unique_terms = 0;
162 bool unique_terms_set = false;
163 for (size_t i = 0; i < n_kids; ++i) {
164 if (plist[i]->get_docid() == did) {
165 if (unique_terms_set) {
166 AssertEq(unique_terms, plist[i]->get_unique_terms());
167 } else {
168 unique_terms = plist[i]->get_unique_terms();
169 unique_terms_set = true;
170 }
171 }
172 }
173 Assert(unique_terms_set);
174 return unique_terms;
175 }
176
177 double
get_weight() const178 MaxPostList::get_weight() const
179 {
180 Assert(did);
181 double res = 0.0;
182 for (size_t i = 0; i < n_kids; ++i) {
183 if (plist[i]->get_docid() == did)
184 res = max(res, plist[i]->get_weight());
185 }
186 return res;
187 }
188
189 bool
at_end() const190 MaxPostList::at_end() const
191 {
192 return (did == 0);
193 }
194
195 double
recalc_maxweight()196 MaxPostList::recalc_maxweight()
197 {
198 max_cached = plist[0]->recalc_maxweight();
199 for (size_t i = 1; i < n_kids; ++i) {
200 max_cached = max(max_cached, plist[i]->recalc_maxweight());
201 }
202 return max_cached;
203 }
204
205 PostList *
next(double w_min)206 MaxPostList::next(double w_min)
207 {
208 Xapian::docid old_did = did;
209 did = 0;
210 for (size_t i = 0; i < n_kids; ++i) {
211 Xapian::docid cur_did = 0;
212 if (old_did != 0)
213 cur_did = plist[i]->get_docid();
214 if (cur_did <= old_did) {
215 PostList * res;
216 if (old_did == 0 || cur_did == old_did) {
217 res = plist[i]->next(w_min);
218 } else {
219 res = plist[i]->skip_to(old_did + 1, w_min);
220 }
221 if (res) {
222 delete plist[i];
223 plist[i] = res;
224 }
225
226 if (plist[i]->at_end()) {
227 erase_sublist(i--);
228 continue;
229 }
230
231 if (res)
232 matcher->recalc_maxweight();
233
234 cur_did = plist[i]->get_docid();
235 }
236
237 if (did == 0 || cur_did < did) {
238 did = cur_did;
239 }
240 }
241
242 if (n_kids == 1) {
243 n_kids = 0;
244 return plist[0];
245 }
246
247 return NULL;
248 }
249
250 PostList *
skip_to(Xapian::docid did_min,double w_min)251 MaxPostList::skip_to(Xapian::docid did_min, double w_min)
252 {
253 Xapian::docid old_did = did;
254 did = 0;
255 for (size_t i = 0; i < n_kids; ++i) {
256 Xapian::docid cur_did = 0;
257 if (old_did != 0)
258 cur_did = plist[i]->get_docid();
259 if (cur_did < did_min) {
260 PostList * res = plist[i]->skip_to(did_min, w_min);
261 if (res) {
262 delete plist[i];
263 plist[i] = res;
264 }
265
266 if (plist[i]->at_end()) {
267 erase_sublist(i--);
268 continue;
269 }
270
271 if (res)
272 matcher->recalc_maxweight();
273
274 cur_did = plist[i]->get_docid();
275 }
276
277 if (did == 0 || cur_did < did) {
278 did = cur_did;
279 }
280 }
281
282 if (n_kids == 1) {
283 n_kids = 0;
284 return plist[0];
285 }
286
287 return NULL;
288 }
289
290 string
get_description() const291 MaxPostList::get_description() const
292 {
293 string desc("(");
294 desc += plist[0]->get_description();
295 for (size_t i = 1; i < n_kids; ++i) {
296 desc += " MAX ";
297 desc += plist[i]->get_description();
298 }
299 desc += ')';
300 return desc;
301 }
302
303 Xapian::termcount
get_wdf() const304 MaxPostList::get_wdf() const
305 {
306 Xapian::termcount totwdf = 0;
307 for (size_t i = 0; i < n_kids; ++i) {
308 if (plist[i]->get_docid() == did)
309 totwdf += plist[i]->get_wdf();
310 }
311 return totwdf;
312 }
313
314 Xapian::termcount
count_matching_subqs() const315 MaxPostList::count_matching_subqs() const
316 {
317 return 1;
318 }
319