1 /** @file
2  * @brief N-way OR postlist with wt=max(wt_i)
3  */
4 /* Copyright (C) 2007,2009,2010,2011,2012,2013,2014 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "maxpostlist.h"
25 
26 #include "debuglog.h"
27 #include "omassert.h"
28 
29 using namespace std;
30 
~MaxPostList()31 MaxPostList::~MaxPostList()
32 {
33     if (plist) {
34 	for (size_t i = 0; i < n_kids; ++i) {
35 	    delete plist[i];
36 	}
37 	delete [] plist;
38     }
39 }
40 
41 Xapian::doccount
get_termfreq_min() const42 MaxPostList::get_termfreq_min() const
43 {
44     Xapian::doccount res = plist[0]->get_termfreq_min();
45     for (size_t i = 1; i < n_kids; ++i) {
46 	res = max(res, plist[i]->get_termfreq_min());
47     }
48     return res;
49 }
50 
51 Xapian::doccount
get_termfreq_max() const52 MaxPostList::get_termfreq_max() const
53 {
54     Xapian::doccount res = plist[0]->get_termfreq_max();
55     for (size_t i = 1; i < n_kids; ++i) {
56 	Xapian::doccount c = plist[i]->get_termfreq_max();
57 	if (db_size - res <= c)
58 	    return db_size;
59 	res += c;
60     }
61     return res;
62 }
63 
64 Xapian::doccount
get_termfreq_est() const65 MaxPostList::get_termfreq_est() const
66 {
67     if (rare(db_size == 0))
68 	return 0;
69 
70     // We calculate the estimate assuming independence.  The simplest
71     // way to calculate this seems to be a series of (n_kids - 1) pairwise
72     // calculations, which gives the same answer regardless of the order.
73     double scale = 1.0 / db_size;
74     double P_est = plist[0]->get_termfreq_est() * scale;
75     for (size_t i = 1; i < n_kids; ++i) {
76 	double P_i = plist[i]->get_termfreq_est() * scale;
77 	P_est += P_i - P_est * P_i;
78     }
79     return static_cast<Xapian::doccount>(P_est * db_size + 0.5);
80 }
81 
82 TermFreqs
get_termfreq_est_using_stats(const Xapian::Weight::Internal & stats) const83 MaxPostList::get_termfreq_est_using_stats(
84 	const Xapian::Weight::Internal & stats) const
85 {
86     // We calculate the estimate assuming independence.  The simplest
87     // way to calculate this seems to be a series of (n_kids - 1) pairwise
88     // calculations, which gives the same answer regardless of the order.
89     TermFreqs freqs(plist[0]->get_termfreq_est_using_stats(stats));
90 
91     // Our caller should have ensured this.
92     Assert(stats.collection_size);
93     double scale = 1.0 / stats.collection_size;
94     double P_est = freqs.termfreq * scale;
95     double rtf_scale = 0.0;
96     if (stats.rset_size != 0) {
97 	rtf_scale = 1.0 / stats.rset_size;
98     }
99     double Pr_est = freqs.reltermfreq * rtf_scale;
100     // If total_length is 0, cf must always be 0 so cf_scale is irrelevant.
101     double cf_scale = 0.0;
102     if (usual(stats.total_length != 0)) {
103 	cf_scale = 1.0 / stats.total_length;
104     }
105     double Pc_est = freqs.collfreq * cf_scale;
106 
107     for (size_t i = 1; i < n_kids; ++i) {
108 	freqs = plist[i]->get_termfreq_est_using_stats(stats);
109 	double P_i = freqs.termfreq * scale;
110 	P_est += P_i - P_est * P_i;
111 	double Pc_i = freqs.collfreq * cf_scale;
112 	Pc_est += Pc_i - Pc_est * Pc_i;
113 	// If the rset is empty, Pr_est should be 0 already, so leave
114 	// it alone.
115 	if (stats.rset_size != 0) {
116 	    double Pr_i = freqs.reltermfreq * rtf_scale;
117 	    Pr_est += Pr_i - Pr_est * Pr_i;
118 	}
119     }
120     return TermFreqs(Xapian::doccount(P_est * stats.collection_size + 0.5),
121 		     Xapian::doccount(Pr_est * stats.rset_size + 0.5),
122 		     Xapian::termcount(Pc_est * stats.total_length + 0.5));
123 }
124 
125 double
get_maxweight() const126 MaxPostList::get_maxweight() const
127 {
128     return max_cached;
129 }
130 
131 Xapian::docid
get_docid() const132 MaxPostList::get_docid() const
133 {
134     return did;
135 }
136 
137 Xapian::termcount
get_doclength() const138 MaxPostList::get_doclength() const
139 {
140     Assert(did);
141     Xapian::termcount doclength = 0;
142     bool doclength_set = false;
143     for (size_t i = 0; i < n_kids; ++i) {
144 	if (plist[i]->get_docid() == did) {
145 	    if (doclength_set) {
146 		AssertEq(doclength, plist[i]->get_doclength());
147 	    } else {
148 		doclength = plist[i]->get_doclength();
149 		doclength_set = true;
150 	    }
151 	}
152     }
153     Assert(doclength_set);
154     return doclength;
155 }
156 
157 Xapian::termcount
get_unique_terms() const158 MaxPostList::get_unique_terms() const
159 {
160     Assert(did);
161     Xapian::termcount unique_terms = 0;
162     bool unique_terms_set = false;
163     for (size_t i = 0; i < n_kids; ++i) {
164 	if (plist[i]->get_docid() == did) {
165 	    if (unique_terms_set) {
166 		AssertEq(unique_terms, plist[i]->get_unique_terms());
167 	    } else {
168 		unique_terms = plist[i]->get_unique_terms();
169 		unique_terms_set = true;
170 	    }
171 	}
172     }
173     Assert(unique_terms_set);
174     return unique_terms;
175 }
176 
177 double
get_weight() const178 MaxPostList::get_weight() const
179 {
180     Assert(did);
181     double res = 0.0;
182     for (size_t i = 0; i < n_kids; ++i) {
183 	if (plist[i]->get_docid() == did)
184 	    res = max(res, plist[i]->get_weight());
185     }
186     return res;
187 }
188 
189 bool
at_end() const190 MaxPostList::at_end() const
191 {
192     return (did == 0);
193 }
194 
195 double
recalc_maxweight()196 MaxPostList::recalc_maxweight()
197 {
198     max_cached = plist[0]->recalc_maxweight();
199     for (size_t i = 1; i < n_kids; ++i) {
200 	max_cached = max(max_cached, plist[i]->recalc_maxweight());
201     }
202     return max_cached;
203 }
204 
205 PostList *
next(double w_min)206 MaxPostList::next(double w_min)
207 {
208     Xapian::docid old_did = did;
209     did = 0;
210     for (size_t i = 0; i < n_kids; ++i) {
211 	Xapian::docid cur_did = 0;
212 	if (old_did != 0)
213 	    cur_did = plist[i]->get_docid();
214 	if (cur_did <= old_did) {
215 	    PostList * res;
216 	    if (old_did == 0 || cur_did == old_did) {
217 		res = plist[i]->next(w_min);
218 	    } else {
219 		res = plist[i]->skip_to(old_did + 1, w_min);
220 	    }
221 	    if (res) {
222 		delete plist[i];
223 		plist[i] = res;
224 	    }
225 
226 	    if (plist[i]->at_end()) {
227 		erase_sublist(i--);
228 		continue;
229 	    }
230 
231 	    if (res)
232 		matcher->recalc_maxweight();
233 
234 	    cur_did = plist[i]->get_docid();
235 	}
236 
237 	if (did == 0 || cur_did < did) {
238 	    did = cur_did;
239 	}
240     }
241 
242     if (n_kids == 1) {
243 	n_kids = 0;
244 	return plist[0];
245     }
246 
247     return NULL;
248 }
249 
250 PostList *
skip_to(Xapian::docid did_min,double w_min)251 MaxPostList::skip_to(Xapian::docid did_min, double w_min)
252 {
253     Xapian::docid old_did = did;
254     did = 0;
255     for (size_t i = 0; i < n_kids; ++i) {
256 	Xapian::docid cur_did = 0;
257 	if (old_did != 0)
258 	    cur_did = plist[i]->get_docid();
259 	if (cur_did < did_min) {
260 	    PostList * res = plist[i]->skip_to(did_min, w_min);
261 	    if (res) {
262 		delete plist[i];
263 		plist[i] = res;
264 	    }
265 
266 	    if (plist[i]->at_end()) {
267 		erase_sublist(i--);
268 		continue;
269 	    }
270 
271 	    if (res)
272 		matcher->recalc_maxweight();
273 
274 	    cur_did = plist[i]->get_docid();
275 	}
276 
277 	if (did == 0 || cur_did < did) {
278 	    did = cur_did;
279 	}
280     }
281 
282     if (n_kids == 1) {
283 	n_kids = 0;
284 	return plist[0];
285     }
286 
287     return NULL;
288 }
289 
290 string
get_description() const291 MaxPostList::get_description() const
292 {
293     string desc("(");
294     desc += plist[0]->get_description();
295     for (size_t i = 1; i < n_kids; ++i) {
296 	desc += " MAX ";
297 	desc += plist[i]->get_description();
298     }
299     desc += ')';
300     return desc;
301 }
302 
303 Xapian::termcount
get_wdf() const304 MaxPostList::get_wdf() const
305 {
306     Xapian::termcount totwdf = 0;
307     for (size_t i = 0; i < n_kids; ++i) {
308 	if (plist[i]->get_docid() == did)
309 	    totwdf += plist[i]->get_wdf();
310     }
311     return totwdf;
312 }
313 
314 Xapian::termcount
count_matching_subqs() const315 MaxPostList::count_matching_subqs() const
316 {
317     return 1;
318 }
319