1 /** @file
2 * @brief tests of OP_SYNONYM and OP_MAX.
3 */
4 /* Copyright 2009,2011,2014 Olly Betts
5 * Copyright 2007,2008,2009 Lemur Consulting Ltd
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
21 */
22
23 #include <config.h>
24
25 #include "api_opsynonym.h"
26
27 #include <map>
28 #include <set>
29 #include <vector>
30
31 #include <xapian.h>
32
33 #include "backendmanager.h"
34 #include "testsuite.h"
35 #include "testutils.h"
36
37 #include "apitest.h"
38
39 using namespace std;
40
41 // #######################################################################
42 // # Tests start here
43
44 struct synonym1_data_type {
45 // How many results should have the same weight when combined with
46 // OP_SYNONYM instead of OP_OR.
47 int sameweight_count;
48 // How many results should have a different weight when combined with
49 // OP_SYNONYM instead of OP_OR.
50 int diffweight_count;
51 // How many subqueries.
52 unsigned n_subqs;
53 // The subqueries (use NOQ for unused ones).
54 Xapian::Query subqs[4];
55 };
56
57 #define NOQ Xapian::Query::MatchNothing
58 static const synonym1_data_type synonym1_data[] = {
59 {
60 // Single term - all 33 results should be same weight.
61 33, 0, 1,
62 { Xapian::Query("date"), NOQ, NOQ, NOQ }
63 },
64 {
65 // Two terms, which co-occur in some documents.
66 //
67 // All 34 results should be different.
68 0, 34, 2,
69 { Xapian::Query("sky"), Xapian::Query("date"), NOQ, NOQ }
70 },
71 {
72 // Two terms which are entirely disjoint, and where the maximum weight
73 // doesn't occur in the first or second match.
74 //
75 // All 18 results should be different.
76 0, 18, 2,
77 { Xapian::Query("gutenberg"), Xapian::Query("blockhead"), NOQ, NOQ }
78 },
79 {
80 // All 34 results should be different.
81 0, 34, 2,
82 {
83 Xapian::Query("date"),
84 Xapian::Query(Xapian::Query::OP_OR,
85 Xapian::Query("sky"),
86 Xapian::Query("glove")),
87 NOQ, NOQ
88 }
89 },
90 {
91 // All 34 results should be different.
92 0, 34, 2,
93 {
94 Xapian::Query("date"),
95 Xapian::Query(Xapian::Query::OP_OR,
96 Xapian::Query("sky"),
97 Xapian::Query("date")),
98 NOQ, NOQ
99 }
100 },
101 {
102 // All 34 results should be different.
103 0, 34, 2,
104 {
105 Xapian::Query("date"),
106 Xapian::Query(Xapian::Query::OP_AND_MAYBE,
107 Xapian::Query("sky"),
108 Xapian::Query("date")),
109 NOQ, NOQ
110 }
111 },
112 {
113 // All 34 results should be different.
114 0, 34, 2,
115 {
116 Xapian::Query("date"),
117 Xapian::Query(Xapian::Query::OP_AND_NOT,
118 Xapian::Query("sky"),
119 Xapian::Query("date")),
120 NOQ, NOQ
121 }
122 },
123 {
124 // The AND only matches 1 document, so the estimated termfreq for the
125 // whole synonym works out as 33 (due to rounding), which is the same
126 // as the termfreq for "date". Therefore most of the weights are the
127 // same as just for the pure "date" search, and the only document which
128 // gets a different weight is the one also matched by "sky" (because it
129 // has a wdf boost).
130 32, 1, 2,
131 {
132 Xapian::Query("date"),
133 Xapian::Query(Xapian::Query::OP_AND,
134 Xapian::Query("sky"),
135 Xapian::Query("date")),
136 NOQ, NOQ
137 }
138 },
139 {
140 // All 34 results should be different.
141 0, 34, 2,
142 {
143 Xapian::Query("date"),
144 Xapian::Query(Xapian::Query::OP_XOR,
145 Xapian::Query("sky"),
146 Xapian::Query("date")),
147 NOQ, NOQ
148 }
149 },
150 {
151 // When the top-level operator is OR, the synonym part has an estimated
152 // termfreq of 35. When the top-level operator is SYNONYM, the whole
153 // query has an estimated termfreq of 66, which is rather bogus, but
154 // that's the current situation here (1.2 did better as it flattened
155 // this into a single OP_SYNONYM operator and then merged the two
156 // "date" terms to one with wqf=2. We've decided we shouldn't do such
157 // merging from 1.3.x on (merging to sum the scale_factors is fine, but
158 // we don't do that yet - FIXME).
159 //
160 // Anyway, this means that currently the weights are different for all
161 // matches.
162 0, 34, 2,
163 {
164 Xapian::Query("date"),
165 Xapian::Query(Xapian::Query::OP_SYNONYM,
166 Xapian::Query("sky"),
167 Xapian::Query("date")),
168 NOQ, NOQ
169 }
170 },
171 {
172 // All 35 results should be different.
173 0, 35, 4,
174 {
175 Xapian::Query("sky"),
176 Xapian::Query("date"),
177 Xapian::Query("stein"),
178 Xapian::Query("ally")
179 }
180 },
181 {
182 // The estimated term frequency for the synoynm is 2 (because the
183 // estimate for the phrase is 0), which is the same as the term
184 // frequency of "attitud". Thus, the synonym gets the same weight as
185 // "attitud", so documents with only "attitud" (but not the phrase) in
186 // them get the same wdf, and have the same total weight. There turns
187 // out to be exactly one such document.
188 1, 3, 2,
189 {
190 Xapian::Query("attitud"),
191 Xapian::Query(Xapian::Query::OP_PHRASE,
192 Xapian::Query("german"),
193 Xapian::Query("adventur")),
194 NOQ, NOQ
195 }
196 },
197 {
198 // All 54 results should be different.
199 0, 54, 2,
200 {
201 Xapian::Query("attitud"),
202 Xapian::Query(Xapian::Query::OP_OR,
203 Xapian::Query("german"),
204 Xapian::Query(Xapian::Query::OP_SYNONYM,
205 Xapian::Query("sky"),
206 Xapian::Query("date"))),
207 NOQ, NOQ
208 }
209 }
210 };
211
212 // Check a synonym search
DEFINE_TESTCASE(synonym1,backend)213 DEFINE_TESTCASE(synonym1, backend) {
214 Xapian::Database db(get_database("etext"));
215
216 TEST_REL(db.get_doclength_upper_bound(), >, 0);
217
218 const Xapian::doccount lots = 214;
219
220 for (size_t subqgroup = 0;
221 subqgroup != sizeof(synonym1_data) / sizeof(synonym1_data[0]);
222 ++subqgroup) {
223 const synonym1_data_type & data = synonym1_data[subqgroup];
224 const Xapian::Query * qlist = data.subqs;
225 const Xapian::Query * qlist_end = qlist + data.n_subqs;
226
227 // Run two queries, one joining the subqueries with OR and one joining
228 // them with SYNONYM.
229 Xapian::Enquire enquire(db);
230
231 // Do the search with OP_OR, getting all the results.
232 Xapian::Query orquery(Xapian::Query::OP_OR, qlist, qlist_end);
233 enquire.set_query(orquery);
234 Xapian::MSet ormset = enquire.get_mset(0, lots);
235
236 // Do the search with OP_SYNONYM, getting all the results.
237 Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist, qlist_end);
238 enquire.set_query(synquery);
239 Xapian::MSet synmset = enquire.get_mset(0, lots);
240
241 tout << "Comparing " << orquery << " with " << synquery << '\n';
242
243 // Check that the queries return some results.
244 TEST_NOT_EQUAL(synmset.size(), 0);
245 // Check that the queries return the same number of results.
246 TEST_EQUAL(synmset.size(), ormset.size());
247 map<Xapian::docid, double> values_or;
248 map<Xapian::docid, double> values_synonym;
249 for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
250 values_or[*ormset[i]] = ormset[i].get_weight();
251 values_synonym[*synmset[i]] = synmset[i].get_weight();
252 }
253 TEST_EQUAL(values_or.size(), values_synonym.size());
254
255 /* Check that the most of the weights for items in the "or" mset are
256 * different from those in the "synonym" mset. */
257 int same_weight = 0;
258 int different_weight = 0;
259 for (map<Xapian::docid, double>::const_iterator
260 j = values_or.begin(); j != values_or.end(); ++j) {
261 Xapian::docid did = j->first;
262 // Check that all the results in the or tree make it to the synonym
263 // tree.
264 TEST(values_synonym.find(did) != values_synonym.end());
265 if (values_or[did] == values_synonym[did]) {
266 ++same_weight;
267 } else {
268 ++different_weight;
269 }
270 }
271
272 TEST_EQUAL(different_weight, data.diffweight_count);
273 TEST_EQUAL(same_weight, data.sameweight_count);
274
275 // Do the search with synonym, but just get the top result.
276 // (Regression test - the OR subquery in the synonym postlist tree used
277 // to shortcut incorrectly, and return the wrong result here).
278 Xapian::MSet mset_top = enquire.get_mset(0, 1);
279 TEST_EQUAL(mset_top.size(), 1);
280 TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
281 }
282 }
283
284 // Regression test - test a synonym search with a MultiAndPostlist.
DEFINE_TESTCASE(synonym2,backend)285 DEFINE_TESTCASE(synonym2, backend) {
286 Xapian::Query query;
287 vector<Xapian::Query> subqueries;
288 subqueries.push_back(Xapian::Query("file"));
289 subqueries.push_back(Xapian::Query("the"));
290 subqueries.push_back(Xapian::Query("next"));
291 subqueries.push_back(Xapian::Query("reader"));
292 query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
293 subqueries.clear();
294 subqueries.push_back(query);
295 subqueries.push_back(Xapian::Query("gutenberg"));
296 query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
297
298 tout << query << '\n';
299
300 Xapian::Database db(get_database("etext"));
301 Xapian::Enquire enquire(db);
302 enquire.set_query(query);
303 Xapian::MSet mset = enquire.get_mset(0, 10);
304 tout << mset << '\n';
305
306 // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
307 double maxposs = mset.get_max_possible();
308 query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
309 enquire.set_query(query);
310 mset = enquire.get_mset(0, 10);
311 double maxposs2 = mset.get_max_possible();
312
313 TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
314 }
315
316 static void
check_msets_contain_same_docs(const Xapian::MSet & mset1,const Xapian::MSet & mset2)317 check_msets_contain_same_docs(const Xapian::MSet & mset1,
318 const Xapian::MSet & mset2)
319 {
320 TEST_EQUAL(mset1.size(), mset2.size());
321
322 set<Xapian::docid> docids;
323 for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
324 docids.insert(*mset1[i]);
325 }
326
327 // Check that all the results in mset1 are in mset2.
328 for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
329 // Check that we can erase each entry from mset2 element. Since mset1
330 // and mset2 are the same size this means we can be sure that there
331 // were no repeated docids in either (it would be a bug if there were).
332 TEST(docids.erase(*mset2[j]));
333 }
334 }
335
336 // Test a synonym search which has had its weight scaled to 0.
DEFINE_TESTCASE(synonym3,backend)337 DEFINE_TESTCASE(synonym3, backend) {
338 Xapian::Query query = Xapian::Query(Xapian::Query::OP_SYNONYM,
339 Xapian::Query("sky"),
340 Xapian::Query("date"));
341
342 Xapian::Database db(get_database("etext"));
343 Xapian::Enquire enquire(db);
344 enquire.set_query(query);
345 Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
346
347 tout << query << '\n';
348 tout << mset_orig << '\n';
349
350 // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
351 // (this has a special codepath to avoid doing the synonym calculation).
352 query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 0.0);
353 enquire.set_query(query);
354 Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
355
356 tout << query << '\n';
357 tout << mset_zero << '\n';
358
359 // Check that the queries return some results.
360 TEST_NOT_EQUAL(mset_zero.size(), 0);
361 // Check that the queries return the same document IDs, and the zero
362 // one has zero weight.
363 check_msets_contain_same_docs(mset_orig, mset_zero);
364 for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
365 TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
366 TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
367 }
368 }
369
370 // Test synonym searches combined with various operators.
DEFINE_TESTCASE(synonym4,backend)371 DEFINE_TESTCASE(synonym4, backend) {
372 Xapian::Database db(get_database("etext"));
373 Xapian::Enquire enquire(db);
374 Xapian::Query syn_query = Xapian::Query(Xapian::Query::OP_SYNONYM,
375 Xapian::Query("gutenberg"),
376 Xapian::Query("blockhead"));
377 Xapian::Query or_query = Xapian::Query(Xapian::Query::OP_OR,
378 Xapian::Query("gutenberg"),
379 Xapian::Query("blockhead"));
380 Xapian::Query date_query = Xapian::Query("date");
381
382 // Check some queries.
383 static const Xapian::Query::op operators[] = {
384 Xapian::Query::OP_AND_MAYBE,
385 Xapian::Query::OP_AND_NOT,
386 Xapian::Query::OP_AND,
387 Xapian::Query::OP_XOR,
388 Xapian::Query::OP_OR,
389 Xapian::Query::OP_SYNONYM
390 };
391 const Xapian::Query::op * end;
392 end = operators + sizeof(operators) / sizeof(operators[0]);
393 for (const Xapian::Query::op * i = operators; i != end; ++i) {
394 tout.str(string());
395 Xapian::Query query1(*i, syn_query, date_query);
396 Xapian::Query query2(*i, or_query, date_query);
397
398 enquire.set_query(query1);
399 tout << "query1:" << query1 << '\n';
400 Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
401 tout << "mset1:" << mset1 << '\n';
402 enquire.set_query(query2);
403 tout << "query2:" << query2 << '\n';
404 Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
405 tout << "mset2:" << mset2 << '\n';
406
407 TEST_NOT_EQUAL(mset1.size(), 0);
408 if (*i != Xapian::Query::OP_XOR) {
409 TEST_EQUAL(mset1[0].get_percent(), 100);
410 } else {
411 TEST(mset1[0].get_percent() != 100);
412 }
413 check_msets_contain_same_docs(mset1, mset2);
414 }
415 }
416
// Test OP_MAX: each document it matches should get the larger of the weights
// which the two subqueries give that document when run on their own, and
// every document matched by either subquery should be matched by OP_MAX.
DEFINE_TESTCASE(opmax1, backend) {
    typedef map<Xapian::docid, double> weight_map;

    Xapian::Database db(get_database("etext"));
    Xapian::Enquire enq(db);
    Xapian::Query q1("king");
    Xapian::Query q2("friedrich");
    Xapian::Query qmax(Xapian::Query::OP_MAX, q1, q2);

    // Run each subquery separately, and then the OP_MAX combination.
    enq.set_query(q1);
    Xapian::MSet mset1 = enq.get_mset(0, db.get_doccount());
    enq.set_query(q2);
    Xapian::MSet mset2 = enq.get_mset(0, db.get_doccount());
    enq.set_query(qmax);
    Xapian::MSet msetmax = enq.get_mset(0, db.get_doccount());

    // Work out the expected weight for each docid: the maximum of its
    // weights in mset1 and mset2 (or its only weight if it is in just one).
    weight_map expected_weights;
    Xapian::MSetIterator i;
    for (i = mset1.begin(); i != mset1.end(); ++i) {
        expected_weights[*i] = i.get_weight();
    }
    for (i = mset2.begin(); i != mset2.end(); ++i) {
        weight_map::iterator j = expected_weights.find(*i);
        if (j != expected_weights.end()) {
            j->second = max(j->second, i.get_weight());
        } else {
            expected_weights[*i] = i.get_weight();
        }
    }

    // Check that the weights in msetmax are the expected ones, removing each
    // entry as it is matched so that leftovers can be detected below.
    for (i = msetmax.begin(); i != msetmax.end(); ++i) {
        weight_map::iterator j = expected_weights.find(*i);
        TEST(j != expected_weights.end());
        TEST_EQUAL_DOUBLE(j->second, i.get_weight());
        expected_weights.erase(j);
        // Use '\n' rather than endl: matches the rest of this file and
        // avoids requesting a flush on every iteration.
        tout << expected_weights.size() << '\n';
    }

    // Any document in mset1 or mset2 should also be in msetmax.
    TEST_EQUAL(expected_weights.size(), 0);
}
459