1 /** @file
2  * @brief tests of OP_SYNONYM and OP_MAX.
3  */
4 /* Copyright 2009,2011,2014 Olly Betts
5  * Copyright 2007,2008,2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "api_opsynonym.h"
26 
27 #include <map>
28 #include <set>
29 #include <vector>
30 
31 #include <xapian.h>
32 
33 #include "backendmanager.h"
34 #include "testsuite.h"
35 #include "testutils.h"
36 
37 #include "apitest.h"
38 
39 using namespace std;
40 
41 // #######################################################################
42 // # Tests start here
43 
44 struct synonym1_data_type {
45     // How many results should have the same weight when combined with
46     // OP_SYNONYM instead of OP_OR.
47     int sameweight_count;
48     // How many results should have a different weight when combined with
49     // OP_SYNONYM instead of OP_OR.
50     int diffweight_count;
51     // How many subqueries.
52     unsigned n_subqs;
53     // The subqueries (use NOQ for unused ones).
54     Xapian::Query subqs[4];
55 };
56 
57 #define NOQ Xapian::Query::MatchNothing
58 static const synonym1_data_type synonym1_data[] = {
59     {
60 	// Single term - all 33 results should be same weight.
61 	33, 0, 1,
62 	{ Xapian::Query("date"), NOQ, NOQ, NOQ }
63     },
64     {
65 	// Two terms, which co-occur in some documents.
66 	//
67 	// All 34 results should be different.
68 	0, 34, 2,
69 	{ Xapian::Query("sky"), Xapian::Query("date"), NOQ, NOQ }
70     },
71     {
72 	// Two terms which are entirely disjoint, and where the maximum weight
73 	// doesn't occur in the first or second match.
74 	//
75 	// All 18 results should be different.
76 	0, 18, 2,
77 	{ Xapian::Query("gutenberg"), Xapian::Query("blockhead"), NOQ, NOQ }
78     },
79     {
80 	// All 34 results should be different.
81 	0, 34, 2,
82 	{
83 	    Xapian::Query("date"),
84 	    Xapian::Query(Xapian::Query::OP_OR,
85 			  Xapian::Query("sky"),
86 			  Xapian::Query("glove")),
87 	    NOQ, NOQ
88 	}
89     },
90     {
91 	// All 34 results should be different.
92 	0, 34, 2,
93 	{
94 	    Xapian::Query("date"),
95 	    Xapian::Query(Xapian::Query::OP_OR,
96 			  Xapian::Query("sky"),
97 			  Xapian::Query("date")),
98 	    NOQ, NOQ
99 	}
100     },
101     {
102 	// All 34 results should be different.
103 	0, 34, 2,
104 	{
105 	    Xapian::Query("date"),
106 	    Xapian::Query(Xapian::Query::OP_AND_MAYBE,
107 			  Xapian::Query("sky"),
108 			  Xapian::Query("date")),
109 	    NOQ, NOQ
110 	}
111     },
112     {
113 	// All 34 results should be different.
114 	0, 34, 2,
115 	{
116 	    Xapian::Query("date"),
117 	    Xapian::Query(Xapian::Query::OP_AND_NOT,
118 			  Xapian::Query("sky"),
119 			  Xapian::Query("date")),
120 	    NOQ, NOQ
121 	}
122     },
123     {
124 	// The AND only matches 1 document, so the estimated termfreq for the
125 	// whole synonym works out as 33 (due to rounding), which is the same
126 	// as the termfreq for "date".  Therefore most of the weights are the
127 	// same as just for the pure "date" search, and the only document which
128 	// gets a different weight is the one also matched by "sky" (because it
129 	// has a wdf boost).
130 	32, 1, 2,
131 	{
132 	    Xapian::Query("date"),
133 	    Xapian::Query(Xapian::Query::OP_AND,
134 			  Xapian::Query("sky"),
135 			  Xapian::Query("date")),
136 	    NOQ, NOQ
137 	}
138     },
139     {
140 	// All 34 results should be different.
141 	0, 34, 2,
142 	{
143 	    Xapian::Query("date"),
144 	    Xapian::Query(Xapian::Query::OP_XOR,
145 			  Xapian::Query("sky"),
146 			  Xapian::Query("date")),
147 	    NOQ, NOQ
148 	}
149     },
150     {
151 	// When the top-level operator is OR, the synonym part has an estimated
152 	// termfreq of 35.  When the top-level operator is SYNONYM, the whole
153 	// query has an estimated termfreq of 66, which is rather bogus, but
154 	// that's the current situation here (1.2 did better as it flattened
155 	// this into a single OP_SYNONYM operator and then merged the two
156 	// "date" terms to one with wqf=2.  We've decided we shouldn't do such
157 	// merging from 1.3.x on (merging to sum the scale_factors is fine, but
158 	// we don't do that yet - FIXME).
159 	//
160 	// Anyway, this means that currently the weights are different for all
161 	// matches.
162 	0, 34, 2,
163 	{
164 	    Xapian::Query("date"),
165 	    Xapian::Query(Xapian::Query::OP_SYNONYM,
166 			  Xapian::Query("sky"),
167 			  Xapian::Query("date")),
168 	    NOQ, NOQ
169 	}
170     },
171     {
172 	// All 35 results should be different.
173 	0, 35, 4,
174 	{
175 	    Xapian::Query("sky"),
176 	    Xapian::Query("date"),
177 	    Xapian::Query("stein"),
178 	    Xapian::Query("ally")
179 	}
180     },
181     {
182 	// The estimated term frequency for the synoynm is 2 (because the
183 	// estimate for the phrase is 0), which is the same as the term
184 	// frequency of "attitud".  Thus, the synonym gets the same weight as
185 	// "attitud", so documents with only "attitud" (but not the phrase) in
186 	// them get the same wdf, and have the same total weight.  There turns
187 	// out to be exactly one such document.
188 	1, 3, 2,
189 	{
190 	    Xapian::Query("attitud"),
191 	    Xapian::Query(Xapian::Query::OP_PHRASE,
192 			  Xapian::Query("german"),
193 			  Xapian::Query("adventur")),
194 	    NOQ, NOQ
195 	}
196     },
197     {
198 	// All 54 results should be different.
199 	0, 54, 2,
200 	{
201 	    Xapian::Query("attitud"),
202 	    Xapian::Query(Xapian::Query::OP_OR,
203 			  Xapian::Query("german"),
204 			  Xapian::Query(Xapian::Query::OP_SYNONYM,
205 					Xapian::Query("sky"),
206 					Xapian::Query("date"))),
207 	    NOQ, NOQ
208 	}
209     }
210 };
211 
212 // Check a synonym search
DEFINE_TESTCASE(synonym1,backend)213 DEFINE_TESTCASE(synonym1, backend) {
214     Xapian::Database db(get_database("etext"));
215 
216     TEST_REL(db.get_doclength_upper_bound(), >, 0);
217 
218     const Xapian::doccount lots = 214;
219 
220     for (size_t subqgroup = 0;
221 	 subqgroup != sizeof(synonym1_data) / sizeof(synonym1_data[0]);
222 	 ++subqgroup) {
223 	const synonym1_data_type & data = synonym1_data[subqgroup];
224 	const Xapian::Query * qlist = data.subqs;
225 	const Xapian::Query * qlist_end = qlist + data.n_subqs;
226 
227 	// Run two queries, one joining the subqueries with OR and one joining
228 	// them with SYNONYM.
229 	Xapian::Enquire enquire(db);
230 
231 	// Do the search with OP_OR, getting all the results.
232 	Xapian::Query orquery(Xapian::Query::OP_OR, qlist, qlist_end);
233 	enquire.set_query(orquery);
234 	Xapian::MSet ormset = enquire.get_mset(0, lots);
235 
236 	// Do the search with OP_SYNONYM, getting all the results.
237 	Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist, qlist_end);
238 	enquire.set_query(synquery);
239 	Xapian::MSet synmset = enquire.get_mset(0, lots);
240 
241 	tout << "Comparing " << orquery << " with " << synquery << '\n';
242 
243 	// Check that the queries return some results.
244 	TEST_NOT_EQUAL(synmset.size(), 0);
245 	// Check that the queries return the same number of results.
246 	TEST_EQUAL(synmset.size(), ormset.size());
247 	map<Xapian::docid, double> values_or;
248 	map<Xapian::docid, double> values_synonym;
249 	for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
250 	    values_or[*ormset[i]] = ormset[i].get_weight();
251 	    values_synonym[*synmset[i]] = synmset[i].get_weight();
252 	}
253 	TEST_EQUAL(values_or.size(), values_synonym.size());
254 
255 	/* Check that the most of the weights for items in the "or" mset are
256 	 * different from those in the "synonym" mset. */
257 	int same_weight = 0;
258 	int different_weight = 0;
259 	for (map<Xapian::docid, double>::const_iterator
260 	     j = values_or.begin(); j != values_or.end(); ++j) {
261 	    Xapian::docid did = j->first;
262 	    // Check that all the results in the or tree make it to the synonym
263 	    // tree.
264 	    TEST(values_synonym.find(did) != values_synonym.end());
265 	    if (values_or[did] == values_synonym[did]) {
266 		++same_weight;
267 	    } else {
268 		++different_weight;
269 	    }
270 	}
271 
272 	TEST_EQUAL(different_weight, data.diffweight_count);
273 	TEST_EQUAL(same_weight, data.sameweight_count);
274 
275 	// Do the search with synonym, but just get the top result.
276 	// (Regression test - the OR subquery in the synonym postlist tree used
277 	// to shortcut incorrectly, and return the wrong result here).
278 	Xapian::MSet mset_top = enquire.get_mset(0, 1);
279 	TEST_EQUAL(mset_top.size(), 1);
280 	TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
281     }
282 }
283 
284 // Regression test - test a synonym search with a MultiAndPostlist.
DEFINE_TESTCASE(synonym2,backend)285 DEFINE_TESTCASE(synonym2, backend) {
286     Xapian::Query query;
287     vector<Xapian::Query> subqueries;
288     subqueries.push_back(Xapian::Query("file"));
289     subqueries.push_back(Xapian::Query("the"));
290     subqueries.push_back(Xapian::Query("next"));
291     subqueries.push_back(Xapian::Query("reader"));
292     query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
293     subqueries.clear();
294     subqueries.push_back(query);
295     subqueries.push_back(Xapian::Query("gutenberg"));
296     query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
297 
298     tout << query << '\n';
299 
300     Xapian::Database db(get_database("etext"));
301     Xapian::Enquire enquire(db);
302     enquire.set_query(query);
303     Xapian::MSet mset = enquire.get_mset(0, 10);
304     tout << mset << '\n';
305 
306     // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
307     double maxposs = mset.get_max_possible();
308     query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
309     enquire.set_query(query);
310     mset = enquire.get_mset(0, 10);
311     double maxposs2 = mset.get_max_possible();
312 
313     TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
314 }
315 
316 static void
check_msets_contain_same_docs(const Xapian::MSet & mset1,const Xapian::MSet & mset2)317 check_msets_contain_same_docs(const Xapian::MSet & mset1,
318 			      const Xapian::MSet & mset2)
319 {
320     TEST_EQUAL(mset1.size(), mset2.size());
321 
322     set<Xapian::docid> docids;
323     for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
324 	docids.insert(*mset1[i]);
325     }
326 
327     // Check that all the results in mset1 are in mset2.
328     for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
329 	// Check that we can erase each entry from mset2 element.  Since mset1
330 	// and mset2 are the same size this means we can be sure that there
331 	// were no repeated docids in either (it would be a bug if there were).
332 	TEST(docids.erase(*mset2[j]));
333     }
334 }
335 
336 // Test a synonym search which has had its weight scaled to 0.
DEFINE_TESTCASE(synonym3,backend)337 DEFINE_TESTCASE(synonym3, backend) {
338     Xapian::Query query = Xapian::Query(Xapian::Query::OP_SYNONYM,
339 					Xapian::Query("sky"),
340 					Xapian::Query("date"));
341 
342     Xapian::Database db(get_database("etext"));
343     Xapian::Enquire enquire(db);
344     enquire.set_query(query);
345     Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
346 
347     tout << query << '\n';
348     tout << mset_orig << '\n';
349 
350     // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
351     // (this has a special codepath to avoid doing the synonym calculation).
352     query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 0.0);
353     enquire.set_query(query);
354     Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
355 
356     tout << query << '\n';
357     tout << mset_zero << '\n';
358 
359     // Check that the queries return some results.
360     TEST_NOT_EQUAL(mset_zero.size(), 0);
361     // Check that the queries return the same document IDs, and the zero
362     // one has zero weight.
363     check_msets_contain_same_docs(mset_orig, mset_zero);
364     for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
365 	TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
366 	TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
367     }
368 }
369 
370 // Test synonym searches combined with various operators.
DEFINE_TESTCASE(synonym4,backend)371 DEFINE_TESTCASE(synonym4, backend) {
372     Xapian::Database db(get_database("etext"));
373     Xapian::Enquire enquire(db);
374     Xapian::Query syn_query = Xapian::Query(Xapian::Query::OP_SYNONYM,
375 					    Xapian::Query("gutenberg"),
376 					    Xapian::Query("blockhead"));
377     Xapian::Query or_query = Xapian::Query(Xapian::Query::OP_OR,
378 					   Xapian::Query("gutenberg"),
379 					   Xapian::Query("blockhead"));
380     Xapian::Query date_query = Xapian::Query("date");
381 
382     // Check some queries.
383     static const Xapian::Query::op operators[] = {
384 	Xapian::Query::OP_AND_MAYBE,
385 	Xapian::Query::OP_AND_NOT,
386 	Xapian::Query::OP_AND,
387 	Xapian::Query::OP_XOR,
388 	Xapian::Query::OP_OR,
389 	Xapian::Query::OP_SYNONYM
390     };
391     const Xapian::Query::op * end;
392     end = operators + sizeof(operators) / sizeof(operators[0]);
393     for (const Xapian::Query::op * i = operators; i != end; ++i) {
394 	tout.str(string());
395 	Xapian::Query query1(*i, syn_query, date_query);
396 	Xapian::Query query2(*i, or_query, date_query);
397 
398 	enquire.set_query(query1);
399 	tout << "query1:" << query1 << '\n';
400 	Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
401 	tout << "mset1:" << mset1 << '\n';
402 	enquire.set_query(query2);
403 	tout << "query2:" << query2 << '\n';
404 	Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
405 	tout << "mset2:" << mset2 << '\n';
406 
407 	TEST_NOT_EQUAL(mset1.size(), 0);
408 	if (*i != Xapian::Query::OP_XOR) {
409 	    TEST_EQUAL(mset1[0].get_percent(), 100);
410 	} else {
411 	    TEST(mset1[0].get_percent() != 100);
412 	}
413 	check_msets_contain_same_docs(mset1, mset2);
414     }
415 }
416 
// Check OP_MAX: each document it matches should get the larger of the weights
// which the two subqueries give it on their own, and it should match every
// document that either subquery matches.
DEFINE_TESTCASE(opmax1, backend) {
    Xapian::Database db(get_database("etext"));
    Xapian::Enquire enq(db);
    Xapian::Query q1("king");
    Xapian::Query q2("friedrich");
    Xapian::Query qmax(Xapian::Query::OP_MAX, q1, q2);
    enq.set_query(q1);
    Xapian::MSet mset1 = enq.get_mset(0, db.get_doccount());
    enq.set_query(q2);
    Xapian::MSet mset2 = enq.get_mset(0, db.get_doccount());
    enq.set_query(qmax);
    Xapian::MSet msetmax = enq.get_mset(0, db.get_doccount());

    // Check that the weights in msetmax are the maximum of the weights in
    // mset1 and mset2 for each docid.
    typedef map<Xapian::docid, double> weight_map;
    weight_map expected_weights;
    Xapian::MSetIterator i;
    for (i = mset1.begin(); i != mset1.end(); ++i) {
	expected_weights[*i] = i.get_weight();
    }
    for (i = mset2.begin(); i != mset2.end(); ++i) {
	// insert() leaves an existing entry untouched and tells us so, which
	// lets us merge with a single lookup: a docid only matched by q2 is
	// added with its q2 weight, while one already seen in mset1 keeps
	// the larger of the two weights.
	pair<weight_map::iterator, bool> r =
	    expected_weights.insert(weight_map::value_type(*i,
							   i.get_weight()));
	if (!r.second) {
	    r.first->second = max(r.first->second, i.get_weight());
	}
    }

    // Tick off each OP_MAX result against the expected weights.
    for (i = msetmax.begin(); i != msetmax.end(); ++i) {
	weight_map::iterator j = expected_weights.find(*i);
	TEST(j != expected_weights.end());
	TEST_EQUAL_DOUBLE(j->second, i.get_weight());
	expected_weights.erase(j);
	// Log how many expected documents are still outstanding.
	tout << expected_weights.size() << '\n';
    }

    // Any document in mset1 or mset2 should also be in msetmax.
    TEST_EQUAL(expected_weights.size(), 0);
}
459