/** @file api_opsynonym.cc
 * @brief Tests of OP_SYNONYM.
 */
/* Copyright 2009 Olly Betts
 * Copyright 2007,2008,2009 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#include <config.h>

#include "api_opsynonym.h"

#include <map>
#include <set>
#include <string>
#include <vector>

#include <xapian.h>

#include "backendmanager.h"
#include "testsuite.h"
#include "testutils.h"

#include "apitest.h"

using namespace std;

// #######################################################################
// # Tests start here

44 // Check a synonym search
DEFINE_TESTCASE(synonym1,backend)45 DEFINE_TESTCASE(synonym1, backend) {
46     Xapian::Database db(get_database("etext"));
47 
48     TEST_REL(db.get_doclength_upper_bound(), >, 0);
49 
50     Xapian::doccount lots = 214;
51 
52     // Make a list of lists of subqueries, which are going to be joined
53     // together as a synonym.
54     vector<vector<Xapian::Query> > subqueries_list;
55 
56     // For each set of subqueries, keep a list of the number of results for
57     // which the weight should be the same when combined with OP_SYNONYM as
58     // when combined with OP_OR.
59     vector<int> subqueries_sameweight_count;
60     vector<int> subqueries_diffweight_count;
61 
62     vector<Xapian::Query> subqueries;
63     subqueries.push_back(Xapian::Query("date"));
64     subqueries_list.push_back(subqueries);
65     // Single term - all 33 results should be same weight.
66     subqueries_sameweight_count.push_back(33);
67     subqueries_diffweight_count.push_back(0);
68 
69     // Two terms, which co-occur in some documents.
70     subqueries.clear();
71     subqueries.push_back(Xapian::Query("sky"));
72     subqueries.push_back(Xapian::Query("date"));
73     subqueries_list.push_back(subqueries);
74     // All 34 results should be different.
75     subqueries_sameweight_count.push_back(0);
76     subqueries_diffweight_count.push_back(34);
77 
78     // Two terms which are entirely disjoint, and where the maximum weight
79     // doesn't occur in the first or second match.
80     subqueries.clear();
81     subqueries.push_back(Xapian::Query("gutenberg"));
82     subqueries.push_back(Xapian::Query("blockhead"));
83     subqueries_list.push_back(subqueries);
84     // All 18 results should be different.
85     subqueries_sameweight_count.push_back(0);
86     subqueries_diffweight_count.push_back(18);
87 
88     subqueries.clear();
89     subqueries.push_back(Xapian::Query("date"));
90     subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
91 				       Xapian::Query("sky"),
92 				       Xapian::Query("glove")));
93     subqueries_list.push_back(subqueries);
94     // All 34 results should be different.
95     subqueries_sameweight_count.push_back(0);
96     subqueries_diffweight_count.push_back(34);
97 
98     subqueries.clear();
99     subqueries.push_back(Xapian::Query("date"));
100     subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
101 				       Xapian::Query("sky"),
102 				       Xapian::Query("date")));
103     subqueries_list.push_back(subqueries);
104     // All 34 results should be different.
105     subqueries_sameweight_count.push_back(0);
106     subqueries_diffweight_count.push_back(34);
107 
108     subqueries.clear();
109     subqueries.push_back(Xapian::Query("date"));
110     subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND_MAYBE,
111 				       Xapian::Query("sky"),
112 				       Xapian::Query("date")));
113     subqueries_list.push_back(subqueries);
114     // All 34 results should be different.
115     subqueries_sameweight_count.push_back(0);
116     subqueries_diffweight_count.push_back(34);
117 
118     subqueries.clear();
119     subqueries.push_back(Xapian::Query("date"));
120     subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND_NOT,
121 				       Xapian::Query("sky"),
122 				       Xapian::Query("date")));
123     subqueries_list.push_back(subqueries);
124     // All 34 results should be different.
125     subqueries_sameweight_count.push_back(0);
126     subqueries_diffweight_count.push_back(34);
127 
128     subqueries.clear();
129     subqueries.push_back(Xapian::Query("date"));
130     subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND,
131 				       Xapian::Query("sky"),
132 				       Xapian::Query("date")));
133     subqueries_list.push_back(subqueries);
134     // The AND only matches 1 document, so the estimated termfreq for the whole
135     // synonym works out as 33 (due to rounding), which is the same as the
136     // termfreq for "date".  Therefore most of the weights are the same as just
137     // for the pure "date" search, and the only document which gets a different
138     // weight is the one also matched by "sky" (because it has a wdf boost).
139     subqueries_sameweight_count.push_back(32);
140     subqueries_diffweight_count.push_back(1);
141 
142     subqueries.clear();
143     subqueries.push_back(Xapian::Query("date"));
144     subqueries.push_back(Xapian::Query(Xapian::Query::OP_XOR,
145 				       Xapian::Query("sky"),
146 				       Xapian::Query("date")));
147     subqueries_list.push_back(subqueries);
148     // All 34 results should be different.
149     subqueries_sameweight_count.push_back(0);
150     subqueries_diffweight_count.push_back(34);
151 
152     subqueries.clear();
153     subqueries.push_back(Xapian::Query("date"));
154     subqueries.push_back(Xapian::Query(Xapian::Query::OP_SYNONYM,
155 				       Xapian::Query("sky"),
156 				       Xapian::Query("date")));
157     subqueries_list.push_back(subqueries);
158     // When the top-level operator is OR, the synonym part has an estimated
159     // termfreq of 35.  When the top-level operator is SYNONYM, the whole query
160     // has an estimated termfreq of 35, and is in fact the same as the synonym
161     // part in the OR query, except that the wqf of "date" is 2.  We're
162     // currently not using the wqfs of components of synonyms, so this
163     // difference has no effect on the weightings.  Therefore, for the 1
164     // document which does not contain "data", we get the same result with
165     // SYNONYM as with OR.
166     subqueries_sameweight_count.push_back(1);
167     subqueries_diffweight_count.push_back(33);
168 
169     subqueries.clear();
170     subqueries.push_back(Xapian::Query("sky"));
171     subqueries.push_back(Xapian::Query("date"));
172     subqueries.push_back(Xapian::Query("stein"));
173     subqueries.push_back(Xapian::Query("ally"));
174     subqueries_list.push_back(subqueries);
175     // All 35 results should be different.
176     subqueries_sameweight_count.push_back(0);
177     subqueries_diffweight_count.push_back(35);
178 
179     subqueries.clear();
180     subqueries.push_back(Xapian::Query("attitud"));
181     subqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
182 				       Xapian::Query("german"),
183 				       Xapian::Query("adventur")));
184     subqueries_list.push_back(subqueries);
185     // The estimated term frequency for the synoynm is 2 (because the estimate
186     // for the phrase is 0), which is the same as the term frequency of
187     // "attitud".  Thus, the synonym gets the same weight as "attitud", so
188     // documents with only "attitud" (but not the phrase) in them get the same
189     // wdf, and have the same total weight.  There turns out to be exactly one
190     // such document.
191     subqueries_sameweight_count.push_back(1);
192     subqueries_diffweight_count.push_back(3);
193 
194     subqueries.clear();
195     subqueries.push_back(Xapian::Query("attitud"));
196     subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
197 				       Xapian::Query("german"),
198 				       Xapian::Query(Xapian::Query::OP_SYNONYM,
199 						     Xapian::Query("sky"),
200 						     Xapian::Query("date"))));
201     subqueries_list.push_back(subqueries);
202     // All 54 results are different.
203     subqueries_sameweight_count.push_back(0);
204     subqueries_diffweight_count.push_back(54);
205 
206     for (vector<vector<Xapian::Query> >::size_type subqgroup = 0;
207 	 subqgroup != subqueries_list.size(); ++subqgroup)
208     {
209 	vector<Xapian::Query> * qlist = &(subqueries_list[subqgroup]);
210 	// Run two queries, one joining the subqueries with OR and one joining
211 	// them with SYNONYM.
212 	Xapian::Enquire enquire(db);
213 
214 	// Do the search with OR
215 	Xapian::Query orquery(Xapian::Query::OP_OR, qlist->begin(), qlist->end());
216 	enquire.set_query(orquery);
217 	Xapian::MSet ormset = enquire.get_mset(0, lots);
218 
219 	// Do the search with synonym, getting all the results.
220 	Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist->begin(), qlist->end());
221 	enquire.set_query(synquery);
222 	Xapian::MSet synmset = enquire.get_mset(0, lots);
223 
224 	tout << "Comparing " << orquery << " with " << synquery << '\n';
225 
226 	// Check that the queries return some results.
227 	TEST_NOT_EQUAL(synmset.size(), 0);
228 	// Check that the queries return the same number of results.
229 	TEST_EQUAL(synmset.size(), ormset.size());
230 	map<Xapian::docid, Xapian::weight> values_or;
231 	map<Xapian::docid, Xapian::weight> values_synonym;
232 	for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
233 	    values_or[*ormset[i]] = ormset[i].get_weight();
234 	    values_synonym[*synmset[i]] = synmset[i].get_weight();
235 	}
236 	TEST_EQUAL(values_or.size(), values_synonym.size());
237 
238 	/* Check that the most of the weights for items in the "or" mset are
239 	 * different from those in the "synonym" mset. */
240 	int same_weight = 0;
241 	int different_weight = 0;
242 	for (map<Xapian::docid, Xapian::weight>::const_iterator
243 	     j = values_or.begin(); j != values_or.end(); ++j) {
244 	    Xapian::docid did = j->first;
245 	    // Check that all the results in the or tree make it to the synonym
246 	    // tree.
247 	    TEST(values_synonym.find(did) != values_synonym.end());
248 	    if (values_or[did] == values_synonym[did]) {
249 		++same_weight;
250 	    } else {
251 		++different_weight;
252 	    }
253 	}
254 
255 	int expected_same = subqueries_sameweight_count[subqgroup];
256 	int expected_diff = subqueries_diffweight_count[subqgroup];
257 
258 	TEST_EQUAL(different_weight, expected_diff);
259 	TEST_EQUAL(same_weight, expected_same);
260 
261 	// Do the search with synonym, but just get the top result.
262 	// (Regression test - the OR subquery in the synonym postlist tree used
263 	// to shortcut incorrectly, and return the wrong result here).
264 	Xapian::MSet mset_top = enquire.get_mset(0, 1);
265 	TEST_EQUAL(mset_top.size(), 1);
266 	TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
267     }
268     return true;
269 }
270 
271 // Regression test - test a synonym search with a MultiAndPostlist.
DEFINE_TESTCASE(synonym2,backend)272 DEFINE_TESTCASE(synonym2, backend) {
273     Xapian::Query query;
274     vector<Xapian::Query> subqueries;
275     subqueries.push_back(Xapian::Query("file"));
276     subqueries.push_back(Xapian::Query("the"));
277     subqueries.push_back(Xapian::Query("next"));
278     subqueries.push_back(Xapian::Query("reader"));
279     query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
280     subqueries.clear();
281     subqueries.push_back(query);
282     subqueries.push_back(Xapian::Query("gutenberg"));
283     query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
284 
285     tout << query << '\n';
286 
287     Xapian::Database db(get_database("etext"));
288     Xapian::Enquire enquire(db);
289     enquire.set_query(query);
290     Xapian::MSet mset = enquire.get_mset(0, 10);
291     tout << mset << '\n';
292 
293     // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
294     double maxposs = mset.get_max_possible();
295     query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
296     enquire.set_query(query);
297     mset = enquire.get_mset(0, 10);
298     double maxposs2 = mset.get_max_possible();
299 
300     TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
301 
302     return true;
303 }
304 
305 static void
check_msets_contain_same_docs(const Xapian::MSet & mset1,const Xapian::MSet & mset2)306 check_msets_contain_same_docs(const Xapian::MSet & mset1,
307 			      const Xapian::MSet & mset2)
308 {
309     TEST_EQUAL(mset1.size(), mset2.size());
310 
311     set<Xapian::docid> docids;
312     for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
313 	docids.insert(*mset1[i]);
314     }
315 
316     // Check that all the results in mset1 are in mset2.
317     for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
318 	// Check that we can erase each entry from mset2 element.  Since mset1
319 	// and mset2 are the same size this means we can be sure that there
320 	// were no repeated docids in either (it would be a bug if there were).
321 	TEST(docids.erase(*mset2[j]));
322     }
323 }
324 
325 // Test a synonym search which has had its weight scaled to 0.
DEFINE_TESTCASE(synonym3,backend)326 DEFINE_TESTCASE(synonym3, backend) {
327     Xapian::Query query = Xapian::Query(Xapian::Query::OP_SYNONYM,
328 					Xapian::Query("sky"),
329 					Xapian::Query("date"));
330 
331     Xapian::Database db(get_database("etext"));
332     Xapian::Enquire enquire(db);
333     enquire.set_query(query);
334     Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
335 
336     tout << query << '\n';
337     tout << mset_orig << '\n';
338 
339     // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
340     // (this has a special codepath to avoid doing the synonym calculation).
341     query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 0.0);
342     enquire.set_query(query);
343     Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
344 
345     tout << query << '\n';
346     tout << mset_zero << '\n';
347 
348     // Check that the queries return some results.
349     TEST_NOT_EQUAL(mset_zero.size(), 0);
350     // Check that the queries return the same document IDs, and the zero
351     // one has zero weight.
352     check_msets_contain_same_docs(mset_orig, mset_zero);
353     for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
354 	TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
355 	TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
356     }
357 
358     return true;
359 }
360 
361 // Test synonym searches combined with various operators.
DEFINE_TESTCASE(synonym4,backend)362 DEFINE_TESTCASE(synonym4, backend) {
363     Xapian::Database db(get_database("etext"));
364     Xapian::Enquire enquire(db);
365     Xapian::Query syn_query = Xapian::Query(Xapian::Query::OP_SYNONYM,
366 					    Xapian::Query("gutenberg"),
367 					    Xapian::Query("blockhead"));
368     Xapian::Query or_query = Xapian::Query(Xapian::Query::OP_OR,
369 					   Xapian::Query("gutenberg"),
370 					   Xapian::Query("blockhead"));
371     Xapian::Query date_query = Xapian::Query("date");
372 
373     // Check some queries.
374     static const Xapian::Query::op operators[] = {
375 	Xapian::Query::OP_AND_MAYBE,
376 	Xapian::Query::OP_AND_NOT,
377 	Xapian::Query::OP_AND,
378 	Xapian::Query::OP_XOR,
379 	Xapian::Query::OP_OR,
380 	Xapian::Query::OP_SYNONYM
381     };
382     const Xapian::Query::op * end;
383     end = operators + sizeof(operators) / sizeof(operators[0]);
384     for (const Xapian::Query::op * i = operators; i != end; ++i) {
385 	tout.str(string());
386 	Xapian::Query query1(*i, syn_query, date_query);
387 	Xapian::Query query2(*i, or_query, date_query);
388 
389 	enquire.set_query(query1);
390 	tout << "query1:" << query1 << '\n';
391 	Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
392 	tout << "mset1:" << mset1 << '\n';
393 	enquire.set_query(query2);
394 	tout << "query2:" << query2 << '\n';
395 	Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
396 	tout << "mset2:" << mset2 << '\n';
397 
398 	TEST_NOT_EQUAL(mset1.size(), 0);
399 	if (*i != Xapian::Query::OP_XOR) {
400 	    TEST_EQUAL(mset1[0].get_percent(), 100);
401 	} else {
402 	    TEST(mset1[0].get_percent() != 100);
403 	}
404 	check_msets_contain_same_docs(mset1, mset2);
405     }
406 
407     return true;
408 }
409