1 /** @file api_opsynonym.cc
2 * @brief tests of OP_SYNONYM.
3 */
4 /* Copyright 2009 Olly Betts
5 * Copyright 2007,2008,2009 Lemur Consulting Ltd
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
21 */
22
23 #include <config.h>
24
25 #include "api_opsynonym.h"
26
27 #include <map>
28 #include <set>
29 #include <vector>
30
31 #include <xapian.h>
32
33 #include "backendmanager.h"
34 #include "testsuite.h"
35 #include "testutils.h"
36
37 #include "apitest.h"
38
39 using namespace std;
40
41 // #######################################################################
42 // # Tests start here
43
44 // Check a synonym search
DEFINE_TESTCASE(synonym1,backend)45 DEFINE_TESTCASE(synonym1, backend) {
46 Xapian::Database db(get_database("etext"));
47
48 TEST_REL(db.get_doclength_upper_bound(), >, 0);
49
50 Xapian::doccount lots = 214;
51
52 // Make a list of lists of subqueries, which are going to be joined
53 // together as a synonym.
54 vector<vector<Xapian::Query> > subqueries_list;
55
56 // For each set of subqueries, keep a list of the number of results for
57 // which the weight should be the same when combined with OP_SYNONYM as
58 // when combined with OP_OR.
59 vector<int> subqueries_sameweight_count;
60 vector<int> subqueries_diffweight_count;
61
62 vector<Xapian::Query> subqueries;
63 subqueries.push_back(Xapian::Query("date"));
64 subqueries_list.push_back(subqueries);
65 // Single term - all 33 results should be same weight.
66 subqueries_sameweight_count.push_back(33);
67 subqueries_diffweight_count.push_back(0);
68
69 // Two terms, which co-occur in some documents.
70 subqueries.clear();
71 subqueries.push_back(Xapian::Query("sky"));
72 subqueries.push_back(Xapian::Query("date"));
73 subqueries_list.push_back(subqueries);
74 // All 34 results should be different.
75 subqueries_sameweight_count.push_back(0);
76 subqueries_diffweight_count.push_back(34);
77
78 // Two terms which are entirely disjoint, and where the maximum weight
79 // doesn't occur in the first or second match.
80 subqueries.clear();
81 subqueries.push_back(Xapian::Query("gutenberg"));
82 subqueries.push_back(Xapian::Query("blockhead"));
83 subqueries_list.push_back(subqueries);
84 // All 18 results should be different.
85 subqueries_sameweight_count.push_back(0);
86 subqueries_diffweight_count.push_back(18);
87
88 subqueries.clear();
89 subqueries.push_back(Xapian::Query("date"));
90 subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
91 Xapian::Query("sky"),
92 Xapian::Query("glove")));
93 subqueries_list.push_back(subqueries);
94 // All 34 results should be different.
95 subqueries_sameweight_count.push_back(0);
96 subqueries_diffweight_count.push_back(34);
97
98 subqueries.clear();
99 subqueries.push_back(Xapian::Query("date"));
100 subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
101 Xapian::Query("sky"),
102 Xapian::Query("date")));
103 subqueries_list.push_back(subqueries);
104 // All 34 results should be different.
105 subqueries_sameweight_count.push_back(0);
106 subqueries_diffweight_count.push_back(34);
107
108 subqueries.clear();
109 subqueries.push_back(Xapian::Query("date"));
110 subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND_MAYBE,
111 Xapian::Query("sky"),
112 Xapian::Query("date")));
113 subqueries_list.push_back(subqueries);
114 // All 34 results should be different.
115 subqueries_sameweight_count.push_back(0);
116 subqueries_diffweight_count.push_back(34);
117
118 subqueries.clear();
119 subqueries.push_back(Xapian::Query("date"));
120 subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND_NOT,
121 Xapian::Query("sky"),
122 Xapian::Query("date")));
123 subqueries_list.push_back(subqueries);
124 // All 34 results should be different.
125 subqueries_sameweight_count.push_back(0);
126 subqueries_diffweight_count.push_back(34);
127
128 subqueries.clear();
129 subqueries.push_back(Xapian::Query("date"));
130 subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND,
131 Xapian::Query("sky"),
132 Xapian::Query("date")));
133 subqueries_list.push_back(subqueries);
134 // The AND only matches 1 document, so the estimated termfreq for the whole
135 // synonym works out as 33 (due to rounding), which is the same as the
136 // termfreq for "date". Therefore most of the weights are the same as just
137 // for the pure "date" search, and the only document which gets a different
138 // weight is the one also matched by "sky" (because it has a wdf boost).
139 subqueries_sameweight_count.push_back(32);
140 subqueries_diffweight_count.push_back(1);
141
142 subqueries.clear();
143 subqueries.push_back(Xapian::Query("date"));
144 subqueries.push_back(Xapian::Query(Xapian::Query::OP_XOR,
145 Xapian::Query("sky"),
146 Xapian::Query("date")));
147 subqueries_list.push_back(subqueries);
148 // All 34 results should be different.
149 subqueries_sameweight_count.push_back(0);
150 subqueries_diffweight_count.push_back(34);
151
152 subqueries.clear();
153 subqueries.push_back(Xapian::Query("date"));
154 subqueries.push_back(Xapian::Query(Xapian::Query::OP_SYNONYM,
155 Xapian::Query("sky"),
156 Xapian::Query("date")));
157 subqueries_list.push_back(subqueries);
158 // When the top-level operator is OR, the synonym part has an estimated
159 // termfreq of 35. When the top-level operator is SYNONYM, the whole query
160 // has an estimated termfreq of 35, and is in fact the same as the synonym
161 // part in the OR query, except that the wqf of "date" is 2. We're
162 // currently not using the wqfs of components of synonyms, so this
163 // difference has no effect on the weightings. Therefore, for the 1
164 // document which does not contain "data", we get the same result with
165 // SYNONYM as with OR.
166 subqueries_sameweight_count.push_back(1);
167 subqueries_diffweight_count.push_back(33);
168
169 subqueries.clear();
170 subqueries.push_back(Xapian::Query("sky"));
171 subqueries.push_back(Xapian::Query("date"));
172 subqueries.push_back(Xapian::Query("stein"));
173 subqueries.push_back(Xapian::Query("ally"));
174 subqueries_list.push_back(subqueries);
175 // All 35 results should be different.
176 subqueries_sameweight_count.push_back(0);
177 subqueries_diffweight_count.push_back(35);
178
179 subqueries.clear();
180 subqueries.push_back(Xapian::Query("attitud"));
181 subqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
182 Xapian::Query("german"),
183 Xapian::Query("adventur")));
184 subqueries_list.push_back(subqueries);
185 // The estimated term frequency for the synoynm is 2 (because the estimate
186 // for the phrase is 0), which is the same as the term frequency of
187 // "attitud". Thus, the synonym gets the same weight as "attitud", so
188 // documents with only "attitud" (but not the phrase) in them get the same
189 // wdf, and have the same total weight. There turns out to be exactly one
190 // such document.
191 subqueries_sameweight_count.push_back(1);
192 subqueries_diffweight_count.push_back(3);
193
194 subqueries.clear();
195 subqueries.push_back(Xapian::Query("attitud"));
196 subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
197 Xapian::Query("german"),
198 Xapian::Query(Xapian::Query::OP_SYNONYM,
199 Xapian::Query("sky"),
200 Xapian::Query("date"))));
201 subqueries_list.push_back(subqueries);
202 // All 54 results are different.
203 subqueries_sameweight_count.push_back(0);
204 subqueries_diffweight_count.push_back(54);
205
206 for (vector<vector<Xapian::Query> >::size_type subqgroup = 0;
207 subqgroup != subqueries_list.size(); ++subqgroup)
208 {
209 vector<Xapian::Query> * qlist = &(subqueries_list[subqgroup]);
210 // Run two queries, one joining the subqueries with OR and one joining
211 // them with SYNONYM.
212 Xapian::Enquire enquire(db);
213
214 // Do the search with OR
215 Xapian::Query orquery(Xapian::Query::OP_OR, qlist->begin(), qlist->end());
216 enquire.set_query(orquery);
217 Xapian::MSet ormset = enquire.get_mset(0, lots);
218
219 // Do the search with synonym, getting all the results.
220 Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist->begin(), qlist->end());
221 enquire.set_query(synquery);
222 Xapian::MSet synmset = enquire.get_mset(0, lots);
223
224 tout << "Comparing " << orquery << " with " << synquery << '\n';
225
226 // Check that the queries return some results.
227 TEST_NOT_EQUAL(synmset.size(), 0);
228 // Check that the queries return the same number of results.
229 TEST_EQUAL(synmset.size(), ormset.size());
230 map<Xapian::docid, Xapian::weight> values_or;
231 map<Xapian::docid, Xapian::weight> values_synonym;
232 for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
233 values_or[*ormset[i]] = ormset[i].get_weight();
234 values_synonym[*synmset[i]] = synmset[i].get_weight();
235 }
236 TEST_EQUAL(values_or.size(), values_synonym.size());
237
238 /* Check that the most of the weights for items in the "or" mset are
239 * different from those in the "synonym" mset. */
240 int same_weight = 0;
241 int different_weight = 0;
242 for (map<Xapian::docid, Xapian::weight>::const_iterator
243 j = values_or.begin(); j != values_or.end(); ++j) {
244 Xapian::docid did = j->first;
245 // Check that all the results in the or tree make it to the synonym
246 // tree.
247 TEST(values_synonym.find(did) != values_synonym.end());
248 if (values_or[did] == values_synonym[did]) {
249 ++same_weight;
250 } else {
251 ++different_weight;
252 }
253 }
254
255 int expected_same = subqueries_sameweight_count[subqgroup];
256 int expected_diff = subqueries_diffweight_count[subqgroup];
257
258 TEST_EQUAL(different_weight, expected_diff);
259 TEST_EQUAL(same_weight, expected_same);
260
261 // Do the search with synonym, but just get the top result.
262 // (Regression test - the OR subquery in the synonym postlist tree used
263 // to shortcut incorrectly, and return the wrong result here).
264 Xapian::MSet mset_top = enquire.get_mset(0, 1);
265 TEST_EQUAL(mset_top.size(), 1);
266 TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
267 }
268 return true;
269 }
270
271 // Regression test - test a synonym search with a MultiAndPostlist.
DEFINE_TESTCASE(synonym2,backend)272 DEFINE_TESTCASE(synonym2, backend) {
273 Xapian::Query query;
274 vector<Xapian::Query> subqueries;
275 subqueries.push_back(Xapian::Query("file"));
276 subqueries.push_back(Xapian::Query("the"));
277 subqueries.push_back(Xapian::Query("next"));
278 subqueries.push_back(Xapian::Query("reader"));
279 query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
280 subqueries.clear();
281 subqueries.push_back(query);
282 subqueries.push_back(Xapian::Query("gutenberg"));
283 query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
284
285 tout << query << '\n';
286
287 Xapian::Database db(get_database("etext"));
288 Xapian::Enquire enquire(db);
289 enquire.set_query(query);
290 Xapian::MSet mset = enquire.get_mset(0, 10);
291 tout << mset << '\n';
292
293 // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
294 double maxposs = mset.get_max_possible();
295 query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
296 enquire.set_query(query);
297 mset = enquire.get_mset(0, 10);
298 double maxposs2 = mset.get_max_possible();
299
300 TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
301
302 return true;
303 }
304
305 static void
check_msets_contain_same_docs(const Xapian::MSet & mset1,const Xapian::MSet & mset2)306 check_msets_contain_same_docs(const Xapian::MSet & mset1,
307 const Xapian::MSet & mset2)
308 {
309 TEST_EQUAL(mset1.size(), mset2.size());
310
311 set<Xapian::docid> docids;
312 for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
313 docids.insert(*mset1[i]);
314 }
315
316 // Check that all the results in mset1 are in mset2.
317 for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
318 // Check that we can erase each entry from mset2 element. Since mset1
319 // and mset2 are the same size this means we can be sure that there
320 // were no repeated docids in either (it would be a bug if there were).
321 TEST(docids.erase(*mset2[j]));
322 }
323 }
324
325 // Test a synonym search which has had its weight scaled to 0.
DEFINE_TESTCASE(synonym3,backend)326 DEFINE_TESTCASE(synonym3, backend) {
327 Xapian::Query query = Xapian::Query(Xapian::Query::OP_SYNONYM,
328 Xapian::Query("sky"),
329 Xapian::Query("date"));
330
331 Xapian::Database db(get_database("etext"));
332 Xapian::Enquire enquire(db);
333 enquire.set_query(query);
334 Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
335
336 tout << query << '\n';
337 tout << mset_orig << '\n';
338
339 // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
340 // (this has a special codepath to avoid doing the synonym calculation).
341 query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 0.0);
342 enquire.set_query(query);
343 Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
344
345 tout << query << '\n';
346 tout << mset_zero << '\n';
347
348 // Check that the queries return some results.
349 TEST_NOT_EQUAL(mset_zero.size(), 0);
350 // Check that the queries return the same document IDs, and the zero
351 // one has zero weight.
352 check_msets_contain_same_docs(mset_orig, mset_zero);
353 for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
354 TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
355 TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
356 }
357
358 return true;
359 }
360
361 // Test synonym searches combined with various operators.
DEFINE_TESTCASE(synonym4,backend)362 DEFINE_TESTCASE(synonym4, backend) {
363 Xapian::Database db(get_database("etext"));
364 Xapian::Enquire enquire(db);
365 Xapian::Query syn_query = Xapian::Query(Xapian::Query::OP_SYNONYM,
366 Xapian::Query("gutenberg"),
367 Xapian::Query("blockhead"));
368 Xapian::Query or_query = Xapian::Query(Xapian::Query::OP_OR,
369 Xapian::Query("gutenberg"),
370 Xapian::Query("blockhead"));
371 Xapian::Query date_query = Xapian::Query("date");
372
373 // Check some queries.
374 static const Xapian::Query::op operators[] = {
375 Xapian::Query::OP_AND_MAYBE,
376 Xapian::Query::OP_AND_NOT,
377 Xapian::Query::OP_AND,
378 Xapian::Query::OP_XOR,
379 Xapian::Query::OP_OR,
380 Xapian::Query::OP_SYNONYM
381 };
382 const Xapian::Query::op * end;
383 end = operators + sizeof(operators) / sizeof(operators[0]);
384 for (const Xapian::Query::op * i = operators; i != end; ++i) {
385 tout.str(string());
386 Xapian::Query query1(*i, syn_query, date_query);
387 Xapian::Query query2(*i, or_query, date_query);
388
389 enquire.set_query(query1);
390 tout << "query1:" << query1 << '\n';
391 Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
392 tout << "mset1:" << mset1 << '\n';
393 enquire.set_query(query2);
394 tout << "query2:" << query2 << '\n';
395 Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
396 tout << "mset2:" << mset2 << '\n';
397
398 TEST_NOT_EQUAL(mset1.size(), 0);
399 if (*i != Xapian::Query::OP_XOR) {
400 TEST_EQUAL(mset1[0].get_percent(), 100);
401 } else {
402 TEST(mset1[0].get_percent() != 100);
403 }
404 check_msets_contain_same_docs(mset1, mset2);
405 }
406
407 return true;
408 }
409