/** @file expandweight.cc
 * @brief Calculate term weights for the ESet.
 */
/* Copyright (C) 2007,2008 Olly Betts
 * Copyright (C) 2011 Action Without Borders
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22 #include <config.h>
23
24 #include "expandweight.h"
25
26 #include "debuglog.h"
27 #include "omassert.h"
28 #include "termlist.h"
29
30 #include <cmath>
31
32 using namespace std;
33
34 namespace Xapian {
35 namespace Internal {
36
37 Xapian::weight
get_weight(TermList * merger,const string & term) const38 ExpandWeight::get_weight(TermList * merger, const string & term) const
39 {
40 LOGCALL(MATCH, Xapian::weight, "ExpandWeight::get_weight", merger | term);
41
42 // Accumulate the stats for this term across all relevant documents.
43 ExpandStats stats(avlen, expand_k);
44 merger->accumulate_stats(stats);
45
46 double termfreq = stats.termfreq;
47 double rtermfreq = stats.rtermfreq;
48
49 LOGVALUE(EXPAND, rsize);
50 LOGVALUE(EXPAND, rtermfreq);
51
52 LOGVALUE(EXPAND, dbsize);
53 LOGVALUE(EXPAND, stats.dbsize);
54 if (stats.dbsize == dbsize) {
55 // Either we're expanding from just one database, or we got stats from
56 // all the sub-databases (because at least one relevant document from
57 // each sub-database contained this term), so termfreq should already
58 // be exact.
59 AssertEqParanoid(termfreq, db.get_termfreq(term));
60 } else {
61 AssertRel(stats.dbsize,<,dbsize);
62 // We're expanding from more than one database and the stats we've got
63 // only cover some of the sub-databases, so termfreq only includes
64 // those sub-databases.
65 if (use_exact_termfreq) {
66 LOGLINE(EXPAND, "Had to request exact termfreq");
67 termfreq = db.get_termfreq(term);
68 } else {
69 // Approximate the termfreq by scaling it up from the databases we
70 // do have information from.
71 termfreq *= double(dbsize) / double(stats.dbsize);
72 LOGLINE(EXPAND, "termfreq is approx " << stats.termfreq << " * " <<
73 dbsize << " / " << stats.dbsize << " = " <<
74 termfreq);
75 LOGVALUE(EXPAND, db.get_termfreq(term));
76 if (termfreq < rtermfreq) {
77 // termfreq must be at least rtermfreq, since there are at
78 // least rtermfreq documents indexed by this term.
79 LOGLINE(EXPAND, "termfreq must be at least rtermfreq");
80 termfreq = rtermfreq;
81 } else {
82 // termfreq can't be more than (dbsize - rsize + rtermfreq)
83 // since the number of relevant documents not indexed by this
84 // term can't be more than the number of documents not indexed
85 // by this term, so:
86 //
87 // rsize - rtermfreq <= dbsize - termfreq
88 // <=> termfreq <= dbsize - (rsize - rtermfreq)
89 double termfreq_upper_bound = dbsize - (rsize - rtermfreq);
90 if (termfreq > termfreq_upper_bound) {
91 LOGLINE(EXPAND, "termfreq can't be more than "
92 "dbsize - (rsize + rtermfreq)");
93 termfreq = termfreq_upper_bound;
94 }
95 }
96 }
97 }
98 LOGVALUE(EXPAND, termfreq);
99
100 double reldocs_without_term = rsize - rtermfreq;
101 double num, denom;
102 num = (rtermfreq + 0.5) * (dbsize - termfreq - reldocs_without_term + 0.5);
103 AssertRel(num,>,0);
104 denom = (termfreq - rtermfreq + 0.5) * (reldocs_without_term + 0.5);
105 AssertRel(denom,>,0);
106
107 Xapian::weight tw = log(num / denom);
108 LOGVALUE(EXPAND, tw);
109 LOGVALUE(EXPAND, stats.multiplier);
110 RETURN(stats.multiplier * tw);
111 }
112
113 }
114 }
115