1 /** @file expandweight.cc
2  * @brief Calculate term weights for the ESet.
3  */
4 /* Copyright (C) 2007,2008 Olly Betts
5  * Copyright (C) 2011 Action Without Borders
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "expandweight.h"
25 
26 #include "debuglog.h"
27 #include "omassert.h"
28 #include "termlist.h"
29 
30 #include <cmath>
31 
32 using namespace std;
33 
34 namespace Xapian {
35 namespace Internal {
36 
37 Xapian::weight
get_weight(TermList * merger,const string & term) const38 ExpandWeight::get_weight(TermList * merger, const string & term) const
39 {
40     LOGCALL(MATCH, Xapian::weight, "ExpandWeight::get_weight", merger | term);
41 
42     // Accumulate the stats for this term across all relevant documents.
43     ExpandStats stats(avlen, expand_k);
44     merger->accumulate_stats(stats);
45 
46     double termfreq = stats.termfreq;
47     double rtermfreq = stats.rtermfreq;
48 
49     LOGVALUE(EXPAND, rsize);
50     LOGVALUE(EXPAND, rtermfreq);
51 
52     LOGVALUE(EXPAND, dbsize);
53     LOGVALUE(EXPAND, stats.dbsize);
54     if (stats.dbsize == dbsize) {
55 	// Either we're expanding from just one database, or we got stats from
56 	// all the sub-databases (because at least one relevant document from
57 	// each sub-database contained this term), so termfreq should already
58 	// be exact.
59 	AssertEqParanoid(termfreq, db.get_termfreq(term));
60     } else {
61 	AssertRel(stats.dbsize,<,dbsize);
62 	// We're expanding from more than one database and the stats we've got
63 	// only cover some of the sub-databases, so termfreq only includes
64 	// those sub-databases.
65 	if (use_exact_termfreq) {
66 	    LOGLINE(EXPAND, "Had to request exact termfreq");
67 	    termfreq = db.get_termfreq(term);
68 	} else {
69 	    // Approximate the termfreq by scaling it up from the databases we
70 	    // do have information from.
71 	    termfreq *= double(dbsize) / double(stats.dbsize);
72 	    LOGLINE(EXPAND, "termfreq is approx " << stats.termfreq << " * " <<
73 			    dbsize << " / " << stats.dbsize << " = " <<
74 			    termfreq);
75 	    LOGVALUE(EXPAND, db.get_termfreq(term));
76 	    if (termfreq < rtermfreq) {
77 		// termfreq must be at least rtermfreq, since there are at
78 		// least rtermfreq documents indexed by this term.
79 		LOGLINE(EXPAND, "termfreq must be at least rtermfreq");
80 		termfreq = rtermfreq;
81 	    } else {
82 		// termfreq can't be more than (dbsize - rsize + rtermfreq)
83 		// since the number of relevant documents not indexed by this
84 		// term can't be more than the number of documents not indexed
85 		// by this term, so:
86 		//
87 		//     rsize - rtermfreq <= dbsize - termfreq
88 		// <=> termfreq <= dbsize - (rsize - rtermfreq)
89 		double termfreq_upper_bound = dbsize - (rsize - rtermfreq);
90 		if (termfreq > termfreq_upper_bound) {
91 		    LOGLINE(EXPAND, "termfreq can't be more than "
92 				    "dbsize - (rsize + rtermfreq)");
93 		    termfreq = termfreq_upper_bound;
94 		}
95 	    }
96 	}
97     }
98     LOGVALUE(EXPAND, termfreq);
99 
100     double reldocs_without_term = rsize - rtermfreq;
101     double num, denom;
102     num = (rtermfreq + 0.5) * (dbsize - termfreq - reldocs_without_term + 0.5);
103     AssertRel(num,>,0);
104     denom = (termfreq - rtermfreq + 0.5) * (reldocs_without_term + 0.5);
105     AssertRel(denom,>,0);
106 
107     Xapian::weight tw = log(num / denom);
108     LOGVALUE(EXPAND, tw);
109     LOGVALUE(EXPAND, stats.multiplier);
110     RETURN(stats.multiplier * tw);
111 }
112 
113 }
114 }
115