1 /** @file
2 * @brief Calculate term weights for the ESet.
3 */
4 /* Copyright (C) 2007,2008,2011,2017 Olly Betts
5 * Copyright (C) 2011 Action Without Borders
6 * Copyright (C) 2013 Aarsh Shah
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #include <config.h>
24
25 #include "expandweight.h"
26
27 #include "debuglog.h"
28 #include "omassert.h"
29 #include "api/termlist.h"
30
31 using namespace std;
32
33 namespace Xapian {
34 namespace Internal {
35
36 void
collect_stats(TermList * merger,const std::string & term)37 ExpandWeight::collect_stats(TermList * merger, const std::string & term)
38 {
39 LOGCALL_VOID(API, "ExpandWeight::collect_stats", merger | term);
40
41 stats.clear_stats();
42
43 merger->accumulate_stats(stats);
44
45 collection_freq = db.get_collection_freq(term);
46
47 LOGVALUE(EXPAND, rsize);
48 LOGVALUE(EXPAND, stats.rtermfreq);
49
50 LOGVALUE(EXPAND, dbsize);
51 LOGVALUE(EXPAND, stats.dbsize);
52 if (stats.dbsize == dbsize) {
53 // Either we're expanding from just one database, or we got stats from
54 // all the sub-databases (because at least one relevant document from
55 // each sub-database contained this term), so termfreq should already
56 // be exact.
57 AssertEqParanoid(stats.termfreq, db.get_termfreq(term));
58 } else {
59 AssertRel(stats.dbsize,<,dbsize);
60 // We're expanding from more than one database and the stats we've got
61 // only cover some of the sub-databases, so termfreq only includes
62 // those sub-databases.
63 if (use_exact_termfreq) {
64 LOGLINE(EXPAND, "Had to request exact termfreq");
65 stats.termfreq = db.get_termfreq(term);
66 } else {
67 // Approximate the termfreq by scaling it up from the databases we
68 // do have information from.
69 double tf = double(stats.termfreq) * dbsize / stats.dbsize;
70 LOGLINE(EXPAND, "termfreq is approx " << stats.termfreq << " * " <<
71 dbsize << " / " << stats.dbsize << " = " <<
72 tf);
73
74 stats.termfreq = static_cast<Xapian::doccount>(tf + 0.5);
75
76 // termfreq can't be more than (dbsize - rsize + rtermfreq)
77 // since the number of relevant documents not indexed by this
78 // term can't be more than the number of documents not indexed
79 // by this term, so:
80 //
81 // rsize - rtermfreq <= dbsize - termfreq
82 // <=> termfreq <= dbsize - (rsize - rtermfreq)
83 auto termfreq_upper_bound = dbsize - (rsize - stats.rtermfreq);
84 if (stats.termfreq > termfreq_upper_bound) {
85 LOGLINE(EXPAND, "termfreq can't be more than "
86 "dbsize - (rsize + rtermfreq)");
87 stats.termfreq = termfreq_upper_bound;
88 }
89 }
90 }
91 LOGVALUE(EXPAND, stats.termfreq);
92 }
93
94 }
95 }
96