1 /** @file
2  * @brief Calculate term weights for the ESet.
3  */
4 /* Copyright (C) 2007,2008,2011,2017 Olly Betts
5  * Copyright (C) 2011 Action Without Borders
6  * Copyright (C) 2013 Aarsh Shah
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 #include "expandweight.h"
26 
27 #include "debuglog.h"
28 #include "omassert.h"
29 #include "api/termlist.h"
30 
31 using namespace std;
32 
33 namespace Xapian {
34 namespace Internal {
35 
36 void
collect_stats(TermList * merger,const std::string & term)37 ExpandWeight::collect_stats(TermList * merger, const std::string & term)
38 {
39     LOGCALL_VOID(API, "ExpandWeight::collect_stats", merger | term);
40 
41     stats.clear_stats();
42 
43     merger->accumulate_stats(stats);
44 
45     collection_freq = db.get_collection_freq(term);
46 
47     LOGVALUE(EXPAND, rsize);
48     LOGVALUE(EXPAND, stats.rtermfreq);
49 
50     LOGVALUE(EXPAND, dbsize);
51     LOGVALUE(EXPAND, stats.dbsize);
52     if (stats.dbsize == dbsize) {
53 	// Either we're expanding from just one database, or we got stats from
54 	// all the sub-databases (because at least one relevant document from
55 	// each sub-database contained this term), so termfreq should already
56 	// be exact.
57 	AssertEqParanoid(stats.termfreq, db.get_termfreq(term));
58     } else {
59 	AssertRel(stats.dbsize,<,dbsize);
60 	// We're expanding from more than one database and the stats we've got
61 	// only cover some of the sub-databases, so termfreq only includes
62 	// those sub-databases.
63 	if (use_exact_termfreq) {
64 	    LOGLINE(EXPAND, "Had to request exact termfreq");
65 	    stats.termfreq = db.get_termfreq(term);
66 	} else {
67 	    // Approximate the termfreq by scaling it up from the databases we
68 	    // do have information from.
69 	    double tf = double(stats.termfreq) * dbsize / stats.dbsize;
70 	    LOGLINE(EXPAND, "termfreq is approx " << stats.termfreq << " * " <<
71 			    dbsize << " / " << stats.dbsize << " = " <<
72 			    tf);
73 
74 	    stats.termfreq = static_cast<Xapian::doccount>(tf + 0.5);
75 
76 	    // termfreq can't be more than (dbsize - rsize + rtermfreq)
77 	    // since the number of relevant documents not indexed by this
78 	    // term can't be more than the number of documents not indexed
79 	    // by this term, so:
80 	    //
81 	    //     rsize - rtermfreq <= dbsize - termfreq
82 	    // <=> termfreq <= dbsize - (rsize - rtermfreq)
83 	    auto termfreq_upper_bound = dbsize - (rsize - stats.rtermfreq);
84 	    if (stats.termfreq > termfreq_upper_bound) {
85 		LOGLINE(EXPAND, "termfreq can't be more than "
86 				"dbsize - (rsize + rtermfreq)");
87 		stats.termfreq = termfreq_upper_bound;
88 	    }
89 	}
90     }
91     LOGVALUE(EXPAND, stats.termfreq);
92 }
93 
94 }
95 }
96