1 /** @file
2  * @brief Inverter class which "inverts the file".
3  */
4 /* Copyright (C) 2009,2010,2013,2014 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19  */
20 
21 #ifndef XAPIAN_INCLUDED_GLASS_INVERTER_H
22 #define XAPIAN_INCLUDED_GLASS_INVERTER_H
23 
24 #include "xapian/types.h"
25 
26 #include <map>
27 #include <string>
28 #include <vector>
29 
30 #include "omassert.h"
31 #include "str.h"
32 #include "xapian/error.h"
33 
// These types only appear in this header in friend declarations and as
// reference parameters, so forward declarations are sufficient here.
namespace Xapian {
class TermIterator;
}

class GlassPositionListTable;
class GlassPostListTable;
40 
41 /** Magic wdf value used for a deleted posting. */
42 const Xapian::termcount DELETED_POSTING = Xapian::termcount(-1);
43 
44 /** Class which "inverts the file". */
45 class Inverter {
46     friend class GlassPostListTable;
47 
48     /// Class for storing the changes in frequencies for a term.
49     class PostingChanges {
50 	friend class GlassPostListTable;
51 
52 	/// Change in term frequency,
53 	Xapian::termcount_diff tf_delta;
54 
55 	/// Change in collection frequency.
56 	Xapian::termcount_diff cf_delta;
57 
58 	/// Changes to this term's postlist.
59 	std::map<Xapian::docid, Xapian::termcount> pl_changes;
60 
61       public:
62 	/// Constructor for an added posting.
PostingChanges(Xapian::docid did,Xapian::termcount wdf)63 	PostingChanges(Xapian::docid did, Xapian::termcount wdf)
64 	    : tf_delta(1), cf_delta(Xapian::termcount_diff(wdf))
65 	{
66 	    pl_changes.insert(std::make_pair(did, wdf));
67 	}
68 
69 	/// Constructor for a removed posting.
PostingChanges(Xapian::docid did,Xapian::termcount wdf,bool)70 	PostingChanges(Xapian::docid did, Xapian::termcount wdf, bool)
71 	    : tf_delta(-1), cf_delta(-Xapian::termcount_diff(wdf))
72 	{
73 	    pl_changes.insert(std::make_pair(did, DELETED_POSTING));
74 	}
75 
76 	/// Constructor for an updated posting.
PostingChanges(Xapian::docid did,Xapian::termcount old_wdf,Xapian::termcount new_wdf)77 	PostingChanges(Xapian::docid did, Xapian::termcount old_wdf,
78 		       Xapian::termcount new_wdf)
79 	    : tf_delta(0), cf_delta(Xapian::termcount_diff(new_wdf - old_wdf))
80 	{
81 	    pl_changes.insert(std::make_pair(did, new_wdf));
82 	}
83 
84 	/// Add a posting.
add_posting(Xapian::docid did,Xapian::termcount wdf)85 	void add_posting(Xapian::docid did, Xapian::termcount wdf) {
86 	    ++tf_delta;
87 	    cf_delta += wdf;
88 	    // Add did to term's postlist
89 	    pl_changes[did] = wdf;
90 	}
91 
92 	/// Remove a posting.
remove_posting(Xapian::docid did,Xapian::termcount wdf)93 	void remove_posting(Xapian::docid did, Xapian::termcount wdf) {
94 	    --tf_delta;
95 	    cf_delta -= wdf;
96 	    // Remove did from term's postlist.
97 	    pl_changes[did] = DELETED_POSTING;
98 	}
99 
100 	/// Update a posting.
update_posting(Xapian::docid did,Xapian::termcount old_wdf,Xapian::termcount new_wdf)101 	void update_posting(Xapian::docid did, Xapian::termcount old_wdf,
102 			    Xapian::termcount new_wdf) {
103 	    cf_delta += new_wdf - old_wdf;
104 	    pl_changes[did] = new_wdf;
105 	}
106 
107 	/// Get the term frequency delta.
get_tfdelta()108 	Xapian::termcount_diff get_tfdelta() const { return tf_delta; }
109 
110 	/// Get the collection frequency delta.
get_cfdelta()111 	Xapian::termcount_diff get_cfdelta() const { return cf_delta; }
112     };
113 
114     /// Buffered changes to postlists.
115     std::map<std::string, PostingChanges> postlist_changes;
116 
117     /// Buffered changes to positional data.
118     std::map<std::string, std::map<Xapian::docid, std::string>> pos_changes;
119 
120     void store_positions(const GlassPositionListTable & position_table,
121 			 Xapian::docid did,
122 			 const std::string & tname,
123 			 const std::vector<Xapian::termpos> & posvec,
124 			 bool modifying);
125 
126     void set_positionlist(Xapian::docid did,
127 			  const std::string & term,
128 			  const std::string & s);
129 
130   public:
131     /// Buffered changes to document lengths.
132     std::map<Xapian::docid, Xapian::termcount> doclen_changes;
133 
134   public:
add_posting(Xapian::docid did,const std::string & term,Xapian::doccount wdf)135     void add_posting(Xapian::docid did, const std::string & term,
136 		     Xapian::doccount wdf) {
137 	std::map<std::string, PostingChanges>::iterator i;
138 	i = postlist_changes.find(term);
139 	if (i == postlist_changes.end()) {
140 	    postlist_changes.insert(
141 		std::make_pair(term, PostingChanges(did, wdf)));
142 	} else {
143 	    i->second.add_posting(did, wdf);
144 	}
145     }
146 
remove_posting(Xapian::docid did,const std::string & term,Xapian::doccount wdf)147     void remove_posting(Xapian::docid did, const std::string & term,
148 			Xapian::doccount wdf) {
149 	std::map<std::string, PostingChanges>::iterator i;
150 	i = postlist_changes.find(term);
151 	if (i == postlist_changes.end()) {
152 	    postlist_changes.insert(
153 		std::make_pair(term, PostingChanges(did, wdf, false)));
154 	} else {
155 	    i->second.remove_posting(did, wdf);
156 	}
157     }
158 
update_posting(Xapian::docid did,const std::string & term,Xapian::termcount old_wdf,Xapian::termcount new_wdf)159     void update_posting(Xapian::docid did, const std::string & term,
160 			Xapian::termcount old_wdf,
161 			Xapian::termcount new_wdf) {
162 	std::map<std::string, PostingChanges>::iterator i;
163 	i = postlist_changes.find(term);
164 	if (i == postlist_changes.end()) {
165 	    postlist_changes.insert(
166 		std::make_pair(term, PostingChanges(did, old_wdf, new_wdf)));
167 	} else {
168 	    i->second.update_posting(did, old_wdf, new_wdf);
169 	}
170     }
171 
172     void set_positionlist(const GlassPositionListTable & position_table,
173 			  Xapian::docid did,
174 			  const std::string & tname,
175 			  const Xapian::TermIterator & term,
176 			  bool modifying = false);
177 
178     void delete_positionlist(Xapian::docid did,
179 			     const std::string & term);
180 
181     bool get_positionlist(Xapian::docid did,
182 			  const std::string & term,
183 			  std::string & s) const;
184 
185     bool has_positions(const GlassPositionListTable & position_table) const;
186 
clear()187     void clear() {
188 	doclen_changes.clear();
189 	postlist_changes.clear();
190 	pos_changes.clear();
191     }
192 
set_doclength(Xapian::docid did,Xapian::termcount doclen,bool add)193     void set_doclength(Xapian::docid did, Xapian::termcount doclen, bool add) {
194 	if (add) {
195 	    Assert(doclen_changes.find(did) == doclen_changes.end() || doclen_changes[did] == DELETED_POSTING);
196 	}
197 	doclen_changes[did] = doclen;
198     }
199 
delete_doclength(Xapian::docid did)200     void delete_doclength(Xapian::docid did) {
201 	Assert(doclen_changes.find(did) == doclen_changes.end() || doclen_changes[did] != DELETED_POSTING);
202 	doclen_changes[did] = DELETED_POSTING;
203     }
204 
get_doclength(Xapian::docid did,Xapian::termcount & doclen)205     bool get_doclength(Xapian::docid did, Xapian::termcount & doclen) const {
206 	std::map<Xapian::docid, Xapian::termcount>::const_iterator i;
207 	i = doclen_changes.find(did);
208 	if (i == doclen_changes.end())
209 	    return false;
210 	if (rare(i->second == DELETED_POSTING))
211 	    throw Xapian::DocNotFoundError("Document not found: " + str(did));
212 	doclen = i->second;
213 	return true;
214     }
215 
216     /// Flush document length changes.
217     void flush_doclengths(GlassPostListTable & table);
218 
219     /// Flush postlist changes for @a term.
220     void flush_post_list(GlassPostListTable & table, const std::string & term);
221 
222     /// Flush postlist changes for all terms.
223     void flush_all_post_lists(GlassPostListTable & table);
224 
225     /// Flush postlist changes for all terms which start with @a pfx.
226     void flush_post_lists(GlassPostListTable & table, const std::string & pfx);
227 
228     /// Flush all postlist table changes.
229     void flush(GlassPostListTable & table);
230 
231     /// Flush position changes.
232     void flush_pos_lists(GlassPositionListTable & table);
233 
get_deltas(const std::string & term,Xapian::termcount_diff & tf_delta,Xapian::termcount_diff & cf_delta)234     bool get_deltas(const std::string & term,
235 		    Xapian::termcount_diff & tf_delta,
236 		    Xapian::termcount_diff & cf_delta) const {
237 	std::map<std::string, PostingChanges>::const_iterator i;
238 	i = postlist_changes.find(term);
239 	if (i == postlist_changes.end()) {
240 	    return false;
241 	}
242 	tf_delta = i->second.get_tfdelta();
243 	cf_delta = i->second.get_cfdelta();
244 	return true;
245     }
246 };
247 
248 #endif // XAPIAN_INCLUDED_GLASS_INVERTER_H
249