1 ///###////////////////////////////////////////////////////////////////////////
2 //
3 // Burton Computer Corporation
4 // http://www.burton-computer.com
5 // http://www.cooldevtools.com
6 // $Id: FrequencyDBImpl_bdb.cc 272 2007-01-06 19:37:27Z brian $
7 //
8 // Copyright (C) 2007 Burton Computer Corporation
9 // ALL RIGHTS RESERVED
10 //
11 // This program is open source software; you can redistribute it
12 // and/or modify it under the terms of the Q Public License (QPL)
13 // version 1.0. Use of this software in whole or in part, including
14 // linking it (modified or unmodified) into other programs is
15 // subject to the terms of the QPL.
16 //
17 // This program is distributed in the hope that it will be useful,
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 // Q Public License for more details.
21 //
22 // You should have received a copy of the Q Public License
23 // along with this program; see the file LICENSE.txt. If not, visit
24 // the Burton Computer Corporation or CoolDevTools web site
25 // QPL pages at:
26 //
27 // http://www.burton-computer.com/qpl.html
28 // http://www.cooldevtools.com/qpl.html
29 //
30
31 #ifdef USE_DB
32
33 #include <unistd.h>
34 #include <stdexcept>
35 #include <strstream>
36 #include "CleanupManager.h"
37 #include "LockFile.h"
38 #include "WordData.h"
39 #include "FrequencyDBImpl_bdb.h"
40
41 struct MyDBT : public DBT
42 {
MyDBTMyDBT43 MyDBT()
44 {
45 memset(this, 0, sizeof(*this));
46 }
47
MyDBTMyDBT48 MyDBT(const string &word)
49 {
50 memset(this, 0, sizeof(*this));
51 size = word.length() + 1;
52 data = (char *)word.c_str();
53 }
54
MyDBTMyDBT55 MyDBT(const WordData &counts)
56 {
57 memset(this, 0, sizeof(*this));
58 size = sizeof(WordData);
59 data = (char *)&counts;
60 }
61 };
62
throw_on_error(const char * function_name,int rc)63 inline int throw_on_error(const char *function_name,
64 int rc)
65 {
66 if (rc == DB_NOTFOUND) {
67 return rc;
68 }
69 if (rc != 0) {
70 static char buffer[4096];
71 ostrstream msg(buffer, sizeof(buffer));
72 msg << function_name << ": " << db_strerror(rc) << " (" << rc << ")" << ends;
73 throw runtime_error(buffer);
74 }
75 return rc;
76 }
77
warn_on_error(const char * function_name,int rc)78 inline int warn_on_error(const char *function_name,
79 int rc)
80 {
81 if (rc == DB_NOTFOUND) {
82 return rc;
83 }
84 if (rc != 0) {
85 cerr << "warning: berkeley db reported error: "
86 << function_name
87 << ": "
88 << db_strerror(rc)
89 << " (" << rc << ")"
90 << endl;
91 }
92 return rc;
93 }
94
factory(const DatabaseConfig * config)95 FrequencyDBImpl *FrequencyDBImpl_bdb::factory(const DatabaseConfig *config)
96 {
97 return new FrequencyDBImpl_bdb();
98 }
99
FrequencyDBImpl_bdb()100 FrequencyDBImpl_bdb::FrequencyDBImpl_bdb()
101 : m_env(0),
102 m_file(0),
103 m_cursor(0)
104 {
105 }
106
~FrequencyDBImpl_bdb()107 FrequencyDBImpl_bdb::~FrequencyDBImpl_bdb()
108 {
109 close();
110 }
111
open(const string & arg_filename,bool read_only,int create_mode)112 bool FrequencyDBImpl_bdb::open(const string &arg_filename,
113 bool read_only,
114 int create_mode)
115 {
116 close();
117
118 m_isReadOnly = read_only;
119
120 File db_file(arg_filename);
121 if (!openEnvironment(db_file, read_only, create_mode)) {
122 return false;
123 }
124
125 if (!openDatabase(db_file, read_only, create_mode)) {
126 closeEnvironment();
127 return false;
128 }
129
130 if (is_debug) {
131 cerr << "DATABASE OPENED " << db_file.getPath() << endl;
132 }
133
134 return true;
135 }
136
openDatabase(const File & db_file,bool read_only,int create_mode)137 bool FrequencyDBImpl_bdb::openDatabase(const File &db_file,
138 bool read_only,
139 int create_mode)
140 {
141 if (is_debug) {
142 cerr << "OPENING DATABASE " << db_file.getPath() << endl;
143 }
144
145 int ret = db_create(&m_file, m_env, 0);
146 if (ret != 0) {
147 cerr << "error: unable to create database " << db_file.getPath() << ": " << db_strerror(ret) << endl;
148 return false;
149 }
150
151 // SleepyCat in their infinite wisdom decided to change the open
152 // function's signature in the 4.1 release. Gee thanks for breaking
153 // my code guys. That was real smart and so much better than
154 // introducing a second open function rather than force me to embed
155 // hideous ifdefs into my code.
156 string filename(m_env ? db_file.getName() : db_file.getPath());
157 int flags = read_only ? DB_RDONLY : DB_CREATE;
158 #if DB_VERSION_MAJOR >= 4 && DB_VERSION_MINOR >= 1
159 ret = m_file->open(m_file, NULL, filename.c_str(), NULL, DB_BTREE, flags, create_mode);
160 #else
161 ret = m_file->open(m_file, filename.c_str(), NULL, DB_BTREE, flags, create_mode);
162 #endif
163 if (ret != 0) {
164 cerr << "error: unable to open database " << db_file.getPath() << ": " << db_strerror(ret) << endl;
165 m_file = 0;
166 return false;
167 }
168
169 if (is_debug) {
170 cerr << "OPENED DATABASE " << db_file.getPath() << endl;
171 }
172
173 return true;
174 }
175
openEnvironment(const File & db_file,bool read_only,int create_mode)176 bool FrequencyDBImpl_bdb::openEnvironment(const File &db_file,
177 bool read_only,
178 int create_mode)
179 {
180 #if USE_CDB
181 File env_dir(db_file.parent());
182
183 if (is_debug) {
184 cerr << "OPENING ENVIRONMENT " << env_dir.getPath() << endl;
185 }
186
187 int ret = db_env_create(&m_env, 0);
188 if (ret != 0) {
189 cerr << "error: unable to create environment " << db_file.getPath() << ": " << db_strerror(ret) << endl;
190 m_env = 0;
191 return false;
192 }
193
194 int env_flags = DB_INIT_CDB | DB_INIT_MPOOL | DB_CREATE;
195 ret = m_env->open(m_env, db_file.parent().getPath().c_str(), env_flags, create_mode);
196 if (ret != 0) {
197 if (read_only) {
198 // we can still operate without the environment if we're in read-only mode
199 m_env = 0;
200 } else {
201 cerr << "error: unable to open environment " << env_dir.getPath() << ": " << db_strerror(ret) << endl;
202 m_env = 0;
203 return false;
204 }
205 }
206 #else
207 m_env = 0;
208 #endif
209
210 return true;
211 }
212
closeCursor()213 void FrequencyDBImpl_bdb::closeCursor()
214 {
215 if (m_cursor) {
216 warn_on_error("c_close", m_cursor->c_close(m_cursor));
217 m_cursor = 0;
218 }
219 }
220
closeDatabase()221 void FrequencyDBImpl_bdb::closeDatabase()
222 {
223 if (m_file) {
224 warn_on_error("db sync", m_file->sync(m_file, 0));
225 warn_on_error("db close", m_file->close(m_file, 0));
226 m_file = 0;
227 }
228 }
229
closeEnvironment()230 void FrequencyDBImpl_bdb::closeEnvironment()
231 {
232 #ifdef USE_CDB
233 if (m_env) {
234 warn_on_error("env close", m_env->close(m_env, 0));
235 m_env = 0;
236 }
237 #endif
238 }
239
close()240 void FrequencyDBImpl_bdb::close()
241 {
242 closeCursor();
243 closeDatabase();
244 closeEnvironment();
245 }
246
flush()247 void FrequencyDBImpl_bdb::flush()
248 {
249 if (is_debug) {
250 cerr << "flushing database..." << endl;
251 }
252 throw_on_error("sync", m_file->sync(m_file, 0));
253 }
254
writeWord(const string & word,const WordData & counts)255 void FrequencyDBImpl_bdb::writeWord(const string &word,
256 const WordData &counts)
257 {
258 assert(m_file);
259
260 MyDBT key(word);
261 bool delete_word = counts.totalCount() <= 0;
262
263 if (is_debug) {
264 WordData old_counts;
265 bool exists = loadKey(key, old_counts);
266 if (delete_word) {
267 cerr << "writeWord: deleting '" << word << "'"
268 << endl;
269 } else if (exists) {
270 cerr << "writeWord: updating '" << word << "'"
271 << " old (" << old_counts.goodCount() << "," << old_counts.spamCount() << ")"
272 << " new (" << counts.goodCount() << "," << counts.spamCount() << ")"
273 << endl;
274 } else {
275 cerr << "writeWord: inserting '" << word << "'"
276 << " new (" << counts.goodCount() << "," << counts.spamCount() << ")"
277 << endl;
278 }
279 }
280
281 if (delete_word) {
282 throw_on_error("del", m_file->del(m_file, NULL, &key, 0));
283 } else {
284 MyDBT value(counts);
285 throw_on_error("put", m_file->put(m_file, NULL, &key, &value, 0));
286 }
287 }
288
readWord(const string & word,WordData & counts)289 bool FrequencyDBImpl_bdb::readWord(const string &word,
290 WordData &counts)
291 {
292 assert(m_file);
293
294 MyDBT key(word);
295 return loadKey(key, counts);
296 }
297
firstWord(string & word,WordData & counts)298 bool FrequencyDBImpl_bdb::firstWord(string &word,
299 WordData &counts)
300 {
301 return firstWord(word, counts, true);
302 }
303
firstWord(string & word,WordData & counts,bool read_only)304 bool FrequencyDBImpl_bdb::firstWord(string &word,
305 WordData &counts,
306 bool read_only)
307 {
308 closeCursor();
309
310 assert(read_only || !m_isReadOnly);
311
312 int cursor_flags = 0;
313 #ifdef USE_CDB
314 if (!read_only) {
315 cursor_flags |= DB_WRITECURSOR;
316 }
317 #endif
318
319 int ret = warn_on_error("cursor", m_file->cursor(m_file, NULL, &m_cursor, cursor_flags));
320 if (ret != 0) {
321 return false;
322 }
323 assert(m_cursor);
324
325 return nextWord(word, counts);
326 }
327
nextWord(string & word,WordData & counts)328 bool FrequencyDBImpl_bdb::nextWord(string &word,
329 WordData &counts)
330 {
331 if (!m_cursor) {
332 return false;
333 }
334
335 MyDBT key;
336 MyDBT value;
337 int ret = warn_on_error("c_get", m_cursor->c_get(m_cursor, &key, &value, DB_NEXT));
338 if (ret != 0) {
339 word.erase();
340 counts.clear();
341 closeCursor();
342 return false;
343 }
344
345 if (!value.data || value.size != sizeof(WordData)) {
346 word.erase();
347 counts.clear();
348 return false;
349 }
350
351 word.assign((const char *)key.data, max((u_int32_t)0, key.size - 1));
352 counts = *((WordData *)value.data);
353 return true;
354 }
355
loadKey(DBT & key,string & word,WordData & counts) const356 bool FrequencyDBImpl_bdb::loadKey(DBT &key,
357 string &word,
358 WordData &counts) const
359 {
360 if (key.data == NULL) {
361 word.erase();
362 counts.clear();
363 return false;
364 }
365
366 if (key.size == 0) {
367 word.erase();
368 } else {
369 word.assign((const char *)key.data, key.size - 1);
370 }
371 return loadKey(key, counts);
372 }
373
loadKey(DBT & key,WordData & counts) const374 bool FrequencyDBImpl_bdb::loadKey(DBT &key,
375 WordData &counts) const
376 {
377 if (key.data == NULL) {
378 counts.clear();
379 return false;
380 }
381
382 MyDBT value;
383 int ret = throw_on_error("get", m_file->get(m_file, NULL, &key, &value, 0));
384 if (ret != 0) {
385 counts.clear();
386 return false;
387 }
388
389 if (!value.data || value.size != sizeof(WordData)) {
390 counts.clear();
391 return false;
392 }
393
394 counts = *((WordData *)value.data);
395 return true;
396 }
397
getDatabaseType() const398 string FrequencyDBImpl_bdb::getDatabaseType() const
399 {
400 return "BerkeleyDB-btree";
401 }
402
sweepOutOldTerms(const CleanupManager & cleanman)403 void FrequencyDBImpl_bdb::sweepOutOldTerms(const CleanupManager &cleanman)
404 {
405 string word;
406 WordData counts;
407
408 assert(!m_isReadOnly);
409
410 bool again = firstWord(word, counts, false);
411 while (again) {
412 bool delete_word = false;
413 if (word.length() >= 3 && word[0] == '_' && word[1] == '_') {
414 if (starts_with(word, "__MD5") && counts.totalCount() == 0) {
415 // go ahead and remove digests that have a count of zero
416 delete_word = true;
417 } else {
418 // ignore special words like __MD5 and __COUNT__
419 }
420 } else {
421 delete_word = cleanman.shouldDelete(counts);
422 }
423 if (delete_word) {
424 if (is_debug) {
425 cerr << "sweepOutJunk: removing term " << word
426 << " with total count " << counts.totalCount()
427 << " and age " << counts.age()
428 << endl;
429 }
430 warn_on_error("c_del", m_cursor->c_del(m_cursor, 0));
431 }
432 again = nextWord(word, counts);
433 }
434
435 flush();
436 }
437
#endif // USE_DB
439