1 ///###////////////////////////////////////////////////////////////////////////
2 //
3 // Burton Computer Corporation
4 // http://www.burton-computer.com
5 // http://www.cooldevtools.com
6 // $Id: FrequencyDBImpl_bdb.cc 272 2007-01-06 19:37:27Z brian $
7 //
8 // Copyright (C) 2007 Burton Computer Corporation
9 // ALL RIGHTS RESERVED
10 //
11 // This program is open source software; you can redistribute it
12 // and/or modify it under the terms of the Q Public License (QPL)
13 // version 1.0. Use of this software in whole or in part, including
14 // linking it (modified or unmodified) into other programs is
15 // subject to the terms of the QPL.
16 //
17 // This program is distributed in the hope that it will be useful,
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 // Q Public License for more details.
21 //
22 // You should have received a copy of the Q Public License
23 // along with this program; see the file LICENSE.txt.  If not, visit
24 // the Burton Computer Corporation or CoolDevTools web site
25 // QPL pages at:
26 //
27 //    http://www.burton-computer.com/qpl.html
28 //    http://www.cooldevtools.com/qpl.html
29 //
30 
31 #ifdef USE_DB
32 
33 #include <unistd.h>
34 #include <stdexcept>
35 #include <strstream>
36 #include "CleanupManager.h"
37 #include "LockFile.h"
38 #include "WordData.h"
39 #include "FrequencyDBImpl_bdb.h"
40 
41 struct MyDBT : public DBT
42 {
MyDBTMyDBT43   MyDBT()
44   {
45     memset(this, 0, sizeof(*this));
46   }
47 
MyDBTMyDBT48   MyDBT(const string &word)
49   {
50     memset(this, 0, sizeof(*this));
51     size = word.length() + 1;
52     data = (char *)word.c_str();
53   }
54 
MyDBTMyDBT55   MyDBT(const WordData &counts)
56   {
57     memset(this, 0, sizeof(*this));
58     size = sizeof(WordData);
59     data = (char *)&counts;
60   }
61 };
62 
throw_on_error(const char * function_name,int rc)63 inline int throw_on_error(const char *function_name,
64                           int rc)
65 {
66   if (rc == DB_NOTFOUND) {
67     return rc;
68   }
69   if (rc != 0) {
70     static char buffer[4096];
71     ostrstream msg(buffer, sizeof(buffer));
72     msg << function_name << ": " << db_strerror(rc) << " (" << rc << ")" << ends;
73     throw runtime_error(buffer);
74   }
75   return rc;
76 }
77 
warn_on_error(const char * function_name,int rc)78 inline int warn_on_error(const char *function_name,
79                          int rc)
80 {
81   if (rc == DB_NOTFOUND) {
82     return rc;
83   }
84   if (rc != 0) {
85     cerr << "warning: berkeley db reported error: "
86          << function_name
87          << ": "
88          << db_strerror(rc)
89          << " (" << rc << ")"
90          << endl;
91   }
92   return rc;
93 }
94 
factory(const DatabaseConfig * config)95 FrequencyDBImpl *FrequencyDBImpl_bdb::factory(const DatabaseConfig *config)
96 {
97   return new FrequencyDBImpl_bdb();
98 }
99 
FrequencyDBImpl_bdb()100 FrequencyDBImpl_bdb::FrequencyDBImpl_bdb()
101   : m_env(0),
102     m_file(0),
103     m_cursor(0)
104 {
105 }
106 
~FrequencyDBImpl_bdb()107 FrequencyDBImpl_bdb::~FrequencyDBImpl_bdb()
108 {
109   close();
110 }
111 
open(const string & arg_filename,bool read_only,int create_mode)112 bool FrequencyDBImpl_bdb::open(const string &arg_filename,
113                                bool read_only,
114                                int create_mode)
115 {
116   close();
117 
118   m_isReadOnly = read_only;
119 
120   File db_file(arg_filename);
121   if (!openEnvironment(db_file, read_only, create_mode)) {
122     return false;
123   }
124 
125   if (!openDatabase(db_file, read_only, create_mode)) {
126     closeEnvironment();
127     return false;
128   }
129 
130   if (is_debug) {
131     cerr << "DATABASE OPENED " << db_file.getPath() << endl;
132   }
133 
134   return true;
135 }
136 
openDatabase(const File & db_file,bool read_only,int create_mode)137 bool FrequencyDBImpl_bdb::openDatabase(const File &db_file,
138                                        bool read_only,
139                                        int create_mode)
140 {
141   if (is_debug) {
142     cerr << "OPENING DATABASE " << db_file.getPath() << endl;
143   }
144 
145   int ret = db_create(&m_file, m_env, 0);
146   if (ret != 0) {
147     cerr << "error: unable to create database " << db_file.getPath() << ": " << db_strerror(ret) << endl;
148     return false;
149   }
150 
151   // SleepyCat in their infinite wisdom decided to change the open
152   // function's signature in the 4.1 release.  Gee thanks for breaking
153   // my code guys. That was real smart and so much better than
154   // introducing a second open function rather than force me to embed
155   // hideous ifdefs into my code.
156   string filename(m_env ? db_file.getName() : db_file.getPath());
157   int flags = read_only ? DB_RDONLY : DB_CREATE;
158 #if DB_VERSION_MAJOR >= 4 && DB_VERSION_MINOR >= 1
159   ret = m_file->open(m_file, NULL, filename.c_str(), NULL, DB_BTREE, flags, create_mode);
160 #else
161   ret = m_file->open(m_file, filename.c_str(), NULL, DB_BTREE, flags, create_mode);
162 #endif
163   if (ret != 0) {
164     cerr << "error: unable to open database " << db_file.getPath() << ": " << db_strerror(ret) << endl;
165     m_file = 0;
166     return false;
167   }
168 
169   if (is_debug) {
170     cerr << "OPENED DATABASE " << db_file.getPath() << endl;
171   }
172 
173   return true;
174 }
175 
openEnvironment(const File & db_file,bool read_only,int create_mode)176 bool FrequencyDBImpl_bdb::openEnvironment(const File &db_file,
177                                           bool read_only,
178                                           int create_mode)
179 {
180 #if USE_CDB
181   File env_dir(db_file.parent());
182 
183   if (is_debug) {
184     cerr << "OPENING ENVIRONMENT " << env_dir.getPath() << endl;
185   }
186 
187   int ret = db_env_create(&m_env, 0);
188   if (ret != 0) {
189     cerr << "error: unable to create environment " << db_file.getPath() << ": " << db_strerror(ret) << endl;
190     m_env = 0;
191     return false;
192   }
193 
194   int env_flags = DB_INIT_CDB | DB_INIT_MPOOL | DB_CREATE;
195   ret = m_env->open(m_env, db_file.parent().getPath().c_str(), env_flags, create_mode);
196   if (ret != 0) {
197     if (read_only) {
198       // we can still operate without the environment if we're in read-only mode
199       m_env = 0;
200     } else {
201       cerr << "error: unable to open environment " << env_dir.getPath() << ": " << db_strerror(ret) << endl;
202       m_env = 0;
203       return false;
204     }
205   }
206 #else
207   m_env = 0;
208 #endif
209 
210   return true;
211 }
212 
closeCursor()213 void FrequencyDBImpl_bdb::closeCursor()
214 {
215   if (m_cursor) {
216     warn_on_error("c_close", m_cursor->c_close(m_cursor));
217     m_cursor = 0;
218   }
219 }
220 
closeDatabase()221 void FrequencyDBImpl_bdb::closeDatabase()
222 {
223   if (m_file) {
224     warn_on_error("db sync", m_file->sync(m_file, 0));
225     warn_on_error("db close", m_file->close(m_file, 0));
226     m_file = 0;
227   }
228 }
229 
closeEnvironment()230 void FrequencyDBImpl_bdb::closeEnvironment()
231 {
232 #ifdef USE_CDB
233   if (m_env) {
234     warn_on_error("env close", m_env->close(m_env, 0));
235     m_env = 0;
236   }
237 #endif
238 }
239 
close()240 void FrequencyDBImpl_bdb::close()
241 {
242   closeCursor();
243   closeDatabase();
244   closeEnvironment();
245 }
246 
flush()247 void FrequencyDBImpl_bdb::flush()
248 {
249   if (is_debug) {
250     cerr << "flushing database..." << endl;
251   }
252   throw_on_error("sync", m_file->sync(m_file, 0));
253 }
254 
writeWord(const string & word,const WordData & counts)255 void FrequencyDBImpl_bdb::writeWord(const string &word,
256                                     const WordData &counts)
257 {
258   assert(m_file);
259 
260   MyDBT key(word);
261   bool delete_word = counts.totalCount() <= 0;
262 
263   if (is_debug) {
264     WordData old_counts;
265     bool exists = loadKey(key, old_counts);
266     if (delete_word) {
267       cerr << "writeWord: deleting '" << word << "'"
268            << endl;
269     } else if (exists) {
270       cerr << "writeWord: updating '" << word << "'"
271            << " old (" << old_counts.goodCount() << "," << old_counts.spamCount() << ")"
272            << " new (" << counts.goodCount() << "," << counts.spamCount() << ")"
273            << endl;
274     } else {
275       cerr << "writeWord: inserting '" << word << "'"
276            << " new (" << counts.goodCount() << "," << counts.spamCount() << ")"
277            << endl;
278     }
279   }
280 
281   if (delete_word) {
282     throw_on_error("del", m_file->del(m_file, NULL, &key, 0));
283   } else {
284     MyDBT value(counts);
285     throw_on_error("put", m_file->put(m_file, NULL, &key, &value, 0));
286   }
287 }
288 
readWord(const string & word,WordData & counts)289 bool FrequencyDBImpl_bdb::readWord(const string &word,
290                                    WordData &counts)
291 {
292   assert(m_file);
293 
294   MyDBT key(word);
295   return loadKey(key, counts);
296 }
297 
firstWord(string & word,WordData & counts)298 bool FrequencyDBImpl_bdb::firstWord(string &word,
299                                     WordData &counts)
300 {
301   return firstWord(word, counts, true);
302 }
303 
firstWord(string & word,WordData & counts,bool read_only)304 bool FrequencyDBImpl_bdb::firstWord(string &word,
305                                     WordData &counts,
306                                     bool read_only)
307 {
308   closeCursor();
309 
310   assert(read_only || !m_isReadOnly);
311 
312   int cursor_flags = 0;
313 #ifdef USE_CDB
314   if (!read_only) {
315     cursor_flags |= DB_WRITECURSOR;
316   }
317 #endif
318 
319   int ret = warn_on_error("cursor", m_file->cursor(m_file, NULL, &m_cursor, cursor_flags));
320   if (ret != 0) {
321     return false;
322   }
323   assert(m_cursor);
324 
325   return nextWord(word, counts);
326 }
327 
nextWord(string & word,WordData & counts)328 bool FrequencyDBImpl_bdb::nextWord(string &word,
329                                    WordData &counts)
330 {
331   if (!m_cursor) {
332     return false;
333   }
334 
335   MyDBT key;
336   MyDBT value;
337   int ret = warn_on_error("c_get", m_cursor->c_get(m_cursor, &key, &value, DB_NEXT));
338   if (ret != 0) {
339     word.erase();
340     counts.clear();
341     closeCursor();
342     return false;
343   }
344 
345   if (!value.data || value.size != sizeof(WordData)) {
346     word.erase();
347     counts.clear();
348     return false;
349   }
350 
351   word.assign((const char *)key.data, max((u_int32_t)0, key.size - 1));
352   counts = *((WordData *)value.data);
353   return true;
354 }
355 
loadKey(DBT & key,string & word,WordData & counts) const356 bool FrequencyDBImpl_bdb::loadKey(DBT &key,
357                                   string &word,
358                                   WordData &counts) const
359 {
360   if (key.data == NULL) {
361     word.erase();
362     counts.clear();
363     return false;
364   }
365 
366   if (key.size == 0) {
367     word.erase();
368   } else {
369     word.assign((const char *)key.data, key.size - 1);
370   }
371   return loadKey(key, counts);
372 }
373 
loadKey(DBT & key,WordData & counts) const374 bool FrequencyDBImpl_bdb::loadKey(DBT &key,
375                                   WordData &counts) const
376 {
377   if (key.data == NULL) {
378     counts.clear();
379     return false;
380   }
381 
382   MyDBT value;
383   int ret = throw_on_error("get", m_file->get(m_file, NULL, &key, &value, 0));
384   if (ret != 0) {
385     counts.clear();
386     return false;
387   }
388 
389   if (!value.data || value.size != sizeof(WordData)) {
390     counts.clear();
391     return false;
392   }
393 
394   counts = *((WordData *)value.data);
395   return true;
396 }
397 
getDatabaseType() const398 string FrequencyDBImpl_bdb::getDatabaseType() const
399 {
400   return "BerkeleyDB-btree";
401 }
402 
sweepOutOldTerms(const CleanupManager & cleanman)403 void FrequencyDBImpl_bdb::sweepOutOldTerms(const CleanupManager &cleanman)
404 {
405   string word;
406   WordData counts;
407 
408   assert(!m_isReadOnly);
409 
410   bool again = firstWord(word, counts, false);
411   while (again) {
412     bool delete_word = false;
413     if (word.length() >= 3 && word[0] == '_' && word[1] == '_') {
414       if (starts_with(word, "__MD5") && counts.totalCount() == 0) {
415         // go ahead and remove digests that have a count of zero
416         delete_word = true;
417       } else {
418         // ignore special words like __MD5 and __COUNT__
419       }
420     } else {
421       delete_word = cleanman.shouldDelete(counts);
422     }
423     if (delete_word) {
424       if (is_debug) {
425         cerr << "sweepOutJunk: removing term " << word
426              << " with total count " << counts.totalCount()
427              << " and age " << counts.age()
428              << endl;
429       }
430       warn_on_error("c_del", m_cursor->c_del(m_cursor, 0));
431     }
432     again = nextWord(word, counts);
433   }
434 
435   flush();
436 }
437 
#endif // USE_DB
439