1 ///###////////////////////////////////////////////////////////////////////////
2 //
3 // Burton Computer Corporation
4 // http://www.burton-computer.com
5 // http://www.cooldevtools.com
6 // $Id: HashDataFile.cc 272 2007-01-06 19:37:27Z brian $
7 //
8 // Copyright (C) 2007 Burton Computer Corporation
9 // ALL RIGHTS RESERVED
10 //
11 // This program is open source software; you can redistribute it
12 // and/or modify it under the terms of the Q Public License (QPL)
13 // version 1.0. Use of this software in whole or in part, including
14 // linking it (modified or unmodified) into other programs is
15 // subject to the terms of the QPL.
16 //
17 // This program is distributed in the hope that it will be useful,
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 // Q Public License for more details.
21 //
22 // You should have received a copy of the Q Public License
23 // along with this program; see the file LICENSE.txt.  If not, visit
24 // the Burton Computer Corporation or CoolDevTools web site
25 // QPL pages at:
26 //
27 //    http://www.burton-computer.com/qpl.html
28 //    http://www.cooldevtools.com/qpl.html
29 //
30 
31 #ifdef USE_MMAP
32 
33 #include <stdexcept>
34 #include <stdio.h>
35 #include <unistd.h>
36 #include <fcntl.h>
37 #include <errno.h>
38 #include <sys/mman.h>
39 #include "File.h"
40 #include "WordData.h"
41 #include "HashDataFile.h"
42 
43 enum {
44   SIZE_MULTIPLE = 1024,
45   DEBUG_HASH = 1,
46 };
47 
48 // table of primes indexed roughy by megabyte from 1 to 100 megs
49 static const int primes_by_meg[] = {
50   // assumes 12 bytes per entry
51      87359,     174761,     262139,     349519,     436889,   // 1 - 5 megs
52     524287,     611657,     699037,     786431,     873787,   // 6 - 10 megs
53     961189,    1048573,    1135951,    1223329,    1310719,   // 11 - 15 megs
54    1398091,    1485479,    1572853,    1660231,    1747619,   // 16 - 20 megs
55    1835003,    1922387,    2009759,    2097143,    2184509,   // 21 - 25 megs
56    2271901,    2359267,    2446673,    2534057,    2621431,   // 26 - 30 megs
57    2708821,    2796181,    2883577,    2970959,    3058343,   // 31 - 35 megs
58    3145721,    3233093,    3320477,    3407857,    3495253,   // 36 - 40 megs
59    3582629,    3670013,    3757393,    3844777,    3932153,   // 41 - 45 megs
60    4019527,    4106897,    4194301,    4281679,    4369061,   // 46 - 50 megs
61    4456433,    4543823,    4631203,    4718579,    4805959,   // 51 - 55 megs
62    4893341,    4980727,    5068111,    5155489,    5242877,   // 56 - 60 megs
63    5330251,    5417597,    5505023,    5592373,    5679769,   // 61 - 65 megs
64    5767129,    5854543,    5941921,    6029299,    6116687,   // 66 - 70 megs
65    6204071,    6291449,    6378787,    6466193,    6553577,   // 71 - 75 megs
66    6640979,    6728347,    6815741,    6903121,    6990493,   // 76 - 80 megs
67    7077883,    7165267,    7252649,    7340009,    7427353,   // 81 - 85 megs
68    7514791,    7602151,    7689557,    7776931,    7864301,   // 86 - 90 megs
69    7951693,    8039081,    8126453,    8213819,    8301217,   // 91 - 95 megs
70    8388593,    8475947,    8563351,    8650727,    8738129,   // 96 - 100 megs
71 };
72 
73 #define PRIMES_ARRAY_SIZE (sizeof(primes_by_meg) / sizeof(primes_by_meg[0]))
74 
compute_divisor_for_max_elements(int max_elements)75 static int compute_divisor_for_max_elements(int max_elements)
76 {
77   for (int i = PRIMES_ARRAY_SIZE - 1; i >= 0; --i) {
78     int divisor = primes_by_meg[i];
79     if (divisor <= max_elements) {
80       return divisor;
81     }
82   }
83   return primes_by_meg[0];
84 }
85 
HashDataFile(int num_headers,HashDataFile::ulong_t size_in_bytes)86 HashDataFile::HashDataFile(int num_headers,
87                            HashDataFile::ulong_t size_in_bytes)
88   : m_base(0)
89 {
90   initialize(num_headers, size_in_bytes);
91 }
92 
~HashDataFile()93 HashDataFile::~HashDataFile()
94 {
95   close();
96 }
97 
initialize(int num_headers,HashDataFile::ulong_t size_in_bytes)98 void HashDataFile::initialize(int num_headers,
99                               HashDataFile::ulong_t size_in_bytes)
100 {
101   m_numHeaders = num_headers;
102   int max_elements = (size_in_bytes / WordArray::ENTRY_SIZE) - m_numHeaders;
103   m_divisor = compute_divisor_for_max_elements(max_elements);
104   m_indexLimit = m_divisor + m_numHeaders;
105   m_size = WordArray::ENTRY_SIZE * m_indexLimit;
106   if (is_debug) {
107     cerr << "HASH_DIVISOR " << m_divisor
108          << " (0x" << hex << m_divisor << ")"
109          << " INDEX_LIMIT " << dec << m_indexLimit
110          << " SIZE " << m_size
111          << endl;
112   }
113 }
114 
populateEmptyFile(int fd)115 void HashDataFile::populateEmptyFile(int fd)
116 {
117   if (is_debug) {
118     cerr << "INITIALIZING NEW HASHFILE" << endl;
119   }
120 
121   const int SIZE_MULTIPLE = 1024;
122   unsigned char zeros[SIZE_MULTIPLE * WordArray::ENTRY_SIZE];
123   WordArray::fillNullBuffer(zeros, SIZE_MULTIPLE);
124 
125   for (int i = 0; i < m_indexLimit; i += SIZE_MULTIPLE) {
126     ::write(fd, &zeros, min(m_indexLimit - i, (HashDataFile::ulong_t)SIZE_MULTIPLE) * WordArray::ENTRY_SIZE);
127   }
128 }
129 
open(const string & filename,bool read_only,int create_mode)130 bool HashDataFile::open(const string &filename,
131                         bool read_only,
132                         int create_mode)
133 {
134   close();
135 
136   if (is_debug) {
137     cerr << "OPEN HASHFILE " << filename << endl;
138   }
139 
140   m_isReadOnly = read_only;
141   m_createMode = create_mode;
142 
143   File data_file(filename);
144   bool exists = data_file.isFile();
145 
146   m_isNewFile = !exists;
147 
148   if (read_only && !exists) {
149     cerr << "error: hash file "
150          << filename
151          << " does not exist and cannot be created in read-only mode"
152          << endl;
153     return false;
154   }
155 
156   if (exists) {
157     unsigned long file_size = data_file.getSize();
158     if ((file_size % WordArray::ENTRY_SIZE) != 0) {
159       cerr << "error: hash file "
160            << filename
161            << " size not a multiple of "
162            << WordArray::ENTRY_SIZE
163            << " bytes"
164            << endl;
165       return false;
166     }
167     initialize(m_numHeaders, file_size);
168     if (m_size != file_size) {
169       cerr << "error: hash file "
170            << filename
171            << " size invalid "
172            << file_size
173            << " != "
174            << m_size
175            << endl;
176       return false;
177     }
178   }
179 
180   if (is_debug) {
181     cerr << "HASHFILE " << filename << " SIZE " << m_size << endl;
182   }
183 
184   int flags = (read_only) ? O_RDONLY : O_RDWR;
185   if (!exists) {
186     flags |= O_CREAT;
187   }
188 
189   int fd = ::open(filename.c_str(), flags, create_mode);
190   if (fd < 0) {
191     cerr << "error: unable to open database " << filename << ": " << strerror(errno) << endl;
192     return false;
193   }
194 
195   if (!exists) {
196     populateEmptyFile(fd);
197   }
198 
199   if (!mmapFile(fd, read_only)) {
200     cerr << "error: unable to mmap file " << filename << ": " << strerror(errno) << endl;
201     close();
202     return false;
203   }
204 
205   m_filename = filename;
206 
207   return true;
208 }
209 
mmapFile(int fd,bool read_only)210 bool HashDataFile::mmapFile(int fd,
211                             bool read_only)
212 {
213 
214   if (is_debug) {
215     cerr << "MMAPPING HASHFILE" << endl;
216   }
217 
218   int flags = (read_only) ? PROT_READ : (PROT_READ | PROT_WRITE);
219   m_base = (char *)mmap(0, m_size, flags, MAP_SHARED, fd, 0);
220   if (m_base == (char *)-1) {
221     m_base = 0;
222     return false;
223   }
224 
225   m_array.reset(m_base, m_indexLimit);
226   return true;
227 }
228 
close()229 void HashDataFile::close()
230 {
231   if (m_base) {
232     ::munmap(m_base, m_size);
233     m_array.reset(0, 0);
234     m_base = 0;
235     m_filename.erase();
236   }
237 }
238 
isOpen()239 bool HashDataFile::isOpen()
240 {
241   return m_base != 0;
242 }
243 
writeRecord(int record_number,HashDataFile::ulong_t key,const WordData & word_data)244 void HashDataFile::writeRecord(int record_number,
245                                HashDataFile::ulong_t key,
246                                const WordData &word_data)
247 {
248   assert(m_base);
249   assert(record_number >= 0);
250   assert(record_number < m_indexLimit);
251 
252   m_array.writeWord(record_number, key, word_data);
253 }
254 
readRecord(int record_number,HashDataFile::ulong_t & key,WordData & word_data)255 void HashDataFile::readRecord(int record_number,
256                               HashDataFile::ulong_t &key,
257                               WordData &word_data)
258 {
259   assert(m_base);
260   assert(record_number >= 0);
261   assert(record_number < m_indexLimit);
262 
263   m_array.readWord(record_number, key, word_data);
264 }
265 
write(ulong_t key,const WordData & word_data)266 bool HashDataFile::write(ulong_t key,
267                          const WordData &word_data)
268 {
269   assert(m_base);
270 
271   int index = computeIndexForKey(key);
272   for (int i = 0; i < m_divisor; ++i) {
273     ulong_t old_key = m_array.readKey(index);
274     if (old_key == key || old_key == 0) {
275       m_array.writeWord(index, key, word_data);
276       return true;
277     }
278 
279     if (m_autoCleaner.isNotNull()) {
280       WordData old_data;
281       m_array.readWord(index, old_key, old_data);
282       if (m_autoCleaner->shouldDelete(old_data)) {
283 	m_array.writeWord(index, key, word_data);
284 	if (is_debug) {
285 	  cerr << "HashDataFile: overwriting old word for new one: "
286 	       << " index " << index
287 	       << " new_key " << key
288 	       << " old_key " << old_key
289 	       << " total " << old_data.totalCount()
290 	       << " good " << old_data.goodCount()
291 	       << " spam " << old_data.spamCount()
292 	       << " age " << old_data.age()
293 	       << endl;
294 	}
295 	return true;
296       }
297     }
298 
299     // skip this slot since it contains a collision
300     index = m_numHeaders + ((index - m_numHeaders + 1) % m_divisor);
301   }
302 
303   return false;
304 }
305 
read(ulong_t key,WordData & word_data)306 bool HashDataFile::read(ulong_t key,
307                         WordData &word_data)
308 {
309   assert(m_base);
310 
311   ulong_t old_key = 0;
312   int index = computeIndexForKey(key);
313   for (int i = 0; i < m_divisor; ++i) {
314     m_array.readWord(index, old_key, word_data);
315     if (old_key == key) {
316       // slot matches our key so return it's value
317       return true;
318     }
319 
320     if (old_key == 0) {
321       // slot is empty - this key is not in file
322       return false;
323     }
324 
325     // skip this slot since it contains a collision
326     index = m_numHeaders + ((index - m_numHeaders + 1) % m_divisor);
327   }
328 
329   return false;
330 }
331 
copyHeadersToFile(HashDataFile & file)332 void HashDataFile::copyHeadersToFile(HashDataFile &file)
333 {
334   ulong_t key = 0;
335   WordData word_data;
336   for (int i = 0; i < m_numHeaders; ++i) {
337     readRecord(i, key, word_data);
338     file.writeRecord(i, key, word_data);
339   }
340 }
341 
copyContentsToFile(HashDataFile & file)342 void HashDataFile::copyContentsToFile(HashDataFile &file)
343 {
344   ulong_t key = 0;
345   WordData word_data;
346   for (int i = m_numHeaders; i < m_indexLimit; ++i) {
347     readRecord(i, key, word_data);
348     if (key != 0) {
349       file.write(key, word_data);
350     }
351   }
352 }
353 
354 #endif // USE_MMAP
355