1 ///###////////////////////////////////////////////////////////////////////////
2 //
3 // Burton Computer Corporation
4 // http://www.burton-computer.com
5 // http://www.cooldevtools.com
6 // $Id: HashDataFile.cc 272 2007-01-06 19:37:27Z brian $
7 //
8 // Copyright (C) 2007 Burton Computer Corporation
9 // ALL RIGHTS RESERVED
10 //
11 // This program is open source software; you can redistribute it
12 // and/or modify it under the terms of the Q Public License (QPL)
13 // version 1.0. Use of this software in whole or in part, including
14 // linking it (modified or unmodified) into other programs is
15 // subject to the terms of the QPL.
16 //
17 // This program is distributed in the hope that it will be useful,
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 // Q Public License for more details.
21 //
22 // You should have received a copy of the Q Public License
23 // along with this program; see the file LICENSE.txt. If not, visit
24 // the Burton Computer Corporation or CoolDevTools web site
25 // QPL pages at:
26 //
27 // http://www.burton-computer.com/qpl.html
28 // http://www.cooldevtools.com/qpl.html
29 //
30
31 #ifdef USE_MMAP
32
33 #include <stdexcept>
34 #include <stdio.h>
35 #include <unistd.h>
36 #include <fcntl.h>
37 #include <errno.h>
38 #include <sys/mman.h>
39 #include "File.h"
40 #include "WordData.h"
41 #include "HashDataFile.h"
42
43 enum {
44 SIZE_MULTIPLE = 1024,
45 DEBUG_HASH = 1,
46 };
47
48 // table of primes indexed roughy by megabyte from 1 to 100 megs
49 static const int primes_by_meg[] = {
50 // assumes 12 bytes per entry
51 87359, 174761, 262139, 349519, 436889, // 1 - 5 megs
52 524287, 611657, 699037, 786431, 873787, // 6 - 10 megs
53 961189, 1048573, 1135951, 1223329, 1310719, // 11 - 15 megs
54 1398091, 1485479, 1572853, 1660231, 1747619, // 16 - 20 megs
55 1835003, 1922387, 2009759, 2097143, 2184509, // 21 - 25 megs
56 2271901, 2359267, 2446673, 2534057, 2621431, // 26 - 30 megs
57 2708821, 2796181, 2883577, 2970959, 3058343, // 31 - 35 megs
58 3145721, 3233093, 3320477, 3407857, 3495253, // 36 - 40 megs
59 3582629, 3670013, 3757393, 3844777, 3932153, // 41 - 45 megs
60 4019527, 4106897, 4194301, 4281679, 4369061, // 46 - 50 megs
61 4456433, 4543823, 4631203, 4718579, 4805959, // 51 - 55 megs
62 4893341, 4980727, 5068111, 5155489, 5242877, // 56 - 60 megs
63 5330251, 5417597, 5505023, 5592373, 5679769, // 61 - 65 megs
64 5767129, 5854543, 5941921, 6029299, 6116687, // 66 - 70 megs
65 6204071, 6291449, 6378787, 6466193, 6553577, // 71 - 75 megs
66 6640979, 6728347, 6815741, 6903121, 6990493, // 76 - 80 megs
67 7077883, 7165267, 7252649, 7340009, 7427353, // 81 - 85 megs
68 7514791, 7602151, 7689557, 7776931, 7864301, // 86 - 90 megs
69 7951693, 8039081, 8126453, 8213819, 8301217, // 91 - 95 megs
70 8388593, 8475947, 8563351, 8650727, 8738129, // 96 - 100 megs
71 };
72
73 #define PRIMES_ARRAY_SIZE (sizeof(primes_by_meg) / sizeof(primes_by_meg[0]))
74
compute_divisor_for_max_elements(int max_elements)75 static int compute_divisor_for_max_elements(int max_elements)
76 {
77 for (int i = PRIMES_ARRAY_SIZE - 1; i >= 0; --i) {
78 int divisor = primes_by_meg[i];
79 if (divisor <= max_elements) {
80 return divisor;
81 }
82 }
83 return primes_by_meg[0];
84 }
85
HashDataFile(int num_headers,HashDataFile::ulong_t size_in_bytes)86 HashDataFile::HashDataFile(int num_headers,
87 HashDataFile::ulong_t size_in_bytes)
88 : m_base(0)
89 {
90 initialize(num_headers, size_in_bytes);
91 }
92
~HashDataFile()93 HashDataFile::~HashDataFile()
94 {
95 close();
96 }
97
initialize(int num_headers,HashDataFile::ulong_t size_in_bytes)98 void HashDataFile::initialize(int num_headers,
99 HashDataFile::ulong_t size_in_bytes)
100 {
101 m_numHeaders = num_headers;
102 int max_elements = (size_in_bytes / WordArray::ENTRY_SIZE) - m_numHeaders;
103 m_divisor = compute_divisor_for_max_elements(max_elements);
104 m_indexLimit = m_divisor + m_numHeaders;
105 m_size = WordArray::ENTRY_SIZE * m_indexLimit;
106 if (is_debug) {
107 cerr << "HASH_DIVISOR " << m_divisor
108 << " (0x" << hex << m_divisor << ")"
109 << " INDEX_LIMIT " << dec << m_indexLimit
110 << " SIZE " << m_size
111 << endl;
112 }
113 }
114
populateEmptyFile(int fd)115 void HashDataFile::populateEmptyFile(int fd)
116 {
117 if (is_debug) {
118 cerr << "INITIALIZING NEW HASHFILE" << endl;
119 }
120
121 const int SIZE_MULTIPLE = 1024;
122 unsigned char zeros[SIZE_MULTIPLE * WordArray::ENTRY_SIZE];
123 WordArray::fillNullBuffer(zeros, SIZE_MULTIPLE);
124
125 for (int i = 0; i < m_indexLimit; i += SIZE_MULTIPLE) {
126 ::write(fd, &zeros, min(m_indexLimit - i, (HashDataFile::ulong_t)SIZE_MULTIPLE) * WordArray::ENTRY_SIZE);
127 }
128 }
129
open(const string & filename,bool read_only,int create_mode)130 bool HashDataFile::open(const string &filename,
131 bool read_only,
132 int create_mode)
133 {
134 close();
135
136 if (is_debug) {
137 cerr << "OPEN HASHFILE " << filename << endl;
138 }
139
140 m_isReadOnly = read_only;
141 m_createMode = create_mode;
142
143 File data_file(filename);
144 bool exists = data_file.isFile();
145
146 m_isNewFile = !exists;
147
148 if (read_only && !exists) {
149 cerr << "error: hash file "
150 << filename
151 << " does not exist and cannot be created in read-only mode"
152 << endl;
153 return false;
154 }
155
156 if (exists) {
157 unsigned long file_size = data_file.getSize();
158 if ((file_size % WordArray::ENTRY_SIZE) != 0) {
159 cerr << "error: hash file "
160 << filename
161 << " size not a multiple of "
162 << WordArray::ENTRY_SIZE
163 << " bytes"
164 << endl;
165 return false;
166 }
167 initialize(m_numHeaders, file_size);
168 if (m_size != file_size) {
169 cerr << "error: hash file "
170 << filename
171 << " size invalid "
172 << file_size
173 << " != "
174 << m_size
175 << endl;
176 return false;
177 }
178 }
179
180 if (is_debug) {
181 cerr << "HASHFILE " << filename << " SIZE " << m_size << endl;
182 }
183
184 int flags = (read_only) ? O_RDONLY : O_RDWR;
185 if (!exists) {
186 flags |= O_CREAT;
187 }
188
189 int fd = ::open(filename.c_str(), flags, create_mode);
190 if (fd < 0) {
191 cerr << "error: unable to open database " << filename << ": " << strerror(errno) << endl;
192 return false;
193 }
194
195 if (!exists) {
196 populateEmptyFile(fd);
197 }
198
199 if (!mmapFile(fd, read_only)) {
200 cerr << "error: unable to mmap file " << filename << ": " << strerror(errno) << endl;
201 close();
202 return false;
203 }
204
205 m_filename = filename;
206
207 return true;
208 }
209
mmapFile(int fd,bool read_only)210 bool HashDataFile::mmapFile(int fd,
211 bool read_only)
212 {
213
214 if (is_debug) {
215 cerr << "MMAPPING HASHFILE" << endl;
216 }
217
218 int flags = (read_only) ? PROT_READ : (PROT_READ | PROT_WRITE);
219 m_base = (char *)mmap(0, m_size, flags, MAP_SHARED, fd, 0);
220 if (m_base == (char *)-1) {
221 m_base = 0;
222 return false;
223 }
224
225 m_array.reset(m_base, m_indexLimit);
226 return true;
227 }
228
close()229 void HashDataFile::close()
230 {
231 if (m_base) {
232 ::munmap(m_base, m_size);
233 m_array.reset(0, 0);
234 m_base = 0;
235 m_filename.erase();
236 }
237 }
238
isOpen()239 bool HashDataFile::isOpen()
240 {
241 return m_base != 0;
242 }
243
writeRecord(int record_number,HashDataFile::ulong_t key,const WordData & word_data)244 void HashDataFile::writeRecord(int record_number,
245 HashDataFile::ulong_t key,
246 const WordData &word_data)
247 {
248 assert(m_base);
249 assert(record_number >= 0);
250 assert(record_number < m_indexLimit);
251
252 m_array.writeWord(record_number, key, word_data);
253 }
254
readRecord(int record_number,HashDataFile::ulong_t & key,WordData & word_data)255 void HashDataFile::readRecord(int record_number,
256 HashDataFile::ulong_t &key,
257 WordData &word_data)
258 {
259 assert(m_base);
260 assert(record_number >= 0);
261 assert(record_number < m_indexLimit);
262
263 m_array.readWord(record_number, key, word_data);
264 }
265
write(ulong_t key,const WordData & word_data)266 bool HashDataFile::write(ulong_t key,
267 const WordData &word_data)
268 {
269 assert(m_base);
270
271 int index = computeIndexForKey(key);
272 for (int i = 0; i < m_divisor; ++i) {
273 ulong_t old_key = m_array.readKey(index);
274 if (old_key == key || old_key == 0) {
275 m_array.writeWord(index, key, word_data);
276 return true;
277 }
278
279 if (m_autoCleaner.isNotNull()) {
280 WordData old_data;
281 m_array.readWord(index, old_key, old_data);
282 if (m_autoCleaner->shouldDelete(old_data)) {
283 m_array.writeWord(index, key, word_data);
284 if (is_debug) {
285 cerr << "HashDataFile: overwriting old word for new one: "
286 << " index " << index
287 << " new_key " << key
288 << " old_key " << old_key
289 << " total " << old_data.totalCount()
290 << " good " << old_data.goodCount()
291 << " spam " << old_data.spamCount()
292 << " age " << old_data.age()
293 << endl;
294 }
295 return true;
296 }
297 }
298
299 // skip this slot since it contains a collision
300 index = m_numHeaders + ((index - m_numHeaders + 1) % m_divisor);
301 }
302
303 return false;
304 }
305
read(ulong_t key,WordData & word_data)306 bool HashDataFile::read(ulong_t key,
307 WordData &word_data)
308 {
309 assert(m_base);
310
311 ulong_t old_key = 0;
312 int index = computeIndexForKey(key);
313 for (int i = 0; i < m_divisor; ++i) {
314 m_array.readWord(index, old_key, word_data);
315 if (old_key == key) {
316 // slot matches our key so return it's value
317 return true;
318 }
319
320 if (old_key == 0) {
321 // slot is empty - this key is not in file
322 return false;
323 }
324
325 // skip this slot since it contains a collision
326 index = m_numHeaders + ((index - m_numHeaders + 1) % m_divisor);
327 }
328
329 return false;
330 }
331
copyHeadersToFile(HashDataFile & file)332 void HashDataFile::copyHeadersToFile(HashDataFile &file)
333 {
334 ulong_t key = 0;
335 WordData word_data;
336 for (int i = 0; i < m_numHeaders; ++i) {
337 readRecord(i, key, word_data);
338 file.writeRecord(i, key, word_data);
339 }
340 }
341
copyContentsToFile(HashDataFile & file)342 void HashDataFile::copyContentsToFile(HashDataFile &file)
343 {
344 ulong_t key = 0;
345 WordData word_data;
346 for (int i = m_numHeaders; i < m_indexLimit; ++i) {
347 readRecord(i, key, word_data);
348 if (key != 0) {
349 file.write(key, word_data);
350 }
351 }
352 }
353
354 #endif // USE_MMAP
355