/* udb.c - u(micro) data base.
 * By W.C.A. Wijngaards
 * Copyright 2010, NLnet Labs.
 * BSD, see LICENSE.
 */
#include "config.h"
#include "udb.h"
#include <string.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include "lookup3.h"
#include "util.h"

/* mmap and friends */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>

/* portable definitions for systems without them: MAP_FAILED is -1 and
 * MS_SYNC becomes a no-op flag */
#ifndef MAP_FAILED
#define MAP_FAILED ((void*)-1)
#endif
#ifndef MS_SYNC
#define MS_SYNC 0
#endif

/** move and fixup xl segment */
static void move_xl_segment(void* base, udb_base* udb, udb_void xl,
	udb_void n, uint64_t sz, uint64_t startseg);
/** attempt to compact the data and move free space to the end */
static int udb_alloc_compact(void* base, udb_alloc* alloc);

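/* Note on the dataptr-to-chunk mapping used below (illustrative; the exact
 * byte values depend on the struct sizes in udb.h): a regular chunk hands
 * out data at chunk+sizeof(udb_chunk_d), an XL chunk at
 * chunk+sizeof(udb_xl_chunk_d).  XL chunks are aligned to
 * UDB_ALLOC_CHUNK_SIZE and the two header sizes differ, so subtracting the
 * XL header size from a data pointer lands on an aligned offset only when
 * the pointer really came from an XL chunk. */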
/** convert pointer to the data part to a pointer to the base of the chunk */
static udb_void
chunk_from_dataptr(udb_void data)
{
	/* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and
	 * that xl_chunk_d is aligned on UDB_ALLOC_CHUNK_SIZE boundaries. */
	udb_void xl = data - sizeof(udb_xl_chunk_d);
	if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0)
		return xl;
	return data - sizeof(udb_chunk_d);
}

udb_void chunk_from_dataptr_ext(udb_void data) {
	return chunk_from_dataptr(data);
}

#ifndef NDEBUG
/** read last octet from a chunk */
static uint8_t
chunk_get_last(void* base, udb_void chunk, int exp)
{
	return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1));
}
#endif

/** write last octet of a chunk */
static void
chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value)
{
	assert(exp >= 0 && exp <= 63);
	*((uint8_t*)UDB_REL(base, chunk+((uint64_t)1<<exp)-1)) = value;
}

/** create udb_base from a file descriptor (must be at start of file) */
udb_base*
udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc,
	void* arg)
{
	uint64_t m, fsz;
	udb_glob_d g;
	ssize_t r;
	udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb));
	if(!udb) {
		log_msg(LOG_ERR, "out of memory");
		close(fd);
		return NULL;
	}
	udb->fname = strdup(fname);
	if(!udb->fname) {
		log_msg(LOG_ERR, "out of memory");
		free(udb);
		close(fd);
		return NULL;
	}
	udb->walkfunc = walkfunc;
	udb->walkarg = arg;
	udb->fd = fd;
	udb->ram_size = 1024;
	udb->ram_mask = (int)udb->ram_size - 1;
	udb->ram_hash = (udb_ptr**)xalloc_array_zero(sizeof(udb_ptr*),
		udb->ram_size);
	if(!udb->ram_hash) {
		free(udb->fname);
		free(udb);
		log_msg(LOG_ERR, "out of memory");
		close(fd);
		return NULL;
	}

	/* read magic */
	if((r=read(fd, &m, sizeof(m))) == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		goto fail;
	} else if(r != (ssize_t)sizeof(m)) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	/* TODO: what if the file is big-endian and we are little-endian
	 * (or the other way around), see magic */
	if(m != UDB_MAGIC) {
		log_msg(LOG_ERR, "%s: wrong type of file", fname);
		goto fail;
	}
	/* read header */
	if((r=read(fd, &g, sizeof(g))) == -1) {
		log_msg(LOG_ERR, "%s: %s\n", fname, strerror(errno));
		goto fail;
	} else if(r != (ssize_t)sizeof(g)) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	if(g.version != 0) {
		log_msg(LOG_ERR, "%s: unknown file version %d", fname,
			(int)g.version);
		goto fail;
	}
	if(g.hsize < UDB_HEADER_SIZE) {
		log_msg(LOG_ERR, "%s: header size too small %d", fname,
			(int)g.hsize);
		goto fail;
	}
	if(g.hsize > UDB_HEADER_SIZE) {
		log_msg(LOG_WARNING, "%s: header size too large %d", fname,
			(int)g.hsize);
		goto fail;
	}
	if(g.clean_close != 1) {
		log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname,
			(int)g.clean_close);
		goto fail;
	}
	if(g.dirty_alloc != 0) {
		log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname,
			(int)g.dirty_alloc);
		goto fail;
	}

	/* check that the file size was correctly written, for the 4.0.2
	 * nsd.db failure */
	fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END);
	(void)lseek(fd, (off_t)0, SEEK_SET);
	if(g.fsize != fsz) {
		log_msg(LOG_WARNING, "%s: file size %llu but mmap header "
			"has size %llu", fname, (unsigned long long)fsz,
			(unsigned long long)g.fsize);
		goto fail;
	}

	/* mmap it */
	if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ {
		log_msg(LOG_WARNING, "%s: file size too large %llu",
			fname, (unsigned long long)g.fsize);
		goto fail;
	}
	udb->base_size = (size_t)g.fsize;
#ifdef HAVE_MMAP
	/* note the size_t casts must be there for portability, on some
	 * systems the layout of memory is otherwise broken. */
	udb->base = mmap(NULL, (size_t)udb->base_size,
		(int)PROT_READ|PROT_WRITE, (int)MAP_SHARED,
		(int)udb->fd, (off_t)0);
#else
	udb->base = MAP_FAILED; errno = ENOSYS;
#endif
	if(udb->base == MAP_FAILED) {
		udb->base = NULL;
		log_msg(LOG_ERR, "mmap(size %u) error: %s",
			(unsigned)udb->base_size, strerror(errno));
	fail:
		close(fd);
		free(udb->fname);
		free(udb->ram_hash);
		free(udb);
		return NULL;
	}

	/* init completion */
	udb->glob_data = (udb_glob_d*)((char*)udb->base+sizeof(uint64_t));
	r = 0;
	/* cannot be dirty here; the dirty case went to the fail label above */
	if(udb->glob_data->dirty_alloc != udb_dirty_clean)
		r = 1;
	udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)(
		(char*)udb->glob_data+sizeof(*udb->glob_data)));
	if(!udb->alloc) {
		log_msg(LOG_ERR, "out of memory");
		udb_base_free(udb);
		return NULL;
	}
	if(r) {
		/* and compact now, or resume compacting */
		udb_alloc_compact(udb, udb->alloc);
		udb_base_sync(udb, 1);
	}
	udb->glob_data->clean_close = 0;

	return udb;
}

udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc,
	void* arg)
{
	int fd = open(fname, O_RDWR);
	if(fd == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		return NULL;
	}
	return udb_base_create_fd(fname, fd, walkfunc, arg);
}

/** init new udb_global structure */
static void udb_glob_init_new(udb_glob_d* g)
{
	memset(g, 0, sizeof(*g));
	g->hsize = UDB_HEADER_SIZE;
	g->fsize = UDB_HEADER_SIZE;
}

/** write data to file and check result */
static int
write_fdata(const char* fname, int fd, void* data, size_t len)
{
	ssize_t w;
	if((w=write(fd, data, len)) == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		close(fd);
		return 0;
	} else if(w != (ssize_t)len) {
		log_msg(LOG_ERR, "%s: short write (disk full?)", fname);
		close(fd);
		return 0;
	}
	return 1;
}

udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc,
	void* arg)
{
	uint64_t m;
	udb_glob_d g;
	udb_alloc_d a;
	uint64_t endsize = UDB_HEADER_SIZE;
	uint64_t endexp = 0;
	int fd = open(fname, O_CREAT|O_RDWR, 0600);
	if(fd == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		return NULL;
	}
	m = UDB_MAGIC;
	udb_glob_init_new(&g);
	udb_alloc_init_new(&a);
	g.clean_close = 1;

	/* write new data to file (closes fd on error) */
	if(!write_fdata(fname, fd, &m, sizeof(m)))
		return NULL;
	if(!write_fdata(fname, fd, &g, sizeof(g)))
		return NULL;
	if(!write_fdata(fname, fd, &a, sizeof(a)))
		return NULL;
	if(!write_fdata(fname, fd, &endsize, sizeof(endsize)))
		return NULL;
	if(!write_fdata(fname, fd, &endexp, sizeof(endexp)))
		return NULL;
	/* rewind to start */
	if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
		log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno));
		close(fd);
		return NULL;
	}
	/* truncate to the right size */
	if(ftruncate(fd, (off_t)g.fsize) < 0) {
		log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname,
			(int)g.fsize, strerror(errno));
		close(fd);
		return NULL;
	}
	return udb_base_create_fd(fname, fd, walkfunc, arg);
}
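
/* Typical call sequence (illustrative sketch only; a real caller supplies a
 * walkfunc that knows the chunk types it stores, here a hypothetical
 * my_walkfunc):
 *
 *	udb_base* u = udb_base_create_new("example.udb", my_walkfunc, NULL);
 *	if(u) {
 *		udb_void d = udb_alloc_space(u->alloc, 100);
 *		... store data at UDB_REL(u->base, d) ...
 *		udb_alloc_free(u->alloc, d, 100);
 *		udb_base_free(u);  (closes the fd, unmaps and frees)
 *	}
 */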

/** shrink the udb base if it has unused space at the end */
static void
udb_base_shrink(udb_base* udb, uint64_t nsize)
{
	udb->glob_data->dirty_alloc = udb_dirty_fsize;
	udb->glob_data->fsize = nsize;
	/* sync, does not *seem* to be required on Linux, but it is
	   certainly required on OpenBSD.  Otherwise changed data is lost. */
#ifdef HAVE_MMAP
	msync(udb->base, udb->base_size, MS_ASYNC);
#endif
	if(ftruncate(udb->fd, (off_t)nsize) != 0) {
		log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname,
			(unsigned)nsize, strerror(errno));
	}
	udb->glob_data->dirty_alloc = udb_dirty_clean;
}

void udb_base_close(udb_base* udb)
{
	if(!udb)
		return;
	if(udb->fd != -1 && udb->base && udb->alloc) {
		uint64_t nsize = udb->alloc->disk->nextgrow;
		if(nsize < udb->base_size)
			udb_base_shrink(udb, nsize);
	}
	if(udb->fd != -1) {
		udb->glob_data->clean_close = 1;
		close(udb->fd);
		udb->fd = -1;
	}
	if(udb->base) {
#ifdef HAVE_MMAP
		if(munmap(udb->base, udb->base_size) == -1) {
			log_msg(LOG_ERR, "munmap: %s", strerror(errno));
		}
#endif
		udb->base = NULL;
	}
}

void udb_base_free(udb_base* udb)
{
	if(!udb)
		return;
	udb_base_close(udb);
	udb_alloc_delete(udb->alloc);
	free(udb->ram_hash);
	free(udb->fname);
	free(udb);
}

void udb_base_free_keep_mmap(udb_base* udb)
{
	if(!udb) return;
	if(udb->fd != -1) {
		close(udb->fd);
		udb->fd = -1;
	}
	udb->base = NULL;
	udb_alloc_delete(udb->alloc);
	free(udb->ram_hash);
	free(udb->fname);
	free(udb);
}

void udb_base_sync(udb_base* udb, int wait)
{
	if(!udb) return;
#ifdef HAVE_MMAP
	if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) {
		log_msg(LOG_ERR, "msync(%s) error %s",
			udb->fname, strerror(errno));
	}
#else
	(void)wait;
#endif
}

/** hash a chunk pointer */
static uint32_t
chunk_hash_ptr(udb_void p)
{
	/* put p into an array of uint32 */
	uint32_t h[sizeof(p)/sizeof(uint32_t)];
	memcpy(&h, &p, sizeof(h));
	return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763);
}

/** check that the given pointer is on the bucket for the given offset */
int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to)
{
	uint32_t i = chunk_hash_ptr(to) & udb->ram_mask;
	udb_ptr* p;
	assert((size_t)i < udb->ram_size);
	for(p = udb->ram_hash[i]; p; p=p->next) {
		if(p == ptr)
			return 1;
	}
	return 0;
}

/** grow the ram array */
static void
grow_ram_hash(udb_base* udb, udb_ptr** newhash)
{
	size_t i;
	size_t osize= udb->ram_size;
	udb_ptr* p, *np;
	udb_ptr** oldhash = udb->ram_hash;
	udb->ram_size *= 2;
	udb->ram_mask <<= 1;
	udb->ram_mask |= 1;
	udb->ram_hash = newhash;
	/* have to relink every element from the old table into the new table */
	for(i=0; i<osize; i++) {
		p = oldhash[i];
		while(p) {
			np = p->next;
			/* link into newhash */
			p->prev=NULL;
			p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask];
			if(p->next) p->next->prev = p;
			/* go to next element of oldhash */
			p = np;
		}
	}
	free(oldhash);
}

void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr)
{
	uint32_t i;
#ifdef UDB_CHECK
	assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/
#endif
	udb->ram_num++;
	if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0x7fffffff) {
		/* grow the array, if allocation succeeds */
		udb_ptr** newram = (udb_ptr**)xalloc_array_zero(
			sizeof(udb_ptr*), udb->ram_size*2);
		if(newram) {
			grow_ram_hash(udb, newram);
		}
	}
	i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
	assert((size_t)i < udb->ram_size);

	ptr->prev = NULL;
	ptr->next = udb->ram_hash[i];
	udb->ram_hash[i] = ptr;
	if(ptr->next)
		ptr->next->prev = ptr;
}

void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr)
{
	assert(ptr->data);
#ifdef UDB_CHECK
	assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */
	assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data));
#endif
	udb->ram_num--;
	if(ptr->next)
		ptr->next->prev = ptr->prev;
	if(ptr->prev)
		ptr->prev->next = ptr->next;
	else {
		uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
		assert((size_t)i < udb->ram_size);
		udb->ram_hash[i] = ptr->next;
	}
}

/** change a set of ram ptrs to a new value */
static void
udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd)
{
	uint32_t io = chunk_hash_ptr(old) & udb->ram_mask;
	udb_ptr* p, *np;
	/* edit them and move them into the new position */
	p = udb->ram_hash[io];
	while(p) {
		np = p->next;
		if(p->data == old) {
			udb_base_unlink_ptr(udb, p);
			p->data = newd;
			udb_base_link_ptr(udb, p);
		}
		p = np;
	}
}

udb_rel_ptr* udb_base_get_userdata(udb_base* udb)
{
	return &udb->glob_data->user_global;
}

void udb_base_set_userdata(udb_base* udb, udb_void user)
{
#ifdef UDB_CHECK
	if(user) { assert(udb_valid_dataptr(udb, user)); }
#endif
	udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user);
}

void udb_base_set_userflags(udb_base* udb, uint8_t v)
{
	udb->glob_data->userflags = v;
}

uint8_t udb_base_get_userflags(udb_base* udb)
{
	return udb->glob_data->userflags;
}

/** re-mmap the udb to specified size */
static void*
udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize)
{
#ifdef HAVE_MMAP
	void* nb;
	/* for use with valgrind, do not use mremap, but the other version */
#ifdef MREMAP_MAYMOVE
	nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE);
	if(nb == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		return 0;
	}
#else /* !HAVE MREMAP */
	/* use munmap-mmap to simulate mremap */
	if(munmap(udb->base, udb->base_size) != 0) {
		log_msg(LOG_ERR, "munmap(%s) error %s",
			udb->fname, strerror(errno));
	}
	/* provide hint for new location */
	/* note the size_t casts must be there for portability, on some
	 * systems the layout of memory is otherwise broken. */
	nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
		(int)MAP_SHARED, (int)udb->fd, (off_t)0);
	/* retry the mmap without the base pointer hint in case of ENOMEM
	 * (FreeBSD8), the kernel can then try to mmap it at a different
	 * location where more memory is available */
	if(nb == MAP_FAILED && errno == ENOMEM) {
		nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
			(int)MAP_SHARED, (int)udb->fd, (off_t)0);
	}
	if(nb == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		udb->base = NULL;
		return 0;
	}
#endif /* HAVE MREMAP */
	if(nb != udb->base) {
		/* fix up realpointers in udb and alloc */
		/* but mremap may have been nice and not moved the base */
		udb->base = nb;
		udb->glob_data = (udb_glob_d*)((char*)nb+sizeof(uint64_t));
		/* use passed alloc pointer because the udb->alloc may not
		 * be initialized yet */
		alloc->disk = (udb_alloc_d*)((char*)udb->glob_data
			+sizeof(*udb->glob_data));
	}
	udb->base_size = nsize;
	return nb;
#else /* HAVE_MMAP */
	(void)udb; (void)alloc; (void)nsize;
	return NULL;
#endif /* HAVE_MMAP */
}

void
udb_base_remap_process(udb_base* udb)
{
	/* assume that fsize is still accessible */
	udb_base_remap(udb, udb->alloc, udb->glob_data->fsize);
}

/** grow file to specified size and re-mmap, return new base */
static void*
udb_base_grow_and_remap(udb_base* udb, uint64_t nsize)
{
	/* grow file by writing a single zero at that spot, the
	 * rest is filled in with zeroes. */
	uint8_t z = 0;
	ssize_t w;

	assert(nsize > 0);
	udb->glob_data->dirty_alloc = udb_dirty_fsize;
#ifdef HAVE_PWRITE
	if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) {
#else
	if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", udb->fname, strerror(errno));
		return 0;
	}
	if((w=write(udb->fd, &z, sizeof(z))) == -1) {
#endif
		log_msg(LOG_ERR, "grow(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		return 0;
	} else if(w != (ssize_t)sizeof(z)) {
		log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)",
			udb->fname, (unsigned)nsize);
		return 0;
	}
	udb->glob_data->fsize = nsize;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	return udb_base_remap(udb, udb->alloc, nsize);
}

int udb_exp_size(uint64_t a)
{
	/* find enclosing value such that 2**x >= a */
	int x = 0;
	uint64_t i = a;
	assert(a != 0);

	i --;
	/* could optimise this with uint8* access, depends on endianness */
	/* first whole bytes */
	while( (i&(~(uint64_t)0xff)) ) {
		i >>= 8;
		x += 8;
	}
	/* now details */
	while(i) {
		i >>= 1;
		x ++;
	}
	assert( x>=0 && x<=63);
	assert( ((uint64_t)1<<x) >= a);
	assert( x==0 || /* <<x-1 without negative number analyzer complaints: */ (((uint64_t)1<<x)>>1) < a);
	return x;
}
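
/* examples: udb_exp_size(1) == 0, udb_exp_size(4096) == 12,
 * udb_exp_size(4097) == 13: the smallest x with 2**x >= a. */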

int udb_exp_offset(uint64_t o)
{
	/* this means measuring the number of 0 bits on the right */
	/* so, if exp zero bits then (o&(2**x-1))==0 */
	int x = 0;
	uint64_t i = o;
	assert(o != 0);
	/* first whole bytes */
	while( (i&(uint64_t)0xff) == 0) {
		i >>= 8;
		x += 8;
	}
	/* now details */
	while( (i&(uint64_t)0x1) == 0) {
		i >>= 1;
		x ++;
	}
	assert( o % ((uint64_t)1<<x) == 0);
	assert( o % ((uint64_t)1<<(x+1)) != 0);
	return x;
}
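
/* examples: udb_exp_offset(0x3000) == 12, udb_exp_offset(0x100000) == 20:
 * the number of trailing zero bits of o. */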

void udb_alloc_init_new(udb_alloc_d* a)
{
	assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0);
	memset(a, 0, sizeof(*a));
	/* set new allocations after header, as if allocated in a sequence
	 * of minsize allocations */
	a->nextgrow = UDB_HEADER_SIZE;
}

/** fsck the file size, false if failed and file is useless */
static int
fsck_fsize(udb_base* udb, udb_alloc* alloc)
{
	off_t realsize;
	log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname);
	realsize = lseek(udb->fd, (off_t)0, SEEK_END);
	if(realsize == (off_t)-1) {
		log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno));
		return 0;
	}
	udb->glob_data->fsize = (uint64_t)realsize;
	if(!udb_base_remap(udb, alloc, (uint64_t)realsize))
		return 0;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname);
	udb_base_sync(udb, 1);
	return 1;
}

/** regenerate the freelist: add a new free chunk, return next todo */
static udb_void
regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen)
{
	udb_free_chunk_d* cp = UDB_FREE_CHUNK(c);
	uint64_t esz = (uint64_t)1<<exp;
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
		return 0;
	}
	cp->type = udb_chunk_type_free;
	cp->flags = 0;
	chunk_set_last(base, c, exp, (uint8_t)exp);
	cp->prev = 0;
	cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(cp->next)
		UDB_FREE_CHUNK(cp->next)->prev = c;
	regen->stat_free += esz;
	return c + esz;
}

/** regenerate xl chunk, return next todo */
static udb_void
regen_xl(void* base, udb_void c, udb_alloc_d* regen)
{
	udb_xl_chunk_d* cp = UDB_XL_CHUNK(c);
	uint64_t xlsz = cp->size;
	if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
		return 0;
	}
	if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
		return 0;
	}
	/* fixup end-size and end-expmarker */
	regen->stat_alloc += xlsz;
	return c + xlsz;
}

/** regenerate data chunk, return next todo */
static udb_void
regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen)
{
	uint64_t esz = (uint64_t)1<<exp;
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
		return 0;
	}
	chunk_set_last(base, c, exp, (uint8_t)exp);
	regen->stat_alloc += esz;
	return c + esz;
}

/** regenerate a relptr structure inside a data segment */
static void
regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg)
{
	udb_void* a = (udb_void*)arg;
	/* ignore 0 pointers */
	if(!rp->data)
		return;

	/* edit relptrs that point to oldmoved to point to newmoved. */
	if(rp->data == a[0])
		rp->data = a[1];

	/* regenerate relptr lists, add this item to the relptr list for
	 * the data that it points to */
	udb_rel_ptr_link(base, rp, rp->data);
}

/** regenerate the relptrs stored in this data segment */
static void
regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp,
	void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new)
{
	udb_void arg[2];
	arg[0] = rb_old; arg[1] = rb_new;
	/* walk through the structs here and put them on their respective
	 * relptr lists */
	(*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz,
		&regen_relptr_func, arg);

}

/** regenerate relptrlists in the file */
static void
regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc,
	udb_void rb_old, udb_void rb_new)
{
	udb_void at = alloc->udb->glob_data->hsize;
	/* clear all ptrlist start pointers in the file. */
	while(at < alloc->disk->nextgrow) {
		int exp = (int)UDB_CHUNK(at)->exp;
		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
		if(exp == UDB_EXP_XL) {
			UDB_XL_CHUNK(at)->ptrlist = 0;
			at += UDB_XL_CHUNK(at)->size;
		} else if(tp == udb_chunk_type_free) {
			at += (uint64_t)1<<exp;
		} else { /* data chunk */
			UDB_CHUNK(at)->ptrlist = 0;
			at += (uint64_t)1<<exp;
		}
	}
	/* walk through all relptr structs and put on the right list. */
	at = alloc->udb->glob_data->hsize;
	while(at < alloc->disk->nextgrow) {
		udb_chunk_d* atp = UDB_CHUNK(at);
		int exp = (int)atp->exp;
		udb_chunk_type tp = (udb_chunk_type)atp->type;
		uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size:
			(uint64_t)1<<exp);
		if(exp == UDB_EXP_XL) {
			assert(at != rb_old); /* should have been freed */
			regen_its_ptrs(base, udb, atp,
				((char*)atp)+sizeof(udb_xl_chunk_d),
				sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2,
				rb_old, rb_new);
			at += sz;
		} else if(tp == udb_chunk_type_free) {
			at += sz;
		} else { /* data chunk */
			assert(at != rb_old); /* should have been freed */
			regen_its_ptrs(base, udb, atp,
				((char*)atp)+sizeof(udb_chunk_d),
				sz-sizeof(udb_chunk_d)-1, rb_old, rb_new);
			at += sz;
		}
	}
}


/** mark free elements from former XL chunk space so later fixups pick that up */
static void
rb_mark_free_segs(void* base, udb_void s, uint64_t m)
{
	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
	assert(s >= UDB_ALLOC_CHUNK_SIZE);
	while(q >= s) {
		UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX;
		UDB_CHUNK(q)->type = udb_chunk_type_free;
		q -= UDB_ALLOC_CHUNK_SIZE;
	}
}


/** fsck rollback or rollforward XL move results */
static int
fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new,
	uint64_t rb_size, uint64_t rb_seg)
{

	if(rb_old <= rb_new)
		return 0; /* XL move one way */
	if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if(rb_new + rb_size <= rb_old) {
		/* not overlapping: resume copy */
		memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
		/* and free up old piece(s) */
		rb_mark_free_segs(base, rb_old, rb_size);
	} else {
		/* overlapping, see what segment we stopped at
		 * and continue there. */
		move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg);
		/* free up old piece(s); from the end of the moved segment,
		 * until the end of the old segment */
		rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)-
			(rb_new+rb_size));
	}
	/* do not call fix_ptrs, regenptrs does the job */
	return 1;
}

/** fsck rollback or rollforward move results */
static int
fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size,
	udb_void* make_free)
{
	if( (rb_size&(rb_size-1)) != 0)
		return 0; /* not powerof2 */
	if( (rb_old&(rb_size-1)) != 0)
		return 0; /* not aligned */
	if( (rb_new&(rb_size-1)) != 0)
		return 0; /* not aligned */
	/* resume copy */
	memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
	/* do not call fix_ptrs, regenptrs does the job */
	/* make sure udb_old is freed */
	*make_free = rb_old;
	return 1;
}

/** fsck the file and salvage, false if failed and file is useless */
static int
fsck_file(udb_base* udb, udb_alloc* alloc, int moved)
{
	void* base = udb->base;
	udb_alloc_d regen;
	udb_void at = udb->glob_data->hsize;
	udb_void rb_old = udb->glob_data->rb_old;
	udb_void rb_new = udb->glob_data->rb_new;
	udb_void rb_seg = udb->glob_data->rb_seg;
	udb_void make_free = 0;
	uint64_t rb_size = udb->glob_data->rb_size;
	log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname);
	/* walk through the file, use the exp values to see what can be
	 * salvaged */
	if(moved && rb_old && rb_new && rb_size) {
		if(rb_old+rb_size <= alloc->disk->nextgrow
			&& rb_new+rb_size <= alloc->disk->nextgrow) {
			/* we can use the move information to fix up the
			 * duplicate element (or partially moved element) */
			if(rb_size > 1024*1024) {
				/* XL chunk */
				if(!fsck_rb_xl(base, udb, rb_old, rb_new,
					rb_size, rb_seg))
					return 0;
			} else {
				if(!fsck_rb(base, rb_old, rb_new, rb_size,
					&make_free))
					return 0;
			}
		}
	}

	/* rebuild freelists */
	/* recalculate stats in alloc (except 'stat_data') */
	/* possibly new end 'nextgrow' value */
	memset(&regen, 0, sizeof(regen));
	regen.nextgrow = alloc->disk->nextgrow;
	while(at < regen.nextgrow) {
		/* figure out this chunk */
		int exp = (int)UDB_CHUNK(at)->exp;
		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
		/* consistency check possible here with end-exp */
		if(tp == udb_chunk_type_free || at == make_free) {
			at = regen_free(base, at, exp, &regen);
			if(!at) return 0;
		} else if(exp == UDB_EXP_XL) {
			/* allocated data of XL size */
			at = regen_xl(base, at, &regen);
			if(!at) return 0;
		} else if(exp >= UDB_ALLOC_CHUNK_MINEXP
			&& exp <= UDB_ALLOC_CHUNKS_MAX) {
			/* allocated data */
			at = regen_data(base, at, exp, &regen);
			if(!at) return 0;
		} else {
			/* garbage; this must be EOF then */
			regen.nextgrow = at;
			break;
		}
	}
	*alloc->disk = regen;

	/* rebuild relptr lists */
	regen_ptrlist(base, udb, alloc, rb_old, rb_new);

	log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)",
		udb->fname);
	udb->glob_data->rb_old = 0;
	udb->glob_data->rb_new = 0;
	udb->glob_data->rb_size = 0;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	udb_base_sync(udb, 1);
	return 1;
}


udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk)
{
	udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc));
	if(!alloc)
		return NULL;
	alloc->udb = udb;
	alloc->disk = disk;
	/* see if committed but uncompleted actions need to be done */
	/* preserves the alloc state */
	if(udb->glob_data->dirty_alloc != udb_dirty_clean) {
		if(udb->glob_data->dirty_alloc == udb_dirty_fsize) {
			if(fsck_fsize(udb, alloc))
				return alloc;
		} else if(udb->glob_data->dirty_alloc == udb_dirty_fl) {
			if(fsck_file(udb, alloc, 0))
				return alloc;
		} else if(udb->glob_data->dirty_alloc == udb_dirty_compact) {
			if(fsck_file(udb, alloc, 1))
				return alloc;
		}
		log_msg(LOG_ERR, "error: file allocation dirty (%d)",
			(int)udb->glob_data->dirty_alloc);
		free(alloc);
		return NULL;
	}
	return alloc;
}

void udb_alloc_delete(udb_alloc* alloc)
{
	if(!alloc) return;
	free(alloc);
}

/** unlink this element from its freelist */
static void
udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk);
	assert(chunk);
	/* chunk is a free chunk */
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp);
	/* and thus freelist not empty */
	assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]);
	/* unlink */
	if(fp->prev)
		UDB_FREE_CHUNK(fp->prev)->next = fp->next;
	else alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = fp->prev;
}

/** pop first element off freelist, list may not be empty */
static udb_void
udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp)
{
	udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
	if(fp->next) {
		UDB_FREE_CHUNK(fp->next)->prev = 0;
	}
	return f;
}

/** push new element onto freelist */
static void
udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	fp->exp = (uint8_t)exp;
	fp->type = udb_chunk_type_free;
	fp->flags = 0;
	fp->prev = 0;
	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = f;
	chunk_set_last(base, f, exp, (uint8_t)exp);
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
}

/** push new element onto freelist - do not initialize the elt */
static void
udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
	fp->prev = 0;
	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = f;
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
}

/** add free chunks at end until specified alignment occurs */
static void
grow_align(void* base, udb_alloc* alloc, uint64_t esz)
{
	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
		/* the nextgrow is not a whole multiple of esz. */
		/* grow a free chunk of max allowed size */
		int fexp = udb_exp_offset(alloc->disk->nextgrow);
		uint64_t fsz = (uint64_t)1<<fexp;
		udb_void f = alloc->disk->nextgrow;
		udb_void fn = alloc->disk->nextgrow+fsz;
		assert(fn <= alloc->udb->base_size);
		alloc->disk->stat_free += fsz;
		udb_alloc_push_fl(base, alloc, f, fexp);
		/* now increase nextgrow to commit that free chunk */
		alloc->disk->nextgrow = fn;
	}
}

/** append chunks at end of memory space to get size exp, return dataptr */
static udb_void
grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
{
	uint64_t esz = (uint64_t)1<<exp;
	udb_void ret;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	grow_align(base, alloc, esz);
	/* free chunks are grown, grow the one we want to use */
	ret = alloc->disk->nextgrow;
	/* take a new alloced chunk into use */
	UDB_CHUNK(ret)->exp = (uint8_t)exp;
	UDB_CHUNK(ret)->flags = 0;
	UDB_CHUNK(ret)->ptrlist = 0;
	UDB_CHUNK(ret)->type = udb_chunk_type_data;
	/* store last octet */
	chunk_set_last(base, ret, exp, (uint8_t)exp);
	/* update stats */
	alloc->disk->stat_alloc += esz;
	alloc->disk->stat_data += sz;
	/* now increase nextgrow to commit this newly allocated chunk */
	alloc->disk->nextgrow += esz;
	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	return ret + sizeof(udb_chunk_d); /* ptr to data */
}

/** calculate how much space is necessary to grow for this exp */
static uint64_t
grow_end_calc(udb_alloc* alloc, int exp)
{
	uint64_t sz = (uint64_t)1<<exp;
	uint64_t ng = alloc->disk->nextgrow;
	uint64_t res;
	/* if nextgrow is 2**expness, no extra growth needed, only size */
	if( (ng & (sz-1)) == 0) {
		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
		return ng+sz; /* must grow exactly 2**exp */
	}
	/* grow until 2**expness and then we need 2**exp as well */
	/* so, round ng down to whole sz (basically ng-ng%sz, or ng/sz*sz)
	 * and then add the sz twice (go up to whole sz, and to allocate) */
	res = (ng & ~(sz-1)) + 2*sz;
	return res;
}
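
/* worked example: with nextgrow 0x28000 and exp 16 (sz 0x10000), ng is not
 * a multiple of sz, so res = (0x28000 & ~0xffff) + 2*0x10000 = 0x40000:
 * grow up to the next 64k boundary, plus one 64k chunk to allocate. */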

/** see if we need to grow more than specified to enable sustained growth */
static uint64_t
grow_extra_check(udb_alloc* alloc, uint64_t ge)
{
	const uint64_t mb = 1024*1024;
	uint64_t bsz = alloc->udb->base_size;
	if(bsz <= mb) {
		/* below 1 Mb, double sizes for exponential growth */
		/* takes about 15 times to grow to 1Mb */
		if(ge < bsz*2)
			return bsz*2;
	} else {
		uint64_t gnow = ge - bsz;
		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
		 * in whole megabytes rounded up. */
		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
		if(gnow < want)
			return bsz + want;
	}
	return ge;
}
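
/* worked example: with base_size 16 Mb and ge asking for 16 Mb + 100 Kb,
 * want = (16 Mb / 8, in whole Mbs) + 1 Mb = 3 Mb, so the file grows to
 * 19 Mb instead, keeping regrowth infrequent. */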

/** see if free space is enough to warrant shrink (while file is open) */
static int
enough_free(udb_alloc* alloc)
{
	if(alloc->udb->base_size <= 2*1024*1024) {
		/* below 1 Mb, grown by doubling in size (so up to 2 Mb);
		 * do not shrink unless we can go to 1/3 of the size */
		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
			return 1;
	} else {
		/* grown 12.5%, shrink 25% if possible, at least one mb */
		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
			|| alloc->udb->base_size < 4*1024*1024))
			return 1;
	}
	return 0;
}

/** grow space for a chunk of 2**exp and return dataptr */
static udb_void
udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
{
	/* commit the grow action
	 * - the file grow only changes filesize, but not the nextgrow.
	 * - taking space after nextgrow into use (as free space),
	 *   is like free-ing a chunk (one at a time).
	 * - and the last chunk taken into use is like alloc.
	 */
	/* predict how much free space is needed for this */
	uint64_t grow_end = grow_end_calc(alloc, exp);
	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
	if(grow_end <= alloc->udb->base_size) {
		/* we can do this with the available space */
		return grow_chunks(base, alloc, sz, exp);
	}
	/* we have to grow the file, re-mmap */
	/* see if we need to grow a little more, to avoid endless grow
	 * efforts on adding data */
	grow_end = grow_extra_check(alloc, grow_end);
	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
		return 0; /* mmap or write failed (disk or mem full) */
	}
	/* we have enough space now */
	assert(grow_end <= alloc->udb->base_size);
	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
	return grow_chunks(base, alloc, sz, exp);
}

/** take XL allocation into use at end of file, return dataptr */
static udb_void
grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
{
	udb_void ret;
	udb_xl_chunk_d* p;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;

	/* align growth to whole mbs */
	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);

	/* grow XL segment */
	ret = alloc->disk->nextgrow;
	p = UDB_XL_CHUNK(ret);
	p->exp = UDB_EXP_XL;
	p->size = xlsz;
	p->flags = 0;
	p->ptrlist = 0;
	p->type = udb_chunk_type_data;

	/* also put size and marker at end for compaction */
	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;

	/* stats */
	alloc->disk->stat_data += sz;
	alloc->disk->stat_alloc += xlsz;
	/* now increase the nextgrow to commit this xl chunk */
	alloc->disk->nextgrow += xlsz;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
}

/** make space for XL allocation */
static udb_void
udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
{
	/* allocate whole mbs of space, at end of space */
	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
	assert(need >= asz);
	if(grow_end <= alloc->udb->base_size) {
		/* can do this in available space */
		return grow_xl(base, alloc, need, sz);
	}
	/* have to grow file and re-mmap */
	grow_end = grow_extra_check(alloc, grow_end);
	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
		return 0; /* mmap or write failed (disk or mem full) */
	}
	/* we have enough space now */
	assert(grow_end <= alloc->udb->base_size);
	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
	return grow_xl(base, alloc, need, sz);
}

/** divide big(2**e2) into pieces so 2**exp fits */
static udb_void
udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
	int exp)
{
	int e = e2;
	uint64_t sz = (uint64_t)1<<e2;
	assert(big && e2 > exp);
	/* so the returned piece to use is the first piece,
	 * offload the later half until it fits */
	do {
		sz >>= 1; /* divide size of big by two */
		e--; /* that means its exp is one smaller */
		udb_alloc_push_fl(base, alloc, big+sz, e);
	} while(e != exp);
	/* exit loop when last pushed is same size as what we want */
	return big;
}
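
/* worked example: subdividing a 2**14 chunk to obtain a 2**12 chunk pushes
 * a free 2**13 chunk at big+0x2000 and a free 2**12 chunk at big+0x1000,
 * and returns big itself as the 2**12 piece to use. */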

/** returns the exponent size of the chunk needed for data sz */
static int
udb_alloc_exp_needed(size_t sz)
{
	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
	int exp;
	if(asz > UDB_ALLOC_CHUNK_SIZE) {
		return UDB_EXP_XL;
	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
		return UDB_ALLOC_CHUNK_MINEXP;
	}
	exp = udb_exp_size(asz);
	assert(exp <= UDB_ALLOC_CHUNKS_MAX);
	return exp;
}
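
/* example (assuming a 16 octet udb_chunk_d header): a request for sz octets
 * must fit sz+16+1 octets, the header plus the trailing exp marker octet,
 * rounded up to the next power of two, but never below
 * UDB_ALLOC_CHUNK_MINSIZE and never above UDB_ALLOC_CHUNK_SIZE. */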

udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
{
	void* base = alloc->udb->base;
	/* calculate actual allocation size */
	int e2, exp = udb_alloc_exp_needed(sz);
	if(exp == UDB_EXP_XL)
		return udb_alloc_xl_space(base, alloc, sz);
	/* see if there is a free chunk of that size exactly */
	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
		/* snip from freelist, udb_chunk_d */
		udb_void ret;
		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
		ret = udb_alloc_pop_fl(base, alloc, exp);
		/* use it - size octets already OK */
		UDB_CHUNK(ret)->flags = 0;
		UDB_CHUNK(ret)->ptrlist = 0;
		UDB_CHUNK(ret)->type = udb_chunk_type_data;
		/* update stats */
		alloc->disk->stat_data += sz;
		alloc->disk->stat_alloc += (1<<exp);
		assert(alloc->disk->stat_free >= (1u<<exp));
		alloc->disk->stat_free -= (1<<exp);
		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
		return ret + sizeof(udb_chunk_d); /* ptr to data */
	}
	/* see if we can subdivide a larger chunk */
	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
			udb_void big, ret; /* udb_chunk_d */
			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
			big = udb_alloc_pop_fl(base, alloc, e2);
			/* push other parts onto freelists (needs inited) */
			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
			/* use final part (needs inited) */
			UDB_CHUNK(ret)->exp = (uint8_t)exp;
			/* if we stop here, the new exp makes a smaller free chunk */
			UDB_CHUNK(ret)->flags = 0;
			UDB_CHUNK(ret)->ptrlist = 0;
			/* set type to commit data chunk */
			UDB_CHUNK(ret)->type = udb_chunk_type_data;
			/* store last octet */
			chunk_set_last(base, ret, exp, (uint8_t)exp);
			/* update stats */
			alloc->disk->stat_data += sz;
			alloc->disk->stat_alloc += (1<<exp);
			assert(alloc->disk->stat_free >= (1u<<exp));
			alloc->disk->stat_free -= (1<<exp);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			return ret + sizeof(udb_chunk_d); /* ptr to data */
		}
	/* we need to grow an extra chunk */
	return udb_alloc_grow_space(base, alloc, sz, exp);
}

/** see if there is free space to allocate a chunk into */
static int
have_free_for(udb_alloc* alloc, int exp)
{
	int e2;
	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP])
		return exp;
	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
			return e2;
		}
	return 0;
}

/** fix relptr prev and next for moved relptr structures */
static void
chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg)
{
	udb_void* data = (udb_void*)arg;
	udb_void r;
	if(!rp->data)
		return;
	r = UDB_SYSTOREL(base, rp);
	if(rp->next)
		UDB_REL_PTR(rp->next)->prev = r;
	if(rp->prev)
		UDB_REL_PTR(rp->prev)->next = r;
	else {
		/* if this is a pointer to its own chunk, fix it up;
		 * the data ptr gets set by relptr_edit later. */
		if(rp->data == data[0])
			UDB_CHUNK(data[1])->ptrlist = r;
		else UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r;
	}
}

/** fix pointers from and to a moved chunk */
static void
chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data,
	uint64_t dsz, udb_void olddata)
{
	udb_void d[2];
	d[0] = olddata;
	d[1] = data;
	(*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data),
		dsz, &chunk_fix_ptr_each, d);
	udb_rel_ptr_edit(base, cp->ptrlist, data);
	udb_base_ram_ptr_edit(udb, olddata, data);
}

/** move an allocated chunk to use a free chunk */
static void
move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz,
	int e2)
{
	udb_void res = udb_alloc_pop_fl(base, alloc, e2);
	udb_chunk_d* rp;
	udb_chunk_d* fp;
	if(exp != e2) {
		/* it is bigger, subdivide it */
		res = udb_alloc_subdivide(base, alloc, res, e2, exp);
	}
	assert(res != f);
	/* setup rollback information */
	alloc->udb->glob_data->rb_old = f;
	alloc->udb->glob_data->rb_new = res;
	alloc->udb->glob_data->rb_size = esz;
	/* take the res, exp into use */
	rp = UDB_CHUNK(res);
	fp = UDB_CHUNK(f);
	/* copy over the data */
	memcpy(rp, fp, esz);
	/* adjust rel ptrs */
	chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d),
		esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d));

	/* do not freeup the fp; caller does that */
}

/** unlink several free elements to overwrite with xl chunk */
static void
free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m)
{
	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
	assert(s >= UDB_ALLOC_CHUNK_SIZE);
	while(q >= s) {
		assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX);
		assert(UDB_CHUNK(q)->type == udb_chunk_type_free);
		udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX);
		q -= UDB_ALLOC_CHUNK_SIZE;
	}
}

/** move an XL chunk, and keep track of segments for rollback */
static void
move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n,
	uint64_t sz, uint64_t startseg)
{
	udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
	udb_xl_chunk_d* np = UDB_XL_CHUNK(n);
	uint64_t amount = xl - n;
	assert(n < xl); /* move to compact */

	/* setup move rollback */
	udb->glob_data->rb_old = xl;
	udb->glob_data->rb_new = n;
	udb->glob_data->rb_size = sz;

	/* is it overlapping? */
	if(sz <= amount) {
		memcpy(np, xlp, sz);
	} else {
		/* move and commit per 1M segment to avoid data loss */
		uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE;
		for(seg = startseg; seg<maxseg; seg++) {
			udb->glob_data->rb_seg = seg;
			memcpy(np+seg*UDB_ALLOC_CHUNK_SIZE,
				xlp+seg*UDB_ALLOC_CHUNK_SIZE,
				UDB_ALLOC_CHUNK_SIZE);
		}

	}
}

/** move list of XL chunks to the front by the shift amount */
static void
move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz,
	uint64_t amount)
{
	udb_void xl = xl_start;
	assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
	assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
	assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
	while(xl < xl_start+xl_sz) {
		udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
		udb_void n = xl-amount;
		uint64_t sz = xlp->size;
		assert(xlp->exp == UDB_EXP_XL);
		move_xl_segment(base, alloc->udb, xl, n, sz, 0);
		chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n),
			n+sizeof(udb_xl_chunk_d),
			sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2,
			xl+sizeof(udb_xl_chunk_d));
		/* advance to the next XL chunk in the list; without this
		 * the loop would not terminate */
		xl += sz;
	}
	alloc->disk->stat_free -= amount;
	alloc->disk->nextgrow -= amount;
	alloc->udb->glob_data->rb_old = 0;
	alloc->udb->glob_data->rb_new = 0;
	alloc->udb->glob_data->rb_size = 0;
}

/** see if free chunk can coagulate with another chunk, return other chunk */
static udb_void
coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp,
	uint64_t esz)
{
	udb_void other = f^esz;
	if(exp == UDB_ALLOC_CHUNKS_MAX)
		return 0; /* no further merges */
	if(other >= alloc->udb->base_size)
		return 0; /* not allocated */
	if(other >= alloc->disk->nextgrow)
		return 0; /* not in use */
	if(other < alloc->udb->glob_data->hsize)
		return 0; /* cannot merge with header */
	/* the header is also protected by the special exp marker */
	/* see if the other chunk is a free chunk */

	/* check closest marker to avoid large memory churn */
	/* and also it makes XL allocations and header special markers work */
	if(f > other) {
		assert(f > 1); /* this is certain because of header */
		if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) {
			/* can do it if the other part is a free chunk */
			assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp);
			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
				return other;
		}
	} else {
		if(UDB_CHUNK(other)->exp == (uint8_t)exp) {
			/* can do it if the other part is a free chunk */
			assert(chunk_get_last(base, other, exp)==(uint8_t)exp);
			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
				return other;
		}
	}
	return 0;
}
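
/* the buddy address: other = f^esz flips the single bit that distinguishes
 * the two halves of the enclosing 2**(exp+1) block, e.g. a free 0x1000-sized
 * chunk at 0x3000 has its buddy at 0x2000, and the one at 0x2000 has its
 * buddy at 0x3000; only equal-sized, equally-aligned neighbours merge. */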
1537
1538 /** coagulate and then add new free segment, return final free segment */
1539 static udb_void
1540 coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp,
1541 uint64_t esz)
1542 {
1543 /* new free chunk here, attempt coagulate */
1544 udb_void other;
1545 while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) {
1546 /* unlink that other chunk */
1547 udb_alloc_unlink_fl(base, alloc, other, exp);
1548 /* merge up */
1549 if(other < last)
1550 last = other;
1551 exp++;
1552 esz <<= 1;
1553 }
1554 /* free the final segment */
1555 udb_alloc_push_fl(base, alloc, last, exp);
1556 return last;
1557 }
1558
1559 /** attempt to compact the data and move free space to the end */
1560 int
1561 udb_alloc_compact(void* base, udb_alloc* alloc)
1562 {
1563 udb_void last;
1564 int exp, e2;
1565 uint64_t esz;
1566 uint64_t at = alloc->disk->nextgrow;
1567 udb_void xl_start = 0;
1568 uint64_t xl_sz = 0;
1569 if(alloc->udb->inhibit_compact)
1570 return 1;
1571 alloc->udb->useful_compact = 0;
1572 while(at > alloc->udb->glob_data->hsize) {
1573 /* grab last entry */
1574 exp = (int)*((uint8_t*)UDB_REL(base, at-1));
1575 if(exp == UDB_EXP_XL) {
1576 /* for XL chunks:
1577 * - inspect the size of the XLchunklist at end
1578 * - attempt to compact in front of of XLchunklist
1579 */
1580 uint64_t xlsz = *((uint64_t*)UDB_REL(base,
1581 at-sizeof(uint64_t)*2));
1582 udb_void xl = at-xlsz;
1583 #ifndef NDEBUG
1584 udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1585 assert(xlp->exp == UDB_EXP_XL);
1586 assert(xlp->type != udb_chunk_type_free);
1587 #endif
1588 /* got thesegment add to the xl chunk list */
1589 if(xl_start != 0 && xl+xlsz != xl_start) {
1590 /* nonadjoining XL part, but they are aligned,
1591 * so the space in between is whole Mbs,
1592 * shift the later part(s) and continue */
1593 uint64_t m = xl_start - (xl+xlsz);
1594 assert(xl_start > xl+xlsz);
1595 alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1596 free_xl_space(base, alloc, xl+xlsz, m);
1597 move_xl_list(base, alloc, xl_start, xl_sz, m);
1598 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1599 }
1600 xl_start = xl;
1601 xl_sz += xlsz;
1602 at = xl;
1603 continue;
1604 /* end of XL if */
1605 } else if(exp < UDB_ALLOC_CHUNK_MINEXP
1606 || exp > UDB_ALLOC_CHUNKS_MAX)
1607 break; /* special chunk or garbage */
1608 esz = (uint64_t)1<<exp;
1609 last = at - esz;
1610 assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
1611 if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
1612 /* if xlstart continue looking to move stuff, but do
1613 * not unlink this free segment */
1614 if(!xl_start) {
1615 /* it is a free chunk, remove it */
1616 alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1617 udb_alloc_unlink_fl(base, alloc, last, exp);
1618 alloc->disk->stat_free -= esz;
1619 alloc->disk->nextgrow = last;
1620 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1621 /* and continue at this point */
1622 }
1623 at = last;
1624 } else if( (e2=have_free_for(alloc, exp)) ) {
1625 /* last entry can be allocated in free chunks
1626 * move it to its new position, adjust rel_ptrs */
1627 alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1628 move_chunk(base, alloc, last, exp, esz, e2);
1629 if(xl_start) {
1630 last = coagulate_and_push(base, alloc,
1631 last, exp, esz);
1632 } else {
1633 /* shorten usage */
1634 alloc->disk->stat_free -= esz;
1635 alloc->disk->nextgrow = last;
1636 }
1637 alloc->udb->glob_data->rb_old = 0;
1638 alloc->udb->glob_data->rb_new = 0;
1639 alloc->udb->glob_data->rb_size = 0;
1640 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1641 /* and continue in front of it */
1642 at = last;
1643 } else {
1644 /* cannot compact this block, stop compacting */
1645 break;
1646 }
1647 /* if that worked, repeat it */
1648 }
1649 /* if we passed xl chunks, see if XL-chunklist can move */
1650 if(xl_start) {
1651 /* calculate free space in front of the XLchunklist. */
1652 /* has to be whole mbs of free space */
1653 /* if so, we can move the XL chunks. Move them all back
1654 * by the new free space. */
1655 /* this compacts very well, but the XL chunks can be moved
1656 * multiple times; worst case for every mb freed a huge sized
1657 * xlchunklist gets moved. */
1658 /* free space must be, since aligned and coagulated, in
1659 * chunks of a whole MB */
1660 udb_void at = xl_start;
1661 uint64_t m = 0;
1662 while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
1663 udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
1664 if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
1665 break;
1666 assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
1667 m += UDB_ALLOC_CHUNK_SIZE;
1668 at = chunk;
1669 }
1670 if(m != 0) {
1671 assert(at+m == xl_start);
1672 alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1673 free_xl_space(base, alloc, at, m);
1674 move_xl_list(base, alloc, xl_start, xl_sz, m);
1675 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1676 }
1677 }
1678
1679 /* if enough free, shrink the file; re-mmap */
1680 if(enough_free(alloc)) {
1681 uint64_t nsize = alloc->disk->nextgrow;
1682 udb_base_shrink(alloc->udb, nsize);
1683 if(!udb_base_remap(alloc->udb, alloc, nsize))
1684 return 0;
1685 }
1686 return 1;
1687 }
1688
1689 int
1690 udb_compact(udb_base* udb)
1691 {
1692 if(!udb) return 1;
1693 if(!udb->useful_compact) return 1;
1694 DEBUG(DEBUG_DBACCESS, 1, (LOG_INFO, "Compacting database..."));
1695 return udb_alloc_compact(udb->base, udb->alloc);
1696 }
1697
1698 void udb_compact_inhibited(udb_base* udb, int inhibit)
1699 {
1700 if(!udb) return;
1701 udb->inhibit_compact = inhibit;
1702 }

#ifdef UDB_CHECK
/** check that rptrs are really zero before free */
void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
{
	(void)base;
	(void)arg;
	assert(p->data == 0);
}
#endif /* UDB_CHECK */

/** free XL chunk as multiples of CHUNK_SIZE free segments */
static void
udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
	size_t sz)
{
	uint64_t xlsz = fp->size;
	uint64_t c;
	/* lightweight check for buffer overflow in xl data */
	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole MBs */
	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
		&udb_check_rptr_zero, NULL);
#endif
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	/* update stats */
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_alloc -= xlsz;
	alloc->disk->stat_free += xlsz;
	/* walk in reverse, so the front blocks go first on the list */
	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE */
	assert(f >= UDB_ALLOC_CHUNK_SIZE);
	while(c >= f) {
		/* free a block of CHUNK_SIZE (1 MB) */
		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
		c -= UDB_ALLOC_CHUNK_SIZE;
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
}

int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
{
	void* base;
	/* lookup chunk ptr */
	udb_void f;
	udb_chunk_d* fp;
	uint64_t esz;
	int exp;
	udb_void other;
	int coagulated = 0;
	if(!r)
		return 1; /* free(NULL) does nothing */

	/* lookup size of chunk */
	base = alloc->udb->base;
	/* fails for XL blocks */
	f = chunk_from_dataptr(r);
	fp = UDB_CHUNK(f);
	assert(fp->type != udb_chunk_type_free);

	/* see if it has a ptrlist, if so: trouble, the list is not properly
	 * cleaned up. (although you can imagine a wholesale delete where
	 * it does not matter) */
	assert(fp->ptrlist == 0);

	/* set ptrlist to 0 to stop relptr from using it, robustness. */
	fp->ptrlist = 0;

	if(fp->exp == UDB_EXP_XL) {
		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
		/* compact */
		if(alloc->udb->inhibit_compact) {
			alloc->udb->useful_compact = 1;
			return 1;
		}
		return udb_alloc_compact(base, alloc);
	}
	/* it is a regular chunk of 2**exp size */
	exp = (int)fp->exp;
	esz = (uint64_t)1<<exp;
	/* light check for e.g. buffer overflow of the data */
	assert(sz < esz);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
#endif

	/* update the stats */
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_free += esz;
	alloc->disk->stat_alloc -= esz;

	/* if it can be merged with other free chunks, do so */
	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
		coagulated = 1;
		/* unlink that other chunk and expand it (it has same size) */
		udb_alloc_unlink_fl(base, alloc, other, exp);
		/* merge up */
		if(other < f)
			f = other;
		exp++;
		esz <<= 1;
	}
	if(coagulated) {
		/* put big free chunk into freelist, and init it */
		udb_alloc_push_fl(base, alloc, f, exp);
	} else {
		/* we do not need to touch the last-exp-byte, which may save
		 * a reference to that page of memory */
		fp->type = udb_chunk_type_free;
		fp->flags = 0;
		udb_alloc_push_fl_noinit(base, alloc, f, exp);
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	/* compact */
	if(alloc->udb->inhibit_compact) {
		alloc->udb->useful_compact = 1;
		return 1;
	}
	return udb_alloc_compact(base, alloc);
}
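
/*
 * Illustrative sketch of the normal alloc/free pairing, assuming
 * udb_alloc_space() as used elsewhere in this file; the 16-byte payload is
 * made up.  The same size must be passed back to udb_alloc_free().
 *
 *	udb_void d = udb_alloc_space(alloc, 16);
 *	if(d) {
 *		memcpy(UDB_REL(alloc->udb->base, d), "example payload", 16);
 *		...
 *		(void)udb_alloc_free(alloc, d, 16);
 *	}
 */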

udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
{
	/* could be faster maybe, if grown? */
	udb_void r = udb_alloc_space(alloc, sz);
	if(!r) return r;
	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
	return r;
}
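
/*
 * Illustrative sketch: copy an existing in-memory struct into a fresh chunk.
 * The struct type is hypothetical; the returned offset can be resolved with
 * UDB_REL(alloc->udb->base, r).
 *
 *	struct mydata { uint32_t a, b; } v = { 1, 2 };
 *	udb_void r = udb_alloc_init(alloc, &v, sizeof(v));
 *	if(!r)
 *		log_msg(LOG_ERR, "udb_alloc_init failed");
 */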

udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
{
	void* base = alloc->udb->base;
	udb_void c, n, newd;
	udb_chunk_d* cp, *np;
	uint64_t avail;
	uint8_t cp_type;
	/* emulate some posix realloc stuff */
	if(r == 0)
		return udb_alloc_space(alloc, sz);
	if(sz == 0) {
		if(!udb_alloc_free(alloc, r, osz))
			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
		return 0;
	}
	c = chunk_from_dataptr(r);
	cp = UDB_CHUNK(c);
	cp_type = cp->type;
	if(cp->exp == UDB_EXP_XL) {
		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
			- sizeof(uint64_t)*2;
	} else {
		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
	}
	if(sz <= avail)
		return r;
	/* reallocate it, and copy */
	newd = udb_alloc_space(alloc, sz);
	if(!newd) return 0;
	/* re-base after alloc, since re-mmap may have happened */
	base = alloc->udb->base;
	cp = NULL; /* may be invalid now, robustness */
	n = chunk_from_dataptr(newd);
	np = UDB_CHUNK(n);
	np->type = cp_type;
	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
	/* fixup ptrs */
	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);

	if(!udb_alloc_free(alloc, r, osz))
		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
	return newd;
}
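
/*
 * Illustrative sketch: grow a chunk from 32 to 200 bytes (sizes made up).
 * As with realloc(), the returned offset may differ from the old one; on
 * failure 0 is returned and the old chunk is left untouched.  Relptrs into
 * the chunk are fixed up by chunk_fix_ptrs() above.
 *
 *	udb_void nd = udb_alloc_realloc(alloc, d, 32, 200);
 *	if(!nd)
 *		log_msg(LOG_ERR, "realloc failed, old offset still valid");
 *	else	d = nd;
 */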

int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
{
	const uint64_t mb = 1024*1024;
	int exp = udb_alloc_exp_needed(sz);
	uint64_t esz;
	uint64_t want;
	if(exp == UDB_EXP_XL)
		esz = (sz&(mb-1))+mb;
	else esz = (uint64_t)1<<exp;
	/* we need grow_end_calc to take into account alignment */
	want = grow_end_calc(alloc, exp) + esz*(num-1);
	assert(want >= alloc->udb->base_size);
	if(!udb_base_grow_and_remap(alloc->udb, want)) {
		log_msg(LOG_ERR, "failed to grow the specified amount");
		return 0;
	}
	return 1;
}
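
/*
 * Illustrative sketch: reserve room for a batch of allocations up front, so
 * that the following udb_alloc_space() calls do not each have to grow and
 * re-mmap the file.  The element type and count are made up.
 *
 *	if(!udb_alloc_grow(alloc, sizeof(struct mydata), 1000))
 *		log_msg(LOG_ERR, "pre-grow of udb file failed");
 */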

void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
{
	void* base = alloc->udb->base;
	udb_void f = chunk_from_dataptr(r);
	udb_chunk_d* fp = UDB_CHUNK(f);
	/* not the 'free' type, that must be set by allocation routines */
	assert(fp->type != udb_chunk_type_free);
	assert(tp != udb_chunk_type_free);
	fp->type = tp;
}

int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
{
	/* pointers are not valid before the header-size or after the
	 * used-region of the mmap */
	return ( (to+destsize) <= udb->base_size &&
		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
		(to+destsize) <= udb->alloc->disk->nextgrow);
}

int udb_valid_dataptr(udb_base* udb, udb_void to)
{
	void* base = udb->base;
	udb_void ch;
	int exp;
	uint64_t esz;
	/* our data chunks are aligned and at least 8 bytes */
	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
		return 0;
	/* get the chunk pointer */
	ch = chunk_from_dataptr(to);
	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
		return 0;
	/* check its size */
	exp = UDB_CHUNK(ch)->exp;
	if(exp == UDB_EXP_XL) {
		/* check XL chunk */
		uint64_t xlsz;
		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
			return 0;
		xlsz = UDB_XL_CHUNK(ch)->size;
		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
			return 0;
		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
			return 0;
		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
			!= xlsz)
			return 0;
		return 1;
	}
	/* check if regular chunk has matching end byte */
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
		return 0; /* cannot be a valid chunk */
	esz = 1<<exp;
	if(!udb_valid_offset(udb, ch+esz-1, 1))
		return 0;
	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
		return 0;
	return 1;
}

int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
{
	void* base = udb->base;
	udb_void p;
	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
		return 0;
	if(!udb_valid_dataptr(udb, to))
		return 0;
	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
	while(p) {
		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
			return 0;
		if(p == rptr)
			return 1;
		p = UDB_REL_PTR(p)->next;
	}
	return 0;
}
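
/*
 * Illustrative sketch: validate a relptr read from the file before trusting
 * it.  rptr_off (the file offset of the udb_rel_ptr itself) is a made-up
 * variable name.
 *
 *	udb_rel_ptr* rp = (udb_rel_ptr*)UDB_REL(udb->base, rptr_off);
 *	if(rp->data && !udb_valid_rptr(udb, rptr_off, rp->data))
 *		log_msg(LOG_ERR, "corrupt relptr in database");
 */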

void udb_rel_ptr_init(udb_rel_ptr* ptr)
{
	memset(ptr, 0, sizeof(*ptr));
}

void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
{
	if(!ptr->data)
		return;
	if(ptr->prev) {
		UDB_REL_PTR(ptr->prev)->next = ptr->next;
	} else {
		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
	}
	if(ptr->next) {
		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
	}
}

void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
{
	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
	ptr->prev = 0;
	ptr->next = chunk->ptrlist;
	if(ptr->next)
		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
	ptr->data = to;
}

void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
{
	assert(to == 0 || to > 64);
	udb_rel_ptr_unlink(base, ptr);
	if(to)
		udb_rel_ptr_link(base, ptr, to);
	else ptr->data = to;
}
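
/*
 * Illustrative sketch: an on-disk struct that contains a udb_rel_ptr uses
 * udb_rel_ptr_set() to point it at another chunk; that keeps the target's
 * ptrlist up to date so the pointer can be rewritten if the target moves.
 * struct entry_d and the offsets are hypothetical.
 *
 *	struct entry_d { udb_rel_ptr next; uint64_t value; };
 *	struct entry_d* e = (struct entry_d*)UDB_REL(base, e_off);
 *	udb_rel_ptr_init(&e->next);
 *	udb_rel_ptr_set(base, &e->next, other_data_off);
 *	...
 *	udb_rel_ptr_set(base, &e->next, 0);   (unlink before freeing)
 */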

void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
{
	udb_void p = list;
	while(p) {
		UDB_REL_PTR(p)->data = to;
		p = UDB_REL_PTR(p)->next;
	}
}

#ifdef UDB_CHECK
/** check that all pointers are validly chained */
static void
udb_check_ptrs_valid(udb_base* udb)
{
	size_t i;
	udb_ptr* p, *prev;
	for(i=0; i<udb->ram_size; i++) {
		prev = NULL;
		for(p=udb->ram_hash[i]; p; p=p->next) {
			assert(p->prev == prev);
			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
				== i);
			assert(p->base == &udb->base);
			prev = p;
		}
	}
}
#endif /* UDB_CHECK */

void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
{
#ifdef UDB_CHECK
	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
#endif
	memset(ptr, 0, sizeof(*ptr));
	ptr->base = &udb->base;
}

void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
{
	assert(newval == 0 || newval > 64);
	if(ptr->data)
		udb_base_unlink_ptr(udb, ptr);
	ptr->data = newval;
	if(newval)
		udb_base_link_ptr(udb, ptr);
}
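
/*
 * Illustrative sketch: a udb_ptr is an in-RAM handle to an on-disk chunk; it
 * is kept in the ram_hash so the library can adjust it when chunks move or
 * the file is re-mmapped.  some_data_offset is made up.
 *
 *	udb_ptr p;
 *	udb_ptr_init(&p, udb);
 *	udb_ptr_set(&p, udb, some_data_offset);
 *	... access the data via UDB_REL(*p.base, p.data) ...
 *	udb_ptr_set(&p, udb, 0);   (unlink the handle when done)
 */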

int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
	size_t sz)
{
	udb_void r;
	r = udb_alloc_space(udb->alloc, sz);
	if(!r) return 0;
	udb_alloc_set_type(udb->alloc, r, type);
	udb_ptr_init(ptr, udb);
	udb_ptr_set(ptr, udb, r);
	return 1;
}

void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
{
	if(ptr->data) {
		udb_void d = ptr->data;
		udb_ptr_set(ptr, udb, 0);
		udb_alloc_free(udb->alloc, d, sz);
	}
}
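
/*
 * Illustrative sketch: allocate a typed chunk and track it with a udb_ptr in
 * one call, then release it through the same handle.  The type and size are
 * made up; udb_chunk_type_internal is only used here as an example value.
 *
 *	udb_ptr p;
 *	if(udb_ptr_alloc_space(&p, udb, udb_chunk_type_internal, 64)) {
 *		memset(UDB_REL(*p.base, p.data), 0, 64);
 *		...
 *		udb_ptr_free_space(&p, udb, 64);
 *	}
 */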

udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
{
	udb_void f;
	if(!ptr || ptr->data == 0)
		return udb_chunk_type_internal; /* something bad */
	f = chunk_from_dataptr(ptr->data);
	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
}
