1 /* udb.c - u(micro) data base.
2  * By W.C.A. Wijngaards
3  * Copyright 2010, NLnet Labs.
4  * BSD, see LICENSE.
5  */
6 #include "config.h"
7 #include "udb.h"
8 #include <string.h>
9 #include <errno.h>
10 #include <stdio.h>
11 #include <unistd.h>
12 #include <assert.h>
13 #include "lookup3.h"
14 #include "util.h"
15 
16 /* mmap and friends */
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 
22 /* portable definitions for systems that lack them: MAP_FAILED is (void*)-1 and the msync flag becomes a harmless zero */
23 #ifndef MAP_FAILED
24 #define MAP_FAILED ((void*)-1)
25 #endif
26 #ifndef MS_SYNC
27 #define MS_SYNC 0
28 #endif
29 
30 /** move and fixup xl segment */
31 static void move_xl_segment(void* base, udb_base* udb, udb_void xl,
32 	udb_void n, uint64_t sz, uint64_t startseg);
33 /** attempt to compact the data and move free space to the end */
34 static int udb_alloc_compact(void* base, udb_alloc* alloc);
35 
36 /** convert pointer to the data part to a pointer to the base of the chunk */
37 static udb_void
38 chunk_from_dataptr(udb_void data)
39 {
40 	/* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and
41 	 * that xl chunks are aligned on UDB_ALLOC_CHUNK_SIZE boundaries. */
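	/* for example, an XL chunk at offset 0x300000 has its data at
	 * 0x300000+sizeof(udb_xl_chunk_d); subtracting sizeof(udb_xl_chunk_d)
	 * again lands back on the UDB_ALLOC_CHUNK_SIZE-aligned chunk start,
	 * so the alignment test below identifies it as XL. */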
42 	udb_void xl = data - sizeof(udb_xl_chunk_d);
43 	if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0)
44 		return xl;
45 	return data - sizeof(udb_chunk_d);
46 }
47 
48 udb_void chunk_from_dataptr_ext(udb_void data) {
49 	return chunk_from_dataptr(data);
50 }
51 
52 #ifndef NDEBUG
53 /** read last octet from a chunk */
54 static uint8_t
55 chunk_get_last(void* base, udb_void chunk, int exp)
56 {
57 	return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1));
58 }
59 #endif
60 
61 /** write last octet of a chunk */
62 static void
63 chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value)
64 {
65 	assert(exp >= 0 && exp <= 63);
66 	*((uint8_t*)UDB_REL(base, chunk+((uint64_t)1<<exp)-1)) = value;
67 }
68 
69 /** create udb_base from a file descriptor (must be at start of file) */
70 udb_base*
71 udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc,
72 	void* arg)
73 {
74 	uint64_t m, fsz;
75 	udb_glob_d g;
76 	ssize_t r;
77 	udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb));
78 	if(!udb) {
79 		log_msg(LOG_ERR, "out of memory");
80 		close(fd);
81 		return NULL;
82 	}
83 	udb->fname = strdup(fname);
84 	if(!udb->fname) {
85 		log_msg(LOG_ERR, "out of memory");
86 		free(udb);
87 		close(fd);
88 		return NULL;
89 	}
90 	udb->walkfunc = walkfunc;
91 	udb->walkarg = arg;
92 	udb->fd = fd;
93 	udb->ram_size = 1024;
94 	udb->ram_mask = (int)udb->ram_size - 1;
95 	udb->ram_hash = (udb_ptr**)xalloc_array_zero(sizeof(udb_ptr*),
96 		udb->ram_size);
97 	if(!udb->ram_hash) {
98 		free(udb->fname);
99 		free(udb);
100 		log_msg(LOG_ERR, "out of memory");
101 		close(fd);
102 		return NULL;
103 	}
104 
105 	/* read magic */
106 	if((r=read(fd, &m, sizeof(m))) == -1) {
107 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
108 		goto fail;
109 	} else if(r != (ssize_t)sizeof(m)) {
110 		log_msg(LOG_ERR, "%s: file too short", fname);
111 		goto fail;
112 	}
113 	/* TODO: handle a file written with different endianness than the host; the magic number would reveal it */
114 	if(m != UDB_MAGIC) {
115 		log_msg(LOG_ERR, "%s: wrong type of file", fname);
116 		goto fail;
117 	}
118 	/* read header */
119 	if((r=read(fd, &g, sizeof(g))) == -1) {
120 		log_msg(LOG_ERR, "%s: %s\n", fname, strerror(errno));
121 		goto fail;
122 	} else if(r != (ssize_t)sizeof(g)) {
123 		log_msg(LOG_ERR, "%s: file too short", fname);
124 		goto fail;
125 	}
126 	if(g.version != 0) {
127 		log_msg(LOG_ERR, "%s: unknown file version %d", fname,
128 			(int)g.version);
129 		goto fail;
130 	}
131 	if(g.hsize < UDB_HEADER_SIZE) {
132 		log_msg(LOG_ERR, "%s: header size too small %d", fname,
133 			(int)g.hsize);
134 		goto fail;
135 	}
136 	if(g.hsize > UDB_HEADER_SIZE) {
137 		log_msg(LOG_WARNING, "%s: header size too large %d", fname,
138 			(int)g.hsize);
139 		goto fail;
140 	}
141 	if(g.clean_close != 1) {
142 		log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname,
143 			(int)g.clean_close);
144 		goto fail;
145 	}
146 	if(g.dirty_alloc != 0) {
147 		log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname,
148 			(int)g.dirty_alloc);
149 		goto fail;
150 	}
151 
152 	/* check file size correctly written, for 4.0.2 nsd.db failure */
153 	fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END);
154 	(void)lseek(fd, (off_t)0, SEEK_SET);
155 	if(g.fsize != fsz) {
156 		log_msg(LOG_WARNING, "%s: file size %llu but mmap header "
157 			"has size %llu", fname, (unsigned long long)fsz,
158 			(unsigned long long)g.fsize);
159 		goto fail;
160 	}
161 
162 	/* mmap it */
163 	if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) {
164 		log_msg(LOG_ERR, "%s: file too short", fname);
165 		goto fail;
166 	}
167 	if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ {
168 		log_msg(LOG_WARNING, "%s: file size too large %llu",
169 			fname, (unsigned long long)g.fsize);
170 		goto fail;
171 	}
172 	udb->base_size = (size_t)g.fsize;
173 #ifdef HAVE_MMAP
174 	/* note the size_t casts must be there for portability, on some
175 	 * systems the layout of memory is otherwise broken. */
176 	udb->base = mmap(NULL, (size_t)udb->base_size,
177 		(int)PROT_READ|PROT_WRITE, (int)MAP_SHARED,
178 		(int)udb->fd, (off_t)0);
179 #else
180 	udb->base = MAP_FAILED; errno = ENOSYS;
181 #endif
182 	if(udb->base == MAP_FAILED) {
183 		udb->base = NULL;
184 		log_msg(LOG_ERR, "mmap(size %u) error: %s",
185 			(unsigned)udb->base_size, strerror(errno));
186 	fail:
187 		close(fd);
188 		free(udb->fname);
189 		free(udb->ram_hash);
190 		free(udb);
191 		return NULL;
192 	}
193 
194 	/* init completion */
195 	udb->glob_data = (udb_glob_d*)((char*)udb->base+sizeof(uint64_t));
196 	r = 0;
197 	/* should not be dirty; a dirty file was rejected (goto fail) above, but check defensively */
198 	if(udb->glob_data->dirty_alloc != udb_dirty_clean)
199 		r = 1;
200 	udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)(
201 		(char*)udb->glob_data+sizeof(*udb->glob_data)));
202 	if(!udb->alloc) {
203 		log_msg(LOG_ERR, "out of memory");
204 		udb_base_free(udb);
205 		return NULL;
206 	}
207 	if(r) {
208 		/* and compact now, or resume compacting */
209 		udb_alloc_compact(udb, udb->alloc);
210 		udb_base_sync(udb, 1);
211 	}
212 	udb->glob_data->clean_close = 0;
213 
214 	return udb;
215 }
216 
217 udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc,
218 	void* arg)
219 {
220 	int fd = open(fname, O_RDWR);
221 	if(fd == -1) {
222 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
223 		return NULL;
224 	}
225 	return udb_base_create_fd(fname, fd, walkfunc, arg);
226 }
227 
228 /** init new udb_global structure */
229 static void udb_glob_init_new(udb_glob_d* g)
230 {
231 	memset(g, 0, sizeof(*g));
232 	g->hsize = UDB_HEADER_SIZE;
233 	g->fsize = UDB_HEADER_SIZE;
234 }
235 
236 /** write data to file and check result */
237 static int
238 write_fdata(const char* fname, int fd, void* data, size_t len)
239 {
240 	ssize_t w;
241 	if((w=write(fd, data, len)) == -1) {
242 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
243 		close(fd);
244 		return 0;
245 	} else if(w != (ssize_t)len) {
246 		log_msg(LOG_ERR, "%s: short write (disk full?)", fname);
247 		close(fd);
248 		return 0;
249 	}
250 	return 1;
251 }
252 
253 udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc,
254 	void* arg)
255 {
256 	uint64_t m;
257 	udb_glob_d g;
258 	udb_alloc_d a;
259 	uint64_t endsize = UDB_HEADER_SIZE;
260 	uint64_t endexp = 0;
261 	int fd = open(fname, O_CREAT|O_RDWR, 0600);
262 	if(fd == -1) {
263 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
264 		return NULL;
265 	}
266 	m = UDB_MAGIC;
267 	udb_glob_init_new(&g);
268 	udb_alloc_init_new(&a);
269 	g.clean_close = 1;
270 
271 	/* write new data to file (closes fd on error) */
272 	if(!write_fdata(fname, fd, &m, sizeof(m)))
273 		return NULL;
274 	if(!write_fdata(fname, fd, &g, sizeof(g)))
275 		return NULL;
276 	if(!write_fdata(fname, fd, &a, sizeof(a)))
277 		return NULL;
278 	if(!write_fdata(fname, fd, &endsize, sizeof(endsize)))
279 		return NULL;
280 	if(!write_fdata(fname, fd, &endexp, sizeof(endexp)))
281 		return NULL;
282 	/* rewind to start */
283 	if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
284 		log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno));
285 		close(fd);
286 		return NULL;
287 	}
288 	/* truncate to the right size */
289 	if(ftruncate(fd, (off_t)g.fsize) < 0) {
290 		log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname,
291 			(int)g.fsize, strerror(errno));
292 		close(fd);
293 		return NULL;
294 	}
295 	return udb_base_create_fd(fname, fd, walkfunc, arg);
296 }
297 
298 /** shrink the udb base if it has unused space at the end */
299 static void
300 udb_base_shrink(udb_base* udb, uint64_t nsize)
301 {
302 	udb->glob_data->dirty_alloc = udb_dirty_fsize;
303 	udb->glob_data->fsize = nsize;
304 	/* sync, does not *seem* to be required on Linux, but it is
305 	   certainly required on OpenBSD.  Otherwise changed data is lost. */
306 #ifdef HAVE_MMAP
307 	msync(udb->base, udb->base_size, MS_ASYNC);
308 #endif
309 	if(ftruncate(udb->fd, (off_t)nsize) != 0) {
310 		log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname,
311 			(unsigned)nsize, strerror(errno));
312 	}
313 	udb->glob_data->dirty_alloc = udb_dirty_clean;
314 }
315 
316 void udb_base_close(udb_base* udb)
317 {
318 	if(!udb)
319 		return;
320 	if(udb->fd != -1 && udb->base && udb->alloc) {
321 		uint64_t nsize = udb->alloc->disk->nextgrow;
322 		if(nsize < udb->base_size)
323 			udb_base_shrink(udb, nsize);
324 	}
325 	if(udb->fd != -1) {
326 		udb->glob_data->clean_close = 1;
327 		close(udb->fd);
328 		udb->fd = -1;
329 	}
330 	if(udb->base) {
331 #ifdef HAVE_MMAP
332 		if(munmap(udb->base, udb->base_size) == -1) {
333 			log_msg(LOG_ERR, "munmap: %s", strerror(errno));
334 		}
335 #endif
336 		udb->base = NULL;
337 	}
338 }
339 
340 void udb_base_free(udb_base* udb)
341 {
342 	if(!udb)
343 		return;
344 	udb_base_close(udb);
345 	udb_alloc_delete(udb->alloc);
346 	free(udb->ram_hash);
347 	free(udb->fname);
348 	free(udb);
349 }
350 
351 void udb_base_free_keep_mmap(udb_base* udb)
352 {
353 	if(!udb) return;
354 	if(udb->fd != -1) {
355 		close(udb->fd);
356 		udb->fd = -1;
357 	}
358 	udb->base = NULL;
359 	udb_alloc_delete(udb->alloc);
360 	free(udb->ram_hash);
361 	free(udb->fname);
362 	free(udb);
363 }
364 
365 void udb_base_sync(udb_base* udb, int wait)
366 {
367 	if(!udb) return;
368 #ifdef HAVE_MMAP
369 	if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) {
370 		log_msg(LOG_ERR, "msync(%s) error %s",
371 			udb->fname, strerror(errno));
372 	}
373 #else
374 	(void)wait;
375 #endif
376 }
377 
378 /** hash a chunk pointer */
379 static uint32_t
380 chunk_hash_ptr(udb_void p)
381 {
382 	/* put p into an array of uint32 */
383 	uint32_t h[sizeof(p)/sizeof(uint32_t)];
384 	memcpy(&h, &p, sizeof(h));
385 	return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763);
386 }
387 
388 /** check that the given pointer is on the bucket for the given offset */
389 int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to)
390 {
391 	uint32_t i = chunk_hash_ptr(to) & udb->ram_mask;
392 	udb_ptr* p;
393 	assert((size_t)i < udb->ram_size);
394 	for(p = udb->ram_hash[i]; p; p=p->next) {
395 		if(p == ptr)
396 			return 1;
397 	}
398 	return 0;
399 }
400 
401 /** grow the ram array */
402 static void
403 grow_ram_hash(udb_base* udb, udb_ptr** newhash)
404 {
405 	size_t i;
406 	size_t osize= udb->ram_size;
407 	udb_ptr* p, *np;
408 	udb_ptr** oldhash = udb->ram_hash;
409 	udb->ram_size *= 2;
410 	udb->ram_mask <<= 1;
411 	udb->ram_mask |= 1;
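	/* e.g. growing from 1024 to 2048 buckets changes the mask
	 * from 0x3ff to 0x7ff */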
412 	udb->ram_hash = newhash;
413 	/* re-link every element from the old table into the new table */
414 	for(i=0; i<osize; i++) {
415 		p = oldhash[i];
416 		while(p) {
417 			np = p->next;
418 			/* link into newhash */
419 			p->prev=NULL;
420 			p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask];
421 			if(p->next) p->next->prev = p;
422 			/* go to next element of oldhash */
423 			p = np;
424 		}
425 	}
426 	free(oldhash);
427 }
428 
429 void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr)
430 {
431 	uint32_t i;
432 #ifdef UDB_CHECK
433 	assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/
434 #endif
435 	udb->ram_num++;
436 	if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0x7fffffff) {
437 		/* grow the array, if allocation succeeds */
438 		udb_ptr** newram = (udb_ptr**)xalloc_array_zero(
439 			sizeof(udb_ptr*), udb->ram_size*2);
440 		if(newram) {
441 			grow_ram_hash(udb, newram);
442 		}
443 	}
444 	i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
445 	assert((size_t)i < udb->ram_size);
446 
447 	ptr->prev = NULL;
448 	ptr->next = udb->ram_hash[i];
449 	udb->ram_hash[i] = ptr;
450 	if(ptr->next)
451 		ptr->next->prev = ptr;
452 }
453 
454 void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr)
455 {
456 	assert(ptr->data);
457 #ifdef UDB_CHECK
458 	assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */
459 	assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data));
460 #endif
461 	udb->ram_num--;
462 	if(ptr->next)
463 		ptr->next->prev = ptr->prev;
464 	if(ptr->prev)
465 		ptr->prev->next = ptr->next;
466 	else	{
467 		uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
468 		assert((size_t)i < udb->ram_size);
469 		udb->ram_hash[i] = ptr->next;
470 	}
471 }
472 
473 /** change a set of ram ptrs to a new value */
474 static void
475 udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd)
476 {
477 	uint32_t io = chunk_hash_ptr(old) & udb->ram_mask;
478 	udb_ptr* p, *np;
479 	/* edit them and move them into the new position */
480 	p = udb->ram_hash[io];
481 	while(p) {
482 		np = p->next;
483 		if(p->data == old) {
484 			udb_base_unlink_ptr(udb, p);
485 			p->data = newd;
486 			udb_base_link_ptr(udb, p);
487 		}
488 		p = np;
489 	}
490 }
491 
492 udb_rel_ptr* udb_base_get_userdata(udb_base* udb)
493 {
494 	return &udb->glob_data->user_global;
495 }
496 
497 void udb_base_set_userdata(udb_base* udb, udb_void user)
498 {
499 #ifdef UDB_CHECK
500 	if(user) { assert(udb_valid_dataptr(udb, user)); }
501 #endif
502 	udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user);
503 }
504 
505 void udb_base_set_userflags(udb_base* udb, uint8_t v)
506 {
507 	udb->glob_data->userflags = v;
508 }
509 
510 uint8_t udb_base_get_userflags(udb_base* udb)
511 {
512 	return udb->glob_data->userflags;
513 }
514 
515 /** re-mmap the udb to specified size */
516 static void*
517 udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize)
518 {
519 #ifdef HAVE_MMAP
520 	void* nb;
521 	/* when testing with valgrind, do not use mremap; use the munmap+mmap path below instead */
522 #ifdef MREMAP_MAYMOVE
523 	nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE);
524 	if(nb == MAP_FAILED) {
525 		log_msg(LOG_ERR, "mremap(%s, size %u) error %s",
526 			udb->fname, (unsigned)nsize, strerror(errno));
527 		return 0;
528 	}
529 #else /* !MREMAP_MAYMOVE */
530 	/* use munmap-mmap to simulate mremap */
531 	if(munmap(udb->base, udb->base_size) != 0) {
532 		log_msg(LOG_ERR, "munmap(%s) error %s",
533 			udb->fname, strerror(errno));
534 	}
535 	/* provide hint for new location */
536 	/* note the size_t casts must be there for portability, on some
537 	 * systems the layout of memory is otherwise broken. */
538 	nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
539 		(int)MAP_SHARED, (int)udb->fd, (off_t)0);
540 	/* retry the mmap without basept in case of ENOMEM (FreeBSD8),
541 	 * the kernel can then try to mmap it at a different location
542 	 * where more memory is available */
543 	if(nb == MAP_FAILED && errno == ENOMEM) {
544 		nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
545 			(int)MAP_SHARED, (int)udb->fd, (off_t)0);
546 	}
547 	if(nb == MAP_FAILED) {
548 		log_msg(LOG_ERR, "mmap(%s, size %u) error %s",
549 			udb->fname, (unsigned)nsize, strerror(errno));
550 		udb->base = NULL;
551 		return 0;
552 	}
553 #endif /* MREMAP_MAYMOVE */
554 	if(nb != udb->base) {
555 		/* fix up realpointers in udb and alloc */
556 		/* but mremap may have been nice and not move the base */
557 		udb->base = nb;
558 		udb->glob_data = (udb_glob_d*)((char*)nb+sizeof(uint64_t));
559 		/* use passed alloc pointer because the udb->alloc may not
560 		 * be initialized yet */
561 		alloc->disk = (udb_alloc_d*)((char*)udb->glob_data
562 			+sizeof(*udb->glob_data));
563 	}
564 	udb->base_size = nsize;
565 	return nb;
566 #else /* HAVE_MMAP */
567 	(void)udb; (void)alloc; (void)nsize;
568 	return NULL;
569 #endif /* HAVE_MMAP */
570 }
571 
572 void
573 udb_base_remap_process(udb_base* udb)
574 {
575 	/* assume that fsize is still accessible */
576 	udb_base_remap(udb, udb->alloc, udb->glob_data->fsize);
577 }
578 
579 /** grow file to specified size and re-mmap, return new base */
580 static void*
581 udb_base_grow_and_remap(udb_base* udb, uint64_t nsize)
582 {
583 	/* grow file by writing a single zero at that spot, the
584 	 * rest is filled in with zeroes. */
585 	uint8_t z = 0;
586 	ssize_t w;
587 
588 	assert(nsize > 0);
589 	udb->glob_data->dirty_alloc = udb_dirty_fsize;
590 #ifdef HAVE_PWRITE
591 	if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) {
592 #else
593 	if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) {
594 		log_msg(LOG_ERR, "lseek %s: %s", udb->fname, strerror(errno));
595 		return 0;
596 	}
597 	if((w=write(udb->fd, &z, sizeof(z))) == -1) {
598 #endif
599 		log_msg(LOG_ERR, "grow(%s, size %u) error %s",
600 			udb->fname, (unsigned)nsize, strerror(errno));
601 		return 0;
602 	} else if(w != (ssize_t)sizeof(z)) {
603 		log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)",
604 			udb->fname, (unsigned)nsize);
605 		return 0;
606 	}
607 	udb->glob_data->fsize = nsize;
608 	udb->glob_data->dirty_alloc = udb_dirty_clean;
609 	return udb_base_remap(udb, udb->alloc, nsize);
610 }
611 
612 int udb_exp_size(uint64_t a)
613 {
614 	/* find enclosing value such that 2**x >= a */
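	/* e.g. a=5: after the decrement i=4 (binary 100), the loops count
	 * 3 bits, so x=3 and 2**3=8 >= 5 > 2**2 */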
615 	int x = 0;
616 	uint64_t i = a;
617 	assert(a != 0);
618 
619 	i --;
620 	/* could optimise this with uint8* access, depends on endianness */
621 	/* first whole bytes */
622 	while( (i&(~(uint64_t)0xff)) ) {
623 		i >>= 8;
624 		x += 8;
625 	}
626 	/* now details */
627 	while(i) {
628 		i >>= 1;
629 		x ++;
630 	}
631 	assert( x>=0 && x<=63);
632 	assert( ((uint64_t)1<<x) >= a);
633 	assert( x==0 || /* <<x-1 without negative number analyzer complaints: */ (((uint64_t)1<<x)>>1) < a);
634 	return x;
635 }
636 
637 int udb_exp_offset(uint64_t o)
638 {
639 	/* this means measuring the number of 0 bits on the right */
640 	/* so, if exp zero bits then (o&(2**x-1))==0 */
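	/* e.g. o=24 (binary 11000) has 3 zero bits on the right, so x=3;
	 * 24 is a multiple of 2**3 but not of 2**4 */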
641 	int x = 0;
642 	uint64_t i = o;
643 	assert(o != 0);
644 	/* first whole bytes */
645 	while( (i&(uint64_t)0xff) == 0) {
646 		i >>= 8;
647 		x += 8;
648 	}
649 	/* now details */
650 	while( (i&(uint64_t)0x1) == 0) {
651 		i >>= 1;
652 		x ++;
653 	}
654 	assert( o % ((uint64_t)1<<x) == 0);
655 	assert( o % ((uint64_t)1<<(x+1)) != 0);
656 	return x;
657 }
658 
659 void udb_alloc_init_new(udb_alloc_d* a)
660 {
661 	assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0);
662 	memset(a, 0, sizeof(*a));
663 	/* set new allocations after header, as if allocated in a sequence
664 	 * of minsize allocations */
665 	a->nextgrow = UDB_HEADER_SIZE;
666 }
667 
668 /** fsck the file size, false if failed and file is useless */
669 static int
670 fsck_fsize(udb_base* udb, udb_alloc* alloc)
671 {
672 	off_t realsize;
673 	log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname);
674 	realsize = lseek(udb->fd, (off_t)0, SEEK_END);
675 	if(realsize == (off_t)-1) {
676 		log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno));
677 		return 0;
678 	}
679 	udb->glob_data->fsize = (uint64_t)realsize;
680 	if(!udb_base_remap(udb, alloc, (uint64_t)realsize))
681 		return 0;
682 	udb->glob_data->dirty_alloc = udb_dirty_clean;
683 	log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname);
684 	udb_base_sync(udb, 1);
685 	return 1;
686 }
687 
688 /** regenerate the freelist: add a new free chunk, return next todo */
689 static udb_void
690 regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen)
691 {
692 	udb_free_chunk_d* cp = UDB_FREE_CHUNK(c);
693 	uint64_t esz = (uint64_t)1<<exp;
694 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
695 		return 0;
696 	}
697 	cp->type = udb_chunk_type_free;
698 	cp->flags = 0;
699 	chunk_set_last(base, c, exp, (uint8_t)exp);
700 	cp->prev = 0;
701 	cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP];
702 	if(cp->next)
703 		UDB_FREE_CHUNK(cp->next)->prev = c;
704 	regen->stat_free += esz;
705 	return c + esz;
706 }
707 
708 /** regenerate xl chunk, return next todo */
709 static udb_void
710 regen_xl(void* base, udb_void c, udb_alloc_d* regen)
711 {
712 	udb_xl_chunk_d* cp = UDB_XL_CHUNK(c);
713 	uint64_t xlsz = cp->size;
714 	if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
715 		return 0;
716 	}
717 	if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
718 		return 0;
719 	}
720 	/* fixup end-size and end-expmarker */
721 	regen->stat_alloc += xlsz;
722 	return c + xlsz;
723 }
724 
725 /** regenerate data chunk, return next todo */
726 static udb_void
727 regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen)
728 {
729 	uint64_t esz = (uint64_t)1<<exp;
730 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
731 		return 0;
732 	}
733 	chunk_set_last(base, c, exp, (uint8_t)exp);
734 	regen->stat_alloc += esz;
735 	return c + esz;
736 }
737 
738 /** regenerate a relptr structure inside a data segment */
739 static void
740 regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg)
741 {
742 	udb_void* a = (udb_void*)arg;
743 	/* ignore 0 pointers */
744 	if(!rp->data)
745 		return;
746 
747 	/* edit relptrs that point to oldmoved to point to newmoved. */
748 	if(rp->data == a[0])
749 		rp->data = a[1];
750 
751 	/* regenerate relptr lists, add this item to the relptr list for
752 	 * the data that it points to */
753 	udb_rel_ptr_link(base, rp, rp->data);
754 }
755 
756 /** regenerate the relptrs store in this data segment */
757 static void
758 regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp,
759 	void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new)
760 {
761 	udb_void arg[2];
762 	arg[0] = rb_old; arg[1] = rb_new;
763 	/* walk through the structs here and put them on their respective
764 	 * relptr lists */
765 	(*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz,
766 		&regen_relptr_func, arg);
767 
768 }
769 
770 /** regenerate relptrlists in the file */
771 static void
772 regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc,
773 	udb_void rb_old, udb_void rb_new)
774 {
775 	udb_void at = alloc->udb->glob_data->hsize;
776 	/* clear all ptrlist start pointers in the file. */
777 	while(at < alloc->disk->nextgrow) {
778 		int exp = (int)UDB_CHUNK(at)->exp;
779 		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
780 		if(exp == UDB_EXP_XL) {
781 			UDB_XL_CHUNK(at)->ptrlist = 0;
782 			at += UDB_XL_CHUNK(at)->size;
783 		} else if(tp == udb_chunk_type_free) {
784 			at += (uint64_t)1<<exp;
785 		} else { /* data chunk */
786 			UDB_CHUNK(at)->ptrlist = 0;
787 			at += (uint64_t)1<<exp;
788 		}
789 	}
790 	/* walk through all relptr structs and put on the right list. */
791 	at = alloc->udb->glob_data->hsize;
792 	while(at < alloc->disk->nextgrow) {
793 		udb_chunk_d* atp = UDB_CHUNK(at);
794 		int exp = (int)atp->exp;
795 		udb_chunk_type tp = (udb_chunk_type)atp->type;
796 		uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size:
797 			(uint64_t)1<<exp);
798 		if(exp == UDB_EXP_XL) {
799 			assert(at != rb_old); /* should have been freed */
800 			regen_its_ptrs(base, udb, atp,
801 				((char*)atp)+sizeof(udb_xl_chunk_d),
802 				sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2,
803 				rb_old, rb_new);
804 			at += sz;
805 		} else if(tp == udb_chunk_type_free) {
806 			at += sz;
807 		} else { /* data chunk */
808 			assert(at != rb_old); /* should have been freed */
809 			regen_its_ptrs(base, udb, atp,
810 				((char*)atp)+sizeof(udb_chunk_d),
811 				sz-sizeof(udb_chunk_d)-1, rb_old, rb_new);
812 			at += sz;
813 		}
814 	}
815 }
816 
817 
818 /** mark free elements from former XL chunk space; later fixups pick that up */
819 static void
820 rb_mark_free_segs(void* base, udb_void s, uint64_t m)
821 {
822 	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
823 	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
824 	assert(s >= UDB_ALLOC_CHUNK_SIZE);
825 	while(q >= s) {
826 		UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX;
827 		UDB_CHUNK(q)->type = udb_chunk_type_free;
828 		q -= UDB_ALLOC_CHUNK_SIZE;
829 	}
830 }
831 
832 
833 /** fsck rollback or rollforward XL move results */
834 static int
835 fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new,
836 	uint64_t rb_size, uint64_t rb_seg)
837 {
838 
839 	if(rb_old <= rb_new)
840 		return 0; /* XL move one way */
841 	if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
842 		return 0; /* not aligned */
843 	if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
844 		return 0; /* not aligned */
845 	if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
846 		return 0; /* not aligned */
847 	if(rb_new + rb_size <= rb_old) {
848 		/* not overlapping: resume copy */
849 		memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
850 		/* and free up old piece(s) */
851 		rb_mark_free_segs(base, rb_old, rb_size);
852 	} else {
853 		/* overlapping, see what segment we stopped at
854 		 * and continue there. */
855 		move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg);
856 		/* free up old piece(s); from the end of the moved segment,
857 		 * until the end of the old segment */
858 		rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)-
859 			(rb_new+rb_size));
860 	}
861 	/* do not call fix_ptrs, regenptrs does the job */
862 	return 1;
863 }
864 
865 /** fsck rollback or rollforward move results */
866 static int
867 fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size,
868 	udb_void* make_free)
869 {
870 	if( (rb_size&(rb_size-1)) != 0)
871 		return 0; /* not powerof2 */
872 	if( (rb_old&(rb_size-1)) != 0)
873 		return 0; /* not aligned */
874 	if( (rb_new&(rb_size-1)) != 0)
875 		return 0; /* not aligned */
876 	/* resume copy */
877 	memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
878 	/* do not call fix_ptrs, regenptrs does the job */
879 	/* make sure udb_old is freed */
880 	*make_free = rb_old;
881 	return 1;
882 }
883 
884 /** fsck the file and salvage, false if failed and file is useless */
885 static int
886 fsck_file(udb_base* udb, udb_alloc* alloc, int moved)
887 {
888 	void* base = udb->base;
889 	udb_alloc_d regen;
890 	udb_void at = udb->glob_data->hsize;
891 	udb_void rb_old = udb->glob_data->rb_old;
892 	udb_void rb_new = udb->glob_data->rb_new;
893 	udb_void rb_seg = udb->glob_data->rb_seg;
894 	udb_void make_free = 0;
895 	uint64_t rb_size = udb->glob_data->rb_size;
896 	log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname);
897 	/* walk through the file, use the exp values to see what can be
898 	 * salvaged */
899 	if(moved && rb_old && rb_new && rb_size) {
900 		if(rb_old+rb_size <= alloc->disk->nextgrow
901 			&& rb_new+rb_size <= alloc->disk->nextgrow) {
902 			/* we can use the move information to fix up the
903 			 * duplicate element (or partially moved element) */
904 			if(rb_size > 1024*1024) {
905 				/* XL chunk */
906 				if(!fsck_rb_xl(base, udb, rb_old, rb_new,
907 					rb_size, rb_seg))
908 					return 0;
909 			} else {
910 				if(!fsck_rb(base, rb_old, rb_new, rb_size,
911 					&make_free))
912 					return 0;
913 			}
914 		}
915 	}
916 
917 	/* rebuild freelists */
918 	/* recalculate stats in alloc (except 'stat_data') */
919 	/* possibly new end 'nextgrow' value */
920 	memset(&regen, 0, sizeof(regen));
921 	regen.nextgrow = alloc->disk->nextgrow;
922 	while(at < regen.nextgrow) {
923 		/* figure out this chunk */
924 		int exp = (int)UDB_CHUNK(at)->exp;
925 		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
926 		/* consistency check possible here with end-exp */
927 		if(tp == udb_chunk_type_free || at == make_free) {
928 			at = regen_free(base, at, exp, &regen);
929 			if(!at) return 0;
930 		} else if(exp == UDB_EXP_XL) {
931 			/* allocated data of XL size */
932 			at = regen_xl(base, at, &regen);
933 			if(!at) return 0;
934 		} else if(exp >= UDB_ALLOC_CHUNK_MINEXP
935 			&& exp <= UDB_ALLOC_CHUNKS_MAX) {
936 			/* allocated data */
937 			at = regen_data(base, at, exp, &regen);
938 			if(!at) return 0;
939 		} else {
940 			/* garbage; this must be EOF then */
941 			regen.nextgrow = at;
942 			break;
943 		}
944 	}
945 	*alloc->disk = regen;
946 
947 	/* rebuild relptr lists */
948 	regen_ptrlist(base, udb, alloc, rb_old, rb_new);
949 
950 	log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)",
951 		udb->fname);
952 	udb->glob_data->rb_old = 0;
953 	udb->glob_data->rb_new = 0;
954 	udb->glob_data->rb_size = 0;
955 	udb->glob_data->dirty_alloc = udb_dirty_clean;
956 	udb_base_sync(udb, 1);
957 	return 1;
958 }
959 
960 
961 udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk)
962 {
963 	udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc));
964 	if(!alloc)
965 		return NULL;
966 	alloc->udb = udb;
967 	alloc->disk = disk;
968 	/* see if committed but uncompleted actions need to be done */
969 	/* preserves the alloc state */
970 	if(udb->glob_data->dirty_alloc != udb_dirty_clean) {
971 		if(udb->glob_data->dirty_alloc == udb_dirty_fsize) {
972 			if(fsck_fsize(udb, alloc))
973 				return alloc;
974 		} else if(udb->glob_data->dirty_alloc == udb_dirty_fl) {
975 			if(fsck_file(udb, alloc, 0))
976 				return alloc;
977 		} else if(udb->glob_data->dirty_alloc == udb_dirty_compact) {
978 			if(fsck_file(udb, alloc, 1))
979 				return alloc;
980 		}
981 		log_msg(LOG_ERR, "error: file allocation dirty (%d)",
982 			(int)udb->glob_data->dirty_alloc);
983 		free(alloc);
984 		return NULL;
985 	}
986 	return alloc;
987 }
988 
989 void udb_alloc_delete(udb_alloc* alloc)
990 {
991 	if(!alloc) return;
992 	free(alloc);
993 }
994 
995 /** unlink this element from its freelist */
996 static void
997 udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp)
998 {
999 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk);
1000 	assert(chunk);
1001 	/* chunk is a free chunk */
1002 	assert(fp->exp == (uint8_t)exp);
1003 	assert(fp->type == udb_chunk_type_free);
1004 	assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp);
1005 	/* and thus freelist not empty */
1006 	assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]);
1007 	/* unlink */
1008 	if(fp->prev)
1009 		UDB_FREE_CHUNK(fp->prev)->next = fp->next;
1010 	else	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
1011 	if(fp->next)
1012 		UDB_FREE_CHUNK(fp->next)->prev = fp->prev;
1013 }
1014 
1015 /** pop first element off freelist, list may not be empty */
1016 static udb_void
1017 udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp)
1018 {
1019 	udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1020 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1021 	assert(f);
1022 	assert(fp->exp == (uint8_t)exp);
1023 	assert(fp->type == udb_chunk_type_free);
1024 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1025 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
1026 	if(fp->next) {
1027 		UDB_FREE_CHUNK(fp->next)->prev = 0;
1028 	}
1029 	return f;
1030 }
1031 
1032 /** push new element onto freelist */
1033 static void
1034 udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
1035 {
1036 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1037 	assert(f);
1038 	fp->exp = (uint8_t)exp;
1039 	fp->type = udb_chunk_type_free;
1040 	fp->flags = 0;
1041 	fp->prev = 0;
1042 	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1043 	if(fp->next)
1044 		UDB_FREE_CHUNK(fp->next)->prev = f;
1045 	chunk_set_last(base, f, exp, (uint8_t)exp);
1046 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1047 }
1048 
1049 /** push new element onto freelist - do not initialize the elt */
1050 static void
1051 udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
1052 {
1053 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1054 	assert(f);
1055 	assert(fp->exp == (uint8_t)exp);
1056 	assert(fp->type == udb_chunk_type_free);
1057 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1058 	fp->prev = 0;
1059 	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1060 	if(fp->next)
1061 		UDB_FREE_CHUNK(fp->next)->prev = f;
1062 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1063 }
1064 
1065 /** add free chunks at end until specified alignment occurs */
1066 static void
1067 grow_align(void* base, udb_alloc* alloc, uint64_t esz)
1068 {
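	/* e.g. nextgrow=0x180000 with esz=UDB_ALLOC_CHUNK_SIZE (1 Mb): one
	 * free chunk of 2**19 (512 Kb) is pushed and nextgrow becomes
	 * 0x200000, which is 1 Mb aligned */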
1069 	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
1070 		/* the nextgrow is not a whole multiple of esz. */
1071 		/* grow a free chunk of max allowed size */
1072 		int fexp = udb_exp_offset(alloc->disk->nextgrow);
1073 		uint64_t fsz = (uint64_t)1<<fexp;
1074 		udb_void f = alloc->disk->nextgrow;
1075 		udb_void fn = alloc->disk->nextgrow+fsz;
1076 		assert(fn <= alloc->udb->base_size);
1077 		alloc->disk->stat_free += fsz;
1078 		udb_alloc_push_fl(base, alloc, f, fexp);
1079 		/* now increase nextgrow to commit that free chunk */
1080 		alloc->disk->nextgrow = fn;
1081 	}
1082 }
1083 
1084 /** append chunks at end of memory space to get size exp, return dataptr */
1085 static udb_void
1086 grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
1087 {
1088 	uint64_t esz = (uint64_t)1<<exp;
1089 	udb_void ret;
1090 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1091 	grow_align(base, alloc, esz);
1092 	/* free chunks are grown, grow the one we want to use */
1093 	ret = alloc->disk->nextgrow;
1094 	/* take a new alloced chunk into use */
1095 	UDB_CHUNK(ret)->exp = (uint8_t)exp;
1096 	UDB_CHUNK(ret)->flags = 0;
1097 	UDB_CHUNK(ret)->ptrlist = 0;
1098 	UDB_CHUNK(ret)->type = udb_chunk_type_data;
1099 	/* store last octet */
1100 	chunk_set_last(base, ret, exp, (uint8_t)exp);
1101 	/* update stats */
1102 	alloc->disk->stat_alloc += esz;
1103 	alloc->disk->stat_data += sz;
1104 	/* now increase nextgrow to commit this newly allocated chunk */
1105 	alloc->disk->nextgrow += esz;
1106 	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
1107 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1108 	return ret + sizeof(udb_chunk_d); /* ptr to data */
1109 }
1110 
1111 /** calculate how much space is necessary to grow for this exp */
1112 static uint64_t
1113 grow_end_calc(udb_alloc* alloc, int exp)
1114 {
1115 	uint64_t sz = (uint64_t)1<<exp;
1116 	uint64_t ng = alloc->disk->nextgrow;
1117 	uint64_t res;
1118 	/* if nextgrow is 2**expness, no extra growth needed, only size */
1119 	if( (ng & (sz-1)) == 0) {
1120 		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
1121 		return ng+sz; /* must grow exactly 2**exp */
1122 	}
1123 	/* grow until 2**expness and then we need 2**exp as well */
1124 	/* so, round ng down to whole sz (basically  ng-ng%sz, or ng/sz*sz)
1125 	 * and then add the sz twice (go up to whole sz, and to allocate) */
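	/* e.g. ng=0x1800 and exp=12 (sz=0x1000): round down to 0x1000 and
	 * add 2*0x1000, so the file must reach 0x3000 */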
1126 	res = (ng & ~(sz-1)) + 2*sz;
1127 	return res;
1128 }
1129 
1130 /** see if we need to grow more than specified to enable sustained growth */
1131 static uint64_t
1132 grow_extra_check(udb_alloc* alloc, uint64_t ge)
1133 {
1134 	const uint64_t mb = 1024*1024;
1135 	uint64_t bsz = alloc->udb->base_size;
1136 	if(bsz <= mb) {
1137 		/* below 1 Mb, double sizes for exponential growth */
1138 		/* takes about 15 times to grow to 1Mb */
1139 		if(ge < bsz*2)
1140 			return bsz*2;
1141 	} else {
1142 		uint64_t gnow = ge - bsz;
1143 		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
1144 		 * in whole megabytes rounded up. */
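		/* e.g. a 4 Mb base gives want = 0 + 1 Mb; a 64 Mb base gives
		 * want = 8 Mb + 1 Mb = 9 Mb */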
1145 		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
1146 		if(gnow < want)
1147 			return bsz + want;
1148 	}
1149 	return ge;
1150 }
1151 
1152 /** see if there is enough free space to warrant a shrink (while the file is open) */
1153 static int
1154 enough_free(udb_alloc* alloc)
1155 {
1156 	if(alloc->udb->base_size <= 2*1024*1024) {
1157 		/* below 1 Mb the file grows by doubling (so up to 2 Mb);
1158 		 * do not shrink unless we can go down to 1/3 of the size */
1159 		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
1160 			return 1;
1161 	} else {
1162 		/* grown 12.5%, shrink 25% if possible, at least one mb */
1163 		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
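		/* e.g. an 8 Mb file with nextgrow at 5 Mb has 3 Mb of free
		 * space; 3*4 >= 8, so it is worth shrinking */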
1164 		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
1165 		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
1166 			|| alloc->udb->base_size < 4*1024*1024))
1167 			return 1;
1168 	}
1169 	return 0;
1170 }
1171 
1172 /** grow space for a chunk of 2**exp and return dataptr */
1173 static udb_void
1174 udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
1175 {
1176 	/* commit the grow action
1177 	 * - the file grow only changes filesize, but not the nextgrow.
1178 	 * - taking space after nextgrow into use (as free space),
1179 	 *   is like free-ing a chunk (one at a time).
1180 	 * - and the last chunk taken into use is like alloc.
1181 	 */
1182 	/* predict how much free space is needed for this */
1183 	uint64_t grow_end = grow_end_calc(alloc, exp);
1184 	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
1185 	if(grow_end <= alloc->udb->base_size) {
1186 		/* we can do this with the available space */
1187 		return grow_chunks(base, alloc, sz, exp);
1188 	}
1189 	/* we have to grow the file, re-mmap */
1190 	/* see if we need to grow a little more, to avoid endless grow
1191 	 * efforts on adding data */
1192 	grow_end = grow_extra_check(alloc, grow_end);
1193 	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1194 		return 0; /* mmap or write failed (disk or mem full) */
1195 	}
1196 	/* we have enough space now */
1197 	assert(grow_end <= alloc->udb->base_size);
1198 	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1199 	return grow_chunks(base, alloc, sz, exp);
1200 }
1201 
1202 /** take XL allocation into use at end of file, return dataptr */
1203 static udb_void
1204 grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
1205 {
1206 	udb_void ret;
1207 	udb_xl_chunk_d* p;
1208 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1209 
1210 	/* align growth to whole mbs */
1211 	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);
1212 
1213 	/* grow XL segment */
1214 	ret = alloc->disk->nextgrow;
1215 	p = UDB_XL_CHUNK(ret);
1216 	p->exp = UDB_EXP_XL;
1217 	p->size = xlsz;
1218 	p->flags = 0;
1219 	p->ptrlist = 0;
1220 	p->type = udb_chunk_type_data;
1221 
1222 	/* also put size and marker at end for compaction */
1223 	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
1224 	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;
1225 
1226 	/* stats */
1227 	alloc->disk->stat_data += sz;
1228 	alloc->disk->stat_alloc += xlsz;
1229 	/* now increase the nextgrow to commit this xl chunk */
1230 	alloc->disk->nextgrow += xlsz;
1231 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1232 	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
1233 }
1234 
1235 /** make space for XL allocation */
1236 static udb_void
1237 udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
1238 {
1239 	/* allocate whole mbs of space, at end of space */
1240 	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
1241 	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
1242 	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
1243 	assert(need >= asz);
1244 	if(grow_end <= alloc->udb->base_size) {
1245 		/* can do this in available space */
1246 		return grow_xl(base, alloc, need, sz);
1247 	}
1248 	/* have to grow file and re-mmap */
1249 	grow_end = grow_extra_check(alloc, grow_end);
1250 	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1251 		return 0; /* mmap or write failed (disk or mem full) */
1252 	}
1253 	/* we have enough space now */
1254 	assert(grow_end <= alloc->udb->base_size);
1255 	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1256 	return grow_xl(base, alloc, need, sz);
1257 }
1258 
1259 /** divide big(2**e2) into pieces so 2**exp fits */
1260 static udb_void
1261 udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
1262 	int exp)
1263 {
1264 	int e = e2;
1265 	uint64_t sz = (uint64_t)1<<e2;
1266 	assert(big && e2 > exp);
1267 	/* so the returned piece to use is the first piece,
1268 	 * offload the later half until it fits */
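	/* e.g. big of 2**5 octets with exp=3: push big+16 (exp 4) and
	 * big+8 (exp 3) onto the freelists, return big as the 2**3 piece */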
1269 	do {
1270 		sz >>= 1; /* divide size of big by two */
1271 		e--;      /* that means its exp is one smaller */
1272 		udb_alloc_push_fl(base, alloc, big+sz, e);
1273 	} while(e != exp);
1274 	/* exit loop when last pushed is same size as what we want */
1275 	return big;
1276 }
1277 
1278 /** returns the exponent size of the chunk needed for data sz */
1279 static int
1280 udb_alloc_exp_needed(size_t sz)
1281 {
1282 	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
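	/* for example, if sizeof(udb_chunk_d) were 16 octets (hypothetical),
	 * sz=100 would give asz=117 and a 2**7 = 128 octet chunk */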
1283 	if(asz > UDB_ALLOC_CHUNK_SIZE) {
1284 		return UDB_EXP_XL;
1285 	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
1286 		return UDB_ALLOC_CHUNK_MINEXP;
1287 	}
1288 	return udb_exp_size(asz);
1289 }
1290 
1291 udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
1292 {
1293 	void* base = alloc->udb->base;
1294 	/* calculate actual allocation size */
1295 	int e2, exp = udb_alloc_exp_needed(sz);
1296 	if(exp == UDB_EXP_XL)
1297 		return udb_alloc_xl_space(base, alloc, sz);
1298 	/* see if there is a free chunk of that size exactly */
1299 	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
1300 		/* snip from freelist, udb_chunk_d */
1301 		udb_void ret;
1302 		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1303 		ret = udb_alloc_pop_fl(base, alloc, exp);
1304 		/* use it - size octets already OK */
1305 		UDB_CHUNK(ret)->flags = 0;
1306 		UDB_CHUNK(ret)->ptrlist = 0;
1307 		UDB_CHUNK(ret)->type = udb_chunk_type_data;
1308 		/* update stats */
1309 		alloc->disk->stat_data += sz;
1310 		alloc->disk->stat_alloc += (1<<exp);
1311 		assert(alloc->disk->stat_free >= (1u<<exp));
1312 		alloc->disk->stat_free -= (1<<exp);
1313 		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1314 		return ret + sizeof(udb_chunk_d); /* ptr to data */
1315 	}
1316 	/* see if we can subdivide a larger chunk */
1317 	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
1318 		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1319 			udb_void big, ret; /* udb_chunk_d */
1320 			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1321 			big = udb_alloc_pop_fl(base, alloc, e2);
1322 			/* push other parts onto freelists (needs inited) */
1323 			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
1324 			/* use final part (needs inited) */
1325 			UDB_CHUNK(ret)->exp = (uint8_t)exp;
1326 			/* if we crash here, the new exp makes this look like a smaller free chunk */
1327 			UDB_CHUNK(ret)->flags = 0;
1328 			UDB_CHUNK(ret)->ptrlist = 0;
1329 			/* set type to commit data chunk */
1330 			UDB_CHUNK(ret)->type = udb_chunk_type_data;
1331 			/* store last octet */
1332 			chunk_set_last(base, ret, exp, (uint8_t)exp);
1333 			/* update stats */
1334 			alloc->disk->stat_data += sz;
1335 			alloc->disk->stat_alloc += (1<<exp);
1336 			assert(alloc->disk->stat_free >= (1u<<exp));
1337 			alloc->disk->stat_free -= (1<<exp);
1338 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1339 			return ret + sizeof(udb_chunk_d); /* ptr to data */
1340 		}
1341 	/* we need to grow an extra chunk */
1342 	return udb_alloc_grow_space(base, alloc, sz, exp);
1343 }
1344 
1345 /** see if there is free space to allocate a chunk into */
1346 static int
1347 have_free_for(udb_alloc* alloc, int exp)
1348 {
1349 	int e2;
1350 	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP])
1351 		return exp;
1352 	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
1353 		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1354 			return e2;
1355 		}
1356 	return 0;
1357 }
1358 
1359 /** fix relptr prev and next for moved relptr structures */
1360 static void
1361 chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg)
1362 {
1363 	udb_void* data = (udb_void*)arg;
1364 	udb_void r;
1365 	if(!rp->data)
1366 		return;
1367 	r = UDB_SYSTOREL(base, rp);
1368 	if(rp->next)
1369 		UDB_REL_PTR(rp->next)->prev = r;
1370 	if(rp->prev)
1371 		UDB_REL_PTR(rp->prev)->next = r;
1372 	else	{
1373 		/* if this is a pointer to its own chunk, fix it up;
1374 		 * the data ptr gets set by relptr_edit later. */
1375 		if(rp->data == data[0])
1376 			UDB_CHUNK(data[1])->ptrlist = r;
1377 		else	UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r;
1378 	}
1379 }
1380 
1381 /** fix pointers from and to a moved chunk */
1382 static void
1383 chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data,
1384 	uint64_t dsz, udb_void olddata)
1385 {
1386 	udb_void d[2];
1387 	d[0] = olddata;
1388 	d[1] = data;
1389 	(*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data),
1390 		dsz, &chunk_fix_ptr_each, d);
1391 	udb_rel_ptr_edit(base, cp->ptrlist, data);
1392 	udb_base_ram_ptr_edit(udb, olddata, data);
1393 }
1394 
1395 /** move an allocated chunk to use a free chunk */
1396 static void
1397 move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz,
1398 	int e2)
1399 {
1400 	udb_void res = udb_alloc_pop_fl(base, alloc, e2);
1401 	udb_chunk_d* rp;
1402 	udb_chunk_d* fp;
1403 	if(exp != e2) {
1404 		/* it is bigger, subdivide it */
1405 		res = udb_alloc_subdivide(base, alloc, res, e2, exp);
1406 	}
1407 	assert(res != f);
1408 	/* setup rollback information */
1409 	alloc->udb->glob_data->rb_old = f;
1410 	alloc->udb->glob_data->rb_new = res;
1411 	alloc->udb->glob_data->rb_size = esz;
1412 	/* take the res, exp into use */
1413 	rp = UDB_CHUNK(res);
1414 	fp = UDB_CHUNK(f);
1415 	/* copy over the data */
1416 	memcpy(rp, fp, esz);
1417 	/* adjust rel ptrs */
1418 	chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d),
1419 		esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d));
1420 
1421 	/* do not freeup the fp; caller does that */
1422 }
1423 
1424 /** unlink several free elements to overwrite with xl chunk */
1425 static void
1426 free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m)
1427 {
1428 	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
1429 	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
1430 	assert(s >= UDB_ALLOC_CHUNK_SIZE);
1431 	while(q >= s) {
1432 		assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX);
1433 		assert(UDB_CHUNK(q)->type == udb_chunk_type_free);
1434 		udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX);
1435 		q -= UDB_ALLOC_CHUNK_SIZE;
1436 	}
1437 }
1438 
1439 /** move an XL chunk, and keep track of segments for rollback */
1440 static void
1441 move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n,
1442 	uint64_t sz, uint64_t startseg)
1443 {
1444 	udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1445 	udb_xl_chunk_d* np = UDB_XL_CHUNK(n);
1446 	uint64_t amount = xl - n;
1447 	assert(n < xl); /* move to compact */
1448 
1449 	/* setup move rollback */
1450 	udb->glob_data->rb_old = xl;
1451 	udb->glob_data->rb_new = n;
1452 	udb->glob_data->rb_size = sz;
1453 
1454 	/* is it overlapping? */
1455 	if(sz <= amount) {
1456 		memcpy(np, xlp, sz);
1457 	} else {
1458 		/* move and commit per 1M segment to avoid data loss */
1459 		uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE;
1460 		for(seg = startseg; seg<maxseg; seg++) {
1461 			udb->glob_data->rb_seg = seg;
1462 			memcpy(np+seg*UDB_ALLOC_CHUNK_SIZE,
1463 				xlp+seg*UDB_ALLOC_CHUNK_SIZE,
1464 				UDB_ALLOC_CHUNK_SIZE);
1465 		}
1466 
1467 	}
1468 }
1469 
1470 /** move list of XL chunks to the front by the shift amount */
1471 static void
1472 move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz,
1473 	uint64_t amount)
1474 {
1475 	udb_void xl = xl_start;
1476 	assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1477 	assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1478 	assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1479 	while(xl < xl_start+xl_sz) {
1480 		udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1481 		udb_void n = xl-amount;
1482 		uint64_t sz = xlp->size;
1483 		assert(xlp->exp == UDB_EXP_XL);
1484 		move_xl_segment(base, alloc->udb, xl, n, sz, 0);
1485 		chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n),
1486 			n+sizeof(udb_xl_chunk_d),
1487 			sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2,
1488 			xl+sizeof(udb_xl_chunk_d));
1489 	}
1490 	alloc->disk->stat_free -= amount;
1491 	alloc->disk->nextgrow -= amount;
1492 	alloc->udb->glob_data->rb_old = 0;
1493 	alloc->udb->glob_data->rb_new = 0;
1494 	alloc->udb->glob_data->rb_size = 0;
1495 }
1496 
1497 /** see if free chunk can coagulate with another chunk, return other chunk */
1498 static udb_void
1499 coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp,
1500 	uint64_t esz)
1501 {
1502 	udb_void other = f^esz;
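	/* the buddy of a chunk is found by flipping its size bit, e.g. the
	 * 2**12 chunk at 0x3000 has its buddy at 0x3000^0x1000 = 0x2000 */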
1503 	if(exp == UDB_ALLOC_CHUNKS_MAX)
1504 		return 0; /* no further merges */
1505 	if(other >= alloc->udb->base_size)
1506 		return 0; /* not allocated */
1507 	if(other >= alloc->disk->nextgrow)
1508 		return 0; /* not in use */
1509 	if(other < alloc->udb->glob_data->hsize)
1510 		return 0; /* cannot merge with header */
1511 		/* the header is also protected by the special exp marker */
1512 	/* see if the other chunk is a free chunk */
1513 
1514 	/* check closest marker to avoid large memory churn */
1515 	/* and also it makes XL allocations and header special markers work */
1516 	if(f > other) {
1517 		assert(f > 1); /* this is certain because of header */
1518 		if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) {
1519 			/* can do it if the other part is a free chunk */
1520 			assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp);
1521 			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1522 				return other;
1523 		}
1524 	} else {
1525 		if(UDB_CHUNK(other)->exp == (uint8_t)exp) {
1526 			/* can do it if the other part is a free chunk */
1527 			assert(chunk_get_last(base, other, exp)==(uint8_t)exp);
1528 			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1529 				return other;
1530 		}
1531 	}
1532 	return 0;
1533 }
1534 
1535 /** coagulate and then add new free segment, return final free segment */
1536 static udb_void
1537 coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp,
1538 	uint64_t esz)
1539 {
1540 	/* new free chunk here, attempt coagulate */
1541 	udb_void other;
1542 	while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) {
1543 		/* unlink that other chunk */
1544 		udb_alloc_unlink_fl(base, alloc, other, exp);
1545 		/* merge up */
1546 		if(other < last)
1547 			last = other;
1548 		exp++;
1549 		esz <<= 1;
1550 	}
1551 	/* free the final segment */
1552 	udb_alloc_push_fl(base, alloc, last, exp);
1553 	return last;
1554 }
1555 
1556 /** attempt to compact the data and move free space to the end */
1557 int
1558 udb_alloc_compact(void* base, udb_alloc* alloc)
1559 {
1560 	udb_void last;
1561 	int exp, e2;
1562 	uint64_t esz;
1563 	uint64_t at = alloc->disk->nextgrow;
1564 	udb_void xl_start = 0;
1565 	uint64_t xl_sz = 0;
1566 	if(alloc->udb->inhibit_compact)
1567 		return 1;
1568 	alloc->udb->useful_compact = 0;
1569 	while(at > alloc->udb->glob_data->hsize) {
1570 		/* grab last entry */
1571 		exp = (int)*((uint8_t*)UDB_REL(base, at-1));
1572 		if(exp == UDB_EXP_XL) {
1573 			/* for XL chunks:
1574 			 * - inspect the size of the XL chunk list at the end
1575 			 * - attempt to compact in front of the XL chunk list
1576 			 */
1577 			uint64_t xlsz = *((uint64_t*)UDB_REL(base,
1578 				at-sizeof(uint64_t)*2));
1579 			udb_void xl = at-xlsz;
1580 #ifndef NDEBUG
1581 			udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1582 			assert(xlp->exp == UDB_EXP_XL);
1583 			assert(xlp->type != udb_chunk_type_free);
1584 #endif
1585 			/* got the segment; add it to the xl chunk list */
1586 			if(xl_start != 0 && xl+xlsz != xl_start) {
1587 				/* nonadjoining XL part, but they are aligned,
1588 				 * so the space in between is whole Mbs,
1589 				 * shift the later part(s) and continue */
1590 				uint64_t m = xl_start - (xl+xlsz);
1591 				assert(xl_start > xl+xlsz);
1592 				alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1593 				free_xl_space(base, alloc, xl+xlsz, m);
1594 				move_xl_list(base, alloc, xl_start, xl_sz, m);
1595 				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1596 			}
1597 			xl_start = xl;
1598 			xl_sz += xlsz;
1599 			at = xl;
1600 			continue;
1601 			/* end of XL if */
1602 		} else if(exp < UDB_ALLOC_CHUNK_MINEXP
1603 			|| exp > UDB_ALLOC_CHUNKS_MAX)
1604 			break; /* special chunk or garbage */
1605 		esz = (uint64_t)1<<exp;
1606 		last = at - esz;
1607 		assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
1608 		if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
1609 			/* if xl_start is set, continue looking for chunks to
1610 			 * move, but do not unlink this free segment */
1611 			if(!xl_start) {
1612 				/* it is a free chunk, remove it */
1613 				alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1614 				udb_alloc_unlink_fl(base, alloc, last, exp);
1615 				alloc->disk->stat_free -= esz;
1616 				alloc->disk->nextgrow = last;
1617 				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1618 				/* and continue at this point */
1619 			}
1620 			at = last;
1621 		} else if( (e2=have_free_for(alloc, exp)) ) {
1622 			/* last entry can be allocated in free chunks
1623 			 * move it to its new position, adjust rel_ptrs */
1624 			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1625 			move_chunk(base, alloc, last, exp, esz, e2);
1626 			if(xl_start) {
1627 				last = coagulate_and_push(base, alloc,
1628 					last, exp, esz);
1629 			} else {
1630 				/* shorten usage */
1631 				alloc->disk->stat_free -= esz;
1632 				alloc->disk->nextgrow = last;
1633 			}
1634 			alloc->udb->glob_data->rb_old = 0;
1635 			alloc->udb->glob_data->rb_new = 0;
1636 			alloc->udb->glob_data->rb_size = 0;
1637 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1638 			/* and continue in front of it */
1639 			at = last;
1640 		} else {
1641 			/* cannot compact this block, stop compacting */
1642 			break;
1643 		}
1644 		/* if that worked, repeat it */
1645 	}
1646 	/* if we passed xl chunks, see if XL-chunklist can move */
1647 	if(xl_start) {
1648 		/* calculate free space in front of the XLchunklist. */
1649 		/* has to be whole mbs of free space */
1650 		/* if so, we can move the XL chunks.  Move them all back
1651 		 * by the new free space. */
1652 		/* this compacts very well, but the XL chunks can be moved
1653 		 * multiple times; worst case for every mb freed a huge sized
1654 		 * xlchunklist gets moved. */
1655 		/* free space must be, since aligned and coagulated, in
1656 		 * chunks of a whole MB */
		udb_void at = xl_start;
		uint64_t m = 0;
		while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
			udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
			if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
				break;
			assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
			m += UDB_ALLOC_CHUNK_SIZE;
			at = chunk;
		}
		if(m != 0) {
			assert(at+m == xl_start);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
			free_xl_space(base, alloc, at, m);
			move_xl_list(base, alloc, xl_start, xl_sz, m);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
		}
	}

	/* if enough free, shrink the file; re-mmap */
	if(enough_free(alloc)) {
		uint64_t nsize = alloc->disk->nextgrow;
		udb_base_shrink(alloc->udb, nsize);
		if(!udb_base_remap(alloc->udb, alloc, nsize))
			return 0;
	}
	return 1;
}

int
udb_compact(udb_base* udb)
{
	if(!udb) return 1;
	if(!udb->useful_compact) return 1;
	DEBUG(DEBUG_DBACCESS, 1, (LOG_INFO, "Compacting database..."));
	return udb_alloc_compact(udb->base, udb->alloc);
}

void udb_compact_inhibited(udb_base* udb, int inhibit)
{
	if(!udb) return;
	udb->inhibit_compact = inhibit;
}
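
/* Illustrative sketch (compiled out): how a caller might batch many
 * deletions by inhibiting compaction, then allowing it again and running
 * a single compaction pass.  The db handle, offsets and sizes are
 * hypothetical; only udb_compact_inhibited, udb_alloc_free and
 * udb_compact are taken from this file. */
#if 0
static void example_batch_delete(udb_base* db, udb_void* offs,
	size_t* sizes, size_t count)
{
	size_t i;
	udb_compact_inhibited(db, 1);
	for(i=0; i<count; i++)
		(void)udb_alloc_free(db->alloc, offs[i], sizes[i]);
	udb_compact_inhibited(db, 0);
	(void)udb_compact(db); /* runs only if a free made it useful */
}
#endif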

#ifdef UDB_CHECK
/** check that rptrs are really zero before free */
void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
{
	(void)base;
	(void)arg;
	assert(p->data == 0);
}
#endif /* UDB_CHECK */

/** free XL chunk as multiples of CHUNK_SIZE free segments */
static void
udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
	size_t sz)
{
	uint64_t xlsz = fp->size;
	uint64_t c;
	/* lightweight check for buffer overflow in xl data */
	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */
	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
		&udb_check_rptr_zero, NULL);
#endif
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	/* update stats */
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_alloc -= xlsz;
	alloc->disk->stat_free += xlsz;
	/* walk in reverse, so the front blocks go first on the list */
	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE */
	assert(f >= UDB_ALLOC_CHUNK_SIZE);
	while(c >= f) {
		/* free a block of CHUNK_SIZE (1 Mb) */
		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
		c -= UDB_ALLOC_CHUNK_SIZE;
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
}

int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
{
	void* base;
	/* lookup chunk ptr */
	udb_void f;
	udb_chunk_d* fp;
	uint64_t esz;
	int exp;
	udb_void other;
	int coagulated = 0;
	if(!r)
		return 1; /* free(NULL) does nothing */

	/* lookup size of chunk */
	base = alloc->udb->base;
	/* fails for XL blocks */
	f = chunk_from_dataptr(r);
	fp = UDB_CHUNK(f);
	assert(fp->type != udb_chunk_type_free);

	/* see if it has a ptrlist, if so: trouble, the list is not properly
	 * cleaned up. (although you can imagine a wholesale delete where
	 * it does not matter) */
	assert(fp->ptrlist == 0);

	/* set ptrlist to 0 to stop relptr from using it, robustness. */
	fp->ptrlist = 0;

	if(fp->exp == UDB_EXP_XL) {
		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
		/* compact */
		if(alloc->udb->inhibit_compact) {
			alloc->udb->useful_compact = 1;
			return 1;
		}
		return udb_alloc_compact(base, alloc);
	}
	/* it is a regular chunk of 2**exp size */
	exp = (int)fp->exp;
	esz = (uint64_t)1<<exp;
	/* light check for e.g. buffer overflow of the data */
	assert(sz < esz);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
#endif

	/* update the stats */
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_free += esz;
	alloc->disk->stat_alloc -= esz;

	/* if it can be merged with other free chunks, do so */
	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
		coagulated = 1;
		/* unlink that other chunk and expand it (it has same size) */
		udb_alloc_unlink_fl(base, alloc, other, exp);
		/* merge up */
		if(other < f)
			f = other;
		exp++;
		esz <<= 1;
	}
	if(coagulated) {
		/* put big free chunk into freelist, and init it */
		udb_alloc_push_fl(base, alloc, f, exp);
	} else {
		/* we do not need to touch the last-exp-byte, which may save
		 * a reference to that page of memory */
		fp->type = udb_chunk_type_free;
		fp->flags = 0;
		udb_alloc_push_fl_noinit(base, alloc, f, exp);
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	/* compact */
	if(alloc->udb->inhibit_compact) {
		alloc->udb->useful_compact = 1;
		return 1;
	}
	return udb_alloc_compact(base, alloc);
}
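
/* Illustrative sketch (compiled out) of a plain allocate/use/free cycle.
 * The base pointer is re-read after the allocation, since udb_alloc_space
 * may have grown and re-mmapped the file.  The payload size is
 * hypothetical. */
#if 0
static void example_alloc_use_free(udb_alloc* alloc)
{
	size_t sz = 100;
	udb_void d = udb_alloc_space(alloc, sz);
	void* base;
	if(!d)
		return; /* allocation (or growing the file) failed */
	base = alloc->udb->base; /* re-base: the mmap may have moved */
	memset(UDB_REL(base, d), 0, sz);
	if(!udb_alloc_free(alloc, d, sz))
		log_msg(LOG_ERR, "example: free failed");
}
#endif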

udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
{
	/* could be faster maybe, if grown? */
	udb_void r = udb_alloc_space(alloc, sz);
	if(!r) return r;
	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
	return r;
}
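
/* Illustrative sketch (compiled out) of udb_alloc_init: copy an existing
 * in-memory blob into a freshly allocated chunk in one call.  The record
 * struct is hypothetical. */
#if 0
struct example_record { uint32_t serial; uint32_t flags; };
static udb_void example_store_record(udb_alloc* alloc,
	struct example_record* rec)
{
	/* returns 0 on allocation failure */
	return udb_alloc_init(alloc, rec, sizeof(*rec));
}
#endif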

udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
{
	void* base = alloc->udb->base;
	udb_void c, n, newd;
	udb_chunk_d* cp, *np;
	uint64_t avail;
	uint8_t cp_type;
	/* emulate some posix realloc stuff */
	if(r == 0)
		return udb_alloc_space(alloc, sz);
	if(sz == 0) {
		if(!udb_alloc_free(alloc, r, osz))
			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
		return 0;
	}
	c = chunk_from_dataptr(r);
	cp = UDB_CHUNK(c);
	cp_type = cp->type;
	if(cp->exp == UDB_EXP_XL) {
		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
			- sizeof(uint64_t)*2;
	} else {
		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
	}
	if(sz <= avail)
		return r;
	/* reallocate it, and copy */
	newd = udb_alloc_space(alloc, sz);
	if(!newd) return 0;
	/* re-base after alloc, since re-mmap may have happened */
	base = alloc->udb->base;
	cp = NULL; /* may be invalid now, robustness */
	n = chunk_from_dataptr(newd);
	np = UDB_CHUNK(n);
	np->type = cp_type;
	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
	/* fixup ptrs */
	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);

	if(!udb_alloc_free(alloc, r, osz))
		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
	return newd;
}
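
/* Illustrative sketch (compiled out) of udb_alloc_realloc: grow a chunk
 * from osz to nsz bytes.  The returned offset may differ from the old one
 * (relptrs into the chunk are fixed up by chunk_fix_ptrs); a return of 0
 * with nonzero nsz means the allocation failed and the old chunk is left
 * untouched.  The sizes are hypothetical. */
#if 0
static udb_void example_grow_chunk(udb_alloc* alloc, udb_void d,
	size_t osz, size_t nsz)
{
	udb_void nd = udb_alloc_realloc(alloc, d, osz, nsz);
	if(!nd && nsz != 0)
		log_msg(LOG_ERR, "example: realloc to %u failed",
			(unsigned)nsz);
	return nd;
}
#endif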

int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
{
	const uint64_t mb = 1024*1024;
	int exp = udb_alloc_exp_needed(sz);
	uint64_t esz;
	uint64_t want;
	if(exp == UDB_EXP_XL)
		esz = (sz&(mb-1))+mb;
	else	esz = (uint64_t)1<<exp;
	/* we need grow_end_calc to take into account alignment */
	want = grow_end_calc(alloc, exp) + esz*(num-1);
	assert(want >= alloc->udb->base_size);
	if(!udb_base_grow_and_remap(alloc->udb, want)) {
		log_msg(LOG_ERR, "failed to grow the specified amount");
		return 0;
	}
	return 1;
}
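
/* Illustrative sketch (compiled out) of udb_alloc_grow: pre-grow the file
 * so that a known number of equally sized records can be allocated
 * without repeated grow-and-remap operations.  The record size and count
 * are hypothetical. */
#if 0
static int example_reserve(udb_alloc* alloc)
{
	/* make room for roughly 1000 records of 64 bytes each */
	return udb_alloc_grow(alloc, 64, 1000);
}
#endif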

void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
{
	void* base = alloc->udb->base;
	udb_void f = chunk_from_dataptr(r);
	udb_chunk_d* fp = UDB_CHUNK(f);
	/* not the 'free' type, that must be set by allocation routines */
	assert(fp->type != udb_chunk_type_free);
	assert(tp != udb_chunk_type_free);
	fp->type = tp;
}

int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
{
	/* pointers are not valid before the header-size or after the
	 * used-region of the mmap */
	return ( (to+destsize) <= udb->base_size &&
		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
		(to+destsize) <= udb->alloc->disk->nextgrow);
}

int udb_valid_dataptr(udb_base* udb, udb_void to)
{
	void* base = udb->base;
	udb_void ch;
	int exp;
	uint64_t esz;
	/* our data chunks are aligned and at least 8 bytes */
	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
		return 0;
	/* get the chunk pointer */
	ch = chunk_from_dataptr(to);
	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
		return 0;
	/* check its size */
	exp = UDB_CHUNK(ch)->exp;
	if(exp == UDB_EXP_XL) {
		/* check XL chunk */
		uint64_t xlsz;
		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
			return 0;
		xlsz = UDB_XL_CHUNK(ch)->size;
		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
			return 0;
		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
			return 0;
		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
			!= xlsz)
			return 0;
		return 1;
	}
	/* check if regular chunk has matching end byte */
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
		return 0; /* cannot be a valid chunk */
	esz = (uint64_t)1<<exp;
	if(!udb_valid_offset(udb, ch+esz-1, 1))
		return 0;
	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
		return 0;
	return 1;
}

int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
{
	void* base = udb->base;
	udb_void p;
	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
		return 0;
	if(!udb_valid_dataptr(udb, to))
		return 0;
	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
	while(p) {
		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
			return 0;
		if(p == rptr)
			return 1;
		p = UDB_REL_PTR(p)->next;
	}
	return 0;
}
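
/* Illustrative sketch (compiled out): verifying data read back from the
 * file.  An offset is only trusted if it points at a valid chunk, and a
 * relptr is only trusted if it is linked on that chunk's ptrlist.  The
 * names rec_off and rptr_off are hypothetical. */
#if 0
static int example_verify(udb_base* udb, udb_void rec_off, udb_void rptr_off)
{
	if(!udb_valid_dataptr(udb, rec_off))
		return 0; /* not a plausible chunk data pointer */
	if(!udb_valid_rptr(udb, rptr_off, rec_off))
		return 0; /* relptr not linked to that chunk */
	return 1;
}
#endif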

void udb_rel_ptr_init(udb_rel_ptr* ptr)
{
	memset(ptr, 0, sizeof(*ptr));
}

void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
{
	if(!ptr->data)
		return;
	if(ptr->prev) {
		UDB_REL_PTR(ptr->prev)->next = ptr->next;
	} else {
		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
	}
	if(ptr->next) {
		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
	}
}

void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
{
	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
	ptr->prev = 0;
	ptr->next = chunk->ptrlist;
	if(ptr->next)
		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
	ptr->data = to;
}

void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
{
	assert(to == 0 || to > 64);
	udb_rel_ptr_unlink(base, ptr);
	if(to)
		udb_rel_ptr_link(base, ptr, to);
	else	ptr->data = to;
}
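
/* Illustrative sketch (compiled out) of an on-disk structure that refers
 * to another chunk through a udb_rel_ptr.  Setting the pointer with
 * udb_rel_ptr_set links it onto the target chunk's ptrlist, so it is
 * fixed up when the target chunk moves during compaction or realloc.
 * The struct and the offsets are hypothetical. */
#if 0
struct example_node {
	udb_rel_ptr child;	/* reference to another chunk's data */
	uint64_t payload;
};
static void example_link_child(void* base, udb_void node_d, udb_void child_d)
{
	struct example_node* n = (struct example_node*)UDB_REL(base, node_d);
	/* unlinks any old target, then links onto child_d's ptrlist */
	udb_rel_ptr_set(base, &n->child, child_d);
}
#endif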

void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
{
	udb_void p = list;
	while(p) {
		UDB_REL_PTR(p)->data = to;
		p = UDB_REL_PTR(p)->next;
	}
}

#ifdef UDB_CHECK
/** check that all pointers are validly chained */
static void
udb_check_ptrs_valid(udb_base* udb)
{
	size_t i;
	udb_ptr* p, *prev;
	for(i=0; i<udb->ram_size; i++) {
		prev = NULL;
		for(p=udb->ram_hash[i]; p; p=p->next) {
			assert(p->prev == prev);
			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
				== i);
			assert(p->base == &udb->base);
			prev = p;
		}
	}
}
#endif /* UDB_CHECK */

void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
{
#ifdef UDB_CHECK
	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
#endif
	memset(ptr, 0, sizeof(*ptr));
	ptr->base = &udb->base;
}

void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
{
	assert(newval == 0 || newval > 64);
	if(ptr->data)
		udb_base_unlink_ptr(udb, ptr);
	ptr->data = newval;
	if(newval)
		udb_base_link_ptr(udb, ptr);
}

int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
	size_t sz)
{
	udb_void r;
	r = udb_alloc_space(udb->alloc, sz);
	if(!r) return 0;
	udb_alloc_set_type(udb->alloc, r, type);
	udb_ptr_init(ptr, udb);
	udb_ptr_set(ptr, udb, r);
	return 1;
}

void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
{
	if(ptr->data) {
		udb_void d = ptr->data;
		udb_ptr_set(ptr, udb, 0);
		udb_alloc_free(udb->alloc, d, sz);
	}
}
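
/* Illustrative sketch (compiled out) of the in-RAM udb_ptr lifecycle:
 * allocate a typed chunk, write to it through the re-mmap safe base
 * indirection, and release it again.  The chunk type udb_chunk_type_data
 * and the size are assumptions for the example. */
#if 0
static void example_ptr_cycle(udb_base* udb)
{
	udb_ptr p;
	size_t sz = 128;
	if(!udb_ptr_alloc_space(&p, udb, udb_chunk_type_data, sz))
		return; /* allocation failed */
	/* *p.base is kept up to date by the udb on re-mmap */
	memset(UDB_REL(*p.base, p.data), 0, sz);
	udb_ptr_free_space(&p, udb, sz);
}
#endif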

udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
{
	udb_void f;
	if(!ptr || ptr->data == 0) return udb_chunk_type_internal; /* something bad */
	f = chunk_from_dataptr(ptr->data);
	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
}