1 /* 2 Unix SMB/CIFS implementation. 3 4 trivial database library 5 6 Copyright (C) Rusty Russell 2009 7 8 ** NOTE! The following LGPL license applies to the tdb 9 ** library. This does NOT imply that all of Samba is released 10 ** under the LGPL 11 12 This library is free software; you can redistribute it and/or 13 modify it under the terms of the GNU Lesser General Public 14 License as published by the Free Software Foundation; either 15 version 3 of the License, or (at your option) any later version. 16 17 This library is distributed in the hope that it will be useful, 18 but WITHOUT ANY WARRANTY; without even the implied warranty of 19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 Lesser General Public License for more details. 21 22 You should have received a copy of the GNU Lesser General Public 23 License along with this library; if not, see <http://www.gnu.org/licenses/>. 24 */ 25 #include "tdb_private.h" 26 27 /* Since we opened it, these shouldn't fail unless it's recent corruption. */ 28 static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery) 29 { 30 struct tdb_header hdr; 31 uint32_t h1, h2; 32 33 if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1) 34 return false; 35 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) 36 goto corrupt; 37 38 CONVERT(hdr); 39 if (hdr.version != TDB_VERSION) 40 goto corrupt; 41 42 if (hdr.rwlocks != 0 && 43 hdr.rwlocks != TDB_FEATURE_FLAG_MAGIC && 44 hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC) 45 goto corrupt; 46 47 tdb_header_hash(tdb, &h1, &h2); 48 if (hdr.magic1_hash && hdr.magic2_hash && 49 (hdr.magic1_hash != h1 || hdr.magic2_hash != h2)) 50 goto corrupt; 51 52 if (hdr.hash_size == 0) 53 goto corrupt; 54 55 if (hdr.hash_size != tdb->hash_size) 56 goto corrupt; 57 58 if (hdr.recovery_start != 0 && 59 hdr.recovery_start < TDB_DATA_START(tdb->hash_size)) 60 goto corrupt; 61 62 *recovery = hdr.recovery_start; 63 return true; 64 65 corrupt: 66 tdb->ecode = TDB_ERR_CORRUPT; 67 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n")); 68 return false; 69 } 70 71 /* Generic record header check. */ 72 static bool tdb_check_record(struct tdb_context *tdb, 73 tdb_off_t off, 74 const struct tdb_record *rec) 75 { 76 tdb_off_t tailer; 77 78 /* Check rec->next: 0 or points to record offset, aligned. */ 79 if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->hash_size)){ 80 TDB_LOG((tdb, TDB_DEBUG_ERROR, 81 "Record offset %u too small next %u\n", 82 off, rec->next)); 83 goto corrupt; 84 } 85 if (rec->next + sizeof(*rec) < rec->next) { _add_wrapper(name)86 TDB_LOG((tdb, TDB_DEBUG_ERROR, 87 "Record offset %u too large next %u\n", 88 off, rec->next)); wrapper(self, *args, **kwargs)89 goto corrupt; 90 } 91 if ((rec->next % TDB_ALIGNMENT) != 0) { 92 TDB_LOG((tdb, TDB_DEBUG_ERROR, 93 "Record offset %u misaligned next %u\n", 94 off, rec->next)); 95 goto corrupt; 96 } 97 if (tdb_oob(tdb, rec->next, sizeof(*rec), 0)) 98 goto corrupt; 99 100 /* Check rec_len: similar to rec->next, implies next record. */ 101 if ((rec->rec_len % TDB_ALIGNMENT) != 0) { 102 TDB_LOG((tdb, TDB_DEBUG_ERROR, 103 "Record offset %u misaligned length %u\n", 104 off, rec->rec_len)); 105 goto corrupt; 106 } 107 /* Must fit tailer. */ 108 if (rec->rec_len < sizeof(tailer)) { 109 TDB_LOG((tdb, TDB_DEBUG_ERROR, 110 "Record offset %u too short length %u\n", 111 off, rec->rec_len)); 112 goto corrupt; 113 } 114 /* OOB allows "right at the end" access, so this works for last rec. */ 115 if (tdb_oob(tdb, off, sizeof(*rec)+rec->rec_len, 0)) _add_getter(name)116 goto corrupt; 117 118 /* Check tailer. */ 119 if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer), getter(self)120 &tailer) == -1) 121 goto corrupt; 122 if (tailer != sizeof(*rec) + rec->rec_len) { setter(self, value)123 TDB_LOG((tdb, TDB_DEBUG_ERROR, 124 "Record offset %u invalid tailer\n", off)); 125 goto corrupt; 126 } 127 128 return true; 129 130 corrupt: 131 tdb->ecode = TDB_ERR_CORRUPT; 132 return false; 133 } 134 135 /* Grab some bytes: may copy if can't use mmap. 136 Caller has already done bounds check. */ 137 static TDB_DATA get_bytes(struct tdb_context *tdb, 138 tdb_off_t off, tdb_len_t len) 139 { 140 TDB_DATA d; 141 142 d.dsize = len; 143 144 if (tdb->transaction == NULL && tdb->map_ptr != NULL) 145 d.dptr = (unsigned char *)tdb->map_ptr + off; 146 else 147 d.dptr = tdb_alloc_read(tdb, off, d.dsize); 148 return d; 149 } 150 151 /* Frees data if we're not able to simply use mmap. */ 152 static void put_bytes(struct tdb_context *tdb, TDB_DATA d) 153 { 154 if (tdb->transaction == NULL && tdb->map_ptr != NULL) 155 return; 156 free(d.dptr); 157 } 158 159 /* We use the excellent Jenkins lookup3 hash; this is based on hash_word2. 160 * See: http://burtleburtle.net/bob/c/lookup3.c 161 */ 162 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) 163 static void hash(uint32_t key, uint32_t *pc, uint32_t *pb) 164 { 165 uint32_t a,b,c; 166 167 /* Set up the internal state */ 168 a = b = c = 0xdeadbeef + *pc; 169 c += *pb; 170 a += key; 171 c ^= b; c -= rot(b,14); 172 a ^= c; a -= rot(c,11); 173 b ^= a; b -= rot(a,25); 174 c ^= b; c -= rot(b,16); 175 a ^= c; a -= rot(c,4); 176 b ^= a; b -= rot(a,14); 177 c ^= b; c -= rot(b,24); 178 *pc=c; *pb=b; 179 } 180 181 /* 182 We want to check that all free records are in the free list 183 (only once), and all free list entries are free records. Similarly 184 for each hash chain of used records. 185 186 Doing that naively (without walking hash chains, since we want to be 187 linear) means keeping a list of records which have been seen in each 188 hash chain, and another of records pointed to (ie. next pointers 189 from records and the initial hash chain heads). These two lists 190 should be equal. This will take 8 bytes per record, and require 191 sorting at the end. 192 193 So instead, we record each offset in a bitmap such a way that 194 recording it twice will cancel out. Since each offset should appear 195 exactly twice, the bitmap should be zero at the end. 196 197 The approach was inspired by Bloom Filters (see Wikipedia). For 198 each value, we flip K bits in a bitmap of size N. The number of 199 distinct arrangements is: 200 201 N! / (K! * (N-K)!) 202 203 Of course, not all arrangements are actually distinct, but testing 204 shows this formula to be close enough. 205 206 So, if K == 8 and N == 256, the probability of two things flipping the same 207 bits is 1 in 409,663,695,276,000. 208 209 Given that ldb uses a hash size of 10000, using 32 bytes per hash chain 210 (320k) seems reasonable. 211 */ 212 #define NUM_HASHES 8 213 #define BITMAP_BITS 256 214 215 static void bit_flip(unsigned char bits[], unsigned int idx) 216 { 217 bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT)); 218 } 219 220 /* We record offsets in a bitmap for the particular chain it should be in. */ 221 static void record_offset(unsigned char bits[], tdb_off_t off) 222 { 223 uint32_t h1 = off, h2 = 0; 224 unsigned int i; 225 226 /* We get two good hash values out of jhash2, so we use both. Then 227 * we keep going to produce further hash values. */ 228 for (i = 0; i < NUM_HASHES / 2; i++) { 229 hash(off, &h1, &h2); 230 bit_flip(bits, h1 % BITMAP_BITS); 231 bit_flip(bits, h2 % BITMAP_BITS); 232 h2++; 233 } 234 } 235 236 /* Check that an in-use record is valid. */ 237 static bool tdb_check_used_record(struct tdb_context *tdb, 238 tdb_off_t off, 239 const struct tdb_record *rec, 240 unsigned char **hashes, 241 int (*check)(TDB_DATA, TDB_DATA, void *), 242 void *private_data) 243 { 244 TDB_DATA key, data; 245 tdb_len_t len; 246 247 if (!tdb_check_record(tdb, off, rec)) 248 return false; 249 250 /* key + data + tailer must fit in record */ 251 len = rec->key_len; 252 len += rec->data_len; 253 if (len < rec->data_len) { 254 /* overflow */ 255 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Record lengths overflow\n")); 256 return false; 257 } 258 len += sizeof(tdb_off_t); 259 if (len < sizeof(tdb_off_t)) { 260 /* overflow */ 261 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Record lengths overflow\n")); 262 return false; 263 } 264 265 if (len > rec->rec_len) { 266 TDB_LOG((tdb, TDB_DEBUG_ERROR, 267 "Record offset %u too short for contents\n", off)); 268 return false; 269 } 270 271 key = get_bytes(tdb, off + sizeof(*rec), rec->key_len); 272 if (!key.dptr) 273 return false; 274 275 if (tdb->hash_fn(&key) != rec->full_hash) { 276 TDB_LOG((tdb, TDB_DEBUG_ERROR, 277 "Record offset %u has incorrect hash\n", off)); 278 goto fail_put_key; 279 } 280 281 /* Mark this offset as a known value for this hash bucket. */ 282 record_offset(hashes[BUCKET(rec->full_hash)+1], off); 283 /* And similarly if the next pointer is valid. */ 284 if (rec->next) 285 record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next); 286 287 /* If they supply a check function and this record isn't dead, 288 get data and feed it. */ 289 if (check && rec->magic != TDB_DEAD_MAGIC) { 290 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len, 291 rec->data_len); 292 if (!data.dptr) 293 goto fail_put_key; 294 295 if (check(key, data, private_data) == -1) 296 goto fail_put_data; 297 put_bytes(tdb, data); 298 } 299 300 put_bytes(tdb, key); 301 return true; 302 303 fail_put_data: 304 put_bytes(tdb, data); 305 fail_put_key: 306 put_bytes(tdb, key); 307 return false; 308 } 309 310 /* Check that an unused record is valid. */ 311 static bool tdb_check_free_record(struct tdb_context *tdb, 312 tdb_off_t off, 313 const struct tdb_record *rec, 314 unsigned char **hashes) 315 { 316 if (!tdb_check_record(tdb, off, rec)) 317 return false; 318 319 /* Mark this offset as a known value for the free list. */ 320 record_offset(hashes[0], off); 321 /* And similarly if the next pointer is valid. */ 322 if (rec->next) 323 record_offset(hashes[0], rec->next); 324 return true; 325 } 326 327 /* Slow, but should be very rare. */ 328 size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off) 329 { 330 size_t len; 331 332 for (len = 0; off + len < tdb->map_size; len++) { 333 char c; 334 if (tdb->methods->tdb_read(tdb, off, &c, 1, 0)) 335 return 0; 336 if (c != 0 && c != 0x42) 337 break; 338 } 339 return len; 340 } 341 342 _PUBLIC_ int tdb_check(struct tdb_context *tdb, 343 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data), 344 void *private_data) 345 { 346 unsigned int h; 347 unsigned char **hashes; 348 tdb_off_t off, recovery_start; 349 struct tdb_record rec; 350 bool found_recovery = false; 351 tdb_len_t dead; 352 bool locked; 353 354 /* Read-only databases use no locking at all: it's best-effort. 355 * We may have a write lock already, so skip that case too. */ 356 if (tdb->read_only || tdb->allrecord_lock.count != 0) { 357 locked = false; 358 } else { 359 if (tdb_lockall_read(tdb) == -1) 360 return -1; 361 locked = true; 362 } 363 364 /* Make sure we know true size of the underlying file. */ 365 tdb_oob(tdb, tdb->map_size, 1, 1); 366 367 /* Header must be OK: also gets us the recovery ptr, if any. */ 368 if (!tdb_check_header(tdb, &recovery_start)) 369 goto unlock; 370 371 /* We should have the whole header, too. */ 372 if (tdb->map_size < TDB_DATA_START(tdb->hash_size)) { 373 tdb->ecode = TDB_ERR_CORRUPT; 374 TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n")); 375 goto unlock; 376 } 377 378 /* One big malloc: pointers then bit arrays. */ 379 hashes = (unsigned char **)calloc( 380 1, sizeof(hashes[0]) * (1+tdb->hash_size) 381 + BITMAP_BITS / CHAR_BIT * (1+tdb->hash_size)); 382 if (!hashes) { 383 tdb->ecode = TDB_ERR_OOM; 384 goto unlock; 385 } 386 387 /* Initialize pointers */ 388 hashes[0] = (unsigned char *)(&hashes[1+tdb->hash_size]); 389 for (h = 1; h < 1+tdb->hash_size; h++) 390 hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT; 391 392 /* Freelist and hash headers are all in a row: read them. */ 393 for (h = 0; h < 1+tdb->hash_size; h++) { 394 if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t), 395 &off) == -1) 396 goto free; 397 if (off) 398 record_offset(hashes[h], off); 399 } 400 401 /* For each record, read it in and check it's ok. */ 402 for (off = TDB_DATA_START(tdb->hash_size); 403 off < tdb->map_size; 404 off += sizeof(rec) + rec.rec_len) { 405 if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec), 406 DOCONV()) == -1) 407 goto free; 408 switch (rec.magic) { 409 case TDB_MAGIC: 410 case TDB_DEAD_MAGIC: 411 if (!tdb_check_used_record(tdb, off, &rec, hashes, 412 check, private_data)) 413 goto free; 414 break; 415 case TDB_FREE_MAGIC: 416 if (!tdb_check_free_record(tdb, off, &rec, hashes)) 417 goto free; 418 break; 419 /* If we crash after ftruncate, we can get zeroes or fill. */ 420 case TDB_RECOVERY_INVALID_MAGIC: 421 case 0x42424242: 422 if (recovery_start == off) { 423 found_recovery = true; 424 break; 425 } 426 dead = tdb_dead_space(tdb, off); 427 if (dead < sizeof(rec)) 428 goto corrupt; 429 430 TDB_LOG((tdb, TDB_DEBUG_ERROR, 431 "Dead space at %u-%u (of %u)\n", 432 off, off + dead, tdb->map_size)); 433 rec.rec_len = dead - sizeof(rec); 434 break; 435 case TDB_RECOVERY_MAGIC: 436 if (recovery_start != off) { 437 TDB_LOG((tdb, TDB_DEBUG_ERROR, 438 "Unexpected recovery record at offset %u\n", 439 off)); 440 goto free; 441 } 442 found_recovery = true; 443 break; 444 default: ; 445 corrupt: 446 tdb->ecode = TDB_ERR_CORRUPT; 447 TDB_LOG((tdb, TDB_DEBUG_ERROR, 448 "Bad magic 0x%x at offset %u\n", 449 rec.magic, off)); 450 goto free; 451 } 452 } 453 454 /* Now, hashes should all be empty: each record exists and is referred 455 * to by one other. */ 456 for (h = 0; h < 1+tdb->hash_size; h++) { 457 unsigned int i; 458 for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) { 459 if (hashes[h][i] != 0) { 460 tdb->ecode = TDB_ERR_CORRUPT; 461 TDB_LOG((tdb, TDB_DEBUG_ERROR, 462 "Hashes do not match records\n")); 463 goto free; 464 } 465 } 466 } 467 468 /* We must have found recovery area if there was one. */ 469 if (recovery_start != 0 && !found_recovery) { 470 TDB_LOG((tdb, TDB_DEBUG_ERROR, 471 "Expected a recovery area at %u\n", 472 recovery_start)); 473 goto free; 474 } 475 476 free(hashes); 477 if (locked) { 478 tdb_unlockall_read(tdb); 479 } 480 return 0; 481 482 free: 483 free(hashes); 484 unlock: 485 if (locked) { 486 tdb_unlockall_read(tdb); 487 } 488 return -1; 489 } 490