1 /* $NetBSD: vfs_wapbl.c,v 1.39 2011/01/08 20:37:05 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * This implements file system independent write ahead filesystem logging. 
34 */ 35 36 #define WAPBL_INTERNAL 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.39 2011/01/08 20:37:05 christos Exp $"); 40 41 #include <sys/param.h> 42 #include <sys/bitops.h> 43 44 #ifdef _KERNEL 45 #include <sys/param.h> 46 #include <sys/namei.h> 47 #include <sys/proc.h> 48 #include <sys/sysctl.h> 49 #include <sys/uio.h> 50 #include <sys/vnode.h> 51 #include <sys/file.h> 52 #include <sys/malloc.h> 53 #include <sys/module.h> 54 #include <sys/resourcevar.h> 55 #include <sys/conf.h> 56 #include <sys/mount.h> 57 #include <sys/kernel.h> 58 #include <sys/kauth.h> 59 #include <sys/mutex.h> 60 #include <sys/atomic.h> 61 #include <sys/wapbl.h> 62 #include <sys/wapbl_replay.h> 63 64 #include <miscfs/specfs/specdev.h> 65 66 #if 0 /* notyet */ 67 #define wapbl_malloc(s) kmem_alloc((s), KM_SLEEP) 68 #define wapbl_free(a, s) kmem_free((a), (s)) 69 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP) 70 #else 71 MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging"); 72 #define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK) 73 #define wapbl_free(a, s) free((a), M_WAPBL) 74 #define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO) 75 #endif 76 77 static struct sysctllog *wapbl_sysctl; 78 static int wapbl_flush_disk_cache = 1; 79 static int wapbl_verbose_commit = 0; 80 81 #else /* !_KERNEL */ 82 #include <assert.h> 83 #include <errno.h> 84 #include <stdio.h> 85 #include <stdbool.h> 86 #include <stdlib.h> 87 #include <string.h> 88 89 #include <sys/time.h> 90 #include <sys/wapbl.h> 91 #include <sys/wapbl_replay.h> 92 93 #define KDASSERT(x) assert(x) 94 #define KASSERT(x) assert(x) 95 #define wapbl_malloc(s) malloc(s) 96 #define wapbl_free(a, s) free(a) 97 #define wapbl_calloc(n, s) calloc((n), (s)) 98 99 #endif /* !_KERNEL */ 100 101 /* 102 * INTERNAL DATA STRUCTURES 103 */ 104 105 /* 106 * This structure holds per-mount log information. 
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size; 	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * see assertions in wapbl_advance() for other boundary conditions.
	 * only truncate moves the tail, except when flush sets it to
	 * wl_header_size only flush moves the head, except when truncate
	 * sets it to 0.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: por que?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
	/*
	 * NOTE(review): "#if _KERNEL" differs from the "#ifdef _KERNEL"
	 * used everywhere else in this file; it only works because the
	 * build defines _KERNEL to a nonzero value -- confirm and/or
	 * normalize to #ifdef.
	 */
#if _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes; /* m:	Amount of space available for
					reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	/* Parallel arrays of pending block deallocations, wl_dealloccnt
	 * entries used of wl_dealloclim allocated. */
	daddr_t *wl_deallocblks;/* lm:	address of block */
	int *wl_dealloclens;	/* lm:	size of block */
	int wl_dealloccnt;	/* lm:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ???
	 */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

/* Forward declarations of the on-disk record writers (defined below). */
static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

/* Circular-log space accounting helpers (defined below). */
static inline size_t wapbl_space_free(size_t avail, off_t head,
	off_t tail);
static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
/* One tracked allocated-but-unlinked inode, chained in wl_inohash. */
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging. If set, the log will
 * only be truncated when necessary.
248 */ 249 int wapbl_lazy_truncate = 0; 250 251 struct wapbl_ops wapbl_ops = { 252 .wo_wapbl_discard = wapbl_discard, 253 .wo_wapbl_replay_isopen = wapbl_replay_isopen1, 254 .wo_wapbl_replay_can_read = wapbl_replay_can_read, 255 .wo_wapbl_replay_read = wapbl_replay_read, 256 .wo_wapbl_add_buf = wapbl_add_buf, 257 .wo_wapbl_remove_buf = wapbl_remove_buf, 258 .wo_wapbl_resize_buf = wapbl_resize_buf, 259 .wo_wapbl_begin = wapbl_begin, 260 .wo_wapbl_end = wapbl_end, 261 .wo_wapbl_junlock_assert= wapbl_junlock_assert, 262 263 /* XXX: the following is only used to say "this is a wapbl buf" */ 264 .wo_wapbl_biodone = wapbl_biodone, 265 }; 266 267 static int 268 wapbl_sysctl_init(void) 269 { 270 int rv; 271 const struct sysctlnode *rnode, *cnode; 272 273 wapbl_sysctl = NULL; 274 275 rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode, 276 CTLFLAG_PERMANENT, 277 CTLTYPE_NODE, "vfs", NULL, 278 NULL, 0, NULL, 0, 279 CTL_VFS, CTL_EOL); 280 if (rv) 281 return rv; 282 283 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode, 284 CTLFLAG_PERMANENT, 285 CTLTYPE_NODE, "wapbl", 286 SYSCTL_DESCR("WAPBL journaling options"), 287 NULL, 0, NULL, 0, 288 CTL_CREATE, CTL_EOL); 289 if (rv) 290 return rv; 291 292 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 293 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 294 CTLTYPE_INT, "flush_disk_cache", 295 SYSCTL_DESCR("flush disk cache"), 296 NULL, 0, &wapbl_flush_disk_cache, 0, 297 CTL_CREATE, CTL_EOL); 298 if (rv) 299 return rv; 300 301 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 302 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 303 CTLTYPE_INT, "verbose_commit", 304 SYSCTL_DESCR("show time and size of wapbl log commits"), 305 NULL, 0, &wapbl_verbose_commit, 0, 306 CTL_CREATE, CTL_EOL); 307 return rv; 308 } 309 310 static void 311 wapbl_init(void) 312 { 313 malloc_type_attach(M_WAPBL); 314 wapbl_sysctl_init(); 315 } 316 317 #ifdef notyet 318 static int 319 wapbl_fini(bool interface) 320 { 321 if (aio_sysctl != NULL) 322 
sysctl_teardown(&aio_sysctl); 323 return 0; 324 } 325 #endif 326 327 static int 328 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr) 329 { 330 int error, i; 331 332 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 333 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt)); 334 335 /* 336 * Its only valid to reuse the replay log if its 337 * the same as the new log we just opened. 338 */ 339 KDASSERT(!wapbl_replay_isopen(wr)); 340 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev); 341 KASSERT(wl->wl_logpbn == wr->wr_logpbn); 342 KASSERT(wl->wl_circ_size == wr->wr_circ_size); 343 KASSERT(wl->wl_circ_off == wr->wr_circ_off); 344 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift); 345 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift); 346 347 wl->wl_wc_header->wc_generation = wr->wr_generation + 1; 348 349 for (i = 0; i < wr->wr_inodescnt; i++) 350 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, 351 wr->wr_inodes[i].wr_imode); 352 353 /* Make sure new transaction won't overwrite old inodes list */ 354 KDASSERT(wapbl_transaction_len(wl) <= 355 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, 356 wr->wr_inodestail)); 357 358 wl->wl_head = wl->wl_tail = wr->wr_inodeshead; 359 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = 360 wapbl_transaction_len(wl); 361 362 error = wapbl_write_inodes(wl, &wl->wl_head); 363 if (error) 364 return error; 365 366 KASSERT(wl->wl_head != wl->wl_tail); 367 KASSERT(wl->wl_head != 0); 368 369 return 0; 370 } 371 372 int 373 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, 374 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, 375 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) 376 { 377 struct wapbl *wl; 378 struct vnode *devvp; 379 daddr_t logpbn; 380 int error; 381 int log_dev_bshift = ilog2(blksize); 382 int fs_dev_bshift = log_dev_bshift; 383 int run; 384 385 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 386 " count=%zu blksize=%zu\n", 
	    vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	/* Log blocks must be a nonzero multiple of the device block size. */
	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	/* Translate the file offset into device + physical block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a mulitple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem.
	 */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);

	/* Parallel arrays for pending deallocations (block addr + length). */
	wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_malloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
511 */ 512 if (wr && wr->wr_inodescnt) { 513 error = wapbl_start_flush_inodes(wl, wr); 514 if (error) 515 goto errout; 516 } 517 518 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); 519 if (error) { 520 goto errout; 521 } 522 523 *wlp = wl; 524 #if defined(WAPBL_DEBUG) 525 wapbl_debug_wl = wl; 526 #endif 527 528 return 0; 529 errout: 530 wapbl_discard(wl); 531 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 532 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 533 wapbl_free(wl->wl_deallocblks, 534 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 535 wapbl_free(wl->wl_dealloclens, 536 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 537 wapbl_inodetrk_free(wl); 538 wapbl_free(wl, sizeof(*wl)); 539 540 return error; 541 } 542 543 /* 544 * Like wapbl_flush, only discards the transaction 545 * completely 546 */ 547 548 void 549 wapbl_discard(struct wapbl *wl) 550 { 551 struct wapbl_entry *we; 552 struct buf *bp; 553 int i; 554 555 /* 556 * XXX we may consider using upgrade here 557 * if we want to call flush from inside a transaction 558 */ 559 rw_enter(&wl->wl_rwlock, RW_WRITER); 560 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 561 wl->wl_dealloccnt); 562 563 #ifdef WAPBL_DEBUG_PRINT 564 { 565 pid_t pid = -1; 566 lwpid_t lid = -1; 567 if (curproc) 568 pid = curproc->p_pid; 569 if (curlwp) 570 lid = curlwp->l_lid; 571 #ifdef WAPBL_DEBUG_BUFBYTES 572 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 573 ("wapbl_discard: thread %d.%d discarding " 574 "transaction\n" 575 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 576 "deallocs=%d inodes=%d\n" 577 "\terrcnt = %u, reclaimable=%zu reserved=%zu " 578 "unsynced=%zu\n", 579 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 580 wl->wl_bcount, wl->wl_dealloccnt, 581 wl->wl_inohashcnt, wl->wl_error_count, 582 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 583 wl->wl_unsynced_bufbytes)); 584 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 585 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 586 ("\tentry: bufcount = 
%zu, reclaimable = %zu, " 587 "error = %d, unsynced = %zu\n", 588 we->we_bufcount, we->we_reclaimable_bytes, 589 we->we_error, we->we_unsynced_bufbytes)); 590 } 591 #else /* !WAPBL_DEBUG_BUFBYTES */ 592 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 593 ("wapbl_discard: thread %d.%d discarding transaction\n" 594 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 595 "deallocs=%d inodes=%d\n" 596 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 597 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 598 wl->wl_bcount, wl->wl_dealloccnt, 599 wl->wl_inohashcnt, wl->wl_error_count, 600 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 601 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 602 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 603 ("\tentry: bufcount = %zu, reclaimable = %zu, " 604 "error = %d\n", 605 we->we_bufcount, we->we_reclaimable_bytes, 606 we->we_error)); 607 } 608 #endif /* !WAPBL_DEBUG_BUFBYTES */ 609 } 610 #endif /* WAPBL_DEBUG_PRINT */ 611 612 for (i = 0; i <= wl->wl_inohashmask; i++) { 613 struct wapbl_ino_head *wih; 614 struct wapbl_ino *wi; 615 616 wih = &wl->wl_inohash[i]; 617 while ((wi = LIST_FIRST(wih)) != NULL) { 618 LIST_REMOVE(wi, wi_hash); 619 pool_put(&wapbl_ino_pool, wi); 620 KASSERT(wl->wl_inohashcnt > 0); 621 wl->wl_inohashcnt--; 622 } 623 } 624 625 /* 626 * clean buffer list 627 */ 628 mutex_enter(&bufcache_lock); 629 mutex_enter(&wl->wl_mtx); 630 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 631 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 632 /* 633 * The buffer will be unlocked and 634 * removed from the transaction in brelse 635 */ 636 mutex_exit(&wl->wl_mtx); 637 brelsel(bp, 0); 638 mutex_enter(&wl->wl_mtx); 639 } 640 } 641 mutex_exit(&wl->wl_mtx); 642 mutex_exit(&bufcache_lock); 643 644 /* 645 * Remove references to this wl from wl_entries, free any which 646 * no longer have buffers, others will be freed in wapbl_biodone 647 * when they no longer have any buffers. 
648 */ 649 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { 650 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 651 /* XXX should we be accumulating wl_error_count 652 * and increasing reclaimable bytes ? */ 653 we->we_wapbl = NULL; 654 if (we->we_bufcount == 0) { 655 #ifdef WAPBL_DEBUG_BUFBYTES 656 KASSERT(we->we_unsynced_bufbytes == 0); 657 #endif 658 wapbl_free(we, sizeof(*we)); 659 } 660 } 661 662 /* Discard list of deallocs */ 663 wl->wl_dealloccnt = 0; 664 /* XXX should we clear wl_reserved_bytes? */ 665 666 KASSERT(wl->wl_bufbytes == 0); 667 KASSERT(wl->wl_bcount == 0); 668 KASSERT(wl->wl_bufcount == 0); 669 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 670 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 671 KASSERT(wl->wl_inohashcnt == 0); 672 673 rw_exit(&wl->wl_rwlock); 674 } 675 676 int 677 wapbl_stop(struct wapbl *wl, int force) 678 { 679 struct vnode *vp; 680 int error; 681 682 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); 683 error = wapbl_flush(wl, 1); 684 if (error) { 685 if (force) 686 wapbl_discard(wl); 687 else 688 return error; 689 } 690 691 /* Unlinked inodes persist after a flush */ 692 if (wl->wl_inohashcnt) { 693 if (force) { 694 wapbl_discard(wl); 695 } else { 696 return EBUSY; 697 } 698 } 699 700 KASSERT(wl->wl_bufbytes == 0); 701 KASSERT(wl->wl_bcount == 0); 702 KASSERT(wl->wl_bufcount == 0); 703 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 704 KASSERT(wl->wl_dealloccnt == 0); 705 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 706 KASSERT(wl->wl_inohashcnt == 0); 707 708 vp = wl->wl_logvp; 709 710 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 711 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 712 wapbl_free(wl->wl_deallocblks, 713 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 714 wapbl_free(wl->wl_dealloclens, 715 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 716 wapbl_inodetrk_free(wl); 717 718 cv_destroy(&wl->wl_reclaimable_cv); 719 mutex_destroy(&wl->wl_mtx); 720 rw_destroy(&wl->wl_rwlock); 721 wapbl_free(wl, 
	    sizeof(*wl));

	return 0;
}

/*
 * Perform one synchronous raw read or write of len bytes at physical
 * block pbn on devvp.  flags is exactly B_WRITE or B_READ.  Returns
 * the biowait() error, if any.
 */
static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	/* Account the I/O against the calling process. */
	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(&devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(&devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

/* Synchronous raw write convenience wrapper around wapbl_doio(). */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

/* Synchronous raw read convenience wrapper around wapbl_doio(). */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * Off is byte offset returns new offset for next write
 * handles log wraparound
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	/* len must be a whole number of log device blocks. */
	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		/* Write the tail piece up to the end of the circle... */
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_write(data, slen, wl->wl_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	/* ...then the remainder (or everything) from 'off'. */
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_write(data, len, wl->wl_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/

/*
 * Open a journal transaction: possibly force a flush when the
 * in-memory transaction is getting large, then take the transaction
 * rwlock as reader.  Paired with wapbl_end().
 */
int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
847 */ 848 mutex_enter(&wl->wl_mtx); 849 lockcount = wl->wl_lock_count; 850 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > 851 wl->wl_bufbytes_max / 2) || 852 ((wl->wl_bufcount + (lockcount * 10)) > 853 wl->wl_bufcount_max / 2) || 854 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) || 855 (wl->wl_dealloccnt >= 856 (wl->wl_dealloclim - (wl->wl_dealloclim >> 8))); 857 mutex_exit(&wl->wl_mtx); 858 859 if (doflush) { 860 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 861 ("force flush lockcnt=%d bufbytes=%zu " 862 "(max=%zu) bufcount=%zu (max=%zu) " 863 "dealloccnt %d (lim=%d)\n", 864 lockcount, wl->wl_bufbytes, 865 wl->wl_bufbytes_max, wl->wl_bufcount, 866 wl->wl_bufcount_max, 867 wl->wl_dealloccnt, wl->wl_dealloclim)); 868 } 869 870 if (doflush) { 871 int error = wapbl_flush(wl, 0); 872 if (error) 873 return error; 874 } 875 876 rw_enter(&wl->wl_rwlock, RW_READER); 877 mutex_enter(&wl->wl_mtx); 878 wl->wl_lock_count++; 879 mutex_exit(&wl->wl_mtx); 880 881 #if defined(WAPBL_DEBUG_PRINT) 882 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 883 ("wapbl_begin thread %d.%d with bufcount=%zu " 884 "bufbytes=%zu bcount=%zu at %s:%d\n", 885 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 886 wl->wl_bufbytes, wl->wl_bcount, file, line)); 887 #endif 888 889 return 0; 890 } 891 892 void 893 wapbl_end(struct wapbl *wl) 894 { 895 896 #if defined(WAPBL_DEBUG_PRINT) 897 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 898 ("wapbl_end thread %d.%d with bufcount=%zu " 899 "bufbytes=%zu bcount=%zu\n", 900 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 901 wl->wl_bufbytes, wl->wl_bcount)); 902 #endif 903 904 mutex_enter(&wl->wl_mtx); 905 KASSERT(wl->wl_lock_count > 0); 906 wl->wl_lock_count--; 907 mutex_exit(&wl->wl_mtx); 908 909 rw_exit(&wl->wl_rwlock); 910 } 911 912 void 913 wapbl_add_buf(struct wapbl *wl, struct buf * bp) 914 { 915 916 KASSERT(bp->b_cflags & BC_BUSY); 917 KASSERT(bp->b_vp); 918 919 wapbl_jlock_assert(wl); 920 921 #if 0 922 /* 923 * XXX this might be an issue for swapfiles. 
924 * see uvm_swap.c:1702 925 * 926 * XXX2 why require it then? leap of semantics? 927 */ 928 KASSERT((bp->b_cflags & BC_NOCACHE) == 0); 929 #endif 930 931 mutex_enter(&wl->wl_mtx); 932 if (bp->b_flags & B_LOCKED) { 933 LIST_REMOVE(bp, b_wapbllist); 934 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, 935 ("wapbl_add_buf thread %d.%d re-adding buf %p " 936 "with %d bytes %d bcount\n", 937 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 938 bp->b_bcount)); 939 } else { 940 /* unlocked by dirty buffers shouldn't exist */ 941 KASSERT(!(bp->b_oflags & BO_DELWRI)); 942 wl->wl_bufbytes += bp->b_bufsize; 943 wl->wl_bcount += bp->b_bcount; 944 wl->wl_bufcount++; 945 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 946 ("wapbl_add_buf thread %d.%d adding buf %p " 947 "with %d bytes %d bcount\n", 948 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 949 bp->b_bcount)); 950 } 951 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); 952 mutex_exit(&wl->wl_mtx); 953 954 bp->b_flags |= B_LOCKED; 955 } 956 957 static void 958 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) 959 { 960 961 KASSERT(mutex_owned(&wl->wl_mtx)); 962 KASSERT(bp->b_cflags & BC_BUSY); 963 wapbl_jlock_assert(wl); 964 965 #if 0 966 /* 967 * XXX this might be an issue for swapfiles. 
968 * see uvm_swap.c:1725 969 * 970 * XXXdeux: see above 971 */ 972 KASSERT((bp->b_flags & BC_NOCACHE) == 0); 973 #endif 974 KASSERT(bp->b_flags & B_LOCKED); 975 976 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 977 ("wapbl_remove_buf thread %d.%d removing buf %p with " 978 "%d bytes %d bcount\n", 979 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); 980 981 KASSERT(wl->wl_bufbytes >= bp->b_bufsize); 982 wl->wl_bufbytes -= bp->b_bufsize; 983 KASSERT(wl->wl_bcount >= bp->b_bcount); 984 wl->wl_bcount -= bp->b_bcount; 985 KASSERT(wl->wl_bufcount > 0); 986 wl->wl_bufcount--; 987 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 988 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 989 LIST_REMOVE(bp, b_wapbllist); 990 991 bp->b_flags &= ~B_LOCKED; 992 } 993 994 /* called from brelsel() in vfs_bio among other places */ 995 void 996 wapbl_remove_buf(struct wapbl * wl, struct buf *bp) 997 { 998 999 mutex_enter(&wl->wl_mtx); 1000 wapbl_remove_buf_locked(wl, bp); 1001 mutex_exit(&wl->wl_mtx); 1002 } 1003 1004 void 1005 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) 1006 { 1007 1008 KASSERT(bp->b_cflags & BC_BUSY); 1009 1010 /* 1011 * XXX: why does this depend on B_LOCKED? otherwise the buf 1012 * is not for a transaction? if so, why is this called in the 1013 * first place? 1014 */ 1015 if (bp->b_flags & B_LOCKED) { 1016 mutex_enter(&wl->wl_mtx); 1017 wl->wl_bufbytes += bp->b_bufsize - oldsz; 1018 wl->wl_bcount += bp->b_bcount - oldcnt; 1019 mutex_exit(&wl->wl_mtx); 1020 } 1021 } 1022 1023 #endif /* _KERNEL */ 1024 1025 /****************************************************************/ 1026 /* Some utility inlines */ 1027 1028 /* This is used to advance the pointer at old to new value at old+delta */ 1029 static inline off_t 1030 wapbl_advance(size_t size, size_t off, off_t old, size_t delta) 1031 { 1032 off_t new; 1033 1034 /* Define acceptable ranges for inputs. 
 */
	KASSERT(delta <= size);
	KASSERT((old == 0) || (old >= off));
	KASSERT(old < (size + off));

	/* 0 is the distinguished "empty" offset; a nonzero advance from
	 * it lands at off + delta, otherwise wrap within [off, off+size). */
	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || (new >= off));
	KASSERT(new < (size + off));
	return new;
}

/*
 * Number of log bytes in use between tail and head; head == tail == 0
 * means empty (0 used), head == tail != 0 means full (avail used).
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

/* Number of log bytes still free, the complement of wapbl_space_used(). */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

/*
 * Advance the head by delta bytes (log grows); if the log was empty,
 * the tail leaves the distinguished 0 and starts at off.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

/*
 * Advance the tail by delta bytes (log shrinks); if the tail catches
 * the head the log is empty and both reset to 0.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed
to disk. 1112 * Will block until at least minfree space is available. 1113 * only intended to be called from inside wapbl_flush and therefore 1114 * does not protect against commit races with itself or with flush. 1115 */ 1116 static int 1117 wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly) 1118 { 1119 size_t delta; 1120 size_t avail; 1121 off_t head; 1122 off_t tail; 1123 int error = 0; 1124 1125 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); 1126 KASSERT(rw_write_held(&wl->wl_rwlock)); 1127 1128 mutex_enter(&wl->wl_mtx); 1129 1130 /* 1131 * First check to see if we have to do a commit 1132 * at all. 1133 */ 1134 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); 1135 if (minfree < avail) { 1136 mutex_exit(&wl->wl_mtx); 1137 return 0; 1138 } 1139 minfree -= avail; 1140 while ((wl->wl_error_count == 0) && 1141 (wl->wl_reclaimable_bytes < minfree)) { 1142 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1143 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " 1144 "minfree=%zd\n", 1145 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, 1146 minfree)); 1147 1148 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); 1149 } 1150 if (wl->wl_reclaimable_bytes < minfree) { 1151 KASSERT(wl->wl_error_count); 1152 /* XXX maybe get actual error from buffer instead someday? */ 1153 error = EIO; 1154 } 1155 head = wl->wl_head; 1156 tail = wl->wl_tail; 1157 delta = wl->wl_reclaimable_bytes; 1158 1159 /* If all of of the entries are flushed, then be sure to keep 1160 * the reserved bytes reserved. Watch out for discarded transactions, 1161 * which could leave more bytes reserved than are reclaimable. 
1162 */ 1163 if (SIMPLEQ_EMPTY(&wl->wl_entries) && 1164 (delta >= wl->wl_reserved_bytes)) { 1165 delta -= wl->wl_reserved_bytes; 1166 } 1167 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, 1168 &tail); 1169 KDASSERT(wl->wl_reserved_bytes <= 1170 wapbl_space_used(wl->wl_circ_size, head, tail)); 1171 mutex_exit(&wl->wl_mtx); 1172 1173 if (error) 1174 return error; 1175 1176 if (waitonly) 1177 return 0; 1178 1179 /* 1180 * This is where head, tail and delta are unprotected 1181 * from races against itself or flush. This is ok since 1182 * we only call this routine from inside flush itself. 1183 * 1184 * XXX: how can it race against itself when accessed only 1185 * from behind the write-locked rwlock? 1186 */ 1187 error = wapbl_write_commit(wl, head, tail); 1188 if (error) 1189 return error; 1190 1191 wl->wl_head = head; 1192 wl->wl_tail = tail; 1193 1194 mutex_enter(&wl->wl_mtx); 1195 KASSERT(wl->wl_reclaimable_bytes >= delta); 1196 wl->wl_reclaimable_bytes -= delta; 1197 mutex_exit(&wl->wl_mtx); 1198 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1199 ("wapbl_truncate thread %d.%d truncating %zu bytes\n", 1200 curproc->p_pid, curlwp->l_lid, delta)); 1201 1202 return 0; 1203 } 1204 1205 /****************************************************************/ 1206 1207 void 1208 wapbl_biodone(struct buf *bp) 1209 { 1210 struct wapbl_entry *we = bp->b_private; 1211 struct wapbl *wl = we->we_wapbl; 1212 1213 /* 1214 * Handle possible flushing of buffers after log has been 1215 * decomissioned. 
1216 */ 1217 if (!wl) { 1218 KASSERT(we->we_bufcount > 0); 1219 we->we_bufcount--; 1220 #ifdef WAPBL_DEBUG_BUFBYTES 1221 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1222 we->we_unsynced_bufbytes -= bp->b_bufsize; 1223 #endif 1224 1225 if (we->we_bufcount == 0) { 1226 #ifdef WAPBL_DEBUG_BUFBYTES 1227 KASSERT(we->we_unsynced_bufbytes == 0); 1228 #endif 1229 wapbl_free(we, sizeof(*we)); 1230 } 1231 1232 brelse(bp, 0); 1233 return; 1234 } 1235 1236 #ifdef ohbother 1237 KDASSERT(bp->b_flags & B_DONE); 1238 KDASSERT(!(bp->b_flags & B_DELWRI)); 1239 KDASSERT(bp->b_flags & B_ASYNC); 1240 KDASSERT(bp->b_flags & B_BUSY); 1241 KDASSERT(!(bp->b_flags & B_LOCKED)); 1242 KDASSERT(!(bp->b_flags & B_READ)); 1243 KDASSERT(!(bp->b_flags & B_INVAL)); 1244 KDASSERT(!(bp->b_flags & B_NOCACHE)); 1245 #endif 1246 1247 if (bp->b_error) { 1248 #ifdef notyet /* Can't currently handle possible dirty buffer reuse */ 1249 /* 1250 * XXXpooka: interfaces not fully updated 1251 * Note: this was not enabled in the original patch 1252 * against netbsd4 either. I don't know if comment 1253 * above is true or not. 1254 */ 1255 1256 /* 1257 * If an error occurs, report the error and leave the 1258 * buffer as a delayed write on the LRU queue. 1259 * restarting the write would likely result in 1260 * an error spinloop, so let it be done harmlessly 1261 * by the syncer. 
1262 */ 1263 bp->b_flags &= ~(B_DONE); 1264 simple_unlock(&bp->b_interlock); 1265 1266 if (we->we_error == 0) { 1267 mutex_enter(&wl->wl_mtx); 1268 wl->wl_error_count++; 1269 mutex_exit(&wl->wl_mtx); 1270 cv_broadcast(&wl->wl_reclaimable_cv); 1271 } 1272 we->we_error = bp->b_error; 1273 bp->b_error = 0; 1274 brelse(bp); 1275 return; 1276 #else 1277 /* For now, just mark the log permanently errored out */ 1278 1279 mutex_enter(&wl->wl_mtx); 1280 if (wl->wl_error_count == 0) { 1281 wl->wl_error_count++; 1282 cv_broadcast(&wl->wl_reclaimable_cv); 1283 } 1284 mutex_exit(&wl->wl_mtx); 1285 #endif 1286 } 1287 1288 mutex_enter(&wl->wl_mtx); 1289 1290 KASSERT(we->we_bufcount > 0); 1291 we->we_bufcount--; 1292 #ifdef WAPBL_DEBUG_BUFBYTES 1293 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1294 we->we_unsynced_bufbytes -= bp->b_bufsize; 1295 KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize); 1296 wl->wl_unsynced_bufbytes -= bp->b_bufsize; 1297 #endif 1298 1299 /* 1300 * If the current transaction can be reclaimed, start 1301 * at the beginning and reclaim any consecutive reclaimable 1302 * transactions. If we successfully reclaim anything, 1303 * then wakeup anyone waiting for the reclaim. 
1304 */ 1305 if (we->we_bufcount == 0) { 1306 size_t delta = 0; 1307 int errcnt = 0; 1308 #ifdef WAPBL_DEBUG_BUFBYTES 1309 KDASSERT(we->we_unsynced_bufbytes == 0); 1310 #endif 1311 /* 1312 * clear any posted error, since the buffer it came from 1313 * has successfully flushed by now 1314 */ 1315 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1316 (we->we_bufcount == 0)) { 1317 delta += we->we_reclaimable_bytes; 1318 if (we->we_error) 1319 errcnt++; 1320 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1321 wapbl_free(we, sizeof(*we)); 1322 } 1323 1324 if (delta) { 1325 wl->wl_reclaimable_bytes += delta; 1326 KASSERT(wl->wl_error_count >= errcnt); 1327 wl->wl_error_count -= errcnt; 1328 cv_broadcast(&wl->wl_reclaimable_cv); 1329 } 1330 } 1331 1332 mutex_exit(&wl->wl_mtx); 1333 brelse(bp, 0); 1334 } 1335 1336 /* 1337 * Write transactions to disk + start I/O for contents 1338 */ 1339 int 1340 wapbl_flush(struct wapbl *wl, int waitfor) 1341 { 1342 struct buf *bp; 1343 struct wapbl_entry *we; 1344 off_t off; 1345 off_t head; 1346 off_t tail; 1347 size_t delta = 0; 1348 size_t flushsize; 1349 size_t reserved; 1350 int error = 0; 1351 1352 /* 1353 * Do a quick check to see if a full flush can be skipped 1354 * This assumes that the flush callback does not need to be called 1355 * unless there are other outstanding bufs. 
1356 */ 1357 if (!waitfor) { 1358 size_t nbufs; 1359 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1360 protect the KASSERTS */ 1361 nbufs = wl->wl_bufcount; 1362 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1363 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1364 mutex_exit(&wl->wl_mtx); 1365 if (nbufs == 0) 1366 return 0; 1367 } 1368 1369 /* 1370 * XXX we may consider using LK_UPGRADE here 1371 * if we want to call flush from inside a transaction 1372 */ 1373 rw_enter(&wl->wl_rwlock, RW_WRITER); 1374 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 1375 wl->wl_dealloccnt); 1376 1377 /* 1378 * Now that we are fully locked and flushed, 1379 * do another check for nothing to do. 1380 */ 1381 if (wl->wl_bufcount == 0) { 1382 goto out; 1383 } 1384 1385 #if 0 1386 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1387 ("wapbl_flush thread %d.%d flushing entries with " 1388 "bufcount=%zu bufbytes=%zu\n", 1389 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1390 wl->wl_bufbytes)); 1391 #endif 1392 1393 /* Calculate amount of space needed to flush */ 1394 flushsize = wapbl_transaction_len(wl); 1395 if (wapbl_verbose_commit) { 1396 struct timespec ts; 1397 getnanotime(&ts); 1398 printf("%s: %lld.%06ld this transaction = %zu bytes\n", 1399 __func__, (long long)ts.tv_sec, 1400 (long)ts.tv_nsec, flushsize); 1401 } 1402 1403 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1404 /* 1405 * XXX this could be handled more gracefully, perhaps place 1406 * only a partial transaction in the log and allow the 1407 * remaining to flush without the protection of the journal. 
1408 */ 1409 panic("wapbl_flush: current transaction too big to flush\n"); 1410 } 1411 1412 error = wapbl_truncate(wl, flushsize, 0); 1413 if (error) 1414 goto out2; 1415 1416 off = wl->wl_head; 1417 KASSERT((off == 0) || ((off >= wl->wl_circ_off) && 1418 (off < wl->wl_circ_off + wl->wl_circ_size))); 1419 error = wapbl_write_blocks(wl, &off); 1420 if (error) 1421 goto out2; 1422 error = wapbl_write_revocations(wl, &off); 1423 if (error) 1424 goto out2; 1425 error = wapbl_write_inodes(wl, &off); 1426 if (error) 1427 goto out2; 1428 1429 reserved = 0; 1430 if (wl->wl_inohashcnt) 1431 reserved = wapbl_transaction_inodes_len(wl); 1432 1433 head = wl->wl_head; 1434 tail = wl->wl_tail; 1435 1436 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, 1437 &head, &tail); 1438 #ifdef WAPBL_DEBUG 1439 if (head != off) { 1440 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX 1441 " off=%"PRIdMAX" flush=%zu\n", 1442 (intmax_t)head, (intmax_t)tail, (intmax_t)off, 1443 flushsize); 1444 } 1445 #else 1446 KASSERT(head == off); 1447 #endif 1448 1449 /* Opportunistically move the tail forward if we can */ 1450 if (!wapbl_lazy_truncate) { 1451 mutex_enter(&wl->wl_mtx); 1452 delta = wl->wl_reclaimable_bytes; 1453 mutex_exit(&wl->wl_mtx); 1454 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, 1455 &head, &tail); 1456 } 1457 1458 error = wapbl_write_commit(wl, head, tail); 1459 if (error) 1460 goto out2; 1461 1462 we = wapbl_calloc(1, sizeof(*we)); 1463 1464 #ifdef WAPBL_DEBUG_BUFBYTES 1465 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1466 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1467 " unsynced=%zu" 1468 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1469 "inodes=%d\n", 1470 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1471 wapbl_space_used(wl->wl_circ_size, head, tail), 1472 wl->wl_unsynced_bufbytes, wl->wl_bufcount, 1473 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, 1474 wl->wl_inohashcnt)); 1475 #else 1476 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 
1477 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1478 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1479 "inodes=%d\n", 1480 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1481 wapbl_space_used(wl->wl_circ_size, head, tail), 1482 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1483 wl->wl_dealloccnt, wl->wl_inohashcnt)); 1484 #endif 1485 1486 1487 mutex_enter(&bufcache_lock); 1488 mutex_enter(&wl->wl_mtx); 1489 1490 wl->wl_reserved_bytes = reserved; 1491 wl->wl_head = head; 1492 wl->wl_tail = tail; 1493 KASSERT(wl->wl_reclaimable_bytes >= delta); 1494 wl->wl_reclaimable_bytes -= delta; 1495 wl->wl_dealloccnt = 0; 1496 #ifdef WAPBL_DEBUG_BUFBYTES 1497 wl->wl_unsynced_bufbytes += wl->wl_bufbytes; 1498 #endif 1499 1500 we->we_wapbl = wl; 1501 we->we_bufcount = wl->wl_bufcount; 1502 #ifdef WAPBL_DEBUG_BUFBYTES 1503 we->we_unsynced_bufbytes = wl->wl_bufbytes; 1504 #endif 1505 we->we_reclaimable_bytes = flushsize; 1506 we->we_error = 0; 1507 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); 1508 1509 /* 1510 * this flushes bufs in reverse order than they were queued 1511 * it shouldn't matter, but if we care we could use TAILQ instead. 1512 * XXX Note they will get put on the lru queue when they flush 1513 * so we might actually want to change this to preserve order. 
1514 */ 1515 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 1516 if (bbusy(bp, 0, 0, &wl->wl_mtx)) { 1517 continue; 1518 } 1519 bp->b_iodone = wapbl_biodone; 1520 bp->b_private = we; 1521 bremfree(bp); 1522 wapbl_remove_buf_locked(wl, bp); 1523 mutex_exit(&wl->wl_mtx); 1524 mutex_exit(&bufcache_lock); 1525 bawrite(bp); 1526 mutex_enter(&bufcache_lock); 1527 mutex_enter(&wl->wl_mtx); 1528 } 1529 mutex_exit(&wl->wl_mtx); 1530 mutex_exit(&bufcache_lock); 1531 1532 #if 0 1533 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1534 ("wapbl_flush thread %d.%d done flushing entries...\n", 1535 curproc->p_pid, curlwp->l_lid)); 1536 #endif 1537 1538 out: 1539 1540 /* 1541 * If the waitfor flag is set, don't return until everything is 1542 * fully flushed and the on disk log is empty. 1543 */ 1544 if (waitfor) { 1545 error = wapbl_truncate(wl, wl->wl_circ_size - 1546 wl->wl_reserved_bytes, wapbl_lazy_truncate); 1547 } 1548 1549 out2: 1550 if (error) { 1551 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, 1552 wl->wl_dealloclens, wl->wl_dealloccnt); 1553 } 1554 1555 #ifdef WAPBL_DEBUG_PRINT 1556 if (error) { 1557 pid_t pid = -1; 1558 lwpid_t lid = -1; 1559 if (curproc) 1560 pid = curproc->p_pid; 1561 if (curlwp) 1562 lid = curlwp->l_lid; 1563 mutex_enter(&wl->wl_mtx); 1564 #ifdef WAPBL_DEBUG_BUFBYTES 1565 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1566 ("wapbl_flush: thread %d.%d aborted flush: " 1567 "error = %d\n" 1568 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1569 "deallocs=%d inodes=%d\n" 1570 "\terrcnt = %d, reclaimable=%zu reserved=%zu " 1571 "unsynced=%zu\n", 1572 pid, lid, error, wl->wl_bufcount, 1573 wl->wl_bufbytes, wl->wl_bcount, 1574 wl->wl_dealloccnt, wl->wl_inohashcnt, 1575 wl->wl_error_count, wl->wl_reclaimable_bytes, 1576 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); 1577 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1578 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1579 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1580 "error = %d, unsynced = %zu\n", 1581 we->we_bufcount, 
we->we_reclaimable_bytes, 1582 we->we_error, we->we_unsynced_bufbytes)); 1583 } 1584 #else 1585 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1586 ("wapbl_flush: thread %d.%d aborted flush: " 1587 "error = %d\n" 1588 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1589 "deallocs=%d inodes=%d\n" 1590 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", 1591 pid, lid, error, wl->wl_bufcount, 1592 wl->wl_bufbytes, wl->wl_bcount, 1593 wl->wl_dealloccnt, wl->wl_inohashcnt, 1594 wl->wl_error_count, wl->wl_reclaimable_bytes, 1595 wl->wl_reserved_bytes)); 1596 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1597 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1598 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1599 "error = %d\n", we->we_bufcount, 1600 we->we_reclaimable_bytes, we->we_error)); 1601 } 1602 #endif 1603 mutex_exit(&wl->wl_mtx); 1604 } 1605 #endif 1606 1607 rw_exit(&wl->wl_rwlock); 1608 return error; 1609 } 1610 1611 /****************************************************************/ 1612 1613 void 1614 wapbl_jlock_assert(struct wapbl *wl) 1615 { 1616 1617 KASSERT(rw_lock_held(&wl->wl_rwlock)); 1618 } 1619 1620 void 1621 wapbl_junlock_assert(struct wapbl *wl) 1622 { 1623 1624 KASSERT(!rw_write_held(&wl->wl_rwlock)); 1625 } 1626 1627 /****************************************************************/ 1628 1629 /* locks missing */ 1630 void 1631 wapbl_print(struct wapbl *wl, 1632 int full, 1633 void (*pr)(const char *, ...)) 1634 { 1635 struct buf *bp; 1636 struct wapbl_entry *we; 1637 (*pr)("wapbl %p", wl); 1638 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", 1639 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); 1640 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", 1641 wl->wl_circ_size, wl->wl_circ_off, 1642 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); 1643 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", 1644 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); 1645 #ifdef WAPBL_DEBUG_BUFBYTES 1646 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1647 
"reserved = %zu errcnt = %d unsynced = %zu\n", 1648 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1649 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1650 wl->wl_error_count, wl->wl_unsynced_bufbytes); 1651 #else 1652 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1653 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, 1654 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1655 wl->wl_error_count); 1656 #endif 1657 (*pr)("\tdealloccnt = %d, dealloclim = %d\n", 1658 wl->wl_dealloccnt, wl->wl_dealloclim); 1659 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", 1660 wl->wl_inohashcnt, wl->wl_inohashmask); 1661 (*pr)("entries:\n"); 1662 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1663 #ifdef WAPBL_DEBUG_BUFBYTES 1664 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " 1665 "unsynced = %zu\n", 1666 we->we_bufcount, we->we_reclaimable_bytes, 1667 we->we_error, we->we_unsynced_bufbytes); 1668 #else 1669 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", 1670 we->we_bufcount, we->we_reclaimable_bytes, we->we_error); 1671 #endif 1672 } 1673 if (full) { 1674 int cnt = 0; 1675 (*pr)("bufs ="); 1676 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { 1677 if (!LIST_NEXT(bp, b_wapbllist)) { 1678 (*pr)(" %p", bp); 1679 } else if ((++cnt % 6) == 0) { 1680 (*pr)(" %p,\n\t", bp); 1681 } else { 1682 (*pr)(" %p,", bp); 1683 } 1684 } 1685 (*pr)("\n"); 1686 1687 (*pr)("dealloced blks = "); 1688 { 1689 int i; 1690 cnt = 0; 1691 for (i = 0; i < wl->wl_dealloccnt; i++) { 1692 (*pr)(" %"PRId64":%d,", 1693 wl->wl_deallocblks[i], 1694 wl->wl_dealloclens[i]); 1695 if ((++cnt % 4) == 0) { 1696 (*pr)("\n\t"); 1697 } 1698 } 1699 } 1700 (*pr)("\n"); 1701 1702 (*pr)("registered inodes = "); 1703 { 1704 int i; 1705 cnt = 0; 1706 for (i = 0; i <= wl->wl_inohashmask; i++) { 1707 struct wapbl_ino_head *wih; 1708 struct wapbl_ino *wi; 1709 1710 wih = &wl->wl_inohash[i]; 1711 LIST_FOREACH(wi, wih, wi_hash) { 1712 if (wi->wi_ino == 
0) 1713 continue; 1714 (*pr)(" %"PRId32"/0%06"PRIo32",", 1715 wi->wi_ino, wi->wi_mode); 1716 if ((++cnt % 4) == 0) { 1717 (*pr)("\n\t"); 1718 } 1719 } 1720 } 1721 (*pr)("\n"); 1722 } 1723 } 1724 } 1725 1726 #if defined(WAPBL_DEBUG) || defined(DDB) 1727 void 1728 wapbl_dump(struct wapbl *wl) 1729 { 1730 #if defined(WAPBL_DEBUG) 1731 if (!wl) 1732 wl = wapbl_debug_wl; 1733 #endif 1734 if (!wl) 1735 return; 1736 wapbl_print(wl, 1, printf); 1737 } 1738 #endif 1739 1740 /****************************************************************/ 1741 1742 void 1743 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) 1744 { 1745 1746 wapbl_jlock_assert(wl); 1747 1748 mutex_enter(&wl->wl_mtx); 1749 /* XXX should eventually instead tie this into resource estimation */ 1750 /* 1751 * XXX this panic needs locking/mutex analysis and the 1752 * ability to cope with the failure. 1753 */ 1754 /* XXX this XXX doesn't have enough XXX */ 1755 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) 1756 panic("wapbl_register_deallocation: out of resources"); 1757 1758 wl->wl_deallocblks[wl->wl_dealloccnt] = blk; 1759 wl->wl_dealloclens[wl->wl_dealloccnt] = len; 1760 wl->wl_dealloccnt++; 1761 WAPBL_PRINTF(WAPBL_PRINT_ALLOC, 1762 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); 1763 mutex_exit(&wl->wl_mtx); 1764 } 1765 1766 /****************************************************************/ 1767 1768 static void 1769 wapbl_inodetrk_init(struct wapbl *wl, u_int size) 1770 { 1771 1772 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); 1773 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { 1774 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, 1775 "wapblinopl", &pool_allocator_nointr, IPL_NONE); 1776 } 1777 } 1778 1779 static void 1780 wapbl_inodetrk_free(struct wapbl *wl) 1781 { 1782 1783 /* XXX this KASSERT needs locking/mutex analysis */ 1784 KASSERT(wl->wl_inohashcnt == 0); 1785 hashdone(wl->wl_inohash, HASH_LIST, 
wl->wl_inohashmask); 1786 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { 1787 pool_destroy(&wapbl_ino_pool); 1788 } 1789 } 1790 1791 static struct wapbl_ino * 1792 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) 1793 { 1794 struct wapbl_ino_head *wih; 1795 struct wapbl_ino *wi; 1796 1797 KASSERT(mutex_owned(&wl->wl_mtx)); 1798 1799 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1800 LIST_FOREACH(wi, wih, wi_hash) { 1801 if (ino == wi->wi_ino) 1802 return wi; 1803 } 1804 return 0; 1805 } 1806 1807 void 1808 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1809 { 1810 struct wapbl_ino_head *wih; 1811 struct wapbl_ino *wi; 1812 1813 wi = pool_get(&wapbl_ino_pool, PR_WAITOK); 1814 1815 mutex_enter(&wl->wl_mtx); 1816 if (wapbl_inodetrk_get(wl, ino) == NULL) { 1817 wi->wi_ino = ino; 1818 wi->wi_mode = mode; 1819 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1820 LIST_INSERT_HEAD(wih, wi, wi_hash); 1821 wl->wl_inohashcnt++; 1822 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1823 ("wapbl_register_inode: ino=%"PRId64"\n", ino)); 1824 mutex_exit(&wl->wl_mtx); 1825 } else { 1826 mutex_exit(&wl->wl_mtx); 1827 pool_put(&wapbl_ino_pool, wi); 1828 } 1829 } 1830 1831 void 1832 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1833 { 1834 struct wapbl_ino *wi; 1835 1836 mutex_enter(&wl->wl_mtx); 1837 wi = wapbl_inodetrk_get(wl, ino); 1838 if (wi) { 1839 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1840 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); 1841 KASSERT(wl->wl_inohashcnt > 0); 1842 wl->wl_inohashcnt--; 1843 LIST_REMOVE(wi, wi_hash); 1844 mutex_exit(&wl->wl_mtx); 1845 1846 pool_put(&wapbl_ino_pool, wi); 1847 } else { 1848 mutex_exit(&wl->wl_mtx); 1849 } 1850 } 1851 1852 /****************************************************************/ 1853 1854 static inline size_t 1855 wapbl_transaction_inodes_len(struct wapbl *wl) 1856 { 1857 int blocklen = 1<<wl->wl_log_dev_bshift; 1858 int iph; 1859 1860 /* Calculate number of inodes described in a inodelist header 
*/ 1861 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 1862 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 1863 1864 KASSERT(iph > 0); 1865 1866 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen; 1867 } 1868 1869 1870 /* Calculate amount of space a transaction will take on disk */ 1871 static size_t 1872 wapbl_transaction_len(struct wapbl *wl) 1873 { 1874 int blocklen = 1<<wl->wl_log_dev_bshift; 1875 size_t len; 1876 int bph; 1877 1878 /* Calculate number of blocks described in a blocklist header */ 1879 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1880 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1881 1882 KASSERT(bph > 0); 1883 1884 len = wl->wl_bcount; 1885 len += howmany(wl->wl_bufcount, bph) * blocklen; 1886 len += howmany(wl->wl_dealloccnt, bph) * blocklen; 1887 len += wapbl_transaction_inodes_len(wl); 1888 1889 return len; 1890 } 1891 1892 /* 1893 * Perform commit operation 1894 * 1895 * Note that generation number incrementation needs to 1896 * be protected against racing with other invocations 1897 * of wapbl_commit. 
This is ok since this routine 1898 * is only invoked from wapbl_flush 1899 */ 1900 static int 1901 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) 1902 { 1903 struct wapbl_wc_header *wc = wl->wl_wc_header; 1904 struct timespec ts; 1905 int error; 1906 int force = 1; 1907 daddr_t pbn; 1908 1909 if (wapbl_flush_disk_cache) { 1910 /* XXX Calc checksum here, instead we do this for now */ 1911 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, 1912 FWRITE, FSCRED); 1913 if (error) { 1914 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1915 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " 1916 "returned %d\n", wl->wl_devvp->v_rdev, error)); 1917 } 1918 } 1919 1920 wc->wc_head = head; 1921 wc->wc_tail = tail; 1922 wc->wc_checksum = 0; 1923 wc->wc_version = 1; 1924 getnanotime(&ts); 1925 wc->wc_time = ts.tv_sec; 1926 wc->wc_timensec = ts.tv_nsec; 1927 1928 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 1929 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n", 1930 (intmax_t)head, (intmax_t)tail)); 1931 1932 /* 1933 * XXX if generation will rollover, then first zero 1934 * over second commit header before trying to write both headers. 1935 */ 1936 1937 pbn = wl->wl_logpbn + (wc->wc_generation % 2); 1938 #ifdef _KERNEL 1939 pbn = btodb(pbn << wc->wc_log_dev_bshift); 1940 #endif 1941 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn); 1942 if (error) 1943 return error; 1944 1945 if (wapbl_flush_disk_cache) { 1946 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, 1947 FWRITE, FSCRED); 1948 if (error) { 1949 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1950 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " 1951 "returned %d\n", wl->wl_devvp->v_rdev, error)); 1952 } 1953 } 1954 1955 /* 1956 * If the generation number was zero, write it out a second time. 
1957 * This handles initialization and generation number rollover 1958 */ 1959 if (wc->wc_generation++ == 0) { 1960 error = wapbl_write_commit(wl, head, tail); 1961 /* 1962 * This panic should be able to be removed if we do the 1963 * zero'ing mentioned above, and we are certain to roll 1964 * back generation number on failure. 1965 */ 1966 if (error) 1967 panic("wapbl_write_commit: error writing duplicate " 1968 "log header: %d\n", error); 1969 } 1970 return 0; 1971 } 1972 1973 /* Returns new offset value */ 1974 static int 1975 wapbl_write_blocks(struct wapbl *wl, off_t *offp) 1976 { 1977 struct wapbl_wc_blocklist *wc = 1978 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 1979 int blocklen = 1<<wl->wl_log_dev_bshift; 1980 int bph; 1981 struct buf *bp; 1982 off_t off = *offp; 1983 int error; 1984 size_t padding; 1985 1986 KASSERT(rw_write_held(&wl->wl_rwlock)); 1987 1988 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1989 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1990 1991 bp = LIST_FIRST(&wl->wl_bufs); 1992 1993 while (bp) { 1994 int cnt; 1995 struct buf *obp = bp; 1996 1997 KASSERT(bp->b_flags & B_LOCKED); 1998 1999 wc->wc_type = WAPBL_WC_BLOCKS; 2000 wc->wc_len = blocklen; 2001 wc->wc_blkcount = 0; 2002 while (bp && (wc->wc_blkcount < bph)) { 2003 /* 2004 * Make sure all the physical block numbers are up to 2005 * date. If this is not always true on a given 2006 * filesystem, then VOP_BMAP must be called. We 2007 * could call VOP_BMAP here, or else in the filesystem 2008 * specific flush callback, although neither of those 2009 * solutions allow us to take the vnode lock. If a 2010 * filesystem requires that we must take the vnode lock 2011 * to call VOP_BMAP, then we can probably do it in 2012 * bwrite when the vnode lock should already be held 2013 * by the invoking code. 
2014 */ 2015 KASSERT((bp->b_vp->v_type == VBLK) || 2016 (bp->b_blkno != bp->b_lblkno)); 2017 KASSERT(bp->b_blkno > 0); 2018 2019 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; 2020 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; 2021 wc->wc_len += bp->b_bcount; 2022 wc->wc_blkcount++; 2023 bp = LIST_NEXT(bp, b_wapbllist); 2024 } 2025 if (wc->wc_len % blocklen != 0) { 2026 padding = blocklen - wc->wc_len % blocklen; 2027 wc->wc_len += padding; 2028 } else { 2029 padding = 0; 2030 } 2031 2032 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2033 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n", 2034 wc->wc_len, padding, (intmax_t)off)); 2035 2036 error = wapbl_circ_write(wl, wc, blocklen, &off); 2037 if (error) 2038 return error; 2039 bp = obp; 2040 cnt = 0; 2041 while (bp && (cnt++ < bph)) { 2042 error = wapbl_circ_write(wl, bp->b_data, 2043 bp->b_bcount, &off); 2044 if (error) 2045 return error; 2046 bp = LIST_NEXT(bp, b_wapbllist); 2047 } 2048 if (padding) { 2049 void *zero; 2050 2051 zero = wapbl_malloc(padding); 2052 memset(zero, 0, padding); 2053 error = wapbl_circ_write(wl, zero, padding, &off); 2054 wapbl_free(zero, padding); 2055 if (error) 2056 return error; 2057 } 2058 } 2059 *offp = off; 2060 return 0; 2061 } 2062 2063 static int 2064 wapbl_write_revocations(struct wapbl *wl, off_t *offp) 2065 { 2066 struct wapbl_wc_blocklist *wc = 2067 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 2068 int i; 2069 int blocklen = 1<<wl->wl_log_dev_bshift; 2070 int bph; 2071 off_t off = *offp; 2072 int error; 2073 2074 if (wl->wl_dealloccnt == 0) 2075 return 0; 2076 2077 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2078 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2079 2080 i = 0; 2081 while (i < wl->wl_dealloccnt) { 2082 wc->wc_type = WAPBL_WC_REVOCATIONS; 2083 wc->wc_len = blocklen; 2084 wc->wc_blkcount = 0; 2085 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { 2086 wc->wc_blocks[wc->wc_blkcount].wc_daddr = 
2087 wl->wl_deallocblks[i]; 2088 wc->wc_blocks[wc->wc_blkcount].wc_dlen = 2089 wl->wl_dealloclens[i]; 2090 wc->wc_blkcount++; 2091 i++; 2092 } 2093 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2094 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", 2095 wc->wc_len, (intmax_t)off)); 2096 error = wapbl_circ_write(wl, wc, blocklen, &off); 2097 if (error) 2098 return error; 2099 } 2100 *offp = off; 2101 return 0; 2102 } 2103 2104 static int 2105 wapbl_write_inodes(struct wapbl *wl, off_t *offp) 2106 { 2107 struct wapbl_wc_inodelist *wc = 2108 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; 2109 int i; 2110 int blocklen = 1 << wl->wl_log_dev_bshift; 2111 off_t off = *offp; 2112 int error; 2113 2114 struct wapbl_ino_head *wih; 2115 struct wapbl_ino *wi; 2116 int iph; 2117 2118 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 2119 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 2120 2121 i = 0; 2122 wih = &wl->wl_inohash[0]; 2123 wi = 0; 2124 do { 2125 wc->wc_type = WAPBL_WC_INODES; 2126 wc->wc_len = blocklen; 2127 wc->wc_inocnt = 0; 2128 wc->wc_clear = (i == 0); 2129 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { 2130 while (!wi) { 2131 KASSERT((wih - &wl->wl_inohash[0]) 2132 <= wl->wl_inohashmask); 2133 wi = LIST_FIRST(wih++); 2134 } 2135 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; 2136 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; 2137 wc->wc_inocnt++; 2138 i++; 2139 wi = LIST_NEXT(wi, wi_hash); 2140 } 2141 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2142 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", 2143 wc->wc_len, (intmax_t)off)); 2144 error = wapbl_circ_write(wl, wc, blocklen, &off); 2145 if (error) 2146 return error; 2147 } while (i < wl->wl_inohashcnt); 2148 2149 *offp = off; 2150 return 0; 2151 } 2152 2153 #endif /* _KERNEL */ 2154 2155 /****************************************************************/ 2156 2157 struct wapbl_blk { 2158 LIST_ENTRY(wapbl_blk) wb_hash; 2159 daddr_t wb_blk; 2160 off_t wb_off; /* Offset of 
this block in the log */ 2161 }; 2162 #define WAPBL_BLKPOOL_MIN 83 2163 2164 static void 2165 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) 2166 { 2167 if (size < WAPBL_BLKPOOL_MIN) 2168 size = WAPBL_BLKPOOL_MIN; 2169 KASSERT(wr->wr_blkhash == 0); 2170 #ifdef _KERNEL 2171 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); 2172 #else /* ! _KERNEL */ 2173 /* Manually implement hashinit */ 2174 { 2175 unsigned long i, hashsize; 2176 for (hashsize = 1; hashsize < size; hashsize <<= 1) 2177 continue; 2178 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash)); 2179 for (i = 0; i < hashsize; i++) 2180 LIST_INIT(&wr->wr_blkhash[i]); 2181 wr->wr_blkhashmask = hashsize - 1; 2182 } 2183 #endif /* ! _KERNEL */ 2184 } 2185 2186 static void 2187 wapbl_blkhash_free(struct wapbl_replay *wr) 2188 { 2189 KASSERT(wr->wr_blkhashcnt == 0); 2190 #ifdef _KERNEL 2191 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); 2192 #else /* ! _KERNEL */ 2193 wapbl_free(wr->wr_blkhash, 2194 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash)); 2195 #endif /* ! 
_KERNEL */ 2196 } 2197 2198 static struct wapbl_blk * 2199 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) 2200 { 2201 struct wapbl_blk_head *wbh; 2202 struct wapbl_blk *wb; 2203 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2204 LIST_FOREACH(wb, wbh, wb_hash) { 2205 if (blk == wb->wb_blk) 2206 return wb; 2207 } 2208 return 0; 2209 } 2210 2211 static void 2212 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) 2213 { 2214 struct wapbl_blk_head *wbh; 2215 struct wapbl_blk *wb; 2216 wb = wapbl_blkhash_get(wr, blk); 2217 if (wb) { 2218 KASSERT(wb->wb_blk == blk); 2219 wb->wb_off = off; 2220 } else { 2221 wb = wapbl_malloc(sizeof(*wb)); 2222 wb->wb_blk = blk; 2223 wb->wb_off = off; 2224 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2225 LIST_INSERT_HEAD(wbh, wb, wb_hash); 2226 wr->wr_blkhashcnt++; 2227 } 2228 } 2229 2230 static void 2231 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) 2232 { 2233 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2234 if (wb) { 2235 KASSERT(wr->wr_blkhashcnt > 0); 2236 wr->wr_blkhashcnt--; 2237 LIST_REMOVE(wb, wb_hash); 2238 wapbl_free(wb, sizeof(*wb)); 2239 } 2240 } 2241 2242 static void 2243 wapbl_blkhash_clear(struct wapbl_replay *wr) 2244 { 2245 unsigned long i; 2246 for (i = 0; i <= wr->wr_blkhashmask; i++) { 2247 struct wapbl_blk *wb; 2248 2249 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { 2250 KASSERT(wr->wr_blkhashcnt > 0); 2251 wr->wr_blkhashcnt--; 2252 LIST_REMOVE(wb, wb_hash); 2253 wapbl_free(wb, sizeof(*wb)); 2254 } 2255 } 2256 KASSERT(wr->wr_blkhashcnt == 0); 2257 } 2258 2259 /****************************************************************/ 2260 2261 static int 2262 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) 2263 { 2264 size_t slen; 2265 off_t off = *offp; 2266 int error; 2267 daddr_t pbn; 2268 2269 KASSERT(((len >> wr->wr_log_dev_bshift) << 2270 wr->wr_log_dev_bshift) == len); 2271 2272 if (off < wr->wr_circ_off) 2273 off = wr->wr_circ_off; 2274 
slen = wr->wr_circ_off + wr->wr_circ_size - off; 2275 if (slen < len) { 2276 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2277 #ifdef _KERNEL 2278 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2279 #endif 2280 error = wapbl_read(data, slen, wr->wr_devvp, pbn); 2281 if (error) 2282 return error; 2283 data = (uint8_t *)data + slen; 2284 len -= slen; 2285 off = wr->wr_circ_off; 2286 } 2287 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2288 #ifdef _KERNEL 2289 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2290 #endif 2291 error = wapbl_read(data, len, wr->wr_devvp, pbn); 2292 if (error) 2293 return error; 2294 off += len; 2295 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2296 off = wr->wr_circ_off; 2297 *offp = off; 2298 return 0; 2299 } 2300 2301 static void 2302 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) 2303 { 2304 size_t slen; 2305 off_t off = *offp; 2306 2307 KASSERT(((len >> wr->wr_log_dev_bshift) << 2308 wr->wr_log_dev_bshift) == len); 2309 2310 if (off < wr->wr_circ_off) 2311 off = wr->wr_circ_off; 2312 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2313 if (slen < len) { 2314 len -= slen; 2315 off = wr->wr_circ_off; 2316 } 2317 off += len; 2318 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2319 off = wr->wr_circ_off; 2320 *offp = off; 2321 } 2322 2323 /****************************************************************/ 2324 2325 int 2326 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, 2327 daddr_t off, size_t count, size_t blksize) 2328 { 2329 struct wapbl_replay *wr; 2330 int error; 2331 struct vnode *devvp; 2332 daddr_t logpbn; 2333 uint8_t *scratch; 2334 struct wapbl_wc_header *wch; 2335 struct wapbl_wc_header *wch2; 2336 /* Use this until we read the actual log header */ 2337 int log_dev_bshift = ilog2(blksize); 2338 size_t used; 2339 daddr_t pbn; 2340 2341 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2342 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", 2343 vp, off, count, blksize)); 2344 
2345 if (off < 0) 2346 return EINVAL; 2347 2348 if (blksize < DEV_BSIZE) 2349 return EINVAL; 2350 if (blksize % DEV_BSIZE) 2351 return EINVAL; 2352 2353 #ifdef _KERNEL 2354 #if 0 2355 /* XXX vp->v_size isn't reliably set for VBLK devices, 2356 * especially root. However, we might still want to verify 2357 * that the full load is readable */ 2358 if ((off + count) * blksize > vp->v_size) 2359 return EINVAL; 2360 #endif 2361 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { 2362 return error; 2363 } 2364 #else /* ! _KERNEL */ 2365 devvp = vp; 2366 logpbn = off; 2367 #endif /* ! _KERNEL */ 2368 2369 scratch = wapbl_malloc(MAXBSIZE); 2370 2371 pbn = logpbn; 2372 #ifdef _KERNEL 2373 pbn = btodb(pbn << log_dev_bshift); 2374 #endif 2375 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn); 2376 if (error) 2377 goto errout; 2378 2379 wch = (struct wapbl_wc_header *)scratch; 2380 wch2 = 2381 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift)); 2382 /* XXX verify checksums and magic numbers */ 2383 if (wch->wc_type != WAPBL_WC_HEADER) { 2384 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type); 2385 error = EFTYPE; 2386 goto errout; 2387 } 2388 2389 if (wch2->wc_generation > wch->wc_generation) 2390 wch = wch2; 2391 2392 wr = wapbl_calloc(1, sizeof(*wr)); 2393 2394 wr->wr_logvp = vp; 2395 wr->wr_devvp = devvp; 2396 wr->wr_logpbn = logpbn; 2397 2398 wr->wr_scratch = scratch; 2399 2400 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift; 2401 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift; 2402 wr->wr_circ_off = wch->wc_circ_off; 2403 wr->wr_circ_size = wch->wc_circ_size; 2404 wr->wr_generation = wch->wc_generation; 2405 2406 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); 2407 2408 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2409 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64 2410 " len=%"PRId64" used=%zu\n", 2411 wch->wc_head, wch->wc_tail, wch->wc_circ_off, 2412 wch->wc_circ_size, used)); 2413 2414 wapbl_blkhash_init(wr, (used 
>> wch->wc_fs_dev_bshift)); 2415 2416 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail); 2417 if (error) { 2418 wapbl_replay_stop(wr); 2419 wapbl_replay_free(wr); 2420 return error; 2421 } 2422 2423 *wrp = wr; 2424 return 0; 2425 2426 errout: 2427 wapbl_free(scratch, MAXBSIZE); 2428 return error; 2429 } 2430 2431 void 2432 wapbl_replay_stop(struct wapbl_replay *wr) 2433 { 2434 2435 if (!wapbl_replay_isopen(wr)) 2436 return; 2437 2438 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n")); 2439 2440 wapbl_free(wr->wr_scratch, MAXBSIZE); 2441 wr->wr_scratch = NULL; 2442 2443 wr->wr_logvp = NULL; 2444 2445 wapbl_blkhash_clear(wr); 2446 wapbl_blkhash_free(wr); 2447 } 2448 2449 void 2450 wapbl_replay_free(struct wapbl_replay *wr) 2451 { 2452 2453 KDASSERT(!wapbl_replay_isopen(wr)); 2454 2455 if (wr->wr_inodes) 2456 wapbl_free(wr->wr_inodes, 2457 wr->wr_inodescnt * sizeof(wr->wr_inodes[0])); 2458 wapbl_free(wr, sizeof(*wr)); 2459 } 2460 2461 #ifdef _KERNEL 2462 int 2463 wapbl_replay_isopen1(struct wapbl_replay *wr) 2464 { 2465 2466 return wapbl_replay_isopen(wr); 2467 } 2468 #endif 2469 2470 static void 2471 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp) 2472 { 2473 struct wapbl_wc_blocklist *wc = 2474 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2475 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2476 int i, j, n; 2477 2478 for (i = 0; i < wc->wc_blkcount; i++) { 2479 /* 2480 * Enter each physical block into the hashtable independently. 
2481 */ 2482 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2483 for (j = 0; j < n; j++) { 2484 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen), 2485 *offp); 2486 wapbl_circ_advance(wr, fsblklen, offp); 2487 } 2488 } 2489 } 2490 2491 static void 2492 wapbl_replay_process_revocations(struct wapbl_replay *wr) 2493 { 2494 struct wapbl_wc_blocklist *wc = 2495 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2496 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2497 int i, j, n; 2498 2499 for (i = 0; i < wc->wc_blkcount; i++) { 2500 /* 2501 * Remove any blocks found from the hashtable. 2502 */ 2503 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2504 for (j = 0; j < n; j++) 2505 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2506 } 2507 } 2508 2509 static void 2510 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff) 2511 { 2512 struct wapbl_wc_inodelist *wc = 2513 (struct wapbl_wc_inodelist *)wr->wr_scratch; 2514 void *new_inodes; 2515 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]); 2516 2517 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0])); 2518 2519 /* 2520 * Keep track of where we found this so location won't be 2521 * overwritten. 
2522 */ 2523 if (wc->wc_clear) { 2524 wr->wr_inodestail = oldoff; 2525 wr->wr_inodescnt = 0; 2526 if (wr->wr_inodes != NULL) { 2527 wapbl_free(wr->wr_inodes, oldsize); 2528 wr->wr_inodes = NULL; 2529 } 2530 } 2531 wr->wr_inodeshead = newoff; 2532 if (wc->wc_inocnt == 0) 2533 return; 2534 2535 new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) * 2536 sizeof(wr->wr_inodes[0])); 2537 if (wr->wr_inodes != NULL) { 2538 memcpy(new_inodes, wr->wr_inodes, oldsize); 2539 wapbl_free(wr->wr_inodes, oldsize); 2540 } 2541 wr->wr_inodes = new_inodes; 2542 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes, 2543 wc->wc_inocnt * sizeof(wr->wr_inodes[0])); 2544 wr->wr_inodescnt += wc->wc_inocnt; 2545 } 2546 2547 static int 2548 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail) 2549 { 2550 off_t off; 2551 int error; 2552 2553 int logblklen = 1 << wr->wr_log_dev_bshift; 2554 2555 wapbl_blkhash_clear(wr); 2556 2557 off = tail; 2558 while (off != head) { 2559 struct wapbl_wc_null *wcn; 2560 off_t saveoff = off; 2561 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2562 if (error) 2563 goto errout; 2564 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2565 switch (wcn->wc_type) { 2566 case WAPBL_WC_BLOCKS: 2567 wapbl_replay_process_blocks(wr, &off); 2568 break; 2569 2570 case WAPBL_WC_REVOCATIONS: 2571 wapbl_replay_process_revocations(wr); 2572 break; 2573 2574 case WAPBL_WC_INODES: 2575 wapbl_replay_process_inodes(wr, saveoff, off); 2576 break; 2577 2578 default: 2579 printf("Unrecognized wapbl type: 0x%08x\n", 2580 wcn->wc_type); 2581 error = EFTYPE; 2582 goto errout; 2583 } 2584 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2585 if (off != saveoff) { 2586 printf("wapbl_replay: corrupted records\n"); 2587 error = EFTYPE; 2588 goto errout; 2589 } 2590 } 2591 return 0; 2592 2593 errout: 2594 wapbl_blkhash_clear(wr); 2595 return error; 2596 } 2597 2598 #if 0 2599 int 2600 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) 2601 
{ 2602 off_t off; 2603 int mismatchcnt = 0; 2604 int logblklen = 1 << wr->wr_log_dev_bshift; 2605 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2606 void *scratch1 = wapbl_malloc(MAXBSIZE); 2607 void *scratch2 = wapbl_malloc(MAXBSIZE); 2608 int error = 0; 2609 2610 KDASSERT(wapbl_replay_isopen(wr)); 2611 2612 off = wch->wc_tail; 2613 while (off != wch->wc_head) { 2614 struct wapbl_wc_null *wcn; 2615 #ifdef DEBUG 2616 off_t saveoff = off; 2617 #endif 2618 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2619 if (error) 2620 goto out; 2621 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2622 switch (wcn->wc_type) { 2623 case WAPBL_WC_BLOCKS: 2624 { 2625 struct wapbl_wc_blocklist *wc = 2626 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2627 int i; 2628 for (i = 0; i < wc->wc_blkcount; i++) { 2629 int foundcnt = 0; 2630 int dirtycnt = 0; 2631 int j, n; 2632 /* 2633 * Check each physical block into the 2634 * hashtable independently 2635 */ 2636 n = wc->wc_blocks[i].wc_dlen >> 2637 wch->wc_fs_dev_bshift; 2638 for (j = 0; j < n; j++) { 2639 struct wapbl_blk *wb = 2640 wapbl_blkhash_get(wr, 2641 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2642 if (wb && (wb->wb_off == off)) { 2643 foundcnt++; 2644 error = 2645 wapbl_circ_read(wr, 2646 scratch1, fsblklen, 2647 &off); 2648 if (error) 2649 goto out; 2650 error = 2651 wapbl_read(scratch2, 2652 fsblklen, fsdevvp, 2653 wb->wb_blk); 2654 if (error) 2655 goto out; 2656 if (memcmp(scratch1, 2657 scratch2, 2658 fsblklen)) { 2659 printf( 2660 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n", 2661 wb->wb_blk, (intmax_t)off); 2662 dirtycnt++; 2663 mismatchcnt++; 2664 } 2665 } else { 2666 wapbl_circ_advance(wr, 2667 fsblklen, &off); 2668 } 2669 } 2670 #if 0 2671 /* 2672 * If all of the blocks in an entry 2673 * are clean, then remove all of its 2674 * blocks from the hashtable since they 2675 * never will need replay. 
2676 */ 2677 if ((foundcnt != 0) && 2678 (dirtycnt == 0)) { 2679 off = saveoff; 2680 wapbl_circ_advance(wr, 2681 logblklen, &off); 2682 for (j = 0; j < n; j++) { 2683 struct wapbl_blk *wb = 2684 wapbl_blkhash_get(wr, 2685 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2686 if (wb && 2687 (wb->wb_off == off)) { 2688 wapbl_blkhash_rem(wr, wb->wb_blk); 2689 } 2690 wapbl_circ_advance(wr, 2691 fsblklen, &off); 2692 } 2693 } 2694 #endif 2695 } 2696 } 2697 break; 2698 case WAPBL_WC_REVOCATIONS: 2699 case WAPBL_WC_INODES: 2700 break; 2701 default: 2702 KASSERT(0); 2703 } 2704 #ifdef DEBUG 2705 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2706 KASSERT(off == saveoff); 2707 #endif 2708 } 2709 out: 2710 wapbl_free(scratch1, MAXBSIZE); 2711 wapbl_free(scratch2, MAXBSIZE); 2712 if (!error && mismatchcnt) 2713 error = EFTYPE; 2714 return error; 2715 } 2716 #endif 2717 2718 int 2719 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) 2720 { 2721 struct wapbl_blk *wb; 2722 size_t i; 2723 off_t off; 2724 void *scratch; 2725 int error = 0; 2726 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2727 2728 KDASSERT(wapbl_replay_isopen(wr)); 2729 2730 scratch = wapbl_malloc(MAXBSIZE); 2731 2732 for (i = 0; i <= wr->wr_blkhashmask; ++i) { 2733 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) { 2734 off = wb->wb_off; 2735 error = wapbl_circ_read(wr, scratch, fsblklen, &off); 2736 if (error) 2737 break; 2738 error = wapbl_write(scratch, fsblklen, fsdevvp, 2739 wb->wb_blk); 2740 if (error) 2741 break; 2742 } 2743 } 2744 2745 wapbl_free(scratch, MAXBSIZE); 2746 return error; 2747 } 2748 2749 int 2750 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len) 2751 { 2752 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2753 2754 KDASSERT(wapbl_replay_isopen(wr)); 2755 KASSERT((len % fsblklen) == 0); 2756 2757 while (len != 0) { 2758 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2759 if (wb) 2760 return 1; 2761 len -= fsblklen; 2762 } 2763 return 0; 2764 } 2765 2766 int 2767 
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) 2768 { 2769 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2770 2771 KDASSERT(wapbl_replay_isopen(wr)); 2772 2773 KASSERT((len % fsblklen) == 0); 2774 2775 while (len != 0) { 2776 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2777 if (wb) { 2778 off_t off = wb->wb_off; 2779 int error; 2780 error = wapbl_circ_read(wr, data, fsblklen, &off); 2781 if (error) 2782 return error; 2783 } 2784 data = (uint8_t *)data + fsblklen; 2785 len -= fsblklen; 2786 blk++; 2787 } 2788 return 0; 2789 } 2790 2791 #ifdef _KERNEL 2792 /* 2793 * This is not really a module now, but maybe on it's way to 2794 * being one some day. 2795 */ 2796 MODULE(MODULE_CLASS_VFS, wapbl, NULL); 2797 2798 static int 2799 wapbl_modcmd(modcmd_t cmd, void *arg) 2800 { 2801 2802 switch (cmd) { 2803 case MODULE_CMD_INIT: 2804 wapbl_init(); 2805 return 0; 2806 case MODULE_CMD_FINI: 2807 #ifdef notyet 2808 return wapbl_fini(true); 2809 #endif 2810 return EOPNOTSUPP; 2811 default: 2812 return ENOTTY; 2813 } 2814 } 2815 #endif /* _KERNEL */ 2816