1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression) 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/kernel.h> 38 #include <sys/nlookup.h> 39 #include <sys/vnode.h> 40 #include <sys/mount.h> 41 #include <sys/fcntl.h> 42 #include <sys/buf.h> 43 #include <sys/uuid.h> 44 #include <sys/vfsops.h> 45 #include <sys/sysctl.h> 46 #include <sys/socket.h> 47 #include <sys/objcache.h> 48 49 #include <sys/proc.h> 50 #include <sys/namei.h> 51 #include <sys/mountctl.h> 52 #include <sys/dirent.h> 53 #include <sys/uio.h> 54 55 #include <sys/mutex.h> 56 #include <sys/mutex2.h> 57 58 #include "hammer2.h" 59 #include "hammer2_disk.h" 60 #include "hammer2_mount.h" 61 #include "hammer2_lz4.h" 62 63 #include "zlib/hammer2_zlib.h" 64 65 #define REPORT_REFS_ERRORS 1 /* XXX remove me */ 66 67 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache"); 68 69 struct hammer2_sync_info { 70 hammer2_trans_t trans; 71 int error; 72 int waitfor; 73 }; 74 75 TAILQ_HEAD(hammer2_mntlist, hammer2_dev); 76 TAILQ_HEAD(hammer2_pfslist, hammer2_pfs); 77 static struct hammer2_mntlist hammer2_mntlist; 78 static struct hammer2_pfslist hammer2_pfslist; 79 static struct lock hammer2_mntlk; 80 81 int hammer2_debug; 82 int hammer2_cluster_enable = 1; 83 int hammer2_hardlink_enable = 1; 84 int hammer2_flush_pipe = 100; 85 int hammer2_synchronous_flush = 1; 86 int hammer2_dio_count; 87 long hammer2_limit_dirty_chains; 88 long hammer2_iod_file_read; 89 long hammer2_iod_meta_read; 90 long hammer2_iod_indr_read; 91 long hammer2_iod_fmap_read; 92 long hammer2_iod_volu_read; 93 long hammer2_iod_file_write; 94 long hammer2_iod_meta_write; 95 long hammer2_iod_indr_write; 96 long hammer2_iod_fmap_write; 97 long hammer2_iod_volu_write; 98 long hammer2_ioa_file_read; 99 long hammer2_ioa_meta_read; 100 long hammer2_ioa_indr_read; 101 long hammer2_ioa_fmap_read; 102 long hammer2_ioa_volu_read; 103 long hammer2_ioa_fmap_write; 104 long hammer2_ioa_file_write; 105 long hammer2_ioa_meta_write; 106 long hammer2_ioa_indr_write; 107 long hammer2_ioa_volu_write; 108 109 MALLOC_DECLARE(C_BUFFER); 110 MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression."); 111 112 MALLOC_DECLARE(D_BUFFER); 113 MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression."); 114 115 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem"); 116 117 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW, 118 &hammer2_debug, 0, ""); 119 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW, 120 &hammer2_cluster_enable, 0, ""); 121 SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW, 122 &hammer2_hardlink_enable, 0, ""); 123 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW, 124 &hammer2_flush_pipe, 0, ""); 125 SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW, 126 &hammer2_synchronous_flush, 0, ""); 127 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW, 128 &hammer2_limit_dirty_chains, 0, ""); 129 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD, 130 &hammer2_dio_count, 0, ""); 131 132 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW, 133 &hammer2_iod_file_read, 0, ""); 134 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW, 135 &hammer2_iod_meta_read, 0, ""); 136 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW, 137 &hammer2_iod_indr_read, 0, ""); 138 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW, 139 &hammer2_iod_fmap_read, 0, ""); 140 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW, 141 &hammer2_iod_volu_read, 0, ""); 142 143 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW, 144 &hammer2_iod_file_write, 0, ""); 145 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW, 146 &hammer2_iod_meta_write, 0, ""); 147 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW, 148 &hammer2_iod_indr_write, 0, ""); 149 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW, 150 &hammer2_iod_fmap_write, 0, ""); 151 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW, 152 &hammer2_iod_volu_write, 0, ""); 153 154 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW, 155 &hammer2_ioa_file_read, 0, ""); 156 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW, 157 &hammer2_ioa_meta_read, 0, ""); 158 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW, 159 &hammer2_ioa_indr_read, 0, ""); 160 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW, 161 &hammer2_ioa_fmap_read, 0, ""); 162 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW, 163 &hammer2_ioa_volu_read, 0, ""); 164 165 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW, 166 &hammer2_ioa_file_write, 0, ""); 167 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW, 168 &hammer2_ioa_meta_write, 0, ""); 169 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW, 170 &hammer2_ioa_indr_write, 0, ""); 171 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW, 172 &hammer2_ioa_fmap_write, 0, ""); 173 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW, 174 &hammer2_ioa_volu_write, 0, ""); 175 176 static int hammer2_vfs_init(struct vfsconf *conf); 177 static int hammer2_vfs_uninit(struct vfsconf *vfsp); 178 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, 179 struct ucred *cred); 180 static int hammer2_remount(hammer2_dev_t *, struct mount *, char *, 181 struct vnode *, struct ucred *); 182 static int hammer2_recovery(hammer2_dev_t *hmp); 183 static int hammer2_vfs_unmount(struct mount *mp, int mntflags); 184 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp); 185 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, 186 struct ucred *cred); 187 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, 188 struct ucred *cred); 189 static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp, 190 ino_t ino, struct vnode **vpp); 191 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 192 struct fid *fhp, struct vnode **vpp); 193 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp); 194 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 195 int *exflagsp, struct ucred **credanonp); 196 197 static int hammer2_install_volume_header(hammer2_dev_t *hmp); 198 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data); 199 200 static void hammer2_update_pmps(hammer2_dev_t *hmp); 201 static void hammer2_write_thread(void *arg); 202 203 static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp); 204 static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, 205 hammer2_dev_t *hmp); 206 207 /* 208 * Functions for compression in threads, 209 * from hammer2_vnops.c 210 */ 211 static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans, 212 hammer2_inode_t *ip, 213 const hammer2_inode_data_t *ripdata, 214 hammer2_cluster_t *cparent, 215 hammer2_key_t lbase, int ioflag, int pblksize, 216 int *errorp); 217 static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans, 218 hammer2_inode_t *ip, 219 const hammer2_inode_data_t *ripdata, 220 hammer2_cluster_t *cparent, 221 hammer2_key_t lbase, int ioflag, 222 int pblksize, int *errorp, 223 int comp_algo, int check_algo); 224 static void hammer2_zero_check_and_write(struct buf *bp, 225 hammer2_trans_t *trans, hammer2_inode_t *ip, 226 const hammer2_inode_data_t *ripdata, 227 hammer2_cluster_t *cparent, 228 hammer2_key_t lbase, 229 int ioflag, int pblksize, int *errorp, 230 int check_algo); 231 static int test_block_zeros(const char *buf, size_t bytes); 232 static void zero_write(struct buf *bp, hammer2_trans_t *trans, 233 hammer2_inode_t *ip, 234 const hammer2_inode_data_t *ripdata, 235 hammer2_cluster_t *cparent, 236 hammer2_key_t lbase, 237 int *errorp); 238 static void hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, 239 int ioflag, int pblksize, int *errorp, 240 int check_algo); 241 242 /* 243 * HAMMER2 vfs operations. 244 */ 245 static struct vfsops hammer2_vfsops = { 246 .vfs_init = hammer2_vfs_init, 247 .vfs_uninit = hammer2_vfs_uninit, 248 .vfs_sync = hammer2_vfs_sync, 249 .vfs_mount = hammer2_vfs_mount, 250 .vfs_unmount = hammer2_vfs_unmount, 251 .vfs_root = hammer2_vfs_root, 252 .vfs_statfs = hammer2_vfs_statfs, 253 .vfs_statvfs = hammer2_vfs_statvfs, 254 .vfs_vget = hammer2_vfs_vget, 255 .vfs_vptofh = hammer2_vfs_vptofh, 256 .vfs_fhtovp = hammer2_vfs_fhtovp, 257 .vfs_checkexp = hammer2_vfs_checkexp 258 }; 259 260 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", ""); 261 262 VFS_SET(hammer2_vfsops, hammer2, 0); 263 MODULE_VERSION(hammer2, 1); 264 265 static 266 int 267 hammer2_vfs_init(struct vfsconf *conf) 268 { 269 static struct objcache_malloc_args margs_read; 270 static struct objcache_malloc_args margs_write; 271 272 int error; 273 274 error = 0; 275 276 if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref)) 277 error = EINVAL; 278 if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data)) 279 error = EINVAL; 280 if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data)) 281 error = EINVAL; 282 283 if (error) 284 kprintf("HAMMER2 structure size mismatch; cannot continue.\n"); 285 286 margs_read.objsize = 65536; 287 margs_read.mtype = D_BUFFER; 288 289 margs_write.objsize = 32768; 290 margs_write.mtype = C_BUFFER; 291 292 cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc, 293 0, 1, NULL, NULL, NULL, objcache_malloc_alloc, 294 objcache_malloc_free, &margs_read); 295 cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc, 296 0, 1, NULL, NULL, NULL, objcache_malloc_alloc, 297 objcache_malloc_free, &margs_write); 298 299 lockinit(&hammer2_mntlk, "mntlk", 0, 0); 300 TAILQ_INIT(&hammer2_mntlist); 301 TAILQ_INIT(&hammer2_pfslist); 302 303 hammer2_limit_dirty_chains = desiredvnodes / 10; 304 305 return (error); 306 } 307 308 static 309 int 310 hammer2_vfs_uninit(struct vfsconf *vfsp __unused) 311 { 312 objcache_destroy(cache_buffer_read); 313 objcache_destroy(cache_buffer_write); 314 return 0; 315 } 316 317 /* 318 * Core PFS allocator. Used to allocate the pmp structure for PFS cluster 319 * mounts and the spmp structure for media (hmp) structures. 320 * 321 * pmp->modify_tid tracks new modify_tid transaction ids for front-end 322 * transactions. Note that synchronization does not use this field. 323 * (typically frontend operations and synchronization cannot run on the 324 * same PFS node at the same time). 325 * 326 * XXX check locking 327 */ 328 hammer2_pfs_t * 329 hammer2_pfsalloc(hammer2_cluster_t *cluster, 330 const hammer2_inode_data_t *ripdata, 331 hammer2_tid_t modify_tid) 332 { 333 hammer2_chain_t *rchain; 334 hammer2_inode_t *iroot; 335 hammer2_pfs_t *pmp; 336 int count; 337 int i; 338 int j; 339 340 /* 341 * Locate or create the PFS based on the cluster id. If ripdata 342 * is NULL this is a spmp which is unique and is always allocated. 343 */ 344 if (ripdata) { 345 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) { 346 if (bcmp(&pmp->pfs_clid, &ripdata->pfs_clid, 347 sizeof(pmp->pfs_clid)) == 0) { 348 break; 349 } 350 } 351 } else { 352 pmp = NULL; 353 } 354 355 if (pmp == NULL) { 356 pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO); 357 hammer2_trans_manage_init(&pmp->tmanage); 358 kmalloc_create(&pmp->minode, "HAMMER2-inodes"); 359 kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg"); 360 lockinit(&pmp->lock, "pfslk", 0, 0); 361 spin_init(&pmp->inum_spin, "hm2pfsalloc_inum"); 362 RB_INIT(&pmp->inum_tree); 363 TAILQ_INIT(&pmp->unlinkq); 364 spin_init(&pmp->list_spin, "hm2pfsalloc_list"); 365 366 /* 367 * Save the last media transaction id for the flusher. Set 368 * initial 369 */ 370 if (ripdata) 371 pmp->pfs_clid = ripdata->pfs_clid; 372 hammer2_mtx_init(&pmp->wthread_mtx, "h2wthr"); 373 bioq_init(&pmp->wthread_bioq); 374 TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry); 375 376 /* 377 * The synchronization thread may start too early, make 378 * sure it stays frozen until we are ready to let it go. 379 * XXX 380 */ 381 /* 382 pmp->primary_thr.flags = HAMMER2_SYNCTHR_FROZEN | 383 HAMMER2_SYNCTHR_REMASTER; 384 */ 385 } 386 387 /* 388 * Create the PFS's root inode. 389 */ 390 if ((iroot = pmp->iroot) == NULL) { 391 iroot = hammer2_inode_get(pmp, NULL, NULL); 392 pmp->iroot = iroot; 393 hammer2_inode_ref(iroot); 394 hammer2_inode_unlock(iroot, NULL); 395 } 396 397 /* 398 * Stop here if no cluster is passed in. 399 */ 400 if (cluster == NULL) 401 goto done; 402 403 /* 404 * When a cluster is passed in we must add the cluster's chains 405 * to the PFS's root inode, update pmp->pfs_types[], and update 406 * the syncronization threads. 407 * 408 * At the moment empty spots can develop due to removals or failures. 409 * Ultimately we want to re-fill these spots but doing so might 410 * confused running code. XXX 411 */ 412 hammer2_inode_ref(iroot); 413 hammer2_mtx_ex(&iroot->lock); 414 j = iroot->cluster.nchains; 415 416 kprintf("add PFS to pmp %p[%d]\n", pmp, j); 417 418 for (i = 0; i < cluster->nchains; ++i) { 419 if (j == HAMMER2_MAXCLUSTER) 420 break; 421 rchain = cluster->array[i].chain; 422 KKASSERT(rchain->pmp == NULL); 423 rchain->pmp = pmp; 424 hammer2_chain_ref(rchain); 425 iroot->cluster.array[j].chain = rchain; 426 pmp->pfs_types[j] = ripdata->pfs_type; 427 pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2); 428 429 /* 430 * If the PFS is already mounted we must account 431 * for the mount_count here. 432 */ 433 if (pmp->mp) 434 ++rchain->hmp->mount_count; 435 436 /* 437 * May have to fixup dirty chain tracking. Previous 438 * pmp was NULL so nothing to undo. 439 */ 440 if (rchain->flags & HAMMER2_CHAIN_MODIFIED) 441 hammer2_pfs_memory_inc(pmp); 442 ++j; 443 } 444 iroot->cluster.nchains = j; 445 446 if (i != cluster->nchains) { 447 kprintf("hammer2_mount: cluster full!\n"); 448 /* XXX fatal error? */ 449 } 450 451 /* 452 * Update nmasters from any PFS inode which is part of the cluster. 453 * It is possible that this will result in a value which is too 454 * high. MASTER PFSs are authoritative for pfs_nmasters and will 455 * override this value later on. 456 * 457 * (This informs us of masters that might not currently be 458 * discoverable by this mount). 459 */ 460 if (ripdata && pmp->pfs_nmasters < ripdata->pfs_nmasters) { 461 pmp->pfs_nmasters = ripdata->pfs_nmasters; 462 } 463 464 /* 465 * Count visible masters. Masters are usually added with 466 * ripdata->pfs_nmasters set to 1. This detects when there 467 * are more (XXX and must update the master inodes). 468 */ 469 count = 0; 470 for (i = 0; i < iroot->cluster.nchains; ++i) { 471 if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) 472 ++count; 473 } 474 if (pmp->pfs_nmasters < count) 475 pmp->pfs_nmasters = count; 476 477 /* 478 * Create missing synchronization threads. 479 * 480 * Single-node masters (including snapshots) have nothing to 481 * synchronize and do not require this thread. 482 * 483 * Multi-node masters or any number of soft masters, slaves, copy, 484 * or other PFS types need the thread. 485 * 486 * Each thread is responsible for its particular cluster index. 487 * We use independent threads so stalls or mismatches related to 488 * any given target do not affect other targets. 489 */ 490 for (i = 0; i < iroot->cluster.nchains; ++i) { 491 if (pmp->sync_thrs[i].td) 492 continue; 493 if ((pmp->pfs_nmasters > 1 && 494 (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)) || 495 pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER) { 496 hammer2_syncthr_create(&pmp->sync_thrs[i], pmp, i, 497 hammer2_syncthr_primary); 498 } 499 } 500 501 hammer2_mtx_unlock(&iroot->lock); 502 hammer2_inode_drop(iroot); 503 done: 504 return pmp; 505 } 506 507 /* 508 * Destroy a PFS, typically only occurs after the last mount on a device 509 * has gone away. 510 */ 511 static void 512 hammer2_pfsfree(hammer2_pfs_t *pmp) 513 { 514 hammer2_inode_t *iroot; 515 int i; 516 517 /* 518 * Cleanup our reference on iroot. iroot is (should) not be needed 519 * by the flush code. 520 */ 521 TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry); 522 523 iroot = pmp->iroot; 524 if (iroot) { 525 for (i = 0; i < iroot->cluster.nchains; ++i) 526 hammer2_syncthr_delete(&pmp->sync_thrs[i]); 527 #if REPORT_REFS_ERRORS 528 if (pmp->iroot->refs != 1) 529 kprintf("PMP->IROOT %p REFS WRONG %d\n", 530 pmp->iroot, pmp->iroot->refs); 531 #else 532 KKASSERT(pmp->iroot->refs == 1); 533 #endif 534 /* ref for pmp->iroot */ 535 hammer2_inode_drop(pmp->iroot); 536 pmp->iroot = NULL; 537 } 538 539 kmalloc_destroy(&pmp->mmsg); 540 kmalloc_destroy(&pmp->minode); 541 542 kfree(pmp, M_HAMMER2); 543 } 544 545 /* 546 * Remove all references to hmp from the pfs list. Any PFS which becomes 547 * empty is terminated and freed. 548 * 549 * XXX inefficient. 550 */ 551 static void 552 hammer2_pfsfree_scan(hammer2_dev_t *hmp) 553 { 554 hammer2_pfs_t *pmp; 555 hammer2_inode_t *iroot; 556 hammer2_cluster_t *cluster; 557 hammer2_chain_t *rchain; 558 int didfreeze; 559 int i; 560 561 again: 562 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) { 563 if ((iroot = pmp->iroot) == NULL) 564 continue; 565 if (hmp->spmp == pmp) { 566 kprintf("unmount hmp %p remove spmp %p\n", 567 hmp, pmp); 568 hmp->spmp = NULL; 569 } 570 571 /* 572 * Determine if this PFS is affected. If it is we must 573 * freeze all management threads and lock its iroot. 574 * 575 * Freezing a management thread forces it idle, operations 576 * in-progress will be aborted and it will have to start 577 * over again when unfrozen, or exit if told to exit. 578 */ 579 cluster = &iroot->cluster; 580 for (i = 0; i < cluster->nchains; ++i) { 581 rchain = cluster->array[i].chain; 582 if (rchain == NULL || rchain->hmp != hmp) 583 continue; 584 break; 585 } 586 if (i != cluster->nchains) { 587 /* 588 * Make sure all synchronization threads are locked 589 * down. 590 */ 591 for (i = 0; i < iroot->cluster.nchains; ++i) 592 hammer2_syncthr_freeze(&pmp->sync_thrs[i]); 593 594 /* 595 * Lock the inode and clean out matching chains. 596 * Note that we cannot use hammer2_inode_lock_*() 597 * here because that would attempt to validate the 598 * cluster that we are in the middle of ripping 599 * apart. 600 * 601 * WARNING! We are working directly on the inodes 602 * embedded cluster. 603 */ 604 hammer2_mtx_ex(&iroot->lock); 605 606 /* 607 * Remove the chain from matching elements of the PFS. 608 */ 609 for (i = 0; i < cluster->nchains; ++i) { 610 rchain = cluster->array[i].chain; 611 if (rchain == NULL || rchain->hmp != hmp) 612 continue; 613 hammer2_syncthr_delete(&pmp->sync_thrs[i]); 614 rchain = cluster->array[i].chain; 615 cluster->array[i].chain = NULL; 616 pmp->pfs_types[i] = 0; 617 if (pmp->pfs_names[i]) { 618 kfree(pmp->pfs_names[i], M_HAMMER2); 619 pmp->pfs_names[i] = NULL; 620 } 621 hammer2_chain_drop(rchain); 622 623 /* focus hint */ 624 if (cluster->focus == rchain) 625 cluster->focus = NULL; 626 } 627 hammer2_mtx_unlock(&iroot->lock); 628 didfreeze = 1; /* remaster, unfreeze down below */ 629 } else { 630 didfreeze = 0; 631 } 632 633 /* 634 * Cleanup trailing chains. Do not reorder chains (for now). 635 * XXX might remove more than we intended. 636 */ 637 while (i > 0) { 638 if (cluster->array[i - 1].chain) 639 break; 640 --i; 641 } 642 cluster->nchains = i; 643 644 /* 645 * If the PMP has no elements remaining we can destroy it. 646 * (this will transition management threads from frozen->exit). 647 */ 648 if (cluster->nchains == 0) { 649 kprintf("unmount hmp %p last ref to PMP=%p\n", 650 hmp, pmp); 651 hammer2_pfsfree(pmp); 652 goto again; 653 } 654 655 /* 656 * If elements still remain we need to set the REMASTER 657 * flag and unfreeze it. 658 */ 659 if (didfreeze) { 660 for (i = 0; i < iroot->cluster.nchains; ++i) { 661 hammer2_syncthr_remaster(&pmp->sync_thrs[i]); 662 hammer2_syncthr_unfreeze(&pmp->sync_thrs[i]); 663 } 664 } 665 } 666 } 667 668 /* 669 * Mount or remount HAMMER2 fileystem from physical media 670 * 671 * mountroot 672 * mp mount point structure 673 * path NULL 674 * data <unused> 675 * cred <unused> 676 * 677 * mount 678 * mp mount point structure 679 * path path to mount point 680 * data pointer to argument structure in user space 681 * volume volume path (device@LABEL form) 682 * hflags user mount flags 683 * cred user credentials 684 * 685 * RETURNS: 0 Success 686 * !0 error number 687 */ 688 static 689 int 690 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, 691 struct ucred *cred) 692 { 693 struct hammer2_mount_info info; 694 hammer2_pfs_t *pmp; 695 hammer2_pfs_t *spmp; 696 hammer2_dev_t *hmp; 697 hammer2_key_t key_next; 698 hammer2_key_t key_dummy; 699 hammer2_key_t lhc; 700 struct vnode *devvp; 701 struct nlookupdata nd; 702 hammer2_chain_t *parent; 703 hammer2_cluster_t *cluster; 704 hammer2_cluster_t *cparent; 705 const hammer2_inode_data_t *ripdata; 706 hammer2_blockref_t bref; 707 struct file *fp; 708 char devstr[MNAMELEN]; 709 size_t size; 710 size_t done; 711 char *dev; 712 char *label; 713 int ronly = 1; 714 int error; 715 int cache_index; 716 int i; 717 718 hmp = NULL; 719 pmp = NULL; 720 dev = NULL; 721 label = NULL; 722 devvp = NULL; 723 cache_index = -1; 724 725 kprintf("hammer2_mount\n"); 726 727 if (path == NULL) { 728 /* 729 * Root mount 730 */ 731 bzero(&info, sizeof(info)); 732 info.cluster_fd = -1; 733 return (EOPNOTSUPP); 734 } else { 735 /* 736 * Non-root mount or updating a mount 737 */ 738 error = copyin(data, &info, sizeof(info)); 739 if (error) 740 return (error); 741 742 error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done); 743 if (error) 744 return (error); 745 746 /* Extract device and label */ 747 dev = devstr; 748 label = strchr(devstr, '@'); 749 if (label == NULL || 750 ((label + 1) - dev) > done) { 751 return (EINVAL); 752 } 753 *label = '\0'; 754 label++; 755 if (*label == '\0') 756 return (EINVAL); 757 758 if (mp->mnt_flag & MNT_UPDATE) { 759 /* 760 * Update mount. Note that pmp->iroot->cluster is 761 * an inode-embedded cluster and thus cannot be 762 * directly locked. 763 * 764 * XXX HAMMER2 needs to implement NFS export via 765 * mountctl. 766 */ 767 pmp = MPTOPMP(mp); 768 cluster = &pmp->iroot->cluster; 769 for (i = 0; i < cluster->nchains; ++i) { 770 if (cluster->array[i].chain == NULL) 771 continue; 772 hmp = cluster->array[i].chain->hmp; 773 devvp = hmp->devvp; 774 error = hammer2_remount(hmp, mp, path, 775 devvp, cred); 776 if (error) 777 break; 778 } 779 /*hammer2_inode_install_hidden(pmp);*/ 780 781 return error; 782 } 783 } 784 785 /* 786 * HMP device mount 787 * 788 * Lookup name and verify it refers to a block device. 789 */ 790 error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW); 791 if (error == 0) 792 error = nlookup(&nd); 793 if (error == 0) 794 error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp); 795 nlookup_done(&nd); 796 797 if (error == 0) { 798 if (vn_isdisk(devvp, &error)) 799 error = vfs_mountedon(devvp); 800 } 801 802 /* 803 * Determine if the device has already been mounted. After this 804 * check hmp will be non-NULL if we are doing the second or more 805 * hammer2 mounts from the same device. 806 */ 807 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE); 808 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) { 809 if (hmp->devvp == devvp) 810 break; 811 } 812 813 /* 814 * Open the device if this isn't a secondary mount and construct 815 * the H2 device mount (hmp). 816 */ 817 if (hmp == NULL) { 818 hammer2_chain_t *schain; 819 hammer2_xid_t xid; 820 821 if (error == 0 && vcount(devvp) > 0) 822 error = EBUSY; 823 824 /* 825 * Now open the device 826 */ 827 if (error == 0) { 828 ronly = ((mp->mnt_flag & MNT_RDONLY) != 0); 829 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 830 error = vinvalbuf(devvp, V_SAVE, 0, 0); 831 if (error == 0) { 832 error = VOP_OPEN(devvp, 833 ronly ? FREAD : FREAD | FWRITE, 834 FSCRED, NULL); 835 } 836 vn_unlock(devvp); 837 } 838 if (error && devvp) { 839 vrele(devvp); 840 devvp = NULL; 841 } 842 if (error) { 843 lockmgr(&hammer2_mntlk, LK_RELEASE); 844 return error; 845 } 846 hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO); 847 ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev); 848 hmp->ronly = ronly; 849 hmp->devvp = devvp; 850 kmalloc_create(&hmp->mchain, "HAMMER2-chains"); 851 TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry); 852 RB_INIT(&hmp->iotree); 853 spin_init(&hmp->io_spin, "hm2mount_io"); 854 spin_init(&hmp->list_spin, "hm2mount_list"); 855 TAILQ_INIT(&hmp->flushq); 856 857 lockinit(&hmp->vollk, "h2vol", 0, 0); 858 859 /* 860 * vchain setup. vchain.data is embedded. 861 * vchain.refs is initialized and will never drop to 0. 862 * 863 * NOTE! voldata is not yet loaded. 864 */ 865 hmp->vchain.hmp = hmp; 866 hmp->vchain.refs = 1; 867 hmp->vchain.data = (void *)&hmp->voldata; 868 hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME; 869 hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; 870 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; 871 872 hammer2_chain_core_init(&hmp->vchain); 873 /* hmp->vchain.u.xxx is left NULL */ 874 875 /* 876 * fchain setup. fchain.data is embedded. 877 * fchain.refs is initialized and will never drop to 0. 878 * 879 * The data is not used but needs to be initialized to 880 * pass assertion muster. We use this chain primarily 881 * as a placeholder for the freemap's top-level RBTREE 882 * so it does not interfere with the volume's topology 883 * RBTREE. 884 */ 885 hmp->fchain.hmp = hmp; 886 hmp->fchain.refs = 1; 887 hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset; 888 hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP; 889 hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; 890 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; 891 hmp->fchain.bref.methods = 892 HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) | 893 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE); 894 895 hammer2_chain_core_init(&hmp->fchain); 896 /* hmp->fchain.u.xxx is left NULL */ 897 898 /* 899 * Install the volume header and initialize fields from 900 * voldata. 901 */ 902 error = hammer2_install_volume_header(hmp); 903 if (error) { 904 hammer2_unmount_helper(mp, NULL, hmp); 905 lockmgr(&hammer2_mntlk, LK_RELEASE); 906 hammer2_vfs_unmount(mp, MNT_FORCE); 907 return error; 908 } 909 910 /* 911 * Really important to get these right or flush will get 912 * confused. 913 */ 914 hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0); 915 kprintf("alloc spmp %p tid %016jx\n", 916 hmp->spmp, hmp->voldata.mirror_tid); 917 spmp = hmp->spmp; 918 919 /* 920 * Dummy-up vchain and fchain's modify_tid. mirror_tid 921 * is inherited from the volume header. 922 */ 923 xid = 0; 924 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; 925 hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid; 926 hmp->vchain.pmp = spmp; 927 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; 928 hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid; 929 hmp->fchain.pmp = spmp; 930 931 /* 932 * First locate the super-root inode, which is key 0 933 * relative to the volume header's blockset. 934 * 935 * Then locate the root inode by scanning the directory keyspace 936 * represented by the label. 937 */ 938 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 939 schain = hammer2_chain_lookup(&parent, &key_dummy, 940 HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY, 941 &cache_index, 0); 942 hammer2_chain_lookup_done(parent); 943 if (schain == NULL) { 944 kprintf("hammer2_mount: invalid super-root\n"); 945 hammer2_unmount_helper(mp, NULL, hmp); 946 lockmgr(&hammer2_mntlk, LK_RELEASE); 947 hammer2_vfs_unmount(mp, MNT_FORCE); 948 return EINVAL; 949 } 950 if (schain->error) { 951 kprintf("hammer2_mount: error %s reading super-root\n", 952 hammer2_error_str(schain->error)); 953 hammer2_chain_unlock(schain); 954 hammer2_chain_drop(schain); 955 schain = NULL; 956 hammer2_unmount_helper(mp, NULL, hmp); 957 lockmgr(&hammer2_mntlk, LK_RELEASE); 958 hammer2_vfs_unmount(mp, MNT_FORCE); 959 return EINVAL; 960 } 961 962 /* 963 * The super-root always uses an inode_tid of 1 when 964 * creating PFSs. 965 */ 966 spmp->inode_tid = 1; 967 spmp->modify_tid = schain->bref.modify_tid; 968 969 /* 970 * Sanity-check schain's pmp and finish initialization. 971 * Any chain belonging to the super-root topology should 972 * have a NULL pmp (not even set to spmp). 973 */ 974 ripdata = &hammer2_chain_rdata(schain)->ipdata; 975 KKASSERT(schain->pmp == NULL); 976 spmp->pfs_clid = ripdata->pfs_clid; 977 978 /* 979 * Replace the dummy spmp->iroot with a real one. It's 980 * easier to just do a wholesale replacement than to try 981 * to update the chain and fixup the iroot fields. 982 * 983 * The returned inode is locked with the supplied cluster. 984 */ 985 cluster = hammer2_cluster_from_chain(schain); 986 hammer2_inode_drop(spmp->iroot); 987 spmp->iroot = NULL; 988 spmp->iroot = hammer2_inode_get(spmp, NULL, cluster); 989 spmp->spmp_hmp = hmp; 990 spmp->pfs_types[0] = ripdata->pfs_type; 991 hammer2_inode_ref(spmp->iroot); 992 hammer2_inode_unlock(spmp->iroot, cluster); 993 schain = NULL; 994 /* leave spmp->iroot with one ref */ 995 996 if ((mp->mnt_flag & MNT_RDONLY) == 0) { 997 error = hammer2_recovery(hmp); 998 /* XXX do something with error */ 999 } 1000 hammer2_update_pmps(hmp); 1001 hammer2_iocom_init(hmp); 1002 1003 /* 1004 * Ref the cluster management messaging descriptor. The mount 1005 * program deals with the other end of the communications pipe. 1006 */ 1007 fp = holdfp(curproc->p_fd, info.cluster_fd, -1); 1008 if (fp) { 1009 hammer2_cluster_reconnect(hmp, fp); 1010 } else { 1011 kprintf("hammer2_mount: bad cluster_fd!\n"); 1012 } 1013 } else { 1014 spmp = hmp->spmp; 1015 } 1016 1017 /* 1018 * Lookup the mount point under the media-localized super-root. 1019 * Scanning hammer2_pfslist doesn't help us because it represents 1020 * PFS cluster ids which can aggregate several named PFSs together. 1021 * 1022 * cluster->pmp will incorrectly point to spmp and must be fixed 1023 * up later on. 1024 */ 1025 cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS); 1026 lhc = hammer2_dirhash(label, strlen(label)); 1027 cluster = hammer2_cluster_lookup(cparent, &key_next, 1028 lhc, lhc + HAMMER2_DIRHASH_LOMASK, 1029 0); 1030 while (cluster) { 1031 if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE && 1032 strcmp(label, 1033 hammer2_cluster_rdata(cluster)->ipdata.filename) == 0) { 1034 break; 1035 } 1036 cluster = hammer2_cluster_next(cparent, cluster, &key_next, 1037 key_next, 1038 lhc + HAMMER2_DIRHASH_LOMASK, 0); 1039 } 1040 hammer2_inode_unlock(spmp->iroot, cparent); 1041 1042 /* 1043 * PFS could not be found? 1044 */ 1045 if (cluster == NULL) { 1046 kprintf("hammer2_mount: PFS label not found\n"); 1047 hammer2_unmount_helper(mp, NULL, hmp); 1048 lockmgr(&hammer2_mntlk, LK_RELEASE); 1049 hammer2_vfs_unmount(mp, MNT_FORCE); 1050 1051 return EINVAL; 1052 } 1053 1054 /* 1055 * Acquire the pmp structure (it should have already been allocated 1056 * via hammer2_update_pmps() so do not pass cluster in to add to 1057 * available chains). 1058 * 1059 * Check if the cluster has already been mounted. A cluster can 1060 * only be mounted once, use null mounts to mount additional copies. 1061 */ 1062 ripdata = &hammer2_cluster_rdata(cluster)->ipdata; 1063 hammer2_cluster_bref(cluster, &bref); 1064 pmp = hammer2_pfsalloc(NULL, ripdata, bref.modify_tid); 1065 hammer2_cluster_unlock(cluster); 1066 hammer2_cluster_drop(cluster); 1067 1068 if (pmp->mp) { 1069 kprintf("hammer2_mount: PFS already mounted!\n"); 1070 hammer2_unmount_helper(mp, NULL, hmp); 1071 lockmgr(&hammer2_mntlk, LK_RELEASE); 1072 hammer2_vfs_unmount(mp, MNT_FORCE); 1073 1074 return EBUSY; 1075 } 1076 1077 /* 1078 * Finish the mount 1079 */ 1080 kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp); 1081 1082 mp->mnt_flag = MNT_LOCAL; 1083 mp->mnt_kern_flag |= MNTK_ALL_MPSAFE; /* all entry pts are SMP */ 1084 mp->mnt_kern_flag |= MNTK_THR_SYNC; /* new vsyncscan semantics */ 1085 1086 /* 1087 * required mount structure initializations 1088 */ 1089 mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE; 1090 mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE; 1091 1092 mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE; 1093 mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE; 1094 1095 /* 1096 * Optional fields 1097 */ 1098 mp->mnt_iosize_max = MAXPHYS; 1099 1100 /* 1101 * Connect up mount pointers. 1102 */ 1103 hammer2_mount_helper(mp, pmp); 1104 1105 lockmgr(&hammer2_mntlk, LK_RELEASE); 1106 1107 /* 1108 * A mounted PFS needs a write thread for logical buffers and 1109 * a hidden directory for deletions of open files. These features 1110 * are not used by unmounted PFSs. 1111 * 1112 * The logical file buffer bio write thread handles things like 1113 * physical block assignment and compression. 1114 */ 1115 pmp->wthread_destroy = 0; 1116 lwkt_create(hammer2_write_thread, pmp, 1117 &pmp->wthread_td, NULL, 0, -1, "h2pfs-%s", label); 1118 1119 /* 1120 * With the cluster operational install ihidden. 1121 * (only applicable to pfs mounts, not applicable to spmp) 1122 */ 1123 hammer2_inode_install_hidden(pmp); 1124 1125 /* 1126 * Finish setup 1127 */ 1128 vfs_getnewfsid(mp); 1129 vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops); 1130 vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops); 1131 vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops); 1132 1133 copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); 1134 bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); 1135 bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname)); 1136 copyinstr(path, mp->mnt_stat.f_mntonname, 1137 sizeof(mp->mnt_stat.f_mntonname) - 1, 1138 &size); 1139 1140 /* 1141 * Initial statfs to prime mnt_stat. 1142 */ 1143 hammer2_vfs_statfs(mp, &mp->mnt_stat, cred); 1144 1145 return 0; 1146 } 1147 1148 /* 1149 * Scan PFSs under the super-root and create hammer2_pfs structures. 1150 */ 1151 static 1152 void 1153 hammer2_update_pmps(hammer2_dev_t *hmp) 1154 { 1155 const hammer2_inode_data_t *ripdata; 1156 hammer2_cluster_t *cparent; 1157 hammer2_cluster_t *cluster; 1158 hammer2_blockref_t bref; 1159 hammer2_pfs_t *spmp; 1160 hammer2_pfs_t *pmp; 1161 hammer2_key_t key_next; 1162 1163 /* 1164 * Lookup mount point under the media-localized super-root. 1165 * 1166 * cluster->pmp will incorrectly point to spmp and must be fixed 1167 * up later on. 1168 */ 1169 spmp = hmp->spmp; 1170 cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS); 1171 cluster = hammer2_cluster_lookup(cparent, &key_next, 1172 HAMMER2_KEY_MIN, 1173 HAMMER2_KEY_MAX, 1174 0); 1175 while (cluster) { 1176 if (hammer2_cluster_type(cluster) != HAMMER2_BREF_TYPE_INODE) 1177 continue; 1178 ripdata = &hammer2_cluster_rdata(cluster)->ipdata; 1179 hammer2_cluster_bref(cluster, &bref); 1180 kprintf("ADD LOCAL PFS: %s\n", ripdata->filename); 1181 1182 pmp = hammer2_pfsalloc(cluster, ripdata, bref.modify_tid); 1183 cluster = hammer2_cluster_next(cparent, cluster, 1184 &key_next, 1185 key_next, 1186 HAMMER2_KEY_MAX, 1187 0); 1188 } 1189 hammer2_inode_unlock(spmp->iroot, cparent); 1190 } 1191 1192 /* 1193 * Handle bioq for strategy write 1194 */ 1195 static 1196 void 1197 hammer2_write_thread(void *arg) 1198 { 1199 hammer2_pfs_t *pmp; 1200 struct bio *bio; 1201 struct buf *bp; 1202 hammer2_trans_t trans; 1203 struct vnode *vp; 1204 hammer2_inode_t *ip; 1205 hammer2_cluster_t *cparent; 1206 const hammer2_inode_data_t *ripdata; 1207 hammer2_key_t lbase; 1208 int lblksize; 1209 int pblksize; 1210 int error; 1211 1212 pmp = arg; 1213 1214 hammer2_mtx_ex(&pmp->wthread_mtx); 1215 for (;;) { 1216 /* 1217 * Wait for work. Break out and destroy the thread only if 1218 * requested and no work remains. 1219 */ 1220 if (bioq_first(&pmp->wthread_bioq) == NULL) { 1221 if (pmp->wthread_destroy) 1222 break; 1223 mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx, 1224 0, "h2bioqw", 0); 1225 continue; 1226 } 1227 1228 /* 1229 * Special transaction for logical buffer cache writes. 1230 */ 1231 hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE); 1232 1233 while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) { 1234 /* 1235 * dummy bio for synchronization. The transaction 1236 * must be terminated. 1237 */ 1238 if (bio->bio_buf == NULL) { 1239 bio->bio_flags |= BIO_DONE; 1240 /* bio will become invalid after DONE set */ 1241 wakeup(bio); 1242 break; 1243 } 1244 1245 /* 1246 * else normal bio processing 1247 */ 1248 hammer2_mtx_unlock(&pmp->wthread_mtx); 1249 1250 hammer2_lwinprog_drop(pmp); 1251 1252 error = 0; 1253 bp = bio->bio_buf; 1254 vp = bp->b_vp; 1255 ip = VTOI(vp); 1256 1257 /* 1258 * Inode is modified, flush size and mtime changes 1259 * to ensure that the file size remains consistent 1260 * with the buffers being flushed. 1261 * 1262 * NOTE: The inode_fsync() call only flushes the 1263 * inode's meta-data state, it doesn't try 1264 * to flush underlying buffers or chains. 1265 * 1266 * NOTE: hammer2_write_file_core() may indirectly 1267 * modify and modsync the inode. 1268 */ 1269 cparent = hammer2_inode_lock(ip, 1270 HAMMER2_RESOLVE_ALWAYS); 1271 if (ip->flags & (HAMMER2_INODE_RESIZED | 1272 HAMMER2_INODE_MTIME)) { 1273 hammer2_inode_fsync(&trans, ip, cparent); 1274 } 1275 ripdata = &hammer2_cluster_rdata(cparent)->ipdata; 1276 lblksize = hammer2_calc_logical(ip, bio->bio_offset, 1277 &lbase, NULL); 1278 pblksize = hammer2_calc_physical(ip, ripdata, lbase); 1279 hammer2_write_file_core(bp, &trans, ip, ripdata, 1280 cparent, 1281 lbase, IO_ASYNC, 1282 pblksize, &error); 1283 /* ripdata can be invalid after call */ 1284 hammer2_inode_unlock(ip, cparent); 1285 if (error) { 1286 kprintf("hammer2: error in buffer write\n"); 1287 bp->b_flags |= B_ERROR; 1288 bp->b_error = EIO; 1289 } 1290 biodone(bio); 1291 hammer2_mtx_ex(&pmp->wthread_mtx); 1292 } 1293 hammer2_trans_done(&trans); 1294 } 1295 pmp->wthread_destroy = -1; 1296 wakeup(&pmp->wthread_destroy); 1297 1298 hammer2_mtx_unlock(&pmp->wthread_mtx); 1299 } 1300 1301 void 1302 hammer2_bioq_sync(hammer2_pfs_t *pmp) 1303 { 1304 struct bio sync_bio; 1305 1306 bzero(&sync_bio, sizeof(sync_bio)); /* dummy with no bio_buf */ 1307 hammer2_mtx_ex(&pmp->wthread_mtx); 1308 if (pmp->wthread_destroy == 0 && 1309 TAILQ_FIRST(&pmp->wthread_bioq.queue)) { 1310 bioq_insert_tail(&pmp->wthread_bioq, &sync_bio); 1311 while ((sync_bio.bio_flags & BIO_DONE) == 0) 1312 mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0); 1313 } 1314 hammer2_mtx_unlock(&pmp->wthread_mtx); 1315 } 1316 1317 /* 1318 * Return a chain suitable for I/O, creating the chain if necessary 1319 * and assigning its physical block. The cluster will be in a modified 1320 * state. 1321 * 1322 * cparent can wind up being anything. 1323 * 1324 * NOTE: Special case for data embedded in inode. 1325 */ 1326 static 1327 hammer2_cluster_t * 1328 hammer2_assign_physical(hammer2_trans_t *trans, 1329 hammer2_inode_t *ip, hammer2_cluster_t *cparent, 1330 hammer2_key_t lbase, int pblksize, int *errorp) 1331 { 1332 hammer2_cluster_t *cluster; 1333 hammer2_cluster_t *dparent; 1334 hammer2_key_t key_dummy; 1335 int pradix = hammer2_getradix(pblksize); 1336 1337 /* 1338 * Locate the chain associated with lbase, return a locked chain. 1339 * However, do not instantiate any data reference (which utilizes a 1340 * device buffer) because we will be using direct IO via the 1341 * logical buffer cache buffer. 1342 */ 1343 *errorp = 0; 1344 KKASSERT(pblksize >= HAMMER2_ALLOC_MIN); 1345 retry: 1346 dparent = hammer2_cluster_lookup_init(cparent, 0); 1347 cluster = hammer2_cluster_lookup(dparent, &key_dummy, 1348 lbase, lbase, 1349 HAMMER2_LOOKUP_NODATA); 1350 1351 if (cluster == NULL) { 1352 /* 1353 * We found a hole, create a new chain entry. 1354 * 1355 * NOTE: DATA chains are created without device backing 1356 * store (nor do we want any). 1357 */ 1358 *errorp = hammer2_cluster_create(trans, dparent, &cluster, 1359 lbase, HAMMER2_PBUFRADIX, 1360 HAMMER2_BREF_TYPE_DATA, 1361 pblksize, 0); 1362 if (cluster == NULL) { 1363 hammer2_cluster_lookup_done(dparent); 1364 panic("hammer2_cluster_create: par=%p error=%d\n", 1365 dparent->focus, *errorp); 1366 goto retry; 1367 } 1368 /*ip->delta_dcount += pblksize;*/ 1369 } else { 1370 switch (hammer2_cluster_type(cluster)) { 1371 case HAMMER2_BREF_TYPE_INODE: 1372 /* 1373 * The data is embedded in the inode, which requires 1374 * a bit more finess. 1375 */ 1376 hammer2_cluster_modify_ip(trans, ip, cluster, 0); 1377 break; 1378 case HAMMER2_BREF_TYPE_DATA: 1379 if (hammer2_cluster_need_resize(cluster, pblksize)) { 1380 hammer2_cluster_resize(trans, ip, 1381 dparent, cluster, 1382 pradix, 1383 HAMMER2_MODIFY_OPTDATA); 1384 } 1385 1386 /* 1387 * DATA buffers must be marked modified whether the 1388 * data is in a logical buffer or not. We also have 1389 * to make this call to fixup the chain data pointers 1390 * after resizing in case this is an encrypted or 1391 * compressed buffer. 1392 */ 1393 hammer2_cluster_modify(trans, cluster, 1394 HAMMER2_MODIFY_OPTDATA); 1395 break; 1396 default: 1397 panic("hammer2_assign_physical: bad type"); 1398 /* NOT REACHED */ 1399 break; 1400 } 1401 } 1402 1403 /* 1404 * Cleanup. If cluster wound up being the inode itself, i.e. 1405 * the DIRECTDATA case for offset 0, then we need to update cparent. 1406 * The caller expects cparent to not become stale. 1407 */ 1408 hammer2_cluster_lookup_done(dparent); 1409 /* dparent = NULL; safety */ 1410 return (cluster); 1411 } 1412 1413 /* 1414 * bio queued from hammer2_vnops.c. 1415 * 1416 * The core write function which determines which path to take 1417 * depending on compression settings. We also have to locate the 1418 * related clusters so we can calculate and set the check data for 1419 * the blockref. 1420 */ 1421 static 1422 void 1423 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans, 1424 hammer2_inode_t *ip, 1425 const hammer2_inode_data_t *ripdata, 1426 hammer2_cluster_t *cparent, 1427 hammer2_key_t lbase, int ioflag, int pblksize, 1428 int *errorp) 1429 { 1430 hammer2_cluster_t *cluster; 1431 1432 switch(HAMMER2_DEC_ALGO(ripdata->comp_algo)) { 1433 case HAMMER2_COMP_NONE: 1434 /* 1435 * We have to assign physical storage to the buffer 1436 * we intend to dirty or write now to avoid deadlocks 1437 * in the strategy code later. 1438 * 1439 * This can return NOOFFSET for inode-embedded data. 1440 * The strategy code will take care of it in that case. 1441 */ 1442 cluster = hammer2_assign_physical(trans, ip, cparent, 1443 lbase, pblksize, 1444 errorp); 1445 if (cluster->ddflag) { 1446 hammer2_inode_data_t *wipdata; 1447 1448 wipdata = hammer2_cluster_modify_ip(trans, ip, 1449 cluster, 0); 1450 KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA); 1451 KKASSERT(bp->b_loffset == 0); 1452 bcopy(bp->b_data, wipdata->u.data, 1453 HAMMER2_EMBEDDED_BYTES); 1454 hammer2_cluster_modsync(cluster); 1455 } else { 1456 hammer2_write_bp(cluster, bp, ioflag, pblksize, 1457 errorp, ripdata->check_algo); 1458 } 1459 /* ripdata can become invalid */ 1460 if (cluster) { 1461 hammer2_cluster_unlock(cluster); 1462 hammer2_cluster_drop(cluster); 1463 } 1464 break; 1465 case HAMMER2_COMP_AUTOZERO: 1466 /* 1467 * Check for zero-fill only 1468 */ 1469 hammer2_zero_check_and_write(bp, trans, ip, 1470 ripdata, cparent, lbase, 1471 ioflag, pblksize, errorp, 1472 ripdata->check_algo); 1473 break; 1474 case HAMMER2_COMP_LZ4: 1475 case HAMMER2_COMP_ZLIB: 1476 default: 1477 /* 1478 * Check for zero-fill and attempt compression. 1479 */ 1480 hammer2_compress_and_write(bp, trans, ip, 1481 ripdata, cparent, 1482 lbase, ioflag, 1483 pblksize, errorp, 1484 ripdata->comp_algo, 1485 ripdata->check_algo); 1486 break; 1487 } 1488 } 1489 1490 /* 1491 * Generic function that will perform the compression in compression 1492 * write path. The compression algorithm is determined by the settings 1493 * obtained from inode. 1494 */ 1495 static 1496 void 1497 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans, 1498 hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata, 1499 hammer2_cluster_t *cparent, 1500 hammer2_key_t lbase, int ioflag, int pblksize, 1501 int *errorp, int comp_algo, int check_algo) 1502 { 1503 hammer2_cluster_t *cluster; 1504 hammer2_chain_t *chain; 1505 int comp_size; 1506 int comp_block_size; 1507 int i; 1508 char *comp_buffer; 1509 1510 if (test_block_zeros(bp->b_data, pblksize)) { 1511 zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp); 1512 return; 1513 } 1514 1515 comp_size = 0; 1516 comp_buffer = NULL; 1517 1518 KKASSERT(pblksize / 2 <= 32768); 1519 1520 if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) { 1521 z_stream strm_compress; 1522 int comp_level; 1523 int ret; 1524 1525 switch(HAMMER2_DEC_ALGO(comp_algo)) { 1526 case HAMMER2_COMP_LZ4: 1527 comp_buffer = objcache_get(cache_buffer_write, 1528 M_INTWAIT); 1529 comp_size = LZ4_compress_limitedOutput( 1530 bp->b_data, 1531 &comp_buffer[sizeof(int)], 1532 pblksize, 1533 pblksize / 2 - sizeof(int)); 1534 /* 1535 * We need to prefix with the size, LZ4 1536 * doesn't do it for us. Add the related 1537 * overhead. 1538 */ 1539 *(int *)comp_buffer = comp_size; 1540 if (comp_size) 1541 comp_size += sizeof(int); 1542 break; 1543 case HAMMER2_COMP_ZLIB: 1544 comp_level = HAMMER2_DEC_LEVEL(comp_algo); 1545 if (comp_level == 0) 1546 comp_level = 6; /* default zlib compression */ 1547 else if (comp_level < 6) 1548 comp_level = 6; 1549 else if (comp_level > 9) 1550 comp_level = 9; 1551 ret = deflateInit(&strm_compress, comp_level); 1552 if (ret != Z_OK) { 1553 kprintf("HAMMER2 ZLIB: fatal error " 1554 "on deflateInit.\n"); 1555 } 1556 1557 comp_buffer = objcache_get(cache_buffer_write, 1558 M_INTWAIT); 1559 strm_compress.next_in = bp->b_data; 1560 strm_compress.avail_in = pblksize; 1561 strm_compress.next_out = comp_buffer; 1562 strm_compress.avail_out = pblksize / 2; 1563 ret = deflate(&strm_compress, Z_FINISH); 1564 if (ret == Z_STREAM_END) { 1565 comp_size = pblksize / 2 - 1566 strm_compress.avail_out; 1567 } else { 1568 comp_size = 0; 1569 } 1570 ret = deflateEnd(&strm_compress); 1571 break; 1572 default: 1573 kprintf("Error: Unknown compression method.\n"); 1574 kprintf("Comp_method = %d.\n", comp_algo); 1575 break; 1576 } 1577 } 1578 1579 if (comp_size == 0) { 1580 /* 1581 * compression failed or turned off 1582 */ 1583 comp_block_size = pblksize; /* safety */ 1584 if (++ip->comp_heuristic > 128) 1585 ip->comp_heuristic = 8; 1586 } else { 1587 /* 1588 * compression succeeded 1589 */ 1590 ip->comp_heuristic = 0; 1591 if (comp_size <= 1024) { 1592 comp_block_size = 1024; 1593 } else if (comp_size <= 2048) { 1594 comp_block_size = 2048; 1595 } else if (comp_size <= 4096) { 1596 comp_block_size = 4096; 1597 } else if (comp_size <= 8192) { 1598 comp_block_size = 8192; 1599 } else if (comp_size <= 16384) { 1600 comp_block_size = 16384; 1601 } else if (comp_size <= 32768) { 1602 comp_block_size = 32768; 1603 } else { 1604 panic("hammer2: WRITE PATH: " 1605 "Weird comp_size value."); 1606 /* NOT REACHED */ 1607 comp_block_size = pblksize; 1608 } 1609 } 1610 1611 cluster = hammer2_assign_physical(trans, ip, cparent, 1612 lbase, comp_block_size, 1613 errorp); 1614 ripdata = NULL; 1615 1616 if (*errorp) { 1617 kprintf("WRITE PATH: An error occurred while " 1618 "assigning physical space.\n"); 1619 KKASSERT(cluster == NULL); 1620 goto done; 1621 } 1622 1623 if (cluster->ddflag) { 1624 hammer2_inode_data_t *wipdata; 1625 1626 wipdata = &hammer2_cluster_wdata(cluster)->ipdata; 1627 KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA); 1628 KKASSERT(bp->b_loffset == 0); 1629 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES); 1630 hammer2_cluster_modsync(cluster); 1631 } else 1632 for (i = 0; i < cluster->nchains; ++i) { 1633 hammer2_io_t *dio; 1634 char *bdata; 1635 1636 /* XXX hackx */ 1637 1638 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 1639 continue; 1640 chain = cluster->array[i].chain; /* XXX */ 1641 if (chain == NULL) 1642 continue; 1643 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 1644 1645 switch(chain->bref.type) { 1646 case HAMMER2_BREF_TYPE_INODE: 1647 panic("hammer2_write_bp: unexpected inode\n"); 1648 break; 1649 case HAMMER2_BREF_TYPE_DATA: 1650 /* 1651 * Optimize out the read-before-write 1652 * if possible. 1653 */ 1654 *errorp = hammer2_io_newnz(chain->hmp, 1655 chain->bref.data_off, 1656 chain->bytes, 1657 &dio); 1658 if (*errorp) { 1659 hammer2_io_brelse(&dio); 1660 kprintf("hammer2: WRITE PATH: " 1661 "dbp bread error\n"); 1662 break; 1663 } 1664 bdata = hammer2_io_data(dio, chain->bref.data_off); 1665 1666 /* 1667 * When loading the block make sure we don't 1668 * leave garbage after the compressed data. 1669 */ 1670 if (comp_size) { 1671 chain->bref.methods = 1672 HAMMER2_ENC_COMP(comp_algo) + 1673 HAMMER2_ENC_CHECK(check_algo); 1674 bcopy(comp_buffer, bdata, comp_size); 1675 if (comp_size != comp_block_size) { 1676 bzero(bdata + comp_size, 1677 comp_block_size - comp_size); 1678 } 1679 } else { 1680 chain->bref.methods = 1681 HAMMER2_ENC_COMP( 1682 HAMMER2_COMP_NONE) + 1683 HAMMER2_ENC_CHECK(check_algo); 1684 bcopy(bp->b_data, bdata, pblksize); 1685 } 1686 1687 /* 1688 * The flush code doesn't calculate check codes for 1689 * file data (doing so can result in excessive I/O), 1690 * so we do it here. 1691 */ 1692 hammer2_chain_setcheck(chain, bdata); 1693 1694 /* 1695 * Device buffer is now valid, chain is no longer in 1696 * the initial state. 1697 * 1698 * (No blockref table worries with file data) 1699 */ 1700 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1701 1702 /* Now write the related bdp. */ 1703 if (ioflag & IO_SYNC) { 1704 /* 1705 * Synchronous I/O requested. 1706 */ 1707 hammer2_io_bwrite(&dio); 1708 /* 1709 } else if ((ioflag & IO_DIRECT) && 1710 loff + n == pblksize) { 1711 hammer2_io_bdwrite(&dio); 1712 */ 1713 } else if (ioflag & IO_ASYNC) { 1714 hammer2_io_bawrite(&dio); 1715 } else { 1716 hammer2_io_bdwrite(&dio); 1717 } 1718 break; 1719 default: 1720 panic("hammer2_write_bp: bad chain type %d\n", 1721 chain->bref.type); 1722 /* NOT REACHED */ 1723 break; 1724 } 1725 } 1726 done: 1727 if (cluster) { 1728 hammer2_cluster_unlock(cluster); 1729 hammer2_cluster_drop(cluster); 1730 } 1731 if (comp_buffer) 1732 objcache_put(cache_buffer_write, comp_buffer); 1733 } 1734 1735 /* 1736 * Function that performs zero-checking and writing without compression, 1737 * it corresponds to default zero-checking path. 1738 */ 1739 static 1740 void 1741 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans, 1742 hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata, 1743 hammer2_cluster_t *cparent, 1744 hammer2_key_t lbase, int ioflag, int pblksize, int *errorp, 1745 int check_algo) 1746 { 1747 hammer2_cluster_t *cluster; 1748 1749 if (test_block_zeros(bp->b_data, pblksize)) { 1750 zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp); 1751 /* ripdata can become invalid */ 1752 } else { 1753 cluster = hammer2_assign_physical(trans, ip, cparent, 1754 lbase, pblksize, errorp); 1755 hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp, 1756 check_algo); 1757 /* ripdata can become invalid */ 1758 if (cluster) { 1759 hammer2_cluster_unlock(cluster); 1760 hammer2_cluster_drop(cluster); 1761 } 1762 } 1763 } 1764 1765 /* 1766 * A function to test whether a block of data contains only zeros, 1767 * returns TRUE (non-zero) if the block is all zeros. 1768 */ 1769 static 1770 int 1771 test_block_zeros(const char *buf, size_t bytes) 1772 { 1773 size_t i; 1774 1775 for (i = 0; i < bytes; i += sizeof(long)) { 1776 if (*(const long *)(buf + i) != 0) 1777 return (0); 1778 } 1779 return (1); 1780 } 1781 1782 /* 1783 * Function to "write" a block that contains only zeros. 1784 */ 1785 static 1786 void 1787 zero_write(struct buf *bp, hammer2_trans_t *trans, 1788 hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata, 1789 hammer2_cluster_t *cparent, 1790 hammer2_key_t lbase, int *errorp __unused) 1791 { 1792 hammer2_cluster_t *cluster; 1793 hammer2_key_t key_dummy; 1794 1795 cparent = hammer2_cluster_lookup_init(cparent, 0); 1796 cluster = hammer2_cluster_lookup(cparent, &key_dummy, lbase, lbase, 1797 HAMMER2_LOOKUP_NODATA); 1798 if (cluster) { 1799 if (cluster->ddflag) { 1800 hammer2_inode_data_t *wipdata; 1801 1802 wipdata = hammer2_cluster_modify_ip(trans, ip, 1803 cluster, 0); 1804 KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA); 1805 KKASSERT(bp->b_loffset == 0); 1806 bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES); 1807 hammer2_cluster_modsync(cluster); 1808 } else { 1809 hammer2_cluster_delete(trans, cparent, cluster, 1810 HAMMER2_DELETE_PERMANENT); 1811 } 1812 hammer2_cluster_unlock(cluster); 1813 hammer2_cluster_drop(cluster); 1814 } 1815 hammer2_cluster_lookup_done(cparent); 1816 } 1817 1818 /* 1819 * Function to write the data as it is, without performing any sort of 1820 * compression. This function is used in path without compression and 1821 * default zero-checking path. 1822 */ 1823 static 1824 void 1825 hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag, 1826 int pblksize, int *errorp, int check_algo) 1827 { 1828 hammer2_chain_t *chain; 1829 hammer2_inode_data_t *wipdata; 1830 hammer2_io_t *dio; 1831 char *bdata; 1832 int error; 1833 int i; 1834 1835 error = 0; /* XXX TODO below */ 1836 1837 for (i = 0; i < cluster->nchains; ++i) { 1838 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 1839 continue; 1840 chain = cluster->array[i].chain; /* XXX */ 1841 if (chain == NULL) 1842 continue; 1843 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 1844 1845 switch(chain->bref.type) { 1846 case HAMMER2_BREF_TYPE_INODE: 1847 wipdata = &hammer2_chain_wdata(chain)->ipdata; 1848 KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA); 1849 KKASSERT(bp->b_loffset == 0); 1850 bcopy(bp->b_data, wipdata->u.data, 1851 HAMMER2_EMBEDDED_BYTES); 1852 error = 0; 1853 break; 1854 case HAMMER2_BREF_TYPE_DATA: 1855 error = hammer2_io_newnz(chain->hmp, 1856 chain->bref.data_off, 1857 chain->bytes, &dio); 1858 if (error) { 1859 hammer2_io_bqrelse(&dio); 1860 kprintf("hammer2: WRITE PATH: " 1861 "dbp bread error\n"); 1862 break; 1863 } 1864 bdata = hammer2_io_data(dio, chain->bref.data_off); 1865 1866 chain->bref.methods = HAMMER2_ENC_COMP( 1867 HAMMER2_COMP_NONE) + 1868 HAMMER2_ENC_CHECK(check_algo); 1869 bcopy(bp->b_data, bdata, chain->bytes); 1870 1871 /* 1872 * The flush code doesn't calculate check codes for 1873 * file data (doing so can result in excessive I/O), 1874 * so we do it here. 1875 */ 1876 hammer2_chain_setcheck(chain, bdata); 1877 1878 /* 1879 * Device buffer is now valid, chain is no longer in 1880 * the initial state. 1881 * 1882 * (No blockref table worries with file data) 1883 */ 1884 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1885 1886 if (ioflag & IO_SYNC) { 1887 /* 1888 * Synchronous I/O requested. 1889 */ 1890 hammer2_io_bwrite(&dio); 1891 /* 1892 } else if ((ioflag & IO_DIRECT) && 1893 loff + n == pblksize) { 1894 hammer2_io_bdwrite(&dio); 1895 */ 1896 } else if (ioflag & IO_ASYNC) { 1897 hammer2_io_bawrite(&dio); 1898 } else { 1899 hammer2_io_bdwrite(&dio); 1900 } 1901 break; 1902 default: 1903 panic("hammer2_write_bp: bad chain type %d\n", 1904 chain->bref.type); 1905 /* NOT REACHED */ 1906 error = 0; 1907 break; 1908 } 1909 KKASSERT(error == 0); /* XXX TODO */ 1910 } 1911 *errorp = error; 1912 } 1913 1914 static 1915 int 1916 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path, 1917 struct vnode *devvp, struct ucred *cred) 1918 { 1919 int error; 1920 1921 if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { 1922 error = hammer2_recovery(hmp); 1923 } else { 1924 error = 0; 1925 } 1926 return error; 1927 } 1928 1929 static 1930 int 1931 hammer2_vfs_unmount(struct mount *mp, int mntflags) 1932 { 1933 hammer2_pfs_t *pmp; 1934 int flags; 1935 int error = 0; 1936 1937 pmp = MPTOPMP(mp); 1938 1939 if (pmp == NULL) 1940 return(0); 1941 1942 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE); 1943 1944 /* 1945 * If mount initialization proceeded far enough we must flush 1946 * its vnodes and sync the underlying mount points. Three syncs 1947 * are required to fully flush the filesystem (freemap updates lag 1948 * by one flush, and one extra for safety). 1949 */ 1950 if (mntflags & MNT_FORCE) 1951 flags = FORCECLOSE; 1952 else 1953 flags = 0; 1954 if (pmp->iroot) { 1955 error = vflush(mp, 0, flags); 1956 if (error) 1957 goto failed; 1958 hammer2_vfs_sync(mp, MNT_WAIT); 1959 hammer2_vfs_sync(mp, MNT_WAIT); 1960 hammer2_vfs_sync(mp, MNT_WAIT); 1961 } 1962 1963 if (pmp->wthread_td) { 1964 hammer2_mtx_ex(&pmp->wthread_mtx); 1965 pmp->wthread_destroy = 1; 1966 wakeup(&pmp->wthread_bioq); 1967 while (pmp->wthread_destroy != -1) { 1968 mtxsleep(&pmp->wthread_destroy, 1969 &pmp->wthread_mtx, 0, 1970 "umount-sleep", 0); 1971 } 1972 hammer2_mtx_unlock(&pmp->wthread_mtx); 1973 pmp->wthread_td = NULL; 1974 } 1975 1976 /* 1977 * Cleanup our reference on ihidden. 1978 */ 1979 if (pmp->ihidden) { 1980 hammer2_inode_drop(pmp->ihidden); 1981 pmp->ihidden = NULL; 1982 } 1983 if (pmp->mp) 1984 hammer2_unmount_helper(mp, pmp, NULL); 1985 1986 error = 0; 1987 failed: 1988 lockmgr(&hammer2_mntlk, LK_RELEASE); 1989 1990 return (error); 1991 } 1992 1993 /* 1994 * Mount helper, hook the system mount into our PFS. 1995 * The mount lock is held. 1996 * 1997 * We must bump the mount_count on related devices for any 1998 * mounted PFSs. 1999 */ 2000 static 2001 void 2002 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp) 2003 { 2004 hammer2_cluster_t *cluster; 2005 hammer2_chain_t *rchain; 2006 int i; 2007 2008 mp->mnt_data = (qaddr_t)pmp; 2009 pmp->mp = mp; 2010 2011 /* 2012 * After pmp->mp is set we have to adjust hmp->mount_count. 2013 */ 2014 cluster = &pmp->iroot->cluster; 2015 for (i = 0; i < cluster->nchains; ++i) { 2016 rchain = cluster->array[i].chain; 2017 if (rchain == NULL) 2018 continue; 2019 ++rchain->hmp->mount_count; 2020 kprintf("hammer2_mount hmp=%p ++mount_count=%d\n", 2021 rchain->hmp, rchain->hmp->mount_count); 2022 } 2023 } 2024 2025 /* 2026 * Mount helper, unhook the system mount from our PFS. 2027 * The mount lock is held. 2028 * 2029 * If hmp is supplied a mount responsible for being the first to open 2030 * the block device failed and the block device and all PFSs using the 2031 * block device must be cleaned up. 2032 * 2033 * If pmp is supplied multiple devices might be backing the PFS and each 2034 * must be disconnect. This might not be the last PFS using some of the 2035 * underlying devices. Also, we have to adjust our hmp->mount_count 2036 * accounting for the devices backing the pmp which is now undergoing an 2037 * unmount. 2038 */ 2039 static 2040 void 2041 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp) 2042 { 2043 hammer2_cluster_t *cluster; 2044 hammer2_chain_t *rchain; 2045 struct vnode *devvp; 2046 int dumpcnt; 2047 int ronly = 0; 2048 int i; 2049 2050 /* 2051 * If no device supplied this is a high-level unmount and we have to 2052 * to disconnect the mount, adjust mount_count, and locate devices 2053 * that might now have no mounts. 2054 */ 2055 if (pmp) { 2056 KKASSERT(hmp == NULL); 2057 KKASSERT((void *)(intptr_t)mp->mnt_data == pmp); 2058 pmp->mp = NULL; 2059 mp->mnt_data = NULL; 2060 2061 /* 2062 * After pmp->mp is cleared we have to account for 2063 * mount_count. 2064 */ 2065 cluster = &pmp->iroot->cluster; 2066 for (i = 0; i < cluster->nchains; ++i) { 2067 rchain = cluster->array[i].chain; 2068 if (rchain == NULL) 2069 continue; 2070 --rchain->hmp->mount_count; 2071 kprintf("hammer2_unmount hmp=%p --mount_count=%d\n", 2072 rchain->hmp, rchain->hmp->mount_count); 2073 /* scrapping hmp now may invalidate the pmp */ 2074 } 2075 again: 2076 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) { 2077 if (hmp->mount_count == 0) { 2078 hammer2_unmount_helper(NULL, NULL, hmp); 2079 goto again; 2080 } 2081 } 2082 return; 2083 } 2084 2085 /* 2086 * Try to terminate the block device. We can't terminate it if 2087 * there are still PFSs referencing it. 2088 */ 2089 kprintf("hammer2_unmount hmp=%p mount_count=%d\n", 2090 hmp, hmp->mount_count); 2091 if (hmp->mount_count) 2092 return; 2093 2094 hammer2_pfsfree_scan(hmp); 2095 hammer2_dev_exlock(hmp); /* XXX order */ 2096 2097 /* 2098 * Cycle the volume data lock as a safety (probably not needed any 2099 * more). To ensure everything is out we need to flush at least 2100 * three times. (1) The running of the unlinkq can dirty the 2101 * filesystem, (2) A normal flush can dirty the freemap, and 2102 * (3) ensure that the freemap is fully synchronized. 2103 * 2104 * The next mount's recovery scan can clean everything up but we want 2105 * to leave the filesystem in a 100% clean state on a normal unmount. 2106 */ 2107 #if 0 2108 hammer2_voldata_lock(hmp); 2109 hammer2_voldata_unlock(hmp); 2110 #endif 2111 hammer2_iocom_uninit(hmp); 2112 2113 if ((hmp->vchain.flags | hmp->fchain.flags) & 2114 HAMMER2_CHAIN_FLUSH_MASK) { 2115 kprintf("hammer2_unmount: chains left over " 2116 "after final sync\n"); 2117 kprintf(" vchain %08x\n", hmp->vchain.flags); 2118 kprintf(" fchain %08x\n", hmp->fchain.flags); 2119 2120 if (hammer2_debug & 0x0010) 2121 Debugger("entered debugger"); 2122 } 2123 2124 KKASSERT(hmp->spmp == NULL); 2125 2126 /* 2127 * Finish up with the device vnode 2128 */ 2129 if ((devvp = hmp->devvp) != NULL) { 2130 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 2131 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0); 2132 hmp->devvp = NULL; 2133 VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL); 2134 vn_unlock(devvp); 2135 vrele(devvp); 2136 devvp = NULL; 2137 } 2138 2139 /* 2140 * Clear vchain/fchain flags that might prevent final cleanup 2141 * of these chains. 2142 */ 2143 if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) { 2144 atomic_clear_int(&hmp->vchain.flags, 2145 HAMMER2_CHAIN_MODIFIED); 2146 hammer2_pfs_memory_wakeup(hmp->vchain.pmp); 2147 hammer2_chain_drop(&hmp->vchain); 2148 } 2149 if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) { 2150 atomic_clear_int(&hmp->vchain.flags, 2151 HAMMER2_CHAIN_UPDATE); 2152 hammer2_chain_drop(&hmp->vchain); 2153 } 2154 2155 if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) { 2156 atomic_clear_int(&hmp->fchain.flags, 2157 HAMMER2_CHAIN_MODIFIED); 2158 hammer2_pfs_memory_wakeup(hmp->fchain.pmp); 2159 hammer2_chain_drop(&hmp->fchain); 2160 } 2161 if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) { 2162 atomic_clear_int(&hmp->fchain.flags, 2163 HAMMER2_CHAIN_UPDATE); 2164 hammer2_chain_drop(&hmp->fchain); 2165 } 2166 2167 /* 2168 * Final drop of embedded freemap root chain to 2169 * clean up fchain.core (fchain structure is not 2170 * flagged ALLOCATED so it is cleaned out and then 2171 * left to rot). 2172 */ 2173 hammer2_chain_drop(&hmp->fchain); 2174 2175 /* 2176 * Final drop of embedded volume root chain to clean 2177 * up vchain.core (vchain structure is not flagged 2178 * ALLOCATED so it is cleaned out and then left to 2179 * rot). 2180 */ 2181 dumpcnt = 50; 2182 hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v'); 2183 dumpcnt = 50; 2184 hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f'); 2185 hammer2_dev_unlock(hmp); 2186 hammer2_chain_drop(&hmp->vchain); 2187 2188 hammer2_io_cleanup(hmp, &hmp->iotree); 2189 if (hmp->iofree_count) { 2190 kprintf("io_cleanup: %d I/O's left hanging\n", 2191 hmp->iofree_count); 2192 } 2193 2194 TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry); 2195 kmalloc_destroy(&hmp->mchain); 2196 kfree(hmp, M_HAMMER2); 2197 } 2198 2199 static 2200 int 2201 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp, 2202 ino_t ino, struct vnode **vpp) 2203 { 2204 kprintf("hammer2_vget\n"); 2205 return (EOPNOTSUPP); 2206 } 2207 2208 static 2209 int 2210 hammer2_vfs_root(struct mount *mp, struct vnode **vpp) 2211 { 2212 hammer2_pfs_t *pmp; 2213 hammer2_cluster_t *cparent; 2214 int error; 2215 struct vnode *vp; 2216 2217 pmp = MPTOPMP(mp); 2218 if (pmp->iroot == NULL) { 2219 *vpp = NULL; 2220 error = EINVAL; 2221 } else { 2222 cparent = hammer2_inode_lock(pmp->iroot, 2223 HAMMER2_RESOLVE_ALWAYS | 2224 HAMMER2_RESOLVE_SHARED); 2225 vp = hammer2_igetv(pmp->iroot, cparent, &error); 2226 hammer2_inode_unlock(pmp->iroot, cparent); 2227 *vpp = vp; 2228 if (vp == NULL) 2229 kprintf("vnodefail\n"); 2230 } 2231 2232 return (error); 2233 } 2234 2235 /* 2236 * Filesystem status 2237 * 2238 * XXX incorporate ipdata->inode_quota and data_quota 2239 */ 2240 static 2241 int 2242 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) 2243 { 2244 hammer2_pfs_t *pmp; 2245 hammer2_dev_t *hmp; 2246 hammer2_blockref_t bref; 2247 2248 pmp = MPTOPMP(mp); 2249 KKASSERT(pmp->iroot->cluster.nchains >= 1); 2250 hmp = pmp->iroot->cluster.focus->hmp; /* iroot retains focus */ 2251 bref = pmp->iroot->cluster.focus->bref; /* no lock */ 2252 2253 mp->mnt_stat.f_files = bref.inode_count; 2254 mp->mnt_stat.f_ffree = 0; 2255 mp->mnt_stat.f_blocks = (bref.data_count + 2256 hmp->voldata.allocator_free) / 2257 mp->mnt_vstat.f_bsize; 2258 mp->mnt_stat.f_bfree = hmp->voldata.allocator_free / 2259 mp->mnt_vstat.f_bsize; 2260 mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree; 2261 2262 *sbp = mp->mnt_stat; 2263 return (0); 2264 } 2265 2266 static 2267 int 2268 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) 2269 { 2270 hammer2_pfs_t *pmp; 2271 hammer2_dev_t *hmp; 2272 hammer2_blockref_t bref; 2273 2274 pmp = MPTOPMP(mp); 2275 KKASSERT(pmp->iroot->cluster.nchains >= 1); 2276 hmp = pmp->iroot->cluster.focus->hmp; /* iroot retains focus */ 2277 bref = pmp->iroot->cluster.focus->bref; /* no lock */ 2278 2279 mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE; 2280 mp->mnt_vstat.f_files = bref.inode_count; 2281 mp->mnt_vstat.f_ffree = 0; 2282 mp->mnt_vstat.f_blocks = (bref.data_count + 2283 hmp->voldata.allocator_free) / 2284 mp->mnt_vstat.f_bsize; 2285 mp->mnt_vstat.f_bfree = hmp->voldata.allocator_free / 2286 mp->mnt_vstat.f_bsize; 2287 mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree; 2288 2289 *sbp = mp->mnt_vstat; 2290 return (0); 2291 } 2292 2293 /* 2294 * Mount-time recovery (RW mounts) 2295 * 2296 * Updates to the free block table are allowed to lag flushes by one 2297 * transaction. In case of a crash, then on a fresh mount we must do an 2298 * incremental scan of the last committed transaction id and make sure that 2299 * all related blocks have been marked allocated. 2300 * 2301 * The super-root topology and each PFS has its own transaction id domain, 2302 * so we must track PFS boundary transitions. 2303 */ 2304 struct hammer2_recovery_elm { 2305 TAILQ_ENTRY(hammer2_recovery_elm) entry; 2306 hammer2_chain_t *chain; 2307 hammer2_tid_t sync_tid; 2308 }; 2309 2310 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm); 2311 2312 struct hammer2_recovery_info { 2313 struct hammer2_recovery_list list; 2314 int depth; 2315 }; 2316 2317 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp, 2318 hammer2_chain_t *parent, 2319 struct hammer2_recovery_info *info, 2320 hammer2_tid_t sync_tid); 2321 2322 #define HAMMER2_RECOVERY_MAXDEPTH 10 2323 2324 static 2325 int 2326 hammer2_recovery(hammer2_dev_t *hmp) 2327 { 2328 hammer2_trans_t trans; 2329 struct hammer2_recovery_info info; 2330 struct hammer2_recovery_elm *elm; 2331 hammer2_chain_t *parent; 2332 hammer2_tid_t sync_tid; 2333 hammer2_tid_t mirror_tid; 2334 int error; 2335 int cumulative_error = 0; 2336 2337 hammer2_trans_init(&trans, hmp->spmp, 0); 2338 2339 sync_tid = hmp->voldata.freemap_tid; 2340 mirror_tid = hmp->voldata.mirror_tid; 2341 2342 kprintf("hammer2 mount \"%s\": ", hmp->devrepname); 2343 if (sync_tid >= mirror_tid) { 2344 kprintf(" no recovery needed\n"); 2345 } else { 2346 kprintf(" freemap recovery %016jx-%016jx\n", 2347 sync_tid + 1, mirror_tid); 2348 } 2349 2350 TAILQ_INIT(&info.list); 2351 info.depth = 0; 2352 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 2353 cumulative_error = hammer2_recovery_scan(&trans, hmp, parent, 2354 &info, sync_tid); 2355 hammer2_chain_lookup_done(parent); 2356 2357 while ((elm = TAILQ_FIRST(&info.list)) != NULL) { 2358 TAILQ_REMOVE(&info.list, elm, entry); 2359 parent = elm->chain; 2360 sync_tid = elm->sync_tid; 2361 kfree(elm, M_HAMMER2); 2362 2363 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2364 error = hammer2_recovery_scan(&trans, hmp, parent, 2365 &info, 2366 hmp->voldata.freemap_tid); 2367 hammer2_chain_unlock(parent); 2368 hammer2_chain_drop(parent); /* drop elm->chain ref */ 2369 if (error) 2370 cumulative_error = error; 2371 } 2372 hammer2_trans_done(&trans); 2373 2374 return cumulative_error; 2375 } 2376 2377 static 2378 int 2379 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp, 2380 hammer2_chain_t *parent, 2381 struct hammer2_recovery_info *info, 2382 hammer2_tid_t sync_tid) 2383 { 2384 const hammer2_inode_data_t *ripdata; 2385 hammer2_chain_t *chain; 2386 int cache_index; 2387 int cumulative_error = 0; 2388 int error; 2389 2390 /* 2391 * Adjust freemap to ensure that the block(s) are marked allocated. 2392 */ 2393 if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) { 2394 hammer2_freemap_adjust(trans, hmp, &parent->bref, 2395 HAMMER2_FREEMAP_DORECOVER); 2396 } 2397 2398 /* 2399 * Check type for recursive scan 2400 */ 2401 switch(parent->bref.type) { 2402 case HAMMER2_BREF_TYPE_VOLUME: 2403 /* data already instantiated */ 2404 break; 2405 case HAMMER2_BREF_TYPE_INODE: 2406 /* 2407 * Must instantiate data for DIRECTDATA test and also 2408 * for recursion. 2409 */ 2410 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2411 ripdata = &hammer2_chain_rdata(parent)->ipdata; 2412 if (ripdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) { 2413 /* not applicable to recovery scan */ 2414 hammer2_chain_unlock(parent); 2415 return 0; 2416 } 2417 hammer2_chain_unlock(parent); 2418 break; 2419 case HAMMER2_BREF_TYPE_INDIRECT: 2420 /* 2421 * Must instantiate data for recursion 2422 */ 2423 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2424 hammer2_chain_unlock(parent); 2425 break; 2426 case HAMMER2_BREF_TYPE_DATA: 2427 case HAMMER2_BREF_TYPE_FREEMAP: 2428 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2429 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2430 /* not applicable to recovery scan */ 2431 return 0; 2432 break; 2433 default: 2434 return EDOM; 2435 } 2436 2437 /* 2438 * Defer operation if depth limit reached or if we are crossing a 2439 * PFS boundary. 2440 */ 2441 if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) { 2442 struct hammer2_recovery_elm *elm; 2443 2444 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK); 2445 elm->chain = parent; 2446 elm->sync_tid = sync_tid; 2447 hammer2_chain_ref(parent); 2448 TAILQ_INSERT_TAIL(&info->list, elm, entry); 2449 /* unlocked by caller */ 2450 2451 return(0); 2452 } 2453 2454 2455 /* 2456 * Recursive scan of the last flushed transaction only. We are 2457 * doing this without pmp assignments so don't leave the chains 2458 * hanging around after we are done with them. 2459 */ 2460 cache_index = 0; 2461 chain = hammer2_chain_scan(parent, NULL, &cache_index, 2462 HAMMER2_LOOKUP_NODATA); 2463 while (chain) { 2464 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 2465 if (chain->bref.mirror_tid > sync_tid) { 2466 ++info->depth; 2467 error = hammer2_recovery_scan(trans, hmp, chain, 2468 info, sync_tid); 2469 --info->depth; 2470 if (error) 2471 cumulative_error = error; 2472 } 2473 2474 /* 2475 * Flush the recovery at the PFS boundary to stage it for 2476 * the final flush of the super-root topology. 2477 */ 2478 if ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) && 2479 (chain->flags & HAMMER2_CHAIN_ONFLUSH)) { 2480 hammer2_flush(trans, chain, 1); 2481 } 2482 chain = hammer2_chain_scan(parent, chain, &cache_index, 2483 HAMMER2_LOOKUP_NODATA); 2484 } 2485 2486 return cumulative_error; 2487 } 2488 2489 /* 2490 * Sync a mount point; this is called on a per-mount basis from the 2491 * filesystem syncer process periodically and whenever a user issues 2492 * a sync. 2493 */ 2494 int 2495 hammer2_vfs_sync(struct mount *mp, int waitfor) 2496 { 2497 struct hammer2_sync_info info; 2498 hammer2_inode_t *iroot; 2499 hammer2_chain_t *chain; 2500 hammer2_chain_t *parent; 2501 hammer2_pfs_t *pmp; 2502 hammer2_dev_t *hmp; 2503 int flags; 2504 int error; 2505 int total_error; 2506 int i; 2507 int j; 2508 2509 pmp = MPTOPMP(mp); 2510 iroot = pmp->iroot; 2511 KKASSERT(iroot); 2512 KKASSERT(iroot->pmp == pmp); 2513 2514 /* 2515 * We can't acquire locks on existing vnodes while in a transaction 2516 * without risking a deadlock. This assumes that vfsync() can be 2517 * called without the vnode locked (which it can in DragonFly). 2518 * Otherwise we'd have to implement a multi-pass or flag the lock 2519 * failures and retry. 2520 * 2521 * The reclamation code interlocks with the sync list's token 2522 * (by removing the vnode from the scan list) before unlocking 2523 * the inode, giving us time to ref the inode. 2524 */ 2525 /*flags = VMSC_GETVP;*/ 2526 flags = 0; 2527 if (waitfor & MNT_LAZY) 2528 flags |= VMSC_ONEPASS; 2529 2530 #if 0 2531 /* 2532 * Preflush the vnodes using a normal transaction before interlocking 2533 * with a flush transaction. 2534 */ 2535 hammer2_trans_init(&info.trans, pmp, 0); 2536 info.error = 0; 2537 info.waitfor = MNT_NOWAIT; 2538 vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info); 2539 hammer2_trans_done(&info.trans); 2540 #endif 2541 2542 /* 2543 * Start our flush transaction. This does not return until all 2544 * concurrent transactions have completed and will prevent any 2545 * new transactions from running concurrently, except for the 2546 * buffer cache transactions. 2547 * 2548 * For efficiency do an async pass before making sure with a 2549 * synchronous pass on all related buffer cache buffers. It 2550 * should theoretically not be possible for any new file buffers 2551 * to be instantiated during this sequence. 2552 */ 2553 hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH | 2554 HAMMER2_TRANS_PREFLUSH); 2555 hammer2_run_unlinkq(&info.trans, pmp); 2556 2557 info.error = 0; 2558 info.waitfor = MNT_NOWAIT; 2559 vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info); 2560 info.waitfor = MNT_WAIT; 2561 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2562 2563 /* 2564 * Clear PREFLUSH. This prevents (or asserts on) any new logical 2565 * buffer cache flushes which occur during the flush. Device buffers 2566 * are not affected. 2567 */ 2568 hammer2_bioq_sync(info.trans.pmp); 2569 atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH); 2570 2571 total_error = 0; 2572 2573 /* 2574 * Flush all nodes to synchronize the PFSROOT subtopology to the media. 2575 * 2576 * Note that this flush will not be visible on crash recovery until 2577 * we flush the super-root topology in the next loop. 2578 */ 2579 for (i = 0; iroot && i < iroot->cluster.nchains; ++i) { 2580 chain = iroot->cluster.array[i].chain; 2581 if (chain == NULL) 2582 continue; 2583 2584 hammer2_chain_ref(chain); 2585 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 2586 if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) { 2587 hammer2_flush(&info.trans, chain, 1); 2588 parent = chain->parent; 2589 KKASSERT(chain->pmp != parent->pmp); 2590 hammer2_chain_setflush(&info.trans, parent); 2591 } 2592 hammer2_chain_unlock(chain); 2593 hammer2_chain_drop(chain); 2594 } 2595 hammer2_trans_done(&info.trans); 2596 2597 /* 2598 * Flush all volume roots to synchronize PFS flushes with the 2599 * storage media volume header. This will flush the freemap and 2600 * the superroot topology but stops when it reaches a PFSROOT 2601 * (which we already flushed above). 2602 * 2603 * This is the last step which connects the volume root to the 2604 * PFSROOT dirs flushed above. 2605 * 2606 * Each spmp (representing the hmp's super-root) requires its own 2607 * transaction. 2608 */ 2609 for (i = 0; iroot && i < iroot->cluster.nchains; ++i) { 2610 hammer2_chain_t *tmp; 2611 2612 chain = iroot->cluster.array[i].chain; 2613 if (chain == NULL) 2614 continue; 2615 2616 hmp = chain->hmp; 2617 2618 /* 2619 * We only have to flush each hmp once 2620 */ 2621 for (j = i - 1; j >= 0; --j) { 2622 if ((tmp = iroot->cluster.array[j].chain) != NULL) { 2623 if (tmp->hmp == hmp) 2624 break; 2625 } 2626 } 2627 if (j >= 0) 2628 continue; 2629 2630 /* 2631 * spmp transaction. The super-root is never directly 2632 * mounted so there shouldn't be any vnodes, let alone any 2633 * dirty vnodes associated with it. 2634 */ 2635 hammer2_trans_init(&info.trans, hmp->spmp, 2636 HAMMER2_TRANS_ISFLUSH); 2637 2638 /* 2639 * Media mounts have two 'roots', vchain for the topology 2640 * and fchain for the free block table. Flush both. 2641 * 2642 * Note that the topology and free block table are handled 2643 * independently, so the free block table can wind up being 2644 * ahead of the topology. We depend on the bulk free scan 2645 * code to deal with any loose ends. 2646 */ 2647 hammer2_chain_ref(&hmp->vchain); 2648 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS); 2649 hammer2_chain_ref(&hmp->fchain); 2650 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); 2651 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) { 2652 /* 2653 * This will also modify vchain as a side effect, 2654 * mark vchain as modified now. 2655 */ 2656 hammer2_voldata_modify(hmp); 2657 chain = &hmp->fchain; 2658 hammer2_flush(&info.trans, chain, 1); 2659 KKASSERT(chain == &hmp->fchain); 2660 } 2661 hammer2_chain_unlock(&hmp->fchain); 2662 hammer2_chain_unlock(&hmp->vchain); 2663 hammer2_chain_drop(&hmp->fchain); 2664 /* vchain dropped down below */ 2665 2666 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS); 2667 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) { 2668 chain = &hmp->vchain; 2669 hammer2_flush(&info.trans, chain, 1); 2670 KKASSERT(chain == &hmp->vchain); 2671 } 2672 hammer2_chain_unlock(&hmp->vchain); 2673 hammer2_chain_drop(&hmp->vchain); 2674 2675 error = 0; 2676 2677 /* 2678 * We can't safely flush the volume header until we have 2679 * flushed any device buffers which have built up. 2680 * 2681 * XXX this isn't being incremental 2682 */ 2683 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY); 2684 error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0); 2685 vn_unlock(hmp->devvp); 2686 2687 /* 2688 * The flush code sets CHAIN_VOLUMESYNC to indicate that the 2689 * volume header needs synchronization via hmp->volsync. 2690 * 2691 * XXX synchronize the flag & data with only this flush XXX 2692 */ 2693 if (error == 0 && 2694 (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) { 2695 struct buf *bp; 2696 2697 /* 2698 * Synchronize the disk before flushing the volume 2699 * header. 2700 */ 2701 bp = getpbuf(NULL); 2702 bp->b_bio1.bio_offset = 0; 2703 bp->b_bufsize = 0; 2704 bp->b_bcount = 0; 2705 bp->b_cmd = BUF_CMD_FLUSH; 2706 bp->b_bio1.bio_done = biodone_sync; 2707 bp->b_bio1.bio_flags |= BIO_SYNC; 2708 vn_strategy(hmp->devvp, &bp->b_bio1); 2709 biowait(&bp->b_bio1, "h2vol"); 2710 relpbuf(bp, NULL); 2711 2712 /* 2713 * Then we can safely flush the version of the 2714 * volume header synchronized by the flush code. 2715 */ 2716 i = hmp->volhdrno + 1; 2717 if (i >= HAMMER2_NUM_VOLHDRS) 2718 i = 0; 2719 if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE > 2720 hmp->volsync.volu_size) { 2721 i = 0; 2722 } 2723 kprintf("sync volhdr %d %jd\n", 2724 i, (intmax_t)hmp->volsync.volu_size); 2725 bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64, 2726 HAMMER2_PBUFSIZE, 0, 0); 2727 atomic_clear_int(&hmp->vchain.flags, 2728 HAMMER2_CHAIN_VOLUMESYNC); 2729 bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE); 2730 bawrite(bp); 2731 hmp->volhdrno = i; 2732 } 2733 if (error) 2734 total_error = error; 2735 2736 hammer2_trans_done(&info.trans); /* spmp trans */ 2737 } 2738 return (total_error); 2739 } 2740 2741 /* 2742 * Sync passes. 2743 */ 2744 static int 2745 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data) 2746 { 2747 struct hammer2_sync_info *info = data; 2748 hammer2_inode_t *ip; 2749 int error; 2750 2751 /* 2752 * Degenerate cases. Note that ip == NULL typically means the 2753 * syncer vnode itself and we don't want to vclrisdirty() in that 2754 * situation. 2755 */ 2756 ip = VTOI(vp); 2757 if (ip == NULL) { 2758 return(0); 2759 } 2760 if (vp->v_type == VNON || vp->v_type == VBAD) { 2761 vclrisdirty(vp); 2762 return(0); 2763 } 2764 2765 /* 2766 * VOP_FSYNC will start a new transaction so replicate some code 2767 * here to do it inline (see hammer2_vop_fsync()). 2768 * 2769 * WARNING: The vfsync interacts with the buffer cache and might 2770 * block, we can't hold the inode lock at that time. 2771 * However, we MUST ref ip before blocking to ensure that 2772 * it isn't ripped out from under us (since we do not 2773 * hold a lock on the vnode). 2774 */ 2775 hammer2_inode_ref(ip); 2776 atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED); 2777 if ((ip->flags & HAMMER2_INODE_MODIFIED) || 2778 !RB_EMPTY(&vp->v_rbdirty_tree)) { 2779 vfsync(vp, info->waitfor, 1, NULL, NULL); 2780 } 2781 if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 && 2782 RB_EMPTY(&vp->v_rbdirty_tree)) { 2783 vclrisdirty(vp); 2784 } 2785 2786 hammer2_inode_drop(ip); 2787 #if 1 2788 error = 0; 2789 if (error) 2790 info->error = error; 2791 #endif 2792 return(0); 2793 } 2794 2795 static 2796 int 2797 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp) 2798 { 2799 return (0); 2800 } 2801 2802 static 2803 int 2804 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 2805 struct fid *fhp, struct vnode **vpp) 2806 { 2807 return (0); 2808 } 2809 2810 static 2811 int 2812 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 2813 int *exflagsp, struct ucred **credanonp) 2814 { 2815 return (0); 2816 } 2817 2818 /* 2819 * Support code for hammer2_vfs_mount(). Read, verify, and install the volume 2820 * header into the HMP 2821 * 2822 * XXX read four volhdrs and use the one with the highest TID whos CRC 2823 * matches. 2824 * 2825 * XXX check iCRCs. 2826 * 2827 * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to 2828 * nonexistant locations. 2829 * 2830 * XXX Record selected volhdr and ring updates to each of 4 volhdrs 2831 */ 2832 static 2833 int 2834 hammer2_install_volume_header(hammer2_dev_t *hmp) 2835 { 2836 hammer2_volume_data_t *vd; 2837 struct buf *bp; 2838 hammer2_crc32_t crc0, crc, bcrc0, bcrc; 2839 int error_reported; 2840 int error; 2841 int valid; 2842 int i; 2843 2844 error_reported = 0; 2845 error = 0; 2846 valid = 0; 2847 bp = NULL; 2848 2849 /* 2850 * There are up to 4 copies of the volume header (syncs iterate 2851 * between them so there is no single master). We don't trust the 2852 * volu_size field so we don't know precisely how large the filesystem 2853 * is, so depend on the OS to return an error if we go beyond the 2854 * block device's EOF. 2855 */ 2856 for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) { 2857 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64, 2858 HAMMER2_VOLUME_BYTES, &bp); 2859 if (error) { 2860 brelse(bp); 2861 bp = NULL; 2862 continue; 2863 } 2864 2865 vd = (struct hammer2_volume_data *) bp->b_data; 2866 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) && 2867 (vd->magic != HAMMER2_VOLUME_ID_ABO)) { 2868 brelse(bp); 2869 bp = NULL; 2870 continue; 2871 } 2872 2873 if (vd->magic == HAMMER2_VOLUME_ID_ABO) { 2874 /* XXX: Reversed-endianness filesystem */ 2875 kprintf("hammer2: reverse-endian filesystem detected"); 2876 brelse(bp); 2877 bp = NULL; 2878 continue; 2879 } 2880 2881 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0]; 2882 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF, 2883 HAMMER2_VOLUME_ICRC0_SIZE); 2884 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1]; 2885 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF, 2886 HAMMER2_VOLUME_ICRC1_SIZE); 2887 if ((crc0 != crc) || (bcrc0 != bcrc)) { 2888 kprintf("hammer2 volume header crc " 2889 "mismatch copy #%d %08x/%08x\n", 2890 i, crc0, crc); 2891 error_reported = 1; 2892 brelse(bp); 2893 bp = NULL; 2894 continue; 2895 } 2896 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) { 2897 valid = 1; 2898 hmp->voldata = *vd; 2899 hmp->volhdrno = i; 2900 } 2901 brelse(bp); 2902 bp = NULL; 2903 } 2904 if (valid) { 2905 hmp->volsync = hmp->voldata; 2906 error = 0; 2907 if (error_reported || bootverbose || 1) { /* 1/DEBUG */ 2908 kprintf("hammer2: using volume header #%d\n", 2909 hmp->volhdrno); 2910 } 2911 } else { 2912 error = EINVAL; 2913 kprintf("hammer2: no valid volume headers found!\n"); 2914 } 2915 return (error); 2916 } 2917 2918 /* 2919 * This handles hysteresis on regular file flushes. Because the BIOs are 2920 * routed to a thread it is possible for an excessive number to build up 2921 * and cause long front-end stalls long before the runningbuffspace limit 2922 * is hit, so we implement hammer2_flush_pipe to control the 2923 * hysteresis. 2924 * 2925 * This is a particular problem when compression is used. 2926 */ 2927 void 2928 hammer2_lwinprog_ref(hammer2_pfs_t *pmp) 2929 { 2930 atomic_add_int(&pmp->count_lwinprog, 1); 2931 } 2932 2933 void 2934 hammer2_lwinprog_drop(hammer2_pfs_t *pmp) 2935 { 2936 int lwinprog; 2937 2938 lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1); 2939 if ((lwinprog & HAMMER2_LWINPROG_WAITING) && 2940 (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) { 2941 atomic_clear_int(&pmp->count_lwinprog, 2942 HAMMER2_LWINPROG_WAITING); 2943 wakeup(&pmp->count_lwinprog); 2944 } 2945 } 2946 2947 void 2948 hammer2_lwinprog_wait(hammer2_pfs_t *pmp) 2949 { 2950 int lwinprog; 2951 2952 for (;;) { 2953 lwinprog = pmp->count_lwinprog; 2954 cpu_ccfence(); 2955 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe) 2956 break; 2957 tsleep_interlock(&pmp->count_lwinprog, 0); 2958 atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING); 2959 lwinprog = pmp->count_lwinprog; 2960 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe) 2961 break; 2962 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz); 2963 } 2964 } 2965 2966 /* 2967 * Manage excessive memory resource use for chain and related 2968 * structures. 2969 */ 2970 void 2971 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp) 2972 { 2973 uint32_t waiting; 2974 uint32_t count; 2975 uint32_t limit; 2976 #if 0 2977 static int zzticks; 2978 #endif 2979 2980 /* 2981 * Atomic check condition and wait. Also do an early speedup of 2982 * the syncer to try to avoid hitting the wait. 2983 */ 2984 for (;;) { 2985 waiting = pmp->inmem_dirty_chains; 2986 cpu_ccfence(); 2987 count = waiting & HAMMER2_DIRTYCHAIN_MASK; 2988 2989 limit = pmp->mp->mnt_nvnodelistsize / 10; 2990 if (limit < hammer2_limit_dirty_chains) 2991 limit = hammer2_limit_dirty_chains; 2992 if (limit < 1000) 2993 limit = 1000; 2994 2995 #if 0 2996 if ((int)(ticks - zzticks) > hz) { 2997 zzticks = ticks; 2998 kprintf("count %ld %ld\n", count, limit); 2999 } 3000 #endif 3001 3002 /* 3003 * Block if there are too many dirty chains present, wait 3004 * for the flush to clean some out. 3005 */ 3006 if (count > limit) { 3007 tsleep_interlock(&pmp->inmem_dirty_chains, 0); 3008 if (atomic_cmpset_int(&pmp->inmem_dirty_chains, 3009 waiting, 3010 waiting | HAMMER2_DIRTYCHAIN_WAITING)) { 3011 speedup_syncer(pmp->mp); 3012 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED, 3013 "chnmem", hz); 3014 } 3015 continue; /* loop on success or fail */ 3016 } 3017 3018 /* 3019 * Try to start an early flush before we are forced to block. 3020 */ 3021 if (count > limit * 7 / 10) 3022 speedup_syncer(pmp->mp); 3023 break; 3024 } 3025 } 3026 3027 void 3028 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp) 3029 { 3030 if (pmp) { 3031 atomic_add_int(&pmp->inmem_dirty_chains, 1); 3032 } 3033 } 3034 3035 void 3036 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp) 3037 { 3038 uint32_t waiting; 3039 3040 if (pmp == NULL) 3041 return; 3042 3043 for (;;) { 3044 waiting = pmp->inmem_dirty_chains; 3045 cpu_ccfence(); 3046 if (atomic_cmpset_int(&pmp->inmem_dirty_chains, 3047 waiting, 3048 (waiting - 1) & 3049 ~HAMMER2_DIRTYCHAIN_WAITING)) { 3050 break; 3051 } 3052 } 3053 3054 if (waiting & HAMMER2_DIRTYCHAIN_WAITING) 3055 wakeup(&pmp->inmem_dirty_chains); 3056 } 3057 3058 /* 3059 * Debugging 3060 */ 3061 void 3062 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx) 3063 { 3064 hammer2_chain_t *scan; 3065 hammer2_chain_t *parent; 3066 3067 --*countp; 3068 if (*countp == 0) { 3069 kprintf("%*.*s...\n", tab, tab, ""); 3070 return; 3071 } 3072 if (*countp < 0) 3073 return; 3074 kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n", 3075 tab, tab, "", pfx, 3076 chain, chain->bref.type, 3077 chain->bref.key, chain->bref.keybits, 3078 chain->bref.mirror_tid); 3079 3080 kprintf("%*.*s [%08x] (%s) refs=%d", 3081 tab, tab, "", 3082 chain->flags, 3083 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE && 3084 chain->data) ? (char *)chain->data->ipdata.filename : "?"), 3085 chain->refs); 3086 3087 parent = chain->parent; 3088 if (parent) 3089 kprintf("\n%*.*s p=%p [pflags %08x prefs %d", 3090 tab, tab, "", 3091 parent, parent->flags, parent->refs); 3092 if (RB_EMPTY(&chain->core.rbtree)) { 3093 kprintf("\n"); 3094 } else { 3095 kprintf(" {\n"); 3096 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) 3097 hammer2_dump_chain(scan, tab + 4, countp, 'a'); 3098 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data) 3099 kprintf("%*.*s}(%s)\n", tab, tab, "", 3100 chain->data->ipdata.filename); 3101 else 3102 kprintf("%*.*s}\n", tab, tab, ""); 3103 } 3104 } 3105