1 /* 2 * Copyright (c) 2011-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression) 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 */ 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/kernel.h> 38 #include <sys/nlookup.h> 39 #include <sys/vnode.h> 40 #include <sys/mount.h> 41 #include <sys/fcntl.h> 42 #include <sys/buf.h> 43 #include <sys/uuid.h> 44 #include <sys/vfsops.h> 45 #include <sys/sysctl.h> 46 #include <sys/socket.h> 47 #include <sys/objcache.h> 48 49 #include <sys/proc.h> 50 #include <sys/namei.h> 51 #include <sys/mountctl.h> 52 #include <sys/dirent.h> 53 #include <sys/uio.h> 54 55 #include <sys/mutex.h> 56 #include <sys/mutex2.h> 57 58 #include "hammer2.h" 59 #include "hammer2_disk.h" 60 #include "hammer2_mount.h" 61 #include "hammer2_lz4.h" 62 63 #include "zlib/hammer2_zlib.h" 64 65 #define REPORT_REFS_ERRORS 1 /* XXX remove me */ 66 67 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache"); 68 69 struct hammer2_sync_info { 70 int error; 71 int waitfor; 72 int pass; 73 }; 74 75 TAILQ_HEAD(hammer2_mntlist, hammer2_dev); 76 static struct hammer2_mntlist hammer2_mntlist; 77 78 struct hammer2_pfslist hammer2_pfslist; 79 struct hammer2_pfslist hammer2_spmplist; 80 struct lock hammer2_mntlk; 81 82 int hammer2_supported_version = HAMMER2_VOL_VERSION_DEFAULT; 83 int hammer2_debug; 84 int hammer2_cluster_meta_read = 1; /* physical read-ahead */ 85 int hammer2_cluster_data_read = 4; /* physical read-ahead */ 86 int hammer2_dedup_enable = 1; 87 int hammer2_always_compress = 0; /* always try to compress */ 88 int hammer2_inval_enable = 0; 89 int hammer2_flush_pipe = 100; 90 int hammer2_dio_count; 91 int hammer2_dio_limit = 256; 92 int hammer2_bulkfree_tps = 5000; 93 long hammer2_chain_allocs; 94 long hammer2_chain_frees; 95 long hammer2_limit_dirty_chains; 96 long hammer2_limit_dirty_inodes; 97 long hammer2_count_modified_chains; 98 long hammer2_iod_invals; 99 long hammer2_iod_file_read; 100 long hammer2_iod_meta_read; 101 long hammer2_iod_indr_read; 102 long hammer2_iod_fmap_read; 103 long hammer2_iod_volu_read; 104 long hammer2_iod_file_write; 105 long 
hammer2_iod_file_wembed; 106 long hammer2_iod_file_wzero; 107 long hammer2_iod_file_wdedup; 108 long hammer2_iod_meta_write; 109 long hammer2_iod_indr_write; 110 long hammer2_iod_fmap_write; 111 long hammer2_iod_volu_write; 112 113 MALLOC_DECLARE(M_HAMMER2_CBUFFER); 114 MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer", 115 "Buffer used for compression."); 116 117 MALLOC_DECLARE(M_HAMMER2_DEBUFFER); 118 MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer", 119 "Buffer used for decompression."); 120 121 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem"); 122 123 SYSCTL_INT(_vfs_hammer2, OID_AUTO, supported_version, CTLFLAG_RD, 124 &hammer2_supported_version, 0, ""); 125 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW, 126 &hammer2_debug, 0, ""); 127 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_meta_read, CTLFLAG_RW, 128 &hammer2_cluster_meta_read, 0, ""); 129 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_data_read, CTLFLAG_RW, 130 &hammer2_cluster_data_read, 0, ""); 131 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW, 132 &hammer2_dedup_enable, 0, ""); 133 SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW, 134 &hammer2_always_compress, 0, ""); 135 SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW, 136 &hammer2_inval_enable, 0, ""); 137 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW, 138 &hammer2_flush_pipe, 0, ""); 139 SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW, 140 &hammer2_bulkfree_tps, 0, ""); 141 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW, 142 &hammer2_chain_allocs, 0, ""); 143 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_frees, CTLFLAG_RW, 144 &hammer2_chain_frees, 0, ""); 145 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW, 146 &hammer2_limit_dirty_chains, 0, ""); 147 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW, 148 &hammer2_limit_dirty_inodes, 0, ""); 149 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, 
count_modified_chains, CTLFLAG_RW, 150 &hammer2_count_modified_chains, 0, ""); 151 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD, 152 &hammer2_dio_count, 0, ""); 153 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW, 154 &hammer2_dio_limit, 0, ""); 155 156 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW, 157 &hammer2_iod_invals, 0, ""); 158 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW, 159 &hammer2_iod_file_read, 0, ""); 160 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW, 161 &hammer2_iod_meta_read, 0, ""); 162 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW, 163 &hammer2_iod_indr_read, 0, ""); 164 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW, 165 &hammer2_iod_fmap_read, 0, ""); 166 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW, 167 &hammer2_iod_volu_read, 0, ""); 168 169 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW, 170 &hammer2_iod_file_write, 0, ""); 171 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wembed, CTLFLAG_RW, 172 &hammer2_iod_file_wembed, 0, ""); 173 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wzero, CTLFLAG_RW, 174 &hammer2_iod_file_wzero, 0, ""); 175 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wdedup, CTLFLAG_RW, 176 &hammer2_iod_file_wdedup, 0, ""); 177 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW, 178 &hammer2_iod_meta_write, 0, ""); 179 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW, 180 &hammer2_iod_indr_write, 0, ""); 181 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW, 182 &hammer2_iod_fmap_write, 0, ""); 183 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW, 184 &hammer2_iod_volu_write, 0, ""); 185 186 long hammer2_process_icrc32; 187 long hammer2_process_xxhash64; 188 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW, 189 &hammer2_process_icrc32, 0, ""); 190 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW, 191 
&hammer2_process_xxhash64, 0, ""); 192 193 static int hammer2_vfs_init(struct vfsconf *conf); 194 static int hammer2_vfs_uninit(struct vfsconf *vfsp); 195 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, 196 struct ucred *cred); 197 static int hammer2_remount(hammer2_dev_t *, struct mount *, char *, 198 struct vnode *, struct ucred *); 199 static int hammer2_recovery(hammer2_dev_t *hmp); 200 static int hammer2_vfs_unmount(struct mount *mp, int mntflags); 201 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp); 202 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, 203 struct ucred *cred); 204 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, 205 struct ucred *cred); 206 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 207 struct fid *fhp, struct vnode **vpp); 208 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp); 209 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 210 int *exflagsp, struct ucred **credanonp); 211 212 static int hammer2_install_volume_header(hammer2_dev_t *hmp); 213 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data); 214 215 static void hammer2_update_pmps(hammer2_dev_t *hmp); 216 217 static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp); 218 static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, 219 hammer2_dev_t *hmp); 220 static int hammer2_fixup_pfses(hammer2_dev_t *hmp); 221 222 /* 223 * HAMMER2 vfs operations. 
224 */ 225 static struct vfsops hammer2_vfsops = { 226 .vfs_init = hammer2_vfs_init, 227 .vfs_uninit = hammer2_vfs_uninit, 228 .vfs_sync = hammer2_vfs_sync, 229 .vfs_mount = hammer2_vfs_mount, 230 .vfs_unmount = hammer2_vfs_unmount, 231 .vfs_root = hammer2_vfs_root, 232 .vfs_statfs = hammer2_vfs_statfs, 233 .vfs_statvfs = hammer2_vfs_statvfs, 234 .vfs_vget = hammer2_vfs_vget, 235 .vfs_vptofh = hammer2_vfs_vptofh, 236 .vfs_fhtovp = hammer2_vfs_fhtovp, 237 .vfs_checkexp = hammer2_vfs_checkexp 238 }; 239 240 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", ""); 241 242 VFS_SET(hammer2_vfsops, hammer2, VFCF_MPSAFE); 243 MODULE_VERSION(hammer2, 1); 244 245 static 246 int 247 hammer2_vfs_init(struct vfsconf *conf) 248 { 249 static struct objcache_malloc_args margs_read; 250 static struct objcache_malloc_args margs_write; 251 static struct objcache_malloc_args margs_vop; 252 253 int error; 254 255 error = 0; 256 257 /* 258 * A large DIO cache is needed to retain dedup enablement masks. 259 * The bulkfree code clears related masks as part of the disk block 260 * recycling algorithm, preventing it from being used for a later 261 * dedup. 262 * 263 * NOTE: A large buffer cache can actually interfere with dedup 264 * operation because we dedup based on media physical buffers 265 * and not logical buffers. Try to make the DIO case large 266 * enough to avoid this problem, but also cap it. 
267 */ 268 hammer2_dio_limit = nbuf * 2; 269 if (hammer2_dio_limit > 100000) 270 hammer2_dio_limit = 100000; 271 272 if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref)) 273 error = EINVAL; 274 if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data)) 275 error = EINVAL; 276 if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data)) 277 error = EINVAL; 278 279 if (error) 280 kprintf("HAMMER2 structure size mismatch; cannot continue.\n"); 281 282 margs_read.objsize = 65536; 283 margs_read.mtype = M_HAMMER2_DEBUFFER; 284 285 margs_write.objsize = 32768; 286 margs_write.mtype = M_HAMMER2_CBUFFER; 287 288 margs_vop.objsize = sizeof(hammer2_xop_t); 289 margs_vop.mtype = M_HAMMER2; 290 291 /* 292 * Note thaht for the XOPS cache we want backing store allocations 293 * to use M_ZERO. This is not allowed in objcache_get() (to avoid 294 * confusion), so use the backing store function that does it. This 295 * means that initial XOPS objects are zerod but REUSED objects are 296 * not. So we are responsible for cleaning the object up sufficiently 297 * for our needs before objcache_put()ing it back (typically just the 298 * FIFO indices). 
299 */ 300 cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc, 301 0, 1, NULL, NULL, NULL, 302 objcache_malloc_alloc, 303 objcache_malloc_free, 304 &margs_read); 305 cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc, 306 0, 1, NULL, NULL, NULL, 307 objcache_malloc_alloc, 308 objcache_malloc_free, 309 &margs_write); 310 cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc, 311 0, 1, NULL, NULL, NULL, 312 objcache_malloc_alloc_zero, 313 objcache_malloc_free, 314 &margs_vop); 315 316 317 lockinit(&hammer2_mntlk, "mntlk", 0, 0); 318 TAILQ_INIT(&hammer2_mntlist); 319 TAILQ_INIT(&hammer2_pfslist); 320 TAILQ_INIT(&hammer2_spmplist); 321 322 hammer2_limit_dirty_chains = maxvnodes / 10; 323 if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS) 324 hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS; 325 326 hammer2_limit_dirty_inodes = maxvnodes / 100; 327 if (hammer2_limit_dirty_inodes < 100) 328 hammer2_limit_dirty_inodes = 100; 329 if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES) 330 hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES; 331 332 return (error); 333 } 334 335 static 336 int 337 hammer2_vfs_uninit(struct vfsconf *vfsp __unused) 338 { 339 objcache_destroy(cache_buffer_read); 340 objcache_destroy(cache_buffer_write); 341 objcache_destroy(cache_xops); 342 return 0; 343 } 344 345 /* 346 * Core PFS allocator. Used to allocate or reference the pmp structure 347 * for PFS cluster mounts and the spmp structure for media (hmp) structures. 348 * The pmp can be passed in or loaded by this function using the chain and 349 * inode data. 350 * 351 * pmp->modify_tid tracks new modify_tid transaction ids for front-end 352 * transactions. Note that synchronization does not use this field. 353 * (typically frontend operations and synchronization cannot run on the 354 * same PFS node at the same time). 
355 * 356 * XXX check locking 357 */ 358 hammer2_pfs_t * 359 hammer2_pfsalloc(hammer2_chain_t *chain, 360 const hammer2_inode_data_t *ripdata, 361 hammer2_tid_t modify_tid, hammer2_dev_t *force_local) 362 { 363 hammer2_pfs_t *pmp; 364 hammer2_inode_t *iroot; 365 int count; 366 int i; 367 int j; 368 369 pmp = NULL; 370 371 /* 372 * Locate or create the PFS based on the cluster id. If ripdata 373 * is NULL this is a spmp which is unique and is always allocated. 374 * 375 * If the device is mounted in local mode all PFSs are considered 376 * independent and not part of any cluster (for debugging only). 377 */ 378 if (ripdata) { 379 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) { 380 if (force_local != pmp->force_local) 381 continue; 382 if (force_local == NULL && 383 bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid, 384 sizeof(pmp->pfs_clid)) == 0) { 385 break; 386 } else if (force_local && pmp->pfs_names[0] && 387 strcmp(pmp->pfs_names[0], ripdata->filename) == 0) { 388 break; 389 } 390 } 391 } 392 393 if (pmp == NULL) { 394 pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO); 395 pmp->force_local = force_local; 396 hammer2_trans_manage_init(pmp); 397 kmalloc_create(&pmp->minode, "HAMMER2-inodes"); 398 kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg"); 399 lockinit(&pmp->lock, "pfslk", 0, 0); 400 lockinit(&pmp->lock_nlink, "h2nlink", 0, 0); 401 spin_init(&pmp->inum_spin, "hm2pfsalloc_inum"); 402 spin_init(&pmp->xop_spin, "h2xop"); 403 spin_init(&pmp->lru_spin, "h2lru"); 404 RB_INIT(&pmp->inum_tree); 405 TAILQ_INIT(&pmp->sideq); 406 TAILQ_INIT(&pmp->lru_list); 407 spin_init(&pmp->list_spin, "hm2pfsalloc_list"); 408 409 /* 410 * Distribute backend operations to threads 411 */ 412 for (i = 0; i < HAMMER2_XOPGROUPS; ++i) 413 hammer2_xop_group_init(pmp, &pmp->xop_groups[i]); 414 415 /* 416 * Save the last media transaction id for the flusher. 
Set 417 * initial 418 */ 419 if (ripdata) { 420 pmp->pfs_clid = ripdata->meta.pfs_clid; 421 TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry); 422 } else { 423 pmp->flags |= HAMMER2_PMPF_SPMP; 424 TAILQ_INSERT_TAIL(&hammer2_spmplist, pmp, mntentry); 425 } 426 427 /* 428 * The synchronization thread may start too early, make 429 * sure it stays frozen until we are ready to let it go. 430 * XXX 431 */ 432 /* 433 pmp->primary_thr.flags = HAMMER2_THREAD_FROZEN | 434 HAMMER2_THREAD_REMASTER; 435 */ 436 } 437 438 /* 439 * Create the PFS's root inode and any missing XOP helper threads. 440 */ 441 if ((iroot = pmp->iroot) == NULL) { 442 iroot = hammer2_inode_get(pmp, NULL, NULL, -1); 443 if (ripdata) 444 iroot->meta = ripdata->meta; 445 pmp->iroot = iroot; 446 hammer2_inode_ref(iroot); 447 hammer2_inode_unlock(iroot); 448 } 449 450 /* 451 * Stop here if no chain is passed in. 452 */ 453 if (chain == NULL) 454 goto done; 455 456 /* 457 * When a chain is passed in we must add it to the PFS's root 458 * inode, update pmp->pfs_types[], and update the syncronization 459 * threads. 460 * 461 * When forcing local mode, mark the PFS as a MASTER regardless. 462 * 463 * At the moment empty spots can develop due to removals or failures. 464 * Ultimately we want to re-fill these spots but doing so might 465 * confused running code. XXX 466 */ 467 hammer2_inode_ref(iroot); 468 hammer2_mtx_ex(&iroot->lock); 469 j = iroot->cluster.nchains; 470 471 if (j == HAMMER2_MAXCLUSTER) { 472 kprintf("hammer2_mount: cluster full!\n"); 473 /* XXX fatal error? 
*/ 474 } else { 475 KKASSERT(chain->pmp == NULL); 476 chain->pmp = pmp; 477 hammer2_chain_ref(chain); 478 iroot->cluster.array[j].chain = chain; 479 if (force_local) 480 pmp->pfs_types[j] = HAMMER2_PFSTYPE_MASTER; 481 else 482 pmp->pfs_types[j] = ripdata->meta.pfs_type; 483 pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2); 484 pmp->pfs_hmps[j] = chain->hmp; 485 486 /* 487 * If the PFS is already mounted we must account 488 * for the mount_count here. 489 */ 490 if (pmp->mp) 491 ++chain->hmp->mount_count; 492 493 /* 494 * May have to fixup dirty chain tracking. Previous 495 * pmp was NULL so nothing to undo. 496 */ 497 if (chain->flags & HAMMER2_CHAIN_MODIFIED) 498 hammer2_pfs_memory_inc(pmp); 499 ++j; 500 } 501 iroot->cluster.nchains = j; 502 503 /* 504 * Update nmasters from any PFS inode which is part of the cluster. 505 * It is possible that this will result in a value which is too 506 * high. MASTER PFSs are authoritative for pfs_nmasters and will 507 * override this value later on. 508 * 509 * (This informs us of masters that might not currently be 510 * discoverable by this mount). 511 */ 512 if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) { 513 pmp->pfs_nmasters = ripdata->meta.pfs_nmasters; 514 } 515 516 /* 517 * Count visible masters. Masters are usually added with 518 * ripdata->meta.pfs_nmasters set to 1. This detects when there 519 * are more (XXX and must update the master inodes). 520 */ 521 count = 0; 522 for (i = 0; i < iroot->cluster.nchains; ++i) { 523 if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) 524 ++count; 525 } 526 if (pmp->pfs_nmasters < count) 527 pmp->pfs_nmasters = count; 528 529 /* 530 * Create missing synchronization and support threads. 531 * 532 * Single-node masters (including snapshots) have nothing to 533 * synchronize and do not require this thread. 534 * 535 * Multi-node masters or any number of soft masters, slaves, copy, 536 * or other PFS types need the thread. 
537 * 538 * Each thread is responsible for its particular cluster index. 539 * We use independent threads so stalls or mismatches related to 540 * any given target do not affect other targets. 541 */ 542 for (i = 0; i < iroot->cluster.nchains; ++i) { 543 /* 544 * Single-node masters (including snapshots) have nothing 545 * to synchronize and will make direct xops support calls, 546 * thus they do not require this thread. 547 * 548 * Note that there can be thousands of snapshots. We do not 549 * want to create thousands of threads. 550 */ 551 if (pmp->pfs_nmasters <= 1 && 552 pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) { 553 continue; 554 } 555 556 /* 557 * Sync support thread 558 */ 559 if (pmp->sync_thrs[i].td == NULL) { 560 hammer2_thr_create(&pmp->sync_thrs[i], pmp, NULL, 561 "h2nod", i, -1, 562 hammer2_primary_sync_thread); 563 } 564 } 565 566 /* 567 * Create missing Xop threads 568 * 569 * NOTE: We create helper threads for all mounted PFSs or any 570 * PFSs with 2+ nodes (so the sync thread can update them, 571 * even if not mounted). 572 */ 573 if (pmp->mp || iroot->cluster.nchains >= 2) 574 hammer2_xop_helper_create(pmp); 575 576 hammer2_mtx_unlock(&iroot->lock); 577 hammer2_inode_drop(iroot); 578 done: 579 return pmp; 580 } 581 582 /* 583 * Deallocate an element of a probed PFS. If destroying and this is a 584 * MASTER, adjust nmasters. 585 * 586 * This function does not physically destroy the PFS element in its device 587 * under the super-root (see hammer2_ioctl_pfs_delete()). 588 */ 589 void 590 hammer2_pfsdealloc(hammer2_pfs_t *pmp, int clindex, int destroying) 591 { 592 hammer2_inode_t *iroot; 593 hammer2_chain_t *chain; 594 int j; 595 596 /* 597 * Cleanup our reference on iroot. iroot is (should) not be needed 598 * by the flush code. 599 */ 600 iroot = pmp->iroot; 601 if (iroot) { 602 /* 603 * Stop synchronizing 604 * 605 * XXX flush after acquiring the iroot lock. 606 * XXX clean out the cluster index from all inode structures. 
607 */ 608 hammer2_thr_delete(&pmp->sync_thrs[clindex]); 609 610 /* 611 * Remove the cluster index from the group. If destroying 612 * the PFS and this is a master, adjust pfs_nmasters. 613 */ 614 hammer2_mtx_ex(&iroot->lock); 615 chain = iroot->cluster.array[clindex].chain; 616 iroot->cluster.array[clindex].chain = NULL; 617 618 switch(pmp->pfs_types[clindex]) { 619 case HAMMER2_PFSTYPE_MASTER: 620 if (destroying && pmp->pfs_nmasters > 0) 621 --pmp->pfs_nmasters; 622 /* XXX adjust ripdata->meta.pfs_nmasters */ 623 break; 624 default: 625 break; 626 } 627 pmp->pfs_types[clindex] = HAMMER2_PFSTYPE_NONE; 628 629 hammer2_mtx_unlock(&iroot->lock); 630 631 /* 632 * Release the chain. 633 */ 634 if (chain) { 635 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 636 hammer2_chain_drop(chain); 637 } 638 639 /* 640 * Terminate all XOP threads for the cluster index. 641 */ 642 for (j = 0; j < HAMMER2_XOPGROUPS; ++j) 643 hammer2_thr_delete(&pmp->xop_groups[j].thrs[clindex]); 644 } 645 } 646 647 /* 648 * Destroy a PFS, typically only occurs after the last mount on a device 649 * has gone away. 650 */ 651 static void 652 hammer2_pfsfree(hammer2_pfs_t *pmp) 653 { 654 hammer2_inode_t *iroot; 655 hammer2_chain_t *chain; 656 int i; 657 int j; 658 659 /* 660 * Cleanup our reference on iroot. iroot is (should) not be needed 661 * by the flush code. 
662 */ 663 if (pmp->flags & HAMMER2_PMPF_SPMP) 664 TAILQ_REMOVE(&hammer2_spmplist, pmp, mntentry); 665 else 666 TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry); 667 668 iroot = pmp->iroot; 669 if (iroot) { 670 for (i = 0; i < iroot->cluster.nchains; ++i) { 671 hammer2_thr_delete(&pmp->sync_thrs[i]); 672 for (j = 0; j < HAMMER2_XOPGROUPS; ++j) 673 hammer2_thr_delete(&pmp->xop_groups[j].thrs[i]); 674 } 675 #if REPORT_REFS_ERRORS 676 if (pmp->iroot->refs != 1) 677 kprintf("PMP->IROOT %p REFS WRONG %d\n", 678 pmp->iroot, pmp->iroot->refs); 679 #else 680 KKASSERT(pmp->iroot->refs == 1); 681 #endif 682 /* ref for pmp->iroot */ 683 hammer2_inode_drop(pmp->iroot); 684 pmp->iroot = NULL; 685 } 686 687 /* 688 * Cleanup chains remaining on LRU list. 689 */ 690 hammer2_spin_ex(&pmp->lru_spin); 691 while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) { 692 KKASSERT(chain->flags & HAMMER2_CHAIN_ONLRU); 693 atomic_add_int(&pmp->lru_count, -1); 694 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 695 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 696 hammer2_chain_ref(chain); 697 hammer2_spin_unex(&pmp->lru_spin); 698 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 699 hammer2_chain_drop(chain); 700 hammer2_spin_ex(&pmp->lru_spin); 701 } 702 hammer2_spin_unex(&pmp->lru_spin); 703 704 /* 705 * Free remaining pmp resources 706 */ 707 kmalloc_destroy(&pmp->mmsg); 708 kmalloc_destroy(&pmp->minode); 709 710 kfree(pmp, M_HAMMER2); 711 } 712 713 /* 714 * Remove all references to hmp from the pfs list. Any PFS which becomes 715 * empty is terminated and freed. 716 * 717 * XXX inefficient. 
718 */ 719 static void 720 hammer2_pfsfree_scan(hammer2_dev_t *hmp, int which) 721 { 722 hammer2_pfs_t *pmp; 723 hammer2_inode_t *iroot; 724 hammer2_chain_t *rchain; 725 int didfreeze; 726 int i; 727 int j; 728 struct hammer2_pfslist *wlist; 729 730 if (which == 0) 731 wlist = &hammer2_pfslist; 732 else 733 wlist = &hammer2_spmplist; 734 again: 735 TAILQ_FOREACH(pmp, wlist, mntentry) { 736 if ((iroot = pmp->iroot) == NULL) 737 continue; 738 hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH); 739 hammer2_inode_run_sideq(pmp, 1); 740 hammer2_bioq_sync(pmp); 741 hammer2_trans_done(pmp); 742 743 /* 744 * Determine if this PFS is affected. If it is we must 745 * freeze all management threads and lock its iroot. 746 * 747 * Freezing a management thread forces it idle, operations 748 * in-progress will be aborted and it will have to start 749 * over again when unfrozen, or exit if told to exit. 750 */ 751 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 752 if (pmp->pfs_hmps[i] == hmp) 753 break; 754 } 755 if (i != HAMMER2_MAXCLUSTER) { 756 /* 757 * Make sure all synchronization threads are locked 758 * down. 759 */ 760 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 761 if (pmp->pfs_hmps[i] == NULL) 762 continue; 763 hammer2_thr_freeze_async(&pmp->sync_thrs[i]); 764 for (j = 0; j < HAMMER2_XOPGROUPS; ++j) { 765 hammer2_thr_freeze_async( 766 &pmp->xop_groups[j].thrs[i]); 767 } 768 } 769 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 770 if (pmp->pfs_hmps[i] == NULL) 771 continue; 772 hammer2_thr_freeze(&pmp->sync_thrs[i]); 773 for (j = 0; j < HAMMER2_XOPGROUPS; ++j) { 774 hammer2_thr_freeze( 775 &pmp->xop_groups[j].thrs[i]); 776 } 777 } 778 779 /* 780 * Lock the inode and clean out matching chains. 781 * Note that we cannot use hammer2_inode_lock_*() 782 * here because that would attempt to validate the 783 * cluster that we are in the middle of ripping 784 * apart. 785 * 786 * WARNING! We are working directly on the inodes 787 * embedded cluster. 
788 */ 789 hammer2_mtx_ex(&iroot->lock); 790 791 /* 792 * Remove the chain from matching elements of the PFS. 793 */ 794 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 795 if (pmp->pfs_hmps[i] != hmp) 796 continue; 797 hammer2_thr_delete(&pmp->sync_thrs[i]); 798 for (j = 0; j < HAMMER2_XOPGROUPS; ++j) { 799 hammer2_thr_delete( 800 &pmp->xop_groups[j].thrs[i]); 801 } 802 rchain = iroot->cluster.array[i].chain; 803 iroot->cluster.array[i].chain = NULL; 804 pmp->pfs_types[i] = 0; 805 if (pmp->pfs_names[i]) { 806 kfree(pmp->pfs_names[i], M_HAMMER2); 807 pmp->pfs_names[i] = NULL; 808 } 809 if (rchain) { 810 hammer2_chain_drop(rchain); 811 /* focus hint */ 812 if (iroot->cluster.focus == rchain) 813 iroot->cluster.focus = NULL; 814 } 815 pmp->pfs_hmps[i] = NULL; 816 } 817 hammer2_mtx_unlock(&iroot->lock); 818 didfreeze = 1; /* remaster, unfreeze down below */ 819 } else { 820 didfreeze = 0; 821 } 822 823 /* 824 * Cleanup trailing chains. Gaps may remain. 825 */ 826 for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) { 827 if (pmp->pfs_hmps[i]) 828 break; 829 } 830 iroot->cluster.nchains = i + 1; 831 832 /* 833 * If the PMP has no elements remaining we can destroy it. 834 * (this will transition management threads from frozen->exit). 835 */ 836 if (iroot->cluster.nchains == 0) { 837 /* 838 * If this was the hmp's spmp, we need to clean 839 * a little more stuff out. 840 */ 841 if (hmp->spmp == pmp) { 842 hmp->spmp = NULL; 843 hmp->vchain.pmp = NULL; 844 hmp->fchain.pmp = NULL; 845 } 846 847 /* 848 * Free the pmp and restart the loop 849 */ 850 hammer2_pfsfree(pmp); 851 goto again; 852 } 853 854 /* 855 * If elements still remain we need to set the REMASTER 856 * flag and unfreeze it. 
857 */ 858 if (didfreeze) { 859 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 860 if (pmp->pfs_hmps[i] == NULL) 861 continue; 862 hammer2_thr_remaster(&pmp->sync_thrs[i]); 863 hammer2_thr_unfreeze(&pmp->sync_thrs[i]); 864 for (j = 0; j < HAMMER2_XOPGROUPS; ++j) { 865 hammer2_thr_remaster( 866 &pmp->xop_groups[j].thrs[i]); 867 hammer2_thr_unfreeze( 868 &pmp->xop_groups[j].thrs[i]); 869 } 870 } 871 } 872 } 873 } 874 875 /* 876 * Mount or remount HAMMER2 fileystem from physical media 877 * 878 * mountroot 879 * mp mount point structure 880 * path NULL 881 * data <unused> 882 * cred <unused> 883 * 884 * mount 885 * mp mount point structure 886 * path path to mount point 887 * data pointer to argument structure in user space 888 * volume volume path (device@LABEL form) 889 * hflags user mount flags 890 * cred user credentials 891 * 892 * RETURNS: 0 Success 893 * !0 error number 894 */ 895 static 896 int 897 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, 898 struct ucred *cred) 899 { 900 struct hammer2_mount_info info; 901 hammer2_pfs_t *pmp; 902 hammer2_pfs_t *spmp; 903 hammer2_dev_t *hmp; 904 hammer2_dev_t *force_local; 905 hammer2_key_t key_next; 906 hammer2_key_t key_dummy; 907 hammer2_key_t lhc; 908 struct vnode *devvp; 909 struct nlookupdata nd; 910 hammer2_chain_t *parent; 911 hammer2_chain_t *chain; 912 hammer2_cluster_t *cluster; 913 const hammer2_inode_data_t *ripdata; 914 hammer2_blockref_t bref; 915 struct file *fp; 916 char devstr[MNAMELEN]; 917 size_t size; 918 size_t done; 919 char *dev; 920 char *label; 921 int ronly = 1; 922 int error; 923 int i; 924 925 hmp = NULL; 926 pmp = NULL; 927 dev = NULL; 928 label = NULL; 929 devvp = NULL; 930 931 kprintf("hammer2_mount\n"); 932 933 if (path == NULL) { 934 /* 935 * Root mount 936 */ 937 bzero(&info, sizeof(info)); 938 info.cluster_fd = -1; 939 ksnprintf(devstr, sizeof(devstr), "%s", 940 mp->mnt_stat.f_mntfromname); 941 kprintf("hammer2_mount: root '%s'\n", devstr); 942 } else { 943 /* 944 * Non-root 
mount or updating a mount 945 */ 946 error = copyin(data, &info, sizeof(info)); 947 if (error) 948 return (error); 949 950 error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done); 951 if (error) 952 return (error); 953 } 954 955 /* 956 * Extract device and label, automatically mount @BOOT, @ROOT, or @DATA 957 * if no label specified, based on the partition id. Error out if no 958 * label or device (with partition id) is specified. This is strictly 959 * a convenience to match the default label created by newfs_hammer2, 960 * our preference is that a label always be specified. 961 * 962 * NOTE: We allow 'mount @LABEL <blah>'... that is, a mount command 963 * that does not specify a device, as long as some H2 label 964 * has already been mounted from that device. This makes 965 * mounting snapshots a lot easier. 966 */ 967 dev = devstr; 968 label = strchr(devstr, '@'); 969 if (label && ((label + 1) - dev) > done) 970 return (EINVAL); 971 if (label == NULL || label[1] == 0) { 972 char slice; 973 974 if (label == NULL) 975 label = devstr + strlen(devstr); 976 slice = label[-1]; 977 switch(slice) { 978 case 'a': 979 label = "BOOT"; 980 break; 981 case 'd': 982 label = "ROOT"; 983 break; 984 default: 985 label = "DATA"; 986 break; 987 } 988 } else { 989 *label = '\0'; 990 label++; 991 } 992 993 kprintf("hammer2_mount: dev=\"%s\" label=\"%s\" rdonly=%d\n", 994 dev, label, (mp->mnt_flag & MNT_RDONLY)); 995 996 if (mp->mnt_flag & MNT_UPDATE) { 997 /* 998 * Update mount. Note that pmp->iroot->cluster is 999 * an inode-embedded cluster and thus cannot be 1000 * directly locked. 1001 * 1002 * XXX HAMMER2 needs to implement NFS export via 1003 * mountctl. 
1004 */ 1005 pmp = MPTOPMP(mp); 1006 pmp->hflags = info.hflags; 1007 cluster = &pmp->iroot->cluster; 1008 for (i = 0; i < cluster->nchains; ++i) { 1009 if (cluster->array[i].chain == NULL) 1010 continue; 1011 hmp = cluster->array[i].chain->hmp; 1012 devvp = hmp->devvp; 1013 error = hammer2_remount(hmp, mp, path, 1014 devvp, cred); 1015 if (error) 1016 break; 1017 } 1018 1019 return error; 1020 } 1021 1022 /* 1023 * HMP device mount 1024 * 1025 * If a path is specified and dev is not an empty string, lookup the 1026 * name and verify that it referes to a block device. 1027 * 1028 * If a path is specified and dev is an empty string we fall through 1029 * and locate the label in the hmp search. 1030 */ 1031 if (path && *dev != 0) { 1032 error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW); 1033 if (error == 0) 1034 error = nlookup(&nd); 1035 if (error == 0) 1036 error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp); 1037 nlookup_done(&nd); 1038 } else if (path == NULL) { 1039 /* root mount */ 1040 cdev_t cdev = kgetdiskbyname(dev); 1041 error = bdevvp(cdev, &devvp); 1042 if (error) 1043 kprintf("hammer2: cannot find '%s'\n", dev); 1044 } else { 1045 /* 1046 * We will locate the hmp using the label in the hmp loop. 1047 */ 1048 error = 0; 1049 } 1050 1051 /* 1052 * Make sure its a block device. Do not check to see if it is 1053 * already mounted until we determine that its a fresh H2 device. 1054 */ 1055 if (error == 0 && devvp) { 1056 vn_isdisk(devvp, &error); 1057 } 1058 1059 /* 1060 * Determine if the device has already been mounted. After this 1061 * check hmp will be non-NULL if we are doing the second or more 1062 * hammer2 mounts from the same device. 1063 */ 1064 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE); 1065 if (devvp) { 1066 /* 1067 * Match the device. Due to the way devfs works, 1068 * we may not be able to directly match the vnode pointer, 1069 * so also check to see if the underlying device matches. 
1070 */ 1071 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) { 1072 if (hmp->devvp == devvp) 1073 break; 1074 if (devvp->v_rdev && 1075 hmp->devvp->v_rdev == devvp->v_rdev) { 1076 break; 1077 } 1078 } 1079 1080 /* 1081 * If no match this may be a fresh H2 mount, make sure 1082 * the device is not mounted on anything else. 1083 */ 1084 if (hmp == NULL) 1085 error = vfs_mountedon(devvp); 1086 } else if (error == 0) { 1087 /* 1088 * Match the label to a pmp already probed. 1089 */ 1090 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) { 1091 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 1092 if (pmp->pfs_names[i] && 1093 strcmp(pmp->pfs_names[i], label) == 0) { 1094 hmp = pmp->pfs_hmps[i]; 1095 break; 1096 } 1097 } 1098 if (hmp) 1099 break; 1100 } 1101 if (hmp == NULL) 1102 error = ENOENT; 1103 } 1104 1105 /* 1106 * Open the device if this isn't a secondary mount and construct 1107 * the H2 device mount (hmp). 1108 */ 1109 if (hmp == NULL) { 1110 hammer2_chain_t *schain; 1111 hammer2_xid_t xid; 1112 1113 if (error == 0 && vcount(devvp) > 0) { 1114 kprintf("Primary device already has references\n"); 1115 error = EBUSY; 1116 } 1117 1118 /* 1119 * Now open the device 1120 */ 1121 if (error == 0) { 1122 ronly = ((mp->mnt_flag & MNT_RDONLY) != 0); 1123 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1124 error = vinvalbuf(devvp, V_SAVE, 0, 0); 1125 if (error == 0) { 1126 error = VOP_OPEN(devvp, 1127 (ronly ? 
FREAD : FREAD | FWRITE), 1128 FSCRED, NULL); 1129 } 1130 vn_unlock(devvp); 1131 } 1132 if (error && devvp) { 1133 vrele(devvp); 1134 devvp = NULL; 1135 } 1136 if (error) { 1137 lockmgr(&hammer2_mntlk, LK_RELEASE); 1138 return error; 1139 } 1140 hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO); 1141 ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev); 1142 hmp->ronly = ronly; 1143 hmp->devvp = devvp; 1144 hmp->hflags = info.hflags & HMNT2_DEVFLAGS; 1145 kmalloc_create(&hmp->mchain, "HAMMER2-chains"); 1146 TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry); 1147 RB_INIT(&hmp->iotree); 1148 spin_init(&hmp->io_spin, "hm2mount_io"); 1149 spin_init(&hmp->list_spin, "hm2mount_list"); 1150 TAILQ_INIT(&hmp->flushq); 1151 1152 lockinit(&hmp->vollk, "h2vol", 0, 0); 1153 lockinit(&hmp->bulklk, "h2bulk", 0, 0); 1154 lockinit(&hmp->bflock, "h2bflk", 0, 0); 1155 1156 /* 1157 * vchain setup. vchain.data is embedded. 1158 * vchain.refs is initialized and will never drop to 0. 1159 * 1160 * NOTE! voldata is not yet loaded. 1161 */ 1162 hmp->vchain.hmp = hmp; 1163 hmp->vchain.refs = 1; 1164 hmp->vchain.data = (void *)&hmp->voldata; 1165 hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME; 1166 hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; 1167 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; 1168 1169 hammer2_chain_core_init(&hmp->vchain); 1170 /* hmp->vchain.u.xxx is left NULL */ 1171 1172 /* 1173 * fchain setup. fchain.data is embedded. 1174 * fchain.refs is initialized and will never drop to 0. 1175 * 1176 * The data is not used but needs to be initialized to 1177 * pass assertion muster. We use this chain primarily 1178 * as a placeholder for the freemap's top-level RBTREE 1179 * so it does not interfere with the volume's topology 1180 * RBTREE. 
1181 */ 1182 hmp->fchain.hmp = hmp; 1183 hmp->fchain.refs = 1; 1184 hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset; 1185 hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP; 1186 hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; 1187 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; 1188 hmp->fchain.bref.methods = 1189 HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) | 1190 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE); 1191 1192 hammer2_chain_core_init(&hmp->fchain); 1193 /* hmp->fchain.u.xxx is left NULL */ 1194 1195 /* 1196 * Install the volume header and initialize fields from 1197 * voldata. 1198 */ 1199 error = hammer2_install_volume_header(hmp); 1200 if (error) { 1201 hammer2_unmount_helper(mp, NULL, hmp); 1202 lockmgr(&hammer2_mntlk, LK_RELEASE); 1203 hammer2_vfs_unmount(mp, MNT_FORCE); 1204 return error; 1205 } 1206 1207 /* 1208 * Really important to get these right or flush will get 1209 * confused. 1210 */ 1211 hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL); 1212 spmp = hmp->spmp; 1213 1214 /* 1215 * Dummy-up vchain and fchain's modify_tid. mirror_tid 1216 * is inherited from the volume header. 1217 */ 1218 xid = 0; 1219 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; 1220 hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid; 1221 hmp->vchain.pmp = spmp; 1222 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; 1223 hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid; 1224 hmp->fchain.pmp = spmp; 1225 1226 /* 1227 * First locate the super-root inode, which is key 0 1228 * relative to the volume header's blockset. 1229 * 1230 * Then locate the root inode by scanning the directory keyspace 1231 * represented by the label. 
1232 */ 1233 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 1234 schain = hammer2_chain_lookup(&parent, &key_dummy, 1235 HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY, 1236 &error, 0); 1237 hammer2_chain_lookup_done(parent); 1238 if (schain == NULL) { 1239 kprintf("hammer2_mount: invalid super-root\n"); 1240 hammer2_unmount_helper(mp, NULL, hmp); 1241 lockmgr(&hammer2_mntlk, LK_RELEASE); 1242 hammer2_vfs_unmount(mp, MNT_FORCE); 1243 return EINVAL; 1244 } 1245 if (schain->error) { 1246 kprintf("hammer2_mount: error %s reading super-root\n", 1247 hammer2_error_str(schain->error)); 1248 hammer2_chain_unlock(schain); 1249 hammer2_chain_drop(schain); 1250 schain = NULL; 1251 hammer2_unmount_helper(mp, NULL, hmp); 1252 lockmgr(&hammer2_mntlk, LK_RELEASE); 1253 hammer2_vfs_unmount(mp, MNT_FORCE); 1254 return EINVAL; 1255 } 1256 1257 /* 1258 * The super-root always uses an inode_tid of 1 when 1259 * creating PFSs. 1260 */ 1261 spmp->inode_tid = 1; 1262 spmp->modify_tid = schain->bref.modify_tid + 1; 1263 1264 /* 1265 * Sanity-check schain's pmp and finish initialization. 1266 * Any chain belonging to the super-root topology should 1267 * have a NULL pmp (not even set to spmp). 1268 */ 1269 ripdata = &hammer2_chain_rdata(schain)->ipdata; 1270 KKASSERT(schain->pmp == NULL); 1271 spmp->pfs_clid = ripdata->meta.pfs_clid; 1272 1273 /* 1274 * Replace the dummy spmp->iroot with a real one. It's 1275 * easier to just do a wholesale replacement than to try 1276 * to update the chain and fixup the iroot fields. 1277 * 1278 * The returned inode is locked with the supplied cluster. 
1279 */ 1280 cluster = hammer2_cluster_from_chain(schain); 1281 hammer2_inode_drop(spmp->iroot); 1282 spmp->iroot = NULL; 1283 spmp->iroot = hammer2_inode_get(spmp, NULL, cluster, -1); 1284 spmp->spmp_hmp = hmp; 1285 spmp->pfs_types[0] = ripdata->meta.pfs_type; 1286 spmp->pfs_hmps[0] = hmp; 1287 hammer2_inode_ref(spmp->iroot); 1288 hammer2_inode_unlock(spmp->iroot); 1289 hammer2_cluster_unlock(cluster); 1290 hammer2_cluster_drop(cluster); 1291 schain = NULL; 1292 /* leave spmp->iroot with one ref */ 1293 1294 if ((mp->mnt_flag & MNT_RDONLY) == 0) { 1295 error = hammer2_recovery(hmp); 1296 if (error == 0) 1297 error |= hammer2_fixup_pfses(hmp); 1298 /* XXX do something with error */ 1299 } 1300 hammer2_update_pmps(hmp); 1301 hammer2_iocom_init(hmp); 1302 hammer2_bulkfree_init(hmp); 1303 1304 /* 1305 * Ref the cluster management messaging descriptor. The mount 1306 * program deals with the other end of the communications pipe. 1307 * 1308 * Root mounts typically do not supply one. 1309 */ 1310 if (info.cluster_fd >= 0) { 1311 fp = holdfp(curthread, info.cluster_fd, -1); 1312 if (fp) { 1313 hammer2_cluster_reconnect(hmp, fp); 1314 } else { 1315 kprintf("hammer2_mount: bad cluster_fd!\n"); 1316 } 1317 } 1318 } else { 1319 spmp = hmp->spmp; 1320 if (info.hflags & HMNT2_DEVFLAGS) { 1321 kprintf("hammer2: Warning: mount flags pertaining " 1322 "to the whole device may only be specified " 1323 "on the first mount of the device: %08x\n", 1324 info.hflags & HMNT2_DEVFLAGS); 1325 } 1326 } 1327 1328 /* 1329 * Force local mount (disassociate all PFSs from their clusters). 1330 * Used primarily for debugging. 1331 */ 1332 force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL; 1333 1334 /* 1335 * Lookup the mount point under the media-localized super-root. 1336 * Scanning hammer2_pfslist doesn't help us because it represents 1337 * PFS cluster ids which can aggregate several named PFSs together. 
1338 * 1339 * cluster->pmp will incorrectly point to spmp and must be fixed 1340 * up later on. 1341 */ 1342 hammer2_inode_lock(spmp->iroot, 0); 1343 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 1344 lhc = hammer2_dirhash(label, strlen(label)); 1345 chain = hammer2_chain_lookup(&parent, &key_next, 1346 lhc, lhc + HAMMER2_DIRHASH_LOMASK, 1347 &error, 0); 1348 while (chain) { 1349 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 1350 strcmp(label, chain->data->ipdata.filename) == 0) { 1351 break; 1352 } 1353 chain = hammer2_chain_next(&parent, chain, &key_next, 1354 key_next, 1355 lhc + HAMMER2_DIRHASH_LOMASK, 1356 &error, 0); 1357 } 1358 if (parent) { 1359 hammer2_chain_unlock(parent); 1360 hammer2_chain_drop(parent); 1361 } 1362 hammer2_inode_unlock(spmp->iroot); 1363 1364 /* 1365 * PFS could not be found? 1366 */ 1367 if (chain == NULL) { 1368 if (error) 1369 kprintf("hammer2_mount: PFS label I/O error\n"); 1370 else 1371 kprintf("hammer2_mount: PFS label not found\n"); 1372 hammer2_unmount_helper(mp, NULL, hmp); 1373 lockmgr(&hammer2_mntlk, LK_RELEASE); 1374 hammer2_vfs_unmount(mp, MNT_FORCE); 1375 1376 return EINVAL; 1377 } 1378 1379 /* 1380 * Acquire the pmp structure (it should have already been allocated 1381 * via hammer2_update_pmps() so do not pass cluster in to add to 1382 * available chains). 1383 * 1384 * Check if the cluster has already been mounted. A cluster can 1385 * only be mounted once, use null mounts to mount additional copies. 
1386 */ 1387 if (chain->error) { 1388 kprintf("hammer2_mount: PFS label I/O error\n"); 1389 } else { 1390 ripdata = &chain->data->ipdata; 1391 bref = chain->bref; 1392 pmp = hammer2_pfsalloc(NULL, ripdata, 1393 bref.modify_tid, force_local); 1394 } 1395 hammer2_chain_unlock(chain); 1396 hammer2_chain_drop(chain); 1397 1398 /* 1399 * Finish the mount 1400 */ 1401 kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp); 1402 1403 if (pmp->mp) { 1404 kprintf("hammer2_mount: PFS already mounted!\n"); 1405 hammer2_unmount_helper(mp, NULL, hmp); 1406 lockmgr(&hammer2_mntlk, LK_RELEASE); 1407 hammer2_vfs_unmount(mp, MNT_FORCE); 1408 1409 return EBUSY; 1410 } 1411 1412 pmp->hflags = info.hflags; 1413 mp->mnt_flag |= MNT_LOCAL; 1414 mp->mnt_kern_flag |= MNTK_ALL_MPSAFE; /* all entry pts are SMP */ 1415 mp->mnt_kern_flag |= MNTK_THR_SYNC; /* new vsyncscan semantics */ 1416 1417 /* 1418 * required mount structure initializations 1419 */ 1420 mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE; 1421 mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE; 1422 1423 mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE; 1424 mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE; 1425 1426 /* 1427 * Optional fields 1428 */ 1429 mp->mnt_iosize_max = MAXPHYS; 1430 1431 /* 1432 * Connect up mount pointers. 
1433 */ 1434 hammer2_mount_helper(mp, pmp); 1435 1436 lockmgr(&hammer2_mntlk, LK_RELEASE); 1437 1438 /* 1439 * Finish setup 1440 */ 1441 vfs_getnewfsid(mp); 1442 vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops); 1443 vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops); 1444 vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops); 1445 1446 if (path) { 1447 copyinstr(info.volume, mp->mnt_stat.f_mntfromname, 1448 MNAMELEN - 1, &size); 1449 bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); 1450 } /* else root mount, already in there */ 1451 1452 bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname)); 1453 if (path) { 1454 copyinstr(path, mp->mnt_stat.f_mntonname, 1455 sizeof(mp->mnt_stat.f_mntonname) - 1, 1456 &size); 1457 } else { 1458 /* root mount */ 1459 mp->mnt_stat.f_mntonname[0] = '/'; 1460 } 1461 1462 /* 1463 * Initial statfs to prime mnt_stat. 1464 */ 1465 hammer2_vfs_statfs(mp, &mp->mnt_stat, cred); 1466 1467 return 0; 1468 } 1469 1470 /* 1471 * Scan PFSs under the super-root and create hammer2_pfs structures. 1472 */ 1473 static 1474 void 1475 hammer2_update_pmps(hammer2_dev_t *hmp) 1476 { 1477 const hammer2_inode_data_t *ripdata; 1478 hammer2_chain_t *parent; 1479 hammer2_chain_t *chain; 1480 hammer2_blockref_t bref; 1481 hammer2_dev_t *force_local; 1482 hammer2_pfs_t *spmp; 1483 hammer2_pfs_t *pmp; 1484 hammer2_key_t key_next; 1485 int error; 1486 1487 /* 1488 * Force local mount (disassociate all PFSs from their clusters). 1489 * Used primarily for debugging. 1490 */ 1491 force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL; 1492 1493 /* 1494 * Lookup mount point under the media-localized super-root. 1495 * 1496 * cluster->pmp will incorrectly point to spmp and must be fixed 1497 * up later on. 
1498 */ 1499 spmp = hmp->spmp; 1500 hammer2_inode_lock(spmp->iroot, 0); 1501 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 1502 chain = hammer2_chain_lookup(&parent, &key_next, 1503 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 1504 &error, 0); 1505 while (chain) { 1506 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) 1507 continue; 1508 if (chain->error) { 1509 kprintf("I/O error scanning PFS labels\n"); 1510 } else { 1511 ripdata = &chain->data->ipdata; 1512 bref = chain->bref; 1513 1514 pmp = hammer2_pfsalloc(chain, ripdata, 1515 bref.modify_tid, force_local); 1516 } 1517 chain = hammer2_chain_next(&parent, chain, &key_next, 1518 key_next, HAMMER2_KEY_MAX, 1519 &error, 0); 1520 } 1521 if (parent) { 1522 hammer2_chain_unlock(parent); 1523 hammer2_chain_drop(parent); 1524 } 1525 hammer2_inode_unlock(spmp->iroot); 1526 } 1527 1528 static 1529 int 1530 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path __unused, 1531 struct vnode *devvp, struct ucred *cred) 1532 { 1533 int error; 1534 1535 if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { 1536 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1537 VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, NULL); 1538 vn_unlock(devvp); 1539 error = hammer2_recovery(hmp); 1540 if (error == 0) 1541 error |= hammer2_fixup_pfses(hmp); 1542 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1543 if (error == 0) { 1544 VOP_CLOSE(devvp, FREAD, NULL); 1545 hmp->ronly = 0; 1546 } else { 1547 VOP_CLOSE(devvp, FREAD | FWRITE, NULL); 1548 } 1549 vn_unlock(devvp); 1550 } else { 1551 error = 0; 1552 } 1553 return error; 1554 } 1555 1556 static 1557 int 1558 hammer2_vfs_unmount(struct mount *mp, int mntflags) 1559 { 1560 hammer2_pfs_t *pmp; 1561 int flags; 1562 int error = 0; 1563 1564 pmp = MPTOPMP(mp); 1565 1566 if (pmp == NULL) 1567 return(0); 1568 1569 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE); 1570 1571 /* 1572 * If mount initialization proceeded far enough we must flush 1573 * its vnodes and sync the underlying mount points. 
Three syncs 1574 * are required to fully flush the filesystem (freemap updates lag 1575 * by one flush, and one extra for safety). 1576 */ 1577 if (mntflags & MNT_FORCE) 1578 flags = FORCECLOSE; 1579 else 1580 flags = 0; 1581 if (pmp->iroot) { 1582 error = vflush(mp, 0, flags); 1583 if (error) 1584 goto failed; 1585 hammer2_vfs_sync(mp, MNT_WAIT); 1586 hammer2_vfs_sync(mp, MNT_WAIT); 1587 hammer2_vfs_sync(mp, MNT_WAIT); 1588 } 1589 1590 /* 1591 * Cleanup the frontend support XOPS threads 1592 */ 1593 hammer2_xop_helper_cleanup(pmp); 1594 1595 if (pmp->mp) 1596 hammer2_unmount_helper(mp, pmp, NULL); 1597 1598 error = 0; 1599 failed: 1600 lockmgr(&hammer2_mntlk, LK_RELEASE); 1601 1602 return (error); 1603 } 1604 1605 /* 1606 * Mount helper, hook the system mount into our PFS. 1607 * The mount lock is held. 1608 * 1609 * We must bump the mount_count on related devices for any 1610 * mounted PFSs. 1611 */ 1612 static 1613 void 1614 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp) 1615 { 1616 hammer2_cluster_t *cluster; 1617 hammer2_chain_t *rchain; 1618 int i; 1619 1620 mp->mnt_data = (qaddr_t)pmp; 1621 pmp->mp = mp; 1622 1623 /* 1624 * After pmp->mp is set we have to adjust hmp->mount_count. 1625 */ 1626 cluster = &pmp->iroot->cluster; 1627 for (i = 0; i < cluster->nchains; ++i) { 1628 rchain = cluster->array[i].chain; 1629 if (rchain == NULL) 1630 continue; 1631 ++rchain->hmp->mount_count; 1632 } 1633 1634 /* 1635 * Create missing Xop threads 1636 */ 1637 hammer2_xop_helper_create(pmp); 1638 } 1639 1640 /* 1641 * Mount helper, unhook the system mount from our PFS. 1642 * The mount lock is held. 1643 * 1644 * If hmp is supplied a mount responsible for being the first to open 1645 * the block device failed and the block device and all PFSs using the 1646 * block device must be cleaned up. 1647 * 1648 * If pmp is supplied multiple devices might be backing the PFS and each 1649 * must be disconnected. 
/*
 * Mount helper, unhook the system mount from our PFS.
 * The mount lock is held.
 *
 * The function has two modes, selected by which of (pmp, hmp) is given:
 *
 * pmp != NULL (hmp must be NULL): high-level unmount of a PFS.  The
 *	mount and PFS are unlinked, mount_count is decremented on every
 *	device backing the PFS root cluster, and then every device on
 *	hammer2_mntlist whose mount_count reached zero is torn down by
 *	recursing with that device.
 *
 * hmp != NULL (pmp NULL): device teardown.  Does nothing while the
 *	device still has mounts.  Otherwise the device is flushed, its
 *	vnode closed and released, the embedded vchain/fchain dropped,
 *	and the hammer2_dev structure removed from hammer2_mntlist and
 *	freed.  This mode is also used by the mount path when the first
 *	open of a block device fails part-way through.
 */
static
void
hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
{
	hammer2_cluster_t *cluster;
	hammer2_chain_t *rchain;
	struct vnode *devvp;
	int dumpcnt;
	int ronly;
	int i;

	/*
	 * If no device supplied this is a high-level unmount and we have
	 * to disconnect the mount, adjust mount_count, and locate devices
	 * that might now have no mounts.
	 */
	if (pmp) {
		KKASSERT(hmp == NULL);
		KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
		pmp->mp = NULL;
		mp->mnt_data = NULL;

		/*
		 * After pmp->mp is cleared we have to account for
		 * mount_count.
		 */
		cluster = &pmp->iroot->cluster;
		for (i = 0; i < cluster->nchains; ++i) {
			rchain = cluster->array[i].chain;
			if (rchain == NULL)
				continue;
			--rchain->hmp->mount_count;
			/* scrapping hmp now may invalidate the pmp */
		}

		/*
		 * Tear down every device left without mounts.  The device
		 * teardown removes the entry from hammer2_mntlist, so the
		 * list walk is restarted from the top after each one.
		 */
again:
		TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
			if (hmp->mount_count == 0) {
				hammer2_unmount_helper(NULL, NULL, hmp);
				goto again;
			}
		}
		return;
	}

	/*
	 * Try to terminate the block device.  We can't terminate it if
	 * there are still PFSs referencing it.
	 */
	if (hmp->mount_count)
		return;

	/*
	 * Decommission the network before we start messing with the
	 * device and PFS.
	 */
	hammer2_iocom_uninit(hmp);

	hammer2_bulkfree_uninit(hmp);
	hammer2_pfsfree_scan(hmp, 0);
	hammer2_dev_exlock(hmp);	/* XXX order */

	/*
	 * Cycle the volume data lock as a safety (probably not needed any
	 * more).  To ensure everything is out we need to flush at least
	 * three times.  (1) The running of the sideq can dirty the
	 * filesystem, (2) A normal flush can dirty the freemap, and
	 * (3) ensure that the freemap is fully synchronized.
	 *
	 * The next mount's recovery scan can clean everything up but we want
	 * to leave the filesystem in a 100% clean state on a normal unmount.
	 *
	 * NOTE(review): the lock cycling itself is compiled out below.
	 */
#if 0
	hammer2_voldata_lock(hmp);
	hammer2_voldata_unlock(hmp);
#endif

	/*
	 * Flush whatever is left.  Unmounted but modified PFS's might still
	 * have some dirty chains on them.
	 *
	 * Both embedded chains are locked up front; fchain is flushed and
	 * unlocked first, then vchain.
	 */
	hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
	hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);

	if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
		hammer2_voldata_modify(hmp);
		hammer2_flush(&hmp->fchain, HAMMER2_FLUSH_TOP |
					    HAMMER2_FLUSH_ALL);
	}
	hammer2_chain_unlock(&hmp->fchain);

	if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
		hammer2_flush(&hmp->vchain, HAMMER2_FLUSH_TOP |
					    HAMMER2_FLUSH_ALL);
	}
	hammer2_chain_unlock(&hmp->vchain);

	/*
	 * Report (and optionally drop into the debugger) if either chain
	 * still carries flush-related flags after the final flush.
	 */
	if ((hmp->vchain.flags | hmp->fchain.flags) &
	    HAMMER2_CHAIN_FLUSH_MASK) {
		kprintf("hammer2_unmount: chains left over "
			"after final sync\n");
		kprintf(" vchain %08x\n", hmp->vchain.flags);
		kprintf(" fchain %08x\n", hmp->fchain.flags);

		if (hammer2_debug & 0x0010)
			Debugger("entered debugger");
	}

	hammer2_pfsfree_scan(hmp, 1);

	/* the second pfsfree scan is expected to have released the spmp */
	KKASSERT(hmp->spmp == NULL);

	/*
	 * Finish up with the device vnode.  Buffers are invalidated
	 * (saved first unless the device was opened read-only), then the
	 * vnode is closed with the same mode it was opened with and our
	 * reference is released.
	 */
	if ((devvp = hmp->devvp) != NULL) {
		ronly = hmp->ronly;
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		kprintf("hammer2_unmount(A): devvp %s rbdirty %p ronly=%d\n",
			hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree),
			ronly);
		vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
		kprintf("hammer2_unmount(B): devvp %s rbdirty %p\n",
			hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree));
		hmp->devvp = NULL;
		VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
		vn_unlock(devvp);
		vrele(devvp);
		devvp = NULL;
	}

	/*
	 * Clear vchain/fchain flags that might prevent final cleanup
	 * of these chains.  Clearing MODIFIED also backs the chain out of
	 * the global modified-chain count.
	 */
	if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
		atomic_add_long(&hammer2_count_modified_chains, -1);
		atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
	}
	if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
		atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_UPDATE);
	}

	if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
		atomic_add_long(&hammer2_count_modified_chains, -1);
		atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
	}
	if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
		atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_UPDATE);
	}

	/*
	 * Final drop of embedded freemap root chain to
	 * clean up fchain.core (fchain structure is not
	 * flagged ALLOCATED so it is cleaned out and then
	 * left to rot).
	 */
	hammer2_chain_drop(&hmp->fchain);

	/*
	 * Final drop of embedded volume root chain to clean
	 * up vchain.core (vchain structure is not flagged
	 * ALLOCATED so it is cleaned out and then left to
	 * rot).
	 *
	 * Both embedded chains are dumped (limited to 50 entries each)
	 * before the device lock is released and vchain is dropped.
	 */
	dumpcnt = 50;
	hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
	dumpcnt = 50;
	hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1);
	hammer2_dev_unlock(hmp);
	hammer2_chain_drop(&hmp->vchain);

	hammer2_io_cleanup(hmp, &hmp->iotree);
	if (hmp->iofree_count) {
		kprintf("io_cleanup: %d I/O's left hanging\n",
			hmp->iofree_count);
	}

	/*
	 * Unlink the device from the global list and free it.
	 */
	TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
	kmalloc_destroy(&hmp->mchain);
	kfree(hmp, M_HAMMER2);
}
1819 */ 1820 dumpcnt = 50; 1821 hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1); 1822 dumpcnt = 50; 1823 hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1); 1824 hammer2_dev_unlock(hmp); 1825 hammer2_chain_drop(&hmp->vchain); 1826 1827 hammer2_io_cleanup(hmp, &hmp->iotree); 1828 if (hmp->iofree_count) { 1829 kprintf("io_cleanup: %d I/O's left hanging\n", 1830 hmp->iofree_count); 1831 } 1832 1833 TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry); 1834 kmalloc_destroy(&hmp->mchain); 1835 kfree(hmp, M_HAMMER2); 1836 } 1837 1838 int 1839 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp, 1840 ino_t ino, struct vnode **vpp) 1841 { 1842 hammer2_xop_lookup_t *xop; 1843 hammer2_pfs_t *pmp; 1844 hammer2_inode_t *ip; 1845 hammer2_tid_t inum; 1846 int error; 1847 1848 inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK; 1849 1850 error = 0; 1851 pmp = MPTOPMP(mp); 1852 1853 /* 1854 * Easy if we already have it cached 1855 */ 1856 ip = hammer2_inode_lookup(pmp, inum); 1857 if (ip) { 1858 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 1859 *vpp = hammer2_igetv(ip, &error); 1860 hammer2_inode_unlock(ip); 1861 hammer2_inode_drop(ip); /* from lookup */ 1862 1863 return error; 1864 } 1865 1866 /* 1867 * Otherwise we have to find the inode 1868 */ 1869 xop = hammer2_xop_alloc(pmp->iroot, 0); 1870 xop->lhc = inum; 1871 hammer2_xop_start(&xop->head, hammer2_xop_lookup); 1872 error = hammer2_xop_collect(&xop->head, 0); 1873 1874 if (error == 0) { 1875 if (hammer2_cluster_rdata(&xop->head.cluster) == NULL) { 1876 kprintf("vget: no collect error but also no rdata\n"); 1877 kprintf("xop %p\n", xop); 1878 while ((hammer2_debug & 0x80000) == 0) { 1879 tsleep(xop, PCATCH, "wait", hz * 10); 1880 } 1881 ip = NULL; 1882 } else { 1883 ip = hammer2_inode_get(pmp, NULL, &xop->head.cluster, -1); 1884 } 1885 } 1886 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1887 1888 if (ip) { 1889 *vpp = hammer2_igetv(ip, &error); 1890 hammer2_inode_unlock(ip); 1891 } else { 1892 
*vpp = NULL; 1893 error = ENOENT; 1894 } 1895 return (error); 1896 } 1897 1898 static 1899 int 1900 hammer2_vfs_root(struct mount *mp, struct vnode **vpp) 1901 { 1902 hammer2_pfs_t *pmp; 1903 struct vnode *vp; 1904 int error; 1905 1906 pmp = MPTOPMP(mp); 1907 if (pmp->iroot == NULL) { 1908 *vpp = NULL; 1909 return EINVAL; 1910 } 1911 1912 error = 0; 1913 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 1914 1915 while (pmp->inode_tid == 0) { 1916 hammer2_xop_ipcluster_t *xop; 1917 hammer2_inode_meta_t *meta; 1918 1919 xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING); 1920 hammer2_xop_start(&xop->head, hammer2_xop_ipcluster); 1921 error = hammer2_xop_collect(&xop->head, 0); 1922 1923 if (error == 0) { 1924 meta = &xop->head.cluster.focus->data->ipdata.meta; 1925 pmp->iroot->meta = *meta; 1926 pmp->inode_tid = meta->pfs_inum + 1; 1927 if (pmp->inode_tid < HAMMER2_INODE_START) 1928 pmp->inode_tid = HAMMER2_INODE_START; 1929 pmp->modify_tid = 1930 xop->head.cluster.focus->bref.modify_tid + 1; 1931 #if 0 1932 kprintf("PFS: Starting inode %jd\n", 1933 (intmax_t)pmp->inode_tid); 1934 kprintf("PMP focus good set nextino=%ld mod=%016jx\n", 1935 pmp->inode_tid, pmp->modify_tid); 1936 #endif 1937 wakeup(&pmp->iroot); 1938 1939 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1940 1941 /* 1942 * Prime the mount info. 
1943 */ 1944 hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL); 1945 break; 1946 } 1947 1948 /* 1949 * Loop, try again 1950 */ 1951 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1952 hammer2_inode_unlock(pmp->iroot); 1953 error = tsleep(&pmp->iroot, PCATCH, "h2root", hz); 1954 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 1955 if (error == EINTR) 1956 break; 1957 } 1958 1959 if (error) { 1960 hammer2_inode_unlock(pmp->iroot); 1961 *vpp = NULL; 1962 } else { 1963 vp = hammer2_igetv(pmp->iroot, &error); 1964 hammer2_inode_unlock(pmp->iroot); 1965 *vpp = vp; 1966 } 1967 1968 return (error); 1969 } 1970 1971 /* 1972 * Filesystem status 1973 * 1974 * XXX incorporate ipdata->meta.inode_quota and data_quota 1975 */ 1976 static 1977 int 1978 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) 1979 { 1980 hammer2_pfs_t *pmp; 1981 hammer2_dev_t *hmp; 1982 hammer2_blockref_t bref; 1983 struct statfs tmp; 1984 int i; 1985 1986 /* 1987 * NOTE: iroot might not have validated the cluster yet. 
1988 */ 1989 pmp = MPTOPMP(mp); 1990 1991 bzero(&tmp, sizeof(tmp)); 1992 1993 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 1994 hmp = pmp->pfs_hmps[i]; 1995 if (hmp == NULL) 1996 continue; 1997 if (pmp->iroot->cluster.array[i].chain) 1998 bref = pmp->iroot->cluster.array[i].chain->bref; 1999 else 2000 bzero(&bref, sizeof(bref)); 2001 2002 tmp.f_files = bref.embed.stats.inode_count; 2003 tmp.f_ffree = 0; 2004 tmp.f_blocks = hmp->voldata.allocator_size / 2005 mp->mnt_vstat.f_bsize; 2006 tmp.f_bfree = hmp->voldata.allocator_free / 2007 mp->mnt_vstat.f_bsize; 2008 tmp.f_bavail = tmp.f_bfree; 2009 2010 if (cred && cred->cr_uid != 0) { 2011 uint64_t adj; 2012 2013 /* 5% */ 2014 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2015 tmp.f_blocks -= adj; 2016 tmp.f_bfree -= adj; 2017 tmp.f_bavail -= adj; 2018 } 2019 2020 mp->mnt_stat.f_blocks = tmp.f_blocks; 2021 mp->mnt_stat.f_bfree = tmp.f_bfree; 2022 mp->mnt_stat.f_bavail = tmp.f_bavail; 2023 mp->mnt_stat.f_files = tmp.f_files; 2024 mp->mnt_stat.f_ffree = tmp.f_ffree; 2025 2026 *sbp = mp->mnt_stat; 2027 } 2028 return (0); 2029 } 2030 2031 static 2032 int 2033 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) 2034 { 2035 hammer2_pfs_t *pmp; 2036 hammer2_dev_t *hmp; 2037 hammer2_blockref_t bref; 2038 struct statvfs tmp; 2039 int i; 2040 2041 /* 2042 * NOTE: iroot might not have validated the cluster yet. 
2043 */ 2044 pmp = MPTOPMP(mp); 2045 bzero(&tmp, sizeof(tmp)); 2046 2047 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2048 hmp = pmp->pfs_hmps[i]; 2049 if (hmp == NULL) 2050 continue; 2051 if (pmp->iroot->cluster.array[i].chain) 2052 bref = pmp->iroot->cluster.array[i].chain->bref; 2053 else 2054 bzero(&bref, sizeof(bref)); 2055 2056 tmp.f_files = bref.embed.stats.inode_count; 2057 tmp.f_ffree = 0; 2058 tmp.f_blocks = hmp->voldata.allocator_size / 2059 mp->mnt_vstat.f_bsize; 2060 tmp.f_bfree = hmp->voldata.allocator_free / 2061 mp->mnt_vstat.f_bsize; 2062 tmp.f_bavail = tmp.f_bfree; 2063 2064 if (cred && cred->cr_uid != 0) { 2065 uint64_t adj; 2066 2067 /* 5% */ 2068 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2069 tmp.f_blocks -= adj; 2070 tmp.f_bfree -= adj; 2071 tmp.f_bavail -= adj; 2072 } 2073 2074 mp->mnt_vstat.f_blocks = tmp.f_blocks; 2075 mp->mnt_vstat.f_bfree = tmp.f_bfree; 2076 mp->mnt_vstat.f_bavail = tmp.f_bavail; 2077 mp->mnt_vstat.f_files = tmp.f_files; 2078 mp->mnt_vstat.f_ffree = tmp.f_ffree; 2079 2080 *sbp = mp->mnt_vstat; 2081 } 2082 return (0); 2083 } 2084 2085 /* 2086 * Mount-time recovery (RW mounts) 2087 * 2088 * Updates to the free block table are allowed to lag flushes by one 2089 * transaction. In case of a crash, then on a fresh mount we must do an 2090 * incremental scan of the last committed transaction id and make sure that 2091 * all related blocks have been marked allocated. 2092 * 2093 * The super-root topology and each PFS has its own transaction id domain, 2094 * so we must track PFS boundary transitions. 
2095 */ 2096 struct hammer2_recovery_elm { 2097 TAILQ_ENTRY(hammer2_recovery_elm) entry; 2098 hammer2_chain_t *chain; 2099 hammer2_tid_t sync_tid; 2100 }; 2101 2102 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm); 2103 2104 struct hammer2_recovery_info { 2105 struct hammer2_recovery_list list; 2106 hammer2_tid_t mtid; 2107 int depth; 2108 }; 2109 2110 static int hammer2_recovery_scan(hammer2_dev_t *hmp, 2111 hammer2_chain_t *parent, 2112 struct hammer2_recovery_info *info, 2113 hammer2_tid_t sync_tid); 2114 2115 #define HAMMER2_RECOVERY_MAXDEPTH 10 2116 2117 static 2118 int 2119 hammer2_recovery(hammer2_dev_t *hmp) 2120 { 2121 struct hammer2_recovery_info info; 2122 struct hammer2_recovery_elm *elm; 2123 hammer2_chain_t *parent; 2124 hammer2_tid_t sync_tid; 2125 hammer2_tid_t mirror_tid; 2126 int error; 2127 2128 hammer2_trans_init(hmp->spmp, 0); 2129 2130 sync_tid = hmp->voldata.freemap_tid; 2131 mirror_tid = hmp->voldata.mirror_tid; 2132 2133 kprintf("hammer2 mount \"%s\": ", hmp->devrepname); 2134 if (sync_tid >= mirror_tid) { 2135 kprintf(" no recovery needed\n"); 2136 } else { 2137 kprintf(" freemap recovery %016jx-%016jx\n", 2138 sync_tid + 1, mirror_tid); 2139 } 2140 2141 TAILQ_INIT(&info.list); 2142 info.depth = 0; 2143 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 2144 error = hammer2_recovery_scan(hmp, parent, &info, sync_tid); 2145 hammer2_chain_lookup_done(parent); 2146 2147 while ((elm = TAILQ_FIRST(&info.list)) != NULL) { 2148 TAILQ_REMOVE(&info.list, elm, entry); 2149 parent = elm->chain; 2150 sync_tid = elm->sync_tid; 2151 kfree(elm, M_HAMMER2); 2152 2153 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2154 error |= hammer2_recovery_scan(hmp, parent, &info, 2155 hmp->voldata.freemap_tid); 2156 hammer2_chain_unlock(parent); 2157 hammer2_chain_drop(parent); /* drop elm->chain ref */ 2158 } 2159 2160 hammer2_trans_done(hmp->spmp); 2161 2162 return error; 2163 } 2164 2165 static 2166 int 2167 hammer2_recovery_scan(hammer2_dev_t *hmp, 
/*
 * Recursive worker for hammer2_recovery().
 *
 * Marks the block(s) referenced by (parent) as allocated in the freemap
 * and then walks parent's children.  Children whose bref.mirror_tid is
 * greater than (sync_tid) are either adjusted directly (no chain was
 * returned for them by the scan) or recursed into.
 *
 * hmp		device being recovered
 * parent	chain to scan beneath
 * info		shared recovery state (deferral list and current depth)
 * sync_tid	blockrefs with a mirror_tid at or below this are skipped
 *
 * Returns 0 or an OR of HAMMER2_ERROR_* bits, with HAMMER2_ERROR_EOF
 * masked out.
 */
static
int
hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
		      struct hammer2_recovery_info *info,
		      hammer2_tid_t sync_tid)
{
	const hammer2_inode_data_t *ripdata;
	hammer2_chain_t *chain;
	hammer2_blockref_t bref;
	int tmp_error;
	int rup_error;
	int error;
	int first;

	/*
	 * Adjust freemap to ensure that the block(s) are marked allocated.
	 * The volume root itself is excluded.
	 */
	if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
		hammer2_freemap_adjust(hmp, &parent->bref,
				       HAMMER2_FREEMAP_DORECOVER);
	}

	/*
	 * Check type for recursive scan.  Only volume, inode and indirect
	 * chains are descended into; an unrecognized type is an error.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/* data already instantiated */
		break;
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Must instantiate data for DIRECTDATA test and also
		 * for recursion.
		 */
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		ripdata = &hammer2_chain_rdata(parent)->ipdata;
		if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			/* not applicable to recovery scan */
			hammer2_chain_unlock(parent);
			return 0;
		}
		hammer2_chain_unlock(parent);
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Must instantiate data for recursion
		 */
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_unlock(parent);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/* not applicable to recovery scan */
		return 0;
		break;
	default:
		return HAMMER2_ERROR_BADBREF;
	}

	/*
	 * Defer operation if depth limit reached.  The chain is referenced
	 * and queued on info->list for hammer2_recovery() to resume later.
	 *
	 * NOTE(review): the original comment also mentioned deferring when
	 * crossing a PFS boundary, but only the depth is tested here.
	 */
	if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) {
		struct hammer2_recovery_elm *elm;

		elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
		elm->chain = parent;
		elm->sync_tid = sync_tid;
		hammer2_chain_ref(parent);
		TAILQ_INSERT_TAIL(&info->list, elm, entry);
		/* unlocked by caller */

		return(0);
	}


	/*
	 * Recursive scan of the last flushed transaction only.  We are
	 * doing this without pmp assignments so don't leave the chains
	 * hanging around after we are done with them.
	 *
	 * error	Cumulative error this level only
	 * rup_error	Cumulative error for recursion
	 * tmp_error	Specific non-cumulative recursion error
	 */
	chain = NULL;
	first = 1;
	rup_error = 0;
	error = 0;

	for (;;) {
		error |= hammer2_chain_scan(parent, &chain, &bref,
					    &first,
					    HAMMER2_LOOKUP_NODATA);

		/*
		 * Problem during scan or EOF.  Any non-zero status ends
		 * the loop; the EOF bit is stripped on return.
		 */
		if (error)
			break;

		/*
		 * If this is a leaf (the scan returned a blockref but no
		 * chain) adjust the freemap directly from the blockref.
		 */
		if (chain == NULL) {
			if (bref.mirror_tid > sync_tid) {
				hammer2_freemap_adjust(hmp, &bref,
						     HAMMER2_FREEMAP_DORECOVER);
			}
			continue;
		}

		/*
		 * This may or may not be a recursive node.  Recurse only
		 * if it was modified after sync_tid, tracking the depth
		 * around the call.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
		if (bref.mirror_tid > sync_tid) {
			++info->depth;
			tmp_error = hammer2_recovery_scan(hmp, chain,
							   info, sync_tid);
			--info->depth;
		} else {
			tmp_error = 0;
		}

		/*
		 * Flush the recovery at the PFS boundary to stage it for
		 * the final flush of the super-root topology.
		 */
		if (tmp_error == 0 &&
		    (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
		    (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
			hammer2_flush(chain, HAMMER2_FLUSH_TOP |
					     HAMMER2_FLUSH_ALL);
		}
		rup_error |= tmp_error;
	}
	return ((error | rup_error) & ~HAMMER2_ERROR_EOF);
}
} 2365 chain = hammer2_chain_next(&parent, chain, &key_next, 2366 key_next, HAMMER2_KEY_MAX, 2367 &error, 0); 2368 } 2369 if (parent) { 2370 hammer2_chain_unlock(parent); 2371 hammer2_chain_drop(parent); 2372 } 2373 hammer2_inode_unlock(spmp->iroot); 2374 2375 return error; 2376 } 2377 2378 /* 2379 * Sync a mount point; this is called periodically on a per-mount basis from 2380 * the filesystem syncer, and whenever a user issues a sync. 2381 */ 2382 int 2383 hammer2_vfs_sync(struct mount *mp, int waitfor) 2384 { 2385 hammer2_xop_flush_t *xop; 2386 struct hammer2_sync_info info; 2387 hammer2_inode_t *iroot; 2388 hammer2_pfs_t *pmp; 2389 int flags; 2390 int error; 2391 2392 pmp = MPTOPMP(mp); 2393 iroot = pmp->iroot; 2394 KKASSERT(iroot); 2395 KKASSERT(iroot->pmp == pmp); 2396 2397 /* 2398 * We can't acquire locks on existing vnodes while in a transaction 2399 * without risking a deadlock. This assumes that vfsync() can be 2400 * called without the vnode locked (which it can in DragonFly). 2401 * Otherwise we'd have to implement a multi-pass or flag the lock 2402 * failures and retry. 2403 * 2404 * The reclamation code interlocks with the sync list's token 2405 * (by removing the vnode from the scan list) before unlocking 2406 * the inode, giving us time to ref the inode. 2407 */ 2408 /*flags = VMSC_GETVP;*/ 2409 flags = 0; 2410 if (waitfor & MNT_LAZY) 2411 flags |= VMSC_ONEPASS; 2412 2413 /* 2414 * Flush vnodes individually using a normal transaction to avoid 2415 * stalling any concurrent operations. This will flush the related 2416 * buffer cache buffers and inodes to the media. 2417 * 2418 * For efficiency do an async pass before making sure with a 2419 * synchronous pass on all related buffer cache buffers. 
2420 */ 2421 hammer2_trans_init(pmp, 0); 2422 2423 info.error = 0; 2424 2425 info.waitfor = MNT_NOWAIT; 2426 info.pass = 1; 2427 vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info); 2428 2429 /* 2430 * Now do two passes making sure we get everything. The first pass 2431 * vfsync()s dirty vnodes. The second pass waits for their I/O's 2432 * to finish and cleans up the dirty flag on the vnode. 2433 */ 2434 info.pass = 1; 2435 info.waitfor = MNT_WAIT; 2436 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2437 2438 info.pass = 2; 2439 info.waitfor = MNT_WAIT; 2440 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2441 2442 /* 2443 * We must also run the sideq to handle any disconnected inodes 2444 * as the vnode scan will not see these. 2445 */ 2446 hammer2_inode_run_sideq(pmp, 1); 2447 hammer2_trans_done(pmp); 2448 2449 /* 2450 * Start our flush transaction and flush the root topology down to 2451 * the inodes, but not the inodes themselves (which we already flushed 2452 * above). Any concurrent activity effecting inode contents will not 2453 * 2454 * The flush sequence will 2455 * 2456 * NOTE! It is still possible for the paging code to push pages 2457 * out via a UIO_NOCOPY hammer2_vop_write() during the main 2458 * flush. 2459 */ 2460 hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH); 2461 2462 /* 2463 * sync dirty vnodes again while in the flush transaction. This is 2464 * currently an expensive shim to makre sure the logical topology is 2465 * completely consistent before we flush the volume header. 2466 */ 2467 info.pass = 1; 2468 info.waitfor = MNT_WAIT; 2469 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2470 2471 info.pass = 2; 2472 info.waitfor = MNT_WAIT; 2473 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2474 2475 /* 2476 * Use the XOP interface to concurrently flush all nodes to 2477 * synchronize the PFSROOT subtopology to the media. A standard 2478 * end-of-scan ENOENT error indicates cluster sufficiency. 
2479 * 2480 * Note that this flush will not be visible on crash recovery until 2481 * we flush the super-root topology in the next loop. 2482 * 2483 * XXX For now wait for all flushes to complete. 2484 */ 2485 if (iroot) { 2486 /* 2487 * If unmounting try to flush everything including any 2488 * sub-trees under inodes, just in case there is dangling 2489 * modified data, as a safety. Otherwise just flush up to 2490 * the inodes in this stage. 2491 */ 2492 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 2493 xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING | 2494 HAMMER2_XOP_VOLHDR); 2495 } else { 2496 xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING | 2497 HAMMER2_XOP_INODE_STOP | 2498 HAMMER2_XOP_VOLHDR); 2499 } 2500 hammer2_xop_start(&xop->head, hammer2_inode_xop_flush); 2501 error = hammer2_xop_collect(&xop->head, 2502 HAMMER2_XOP_COLLECT_WAITALL); 2503 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2504 if (error == HAMMER2_ERROR_ENOENT) 2505 error = 0; 2506 else 2507 error = hammer2_error_to_errno(error); 2508 } else { 2509 error = 0; 2510 } 2511 hammer2_trans_done(pmp); 2512 2513 return (error); 2514 } 2515 2516 /* 2517 * Sync passes. 2518 * 2519 * Note that we ignore the tranasction mtid we got above. Instead, 2520 * each vfsync below will ultimately get its own via TRANS_BUFCACHE 2521 * transactions. 2522 * 2523 * WARNING! The frontend might be waiting on chnmem (limit_dirty_chains) 2524 * while holding a vnode locked. When this situation occurs we cannot 2525 * safely test whether it is ok to clear the dirty bit on the vnode. 2526 * However, we can still flush the inode's topology. 2527 */ 2528 static int 2529 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data) 2530 { 2531 struct hammer2_sync_info *info = data; 2532 hammer2_inode_t *ip; 2533 int error; 2534 2535 /* 2536 * Degenerate cases. Note that ip == NULL typically means the 2537 * syncer vnode itself and we don't want to vclrisdirty() in that 2538 * situation. 
2539 */ 2540 ip = VTOI(vp); 2541 if (ip == NULL) { 2542 return(0); 2543 } 2544 if (vp->v_type == VNON || vp->v_type == VBAD) { 2545 vclrisdirty(vp); 2546 return(0); 2547 } 2548 2549 /* 2550 * Synchronize the buffer cche and inode meta-data to the backing 2551 * chain topology. 2552 * 2553 * vfsync is not necessarily synchronous, so it is best NOT to try 2554 * to flush the backing topology to media at this point. 2555 */ 2556 hammer2_inode_ref(ip); 2557 if ((ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED)) || 2558 !RB_EMPTY(&vp->v_rbdirty_tree)) { 2559 if (info->pass == 1) 2560 vfsync(vp, info->waitfor, 1, NULL, NULL); 2561 else 2562 bio_track_wait(&vp->v_track_write, 0, 0); 2563 } 2564 if (info->pass == 2 && (vp->v_flag & VISDIRTY)) { 2565 /* 2566 * v_token is needed to interlock v_rbdirty_tree. 2567 */ 2568 lwkt_gettoken(&vp->v_token); 2569 hammer2_inode_lock(ip, 0); 2570 hammer2_inode_chain_sync(ip); 2571 hammer2_inode_chain_flush(ip); 2572 if ((ip->flags & (HAMMER2_INODE_MODIFIED | 2573 HAMMER2_INODE_RESIZED | 2574 HAMMER2_INODE_DIRTYDATA)) == 0 && 2575 RB_EMPTY(&vp->v_rbdirty_tree) && 2576 !bio_track_active(&vp->v_track_write)) { 2577 vclrisdirty(vp); 2578 } 2579 hammer2_inode_unlock(ip); 2580 lwkt_reltoken(&vp->v_token); 2581 } 2582 hammer2_inode_drop(ip); 2583 #if 1 2584 error = 0; 2585 if (error) 2586 info->error = error; 2587 #endif 2588 return(0); 2589 } 2590 2591 static 2592 int 2593 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp) 2594 { 2595 hammer2_inode_t *ip; 2596 2597 KKASSERT(MAXFIDSZ >= 16); 2598 ip = VTOI(vp); 2599 fhp->fid_len = offsetof(struct fid, fid_data[16]); 2600 fhp->fid_ext = 0; 2601 ((hammer2_tid_t *)fhp->fid_data)[0] = ip->meta.inum; 2602 ((hammer2_tid_t *)fhp->fid_data)[1] = 0; 2603 2604 return 0; 2605 } 2606 2607 static 2608 int 2609 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 2610 struct fid *fhp, struct vnode **vpp) 2611 { 2612 hammer2_pfs_t *pmp; 2613 hammer2_tid_t inum; 2614 int error; 2615 2616 
pmp = MPTOPMP(mp); 2617 inum = ((hammer2_tid_t *)fhp->fid_data)[0] & HAMMER2_DIRHASH_USERMSK; 2618 if (vpp) { 2619 if (inum == 1) 2620 error = hammer2_vfs_root(mp, vpp); 2621 else 2622 error = hammer2_vfs_vget(mp, NULL, inum, vpp); 2623 } else { 2624 error = 0; 2625 } 2626 if (error) 2627 kprintf("fhtovp: %016jx -> %p, %d\n", inum, *vpp, error); 2628 return error; 2629 } 2630 2631 static 2632 int 2633 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 2634 int *exflagsp, struct ucred **credanonp) 2635 { 2636 hammer2_pfs_t *pmp; 2637 struct netcred *np; 2638 int error; 2639 2640 pmp = MPTOPMP(mp); 2641 np = vfs_export_lookup(mp, &pmp->export, nam); 2642 if (np) { 2643 *exflagsp = np->netc_exflags; 2644 *credanonp = &np->netc_anon; 2645 error = 0; 2646 } else { 2647 error = EACCES; 2648 } 2649 return error; 2650 } 2651 2652 /* 2653 * Support code for hammer2_vfs_mount(). Read, verify, and install the volume 2654 * header into the HMP 2655 * 2656 * XXX read four volhdrs and use the one with the highest TID whos CRC 2657 * matches. 2658 * 2659 * XXX check iCRCs. 2660 * 2661 * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to 2662 * nonexistant locations. 2663 * 2664 * XXX Record selected volhdr and ring updates to each of 4 volhdrs 2665 */ 2666 static 2667 int 2668 hammer2_install_volume_header(hammer2_dev_t *hmp) 2669 { 2670 hammer2_volume_data_t *vd; 2671 struct buf *bp; 2672 hammer2_crc32_t crc0, crc, bcrc0, bcrc; 2673 int error_reported; 2674 int error; 2675 int valid; 2676 int i; 2677 2678 error_reported = 0; 2679 error = 0; 2680 valid = 0; 2681 bp = NULL; 2682 2683 /* 2684 * There are up to 4 copies of the volume header (syncs iterate 2685 * between them so there is no single master). We don't trust the 2686 * volu_size field so we don't know precisely how large the filesystem 2687 * is, so depend on the OS to return an error if we go beyond the 2688 * block device's EOF. 
2689 */ 2690 for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) { 2691 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64, 2692 HAMMER2_VOLUME_BYTES, &bp); 2693 if (error) { 2694 brelse(bp); 2695 bp = NULL; 2696 continue; 2697 } 2698 2699 vd = (struct hammer2_volume_data *) bp->b_data; 2700 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) && 2701 (vd->magic != HAMMER2_VOLUME_ID_ABO)) { 2702 brelse(bp); 2703 bp = NULL; 2704 continue; 2705 } 2706 2707 if (vd->magic == HAMMER2_VOLUME_ID_ABO) { 2708 /* XXX: Reversed-endianness filesystem */ 2709 kprintf("hammer2: reverse-endian filesystem detected"); 2710 brelse(bp); 2711 bp = NULL; 2712 continue; 2713 } 2714 2715 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0]; 2716 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF, 2717 HAMMER2_VOLUME_ICRC0_SIZE); 2718 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1]; 2719 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF, 2720 HAMMER2_VOLUME_ICRC1_SIZE); 2721 if ((crc0 != crc) || (bcrc0 != bcrc)) { 2722 kprintf("hammer2 volume header crc " 2723 "mismatch copy #%d %08x/%08x\n", 2724 i, crc0, crc); 2725 error_reported = 1; 2726 brelse(bp); 2727 bp = NULL; 2728 continue; 2729 } 2730 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) { 2731 valid = 1; 2732 hmp->voldata = *vd; 2733 hmp->volhdrno = i; 2734 } 2735 brelse(bp); 2736 bp = NULL; 2737 } 2738 if (valid) { 2739 hmp->volsync = hmp->voldata; 2740 hmp->free_reserved = hmp->voldata.allocator_size / 20; 2741 error = 0; 2742 if (error_reported || bootverbose || 1) { /* 1/DEBUG */ 2743 kprintf("hammer2: using volume header #%d\n", 2744 hmp->volhdrno); 2745 } 2746 } else { 2747 error = EINVAL; 2748 kprintf("hammer2: no valid volume headers found!\n"); 2749 } 2750 return (error); 2751 } 2752 2753 /* 2754 * This handles hysteresis on regular file flushes. 
Because the BIOs are 2755 * routed to a thread it is possible for an excessive number to build up 2756 * and cause long front-end stalls long before the runningbuffspace limit 2757 * is hit, so we implement hammer2_flush_pipe to control the 2758 * hysteresis. 2759 * 2760 * This is a particular problem when compression is used. 2761 */ 2762 void 2763 hammer2_lwinprog_ref(hammer2_pfs_t *pmp) 2764 { 2765 atomic_add_int(&pmp->count_lwinprog, 1); 2766 } 2767 2768 void 2769 hammer2_lwinprog_drop(hammer2_pfs_t *pmp) 2770 { 2771 int lwinprog; 2772 2773 lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1); 2774 if ((lwinprog & HAMMER2_LWINPROG_WAITING) && 2775 (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) { 2776 atomic_clear_int(&pmp->count_lwinprog, 2777 HAMMER2_LWINPROG_WAITING); 2778 wakeup(&pmp->count_lwinprog); 2779 } 2780 if ((lwinprog & HAMMER2_LWINPROG_WAITING0) && 2781 (lwinprog & HAMMER2_LWINPROG_MASK) <= 0) { 2782 atomic_clear_int(&pmp->count_lwinprog, 2783 HAMMER2_LWINPROG_WAITING0); 2784 wakeup(&pmp->count_lwinprog); 2785 } 2786 } 2787 2788 void 2789 hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe) 2790 { 2791 int lwinprog; 2792 int lwflag = (flush_pipe) ? HAMMER2_LWINPROG_WAITING : 2793 HAMMER2_LWINPROG_WAITING0; 2794 2795 for (;;) { 2796 lwinprog = pmp->count_lwinprog; 2797 cpu_ccfence(); 2798 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 2799 break; 2800 tsleep_interlock(&pmp->count_lwinprog, 0); 2801 atomic_set_int(&pmp->count_lwinprog, lwflag); 2802 lwinprog = pmp->count_lwinprog; 2803 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 2804 break; 2805 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz); 2806 } 2807 } 2808 2809 /* 2810 * Attempt to proactively fsync dirty vnodes if we have too many. This 2811 * solves an issue where the kernel syncer thread can get seriously behind 2812 * when multiple user processes/threads are furiously modifying inodes. 
2813 * This situation can occur on slow storage and is only limited by 2814 * kern.maxvnodes without the moderation code below. It is made worse 2815 * when the device buffers underlying the modified inodes (which are clean) 2816 * get evicted before the flush can occur, forcing a re-read. 2817 * 2818 * We do not want sysads to feel that they have to torpedo kern.maxvnodes 2819 * to solve this problem, so we implement vfs.hammer2.limit_dirty_inodes 2820 * (per-mount-basis) and default it to something reasonable. 2821 */ 2822 static void 2823 hammer2_pfs_moderate(hammer2_inode_t *ip, int always_moderate) 2824 { 2825 hammer2_pfs_t *pmp = ip->pmp; 2826 struct mount *mp = pmp->mp; 2827 2828 if (mp && vn_syncer_count(mp) > hammer2_limit_dirty_inodes) { 2829 vn_syncer_one(mp); 2830 } 2831 } 2832 2833 /* 2834 * Manage excessive memory resource use for chain and related 2835 * structures. 2836 * 2837 * Called without any inode locks or transaction locks. VNodes 2838 * might be locked by the kernel in the call stack. 2839 */ 2840 void 2841 hammer2_pfs_memory_wait(hammer2_inode_t *ip, int always_moderate) 2842 { 2843 hammer2_pfs_t *pmp = ip->pmp; 2844 uint32_t waiting; 2845 uint32_t count; 2846 uint32_t limit; 2847 #if 0 2848 static int zzticks; 2849 #endif 2850 2851 /* 2852 * Moderate the number of dirty inodes 2853 */ 2854 hammer2_pfs_moderate(ip, always_moderate); 2855 2856 /* 2857 * Atomic check condition and wait. Also do an early speedup of 2858 * the syncer to try to avoid hitting the wait. 
2859 */ 2860 for (;;) { 2861 waiting = pmp->inmem_dirty_chains; 2862 cpu_ccfence(); 2863 count = waiting & HAMMER2_DIRTYCHAIN_MASK; 2864 2865 limit = pmp->mp->mnt_nvnodelistsize / 10; 2866 if (limit < hammer2_limit_dirty_chains) 2867 limit = hammer2_limit_dirty_chains; 2868 if (limit < 1000) 2869 limit = 1000; 2870 2871 #if 0 2872 if ((int)(ticks - zzticks) > hz) { 2873 zzticks = ticks; 2874 kprintf("count %ld %ld\n", count, limit); 2875 } 2876 #endif 2877 2878 /* 2879 * Block if there are too many dirty chains present, wait 2880 * for the flush to clean some out. 2881 */ 2882 if (count > limit) { 2883 hammer2_pfs_moderate(ip, always_moderate); 2884 tsleep_interlock(&pmp->inmem_dirty_chains, 0); 2885 if (atomic_cmpset_int(&pmp->inmem_dirty_chains, 2886 waiting, 2887 waiting | HAMMER2_DIRTYCHAIN_WAITING)) { 2888 if (ticks != pmp->speedup_ticks) { 2889 pmp->speedup_ticks = ticks; 2890 speedup_syncer(pmp->mp); 2891 } 2892 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED, 2893 "chnmem", hz); 2894 } 2895 continue; /* loop on success or fail */ 2896 } 2897 2898 /* 2899 * Try to start an early flush before we are forced to block. 
2900 */ 2901 if (count > limit * 5 / 10 && 2902 ticks != pmp->speedup_ticks) { 2903 pmp->speedup_ticks = ticks; 2904 speedup_syncer(pmp->mp); 2905 } 2906 break; 2907 } 2908 } 2909 2910 void 2911 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp) 2912 { 2913 if (pmp) { 2914 atomic_add_int(&pmp->inmem_dirty_chains, 1); 2915 } 2916 } 2917 2918 void 2919 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp) 2920 { 2921 uint32_t waiting; 2922 2923 if (pmp) { 2924 waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, -1); 2925 /* don't need --waiting to test flag */ 2926 if (waiting & HAMMER2_DIRTYCHAIN_WAITING) { 2927 atomic_clear_int(&pmp->inmem_dirty_chains, 2928 HAMMER2_DIRTYCHAIN_WAITING); 2929 wakeup(&pmp->inmem_dirty_chains); 2930 } 2931 } 2932 } 2933 2934 /* 2935 * Returns 0 if the filesystem has tons of free space 2936 * Returns 1 if the filesystem has less than 10% remaining 2937 * Returns 2 if the filesystem has less than 2%/5% (user/root) remaining. 2938 */ 2939 int 2940 hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred) 2941 { 2942 hammer2_pfs_t *pmp; 2943 hammer2_dev_t *hmp; 2944 hammer2_off_t free_reserved; 2945 hammer2_off_t free_nominal; 2946 int i; 2947 2948 pmp = ip->pmp; 2949 2950 if (pmp->free_ticks == 0 || pmp->free_ticks != ticks) { 2951 free_reserved = HAMMER2_SEGSIZE; 2952 free_nominal = 0x7FFFFFFFFFFFFFFFLLU; 2953 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2954 hmp = pmp->pfs_hmps[i]; 2955 if (hmp == NULL) 2956 continue; 2957 if (pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER && 2958 pmp->pfs_types[i] != HAMMER2_PFSTYPE_SOFT_MASTER) 2959 continue; 2960 2961 if (free_nominal > hmp->voldata.allocator_free) 2962 free_nominal = hmp->voldata.allocator_free; 2963 if (free_reserved < hmp->free_reserved) 2964 free_reserved = hmp->free_reserved; 2965 } 2966 2967 /* 2968 * SMP races ok 2969 */ 2970 pmp->free_reserved = free_reserved; 2971 pmp->free_nominal = free_nominal; 2972 pmp->free_ticks = ticks; 2973 } else { 2974 free_reserved = 
pmp->free_reserved; 2975 free_nominal = pmp->free_nominal; 2976 } 2977 if (cred && cred->cr_uid != 0) { 2978 if ((int64_t)(free_nominal - bytes) < 2979 (int64_t)free_reserved) { 2980 return 2; 2981 } 2982 } else { 2983 if ((int64_t)(free_nominal - bytes) < 2984 (int64_t)free_reserved / 2) { 2985 return 2; 2986 } 2987 } 2988 if ((int64_t)(free_nominal - bytes) < (int64_t)free_reserved * 2) 2989 return 1; 2990 return 0; 2991 } 2992 2993 /* 2994 * Debugging 2995 */ 2996 void 2997 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx, 2998 u_int flags) 2999 { 3000 hammer2_chain_t *scan; 3001 hammer2_chain_t *parent; 3002 3003 --*countp; 3004 if (*countp == 0) { 3005 kprintf("%*.*s...\n", tab, tab, ""); 3006 return; 3007 } 3008 if (*countp < 0) 3009 return; 3010 kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n", 3011 tab, tab, "", pfx, 3012 chain, chain->bref.type, 3013 chain->bref.key, chain->bref.keybits, 3014 chain->bref.mirror_tid); 3015 3016 kprintf("%*.*s [%08x] (%s) refs=%d", 3017 tab, tab, "", 3018 chain->flags, 3019 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE && 3020 chain->data) ? (char *)chain->data->ipdata.filename : "?"), 3021 chain->refs); 3022 3023 parent = chain->parent; 3024 if (parent) 3025 kprintf("\n%*.*s p=%p [pflags %08x prefs %d", 3026 tab, tab, "", 3027 parent, parent->flags, parent->refs); 3028 if (RB_EMPTY(&chain->core.rbtree)) { 3029 kprintf("\n"); 3030 } else { 3031 kprintf(" {\n"); 3032 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) { 3033 if ((scan->flags & flags) || flags == (u_int)-1) { 3034 hammer2_dump_chain(scan, tab + 4, countp, 'a', 3035 flags); 3036 } 3037 } 3038 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data) 3039 kprintf("%*.*s}(%s)\n", tab, tab, "", 3040 chain->data->ipdata.filename); 3041 else 3042 kprintf("%*.*s}\n", tab, tab, ""); 3043 } 3044 } 3045