/*
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/uuid.h>
#include <sys/vfsops.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/objcache.h>

#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>

#include <sys/mutex.h>
#include <sys/mutex2.h>

#include "hammer2.h"
#include "hammer2_disk.h"
#include "hammer2_mount.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

#define REPORT_REFS_ERRORS	1	/* XXX remove me */

MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

struct hammer2_sync_info {
	int error;
	int waitfor;
	int pass;
};

TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
static struct hammer2_mntlist hammer2_mntlist;

struct hammer2_pfslist hammer2_pfslist;
struct hammer2_pfslist hammer2_spmplist;
struct lock hammer2_mntlk;

int hammer2_supported_version = HAMMER2_VOL_VERSION_DEFAULT;
int hammer2_debug;
int hammer2_cluster_meta_read = 1;	/* physical read-ahead */
int hammer2_cluster_data_read = 4;	/* physical read-ahead */
int hammer2_dedup_enable = 1;
int hammer2_always_compress = 0;	/* always try to compress */
int hammer2_inval_enable = 0;
int hammer2_flush_pipe = 100;
int hammer2_dio_count;
int hammer2_dio_limit = 256;
int hammer2_bulkfree_tps = 5000;
long hammer2_chain_allocs;
long hammer2_chain_frees;
long hammer2_limit_dirty_chains;
long hammer2_limit_dirty_inodes;
long hammer2_count_modified_chains;
long hammer2_iod_invals;
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_file_wembed;
long hammer2_iod_file_wzero;
long hammer2_iod_file_wdedup;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;

MALLOC_DECLARE(M_HAMMER2_CBUFFER);
MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer",
		"Buffer used for compression.");

MALLOC_DECLARE(M_HAMMER2_DEBUFFER);
MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer",
		"Buffer used for decompression.");

SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, supported_version, CTLFLAG_RD,
	   &hammer2_supported_version, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
	   &hammer2_debug, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_meta_read, CTLFLAG_RW,
	   &hammer2_cluster_meta_read, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_data_read, CTLFLAG_RW,
	   &hammer2_cluster_data_read, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW,
	   &hammer2_dedup_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW,
	   &hammer2_always_compress, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW,
	   &hammer2_inval_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
	   &hammer2_flush_pipe, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW,
	   &hammer2_bulkfree_tps, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW,
	   &hammer2_chain_allocs, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_frees, CTLFLAG_RW,
	   &hammer2_chain_frees, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
	   &hammer2_limit_dirty_chains, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW,
	   &hammer2_limit_dirty_inodes, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW,
	   &hammer2_count_modified_chains, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
	   &hammer2_dio_count, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW,
	   &hammer2_dio_limit, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW,
	   &hammer2_iod_invals, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
	   &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
	   &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
	   &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
	   &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
	   &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
	   &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wembed, CTLFLAG_RW,
	   &hammer2_iod_file_wembed, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wzero, CTLFLAG_RW,
	   &hammer2_iod_file_wzero, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wdedup, CTLFLAG_RW,
	   &hammer2_iod_file_wdedup, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
	   &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
	   &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
	   &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
	   &hammer2_iod_volu_write, 0, "");

long hammer2_process_icrc32;
long hammer2_process_xxhash64;
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW,
	   &hammer2_process_icrc32, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW,
	   &hammer2_process_xxhash64, 0, "");
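
/*
 * Example (illustrative, not compiled): the knobs above surface under the
 * vfs.hammer2 sysctl tree and can be inspected or tuned from userland with
 * sysctl(8), e.g.
 *
 *	sysctl vfs.hammer2.dedup_enable		# read the current value
 *	sysctl vfs.hammer2.debug=1		# hypothetical debug setting
 *
 * Only the CTLFLAG_RW entries accept writes; CTLFLAG_RD entries such as
 * supported_version and dio_count are read-only counters/reports.
 */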

static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
				struct ucred *cred);
static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
				struct vnode *, struct ucred *);
static int hammer2_recovery(hammer2_dev_t *hmp);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
				struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
				struct ucred *cred);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
				struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
				int *exflagsp, struct ucred **credanonp);

static int hammer2_install_volume_header(hammer2_dev_t *hmp);
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

static void hammer2_update_pmps(hammer2_dev_t *hmp);

static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
				hammer2_dev_t *hmp);
static int hammer2_fixup_pfses(hammer2_dev_t *hmp);

/*
 * HAMMER2 vfs operations.
 */
static struct vfsops hammer2_vfsops = {
	.vfs_init	= hammer2_vfs_init,
	.vfs_uninit	= hammer2_vfs_uninit,
	.vfs_sync	= hammer2_vfs_sync,
	.vfs_mount	= hammer2_vfs_mount,
	.vfs_unmount	= hammer2_vfs_unmount,
	.vfs_root	= hammer2_vfs_root,
	.vfs_statfs	= hammer2_vfs_statfs,
	.vfs_statvfs	= hammer2_vfs_statvfs,
	.vfs_vget	= hammer2_vfs_vget,
	.vfs_vptofh	= hammer2_vfs_vptofh,
	.vfs_fhtovp	= hammer2_vfs_fhtovp,
	.vfs_checkexp	= hammer2_vfs_checkexp
};

MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");

VFS_SET(hammer2_vfsops, hammer2, VFCF_MPSAFE);
MODULE_VERSION(hammer2, 1);

static
int
hammer2_vfs_init(struct vfsconf *conf)
{
	static struct objcache_malloc_args margs_read;
	static struct objcache_malloc_args margs_write;
	static struct objcache_malloc_args margs_vop;

	int error;

	error = 0;

	/*
	 * A large DIO cache is needed to retain dedup enablement masks.
	 * The bulkfree code clears related masks as part of the disk block
	 * recycling algorithm, preventing it from being used for a later
	 * dedup.
	 *
	 * NOTE: A large buffer cache can actually interfere with dedup
	 *	 operation because we dedup based on media physical buffers
	 *	 and not logical buffers.  Try to make the DIO case large
	 *	 enough to avoid this problem, but also cap it.
	 */
	hammer2_dio_limit = nbuf * 2;
	if (hammer2_dio_limit > 100000)
		hammer2_dio_limit = 100000;

	if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
		error = EINVAL;
	if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
		error = EINVAL;
	if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
		error = EINVAL;

	if (error)
		kprintf("HAMMER2 structure size mismatch; cannot continue.\n");

	margs_read.objsize = 65536;
	margs_read.mtype = M_HAMMER2_DEBUFFER;

	margs_write.objsize = 32768;
	margs_write.mtype = M_HAMMER2_CBUFFER;

	margs_vop.objsize = sizeof(hammer2_xop_t);
	margs_vop.mtype = M_HAMMER2;
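
	/*
	 * Illustrative sketch (not part of the build): the object sizes above
	 * back the compression/decompression buffer caches created below.  A
	 * consumer in the write path would typically borrow and return a
	 * staging buffer along these lines (hypothetical usage):
	 *
	 *	char *comp_buffer;
	 *
	 *	comp_buffer = objcache_get(cache_buffer_write, M_INTWAIT);
	 *	... compress up to 32KB of logical data into comp_buffer ...
	 *	objcache_put(cache_buffer_write, comp_buffer);
	 */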

	/*
	 * Note that for the XOPS cache we want backing store allocations
	 * to use M_ZERO.  This is not allowed in objcache_get() (to avoid
	 * confusion), so use the backing store function that does it.  This
	 * means that initial XOPS objects are zeroed but REUSED objects are
	 * not.  So we are responsible for cleaning the object up sufficiently
	 * for our needs before objcache_put()ing it back (typically just the
	 * FIFO indices).
	 */
	cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
				0, 1, NULL, NULL, NULL,
				objcache_malloc_alloc,
				objcache_malloc_free,
				&margs_read);
	cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
				0, 1, NULL, NULL, NULL,
				objcache_malloc_alloc,
				objcache_malloc_free,
				&margs_write);
	cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc,
				0, 1, NULL, NULL, NULL,
				objcache_malloc_alloc_zero,
				objcache_malloc_free,
				&margs_vop);

	lockinit(&hammer2_mntlk, "mntlk", 0, 0);
	TAILQ_INIT(&hammer2_mntlist);
	TAILQ_INIT(&hammer2_pfslist);
	TAILQ_INIT(&hammer2_spmplist);

	hammer2_limit_dirty_chains = maxvnodes / 10;
	if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
		hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;

	hammer2_limit_dirty_inodes = maxvnodes / 100;
	if (hammer2_limit_dirty_inodes < 100)
		hammer2_limit_dirty_inodes = 100;
	if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
		hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES;

	return (error);
}

static
int
hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
{
	objcache_destroy(cache_buffer_read);
	objcache_destroy(cache_buffer_write);
	objcache_destroy(cache_xops);
	return 0;
}

/*
 * Core PFS allocator.  Used to allocate or reference the pmp structure
 * for PFS cluster mounts and the spmp structure for media (hmp) structures.
 * The pmp can be passed in or loaded by this function using the chain and
 * inode data.
 *
 * pmp->modify_tid tracks new modify_tid transaction ids for front-end
 * transactions.  Note that synchronization does not use this field.
 * (typically frontend operations and synchronization cannot run on the
 * same PFS node at the same time).
 *
 * XXX check locking
 */
hammer2_pfs_t *
hammer2_pfsalloc(hammer2_chain_t *chain,
		 const hammer2_inode_data_t *ripdata,
		 hammer2_tid_t modify_tid, hammer2_dev_t *force_local)
{
	hammer2_pfs_t *pmp;
	hammer2_inode_t *iroot;
	int count;
	int i;
	int j;

	pmp = NULL;

	/*
	 * Locate or create the PFS based on the cluster id.  If ripdata
	 * is NULL this is a spmp which is unique and is always allocated.
	 *
	 * If the device is mounted in local mode all PFSs are considered
	 * independent and not part of any cluster (for debugging only).
	 */
	if (ripdata) {
		TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
			if (force_local != pmp->force_local)
				continue;
			if (force_local == NULL &&
			    bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid,
				 sizeof(pmp->pfs_clid)) == 0) {
				break;
			} else if (force_local && pmp->pfs_names[0] &&
			    strcmp(pmp->pfs_names[0], ripdata->filename) == 0) {
				break;
			}
		}
	}

	if (pmp == NULL) {
		pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
		pmp->force_local = force_local;
		hammer2_trans_manage_init(pmp);
		kmalloc_create(&pmp->minode, "HAMMER2-inodes");
		kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
		lockinit(&pmp->lock, "pfslk", 0, 0);
		lockinit(&pmp->lock_nlink, "h2nlink", 0, 0);
		spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
		spin_init(&pmp->xop_spin, "h2xop");
		spin_init(&pmp->lru_spin, "h2lru");
		RB_INIT(&pmp->inum_tree);
		TAILQ_INIT(&pmp->sideq);
		TAILQ_INIT(&pmp->lru_list);
		spin_init(&pmp->list_spin, "hm2pfsalloc_list");

		/*
		 * Distribute backend operations to threads
		 */
		for (i = 0; i < HAMMER2_XOPGROUPS; ++i)
			hammer2_xop_group_init(pmp, &pmp->xop_groups[i]);

		/*
		 * Save the last media transaction id for the flusher.  Set
		 * initial
		 */
		if (ripdata) {
			pmp->pfs_clid = ripdata->meta.pfs_clid;
			TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
		} else {
			pmp->flags |= HAMMER2_PMPF_SPMP;
			TAILQ_INSERT_TAIL(&hammer2_spmplist, pmp, mntentry);
		}

		/*
		 * The synchronization thread may start too early, make
		 * sure it stays frozen until we are ready to let it go.
		 * XXX
		 */
		/*
		pmp->primary_thr.flags = HAMMER2_THREAD_FROZEN |
					 HAMMER2_THREAD_REMASTER;
		*/
	}

	/*
	 * Create the PFS's root inode and any missing XOP helper threads.
	 */
	if ((iroot = pmp->iroot) == NULL) {
		iroot = hammer2_inode_get(pmp, NULL, NULL, -1);
		if (ripdata)
			iroot->meta = ripdata->meta;
		pmp->iroot = iroot;
		hammer2_inode_ref(iroot);
		hammer2_inode_unlock(iroot);
	}

	/*
	 * Stop here if no chain is passed in.
	 */
	if (chain == NULL)
		goto done;

	/*
	 * When a chain is passed in we must add it to the PFS's root
	 * inode, update pmp->pfs_types[], and update the synchronization
	 * threads.
	 *
	 * When forcing local mode, mark the PFS as a MASTER regardless.
	 *
	 * At the moment empty spots can develop due to removals or failures.
	 * Ultimately we want to re-fill these spots but doing so might
	 * confuse running code.  XXX
	 */
	hammer2_inode_ref(iroot);
	hammer2_mtx_ex(&iroot->lock);
	j = iroot->cluster.nchains;

	if (j == HAMMER2_MAXCLUSTER) {
		kprintf("hammer2_mount: cluster full!\n");
		/* XXX fatal error? */
	} else {
		KKASSERT(chain->pmp == NULL);
		chain->pmp = pmp;
		hammer2_chain_ref(chain);
		iroot->cluster.array[j].chain = chain;
		if (force_local)
			pmp->pfs_types[j] = HAMMER2_PFSTYPE_MASTER;
		else
			pmp->pfs_types[j] = ripdata->meta.pfs_type;
		pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);
		pmp->pfs_hmps[j] = chain->hmp;

		/*
		 * If the PFS is already mounted we must account
		 * for the mount_count here.
		 */
		if (pmp->mp)
			++chain->hmp->mount_count;

		/*
		 * May have to fixup dirty chain tracking.  Previous
		 * pmp was NULL so nothing to undo.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED)
			hammer2_pfs_memory_inc(pmp);
		++j;
	}
	iroot->cluster.nchains = j;

	/*
	 * Update nmasters from any PFS inode which is part of the cluster.
	 * It is possible that this will result in a value which is too
	 * high.  MASTER PFSs are authoritative for pfs_nmasters and will
	 * override this value later on.
	 *
	 * (This informs us of masters that might not currently be
	 *  discoverable by this mount).
	 */
	if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) {
		pmp->pfs_nmasters = ripdata->meta.pfs_nmasters;
	}

	/*
	 * Count visible masters.  Masters are usually added with
	 * ripdata->meta.pfs_nmasters set to 1.  This detects when there
	 * are more (XXX and must update the master inodes).
	 */
	count = 0;
	for (i = 0; i < iroot->cluster.nchains; ++i) {
		if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)
			++count;
	}
	if (pmp->pfs_nmasters < count)
		pmp->pfs_nmasters = count;

	/*
	 * Create missing synchronization and support threads.
	 *
	 * Single-node masters (including snapshots) have nothing to
	 * synchronize and do not require this thread.
	 *
	 * Multi-node masters or any number of soft masters, slaves, copy,
	 * or other PFS types need the thread.
	 *
	 * Each thread is responsible for its particular cluster index.
	 * We use independent threads so stalls or mismatches related to
	 * any given target do not affect other targets.
	 */
	for (i = 0; i < iroot->cluster.nchains; ++i) {
		/*
		 * Single-node masters (including snapshots) have nothing
		 * to synchronize and will make direct xops support calls,
		 * thus they do not require this thread.
		 *
		 * Note that there can be thousands of snapshots.  We do not
		 * want to create thousands of threads.
		 */
		if (pmp->pfs_nmasters <= 1 &&
		    pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) {
			continue;
		}

		/*
		 * Sync support thread
		 */
		if (pmp->sync_thrs[i].td == NULL) {
			hammer2_thr_create(&pmp->sync_thrs[i], pmp, NULL,
					   "h2nod", i, -1,
					   hammer2_primary_sync_thread);
		}
	}

	/*
	 * Create missing Xop threads
	 *
	 * NOTE: We create helper threads for all mounted PFSs or any
	 *	 PFSs with 2+ nodes (so the sync thread can update them,
	 *	 even if not mounted).
	 */
	if (pmp->mp || iroot->cluster.nchains >= 2)
		hammer2_xop_helper_create(pmp);

	hammer2_mtx_unlock(&iroot->lock);
	hammer2_inode_drop(iroot);
done:
	return pmp;
}

/*
 * Deallocate an element of a probed PFS.  If destroying and this is a
 * MASTER, adjust nmasters.
 *
 * This function does not physically destroy the PFS element in its device
 * under the super-root (see hammer2_ioctl_pfs_delete()).
 */
void
hammer2_pfsdealloc(hammer2_pfs_t *pmp, int clindex, int destroying)
{
	hammer2_inode_t *iroot;
	hammer2_chain_t *chain;
	int j;

	/*
	 * Cleanup our reference on iroot.  iroot is (should) not be needed
	 * by the flush code.
	 */
	iroot = pmp->iroot;
	if (iroot) {
		/*
		 * Stop synchronizing
		 *
		 * XXX flush after acquiring the iroot lock.
		 * XXX clean out the cluster index from all inode structures.
		 */
		hammer2_thr_delete(&pmp->sync_thrs[clindex]);

		/*
		 * Remove the cluster index from the group.  If destroying
		 * the PFS and this is a master, adjust pfs_nmasters.
		 */
		hammer2_mtx_ex(&iroot->lock);
		chain = iroot->cluster.array[clindex].chain;
		iroot->cluster.array[clindex].chain = NULL;

		switch(pmp->pfs_types[clindex]) {
		case HAMMER2_PFSTYPE_MASTER:
			if (destroying && pmp->pfs_nmasters > 0)
				--pmp->pfs_nmasters;
			/* XXX adjust ripdata->meta.pfs_nmasters */
			break;
		default:
			break;
		}
		pmp->pfs_types[clindex] = HAMMER2_PFSTYPE_NONE;

		hammer2_mtx_unlock(&iroot->lock);

		/*
		 * Release the chain.
		 */
		if (chain) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
			hammer2_chain_drop(chain);
		}

		/*
		 * Terminate all XOP threads for the cluster index.
		 */
		for (j = 0; j < HAMMER2_XOPGROUPS; ++j)
			hammer2_thr_delete(&pmp->xop_groups[j].thrs[clindex]);
	}
}

/*
 * Destroy a PFS, typically only occurs after the last mount on a device
 * has gone away.
 */
static void
hammer2_pfsfree(hammer2_pfs_t *pmp)
{
	hammer2_inode_t *iroot;
	hammer2_chain_t *chain;
	int i;
	int j;

	/*
	 * Cleanup our reference on iroot.  iroot is (should) not be needed
	 * by the flush code.
	 */
	if (pmp->flags & HAMMER2_PMPF_SPMP)
		TAILQ_REMOVE(&hammer2_spmplist, pmp, mntentry);
	else
		TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);

	iroot = pmp->iroot;
	if (iroot) {
		for (i = 0; i < iroot->cluster.nchains; ++i) {
			hammer2_thr_delete(&pmp->sync_thrs[i]);
			for (j = 0; j < HAMMER2_XOPGROUPS; ++j)
				hammer2_thr_delete(&pmp->xop_groups[j].thrs[i]);
		}
#if REPORT_REFS_ERRORS
		if (pmp->iroot->refs != 1)
			kprintf("PMP->IROOT %p REFS WRONG %d\n",
				pmp->iroot, pmp->iroot->refs);
#else
		KKASSERT(pmp->iroot->refs == 1);
#endif
		/* ref for pmp->iroot */
		hammer2_inode_drop(pmp->iroot);
		pmp->iroot = NULL;
	}

	/*
	 * Cleanup chains remaining on LRU list.
	 */
	hammer2_spin_ex(&pmp->lru_spin);
	while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONLRU);
		atomic_add_int(&pmp->lru_count, -1);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
		TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&pmp->lru_spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
		hammer2_chain_drop(chain);
		hammer2_spin_ex(&pmp->lru_spin);
	}
	hammer2_spin_unex(&pmp->lru_spin);

	/*
	 * Free remaining pmp resources
	 */
	kmalloc_destroy(&pmp->mmsg);
	kmalloc_destroy(&pmp->minode);

	kfree(pmp, M_HAMMER2);
}

/*
 * Remove all references to hmp from the pfs list.  Any PFS which becomes
 * empty is terminated and freed.
 *
 * XXX inefficient.
 */
static void
hammer2_pfsfree_scan(hammer2_dev_t *hmp, int which)
{
	hammer2_pfs_t *pmp;
	hammer2_inode_t *iroot;
	hammer2_chain_t *rchain;
	int didfreeze;
	int i;
	int j;
	struct hammer2_pfslist *wlist;

	if (which == 0)
		wlist = &hammer2_pfslist;
	else
		wlist = &hammer2_spmplist;
again:
	TAILQ_FOREACH(pmp, wlist, mntentry) {
		if ((iroot = pmp->iroot) == NULL)
			continue;
		hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
		hammer2_inode_run_sideq(pmp, 1);
		hammer2_bioq_sync(pmp);
		hammer2_trans_done(pmp, 0);

		/*
		 * Determine if this PFS is affected.  If it is we must
		 * freeze all management threads and lock its iroot.
		 *
		 * Freezing a management thread forces it idle, operations
		 * in-progress will be aborted and it will have to start
		 * over again when unfrozen, or exit if told to exit.
		 */
		for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
			if (pmp->pfs_hmps[i] == hmp)
				break;
		}
		if (i != HAMMER2_MAXCLUSTER) {
			/*
			 * Make sure all synchronization threads are locked
			 * down.
			 */
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (pmp->pfs_hmps[i] == NULL)
					continue;
				hammer2_thr_freeze_async(&pmp->sync_thrs[i]);
				for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
					hammer2_thr_freeze_async(
						&pmp->xop_groups[j].thrs[i]);
				}
			}
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (pmp->pfs_hmps[i] == NULL)
					continue;
				hammer2_thr_freeze(&pmp->sync_thrs[i]);
				for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
					hammer2_thr_freeze(
						&pmp->xop_groups[j].thrs[i]);
				}
			}

			/*
			 * Lock the inode and clean out matching chains.
			 * Note that we cannot use hammer2_inode_lock_*()
			 * here because that would attempt to validate the
			 * cluster that we are in the middle of ripping
			 * apart.
			 *
			 * WARNING! We are working directly on the inode's
			 *	    embedded cluster.
			 */
			hammer2_mtx_ex(&iroot->lock);

			/*
			 * Remove the chain from matching elements of the PFS.
			 */
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (pmp->pfs_hmps[i] != hmp)
					continue;
				hammer2_thr_delete(&pmp->sync_thrs[i]);
				for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
					hammer2_thr_delete(
						&pmp->xop_groups[j].thrs[i]);
				}
				rchain = iroot->cluster.array[i].chain;
				iroot->cluster.array[i].chain = NULL;
				pmp->pfs_types[i] = 0;
				if (pmp->pfs_names[i]) {
					kfree(pmp->pfs_names[i], M_HAMMER2);
					pmp->pfs_names[i] = NULL;
				}
				if (rchain) {
					hammer2_chain_drop(rchain);
					/* focus hint */
					if (iroot->cluster.focus == rchain)
						iroot->cluster.focus = NULL;
				}
				pmp->pfs_hmps[i] = NULL;
			}
			hammer2_mtx_unlock(&iroot->lock);
			didfreeze = 1;	/* remaster, unfreeze down below */
		} else {
			didfreeze = 0;
		}

		/*
		 * Cleanup trailing chains.  Gaps may remain.
		 */
		for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) {
			if (pmp->pfs_hmps[i])
				break;
		}
		iroot->cluster.nchains = i + 1;

		/*
		 * If the PMP has no elements remaining we can destroy it.
		 * (this will transition management threads from frozen->exit).
		 */
		if (iroot->cluster.nchains == 0) {
			/*
			 * If this was the hmp's spmp, we need to clean
			 * a little more stuff out.
			 */
			if (hmp->spmp == pmp) {
				hmp->spmp = NULL;
				hmp->vchain.pmp = NULL;
				hmp->fchain.pmp = NULL;
			}

			/*
			 * Free the pmp and restart the loop
			 */
			hammer2_pfsfree(pmp);
			goto again;
		}

		/*
		 * If elements still remain we need to set the REMASTER
		 * flag and unfreeze it.
		 */
		if (didfreeze) {
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (pmp->pfs_hmps[i] == NULL)
					continue;
				hammer2_thr_remaster(&pmp->sync_thrs[i]);
				hammer2_thr_unfreeze(&pmp->sync_thrs[i]);
				for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
					hammer2_thr_remaster(
						&pmp->xop_groups[j].thrs[i]);
					hammer2_thr_unfreeze(
						&pmp->xop_groups[j].thrs[i]);
				}
			}
		}
	}
}

/*
 * Mount or remount HAMMER2 filesystem from physical media
 *
 *	mountroot
 *		mp		mount point structure
 *		path		NULL
 *		data		<unused>
 *		cred		<unused>
 *
 *	mount
 *		mp		mount point structure
 *		path		path to mount point
 *		data		pointer to argument structure in user space
 *			volume	volume path (device@LABEL form)
 *			hflags	user mount flags
 *		cred		user credentials
 *
 * RETURNS:	0	Success
 *		!0	error number
 */
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
		  struct ucred *cred)
{
	struct hammer2_mount_info info;
	hammer2_pfs_t *pmp;
	hammer2_pfs_t *spmp;
	hammer2_dev_t *hmp;
	hammer2_dev_t *force_local;
	hammer2_key_t key_next;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	struct vnode *devvp;
	struct nlookupdata nd;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	const hammer2_inode_data_t *ripdata;
	hammer2_blockref_t bref;
	struct file *fp;
	char devstr[MNAMELEN];
	size_t size;
	size_t done;
	char *dev;
	char *label;
	int ronly = 1;
	int error;
	int i;

	hmp = NULL;
	pmp = NULL;
	dev = NULL;
	label = NULL;
	devvp = NULL;

	if (path == NULL) {
		/*
		 * Root mount
		 */
		bzero(&info, sizeof(info));
		info.cluster_fd = -1;
		ksnprintf(devstr, sizeof(devstr), "%s",
			  mp->mnt_stat.f_mntfromname);
		kprintf("hammer2_mount: root '%s'\n", devstr);
		done = strlen(devstr) + 1;
	} else {
		/*
		 * Non-root mount or updating a mount
		 */
		error = copyin(data, &info, sizeof(info));
		if (error)
			return (error);

		error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
		if (error)
			return (error);
		kprintf("hammer2_mount: '%s'\n", devstr);
	}

	/*
	 * Extract device and label, automatically mount @BOOT, @ROOT, or @DATA
	 * if no label specified, based on the partition id.  Error out if no
	 * label or device (with partition id) is specified.  This is strictly
	 * a convenience to match the default label created by newfs_hammer2,
	 * our preference is that a label always be specified.
	 *
	 * NOTE: We allow 'mount @LABEL <blah>'... that is, a mount command
	 *	 that does not specify a device, as long as some H2 label
	 *	 has already been mounted from that device.  This makes
	 *	 mounting snapshots a lot easier.
	 */
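	/*
	 * Illustrative examples (device names below are hypothetical):
	 *
	 *	mount_hammer2 /dev/ad0s1a@BOOT /boot	# explicit label
	 *	mount_hammer2 /dev/ad0s1d /mnt		# no label, partition
	 *						# 'd' defaults to @ROOT
	 *	mount_hammer2 @SNAP1 /mnt/snap		# label only, device
	 *						# inferred from an
	 *						# existing H2 mount
	 */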
	dev = devstr;
	label = strchr(devstr, '@');
	if (label && ((label + 1) - dev) > done) {
		kprintf("hammer2: mount: bad label %s/%zd\n",
			devstr, done);
		return (EINVAL);
	}
	if (label == NULL || label[1] == 0) {
		char slice;

		if (label == NULL)
			label = devstr + strlen(devstr);
		else
			*label = '\0';		/* clean up trailing @ */

		slice = label[-1];
		switch(slice) {
		case 'a':
			label = "BOOT";
			break;
		case 'd':
			label = "ROOT";
			break;
		default:
			label = "DATA";
			break;
		}
	} else {
		*label = '\0';
		label++;
	}

	kprintf("hammer2_mount: dev=\"%s\" label=\"%s\" rdonly=%d\n",
		dev, label, (mp->mnt_flag & MNT_RDONLY));

	if (mp->mnt_flag & MNT_UPDATE) {
		/*
		 * Update mount.  Note that pmp->iroot->cluster is
		 * an inode-embedded cluster and thus cannot be
		 * directly locked.
		 *
		 * XXX HAMMER2 needs to implement NFS export via
		 *     mountctl.
		 */
		hammer2_cluster_t *cluster;

		pmp = MPTOPMP(mp);
		pmp->hflags = info.hflags;
		cluster = &pmp->iroot->cluster;
		for (i = 0; i < cluster->nchains; ++i) {
			if (cluster->array[i].chain == NULL)
				continue;
			hmp = cluster->array[i].chain->hmp;
			devvp = hmp->devvp;
			error = hammer2_remount(hmp, mp, path,
						devvp, cred);
			if (error)
				break;
		}

		return error;
	}

	/*
	 * HMP device mount
	 *
	 * If a path is specified and dev is not an empty string, lookup the
	 * name and verify that it refers to a block device.
	 *
	 * If a path is specified and dev is an empty string we fall through
	 * and locate the label in the hmp search.
	 */
	if (path && *dev != 0) {
		error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
		if (error == 0)
			error = nlookup(&nd);
		if (error == 0)
			error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
		nlookup_done(&nd);
	} else if (path == NULL) {
		/* root mount */
		cdev_t cdev = kgetdiskbyname(dev);
		error = bdevvp(cdev, &devvp);
		if (error)
			kprintf("hammer2: cannot find '%s'\n", dev);
	} else {
		/*
		 * We will locate the hmp using the label in the hmp loop.
		 */
		error = 0;
	}

	/*
	 * Make sure it's a block device.  Do not check to see if it is
	 * already mounted until we determine that it's a fresh H2 device.
	 */
	if (error == 0 && devvp) {
		vn_isdisk(devvp, &error);
	}

	/*
	 * Determine if the device has already been mounted.  After this
	 * check hmp will be non-NULL if we are doing the second or more
	 * hammer2 mounts from the same device.
	 */
	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
	if (devvp) {
		/*
		 * Match the device.  Due to the way devfs works,
		 * we may not be able to directly match the vnode pointer,
		 * so also check to see if the underlying device matches.
		 */
		TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
			if (hmp->devvp == devvp)
				break;
			if (devvp->v_rdev &&
			    hmp->devvp->v_rdev == devvp->v_rdev) {
				break;
			}
		}

		/*
		 * If no match this may be a fresh H2 mount, make sure
		 * the device is not mounted on anything else.
		 */
		if (hmp == NULL)
			error = vfs_mountedon(devvp);
	} else if (error == 0) {
		/*
		 * Match the label to a pmp already probed.
		 */
		TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (pmp->pfs_names[i] &&
				    strcmp(pmp->pfs_names[i], label) == 0) {
					hmp = pmp->pfs_hmps[i];
					break;
				}
			}
			if (hmp)
				break;
		}
		if (hmp == NULL)
			error = ENOENT;
	}

	/*
	 * Open the device if this isn't a secondary mount and construct
	 * the H2 device mount (hmp).
	 */
	if (hmp == NULL) {
		hammer2_chain_t *schain;
		hammer2_xid_t xid;
		hammer2_xop_head_t xop;

		if (error == 0 && vcount(devvp) > 0) {
			kprintf("Primary device already has references\n");
			error = EBUSY;
		}

		/*
		 * Now open the device
		 */
		if (error == 0) {
			ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
			error = vinvalbuf(devvp, V_SAVE, 0, 0);
			if (error == 0) {
				error = VOP_OPEN(devvp,
					     (ronly ? FREAD : FREAD | FWRITE),
					     FSCRED, NULL);
			}
			vn_unlock(devvp);
		}
		if (error && devvp) {
			vrele(devvp);
			devvp = NULL;
		}
		if (error) {
			lockmgr(&hammer2_mntlk, LK_RELEASE);
			return error;
		}
		hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
		ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
		hmp->ronly = ronly;
		hmp->devvp = devvp;
		hmp->hflags = info.hflags & HMNT2_DEVFLAGS;
		kmalloc_create(&hmp->mchain, "HAMMER2-chains");
		TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
		RB_INIT(&hmp->iotree);
		spin_init(&hmp->io_spin, "hm2mount_io");
		spin_init(&hmp->list_spin, "hm2mount_list");
		TAILQ_INIT(&hmp->flushq);

		lockinit(&hmp->vollk, "h2vol", 0, 0);
		lockinit(&hmp->bulklk, "h2bulk", 0, 0);
		lockinit(&hmp->bflock, "h2bflk", 0, 0);

		/*
		 * vchain setup.  vchain.data is embedded.
		 * vchain.refs is initialized and will never drop to 0.
		 *
		 * NOTE! voldata is not yet loaded.
		 */
		hmp->vchain.hmp = hmp;
		hmp->vchain.refs = 1;
		hmp->vchain.data = (void *)&hmp->voldata;
		hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
		hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
		hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;

		hammer2_chain_core_init(&hmp->vchain);
		/* hmp->vchain.u.xxx is left NULL */

		/*
		 * fchain setup.  fchain.data is embedded.
		 * fchain.refs is initialized and will never drop to 0.
		 *
		 * The data is not used but needs to be initialized to
		 * pass assertion muster.  We use this chain primarily
		 * as a placeholder for the freemap's top-level RBTREE
		 * so it does not interfere with the volume's topology
		 * RBTREE.
		 */
		hmp->fchain.hmp = hmp;
		hmp->fchain.refs = 1;
		hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
		hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
		hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
		hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
		hmp->fchain.bref.methods =
			HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
			HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

		hammer2_chain_core_init(&hmp->fchain);
		/* hmp->fchain.u.xxx is left NULL */

		/*
		 * Install the volume header and initialize fields from
		 * voldata.
		 */
		error = hammer2_install_volume_header(hmp);
		if (error) {
			hammer2_unmount_helper(mp, NULL, hmp);
			lockmgr(&hammer2_mntlk, LK_RELEASE);
			hammer2_vfs_unmount(mp, MNT_FORCE);
			return error;
		}

		/*
		 * Really important to get these right or flush will get
		 * confused.
		 */
		hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL);
		spmp = hmp->spmp;

		/*
		 * Dummy-up vchain and fchain's modify_tid.  mirror_tid
		 * is inherited from the volume header.
		 */
		xid = 0;
		hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
		hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
		hmp->vchain.pmp = spmp;
		hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
		hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
		hmp->fchain.pmp = spmp;

		/*
		 * First locate the super-root inode, which is key 0
		 * relative to the volume header's blockset.
		 *
		 * Then locate the root inode by scanning the directory keyspace
		 * represented by the label.
		 */
		parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
		schain = hammer2_chain_lookup(&parent, &key_dummy,
				      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
				      &error, 0);
		hammer2_chain_lookup_done(parent);
		if (schain == NULL) {
			kprintf("hammer2_mount: invalid super-root\n");
			hammer2_unmount_helper(mp, NULL, hmp);
			lockmgr(&hammer2_mntlk, LK_RELEASE);
			hammer2_vfs_unmount(mp, MNT_FORCE);
			return EINVAL;
		}
		if (schain->error) {
			kprintf("hammer2_mount: error %s reading super-root\n",
				hammer2_error_str(schain->error));
			hammer2_chain_unlock(schain);
			hammer2_chain_drop(schain);
			schain = NULL;
			hammer2_unmount_helper(mp, NULL, hmp);
			lockmgr(&hammer2_mntlk, LK_RELEASE);
			hammer2_vfs_unmount(mp, MNT_FORCE);
			return EINVAL;
		}

		/*
		 * The super-root always uses an inode_tid of 1 when
		 * creating PFSs.
		 */
		spmp->inode_tid = 1;
		spmp->modify_tid = schain->bref.modify_tid + 1;

		/*
		 * Sanity-check schain's pmp and finish initialization.
		 * Any chain belonging to the super-root topology should
		 * have a NULL pmp (not even set to spmp).
		 */
		ripdata = &hammer2_chain_rdata(schain)->ipdata;
		KKASSERT(schain->pmp == NULL);
		spmp->pfs_clid = ripdata->meta.pfs_clid;

		/*
		 * Replace the dummy spmp->iroot with a real one.  It's
		 * easier to just do a wholesale replacement than to try
		 * to update the chain and fixup the iroot fields.
		 *
		 * The returned inode is locked with the supplied cluster.
		 */
		hammer2_dummy_xop_from_chain(&xop, schain);
		hammer2_inode_drop(spmp->iroot);
		spmp->iroot = NULL;
		spmp->iroot = hammer2_inode_get(spmp, NULL, &xop, -1);
		spmp->spmp_hmp = hmp;
		spmp->pfs_types[0] = ripdata->meta.pfs_type;
		spmp->pfs_hmps[0] = hmp;
		hammer2_inode_ref(spmp->iroot);
		hammer2_inode_unlock(spmp->iroot);
		hammer2_cluster_unlock(&xop.cluster);
		hammer2_chain_drop(schain);
		/* do not call hammer2_cluster_drop() on an embedded cluster */
		schain = NULL;	/* now invalid */
		/* leave spmp->iroot with one ref */

		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = hammer2_recovery(hmp);
			if (error == 0)
				error |= hammer2_fixup_pfses(hmp);
			/* XXX do something with error */
		}
		hammer2_update_pmps(hmp);
		hammer2_iocom_init(hmp);
		hammer2_bulkfree_init(hmp);

		/*
		 * Ref the cluster management messaging descriptor.  The mount
		 * program deals with the other end of the communications pipe.
		 *
		 * Root mounts typically do not supply one.
		 */
		if (info.cluster_fd >= 0) {
			fp = holdfp(curthread, info.cluster_fd, -1);
			if (fp) {
				hammer2_cluster_reconnect(hmp, fp);
			} else {
				kprintf("hammer2_mount: bad cluster_fd!\n");
			}
		}
	} else {
		spmp = hmp->spmp;
		if (info.hflags & HMNT2_DEVFLAGS) {
			kprintf("hammer2: Warning: mount flags pertaining "
				"to the whole device may only be specified "
				"on the first mount of the device: %08x\n",
				info.hflags & HMNT2_DEVFLAGS);
		}
	}

	/*
	 * Force local mount (disassociate all PFSs from their clusters).
	 * Used primarily for debugging.
	 */
	force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;

	/*
	 * Lookup the mount point under the media-localized super-root.
	 * Scanning hammer2_pfslist doesn't help us because it represents
	 * PFS cluster ids which can aggregate several named PFSs together.
	 *
	 * cluster->pmp will incorrectly point to spmp and must be fixed
	 * up later on.
	 */
	hammer2_inode_lock(spmp->iroot, 0);
	parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
	lhc = hammer2_dirhash(label, strlen(label));
	chain = hammer2_chain_lookup(&parent, &key_next,
				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
				     &error, 0);
	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
		    strcmp(label, chain->data->ipdata.filename) == 0) {
			break;
		}
		chain = hammer2_chain_next(&parent, chain, &key_next,
					   key_next,
					   lhc + HAMMER2_DIRHASH_LOMASK,
					   &error, 0);
	}
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	hammer2_inode_unlock(spmp->iroot);

	/*
	 * PFS could not be found?
	 */
	if (chain == NULL) {
		if (error)
			kprintf("hammer2_mount: PFS label I/O error\n");
		else
			kprintf("hammer2_mount: PFS label not found\n");
		hammer2_unmount_helper(mp, NULL, hmp);
		lockmgr(&hammer2_mntlk, LK_RELEASE);
		hammer2_vfs_unmount(mp, MNT_FORCE);

		return EINVAL;
	}

	/*
	 * Acquire the pmp structure (it should have already been allocated
	 * via hammer2_update_pmps() so do not pass cluster in to add to
	 * available chains).
	 *
	 * Check if the cluster has already been mounted.  A cluster can
	 * only be mounted once, use null mounts to mount additional copies.
	 */
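	/*
	 * Illustrative (hypothetical paths): once a PFS cluster is mounted,
	 * additional views of it are created with a null mount rather than
	 * a second hammer2 mount, e.g.
	 *
	 *	mount_null /mnt/data /compat/data
	 */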
	if (chain->error) {
		kprintf("hammer2_mount: PFS label I/O error\n");
	} else {
		ripdata = &chain->data->ipdata;
		bref = chain->bref;
		pmp = hammer2_pfsalloc(NULL, ripdata,
				       bref.modify_tid, force_local);
	}
	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);

	/*
	 * Finish the mount
	 */
	kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);

	if (pmp->mp) {
		kprintf("hammer2_mount: PFS already mounted!\n");
		hammer2_unmount_helper(mp, NULL, hmp);
		lockmgr(&hammer2_mntlk, LK_RELEASE);
		hammer2_vfs_unmount(mp, MNT_FORCE);

		return EBUSY;
	}

	pmp->hflags = info.hflags;
	mp->mnt_flag |= MNT_LOCAL;
	mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;	/* all entry pts are SMP */
	mp->mnt_kern_flag |= MNTK_THR_SYNC;	/* new vsyncscan semantics */

	/*
	 * required mount structure initializations
	 */
	mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
	mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;

	mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
	mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;

	/*
	 * Optional fields
	 */
	mp->mnt_iosize_max = MAXPHYS;

	/*
	 * Connect up mount pointers.
	 */
	hammer2_mount_helper(mp, pmp);

	lockmgr(&hammer2_mntlk, LK_RELEASE);

	/*
	 * Finish setup
	 */
	vfs_getnewfsid(mp);
	vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
	vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
	vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);

	if (path) {
		copyinstr(info.volume, mp->mnt_stat.f_mntfromname,
			  MNAMELEN - 1, &size);
		bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
	} /* else root mount, already in there */

	bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
	if (path) {
		copyinstr(path, mp->mnt_stat.f_mntonname,
			  sizeof(mp->mnt_stat.f_mntonname) - 1,
			  &size);
	} else {
		/* root mount */
		mp->mnt_stat.f_mntonname[0] = '/';
	}

	/*
	 * Initial statfs to prime mnt_stat.
	 */
	hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);

	return 0;
}

/*
 * Scan PFSs under the super-root and create hammer2_pfs structures.
 */
static
void
hammer2_update_pmps(hammer2_dev_t *hmp)
{
	const hammer2_inode_data_t *ripdata;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t bref;
	hammer2_dev_t *force_local;
	hammer2_pfs_t *spmp;
	hammer2_pfs_t *pmp;
	hammer2_key_t key_next;
	int error;

	/*
	 * Force local mount (disassociate all PFSs from their clusters).
	 * Used primarily for debugging.
	 */
	force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;

	/*
	 * Lookup mount point under the media-localized super-root.
	 *
	 * cluster->pmp will incorrectly point to spmp and must be fixed
	 * up later on.
	 */
	spmp = hmp->spmp;
	hammer2_inode_lock(spmp->iroot, 0);
	parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
	chain = hammer2_chain_lookup(&parent, &key_next,
				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
				     &error, 0);
	while (chain) {
		if (chain->bref.type != HAMMER2_BREF_TYPE_INODE)
			continue;
		if (chain->error) {
			kprintf("I/O error scanning PFS labels\n");
		} else {
			ripdata = &chain->data->ipdata;
			bref = chain->bref;

			pmp = hammer2_pfsalloc(chain, ripdata,
					       bref.modify_tid, force_local);
		}
		chain = hammer2_chain_next(&parent, chain, &key_next,
					   key_next, HAMMER2_KEY_MAX,
					   &error, 0);
	}
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	hammer2_inode_unlock(spmp->iroot);
}

static
int
hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path __unused,
		struct vnode *devvp, struct ucred *cred)
{
	int error;

	if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, NULL);
		vn_unlock(devvp);
		error = hammer2_recovery(hmp);
		if (error == 0)
			error |= hammer2_fixup_pfses(hmp);
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		if (error == 0) {
			VOP_CLOSE(devvp, FREAD, NULL);
			hmp->ronly = 0;
		} else {
			VOP_CLOSE(devvp, FREAD | FWRITE, NULL);
		}
		vn_unlock(devvp);
	} else {
		error = 0;
	}
	return error;
}

static
int
hammer2_vfs_unmount(struct mount *mp, int mntflags)
{
	hammer2_pfs_t *pmp;
	int flags;
	int error = 0;

	pmp = MPTOPMP(mp);

	if (pmp == NULL)
		return(0);

	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);

	/*
	 * If mount initialization proceeded far enough we must flush
	 * its vnodes and sync the underlying mount points.  Three syncs
	 * are required to fully flush the filesystem (freemap updates lag
	 * by one flush, and one extra for safety).
	 */
	if (mntflags & MNT_FORCE)
		flags = FORCECLOSE;
	else
		flags = 0;
	if (pmp->iroot) {
		error = vflush(mp, 0, flags);
		if (error)
			goto failed;
		hammer2_vfs_sync(mp, MNT_WAIT);
		hammer2_vfs_sync(mp, MNT_WAIT);
		hammer2_vfs_sync(mp, MNT_WAIT);
	}

	/*
	 * Cleanup the frontend support XOPS threads
	 */
	hammer2_xop_helper_cleanup(pmp);

	if (pmp->mp)
		hammer2_unmount_helper(mp, pmp, NULL);

	error = 0;
failed:
	lockmgr(&hammer2_mntlk, LK_RELEASE);

	return (error);
}

/*
 * Mount helper, hook the system mount into our PFS.
 * The mount lock is held.
 *
 * We must bump the mount_count on related devices for any
 * mounted PFSs.
 */
static
void
hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
{
	hammer2_cluster_t *cluster;
	hammer2_chain_t *rchain;
	int i;

	mp->mnt_data = (qaddr_t)pmp;
	pmp->mp = mp;

	/*
	 * After pmp->mp is set we have to adjust hmp->mount_count.
	 */
	cluster = &pmp->iroot->cluster;
	for (i = 0; i < cluster->nchains; ++i) {
		rchain = cluster->array[i].chain;
		if (rchain == NULL)
			continue;
		++rchain->hmp->mount_count;
	}

	/*
	 * Create missing Xop threads
	 */
	hammer2_xop_helper_create(pmp);
}

/*
 * Mount helper, unhook the system mount from our PFS.
 * The mount lock is held.
 *
 * If hmp is supplied a mount responsible for being the first to open
 * the block device failed and the block device and all PFSs using the
 * block device must be cleaned up.
 *
 * If pmp is supplied multiple devices might be backing the PFS and each
 * must be disconnected.  This might not be the last PFS using some of the
 * underlying devices.  Also, we have to adjust our hmp->mount_count
 * accounting for the devices backing the pmp which is now undergoing an
 * unmount.
 */
static
void
hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
{
	hammer2_cluster_t *cluster;
	hammer2_chain_t *rchain;
	struct vnode *devvp;
	int dumpcnt;
	int ronly;
	int i;

	/*
	 * If no device supplied this is a high-level unmount and we have to
	 * disconnect the mount, adjust mount_count, and locate devices that
	 * might now have no mounts.
	 */
	if (pmp) {
		KKASSERT(hmp == NULL);
		KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
		pmp->mp = NULL;
		mp->mnt_data = NULL;

		/*
		 * After pmp->mp is cleared we have to account for
		 * mount_count.
		 */
		cluster = &pmp->iroot->cluster;
		for (i = 0; i < cluster->nchains; ++i) {
			rchain = cluster->array[i].chain;
			if (rchain == NULL)
				continue;
			--rchain->hmp->mount_count;
			/* scrapping hmp now may invalidate the pmp */
		}
again:
		TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
			if (hmp->mount_count == 0) {
				hammer2_unmount_helper(NULL, NULL, hmp);
				goto again;
			}
		}
		return;
	}

	/*
	 * Try to terminate the block device.  We can't terminate it if
	 * there are still PFSs referencing it.
	 */
	if (hmp->mount_count)
		return;

	/*
	 * Decommission the network before we start messing with the
	 * device and PFS.
	 */
	hammer2_iocom_uninit(hmp);

	hammer2_bulkfree_uninit(hmp);
	hammer2_pfsfree_scan(hmp, 0);
	hammer2_dev_exlock(hmp);	/* XXX order */

	/*
	 * Cycle the volume data lock as a safety (probably not needed any
	 * more).  To ensure everything is out we need to flush at least
	 * three times.  (1) The running of the sideq can dirty the
	 * filesystem, (2) A normal flush can dirty the freemap, and
	 * (3) ensure that the freemap is fully synchronized.
	 *
	 * The next mount's recovery scan can clean everything up but we want
	 * to leave the filesystem in a 100% clean state on a normal unmount.
	 */
#if 0
	hammer2_voldata_lock(hmp);
	hammer2_voldata_unlock(hmp);
#endif

	/*
	 * Flush whatever is left.  Unmounted but modified PFS's might still
	 * have some dirty chains on them.
	 */
	hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
	hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);

	if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
		hammer2_voldata_modify(hmp);
		hammer2_flush(&hmp->fchain, HAMMER2_FLUSH_TOP |
					    HAMMER2_FLUSH_ALL);
	}
	hammer2_chain_unlock(&hmp->fchain);

	if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
		hammer2_flush(&hmp->vchain, HAMMER2_FLUSH_TOP |
					    HAMMER2_FLUSH_ALL);
	}
	hammer2_chain_unlock(&hmp->vchain);

	if ((hmp->vchain.flags | hmp->fchain.flags) &
	    HAMMER2_CHAIN_FLUSH_MASK) {
		kprintf("hammer2_unmount: chains left over "
			"after final sync\n");
		kprintf("    vchain %08x\n", hmp->vchain.flags);
		kprintf("    fchain %08x\n", hmp->fchain.flags);

		if (hammer2_debug & 0x0010)
			Debugger("entered debugger");
	}

	hammer2_pfsfree_scan(hmp, 1);

	KKASSERT(hmp->spmp == NULL);

	/*
	 * Finish up with the device vnode
	 */
	if ((devvp = hmp->devvp) != NULL) {
		ronly = hmp->ronly;
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		kprintf("hammer2_unmount(A): devvp %s rbdirty %p ronly=%d\n",
			hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree),
			ronly);
		vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
		kprintf("hammer2_unmount(B): devvp %s rbdirty %p\n",
			hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree));
		hmp->devvp = NULL;
		VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
		vn_unlock(devvp);
		vrele(devvp);
		devvp = NULL;
	}

	/*
	 * Clear vchain/fchain flags that might prevent final cleanup
	 * of these chains.
	 */
	if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
		atomic_add_long(&hammer2_count_modified_chains, -1);
		atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
	}
	if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
		atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_UPDATE);
	}

	if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
		atomic_add_long(&hammer2_count_modified_chains, -1);
		atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
	}
	if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
		atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_UPDATE);
	}

	/*
	 * Final drop of embedded freemap root chain to
	 * clean up fchain.core (fchain structure is not
	 * flagged ALLOCATED so it is cleaned out and then
	 * left to rot).
	 */
	hammer2_chain_drop(&hmp->fchain);

	/*
	 * Final drop of embedded volume root chain to clean
	 * up vchain.core (vchain structure is not flagged
	 * ALLOCATED so it is cleaned out and then left to
	 * rot).
	 */
	dumpcnt = 50;
	hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
	dumpcnt = 50;
	hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1);
	hammer2_dev_unlock(hmp);
	hammer2_chain_drop(&hmp->vchain);

	hammer2_io_cleanup(hmp, &hmp->iotree);
	if (hmp->iofree_count) {
		kprintf("io_cleanup: %d I/O's left hanging\n",
			hmp->iofree_count);
	}

	TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
	kmalloc_destroy(&hmp->mchain);
	kfree(hmp, M_HAMMER2);
}

int
hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
		 ino_t ino, struct vnode **vpp)
{
	hammer2_xop_lookup_t *xop;
	hammer2_pfs_t *pmp;
	hammer2_inode_t *ip;
	hammer2_tid_t inum;
	int error;

	inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK;

	error = 0;
	pmp = MPTOPMP(mp);

	/*
	 * Easy if we already have it cached
	 */
	ip = hammer2_inode_lookup(pmp, inum);
	if (ip) {
		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
		*vpp = hammer2_igetv(ip, &error);
		hammer2_inode_unlock(ip);
		hammer2_inode_drop(ip);	/* from lookup */

		return error;
	}

	/*
	 * Otherwise we have to find the inode
	 */
	xop = hammer2_xop_alloc(pmp->iroot, 0);
	xop->lhc = inum;
	hammer2_xop_start(&xop->head, hammer2_xop_lookup);
	error = hammer2_xop_collect(&xop->head, 0);

	if (error == 0)
		ip = hammer2_inode_get(pmp, NULL, &xop->head, -1);
	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);

	if (ip) {
		*vpp = hammer2_igetv(ip, &error);
		hammer2_inode_unlock(ip);
	} else {
		*vpp = NULL;
		error = ENOENT;
	}
	return (error);
}

static
int
hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
{
	hammer2_pfs_t *pmp;
	struct vnode *vp;
	int error;

	pmp = MPTOPMP(mp);
	if (pmp->iroot == NULL) {
		kprintf("hammer2 (%s): no root inode\n",
			mp->mnt_stat.f_mntfromname);
		*vpp = NULL;
		return EINVAL;
	}

	error = 0;
	hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED);

	while (pmp->inode_tid == 0) {
		hammer2_xop_ipcluster_t *xop;
		const hammer2_inode_meta_t *meta;

		xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING);
		hammer2_xop_start(&xop->head, hammer2_xop_ipcluster);
		error = hammer2_xop_collect(&xop->head, 0);

		if (error == 0) {
			meta = &hammer2_xop_gdata(&xop->head)->ipdata.meta;
			pmp->iroot->meta = *meta;
			pmp->inode_tid = meta->pfs_inum + 1;
			hammer2_xop_pdata(&xop->head);
			/* meta invalid */

			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			pmp->modify_tid =
				xop->head.cluster.focus->bref.modify_tid + 1;
#if 0
			kprintf("PFS: Starting inode %jd\n",
				(intmax_t)pmp->inode_tid);
			kprintf("PMP focus good set nextino=%ld mod=%016jx\n",
				pmp->inode_tid, pmp->modify_tid);
#endif
			wakeup(&pmp->iroot);

			hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);

			/*
			 * Prime the mount info.
			 */
1947 */ 1948 hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL); 1949 break; 1950 } 1951 1952 /* 1953 * Loop, try again 1954 */ 1955 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1956 hammer2_inode_unlock(pmp->iroot); 1957 error = tsleep(&pmp->iroot, PCATCH, "h2root", hz); 1958 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 1959 if (error == EINTR) 1960 break; 1961 } 1962 1963 if (error) { 1964 hammer2_inode_unlock(pmp->iroot); 1965 *vpp = NULL; 1966 } else { 1967 vp = hammer2_igetv(pmp->iroot, &error); 1968 hammer2_inode_unlock(pmp->iroot); 1969 *vpp = vp; 1970 } 1971 1972 return (error); 1973 } 1974 1975 /* 1976 * Filesystem status 1977 * 1978 * XXX incorporate ipdata->meta.inode_quota and data_quota 1979 */ 1980 static 1981 int 1982 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) 1983 { 1984 hammer2_pfs_t *pmp; 1985 hammer2_dev_t *hmp; 1986 hammer2_blockref_t bref; 1987 struct statfs tmp; 1988 int i; 1989 1990 /* 1991 * NOTE: iroot might not have validated the cluster yet. 1992 */ 1993 pmp = MPTOPMP(mp); 1994 1995 bzero(&tmp, sizeof(tmp)); 1996 1997 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 1998 hmp = pmp->pfs_hmps[i]; 1999 if (hmp == NULL) 2000 continue; 2001 if (pmp->iroot->cluster.array[i].chain) 2002 bref = pmp->iroot->cluster.array[i].chain->bref; 2003 else 2004 bzero(&bref, sizeof(bref)); 2005 2006 tmp.f_files = bref.embed.stats.inode_count; 2007 tmp.f_ffree = 0; 2008 tmp.f_blocks = hmp->voldata.allocator_size / 2009 mp->mnt_vstat.f_bsize; 2010 tmp.f_bfree = hmp->voldata.allocator_free / 2011 mp->mnt_vstat.f_bsize; 2012 tmp.f_bavail = tmp.f_bfree; 2013 2014 if (cred && cred->cr_uid != 0) { 2015 uint64_t adj; 2016 2017 /* 5% */ 2018 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2019 tmp.f_blocks -= adj; 2020 tmp.f_bfree -= adj; 2021 tmp.f_bavail -= adj; 2022 } 2023 2024 mp->mnt_stat.f_blocks = tmp.f_blocks; 2025 mp->mnt_stat.f_bfree = tmp.f_bfree; 2026 mp->mnt_stat.f_bavail = tmp.f_bavail; 2027 mp->mnt_stat.f_files = tmp.f_files; 2028 mp->mnt_stat.f_ffree = tmp.f_ffree; 2029 2030 *sbp = mp->mnt_stat; 2031 } 2032 return (0); 2033 } 2034 2035 static 2036 int 2037 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) 2038 { 2039 hammer2_pfs_t *pmp; 2040 hammer2_dev_t *hmp; 2041 hammer2_blockref_t bref; 2042 struct statvfs tmp; 2043 int i; 2044 2045 /* 2046 * NOTE: iroot might not have validated the cluster yet. 
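 *
 *	 The per-chain math below mirrors hammer2_vfs_statfs().  As a
 *	 rough illustrative example (numbers assumed, not taken from a
 *	 real volume): with allocator_size = 1 TiB, allocator_free =
 *	 512 GiB and f_bsize = 65536, the loop reports f_blocks =
 *	 16777216 and f_bfree = f_bavail = 8388608, less the ~5%
 *	 free_reserved adjustment applied for non-root callers.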
2047 */ 2048 pmp = MPTOPMP(mp); 2049 bzero(&tmp, sizeof(tmp)); 2050 2051 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2052 hmp = pmp->pfs_hmps[i]; 2053 if (hmp == NULL) 2054 continue; 2055 if (pmp->iroot->cluster.array[i].chain) 2056 bref = pmp->iroot->cluster.array[i].chain->bref; 2057 else 2058 bzero(&bref, sizeof(bref)); 2059 2060 tmp.f_files = bref.embed.stats.inode_count; 2061 tmp.f_ffree = 0; 2062 tmp.f_blocks = hmp->voldata.allocator_size / 2063 mp->mnt_vstat.f_bsize; 2064 tmp.f_bfree = hmp->voldata.allocator_free / 2065 mp->mnt_vstat.f_bsize; 2066 tmp.f_bavail = tmp.f_bfree; 2067 2068 if (cred && cred->cr_uid != 0) { 2069 uint64_t adj; 2070 2071 /* 5% */ 2072 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2073 tmp.f_blocks -= adj; 2074 tmp.f_bfree -= adj; 2075 tmp.f_bavail -= adj; 2076 } 2077 2078 mp->mnt_vstat.f_blocks = tmp.f_blocks; 2079 mp->mnt_vstat.f_bfree = tmp.f_bfree; 2080 mp->mnt_vstat.f_bavail = tmp.f_bavail; 2081 mp->mnt_vstat.f_files = tmp.f_files; 2082 mp->mnt_vstat.f_ffree = tmp.f_ffree; 2083 2084 *sbp = mp->mnt_vstat; 2085 } 2086 return (0); 2087 } 2088 2089 /* 2090 * Mount-time recovery (RW mounts) 2091 * 2092 * Updates to the free block table are allowed to lag flushes by one 2093 * transaction. In case of a crash, then on a fresh mount we must do an 2094 * incremental scan of the last committed transaction id and make sure that 2095 * all related blocks have been marked allocated. 2096 * 2097 * The super-root topology and each PFS has its own transaction id domain, 2098 * so we must track PFS boundary transitions. 2099 */ 2100 struct hammer2_recovery_elm { 2101 TAILQ_ENTRY(hammer2_recovery_elm) entry; 2102 hammer2_chain_t *chain; 2103 hammer2_tid_t sync_tid; 2104 }; 2105 2106 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm); 2107 2108 struct hammer2_recovery_info { 2109 struct hammer2_recovery_list list; 2110 hammer2_tid_t mtid; 2111 int depth; 2112 }; 2113 2114 static int hammer2_recovery_scan(hammer2_dev_t *hmp, 2115 hammer2_chain_t *parent, 2116 struct hammer2_recovery_info *info, 2117 hammer2_tid_t sync_tid); 2118 2119 #define HAMMER2_RECOVERY_MAXDEPTH 10 2120 2121 static 2122 int 2123 hammer2_recovery(hammer2_dev_t *hmp) 2124 { 2125 struct hammer2_recovery_info info; 2126 struct hammer2_recovery_elm *elm; 2127 hammer2_chain_t *parent; 2128 hammer2_tid_t sync_tid; 2129 hammer2_tid_t mirror_tid; 2130 int error; 2131 2132 hammer2_trans_init(hmp->spmp, 0); 2133 2134 sync_tid = hmp->voldata.freemap_tid; 2135 mirror_tid = hmp->voldata.mirror_tid; 2136 2137 kprintf("hammer2 mount \"%s\": ", hmp->devrepname); 2138 if (sync_tid >= mirror_tid) { 2139 kprintf(" no recovery needed\n"); 2140 } else { 2141 kprintf(" freemap recovery %016jx-%016jx\n", 2142 sync_tid + 1, mirror_tid); 2143 } 2144 2145 TAILQ_INIT(&info.list); 2146 info.depth = 0; 2147 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 2148 error = hammer2_recovery_scan(hmp, parent, &info, sync_tid); 2149 hammer2_chain_lookup_done(parent); 2150 2151 while ((elm = TAILQ_FIRST(&info.list)) != NULL) { 2152 TAILQ_REMOVE(&info.list, elm, entry); 2153 parent = elm->chain; 2154 sync_tid = elm->sync_tid; 2155 kfree(elm, M_HAMMER2); 2156 2157 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2158 error |= hammer2_recovery_scan(hmp, parent, &info, 2159 hmp->voldata.freemap_tid); 2160 hammer2_chain_unlock(parent); 2161 hammer2_chain_drop(parent); /* drop elm->chain ref */ 2162 } 2163 2164 hammer2_trans_done(hmp->spmp, 0); 2165 2166 return error; 2167 } 2168 2169 static 2170 int 2171 
hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent, 2172 struct hammer2_recovery_info *info, 2173 hammer2_tid_t sync_tid) 2174 { 2175 const hammer2_inode_data_t *ripdata; 2176 hammer2_chain_t *chain; 2177 hammer2_blockref_t bref; 2178 int tmp_error; 2179 int rup_error; 2180 int error; 2181 int first; 2182 2183 /* 2184 * Adjust freemap to ensure that the block(s) are marked allocated. 2185 */ 2186 if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) { 2187 hammer2_freemap_adjust(hmp, &parent->bref, 2188 HAMMER2_FREEMAP_DORECOVER); 2189 } 2190 2191 /* 2192 * Check type for recursive scan 2193 */ 2194 switch(parent->bref.type) { 2195 case HAMMER2_BREF_TYPE_VOLUME: 2196 /* data already instantiated */ 2197 break; 2198 case HAMMER2_BREF_TYPE_INODE: 2199 /* 2200 * Must instantiate data for DIRECTDATA test and also 2201 * for recursion. 2202 */ 2203 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2204 ripdata = &hammer2_chain_rdata(parent)->ipdata; 2205 if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) { 2206 /* not applicable to recovery scan */ 2207 hammer2_chain_unlock(parent); 2208 return 0; 2209 } 2210 hammer2_chain_unlock(parent); 2211 break; 2212 case HAMMER2_BREF_TYPE_INDIRECT: 2213 /* 2214 * Must instantiate data for recursion 2215 */ 2216 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2217 hammer2_chain_unlock(parent); 2218 break; 2219 case HAMMER2_BREF_TYPE_DIRENT: 2220 case HAMMER2_BREF_TYPE_DATA: 2221 case HAMMER2_BREF_TYPE_FREEMAP: 2222 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2223 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2224 /* not applicable to recovery scan */ 2225 return 0; 2226 break; 2227 default: 2228 return HAMMER2_ERROR_BADBREF; 2229 } 2230 2231 /* 2232 * Defer operation if depth limit reached or if we are crossing a 2233 * PFS boundary. 2234 */ 2235 if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) { 2236 struct hammer2_recovery_elm *elm; 2237 2238 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK); 2239 elm->chain = parent; 2240 elm->sync_tid = sync_tid; 2241 hammer2_chain_ref(parent); 2242 TAILQ_INSERT_TAIL(&info->list, elm, entry); 2243 /* unlocked by caller */ 2244 2245 return(0); 2246 } 2247 2248 2249 /* 2250 * Recursive scan of the last flushed transaction only. We are 2251 * doing this without pmp assignments so don't leave the chains 2252 * hanging around after we are done with them. 2253 * 2254 * error Cumulative error this level only 2255 * rup_error Cumulative error for recursion 2256 * tmp_error Specific non-cumulative recursion error 2257 */ 2258 chain = NULL; 2259 first = 1; 2260 rup_error = 0; 2261 error = 0; 2262 2263 for (;;) { 2264 error |= hammer2_chain_scan(parent, &chain, &bref, 2265 &first, 2266 HAMMER2_LOOKUP_NODATA); 2267 2268 /* 2269 * Problem during scan or EOF 2270 */ 2271 if (error) 2272 break; 2273 2274 /* 2275 * If this is a leaf 2276 */ 2277 if (chain == NULL) { 2278 if (bref.mirror_tid > sync_tid) { 2279 hammer2_freemap_adjust(hmp, &bref, 2280 HAMMER2_FREEMAP_DORECOVER); 2281 } 2282 continue; 2283 } 2284 2285 /* 2286 * This may or may not be a recursive node. 2287 */ 2288 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 2289 if (bref.mirror_tid > sync_tid) { 2290 ++info->depth; 2291 tmp_error = hammer2_recovery_scan(hmp, chain, 2292 info, sync_tid); 2293 --info->depth; 2294 } else { 2295 tmp_error = 0; 2296 } 2297 2298 /* 2299 * Flush the recovery at the PFS boundary to stage it for 2300 * the final flush of the super-root topology. 
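	 *
	 * (Chains deferred above due to HAMMER2_RECOVERY_MAXDEPTH are
	 * queued on info->list with an extra ref; hammer2_recovery()
	 * pops them off and restarts the scan at depth 0, so deep
	 * topologies are walked iteratively rather than by unbounded
	 * recursion.)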
2301 */ 2302 if (tmp_error == 0 && 2303 (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) && 2304 (chain->flags & HAMMER2_CHAIN_ONFLUSH)) { 2305 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 2306 HAMMER2_FLUSH_ALL); 2307 } 2308 rup_error |= tmp_error; 2309 } 2310 return ((error | rup_error) & ~HAMMER2_ERROR_EOF); 2311 } 2312 2313 /* 2314 * This fixes up an error introduced in earlier H2 implementations where 2315 * moving a PFS inode into an indirect block wound up causing the 2316 * HAMMER2_BREF_FLAG_PFSROOT flag in the bref to get cleared. 2317 */ 2318 static 2319 int 2320 hammer2_fixup_pfses(hammer2_dev_t *hmp) 2321 { 2322 const hammer2_inode_data_t *ripdata; 2323 hammer2_chain_t *parent; 2324 hammer2_chain_t *chain; 2325 hammer2_key_t key_next; 2326 hammer2_pfs_t *spmp; 2327 int error; 2328 2329 error = 0; 2330 2331 /* 2332 * Lookup mount point under the media-localized super-root. 2333 * 2334 * cluster->pmp will incorrectly point to spmp and must be fixed 2335 * up later on. 2336 */ 2337 spmp = hmp->spmp; 2338 hammer2_inode_lock(spmp->iroot, 0); 2339 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 2340 chain = hammer2_chain_lookup(&parent, &key_next, 2341 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 2342 &error, 0); 2343 while (chain) { 2344 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) 2345 continue; 2346 if (chain->error) { 2347 kprintf("I/O error scanning PFS labels\n"); 2348 error |= chain->error; 2349 } else if ((chain->bref.flags & 2350 HAMMER2_BREF_FLAG_PFSROOT) == 0) { 2351 int error2; 2352 2353 ripdata = &chain->data->ipdata; 2354 hammer2_trans_init(hmp->spmp, 0); 2355 error2 = hammer2_chain_modify(chain, 2356 chain->bref.modify_tid, 2357 0, 0); 2358 if (error2 == 0) { 2359 kprintf("hammer2: Correct mis-flagged PFS %s\n", 2360 ripdata->filename); 2361 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 2362 } else { 2363 error |= error2; 2364 } 2365 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 2366 HAMMER2_FLUSH_ALL); 2367 hammer2_trans_done(hmp->spmp, 0); 2368 } 2369 chain = hammer2_chain_next(&parent, chain, &key_next, 2370 key_next, HAMMER2_KEY_MAX, 2371 &error, 0); 2372 } 2373 if (parent) { 2374 hammer2_chain_unlock(parent); 2375 hammer2_chain_drop(parent); 2376 } 2377 hammer2_inode_unlock(spmp->iroot); 2378 2379 return error; 2380 } 2381 2382 /* 2383 * Sync a mount point; this is called periodically on a per-mount basis from 2384 * the filesystem syncer, and whenever a user issues a sync. 2385 */ 2386 int 2387 hammer2_vfs_sync(struct mount *mp, int waitfor) 2388 { 2389 hammer2_xop_flush_t *xop; 2390 struct hammer2_sync_info info; 2391 hammer2_inode_t *iroot; 2392 hammer2_pfs_t *pmp; 2393 int flags; 2394 int error; 2395 2396 pmp = MPTOPMP(mp); 2397 iroot = pmp->iroot; 2398 KKASSERT(iroot); 2399 KKASSERT(iroot->pmp == pmp); 2400 2401 /* 2402 * We can't acquire locks on existing vnodes while in a transaction 2403 * without risking a deadlock. This assumes that vfsync() can be 2404 * called without the vnode locked (which it can in DragonFly). 2405 * Otherwise we'd have to implement a multi-pass or flag the lock 2406 * failures and retry. 2407 * 2408 * The reclamation code interlocks with the sync list's token 2409 * (by removing the vnode from the scan list) before unlocking 2410 * the inode, giving us time to ref the inode. 2411 */ 2412 /*flags = VMSC_GETVP;*/ 2413 flags = 0; 2414 if (waitfor & MNT_LAZY) 2415 flags |= VMSC_ONEPASS; 2416 2417 /* 2418 * Flush vnodes individually using a normal transaction to avoid 2419 * stalling any concurrent operations. 
This will flush the related 2420 * buffer cache buffers and inodes to the media. 2421 * 2422 * For efficiency do an async pass before making sure with a 2423 * synchronous pass on all related buffer cache buffers. 2424 */ 2425 hammer2_trans_init(pmp, 0); 2426 2427 info.error = 0; 2428 2429 info.waitfor = MNT_NOWAIT; 2430 info.pass = 1; 2431 vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info); 2432 2433 /* 2434 * Now do two passes making sure we get everything. The first pass 2435 * vfsync()s dirty vnodes. The second pass waits for their I/O's 2436 * to finish and cleans up the dirty flag on the vnode. 2437 */ 2438 info.pass = 1; 2439 info.waitfor = MNT_WAIT; 2440 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2441 2442 info.pass = 2; 2443 info.waitfor = MNT_WAIT; 2444 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2445 2446 /* 2447 * We must also run the sideq to handle any disconnected inodes 2448 * as the vnode scan will not see these. 2449 */ 2450 hammer2_inode_run_sideq(pmp, 1); 2451 hammer2_trans_done(pmp, 0); 2452 2453 /* 2454 * Start our flush transaction and flush the root topology down to 2455 * the inodes, but not the inodes themselves (which we already flushed 2456 * above). Any concurrent activity affecting inode contents will not 2457 * be included in this flush; it is picked up by a later flush. 2458 * 2459 * The flush sequence below re-syncs the vnodes inside the flush transaction and then flushes the PFSROOT subtopology via the XOP interface. 2460 * NOTE! It is still possible for the paging code to push pages 2461 * out via a UIO_NOCOPY hammer2_vop_write() during the main 2462 * flush. 2463 */ 2464 hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH); 2465 2466 /* 2467 * Sync dirty vnodes again while in the flush transaction. This is 2468 * currently an expensive shim to make sure the logical topology is 2469 * completely consistent before we flush the volume header. 2470 */ 2471 info.pass = 1; 2472 info.waitfor = MNT_WAIT; 2473 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2474 2475 info.pass = 2; 2476 info.waitfor = MNT_WAIT; 2477 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2478 2479 /* 2480 * Use the XOP interface to concurrently flush all nodes to 2481 * synchronize the PFSROOT subtopology to the media. A standard 2482 * end-of-scan ENOENT error indicates cluster sufficiency. 2483 * 2484 * Note that this flush will not be visible on crash recovery until 2485 * we flush the super-root topology in the next loop. 2486 * 2487 * XXX For now wait for all flushes to complete. 2488 */ 2489 if (iroot) { 2490 /* 2491 * If unmounting try to flush everything including any 2492 * sub-trees under inodes, just in case there is dangling 2493 * modified data, as a safety. Otherwise just flush up to 2494 * the inodes in this stage. 2495 */ 2496 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 2497 xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING | 2498 HAMMER2_XOP_VOLHDR); 2499 } else { 2500 xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING | 2501 HAMMER2_XOP_INODE_STOP | 2502 HAMMER2_XOP_VOLHDR); 2503 } 2504 hammer2_xop_start(&xop->head, hammer2_inode_xop_flush); 2505 error = hammer2_xop_collect(&xop->head, 2506 HAMMER2_XOP_COLLECT_WAITALL); 2507 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2508 if (error == HAMMER2_ERROR_ENOENT) 2509 error = 0; 2510 else 2511 error = hammer2_error_to_errno(error); 2512 } else { 2513 error = 0; 2514 } 2515 hammer2_trans_done(pmp, 0); 2516 2517 return (error); 2518 } 2519 2520 /* 2521 * Sync passes. 2522 * 2523 * Note that we ignore the transaction mtid we got above. Instead, 2524 * each vfsync below will ultimately get its own via TRANS_BUFCACHE 2525 * transactions. 2526 * 2527 * WARNING!
The frontend might be waiting on chnmem (limit_dirty_chains) 2528 * while holding a vnode locked. When this situation occurs we cannot 2529 * safely test whether it is ok to clear the dirty bit on the vnode. 2530 * However, we can still flush the inode's topology. 2531 */ 2532 static int 2533 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data) 2534 { 2535 struct hammer2_sync_info *info = data; 2536 hammer2_inode_t *ip; 2537 int error; 2538 2539 /* 2540 * Degenerate cases. Note that ip == NULL typically means the 2541 * syncer vnode itself and we don't want to vclrisdirty() in that 2542 * situation. 2543 */ 2544 ip = VTOI(vp); 2545 if (ip == NULL) { 2546 return(0); 2547 } 2548 if (vp->v_type == VNON || vp->v_type == VBAD) { 2549 vclrisdirty(vp); 2550 return(0); 2551 } 2552 2553 /* 2554 * Synchronize the buffer cache and inode meta-data to the backing 2555 * chain topology. 2556 * 2557 * vfsync is not necessarily synchronous, so it is best NOT to try 2558 * to flush the backing topology to media at this point. 2559 */ 2560 hammer2_inode_ref(ip); 2561 if ((ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED)) || 2562 !RB_EMPTY(&vp->v_rbdirty_tree)) { 2563 if (info->pass == 1) 2564 vfsync(vp, info->waitfor, 1, NULL, NULL); 2565 else 2566 bio_track_wait(&vp->v_track_write, 0, 0); 2567 } 2568 if (info->pass == 2 && (vp->v_flag & VISDIRTY)) { 2569 /* 2570 * v_token is needed to interlock v_rbdirty_tree. 2571 */ 2572 lwkt_gettoken(&vp->v_token); 2573 hammer2_inode_lock(ip, 0); 2574 hammer2_inode_chain_sync(ip); 2575 hammer2_inode_chain_flush(ip); 2576 if ((ip->flags & (HAMMER2_INODE_MODIFIED | 2577 HAMMER2_INODE_RESIZED | 2578 HAMMER2_INODE_DIRTYDATA)) == 0 && 2579 RB_EMPTY(&vp->v_rbdirty_tree) && 2580 !bio_track_active(&vp->v_track_write)) { 2581 vclrisdirty(vp); 2582 } 2583 hammer2_inode_unlock(ip); 2584 lwkt_reltoken(&vp->v_token); 2585 } 2586 hammer2_inode_drop(ip); 2587 #if 1 2588 error = 0; 2589 if (error) 2590 info->error = error; 2591 #endif 2592 return(0); 2593 } 2594 2595 static 2596 int 2597 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp) 2598 { 2599 hammer2_inode_t *ip; 2600 2601 KKASSERT(MAXFIDSZ >= 16); 2602 ip = VTOI(vp); 2603 fhp->fid_len = offsetof(struct fid, fid_data[16]); 2604 fhp->fid_ext = 0; 2605 ((hammer2_tid_t *)fhp->fid_data)[0] = ip->meta.inum; 2606 ((hammer2_tid_t *)fhp->fid_data)[1] = 0; 2607 2608 return 0; 2609 } 2610 2611 static 2612 int 2613 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 2614 struct fid *fhp, struct vnode **vpp) 2615 { 2616 hammer2_pfs_t *pmp; 2617 hammer2_tid_t inum; 2618 int error; 2619 2620 pmp = MPTOPMP(mp); 2621 inum = ((hammer2_tid_t *)fhp->fid_data)[0] & HAMMER2_DIRHASH_USERMSK; 2622 if (vpp) { 2623 if (inum == 1) 2624 error = hammer2_vfs_root(mp, vpp); 2625 else 2626 error = hammer2_vfs_vget(mp, NULL, inum, vpp); 2627 } else { 2628 error = 0; 2629 } 2630 if (error) 2631 kprintf("fhtovp: %016jx -> %p, %d\n", inum, *vpp, error); 2632 return error; 2633 } 2634 2635 static 2636 int 2637 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 2638 int *exflagsp, struct ucred **credanonp) 2639 { 2640 hammer2_pfs_t *pmp; 2641 struct netcred *np; 2642 int error; 2643 2644 pmp = MPTOPMP(mp); 2645 np = vfs_export_lookup(mp, &pmp->export, nam); 2646 if (np) { 2647 *exflagsp = np->netc_exflags; 2648 *credanonp = &np->netc_anon; 2649 error = 0; 2650 } else { 2651 error = EACCES; 2652 } 2653 return error; 2654 } 2655 2656 /* 2657 * Support code for hammer2_vfs_mount().
Read, verify, and install the volume 2658 * header into the HMP 2659 * 2660 * XXX read four volhdrs and use the one with the highest TID whose CRC 2661 * matches. 2662 * 2663 * XXX check iCRCs. 2664 * 2665 * XXX For filesystems w/ less than 4 volhdrs, make sure not to write to 2666 * nonexistent locations. 2667 * 2668 * XXX Record selected volhdr and ring updates to each of 4 volhdrs 2669 */ 2670 static 2671 int 2672 hammer2_install_volume_header(hammer2_dev_t *hmp) 2673 { 2674 hammer2_volume_data_t *vd; 2675 struct buf *bp; 2676 hammer2_crc32_t crc0, crc, bcrc0, bcrc; 2677 int error_reported; 2678 int error; 2679 int valid; 2680 int i; 2681 2682 error_reported = 0; 2683 error = 0; 2684 valid = 0; 2685 bp = NULL; 2686 2687 /* 2688 * There are up to 4 copies of the volume header (syncs iterate 2689 * between them so there is no single master). We don't trust the 2690 * volu_size field so we don't know precisely how large the filesystem 2691 * is, so depend on the OS to return an error if we go beyond the 2692 * block device's EOF. 2693 */ 2694 for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) { 2695 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64, 2696 HAMMER2_VOLUME_BYTES, &bp); 2697 if (error) { 2698 brelse(bp); 2699 bp = NULL; 2700 continue; 2701 } 2702 2703 vd = (struct hammer2_volume_data *) bp->b_data; 2704 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) && 2705 (vd->magic != HAMMER2_VOLUME_ID_ABO)) { 2706 brelse(bp); 2707 bp = NULL; 2708 continue; 2709 } 2710 2711 if (vd->magic == HAMMER2_VOLUME_ID_ABO) { 2712 /* XXX: Reversed-endianness filesystem */ 2713 kprintf("hammer2: reverse-endian filesystem detected\n"); 2714 brelse(bp); 2715 bp = NULL; 2716 continue; 2717 } 2718 2719 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0]; 2720 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF, 2721 HAMMER2_VOLUME_ICRC0_SIZE); 2722 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1]; 2723 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF, 2724 HAMMER2_VOLUME_ICRC1_SIZE); 2725 if ((crc0 != crc) || (bcrc0 != bcrc)) { 2726 kprintf("hammer2 volume header crc " 2727 "mismatch copy #%d %08x/%08x\n", 2728 i, crc0, crc); 2729 error_reported = 1; 2730 brelse(bp); 2731 bp = NULL; 2732 continue; 2733 } 2734 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) { 2735 valid = 1; 2736 hmp->voldata = *vd; 2737 hmp->volhdrno = i; 2738 } 2739 brelse(bp); 2740 bp = NULL; 2741 } 2742 if (valid) { 2743 hmp->volsync = hmp->voldata; 2744 hmp->free_reserved = hmp->voldata.allocator_size / 20; 2745 error = 0; 2746 if (error_reported || bootverbose || 1) { /* 1/DEBUG */ 2747 kprintf("hammer2: using volume header #%d\n", 2748 hmp->volhdrno); 2749 } 2750 } else { 2751 error = EINVAL; 2752 kprintf("hammer2: no valid volume headers found!\n"); 2753 } 2754 return (error); 2755 } 2756 2757 /* 2758 * This handles hysteresis on regular file flushes. Because the BIOs are 2759 * routed to a thread it is possible for an excessive number to build up 2760 * and cause long front-end stalls long before the runningbuffspace limit 2761 * is hit, so we implement hammer2_flush_pipe to control the 2762 * hysteresis. 2763 * 2764 * This is a particular problem when compression is used.
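 *
 * Illustrative sketch only (the callers live in the write path and are
 * not shown in this file; the exact ordering is an assumption): a
 * frontend handing a logical write BIO to the flusher thread brackets
 * it roughly as
 *
 *	hammer2_lwinprog_ref(pmp);			(count BIO in-progress)
 *	... queue the BIO to the write thread ...
 *	hammer2_lwinprog_wait(pmp, hammer2_flush_pipe);	(throttle frontend)
 *
 * while the thread retiring the BIO calls hammer2_lwinprog_drop(pmp),
 * which wakes the waiter once the backlog falls to ~2/3 of the pipe.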
2765 */ 2766 void 2767 hammer2_lwinprog_ref(hammer2_pfs_t *pmp) 2768 { 2769 atomic_add_int(&pmp->count_lwinprog, 1); 2770 } 2771 2772 void 2773 hammer2_lwinprog_drop(hammer2_pfs_t *pmp) 2774 { 2775 int lwinprog; 2776 2777 lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1); 2778 if ((lwinprog & HAMMER2_LWINPROG_WAITING) && 2779 (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) { 2780 atomic_clear_int(&pmp->count_lwinprog, 2781 HAMMER2_LWINPROG_WAITING); 2782 wakeup(&pmp->count_lwinprog); 2783 } 2784 if ((lwinprog & HAMMER2_LWINPROG_WAITING0) && 2785 (lwinprog & HAMMER2_LWINPROG_MASK) <= 0) { 2786 atomic_clear_int(&pmp->count_lwinprog, 2787 HAMMER2_LWINPROG_WAITING0); 2788 wakeup(&pmp->count_lwinprog); 2789 } 2790 } 2791 2792 void 2793 hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe) 2794 { 2795 int lwinprog; 2796 int lwflag = (flush_pipe) ? HAMMER2_LWINPROG_WAITING : 2797 HAMMER2_LWINPROG_WAITING0; 2798 2799 for (;;) { 2800 lwinprog = pmp->count_lwinprog; 2801 cpu_ccfence(); 2802 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 2803 break; 2804 tsleep_interlock(&pmp->count_lwinprog, 0); 2805 atomic_set_int(&pmp->count_lwinprog, lwflag); 2806 lwinprog = pmp->count_lwinprog; 2807 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 2808 break; 2809 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz); 2810 } 2811 } 2812 2813 /* 2814 * Attempt to proactively fsync dirty vnodes if we have too many. This 2815 * solves an issue where the kernel syncer thread can get seriously behind 2816 * when multiple user processes/threads are furiously modifying inodes. 2817 * This situation can occur on slow storage and is only limited by 2818 * kern.maxvnodes without the moderation code below. It is made worse 2819 * when the device buffers underlying the modified inodes (which are clean) 2820 * get evicted before the flush can occur, forcing a re-read. 2821 * 2822 * We do not want sysads to feel that they have to torpedo kern.maxvnodes 2823 * to solve this problem, so we implement vfs.hammer2.limit_dirty_inodes 2824 * (per-mount-basis) and default it to something reasonable. 2825 */ 2826 static void 2827 hammer2_pfs_moderate(hammer2_inode_t *ip, int always_moderate) 2828 { 2829 hammer2_pfs_t *pmp = ip->pmp; 2830 struct mount *mp = pmp->mp; 2831 2832 if (mp && vn_syncer_count(mp) > hammer2_limit_dirty_inodes) { 2833 vn_syncer_one(mp); 2834 } 2835 } 2836 2837 /* 2838 * Manage excessive memory resource use for chain and related 2839 * structures. 2840 * 2841 * Called without any inode locks or transaction locks. VNodes 2842 * might be locked by the kernel in the call stack. 2843 */ 2844 void 2845 hammer2_pfs_memory_wait(hammer2_inode_t *ip, int always_moderate) 2846 { 2847 hammer2_pfs_t *pmp = ip->pmp; 2848 uint32_t waiting; 2849 uint32_t count; 2850 uint32_t limit; 2851 #if 0 2852 static int zzticks; 2853 #endif 2854 2855 /* 2856 * Moderate the number of dirty inodes 2857 */ 2858 hammer2_pfs_moderate(ip, always_moderate); 2859 2860 /* 2861 * Atomic check condition and wait. Also do an early speedup of 2862 * the syncer to try to avoid hitting the wait. 
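	 *
	 * The count tested here is maintained by the rest of the
	 * filesystem: code dirtying a chain calls
	 * hammer2_pfs_memory_inc(pmp) and code cleaning one calls
	 * hammer2_pfs_memory_wakeup(pmp), which decrements the count
	 * and wakes this loop if HAMMER2_DIRTYCHAIN_WAITING is set.
	 * A minimal sketch of that producer side (call sites assumed):
	 *
	 *	hammer2_pfs_memory_inc(pmp);	(chain becomes dirty)
	 *	...
	 *	hammer2_pfs_memory_wakeup(pmp);	(chain flushed or discarded)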
2863 */ 2864 for (;;) { 2865 waiting = pmp->inmem_dirty_chains; 2866 cpu_ccfence(); 2867 count = waiting & HAMMER2_DIRTYCHAIN_MASK; 2868 2869 limit = pmp->mp->mnt_nvnodelistsize / 10; 2870 if (limit < hammer2_limit_dirty_chains) 2871 limit = hammer2_limit_dirty_chains; 2872 if (limit < 1000) 2873 limit = 1000; 2874 2875 #if 0 2876 if ((int)(ticks - zzticks) > hz) { 2877 zzticks = ticks; 2878 kprintf("count %ld %ld\n", count, limit); 2879 } 2880 #endif 2881 2882 /* 2883 * Block if there are too many dirty chains present, wait 2884 * for the flush to clean some out. 2885 */ 2886 if (count > limit) { 2887 hammer2_pfs_moderate(ip, always_moderate); 2888 tsleep_interlock(&pmp->inmem_dirty_chains, 0); 2889 if (atomic_cmpset_int(&pmp->inmem_dirty_chains, 2890 waiting, 2891 waiting | HAMMER2_DIRTYCHAIN_WAITING)) { 2892 if (ticks != pmp->speedup_ticks) { 2893 pmp->speedup_ticks = ticks; 2894 speedup_syncer(pmp->mp); 2895 } 2896 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED, 2897 "chnmem", hz); 2898 } 2899 continue; /* loop on success or fail */ 2900 } 2901 2902 /* 2903 * Try to start an early flush before we are forced to block. 2904 */ 2905 if (count > limit * 5 / 10 && 2906 ticks != pmp->speedup_ticks) { 2907 pmp->speedup_ticks = ticks; 2908 speedup_syncer(pmp->mp); 2909 } 2910 break; 2911 } 2912 } 2913 2914 void 2915 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp) 2916 { 2917 if (pmp) { 2918 atomic_add_int(&pmp->inmem_dirty_chains, 1); 2919 } 2920 } 2921 2922 void 2923 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp) 2924 { 2925 uint32_t waiting; 2926 2927 if (pmp) { 2928 waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, -1); 2929 /* don't need --waiting to test flag */ 2930 if (waiting & HAMMER2_DIRTYCHAIN_WAITING) { 2931 atomic_clear_int(&pmp->inmem_dirty_chains, 2932 HAMMER2_DIRTYCHAIN_WAITING); 2933 wakeup(&pmp->inmem_dirty_chains); 2934 } 2935 } 2936 } 2937 2938 /* 2939 * Returns 0 if the filesystem has tons of free space 2940 * Returns 1 if the filesystem has less than 10% remaining 2941 * Returns 2 if the filesystem has less than 2%/5% (user/root) remaining. 
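 *
 * Minimal usage sketch (the caller and its error policy are assumed,
 * not part of this file):
 *
 *	switch (hammer2_vfs_enospace(ip, bytes, cred)) {
 *	case 2:
 *		error = ENOSPC;		(hard failure)
 *		break;
 *	case 1:
 *		(low space - the caller may throttle or warn)
 *		break;
 *	default:
 *		break;
 *	}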
2942 */ 2943 int 2944 hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred) 2945 { 2946 hammer2_pfs_t *pmp; 2947 hammer2_dev_t *hmp; 2948 hammer2_off_t free_reserved; 2949 hammer2_off_t free_nominal; 2950 int i; 2951 2952 pmp = ip->pmp; 2953 2954 if (pmp->free_ticks == 0 || pmp->free_ticks != ticks) { 2955 free_reserved = HAMMER2_SEGSIZE; 2956 free_nominal = 0x7FFFFFFFFFFFFFFFLLU; 2957 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2958 hmp = pmp->pfs_hmps[i]; 2959 if (hmp == NULL) 2960 continue; 2961 if (pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER && 2962 pmp->pfs_types[i] != HAMMER2_PFSTYPE_SOFT_MASTER) 2963 continue; 2964 2965 if (free_nominal > hmp->voldata.allocator_free) 2966 free_nominal = hmp->voldata.allocator_free; 2967 if (free_reserved < hmp->free_reserved) 2968 free_reserved = hmp->free_reserved; 2969 } 2970 2971 /* 2972 * SMP races ok 2973 */ 2974 pmp->free_reserved = free_reserved; 2975 pmp->free_nominal = free_nominal; 2976 pmp->free_ticks = ticks; 2977 } else { 2978 free_reserved = pmp->free_reserved; 2979 free_nominal = pmp->free_nominal; 2980 } 2981 if (cred && cred->cr_uid != 0) { 2982 if ((int64_t)(free_nominal - bytes) < 2983 (int64_t)free_reserved) { 2984 return 2; 2985 } 2986 } else { 2987 if ((int64_t)(free_nominal - bytes) < 2988 (int64_t)free_reserved / 2) { 2989 return 2; 2990 } 2991 } 2992 if ((int64_t)(free_nominal - bytes) < (int64_t)free_reserved * 2) 2993 return 1; 2994 return 0; 2995 } 2996 2997 /* 2998 * Debugging 2999 */ 3000 void 3001 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx, 3002 u_int flags) 3003 { 3004 hammer2_chain_t *scan; 3005 hammer2_chain_t *parent; 3006 3007 --*countp; 3008 if (*countp == 0) { 3009 kprintf("%*.*s...\n", tab, tab, ""); 3010 return; 3011 } 3012 if (*countp < 0) 3013 return; 3014 kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n", 3015 tab, tab, "", pfx, 3016 chain, chain->bref.type, 3017 chain->bref.key, chain->bref.keybits, 3018 chain->bref.mirror_tid); 3019 3020 kprintf("%*.*s [%08x] (%s) refs=%d", 3021 tab, tab, "", 3022 chain->flags, 3023 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE && 3024 chain->data) ? (char *)chain->data->ipdata.filename : "?"), 3025 chain->refs); 3026 3027 parent = chain->parent; 3028 if (parent) 3029 kprintf("\n%*.*s p=%p [pflags %08x prefs %d", 3030 tab, tab, "", 3031 parent, parent->flags, parent->refs); 3032 if (RB_EMPTY(&chain->core.rbtree)) { 3033 kprintf("\n"); 3034 } else { 3035 kprintf(" {\n"); 3036 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) { 3037 if ((scan->flags & flags) || flags == (u_int)-1) { 3038 hammer2_dump_chain(scan, tab + 4, countp, 'a', 3039 flags); 3040 } 3041 } 3042 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data) 3043 kprintf("%*.*s}(%s)\n", tab, tab, "", 3044 chain->data->ipdata.filename); 3045 else 3046 kprintf("%*.*s}\n", tab, tab, ""); 3047 } 3048 } 3049
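/*
 * Example of the debugging hook above, mirroring the dumps done during
 * unmount: print up to 50 chains under the volume root, matching any
 * chain flags.
 *
 *	int dumpcnt = 50;
 *
 *	hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
 */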