/*
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/uuid.h>
#include <sys/vfsops.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/objcache.h>

#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>

#include <sys/mutex.h>
#include <sys/mutex2.h>

#include "hammer2.h"
#include "hammer2_disk.h"
#include "hammer2_mount.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

#define REPORT_REFS_ERRORS 1    /* XXX remove me */

MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

struct hammer2_sync_info {
        int error;
        int waitfor;
        int pass;
};

TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
static struct hammer2_mntlist hammer2_mntlist;

struct hammer2_pfslist hammer2_pfslist;
struct hammer2_pfslist hammer2_spmplist;
struct lock hammer2_mntlk;

int hammer2_supported_version = HAMMER2_VOL_VERSION_DEFAULT;
int hammer2_debug;
int hammer2_xopgroups;
long hammer2_debug_inode;
int hammer2_cluster_meta_read = 1;      /* physical read-ahead */
int hammer2_cluster_data_read = 4;      /* physical read-ahead */
int hammer2_cluster_write = 0;          /* physical write clustering */
int hammer2_dedup_enable = 1;
int hammer2_always_compress = 0;        /* always try to compress */
int hammer2_inval_enable = 0;
int hammer2_flush_pipe = 100;
int hammer2_dio_count;
int hammer2_dio_limit = 256;
int hammer2_bulkfree_tps = 5000;
int hammer2_worker_rmask = 3;
long hammer2_chain_allocs;
long hammer2_chain_frees;
long hammer2_limit_dirty_chains;
long hammer2_limit_dirty_inodes;
long hammer2_count_modified_chains;
long hammer2_iod_invals;
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_file_wembed;
long hammer2_iod_file_wzero;
long hammer2_iod_file_wdedup;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;
long hammer2_iod_inode_creates;
long hammer2_iod_inode_deletes;

MALLOC_DECLARE(M_HAMMER2_CBUFFER);
MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer",
                "Buffer used for compression.");

MALLOC_DECLARE(M_HAMMER2_DEBUFFER);
MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer",
                "Buffer used for decompression.");

SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, supported_version, CTLFLAG_RD,
           &hammer2_supported_version, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, debug_inode, CTLFLAG_RW,
           &hammer2_debug_inode, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_meta_read, CTLFLAG_RW,
           &hammer2_cluster_meta_read, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_data_read, CTLFLAG_RW,
           &hammer2_cluster_data_read, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_write, CTLFLAG_RW,
           &hammer2_cluster_write, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW,
           &hammer2_dedup_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW,
           &hammer2_always_compress, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW,
           &hammer2_inval_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, worker_rmask, CTLFLAG_RW,
           &hammer2_worker_rmask, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW,
           &hammer2_bulkfree_tps, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW,
           &hammer2_chain_allocs, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_frees, CTLFLAG_RW,
           &hammer2_chain_frees, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
           &hammer2_limit_dirty_chains, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW,
           &hammer2_limit_dirty_inodes, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW,
           &hammer2_count_modified_chains, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
           &hammer2_dio_count, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW,
           &hammer2_dio_limit, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW,
           &hammer2_iod_invals, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
           &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
           &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wembed, CTLFLAG_RW,
           &hammer2_iod_file_wembed, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wzero, CTLFLAG_RW,
           &hammer2_iod_file_wzero, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wdedup, CTLFLAG_RW,
           &hammer2_iod_file_wdedup, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
           &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_creates, CTLFLAG_RW,
           &hammer2_iod_inode_creates, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_deletes, CTLFLAG_RW,
           &hammer2_iod_inode_deletes, 0, "");

long hammer2_process_icrc32;
long hammer2_process_xxhash64;
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW,
           &hammer2_process_icrc32, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW,
           &hammer2_process_xxhash64, 0, "");
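
/*
 * The knobs above are exported as sysctls under vfs.hammer2.*.
 * Illustrative userland usage (values are examples only):
 *
 *      sysctl vfs.hammer2.dedup_enable=0
 *      sysctl vfs.hammer2.cluster_data_read=8
 *      sysctl vfs.hammer2.debug=1
 */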

static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
                                struct vnode *, struct ucred *);
static int hammer2_recovery(hammer2_dev_t *hmp);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                                struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);
static int hammer2_vfs_modifying(struct mount *mp);

static int hammer2_install_volume_header(hammer2_dev_t *hmp);
#if 0
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
#endif

static void hammer2_update_pmps(hammer2_dev_t *hmp);

static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
                                hammer2_dev_t *hmp);
static int hammer2_fixup_pfses(hammer2_dev_t *hmp);

/*
 * HAMMER2 vfs operations.
 */
static struct vfsops hammer2_vfsops = {
        .vfs_flags      = 0,
        .vfs_init       = hammer2_vfs_init,
        .vfs_uninit     = hammer2_vfs_uninit,
        .vfs_sync       = hammer2_vfs_sync,
        .vfs_mount      = hammer2_vfs_mount,
        .vfs_unmount    = hammer2_vfs_unmount,
        .vfs_root       = hammer2_vfs_root,
        .vfs_statfs     = hammer2_vfs_statfs,
        .vfs_statvfs    = hammer2_vfs_statvfs,
        .vfs_vget       = hammer2_vfs_vget,
        .vfs_vptofh     = hammer2_vfs_vptofh,
        .vfs_fhtovp     = hammer2_vfs_fhtovp,
        .vfs_checkexp   = hammer2_vfs_checkexp,
        .vfs_modifying  = hammer2_vfs_modifying
};

MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");

VFS_SET(hammer2_vfsops, hammer2, VFCF_MPSAFE);
MODULE_VERSION(hammer2, 1);
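
/*
 * VFS_SET() above registers hammer2_vfsops under the filesystem name
 * "hammer2", so a mount request for that type (normally issued via
 * mount_hammer2(8)) dispatches into the entry points in this file.
 */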

static
int
hammer2_vfs_init(struct vfsconf *conf)
{
        static struct objcache_malloc_args margs_read;
        static struct objcache_malloc_args margs_write;
        static struct objcache_malloc_args margs_vop;

        int error;

        error = 0;
        kmalloc_raise_limit(M_HAMMER2, 0);      /* unlimited */

        /*
         * hammer2_xopgroups must be even and is most optimal if
         * 2 x ncpus so strategy functions can be queued to the same
         * cpu.
         */
        hammer2_xopgroups = HAMMER2_XOPGROUPS_MIN;
        if (hammer2_xopgroups < ncpus * 2)
                hammer2_xopgroups = ncpus * 2;

        /*
         * A large DIO cache is needed to retain dedup enablement masks.
         * The bulkfree code clears related masks as part of the disk block
         * recycling algorithm, preventing it from being used for a later
         * dedup.
         *
         * NOTE: A large buffer cache can actually interfere with dedup
         *       operation because we dedup based on media physical buffers
         *       and not logical buffers.  Try to make the DIO case large
         *       enough to avoid this problem, but also cap it.
         */
        hammer2_dio_limit = nbuf * 2;
        if (hammer2_dio_limit > 100000)
                hammer2_dio_limit = 100000;

        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
        if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
                error = EINVAL;
        if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
                error = EINVAL;

        if (error)
                kprintf("HAMMER2 structure size mismatch; cannot continue.\n");

        margs_read.objsize = 65536;
        margs_read.mtype = M_HAMMER2_DEBUFFER;

        margs_write.objsize = 32768;
        margs_write.mtype = M_HAMMER2_CBUFFER;

        margs_vop.objsize = sizeof(hammer2_xop_t);
        margs_vop.mtype = M_HAMMER2;

        /*
         * Note that for the XOPS cache we want backing store allocations
         * to use M_ZERO.  This is not allowed in objcache_get() (to avoid
         * confusion), so use the backing store function that does it.  This
         * means that initial XOPS objects are zeroed but REUSED objects are
         * not.  So we are responsible for cleaning the object up sufficiently
         * for our needs before objcache_put()ing it back (typically just the
         * FIFO indices).
         */
        cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc,
                                objcache_malloc_free,
                                &margs_read);
        cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc,
                                objcache_malloc_free,
                                &margs_write);
        cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc_zero,
                                objcache_malloc_free,
                                &margs_vop);

        lockinit(&hammer2_mntlk, "mntlk", 0, 0);
        TAILQ_INIT(&hammer2_mntlist);
        TAILQ_INIT(&hammer2_pfslist);
        TAILQ_INIT(&hammer2_spmplist);

        hammer2_limit_dirty_chains = maxvnodes / 10;
        if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
                hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;
        if (hammer2_limit_dirty_chains < 1000)
                hammer2_limit_dirty_chains = 1000;

        hammer2_limit_dirty_inodes = maxvnodes / 25;
        if (hammer2_limit_dirty_inodes < 100)
                hammer2_limit_dirty_inodes = 100;
        if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
                hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES;

        return (error);
}

static
int
hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
{
        objcache_destroy(cache_buffer_read);
        objcache_destroy(cache_buffer_write);
        objcache_destroy(cache_xops);
        return 0;
}
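
/*
 * Minimal sketch of the XOPS objcache contract described in
 * hammer2_vfs_init() above (illustration only, not compiled): the
 * backing store zeroes an object only on its first allocation, so a
 * consumer must reset whatever state it dirtied (typically the FIFO
 * indices) before handing the object back to the cache.
 */
#if 0
static void
example_xop_cycle(void)
{
        hammer2_xop_t *xop;

        xop = objcache_get(cache_xops, M_WAITOK); /* zeroed on 1st alloc only */

        /* ... use the XOP ... */

        /*
         * A reused object comes back as-is, so clean up anything the
         * next user depends on before returning it to the cache.
         */
        objcache_put(cache_xops, xop);
}
#endif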

/*
 * Core PFS allocator.  Used to allocate or reference the pmp structure
 * for PFS cluster mounts and the spmp structure for media (hmp) structures.
 * The pmp can be passed in or loaded by this function using the chain and
 * inode data.
 *
 * pmp->modify_tid tracks new modify_tid transaction ids for front-end
 * transactions.  Note that synchronization does not use this field.
 * (typically frontend operations and synchronization cannot run on the
 * same PFS node at the same time).
 *
 * XXX check locking
 */
hammer2_pfs_t *
hammer2_pfsalloc(hammer2_chain_t *chain,
                 const hammer2_inode_data_t *ripdata,
                 hammer2_tid_t modify_tid, hammer2_dev_t *force_local)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *iroot;
        int count;
        int i;
        int j;

        pmp = NULL;

        /*
         * Locate or create the PFS based on the cluster id.  If ripdata
         * is NULL this is a spmp which is unique and is always allocated.
         *
         * If the device is mounted in local mode all PFSs are considered
         * independent and not part of any cluster (for debugging only).
         */
        if (ripdata) {
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        if (force_local != pmp->force_local)
                                continue;
                        if (force_local == NULL &&
                            bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid,
                                 sizeof(pmp->pfs_clid)) == 0) {
                                break;
                        } else if (force_local && pmp->pfs_names[0] &&
                            strcmp(pmp->pfs_names[0], ripdata->filename) == 0) {
                                break;
                        }
                }
        }

        if (pmp == NULL) {
                pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
                pmp->force_local = force_local;
                hammer2_trans_manage_init(pmp);
                kmalloc_create(&pmp->minode, "HAMMER2-inodes");
                kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
                lockinit(&pmp->lock, "pfslk", 0, 0);
                lockinit(&pmp->lock_nlink, "h2nlink", 0, 0);
                spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
                spin_init(&pmp->xop_spin, "h2xop");
                spin_init(&pmp->lru_spin, "h2lru");
                RB_INIT(&pmp->inum_tree);
                TAILQ_INIT(&pmp->syncq);
                TAILQ_INIT(&pmp->depq);
                TAILQ_INIT(&pmp->lru_list);
                spin_init(&pmp->list_spin, "h2pfsalloc_list");

                /*
                 * Save the last media transaction id for the flusher.  Set
                 * initial
                 */
                if (ripdata) {
                        pmp->pfs_clid = ripdata->meta.pfs_clid;
                        TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
                } else {
                        pmp->flags |= HAMMER2_PMPF_SPMP;
                        TAILQ_INSERT_TAIL(&hammer2_spmplist, pmp, mntentry);
                }

                /*
                 * The synchronization thread may start too early, make
                 * sure it stays frozen until we are ready to let it go.
                 * XXX
                 */
                /*
                pmp->primary_thr.flags = HAMMER2_THREAD_FROZEN |
                                         HAMMER2_THREAD_REMASTER;
                */
        }

        /*
         * Create the PFS's root inode and any missing XOP helper threads.
         */
        if ((iroot = pmp->iroot) == NULL) {
                iroot = hammer2_inode_get(pmp, NULL, 1, -1);
                if (ripdata)
                        iroot->meta = ripdata->meta;
                pmp->iroot = iroot;
                hammer2_inode_ref(iroot);
                hammer2_inode_unlock(iroot);
        }

        /*
         * Stop here if no chain is passed in.
         */
        if (chain == NULL)
                goto done;

        /*
         * When a chain is passed in we must add it to the PFS's root
         * inode, update pmp->pfs_types[], and update the synchronization
         * threads.
         *
         * When forcing local mode, mark the PFS as a MASTER regardless.
         *
         * At the moment empty spots can develop due to removals or failures.
         * Ultimately we want to re-fill these spots but doing so might
         * confuse running code.  XXX
         */
        hammer2_inode_ref(iroot);
        hammer2_mtx_ex(&iroot->lock);
        j = iroot->cluster.nchains;

        if (j == HAMMER2_MAXCLUSTER) {
                kprintf("hammer2_mount: cluster full!\n");
                /* XXX fatal error? */
        } else {
                KKASSERT(chain->pmp == NULL);
                chain->pmp = pmp;
                hammer2_chain_ref(chain);
                iroot->cluster.array[j].chain = chain;
                if (force_local)
                        pmp->pfs_types[j] = HAMMER2_PFSTYPE_MASTER;
                else
                        pmp->pfs_types[j] = ripdata->meta.pfs_type;
                pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);
                pmp->pfs_hmps[j] = chain->hmp;
                hammer2_spin_ex(&pmp->inum_spin);
                pmp->pfs_iroot_blocksets[j] = chain->data->ipdata.u.blockset;
                hammer2_spin_unex(&pmp->inum_spin);

                /*
                 * If the PFS is already mounted we must account
                 * for the mount_count here.
                 */
                if (pmp->mp)
                        ++chain->hmp->mount_count;

                /*
                 * May have to fixup dirty chain tracking.  Previous
                 * pmp was NULL so nothing to undo.
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED)
                        hammer2_pfs_memory_inc(pmp);
                ++j;
        }
        iroot->cluster.nchains = j;

        /*
         * Update nmasters from any PFS inode which is part of the cluster.
         * It is possible that this will result in a value which is too
         * high.  MASTER PFSs are authoritative for pfs_nmasters and will
         * override this value later on.
         *
         * (This informs us of masters that might not currently be
         * discoverable by this mount).
         */
        if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) {
                pmp->pfs_nmasters = ripdata->meta.pfs_nmasters;
        }

        /*
         * Count visible masters.  Masters are usually added with
         * ripdata->meta.pfs_nmasters set to 1.  This detects when there
         * are more (XXX and must update the master inodes).
         */
        count = 0;
        for (i = 0; i < iroot->cluster.nchains; ++i) {
                if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)
                        ++count;
        }
        if (pmp->pfs_nmasters < count)
                pmp->pfs_nmasters = count;

        /*
         * Create missing synchronization and support threads.
         *
         * Single-node masters (including snapshots) have nothing to
         * synchronize and do not require this thread.
         *
         * Multi-node masters or any number of soft masters, slaves, copy,
         * or other PFS types need the thread.
         *
         * Each thread is responsible for its particular cluster index.
         * We use independent threads so stalls or mismatches related to
         * any given target do not affect other targets.
         */
        for (i = 0; i < iroot->cluster.nchains; ++i) {
                /*
                 * Single-node masters (including snapshots) have nothing
                 * to synchronize and will make direct xops support calls,
                 * thus they do not require this thread.
                 *
                 * Note that there can be thousands of snapshots.  We do not
                 * want to create thousands of threads.
                 */
                if (pmp->pfs_nmasters <= 1 &&
                    pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) {
                        continue;
                }

                /*
                 * Sync support thread
                 */
                if (pmp->sync_thrs[i].td == NULL) {
                        hammer2_thr_create(&pmp->sync_thrs[i], pmp, NULL,
                                           "h2nod", i, -1,
                                           hammer2_primary_sync_thread);
                }
        }

        /*
         * Create missing Xop threads
         *
         * NOTE: We create helper threads for all mounted PFSs or any
         *       PFSs with 2+ nodes (so the sync thread can update them,
         *       even if not mounted).
         */
        if (pmp->mp || iroot->cluster.nchains >= 2)
                hammer2_xop_helper_create(pmp);

        hammer2_mtx_unlock(&iroot->lock);
        hammer2_inode_drop(iroot);
done:
        return pmp;
}
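
/*
 * Illustrative callers of hammer2_pfsalloc(), both of which appear
 * later in this file: the spmp for a device is created with no chain
 * or ripdata, while probed PFSs pass both.
 *
 *      hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL);
 *      pmp = hammer2_pfsalloc(chain, ripdata, bref.modify_tid, force_local);
 */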

/*
 * Deallocate an element of a probed PFS.  If destroying and this is a
 * MASTER, adjust nmasters.
 *
 * This function does not physically destroy the PFS element in its device
 * under the super-root (see hammer2_ioctl_pfs_delete()).
 */
void
hammer2_pfsdealloc(hammer2_pfs_t *pmp, int clindex, int destroying)
{
        hammer2_inode_t *iroot;
        hammer2_chain_t *chain;
        int j;

        /*
         * Cleanup our reference on iroot.  iroot is (should) not be needed
         * by the flush code.
         */
        iroot = pmp->iroot;
        if (iroot) {
                /*
                 * Stop synchronizing
                 *
                 * XXX flush after acquiring the iroot lock.
                 * XXX clean out the cluster index from all inode structures.
                 */
                hammer2_thr_delete(&pmp->sync_thrs[clindex]);

                /*
                 * Remove the cluster index from the group.  If destroying
                 * the PFS and this is a master, adjust pfs_nmasters.
                 */
                hammer2_mtx_ex(&iroot->lock);
                chain = iroot->cluster.array[clindex].chain;
                iroot->cluster.array[clindex].chain = NULL;

                switch(pmp->pfs_types[clindex]) {
                case HAMMER2_PFSTYPE_MASTER:
                        if (destroying && pmp->pfs_nmasters > 0)
                                --pmp->pfs_nmasters;
                        /* XXX adjust ripdata->meta.pfs_nmasters */
                        break;
                default:
                        break;
                }
                pmp->pfs_types[clindex] = HAMMER2_PFSTYPE_NONE;

                hammer2_mtx_unlock(&iroot->lock);

                /*
                 * Release the chain.
                 */
                if (chain) {
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
                        hammer2_chain_drop(chain);
                }

                /*
                 * Terminate all XOP threads for the cluster index.
                 */
                if (pmp->xop_groups) {
                        for (j = 0; j < hammer2_xopgroups; ++j) {
                                hammer2_thr_delete(
                                        &pmp->xop_groups[j].thrs[clindex]);
                        }
                }
        }
}

/*
 * Destroy a PFS, typically only occurs after the last mount on a device
 * has gone away.
 */
static void
hammer2_pfsfree(hammer2_pfs_t *pmp)
{
        hammer2_inode_t *iroot;
        hammer2_chain_t *chain;
        int chains_still_present = 0;
        int i;
        int j;

        /*
         * Cleanup our reference on iroot.  iroot is (should) not be needed
         * by the flush code.
         */
        if (pmp->flags & HAMMER2_PMPF_SPMP)
                TAILQ_REMOVE(&hammer2_spmplist, pmp, mntentry);
        else
                TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);

        /*
         * Cleanup chains remaining on LRU list.
         */
        hammer2_spin_ex(&pmp->lru_spin);
        while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) {
                KKASSERT(chain->flags & HAMMER2_CHAIN_ONLRU);
                atomic_add_int(&pmp->lru_count, -1);
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
                TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
                hammer2_chain_ref(chain);
                hammer2_spin_unex(&pmp->lru_spin);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
                hammer2_chain_drop(chain);
                hammer2_spin_ex(&pmp->lru_spin);
        }
        hammer2_spin_unex(&pmp->lru_spin);

        /*
         * Clean up iroot
         */
        iroot = pmp->iroot;
        if (iroot) {
                for (i = 0; i < iroot->cluster.nchains; ++i) {
                        hammer2_thr_delete(&pmp->sync_thrs[i]);
                        if (pmp->xop_groups) {
                                for (j = 0; j < hammer2_xopgroups; ++j)
                                        hammer2_thr_delete(
                                                &pmp->xop_groups[j].thrs[i]);
                        }
                        chain = iroot->cluster.array[i].chain;
                        if (chain && !RB_EMPTY(&chain->core.rbtree)) {
                                kprintf("hammer2: Warning pmp %p still "
                                        "has active chains\n", pmp);
                                chains_still_present = 1;
                        }
                }
#if REPORT_REFS_ERRORS
                if (iroot->refs != 1)
                        kprintf("PMP->IROOT %p REFS WRONG %d\n",
                                iroot, iroot->refs);
#else
                KKASSERT(iroot->refs == 1);
#endif
                /* ref for iroot */
                hammer2_inode_drop(iroot);
                pmp->iroot = NULL;
        }

        /*
         * Free remaining pmp resources
         */
        if (chains_still_present) {
                kprintf("hammer2: cannot free pmp %p, still in use\n", pmp);
        } else {
                kmalloc_destroy(&pmp->mmsg);
                kmalloc_destroy(&pmp->minode);
                kfree(pmp, M_HAMMER2);
        }
}
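
/*
 * The LRU teardown in hammer2_pfsfree() above uses a common SMP idiom
 * worth calling out: pin the element with a ref, drop the spinlock
 * around the potentially-blocking release, then re-acquire the lock
 * and rescan from the list head.  A distilled sketch of just that
 * shape (illustration only, not compiled):
 */
#if 0
static void
example_lru_teardown(hammer2_pfs_t *pmp)
{
        hammer2_chain_t *chain;

        hammer2_spin_ex(&pmp->lru_spin);
        while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) {
                TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
                hammer2_chain_ref(chain);               /* pin */
                hammer2_spin_unex(&pmp->lru_spin);      /* drop may block */
                hammer2_chain_drop(chain);
                hammer2_spin_ex(&pmp->lru_spin);        /* rescan from head */
        }
        hammer2_spin_unex(&pmp->lru_spin);
}
#endif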

/*
 * Remove all references to hmp from the pfs list.  Any PFS which becomes
 * empty is terminated and freed.
 *
 * XXX inefficient.
 */
static void
hammer2_pfsfree_scan(hammer2_dev_t *hmp, int which)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *iroot;
        hammer2_chain_t *rchain;
        int i;
        int j;
        struct hammer2_pfslist *wlist;

        if (which == 0)
                wlist = &hammer2_pfslist;
        else
                wlist = &hammer2_spmplist;
again:
        TAILQ_FOREACH(pmp, wlist, mntentry) {
                if ((iroot = pmp->iroot) == NULL)
                        continue;

                /*
                 * Determine if this PFS is affected.  If it is we must
                 * freeze all management threads and lock its iroot.
                 *
                 * Freezing a management thread forces it idle, operations
                 * in-progress will be aborted and it will have to start
                 * over again when unfrozen, or exit if told to exit.
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == hmp)
                                break;
                }
                if (i == HAMMER2_MAXCLUSTER)
                        continue;

                hammer2_vfs_sync_pmp(pmp, MNT_WAIT);

                /*
                 * Make sure all synchronization threads are locked
                 * down.
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == NULL)
                                continue;
                        hammer2_thr_freeze_async(&pmp->sync_thrs[i]);
                        if (pmp->xop_groups) {
                                for (j = 0; j < hammer2_xopgroups; ++j) {
                                        hammer2_thr_freeze_async(
                                                &pmp->xop_groups[j].thrs[i]);
                                }
                        }
                }
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == NULL)
                                continue;
                        hammer2_thr_freeze(&pmp->sync_thrs[i]);
                        if (pmp->xop_groups) {
                                for (j = 0; j < hammer2_xopgroups; ++j) {
                                        hammer2_thr_freeze(
                                                &pmp->xop_groups[j].thrs[i]);
                                }
                        }
                }

                /*
                 * Lock the inode and clean out matching chains.
                 * Note that we cannot use hammer2_inode_lock_*()
                 * here because that would attempt to validate the
                 * cluster that we are in the middle of ripping
                 * apart.
                 *
                 * WARNING! We are working directly on the inode's
                 *          embedded cluster.
                 */
                hammer2_mtx_ex(&iroot->lock);

                /*
                 * Remove the chain from matching elements of the PFS.
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] != hmp)
                                continue;
                        hammer2_thr_delete(&pmp->sync_thrs[i]);
                        if (pmp->xop_groups) {
                                for (j = 0; j < hammer2_xopgroups; ++j) {
                                        hammer2_thr_delete(
                                                &pmp->xop_groups[j].thrs[i]);
                                }
                        }
                        rchain = iroot->cluster.array[i].chain;
                        iroot->cluster.array[i].chain = NULL;
                        pmp->pfs_types[i] = 0;
                        if (pmp->pfs_names[i]) {
                                kfree(pmp->pfs_names[i], M_HAMMER2);
                                pmp->pfs_names[i] = NULL;
                        }
                        if (rchain) {
                                hammer2_chain_drop(rchain);
                                /* focus hint */
                                if (iroot->cluster.focus == rchain)
                                        iroot->cluster.focus = NULL;
                        }
                        pmp->pfs_hmps[i] = NULL;
                }
                hammer2_mtx_unlock(&iroot->lock);

                /*
                 * Cleanup trailing chains.  Gaps may remain.
                 */
                for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) {
                        if (pmp->pfs_hmps[i])
                                break;
                }
                iroot->cluster.nchains = i + 1;

                /*
                 * If the PMP has no elements remaining we can destroy it.
                 * (this will transition management threads from frozen->exit).
                 */
                if (iroot->cluster.nchains == 0) {
                        /*
                         * If this was the hmp's spmp, we need to clean
                         * a little more stuff out.
                         */
                        if (hmp->spmp == pmp) {
                                hmp->spmp = NULL;
                                hmp->vchain.pmp = NULL;
                                hmp->fchain.pmp = NULL;
                        }

                        /*
                         * Free the pmp and restart the loop
                         */
                        KKASSERT(TAILQ_EMPTY(&pmp->syncq));
                        KKASSERT(TAILQ_EMPTY(&pmp->depq));
                        hammer2_pfsfree(pmp);
                        goto again;
                }

                /*
                 * If elements still remain we need to set the REMASTER
                 * flag and unfreeze it.
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == NULL)
                                continue;
                        hammer2_thr_remaster(&pmp->sync_thrs[i]);
                        hammer2_thr_unfreeze(&pmp->sync_thrs[i]);
                        if (pmp->xop_groups) {
                                for (j = 0; j < hammer2_xopgroups; ++j) {
                                        hammer2_thr_remaster(
                                                &pmp->xop_groups[j].thrs[i]);
                                        hammer2_thr_unfreeze(
                                                &pmp->xop_groups[j].thrs[i]);
                                }
                        }
                }
        }
}
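
/*
 * Note that the scan above follows a freeze -> modify -> remaster /
 * unfreeze protocol: the per-cluster-index threads are frozen in two
 * passes (asynchronous, then synchronous), the embedded cluster is
 * edited under the iroot lock, and any surviving threads are told to
 * remaster before being unfrozen.
 */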

/*
 * Mount or remount HAMMER2 filesystem from physical media
 *
 *      mountroot
 *              mp              mount point structure
 *              path            NULL
 *              data            <unused>
 *              cred            <unused>
 *
 *      mount
 *              mp              mount point structure
 *              path            path to mount point
 *              data            pointer to argument structure in user space
 *                      volume  volume path (device@LABEL form)
 *                      hflags  user mount flags
 *              cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfs_t *pmp;
        hammer2_pfs_t *spmp;
        hammer2_dev_t *hmp;
        hammer2_dev_t *force_local;
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        const hammer2_inode_data_t *ripdata;
        hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;
        char *label;
        int ronly = 1;
        int error;
        int i;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;

        if (path == NULL) {
                /*
                 * Root mount
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                ksnprintf(devstr, sizeof(devstr), "%s",
                          mp->mnt_stat.f_mntfromname);
                kprintf("hammer2_mount: root '%s'\n", devstr);
                done = strlen(devstr) + 1;
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);
                kprintf("hammer2_mount: '%s'\n", devstr);
        }

        /*
         * Extract device and label, automatically mount @BOOT, @ROOT, or
         * @DATA if no label is specified, based on the partition id.  Error
         * out if no label or device (with partition id) is specified.  This
         * is strictly a convenience to match the default label created by
         * newfs_hammer2; our preference is that a label always be specified.
         *
         * NOTE: We allow 'mount @LABEL <blah>'... that is, a mount command
         *       that does not specify a device, as long as some H2 label
         *       has already been mounted from that device.  This makes
         *       mounting snapshots a lot easier.
         */
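        /*
         * Illustrative invocations (a sketch; device and label names are
         * examples only):
         *
         *      mount_hammer2 /dev/ad0s1a@BOOT /boot
         *      mount_hammer2 /dev/ad0s1d /mnt         (defaults to @DATA)
         *      mount_hammer2 @SNAP1 /mnt/snap         (device inferred from
         *                                              a prior mount)
         */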
        dev = devstr;
        label = strchr(devstr, '@');
        if (label && ((label + 1) - dev) > done) {
                kprintf("hammer2: mount: bad label %s/%zd\n",
                        devstr, done);
                return (EINVAL);
        }
        if (label == NULL || label[1] == 0) {
                char slice;

                if (label == NULL)
                        label = devstr + strlen(devstr);
                else
                        *label = '\0';          /* clean up trailing @ */

                slice = label[-1];
                switch(slice) {
                case 'a':
                        label = "BOOT";
                        break;
                case 'd':
                        label = "ROOT";
                        break;
                default:
                        label = "DATA";
                        break;
                }
        } else {
                *label = '\0';
                label++;
        }

        kprintf("hammer2_mount: dev=\"%s\" label=\"%s\" rdonly=%d\n",
                dev, label, (mp->mnt_flag & MNT_RDONLY));

        if (mp->mnt_flag & MNT_UPDATE) {
                /*
                 * Update mount.  Note that pmp->iroot->cluster is
                 * an inode-embedded cluster and thus cannot be
                 * directly locked.
                 *
                 * XXX HAMMER2 needs to implement NFS export via
                 *     mountctl.
                 */
                hammer2_cluster_t *cluster;

                pmp = MPTOPMP(mp);
                pmp->hflags = info.hflags;
                cluster = &pmp->iroot->cluster;
                for (i = 0; i < cluster->nchains; ++i) {
                        if (cluster->array[i].chain == NULL)
                                continue;
                        hmp = cluster->array[i].chain->hmp;
                        devvp = hmp->devvp;
                        error = hammer2_remount(hmp, mp, path,
                                                devvp, cred);
                        if (error)
                                break;
                }

                return error;
        }

        /*
         * HMP device mount
         *
         * If a path is specified and dev is not an empty string, look up
         * the name and verify that it refers to a block device.
         *
         * If a path is specified and dev is an empty string we fall through
         * and locate the label in the hmp search.
         */
        if (path && *dev != 0) {
                error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
                if (error == 0)
                        error = nlookup(&nd);
                if (error == 0)
                        error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
                nlookup_done(&nd);
        } else if (path == NULL) {
                /* root mount */
                cdev_t cdev = kgetdiskbyname(dev);
                error = bdevvp(cdev, &devvp);
                if (error)
                        kprintf("hammer2: cannot find '%s'\n", dev);
        } else {
                /*
                 * We will locate the hmp using the label in the hmp loop.
                 */
                error = 0;
        }

        /*
         * Make sure it's a block device.  Do not check to see if it is
         * already mounted until we determine that it's a fresh H2 device.
         */
        if (error == 0 && devvp) {
                vn_isdisk(devvp, &error);
        }

        /*
         * Determine if the device has already been mounted.  After this
         * check hmp will be non-NULL if we are doing the second or more
         * hammer2 mounts from the same device.
         */
        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
        if (devvp) {
                /*
                 * Match the device.  Due to the way devfs works,
                 * we may not be able to directly match the vnode pointer,
                 * so also check to see if the underlying device matches.
                 */
                TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                        if (hmp->devvp == devvp)
                                break;
                        if (devvp->v_rdev &&
                            hmp->devvp->v_rdev == devvp->v_rdev) {
                                break;
                        }
                }

                /*
                 * If no match this may be a fresh H2 mount, make sure
                 * the device is not mounted on anything else.
                 */
                if (hmp == NULL)
                        error = vfs_mountedon(devvp);
        } else if (error == 0) {
                /*
                 * Match the label to a pmp already probed.
                 */
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                                if (pmp->pfs_names[i] &&
                                    strcmp(pmp->pfs_names[i], label) == 0) {
                                        hmp = pmp->pfs_hmps[i];
                                        break;
                                }
                        }
                        if (hmp)
                                break;
                }
                if (hmp == NULL)
                        error = ENOENT;
        }

        /*
         * Open the device if this isn't a secondary mount and construct
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
                hammer2_chain_t *schain;
                hammer2_xid_t xid;
                hammer2_xop_head_t xop;

                if (error == 0 && vcount(devvp) > 0) {
                        kprintf("Primary device already has references\n");
                        error = EBUSY;
                }

                /*
                 * Now open the device
                 */
                if (error == 0) {
                        ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        error = vinvalbuf(devvp, V_SAVE, 0, 0);
                        if (error == 0) {
                                error = VOP_OPEN(devvp,
                                             (ronly ? FREAD : FREAD | FWRITE),
                                             FSCRED, NULL);
                        }
                        vn_unlock(devvp);
                }
                if (error && devvp) {
                        vrele(devvp);
                        devvp = NULL;
                }
                if (error) {
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        return error;
                }
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
                hmp->hflags = info.hflags & HMNT2_DEVFLAGS;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
                spin_init(&hmp->io_spin, "h2mount_io");
                spin_init(&hmp->list_spin, "h2mount_list");

                lockinit(&hmp->vollk, "h2vol", 0, 0);
                lockinit(&hmp->bulklk, "h2bulk", 0, 0);
                lockinit(&hmp->bflock, "h2bflk", 0, 0);

                /*
                 * vchain setup.  vchain.data is embedded.
                 * vchain.refs is initialized and will never drop to 0.
                 *
                 * NOTE! voldata is not yet loaded.
                 */
                hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;

                hammer2_chain_core_init(&hmp->vchain);
                /* hmp->vchain.u.xxx is left NULL */

                /*
                 * fchain setup.  fchain.data is embedded.
                 * fchain.refs is initialized and will never drop to 0.
                 *
                 * The data is not used but needs to be initialized to
                 * pass assertion muster.  We use this chain primarily
                 * as a placeholder for the freemap's top-level RBTREE
                 * so it does not interfere with the volume's topology
                 * RBTREE.
                 */
                hmp->fchain.hmp = hmp;
                hmp->fchain.refs = 1;
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

                hammer2_chain_core_init(&hmp->fchain);
                /* hmp->fchain.u.xxx is left NULL */

                /*
                 * Install the volume header and initialize fields from
                 * voldata.
                 */
                error = hammer2_install_volume_header(hmp);
                if (error) {
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return error;
                }

                /*
                 * Really important to get these right or the flush and
                 * teardown code will get confused.
                 */
                hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL);
                spmp = hmp->spmp;
                spmp->pfs_hmps[0] = hmp;

                /*
                 * Dummy-up vchain and fchain's modify_tid.  mirror_tid
                 * is inherited from the volume header.
                 */
                xid = 0;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
                hmp->vchain.pmp = spmp;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
                hmp->fchain.pmp = spmp;

                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
                 *
                 * Then locate the root inode by scanning the directory
                 * keyspace represented by the label.
                 */
                parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
                schain = hammer2_chain_lookup(&parent, &key_dummy,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
                                      &error, 0);
                hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }
                if (schain->error) {
                        kprintf("hammer2_mount: error %s reading super-root\n",
                                hammer2_error_str(schain->error));
                        hammer2_chain_unlock(schain);
                        hammer2_chain_drop(schain);
                        schain = NULL;
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }

                /*
                 * The super-root always uses an inode_tid of 1 when
                 * creating PFSs.
                 */
                spmp->inode_tid = 1;
                spmp->modify_tid = schain->bref.modify_tid + 1;

                /*
                 * Sanity-check schain's pmp and finish initialization.
                 * Any chain belonging to the super-root topology should
                 * have a NULL pmp (not even set to spmp).
                 */
                ripdata = &hammer2_chain_rdata(schain)->ipdata;
                KKASSERT(schain->pmp == NULL);
                spmp->pfs_clid = ripdata->meta.pfs_clid;

                /*
                 * Replace the dummy spmp->iroot with a real one.  It's
                 * easier to just do a wholesale replacement than to try
                 * to update the chain and fixup the iroot fields.
                 *
                 * The returned inode is locked with the supplied cluster.
                 */
                hammer2_dummy_xop_from_chain(&xop, schain);
                hammer2_inode_drop(spmp->iroot);
                spmp->iroot = NULL;
                spmp->iroot = hammer2_inode_get(spmp, &xop, -1, -1);
                spmp->spmp_hmp = hmp;
                spmp->pfs_types[0] = ripdata->meta.pfs_type;
                spmp->pfs_hmps[0] = hmp;
                hammer2_inode_ref(spmp->iroot);
                hammer2_inode_unlock(spmp->iroot);
                hammer2_cluster_unlock(&xop.cluster);
                hammer2_chain_drop(schain);
                /* do not call hammer2_cluster_drop() on an embedded cluster */
                schain = NULL;  /* now invalid */
                /* leave spmp->iroot with one ref */

                if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                        error = hammer2_recovery(hmp);
                        if (error == 0)
                                error |= hammer2_fixup_pfses(hmp);
                        /* XXX do something with error */
                }
                hammer2_update_pmps(hmp);
                hammer2_iocom_init(hmp);
                hammer2_bulkfree_init(hmp);

                /*
                 * Ref the cluster management messaging descriptor.  The mount
                 * program deals with the other end of the communications pipe.
                 *
                 * Root mounts typically do not supply one.
                 */
                if (info.cluster_fd >= 0) {
                        fp = holdfp(curthread, info.cluster_fd, -1);
                        if (fp) {
                                hammer2_cluster_reconnect(hmp, fp);
                        } else {
                                kprintf("hammer2_mount: bad cluster_fd!\n");
                        }
                }
        } else {
                spmp = hmp->spmp;
                if (info.hflags & HMNT2_DEVFLAGS) {
                        kprintf("hammer2: Warning: mount flags pertaining "
                                "to the whole device may only be specified "
                                "on the first mount of the device: %08x\n",
                                info.hflags & HMNT2_DEVFLAGS);
                }
        }

        /*
         * Force local mount (disassociate all PFSs from their clusters).
         * Used primarily for debugging.
         */
        force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;

        /*
         * Lookup the mount point under the media-localized super-root.
         * Scanning hammer2_pfslist doesn't help us because it represents
         * PFS cluster ids which can aggregate several named PFSs together.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        hammer2_inode_lock(spmp->iroot, 0);
        parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
        lhc = hammer2_dirhash(label, strlen(label));
        chain = hammer2_chain_lookup(&parent, &key_next,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     &error, 0);
        while (chain) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label, chain->data->ipdata.filename) == 0) {
                        break;
                }
                chain = hammer2_chain_next(&parent, chain, &key_next,
                                           key_next,
                                           lhc + HAMMER2_DIRHASH_LOMASK,
                                           &error, 0);
        }
        if (parent) {
                hammer2_chain_unlock(parent);
                hammer2_chain_drop(parent);
        }
        hammer2_inode_unlock(spmp->iroot);

        /*
         * PFS could not be found?
         */
        if (chain == NULL) {
                if (error)
                        kprintf("hammer2_mount: PFS label I/O error\n");
                else
                        kprintf("hammer2_mount: PFS label not found\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EINVAL;
        }

        /*
         * Acquire the pmp structure (it should have already been allocated
         * via hammer2_update_pmps() so do not pass cluster in to add to
         * available chains).
         *
         * Check if the cluster has already been mounted.  A cluster can
         * only be mounted once, use null mounts to mount additional copies.
         */
        if (chain->error) {
                kprintf("hammer2_mount: PFS label I/O error\n");
        } else {
                ripdata = &chain->data->ipdata;
                bref = chain->bref;
                pmp = hammer2_pfsalloc(NULL, ripdata,
                                       bref.modify_tid, force_local);
        }
        hammer2_chain_unlock(chain);
        hammer2_chain_drop(chain);

        /*
         * Finish the mount
         */
        kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);

        if (pmp->mp) {
                kprintf("hammer2_mount: PFS already mounted!\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EBUSY;
        }

        pmp->hflags = info.hflags;
        mp->mnt_flag |= MNT_LOCAL;
        mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
        mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */

        /*
         * required mount structure initializations
         */
        mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
        mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;

        mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
        mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;

        /*
         * Optional fields
         */
        mp->mnt_iosize_max = MAXPHYS;

        /*
         * Connect up mount pointers.
         */
        hammer2_mount_helper(mp, pmp);

        lockmgr(&hammer2_mntlk, LK_RELEASE);

        /*
         * Finish setup
         */
        vfs_getnewfsid(mp);
        vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
        vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
        vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);

        if (path) {
                copyinstr(info.volume, mp->mnt_stat.f_mntfromname,
                          MNAMELEN - 1, &size);
                bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
        } /* else root mount, already in there */

        bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
        if (path) {
                copyinstr(path, mp->mnt_stat.f_mntonname,
                          sizeof(mp->mnt_stat.f_mntonname) - 1,
                          &size);
        } else {
                /* root mount */
                mp->mnt_stat.f_mntonname[0] = '/';
        }

        /*
         * Initial statfs to prime mnt_stat.
         */
        hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);

        return 0;
}

/*
 * Scan PFSs under the super-root and create hammer2_pfs structures.
 */
static
void
hammer2_update_pmps(hammer2_dev_t *hmp)
{
        const hammer2_inode_data_t *ripdata;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_blockref_t bref;
        hammer2_dev_t *force_local;
        hammer2_pfs_t *spmp;
        hammer2_pfs_t *pmp;
        hammer2_key_t key_next;
        int error;

        /*
         * Force local mount (disassociate all PFSs from their clusters).
         * Used primarily for debugging.
         */
        force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;

        /*
         * Lookup mount point under the media-localized super-root.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        spmp = hmp->spmp;
        hammer2_inode_lock(spmp->iroot, 0);
        parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
        chain = hammer2_chain_lookup(&parent, &key_next,
                                     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
                                     &error, 0);
        while (chain) {
                /*
                 * Only inode chains are PFSs; anything else still has to
                 * advance the iterator below or the loop would never
                 * terminate.
                 */
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        if (chain->error) {
                                kprintf("I/O error scanning PFS labels\n");
                        } else {
                                ripdata = &chain->data->ipdata;
                                bref = chain->bref;

                                pmp = hammer2_pfsalloc(chain, ripdata,
                                                       bref.modify_tid,
                                                       force_local);
                        }
                }
                chain = hammer2_chain_next(&parent, chain, &key_next,
                                           key_next, HAMMER2_KEY_MAX,
                                           &error, 0);
        }
        if (parent) {
                hammer2_chain_unlock(parent);
                hammer2_chain_drop(parent);
        }
        hammer2_inode_unlock(spmp->iroot);
}

static
int
hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path __unused,
                struct vnode *devvp, struct ucred *cred)
{
        int error;

        if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
                vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, NULL);
                vn_unlock(devvp);
                error = hammer2_recovery(hmp);
                if (error == 0)
                        error |= hammer2_fixup_pfses(hmp);
                vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                if (error == 0) {
                        VOP_CLOSE(devvp, FREAD, NULL);
                        hmp->ronly = 0;
                } else {
                        VOP_CLOSE(devvp, FREAD | FWRITE, NULL);
                }
                vn_unlock(devvp);
        } else {
                error = 0;
        }
        return error;
}
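
/*
 * Note for hammer2_remount() above: a read-only -> read-write upgrade
 * is requested from userland via an update mount, e.g. (illustrative):
 *
 *      mount -u -o rw /mnt
 *
 * which results in MNTK_WANTRDWR being set for the update, triggering
 * the reopen-and-recover path.
 */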

static
int
hammer2_vfs_unmount(struct mount *mp, int mntflags)
{
        hammer2_pfs_t *pmp;
        int flags;
        int error = 0;

        pmp = MPTOPMP(mp);

        if (pmp == NULL)
                return(0);

        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);

        /*
         * If mount initialization proceeded far enough we must flush
         * its vnodes and sync the underlying mount points.  Three syncs
         * are required to fully flush the filesystem (freemap updates lag
         * by one flush, and one extra for safety).
         */
        if (mntflags & MNT_FORCE)
                flags = FORCECLOSE;
        else
                flags = 0;
        if (pmp->iroot) {
                error = vflush(mp, 0, flags);
                if (error)
                        goto failed;
                hammer2_vfs_sync(mp, MNT_WAIT);
                hammer2_vfs_sync(mp, MNT_WAIT);
                hammer2_vfs_sync(mp, MNT_WAIT);
        }

        /*
         * Cleanup the frontend support XOPS threads
         */
        hammer2_xop_helper_cleanup(pmp);

        if (pmp->mp)
                hammer2_unmount_helper(mp, pmp, NULL);

        error = 0;
failed:
        lockmgr(&hammer2_mntlk, LK_RELEASE);

        return (error);
}

/*
 * Mount helper, hook the system mount into our PFS.
 * The mount lock is held.
 *
 * We must bump the mount_count on related devices for any
 * mounted PFSs.
 */
static
void
hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
{
        hammer2_cluster_t *cluster;
        hammer2_chain_t *rchain;
        int i;

        mp->mnt_data = (qaddr_t)pmp;
        pmp->mp = mp;

        /*
         * After pmp->mp is set we have to adjust hmp->mount_count.
         */
        cluster = &pmp->iroot->cluster;
        for (i = 0; i < cluster->nchains; ++i) {
                rchain = cluster->array[i].chain;
                if (rchain == NULL)
                        continue;
                ++rchain->hmp->mount_count;
        }

        /*
         * Create missing Xop threads
         */
        hammer2_xop_helper_create(pmp);
}

/*
 * Mount helper, unhook the system mount from our PFS.
 * The mount lock is held.
 *
 * If hmp is supplied a mount responsible for being the first to open
 * the block device failed and the block device and all PFSs using the
 * block device must be cleaned up.
 *
 * If pmp is supplied multiple devices might be backing the PFS and each
 * must be disconnected.  This might not be the last PFS using some of the
 * underlying devices.  Also, we have to adjust our hmp->mount_count
 * accounting for the devices backing the pmp which is now undergoing an
 * unmount.
 */
static
void
hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
{
        hammer2_cluster_t *cluster;
        hammer2_chain_t *rchain;
        struct vnode *devvp;
        int dumpcnt;
        int ronly;
        int i;

        /*
         * If no device supplied this is a high-level unmount and we have to
         * disconnect the mount, adjust mount_count, and locate devices that
         * might now have no mounts.
         */
        if (pmp) {
                KKASSERT(hmp == NULL);
                KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
                pmp->mp = NULL;
                mp->mnt_data = NULL;

                /*
                 * After pmp->mp is cleared we have to account for
                 * mount_count.
                 */
                cluster = &pmp->iroot->cluster;
                for (i = 0; i < cluster->nchains; ++i) {
                        rchain = cluster->array[i].chain;
                        if (rchain == NULL)
                                continue;
                        --rchain->hmp->mount_count;
                        /* scrapping hmp now may invalidate the pmp */
                }
again:
                TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                        if (hmp->mount_count == 0) {
                                hammer2_unmount_helper(NULL, NULL, hmp);
                                goto again;
                        }
                }
                return;
        }

        /*
         * Try to terminate the block device.  We can't terminate it if
         * there are still PFSs referencing it.
         */
        if (hmp->mount_count)
                return;

        /*
         * Decommission the network before we start messing with the
         * device and PFS.
         */
        hammer2_iocom_uninit(hmp);

        hammer2_bulkfree_uninit(hmp);
        hammer2_pfsfree_scan(hmp, 0);
#if 0
        hammer2_dev_exlock(hmp);        /* XXX order */
#endif

        /*
         * Cycle the volume data lock as a safety (probably not needed any
         * more).  To ensure everything is out we need to flush at least
         * three times.  (1) The running of the sideq can dirty the
         * filesystem, (2) A normal flush can dirty the freemap, and
         * (3) ensure that the freemap is fully synchronized.
         *
         * The next mount's recovery scan can clean everything up but we want
         * to leave the filesystem in a 100% clean state on a normal unmount.
         */
#if 0
        hammer2_voldata_lock(hmp);
        hammer2_voldata_unlock(hmp);
#endif

        /*
         * Flush whatever is left.  Unmounted but modified PFS's might still
         * have some dirty chains on them.
         */
        hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
        hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);

        if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
                hammer2_voldata_modify(hmp);
                hammer2_flush(&hmp->fchain, HAMMER2_FLUSH_TOP |
                                            HAMMER2_FLUSH_ALL);
        }
        hammer2_chain_unlock(&hmp->fchain);

        if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
                hammer2_flush(&hmp->vchain, HAMMER2_FLUSH_TOP |
                                            HAMMER2_FLUSH_ALL);
        }
        hammer2_chain_unlock(&hmp->vchain);

        if ((hmp->vchain.flags | hmp->fchain.flags) &
            HAMMER2_CHAIN_FLUSH_MASK) {
                kprintf("hammer2_unmount: chains left over "
                        "after final sync\n");
                kprintf("    vchain %08x\n", hmp->vchain.flags);
                kprintf("    fchain %08x\n", hmp->fchain.flags);

                if (hammer2_debug & 0x0010)
                        Debugger("entered debugger");
        }

        hammer2_pfsfree_scan(hmp, 1);

        KKASSERT(hmp->spmp == NULL);

        /*
         * Finish up with the device vnode
         */
        if ((devvp = hmp->devvp) != NULL) {
                ronly = hmp->ronly;
                vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                kprintf("hammer2_unmount(A): devvp %s rbdirty %p ronly=%d\n",
                        hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree),
                        ronly);
                vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
                kprintf("hammer2_unmount(B): devvp %s rbdirty %p\n",
                        hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree));
                hmp->devvp = NULL;
                VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
                vn_unlock(devvp);
                vrele(devvp);
                devvp = NULL;
        }

        /*
         * Clear vchain/fchain flags that might prevent final cleanup
         * of these chains.
         */
        if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
                atomic_add_long(&hammer2_count_modified_chains, -1);
                atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_pfs_memory_wakeup(hmp->vchain.pmp, -1);
        }
        if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
                atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_UPDATE);
        }

        if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
                atomic_add_long(&hammer2_count_modified_chains, -1);
                atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_pfs_memory_wakeup(hmp->fchain.pmp, -1);
        }
        if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
                atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_UPDATE);
        }

        /*
         * Final drop of embedded freemap root chain to
         * clean up fchain.core (fchain structure is not
         * flagged ALLOCATED so it is cleaned out and then
         * left to rot).
         */
        hammer2_chain_drop(&hmp->fchain);

        /*
         * Final drop of embedded volume root chain to clean
         * up vchain.core (vchain structure is not flagged
         * ALLOCATED so it is cleaned out and then left to
         * rot).
         */
        dumpcnt = 50;
        hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
        dumpcnt = 50;
        hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1);
#if 0
        hammer2_dev_unlock(hmp);
#endif
        hammer2_chain_drop(&hmp->vchain);

        hammer2_io_cleanup(hmp, &hmp->iotree);
        if (hmp->iofree_count) {
                kprintf("io_cleanup: %d I/O's left hanging\n",
                        hmp->iofree_count);
        }

        TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
        kmalloc_destroy(&hmp->mchain);
        kfree(hmp, M_HAMMER2);
}

int
hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                 ino_t ino, struct vnode **vpp)
{
        hammer2_xop_lookup_t *xop;
        hammer2_pfs_t *pmp;
        hammer2_inode_t *ip;
        hammer2_tid_t inum;
        int error;

        inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK;

        error = 0;
        pmp = MPTOPMP(mp);

        /*
         * Easy if we already have it cached
         */
        ip = hammer2_inode_lookup(pmp, inum);
        if (ip) {
                hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
                *vpp = hammer2_igetv(ip, &error);
                hammer2_inode_unlock(ip);
                hammer2_inode_drop(ip); /* from lookup */

                return error;
        }

        /*
         * Otherwise we have to find the inode
         */
        xop = hammer2_xop_alloc(pmp->iroot, 0);
        xop->lhc = inum;
        hammer2_xop_start(&xop->head, &hammer2_lookup_desc);
        error = hammer2_xop_collect(&xop->head, 0);

        if (error == 0)
                ip = hammer2_inode_get(pmp, &xop->head, -1, -1);
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);

        if (ip) {
                *vpp = hammer2_igetv(ip, &error);
                hammer2_inode_unlock(ip);
        } else {
                *vpp = NULL;
                error = ENOENT;
        }
        return (error);
}
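
/*
 * Note for hammer2_vfs_vget() above: directory keys carry hash/flag
 * bits in the high portion of the 64-bit key, so the incoming ino is
 * masked with HAMMER2_DIRHASH_USERMSK to strip them and recover the
 * inode number proper before the inum lookup.
 */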
1884 */ 1885 dumpcnt = 50; 1886 hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1); 1887 dumpcnt = 50; 1888 hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1); 1889 #if 0 1890 hammer2_dev_unlock(hmp); 1891 #endif 1892 hammer2_chain_drop(&hmp->vchain); 1893 1894 hammer2_io_cleanup(hmp, &hmp->iotree); 1895 if (hmp->iofree_count) { 1896 kprintf("io_cleanup: %d I/O's left hanging\n", 1897 hmp->iofree_count); 1898 } 1899 1900 TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry); 1901 kmalloc_destroy(&hmp->mchain); 1902 kfree(hmp, M_HAMMER2); 1903 } 1904 1905 int 1906 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp, 1907 ino_t ino, struct vnode **vpp) 1908 { 1909 hammer2_xop_lookup_t *xop; 1910 hammer2_pfs_t *pmp; 1911 hammer2_inode_t *ip; 1912 hammer2_tid_t inum; 1913 int error; 1914 1915 inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK; 1916 1917 error = 0; 1918 pmp = MPTOPMP(mp); 1919 1920 /* 1921 * Easy if we already have it cached 1922 */ 1923 ip = hammer2_inode_lookup(pmp, inum); 1924 if (ip) { 1925 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 1926 *vpp = hammer2_igetv(ip, &error); 1927 hammer2_inode_unlock(ip); 1928 hammer2_inode_drop(ip); /* from lookup */ 1929 1930 return error; 1931 } 1932 1933 /* 1934 * Otherwise we have to find the inode 1935 */ 1936 xop = hammer2_xop_alloc(pmp->iroot, 0); 1937 xop->lhc = inum; 1938 hammer2_xop_start(&xop->head, &hammer2_lookup_desc); 1939 error = hammer2_xop_collect(&xop->head, 0); 1940 1941 if (error == 0) 1942 ip = hammer2_inode_get(pmp, &xop->head, -1, -1); 1943 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1944 1945 if (ip) { 1946 *vpp = hammer2_igetv(ip, &error); 1947 hammer2_inode_unlock(ip); 1948 } else { 1949 *vpp = NULL; 1950 error = ENOENT; 1951 } 1952 return (error); 1953 } 1954 1955 static 1956 int 1957 hammer2_vfs_root(struct mount *mp, struct vnode **vpp) 1958 { 1959 hammer2_pfs_t *pmp; 1960 struct vnode *vp; 1961 int error; 1962 1963 pmp = MPTOPMP(mp); 1964 if (pmp->iroot == NULL) { 1965 kprintf("hammer2 (%s): no root inode\n", 1966 mp->mnt_stat.f_mntfromname); 1967 *vpp = NULL; 1968 return EINVAL; 1969 } 1970 1971 error = 0; 1972 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 1973 1974 while (pmp->inode_tid == 0) { 1975 hammer2_xop_ipcluster_t *xop; 1976 const hammer2_inode_meta_t *meta; 1977 1978 xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING); 1979 hammer2_xop_start(&xop->head, &hammer2_ipcluster_desc); 1980 error = hammer2_xop_collect(&xop->head, 0); 1981 1982 if (error == 0) { 1983 meta = &hammer2_xop_gdata(&xop->head)->ipdata.meta; 1984 pmp->iroot->meta = *meta; 1985 pmp->inode_tid = meta->pfs_inum + 1; 1986 hammer2_xop_pdata(&xop->head); 1987 /* meta invalid */ 1988 1989 if (pmp->inode_tid < HAMMER2_INODE_START) 1990 pmp->inode_tid = HAMMER2_INODE_START; 1991 pmp->modify_tid = 1992 xop->head.cluster.focus->bref.modify_tid + 1; 1993 #if 0 1994 kprintf("PFS: Starting inode %jd\n", 1995 (intmax_t)pmp->inode_tid); 1996 kprintf("PMP focus good set nextino=%ld mod=%016jx\n", 1997 pmp->inode_tid, pmp->modify_tid); 1998 #endif 1999 wakeup(&pmp->iroot); 2000 2001 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2002 2003 /* 2004 * Prime the mount info. 
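			 *
			 * (hammer2_vfs_statfs() below fills mp->mnt_stat
			 * from the now-validated volume data.)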
2005 */ 2006 hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL); 2007 break; 2008 } 2009 2010 /* 2011 * Loop, try again 2012 */ 2013 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2014 hammer2_inode_unlock(pmp->iroot); 2015 error = tsleep(&pmp->iroot, PCATCH, "h2root", hz); 2016 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 2017 if (error == EINTR) 2018 break; 2019 } 2020 2021 if (error) { 2022 hammer2_inode_unlock(pmp->iroot); 2023 *vpp = NULL; 2024 } else { 2025 vp = hammer2_igetv(pmp->iroot, &error); 2026 hammer2_inode_unlock(pmp->iroot); 2027 *vpp = vp; 2028 } 2029 2030 return (error); 2031 } 2032 2033 /* 2034 * Filesystem status 2035 * 2036 * XXX incorporate ipdata->meta.inode_quota and data_quota 2037 */ 2038 static 2039 int 2040 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) 2041 { 2042 hammer2_pfs_t *pmp; 2043 hammer2_dev_t *hmp; 2044 hammer2_blockref_t bref; 2045 struct statfs tmp; 2046 int i; 2047 2048 /* 2049 * NOTE: iroot might not have validated the cluster yet. 2050 */ 2051 pmp = MPTOPMP(mp); 2052 2053 bzero(&tmp, sizeof(tmp)); 2054 2055 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2056 hmp = pmp->pfs_hmps[i]; 2057 if (hmp == NULL) 2058 continue; 2059 if (pmp->iroot->cluster.array[i].chain) 2060 bref = pmp->iroot->cluster.array[i].chain->bref; 2061 else 2062 bzero(&bref, sizeof(bref)); 2063 2064 tmp.f_files = bref.embed.stats.inode_count; 2065 tmp.f_ffree = 0; 2066 tmp.f_blocks = hmp->voldata.allocator_size / 2067 mp->mnt_vstat.f_bsize; 2068 tmp.f_bfree = hmp->voldata.allocator_free / 2069 mp->mnt_vstat.f_bsize; 2070 tmp.f_bavail = tmp.f_bfree; 2071 2072 if (cred && cred->cr_uid != 0) { 2073 uint64_t adj; 2074 2075 /* 5% */ 2076 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2077 tmp.f_blocks -= adj; 2078 tmp.f_bfree -= adj; 2079 tmp.f_bavail -= adj; 2080 } 2081 2082 mp->mnt_stat.f_blocks = tmp.f_blocks; 2083 mp->mnt_stat.f_bfree = tmp.f_bfree; 2084 mp->mnt_stat.f_bavail = tmp.f_bavail; 2085 mp->mnt_stat.f_files = tmp.f_files; 2086 mp->mnt_stat.f_ffree = tmp.f_ffree; 2087 2088 *sbp = mp->mnt_stat; 2089 } 2090 return (0); 2091 } 2092 2093 static 2094 int 2095 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) 2096 { 2097 hammer2_pfs_t *pmp; 2098 hammer2_dev_t *hmp; 2099 hammer2_blockref_t bref; 2100 struct statvfs tmp; 2101 int i; 2102 2103 /* 2104 * NOTE: iroot might not have validated the cluster yet. 
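	 *
	 * The per-device arithmetic below mirrors hammer2_vfs_statfs()
	 * above, i.e. roughly:
	 *
	 *	tmp.f_blocks = voldata.allocator_size / f_bsize;
	 *	tmp.f_bfree  = voldata.allocator_free / f_bsize;
	 *
	 * with ~5% (hmp->free_reserved) subtracted for non-root callers.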
 */
	pmp = MPTOPMP(mp);
	bzero(&tmp, sizeof(tmp));

	for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
		hmp = pmp->pfs_hmps[i];
		if (hmp == NULL)
			continue;
		if (pmp->iroot->cluster.array[i].chain)
			bref = pmp->iroot->cluster.array[i].chain->bref;
		else
			bzero(&bref, sizeof(bref));

		tmp.f_files = bref.embed.stats.inode_count;
		tmp.f_ffree = 0;
		tmp.f_blocks = hmp->voldata.allocator_size /
			       mp->mnt_vstat.f_bsize;
		tmp.f_bfree = hmp->voldata.allocator_free /
			      mp->mnt_vstat.f_bsize;
		tmp.f_bavail = tmp.f_bfree;

		if (cred && cred->cr_uid != 0) {
			uint64_t adj;

			/* 5% */
			adj = hmp->free_reserved / mp->mnt_vstat.f_bsize;
			tmp.f_blocks -= adj;
			tmp.f_bfree -= adj;
			tmp.f_bavail -= adj;
		}

		mp->mnt_vstat.f_blocks = tmp.f_blocks;
		mp->mnt_vstat.f_bfree = tmp.f_bfree;
		mp->mnt_vstat.f_bavail = tmp.f_bavail;
		mp->mnt_vstat.f_files = tmp.f_files;
		mp->mnt_vstat.f_ffree = tmp.f_ffree;

		*sbp = mp->mnt_vstat;
	}
	return (0);
}

/*
 * Mount-time recovery (RW mounts)
 *
 * Updates to the free block table are allowed to lag flushes by one
 * transaction.  After a crash, the next fresh mount must do an
 * incremental scan of the last committed transaction id and make sure
 * that all related blocks have been marked allocated.
 *
 * The super-root topology and each PFS have their own transaction id
 * domains, so we must track PFS boundary transitions.
 */
struct hammer2_recovery_elm {
	TAILQ_ENTRY(hammer2_recovery_elm) entry;
	hammer2_chain_t *chain;
	hammer2_tid_t sync_tid;
};

TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);

struct hammer2_recovery_info {
	struct hammer2_recovery_list list;
	hammer2_tid_t mtid;
	int depth;
};

static int hammer2_recovery_scan(hammer2_dev_t *hmp,
			hammer2_chain_t *parent,
			struct hammer2_recovery_info *info,
			hammer2_tid_t sync_tid);

#define HAMMER2_RECOVERY_MAXDEPTH	10

static
int
hammer2_recovery(hammer2_dev_t *hmp)
{
	struct hammer2_recovery_info info;
	struct hammer2_recovery_elm *elm;
	hammer2_chain_t *parent;
	hammer2_tid_t sync_tid;
	hammer2_tid_t mirror_tid;
	int error;

	hammer2_trans_init(hmp->spmp, 0);

	sync_tid = hmp->voldata.freemap_tid;
	mirror_tid = hmp->voldata.mirror_tid;

	kprintf("hammer2 mount \"%s\": ", hmp->devrepname);
	if (sync_tid >= mirror_tid) {
		kprintf(" no recovery needed\n");
	} else {
		kprintf(" freemap recovery %016jx-%016jx\n",
			sync_tid + 1, mirror_tid);
	}

	TAILQ_INIT(&info.list);
	info.depth = 0;
	parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
	error = hammer2_recovery_scan(hmp, parent, &info, sync_tid);
	hammer2_chain_lookup_done(parent);

	while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
		TAILQ_REMOVE(&info.list, elm, entry);
		parent = elm->chain;
		sync_tid = elm->sync_tid;
		kfree(elm, M_HAMMER2);

		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		error |= hammer2_recovery_scan(hmp, parent, &info,
					       hmp->voldata.freemap_tid);
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);	/* drop elm->chain ref */
	}

	hammer2_trans_done(hmp->spmp, 0);

	return error;
}

static
int
hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent, 2230 struct hammer2_recovery_info *info, 2231 hammer2_tid_t sync_tid) 2232 { 2233 const hammer2_inode_data_t *ripdata; 2234 hammer2_chain_t *chain; 2235 hammer2_blockref_t bref; 2236 int tmp_error; 2237 int rup_error; 2238 int error; 2239 int first; 2240 2241 /* 2242 * Adjust freemap to ensure that the block(s) are marked allocated. 2243 */ 2244 if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) { 2245 hammer2_freemap_adjust(hmp, &parent->bref, 2246 HAMMER2_FREEMAP_DORECOVER); 2247 } 2248 2249 /* 2250 * Check type for recursive scan 2251 */ 2252 switch(parent->bref.type) { 2253 case HAMMER2_BREF_TYPE_VOLUME: 2254 /* data already instantiated */ 2255 break; 2256 case HAMMER2_BREF_TYPE_INODE: 2257 /* 2258 * Must instantiate data for DIRECTDATA test and also 2259 * for recursion. 2260 */ 2261 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2262 ripdata = &hammer2_chain_rdata(parent)->ipdata; 2263 if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) { 2264 /* not applicable to recovery scan */ 2265 hammer2_chain_unlock(parent); 2266 return 0; 2267 } 2268 hammer2_chain_unlock(parent); 2269 break; 2270 case HAMMER2_BREF_TYPE_INDIRECT: 2271 /* 2272 * Must instantiate data for recursion 2273 */ 2274 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2275 hammer2_chain_unlock(parent); 2276 break; 2277 case HAMMER2_BREF_TYPE_DIRENT: 2278 case HAMMER2_BREF_TYPE_DATA: 2279 case HAMMER2_BREF_TYPE_FREEMAP: 2280 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2281 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2282 /* not applicable to recovery scan */ 2283 return 0; 2284 break; 2285 default: 2286 return HAMMER2_ERROR_BADBREF; 2287 } 2288 2289 /* 2290 * Defer operation if depth limit reached or if we are crossing a 2291 * PFS boundary. 2292 */ 2293 if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) { 2294 struct hammer2_recovery_elm *elm; 2295 2296 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK); 2297 elm->chain = parent; 2298 elm->sync_tid = sync_tid; 2299 hammer2_chain_ref(parent); 2300 TAILQ_INSERT_TAIL(&info->list, elm, entry); 2301 /* unlocked by caller */ 2302 2303 return(0); 2304 } 2305 2306 2307 /* 2308 * Recursive scan of the last flushed transaction only. We are 2309 * doing this without pmp assignments so don't leave the chains 2310 * hanging around after we are done with them. 2311 * 2312 * error Cumulative error this level only 2313 * rup_error Cumulative error for recursion 2314 * tmp_error Specific non-cumulative recursion error 2315 */ 2316 chain = NULL; 2317 first = 1; 2318 rup_error = 0; 2319 error = 0; 2320 2321 for (;;) { 2322 error |= hammer2_chain_scan(parent, &chain, &bref, 2323 &first, 2324 HAMMER2_LOOKUP_NODATA); 2325 2326 /* 2327 * Problem during scan or EOF 2328 */ 2329 if (error) 2330 break; 2331 2332 /* 2333 * If this is a leaf 2334 */ 2335 if (chain == NULL) { 2336 if (bref.mirror_tid > sync_tid) { 2337 hammer2_freemap_adjust(hmp, &bref, 2338 HAMMER2_FREEMAP_DORECOVER); 2339 } 2340 continue; 2341 } 2342 2343 /* 2344 * This may or may not be a recursive node. 2345 */ 2346 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 2347 if (bref.mirror_tid > sync_tid) { 2348 ++info->depth; 2349 tmp_error = hammer2_recovery_scan(hmp, chain, 2350 info, sync_tid); 2351 --info->depth; 2352 } else { 2353 tmp_error = 0; 2354 } 2355 2356 /* 2357 * Flush the recovery at the PFS boundary to stage it for 2358 * the final flush of the super-root topology. 
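		 *
		 * (The test below only fires for blockrefs flagged
		 * HAMMER2_BREF_FLAG_PFSROOT whose chains are still
		 * marked HAMMER2_CHAIN_ONFLUSH.)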
		 */
		if (tmp_error == 0 &&
		    (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
		    (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
			hammer2_flush(chain, HAMMER2_FLUSH_TOP |
					     HAMMER2_FLUSH_ALL);
		}
		rup_error |= tmp_error;
	}
	return ((error | rup_error) & ~HAMMER2_ERROR_EOF);
}

/*
 * This fixes up an error introduced in earlier H2 implementations where
 * moving a PFS inode into an indirect block wound up causing the
 * HAMMER2_BREF_FLAG_PFSROOT flag in the bref to get cleared.
 */
static
int
hammer2_fixup_pfses(hammer2_dev_t *hmp)
{
	const hammer2_inode_data_t *ripdata;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t key_next;
	hammer2_pfs_t *spmp;
	int error;

	error = 0;

	/*
	 * Lookup mount point under the media-localized super-root.
	 *
	 * cluster->pmp will incorrectly point to spmp and must be fixed
	 * up later on.
	 */
	spmp = hmp->spmp;
	hammer2_inode_lock(spmp->iroot, 0);
	parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
	chain = hammer2_chain_lookup(&parent, &key_next,
				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
				     &error, 0);
	while (chain) {
		if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
			/*
			 * A bare continue here would spin forever, skip
			 * ahead to the iterator advance instead.
			 */
			goto next;
		}
		if (chain->error) {
			kprintf("I/O error scanning PFS labels\n");
			error |= chain->error;
		} else if ((chain->bref.flags &
			    HAMMER2_BREF_FLAG_PFSROOT) == 0) {
			int error2;

			ripdata = &chain->data->ipdata;
			hammer2_trans_init(hmp->spmp, 0);
			error2 = hammer2_chain_modify(chain,
						      chain->bref.modify_tid,
						      0, 0);
			if (error2 == 0) {
				kprintf("hammer2: Correct mis-flagged PFS %s\n",
					ripdata->filename);
				chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
			} else {
				error |= error2;
			}
			hammer2_flush(chain, HAMMER2_FLUSH_TOP |
					     HAMMER2_FLUSH_ALL);
			hammer2_trans_done(hmp->spmp, 0);
		}
next:
		chain = hammer2_chain_next(&parent, chain, &key_next,
					   key_next, HAMMER2_KEY_MAX,
					   &error, 0);
	}
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	hammer2_inode_unlock(spmp->iroot);

	return error;
}

/*
 * Sync a mount point; this is called periodically on a per-mount basis from
 * the filesystem syncer, and whenever a user issues a sync.
 */
int
hammer2_vfs_sync(struct mount *mp, int waitfor)
{
	int error;

	error = hammer2_vfs_sync_pmp(MPTOPMP(mp), waitfor);

	return error;
}

/*
 * Because frontend operations lock vnodes before we get a chance to
 * lock the related inode, we can't just acquire a vnode lock without
 * risking a deadlock.  The frontend may be holding a vnode lock while
 * also blocked on our SYNCQ flag while trying to get the inode lock.
 *
 * To deal with this situation we can check the vnode lock situation
 * after locking the inode and perform a work-around.
 */
int
hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
{
	struct mount *mp;
	/*hammer2_xop_flush_t *xop;*/
	/*struct hammer2_sync_info info;*/
	hammer2_inode_t *ip;
	hammer2_depend_t *depend;
	hammer2_depend_t *depend_next;
	struct vnode *vp;
	uint32_t pass2;
	int error;
	int wakecount;
	int dorestart;

	mp = pmp->mp;

	/*
	 * Move all inodes on sideq to syncq.  This will clear sideq.
2482 * This should represent all flushable inodes. These inodes 2483 * will already have refs due to being on syncq or sideq. We 2484 * must do this all at once with the spinlock held to ensure that 2485 * all inode dependencies are part of the same flush. 2486 * 2487 * We should be able to do this asynchronously from frontend 2488 * operations because we will be locking the inodes later on 2489 * to actually flush them, and that will partition any frontend 2490 * op using the same inode. Either it has already locked the 2491 * inode and we will block, or it has not yet locked the inode 2492 * and it will block until we are finished flushing that inode. 2493 * 2494 * When restarting, only move the inodes flagged as PASS2 from 2495 * SIDEQ to SYNCQ. PASS2 propagation by inode_lock4() and 2496 * inode_depend() are atomic with the spin-lock. 2497 */ 2498 hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH); 2499 #ifdef HAMMER2_DEBUG_SYNC 2500 kprintf("FILESYSTEM SYNC BOUNDARY\n"); 2501 #endif 2502 dorestart = 0; 2503 2504 /* 2505 * Move inodes from depq to syncq, releasing the related 2506 * depend structures. 2507 */ 2508 restart: 2509 #ifdef HAMMER2_DEBUG_SYNC 2510 kprintf("FILESYSTEM SYNC RESTART (%d)\n", dorestart); 2511 #endif 2512 hammer2_trans_setflags(pmp, 0/*HAMMER2_TRANS_COPYQ*/); 2513 hammer2_trans_clearflags(pmp, HAMMER2_TRANS_RESCAN); 2514 2515 /* 2516 * Move inodes from depq to syncq. When restarting, only depq's 2517 * marked pass2 are moved. 2518 */ 2519 hammer2_spin_ex(&pmp->list_spin); 2520 depend_next = TAILQ_FIRST(&pmp->depq); 2521 wakecount = 0; 2522 2523 while ((depend = depend_next) != NULL) { 2524 depend_next = TAILQ_NEXT(depend, entry); 2525 if (dorestart && depend->pass2 == 0) 2526 continue; 2527 TAILQ_FOREACH(ip, &depend->sideq, entry) { 2528 KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ); 2529 atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ); 2530 atomic_clear_int(&ip->flags, HAMMER2_INODE_SIDEQ); 2531 ip->depend = NULL; 2532 } 2533 2534 /* 2535 * NOTE: pmp->sideq_count includes both sideq and syncq 2536 */ 2537 TAILQ_CONCAT(&pmp->syncq, &depend->sideq, entry); 2538 2539 depend->count = 0; 2540 depend->pass2 = 0; 2541 TAILQ_REMOVE(&pmp->depq, depend, entry); 2542 } 2543 2544 hammer2_spin_unex(&pmp->list_spin); 2545 hammer2_trans_clearflags(pmp, /*HAMMER2_TRANS_COPYQ |*/ 2546 HAMMER2_TRANS_WAITING); 2547 dorestart = 0; 2548 2549 /* 2550 * sideq_count may have dropped enough to allow us to unstall 2551 * the frontend. 2552 */ 2553 hammer2_pfs_memory_wakeup(pmp, 0); 2554 2555 /* 2556 * Now run through all inodes on syncq. 2557 * 2558 * Flush transactions only interlock with other flush transactions. 2559 * Any conflicting frontend operations will block on the inode, but 2560 * may hold a vnode lock while doing so. 2561 */ 2562 hammer2_spin_ex(&pmp->list_spin); 2563 while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) { 2564 /* 2565 * Remove the inode from the SYNCQ, transfer the syncq ref 2566 * to us. We must clear SYNCQ to allow any potential 2567 * front-end deadlock to proceed. We must set PASS2 so 2568 * the dependency code knows what to do. 
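		 *
		 * The atomic_cmpset_int() below is a standard CAS loop:
		 * if ip->flags changed between the snapshot into pass2
		 * and the swap we simply retry.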
		 */
		pass2 = ip->flags;
		cpu_ccfence();
		if (atomic_cmpset_int(&ip->flags,
			      pass2,
			      (pass2 & ~(HAMMER2_INODE_SYNCQ |
					 HAMMER2_INODE_SYNCQ_WAKEUP)) |
			      HAMMER2_INODE_SYNCQ_PASS2) == 0) {
			continue;
		}
		TAILQ_REMOVE(&pmp->syncq, ip, entry);
		--pmp->sideq_count;
		hammer2_spin_unex(&pmp->list_spin);

		/*
		 * Tickle anyone waiting on ip->flags or the hysteresis
		 * on the dirty inode count.
		 */
		if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP)
			wakeup(&ip->flags);
		if (++wakecount >= hammer2_limit_dirty_inodes / 20 + 1) {
			wakecount = 0;
			hammer2_pfs_memory_wakeup(pmp, 0);
		}

		/*
		 * Relock the inode; we inherit a ref from the SYNCQ
		 * removal above.  We will check for a race after we
		 * acquire the vnode.
		 */
		hammer2_mtx_ex(&ip->lock);

		/*
		 * We need the vp in order to vfsync() dirty buffers, so if
		 * one isn't attached we can skip it.
		 *
		 * Ordering the inode lock and then the vnode lock has the
		 * potential to deadlock.  If we had left SYNCQ set that could
		 * also deadlock us against the frontend even if we don't hold
		 * any locks, but the latter is not a problem now since we
		 * cleared it.  igetv will temporarily release the inode lock
		 * in a safe manner to work around the deadlock.
		 *
		 * Unfortunately it is still possible to deadlock when the
		 * frontend obtains multiple inode locks, because all the
		 * related vnodes are already locked (nor can the vnode locks
		 * be released and reacquired without messing up RECLAIM and
		 * INACTIVE sequencing).
		 *
		 * The solution for now is to move the inode back onto SIDEQ
		 * and set dorestart, which will restart the flush after we
		 * exhaust the current SYNCQ.  Note that additional
		 * dependencies may build up, so we definitely need to move
		 * the whole SIDEQ back to SYNCQ when we restart.
		 */
		vp = ip->vp;
		if (vp) {
			if (vget(vp, LK_EXCLUSIVE|LK_NOWAIT)) {
				/*
				 * Failed to get the vnode, requeue the inode
				 * (PASS2 is already set so it will be found
				 * again on the restart).
				 *
				 * Then unlock, possibly sleep, and retry
				 * later.  We sleep if PASS2 was *previously*
				 * set, before we set it again above.
				 */
				vp = NULL;
				dorestart = 1;
#ifdef HAMMER2_DEBUG_SYNC
				kprintf("inum %ld (sync delayed by vnode)\n",
					(long)ip->meta.inum);
#endif
				hammer2_inode_delayed_sideq(ip);

				hammer2_mtx_unlock(&ip->lock);
				hammer2_inode_drop(ip);

				if (pass2 & HAMMER2_INODE_SYNCQ_PASS2) {
					tsleep(&dorestart, 0, "h2syndel", 2);
				}
				hammer2_spin_ex(&pmp->list_spin);
				continue;
			}
		} else {
			vp = NULL;
		}

		/*
		 * If the inode wound up on a SIDEQ again it will already be
		 * prepped for another PASS2.  In this situation if we flush
		 * it now we will just wind up flushing it again in the same
		 * syncer run, so we might as well not flush it now.
		 */
		if (ip->flags & HAMMER2_INODE_SIDEQ) {
			hammer2_mtx_unlock(&ip->lock);
			hammer2_inode_drop(ip);
			if (vp)
				vput(vp);
			dorestart = 1;
			hammer2_spin_ex(&pmp->list_spin);
			continue;
		}

		/*
		 * Ok, we have the inode exclusively locked and if vp is
		 * not NULL that will also be exclusively locked.  Do the
		 * meat of the flush.
		 *
		 * vp token needed for v_rbdirty_tree check / vclrisdirty
		 * sequencing.
		 * Since we hold the vnode exclusively, however, we
		 * shouldn't need to hold the token in this case.
		 */
		if (vp) {
			vfsync(vp, MNT_WAIT, 1, NULL, NULL);
			bio_track_wait(&vp->v_track_write, 0, 0); /* XXX */
		}

		/*
		 * If the inode has not yet been inserted into the tree
		 * we must do so.  Then sync and flush it.  The flush should
		 * update the parent.
		 */
		if (ip->flags & HAMMER2_INODE_DELETING) {
#ifdef HAMMER2_DEBUG_SYNC
			kprintf("inum %ld destroy\n", (long)ip->meta.inum);
#endif
			hammer2_inode_chain_des(ip);
			atomic_add_long(&hammer2_iod_inode_deletes, 1);
		} else if (ip->flags & HAMMER2_INODE_CREATING) {
#ifdef HAMMER2_DEBUG_SYNC
			kprintf("inum %ld insert\n", (long)ip->meta.inum);
#endif
			hammer2_inode_chain_ins(ip);
			atomic_add_long(&hammer2_iod_inode_creates, 1);
		}
#ifdef HAMMER2_DEBUG_SYNC
		kprintf("inum %ld chain-sync\n", (long)ip->meta.inum);
#endif

		/*
		 * Because I kinda messed up the design and index the inodes
		 * under the root inode, alongside the directory entries,
		 * we can't flush the inode index under the iroot until the
		 * end.  If we do it now we might miss effects created by
		 * other inodes on the SYNCQ.
		 *
		 * Do a normal (non-FSSYNC) flush instead, which allows the
		 * vnode code to work the same.  We don't want to force iroot
		 * back onto the SIDEQ, and we also don't want the flush code
		 * to update pfs_iroot_blocksets until the final flush later.
		 *
		 * XXX at the moment this will likely result in a double-flush
		 * of the iroot chain.
		 */
		hammer2_inode_chain_sync(ip);
		if (ip == pmp->iroot) {
			hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
		} else {
			hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
						      HAMMER2_XOP_FSSYNC);
		}
		if (vp) {
			lwkt_gettoken(&vp->v_token);
			if ((ip->flags & (HAMMER2_INODE_MODIFIED |
					  HAMMER2_INODE_RESIZED |
					  HAMMER2_INODE_DIRTYDATA)) == 0 &&
			    RB_EMPTY(&vp->v_rbdirty_tree) &&
			    !bio_track_active(&vp->v_track_write)) {
				vclrisdirty(vp);
			} else {
				hammer2_inode_delayed_sideq(ip);
			}
			lwkt_reltoken(&vp->v_token);
			vput(vp);
			vp = NULL;	/* safety */
		}
		atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_PASS2);
		hammer2_inode_unlock(ip);	/* unlock+drop */
		/* ip pointer invalid */

		/*
		 * If the inode got dirtied after we dropped our locks,
		 * it will have already been moved back to the SIDEQ.
		 */
		hammer2_spin_ex(&pmp->list_spin);
	}
	hammer2_spin_unex(&pmp->list_spin);
	hammer2_pfs_memory_wakeup(pmp, 0);

	if (dorestart || (pmp->trans.flags & HAMMER2_TRANS_RESCAN)) {
#ifdef HAMMER2_DEBUG_SYNC
		kprintf("FILESYSTEM SYNC STAGE 1 RESTART\n");
		/*tsleep(&dorestart, 0, "h2STG1-R", hz*20);*/
#endif
		dorestart = 1;
		goto restart;
	}
#ifdef HAMMER2_DEBUG_SYNC
	kprintf("FILESYSTEM SYNC STAGE 2 BEGIN\n");
	/*tsleep(&dorestart, 0, "h2STG2", hz*20);*/
#endif

	/*
	 * We have to flush the PFS root last, even if it does not appear to
	 * be dirty, because all the inodes in the PFS are indexed under it.
	 * The normal flushing of iroot above would only occur if directory
	 * entries under the root were changed.
	 *
	 * Specifying VOLHDR will cause an additional flush of hmp->spmp
	 * for the media making up the cluster.
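	 *
	 * In outline, the stage 2 flush below performs:
	 *
	 *	hammer2_inode_chain_sync(pmp->iroot);
	 *	hammer2_inode_chain_flush(pmp->iroot,
	 *				  HAMMER2_XOP_INODE_STOP |
	 *				  HAMMER2_XOP_FSSYNC |
	 *				  HAMMER2_XOP_VOLHDR);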
2779 */ 2780 if ((ip = pmp->iroot) != NULL) { 2781 hammer2_inode_ref(ip); 2782 hammer2_mtx_ex(&ip->lock); 2783 hammer2_inode_chain_sync(ip); 2784 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP | 2785 HAMMER2_XOP_FSSYNC | 2786 HAMMER2_XOP_VOLHDR); 2787 hammer2_inode_unlock(ip); /* unlock+drop */ 2788 } 2789 #ifdef HAMMER2_DEBUG_SYNC 2790 kprintf("FILESYSTEM SYNC STAGE 2 DONE\n"); 2791 #endif 2792 2793 /* 2794 * device bioq sync 2795 */ 2796 hammer2_bioq_sync(pmp); 2797 2798 #if 0 2799 info.pass = 1; 2800 info.waitfor = MNT_WAIT; 2801 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2802 2803 info.pass = 2; 2804 info.waitfor = MNT_WAIT; 2805 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2806 #endif 2807 #if 0 2808 /* 2809 * Generally speaking we now want to flush the media topology from 2810 * the iroot through to the inodes. The flush stops at any inode 2811 * boundary, which allows the frontend to continue running concurrent 2812 * modifying operations on inodes (including kernel flushes of 2813 * buffers) without interfering with the main sync. 2814 * 2815 * Use the XOP interface to concurrently flush all nodes to 2816 * synchronize the PFSROOT subtopology to the media. A standard 2817 * end-of-scan ENOENT error indicates cluster sufficiency. 2818 * 2819 * Note that this flush will not be visible on crash recovery until 2820 * we flush the super-root topology in the next loop. 2821 * 2822 * XXX For now wait for all flushes to complete. 2823 */ 2824 if (mp && (ip = pmp->iroot) != NULL) { 2825 /* 2826 * If unmounting try to flush everything including any 2827 * sub-trees under inodes, just in case there is dangling 2828 * modified data, as a safety. Otherwise just flush up to 2829 * the inodes in this stage. 2830 */ 2831 kprintf("MP & IROOT\n"); 2832 #ifdef HAMMER2_DEBUG_SYNC 2833 kprintf("FILESYSTEM SYNC STAGE 3 IROOT BEGIN\n"); 2834 #endif 2835 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 2836 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | 2837 HAMMER2_XOP_VOLHDR | 2838 HAMMER2_XOP_FSSYNC | 2839 HAMMER2_XOP_INODE_STOP); 2840 } else { 2841 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | 2842 HAMMER2_XOP_INODE_STOP | 2843 HAMMER2_XOP_VOLHDR | 2844 HAMMER2_XOP_FSSYNC | 2845 HAMMER2_XOP_INODE_STOP); 2846 } 2847 hammer2_xop_start(&xop->head, &hammer2_inode_flush_desc); 2848 error = hammer2_xop_collect(&xop->head, 2849 HAMMER2_XOP_COLLECT_WAITALL); 2850 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2851 #ifdef HAMMER2_DEBUG_SYNC 2852 kprintf("FILESYSTEM SYNC STAGE 3 IROOT END\n"); 2853 #endif 2854 if (error == HAMMER2_ERROR_ENOENT) 2855 error = 0; 2856 else 2857 error = hammer2_error_to_errno(error); 2858 } else { 2859 error = 0; 2860 } 2861 #endif 2862 error = 0; /* XXX */ 2863 hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH); 2864 2865 return (error); 2866 } 2867 2868 static 2869 int 2870 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp) 2871 { 2872 hammer2_inode_t *ip; 2873 2874 KKASSERT(MAXFIDSZ >= 16); 2875 ip = VTOI(vp); 2876 fhp->fid_len = offsetof(struct fid, fid_data[16]); 2877 fhp->fid_ext = 0; 2878 ((hammer2_tid_t *)fhp->fid_data)[0] = ip->meta.inum; 2879 ((hammer2_tid_t *)fhp->fid_data)[1] = 0; 2880 2881 return 0; 2882 } 2883 2884 static 2885 int 2886 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 2887 struct fid *fhp, struct vnode **vpp) 2888 { 2889 hammer2_pfs_t *pmp; 2890 hammer2_tid_t inum; 2891 int error; 2892 2893 pmp = MPTOPMP(mp); 2894 inum = ((hammer2_tid_t *)fhp->fid_data)[0] & HAMMER2_DIRHASH_USERMSK; 2895 if (vpp) { 2896 if (inum == 1) 
			error = hammer2_vfs_root(mp, vpp);
		else
			error = hammer2_vfs_vget(mp, NULL, inum, vpp);
	} else {
		error = 0;
	}
	if (error)
		kprintf("fhtovp: %016jx -> %p, %d\n", inum, *vpp, error);
	return error;
}

static
int
hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
		     int *exflagsp, struct ucred **credanonp)
{
	hammer2_pfs_t *pmp;
	struct netcred *np;
	int error;

	pmp = MPTOPMP(mp);
	np = vfs_export_lookup(mp, &pmp->export, nam);
	if (np) {
		*exflagsp = np->netc_exflags;
		*credanonp = &np->netc_anon;
		error = 0;
	} else {
		error = EACCES;
	}
	return error;
}

/*
 * Support code for hammer2_vfs_mount().  Read, verify, and install the
 * volume header into the HMP.
 *
 * XXX read four volhdrs and use the one with the highest TID whose CRC
 * matches.
 *
 * XXX check iCRCs.
 *
 * XXX For filesystems w/ fewer than 4 volhdrs, make sure not to write to
 * nonexistent locations.
 *
 * XXX Record selected volhdr and ring updates to each of 4 volhdrs.
 */
static
int
hammer2_install_volume_header(hammer2_dev_t *hmp)
{
	hammer2_volume_data_t *vd;
	struct buf *bp;
	hammer2_crc32_t crc0, crc, bcrc0, bcrc;
	int error_reported;
	int error;
	int valid;
	int i;

	error_reported = 0;
	error = 0;
	valid = 0;
	bp = NULL;

	/*
	 * There are up to 4 copies of the volume header (syncs iterate
	 * between them so there is no single master).  We don't trust the
	 * volu_size field so we don't know precisely how large the filesystem
	 * is, so depend on the OS to return an error if we go beyond the
	 * block device's EOF.
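	 *
	 * Copy #i is read from byte offset i * HAMMER2_ZONE_BYTES64 in
	 * HAMMER2_VOLUME_BYTES chunks, i.e. the loop below issues:
	 *
	 *	bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
	 *	      HAMMER2_VOLUME_BYTES, &bp);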
	 */
	for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
		error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
			      HAMMER2_VOLUME_BYTES, &bp);
		if (error) {
			brelse(bp);
			bp = NULL;
			continue;
		}

		vd = (struct hammer2_volume_data *) bp->b_data;
		if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
		    (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
			brelse(bp);
			bp = NULL;
			continue;
		}

		if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
			/* XXX: Reversed-endianness filesystem */
			kprintf("hammer2: reverse-endian filesystem "
				"detected\n");
			brelse(bp);
			bp = NULL;
			continue;
		}

		crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
		crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
				      HAMMER2_VOLUME_ICRC0_SIZE);
		bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
		bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
				       HAMMER2_VOLUME_ICRC1_SIZE);
		if ((crc0 != crc) || (bcrc0 != bcrc)) {
			kprintf("hammer2 volume header crc "
				"mismatch copy #%d %08x/%08x\n",
				i, crc0, crc);
			error_reported = 1;
			brelse(bp);
			bp = NULL;
			continue;
		}
		if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
			valid = 1;
			hmp->voldata = *vd;
			hmp->volhdrno = i;
		}
		brelse(bp);
		bp = NULL;
	}
	if (valid) {
		hmp->volsync = hmp->voldata;
		hmp->free_reserved = hmp->voldata.allocator_size / 20;
		error = 0;
		if (error_reported || bootverbose || 1) { /* 1/DEBUG */
			kprintf("hammer2: using volume header #%d\n",
				hmp->volhdrno);
		}
	} else {
		error = EINVAL;
		kprintf("hammer2: no valid volume headers found!\n");
	}
	return (error);
}

/*
 * This handles hysteresis on regular file flushes.  Because the BIOs are
 * routed to a thread it is possible for an excessive number to build up
 * and cause long front-end stalls long before the runningbuffspace limit
 * is hit, so we implement hammer2_flush_pipe to control the
 * hysteresis.
 *
 * This is a particular problem when compression is used.
 */
void
hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
{
	atomic_add_int(&pmp->count_lwinprog, 1);
}

void
hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
{
	int lwinprog;

	lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
	if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
	    (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
		atomic_clear_int(&pmp->count_lwinprog,
				 HAMMER2_LWINPROG_WAITING);
		wakeup(&pmp->count_lwinprog);
	}
	if ((lwinprog & HAMMER2_LWINPROG_WAITING0) &&
	    (lwinprog & HAMMER2_LWINPROG_MASK) <= 0) {
		atomic_clear_int(&pmp->count_lwinprog,
				 HAMMER2_LWINPROG_WAITING0);
		wakeup(&pmp->count_lwinprog);
	}
}

void
hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe)
{
	int lwinprog;
	int lwflag = (flush_pipe) ?
HAMMER2_LWINPROG_WAITING : 3070 HAMMER2_LWINPROG_WAITING0; 3071 3072 for (;;) { 3073 lwinprog = pmp->count_lwinprog; 3074 cpu_ccfence(); 3075 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 3076 break; 3077 tsleep_interlock(&pmp->count_lwinprog, 0); 3078 atomic_set_int(&pmp->count_lwinprog, lwflag); 3079 lwinprog = pmp->count_lwinprog; 3080 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 3081 break; 3082 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz); 3083 } 3084 } 3085 3086 /* 3087 * It is possible for an excessive number of dirty chains or dirty inodes 3088 * to build up. When this occurs we start an asynchronous filesystem sync. 3089 * If the level continues to build up, we stall, waiting for it to drop, 3090 * with some hysteresis. 3091 * 3092 * This relies on the kernel calling hammer2_vfs_modifying() prior to 3093 * obtaining any vnode locks before making a modifying VOP call. 3094 */ 3095 static int 3096 hammer2_vfs_modifying(struct mount *mp) 3097 { 3098 if (mp->mnt_flag & MNT_RDONLY) 3099 return EROFS; 3100 hammer2_pfs_memory_wait(MPTOPMP(mp)); 3101 3102 return 0; 3103 } 3104 3105 /* 3106 * Initiate an asynchronous filesystem sync and, with hysteresis, 3107 * stall if the internal data structure count becomes too bloated. 3108 */ 3109 void 3110 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp) 3111 { 3112 uint32_t waiting; 3113 int pcatch; 3114 int error; 3115 3116 if (pmp == NULL || pmp->mp == NULL) 3117 return; 3118 3119 for (;;) { 3120 waiting = pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK; 3121 cpu_ccfence(); 3122 3123 /* 3124 * Start the syncer running at 1/2 the limit 3125 */ 3126 if (waiting > hammer2_limit_dirty_chains / 2 || 3127 pmp->sideq_count > hammer2_limit_dirty_inodes / 2) { 3128 trigger_syncer(pmp->mp); 3129 } 3130 3131 /* 3132 * Stall at the limit waiting for the counts to drop. 3133 * This code will typically be woken up once the count 3134 * drops below 3/4 the limit, or in one second. 3135 */ 3136 if (waiting < hammer2_limit_dirty_chains && 3137 pmp->sideq_count < hammer2_limit_dirty_inodes) { 3138 break; 3139 } 3140 3141 pcatch = curthread->td_proc ? PCATCH : 0; 3142 3143 tsleep_interlock(&pmp->inmem_dirty_chains, pcatch); 3144 atomic_set_int(&pmp->inmem_dirty_chains, 3145 HAMMER2_DIRTYCHAIN_WAITING); 3146 if (waiting < hammer2_limit_dirty_chains && 3147 pmp->sideq_count < hammer2_limit_dirty_inodes) { 3148 break; 3149 } 3150 trigger_syncer(pmp->mp); 3151 error = tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED | pcatch, 3152 "h2memw", hz); 3153 if (error == ERESTART) 3154 break; 3155 } 3156 } 3157 3158 /* 3159 * Wake up any stalled frontend ops waiting, with hysteresis, using 3160 * 2/3 of the limit. 
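 *
 * That is, waiters are only woken once the dirty-chain count drops to
 * 2/3 of hammer2_limit_dirty_chains or below and sideq_count drops to
 * 2/3 of hammer2_limit_dirty_inodes or below (see the test below).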
 */
void
hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp, int count)
{
	uint32_t waiting;

	if (pmp) {
		waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, count);
		/* don't need --waiting to test flag */

		if ((waiting & HAMMER2_DIRTYCHAIN_WAITING) &&
		    (pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK) <=
		     hammer2_limit_dirty_chains * 2 / 3 &&
		    pmp->sideq_count <= hammer2_limit_dirty_inodes * 2 / 3) {
			atomic_clear_int(&pmp->inmem_dirty_chains,
					 HAMMER2_DIRTYCHAIN_WAITING);
			wakeup(&pmp->inmem_dirty_chains);
		}
	}
}

void
hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
{
	if (pmp) {
		atomic_add_int(&pmp->inmem_dirty_chains, 1);
	}
}

/*
 * Returns 0 if the filesystem has tons of free space.
 * Returns 1 if the filesystem has less than 10% remaining.
 * Returns 2 if the filesystem has less than 5% (user) or 2.5% (root)
 * remaining (free_reserved is ~5% of the volume, see the code below).
 */
int
hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred)
{
	hammer2_pfs_t *pmp;
	hammer2_dev_t *hmp;
	hammer2_off_t free_reserved;
	hammer2_off_t free_nominal;
	int i;

	pmp = ip->pmp;

	if (pmp->free_ticks == 0 || pmp->free_ticks != ticks) {
		free_reserved = HAMMER2_SEGSIZE;
		free_nominal = 0x7FFFFFFFFFFFFFFFLLU;
		for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
			hmp = pmp->pfs_hmps[i];
			if (hmp == NULL)
				continue;
			if (pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER &&
			    pmp->pfs_types[i] != HAMMER2_PFSTYPE_SOFT_MASTER)
				continue;

			if (free_nominal > hmp->voldata.allocator_free)
				free_nominal = hmp->voldata.allocator_free;
			if (free_reserved < hmp->free_reserved)
				free_reserved = hmp->free_reserved;
		}

		/*
		 * SMP races ok
		 */
		pmp->free_reserved = free_reserved;
		pmp->free_nominal = free_nominal;
		pmp->free_ticks = ticks;
	} else {
		free_reserved = pmp->free_reserved;
		free_nominal = pmp->free_nominal;
	}
	if (cred && cred->cr_uid != 0) {
		if ((int64_t)(free_nominal - bytes) <
		    (int64_t)free_reserved) {
			return 2;
		}
	} else {
		if ((int64_t)(free_nominal - bytes) <
		    (int64_t)free_reserved / 2) {
			return 2;
		}
	}
	if ((int64_t)(free_nominal - bytes) < (int64_t)free_reserved * 2)
		return 1;
	return 0;
}

/*
 * Debugging
 */
void
hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx,
		   u_int flags)
{
	hammer2_chain_t *scan;
	hammer2_chain_t *parent;

	--*countp;
	if (*countp == 0) {
		kprintf("%*.*s...\n", tab, tab, "");
		return;
	}
	if (*countp < 0)
		return;
	kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
		tab, tab, "", pfx,
		chain, chain->bref.type,
		chain->bref.key, chain->bref.keybits,
		chain->bref.mirror_tid);

	kprintf("%*.*s [%08x] (%s) refs=%d",
		tab, tab, "",
		chain->flags,
		((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
		  chain->data) ?
		 (char *)chain->data->ipdata.filename : "?"),
		chain->refs);

	parent = chain->parent;
	if (parent)
		kprintf("\n%*.*s p=%p [pflags %08x prefs %d]",
			tab, tab, "",
			parent, parent->flags, parent->refs);
	if (RB_EMPTY(&chain->core.rbtree)) {
		kprintf("\n");
	} else {
		kprintf(" {\n");
		RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) {
			if ((scan->flags & flags) || flags == (u_int)-1) {
				hammer2_dump_chain(scan, tab + 4, countp, 'a',
						   flags);
			}
		}
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
			kprintf("%*.*s}(%s)\n", tab, tab, "",
				chain->data->ipdata.filename);
		else
			kprintf("%*.*s}\n", tab, tab, "");
	}
}
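
#if 0
/*
 * Example only (never compiled): how the unmount path above drives the
 * chain dumper, capping output at ~50 lines per topology.  The wrapper
 * name here is hypothetical.
 */
static void
hammer2_dump_example(hammer2_dev_t *hmp)
{
	int dumpcnt;

	dumpcnt = 50;
	hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
	dumpcnt = 50;
	hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1);
}
#endif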