1 /* 2 * Copyright (c) 2011-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression) 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/kernel.h> 38 #include <sys/nlookup.h> 39 #include <sys/vnode.h> 40 #include <sys/mount.h> 41 #include <sys/fcntl.h> 42 #include <sys/buf.h> 43 #include <sys/uuid.h> 44 #include <sys/vfsops.h> 45 #include <sys/sysctl.h> 46 #include <sys/socket.h> 47 #include <sys/objcache.h> 48 49 #include <sys/proc.h> 50 #include <sys/mountctl.h> 51 #include <sys/dirent.h> 52 #include <sys/uio.h> 53 54 #include "hammer2.h" 55 #include "hammer2_disk.h" 56 #include "hammer2_mount.h" 57 #include "hammer2_lz4.h" 58 59 #include "zlib/hammer2_zlib.h" 60 61 #define REPORT_REFS_ERRORS 1 /* XXX remove me */ 62 63 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache"); 64 65 struct hammer2_sync_info { 66 int error; 67 int waitfor; 68 int pass; 69 }; 70 71 TAILQ_HEAD(hammer2_mntlist, hammer2_dev); 72 static struct hammer2_mntlist hammer2_mntlist; 73 74 struct hammer2_pfslist hammer2_pfslist; 75 struct hammer2_pfslist hammer2_spmplist; 76 struct lock hammer2_mntlk; 77 78 int hammer2_supported_version = HAMMER2_VOL_VERSION_DEFAULT; 79 int hammer2_debug; 80 int hammer2_xopgroups; 81 long hammer2_debug_inode; 82 int hammer2_cluster_meta_read = 1; /* physical read-ahead */ 83 int hammer2_cluster_data_read = 4; /* physical read-ahead */ 84 int hammer2_cluster_write = 0; /* physical write clustering */ 85 int hammer2_dedup_enable = 1; 86 int hammer2_always_compress = 0; /* always try to compress */ 87 int hammer2_inval_enable = 0; 88 int hammer2_flush_pipe = 100; 89 int hammer2_dio_count; 90 int hammer2_dio_limit = 256; 91 int hammer2_bulkfree_tps = 5000; 92 int hammer2_worker_rmask = 3; 93 long hammer2_chain_allocs; 94 long hammer2_chain_frees; 95 long hammer2_limit_dirty_chains; 96 long hammer2_limit_dirty_inodes; 97 long hammer2_count_modified_chains; 98 long hammer2_iod_invals; 99 long hammer2_iod_file_read; 100 long hammer2_iod_meta_read; 101 long hammer2_iod_indr_read; 102 long hammer2_iod_fmap_read; 103 long hammer2_iod_volu_read; 104 long hammer2_iod_file_write; 105 long hammer2_iod_file_wembed; 106 long hammer2_iod_file_wzero; 107 long hammer2_iod_file_wdedup; 108 long hammer2_iod_meta_write; 109 long hammer2_iod_indr_write; 110 long hammer2_iod_fmap_write; 111 long hammer2_iod_volu_write; 112 long hammer2_iod_inode_creates; 113 long hammer2_iod_inode_deletes; 114 115 MALLOC_DECLARE(M_HAMMER2_CBUFFER); 116 MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer", 117 "Buffer used for compression."); 118 119 MALLOC_DECLARE(M_HAMMER2_DEBUFFER); 120 MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer", 121 "Buffer used for decompression."); 122 123 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem"); 124 125 SYSCTL_INT(_vfs_hammer2, OID_AUTO, supported_version, CTLFLAG_RD, 126 &hammer2_supported_version, 0, ""); 127 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW, 128 &hammer2_debug, 0, ""); 129 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, debug_inode, CTLFLAG_RW, 130 &hammer2_debug_inode, 0, ""); 131 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_meta_read, CTLFLAG_RW, 132 &hammer2_cluster_meta_read, 0, ""); 133 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_data_read, CTLFLAG_RW, 134 &hammer2_cluster_data_read, 0, ""); 135 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_write, CTLFLAG_RW, 136 &hammer2_cluster_write, 0, ""); 137 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW, 138 &hammer2_dedup_enable, 0, ""); 139 SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW, 140 &hammer2_always_compress, 0, ""); 141 SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW, 142 &hammer2_inval_enable, 0, ""); 143 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW, 144 &hammer2_flush_pipe, 0, ""); 145 SYSCTL_INT(_vfs_hammer2, OID_AUTO, worker_rmask, CTLFLAG_RW, 146 &hammer2_worker_rmask, 0, ""); 147 SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW, 148 &hammer2_bulkfree_tps, 0, ""); 149 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW, 150 &hammer2_chain_allocs, 0, ""); 151 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_frees, CTLFLAG_RW, 152 &hammer2_chain_frees, 0, ""); 153 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW, 154 &hammer2_limit_dirty_chains, 0, ""); 155 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW, 156 &hammer2_limit_dirty_inodes, 0, ""); 157 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW, 158 &hammer2_count_modified_chains, 0, ""); 159 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD, 160 &hammer2_dio_count, 0, ""); 161 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW, 162 &hammer2_dio_limit, 0, ""); 163 164 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW, 165 &hammer2_iod_invals, 0, ""); 166 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW, 167 &hammer2_iod_file_read, 0, ""); 168 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW, 169 &hammer2_iod_meta_read, 0, ""); 170 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW, 171 &hammer2_iod_indr_read, 0, ""); 172 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW, 173 &hammer2_iod_fmap_read, 0, ""); 174 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW, 175 &hammer2_iod_volu_read, 0, ""); 176 177 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW, 178 &hammer2_iod_file_write, 0, ""); 179 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wembed, CTLFLAG_RW, 180 &hammer2_iod_file_wembed, 0, ""); 181 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wzero, CTLFLAG_RW, 182 &hammer2_iod_file_wzero, 0, ""); 183 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wdedup, CTLFLAG_RW, 184 &hammer2_iod_file_wdedup, 0, ""); 185 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW, 186 &hammer2_iod_meta_write, 0, ""); 187 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW, 188 &hammer2_iod_indr_write, 0, ""); 189 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW, 190 &hammer2_iod_fmap_write, 0, ""); 191 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW, 192 &hammer2_iod_volu_write, 0, ""); 193 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_creates, CTLFLAG_RW, 194 &hammer2_iod_inode_creates, 0, ""); 195 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_deletes, CTLFLAG_RW, 196 &hammer2_iod_inode_deletes, 0, ""); 197 198 long hammer2_process_icrc32; 199 long hammer2_process_xxhash64; 200 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW, 201 &hammer2_process_icrc32, 0, ""); 202 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW, 203 &hammer2_process_xxhash64, 0, ""); 204 205 static int hammer2_vfs_init(struct vfsconf *conf); 206 static int hammer2_vfs_uninit(struct vfsconf *vfsp); 207 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, 208 struct ucred *cred); 209 static int hammer2_remount(hammer2_dev_t *, struct mount *, char *, 210 struct vnode *, struct ucred *); 211 static int hammer2_recovery(hammer2_dev_t *hmp); 212 static int hammer2_vfs_unmount(struct mount *mp, int mntflags); 213 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp); 214 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, 215 struct ucred *cred); 216 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, 217 struct ucred *cred); 218 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 219 struct fid *fhp, struct vnode **vpp); 220 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp); 221 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 222 int *exflagsp, struct ucred **credanonp); 223 static int hammer2_vfs_modifying(struct mount *mp); 224 225 static int hammer2_install_volume_header(hammer2_dev_t *hmp); 226 #if 0 227 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data); 228 #endif 229 230 static void hammer2_update_pmps(hammer2_dev_t *hmp); 231 232 static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp); 233 static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, 234 hammer2_dev_t *hmp); 235 static int hammer2_fixup_pfses(hammer2_dev_t *hmp); 236 237 /* 238 * HAMMER2 vfs operations. 239 */ 240 static struct vfsops hammer2_vfsops = { 241 .vfs_flags = 0, 242 .vfs_init = hammer2_vfs_init, 243 .vfs_uninit = hammer2_vfs_uninit, 244 .vfs_sync = hammer2_vfs_sync, 245 .vfs_mount = hammer2_vfs_mount, 246 .vfs_unmount = hammer2_vfs_unmount, 247 .vfs_root = hammer2_vfs_root, 248 .vfs_statfs = hammer2_vfs_statfs, 249 .vfs_statvfs = hammer2_vfs_statvfs, 250 .vfs_vget = hammer2_vfs_vget, 251 .vfs_vptofh = hammer2_vfs_vptofh, 252 .vfs_fhtovp = hammer2_vfs_fhtovp, 253 .vfs_checkexp = hammer2_vfs_checkexp, 254 .vfs_modifying = hammer2_vfs_modifying 255 }; 256 257 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", ""); 258 259 VFS_SET(hammer2_vfsops, hammer2, VFCF_MPSAFE); 260 MODULE_VERSION(hammer2, 1); 261 262 static 263 int 264 hammer2_vfs_init(struct vfsconf *conf) 265 { 266 static struct objcache_malloc_args margs_read; 267 static struct objcache_malloc_args margs_write; 268 static struct objcache_malloc_args margs_vop; 269 270 int error; 271 272 error = 0; 273 kmalloc_raise_limit(M_HAMMER2, 0); /* unlimited */ 274 275 /* 276 * hammer2_xopgroups must be even and is most optimal if 277 * 2 x ncpus so strategy functions can be queued to the same 278 * cpu. 279 */ 280 hammer2_xopgroups = HAMMER2_XOPGROUPS_MIN; 281 if (hammer2_xopgroups < ncpus * 2) 282 hammer2_xopgroups = ncpus * 2; 283 284 /* 285 * A large DIO cache is needed to retain dedup enablement masks. 286 * The bulkfree code clears related masks as part of the disk block 287 * recycling algorithm, preventing it from being used for a later 288 * dedup. 289 * 290 * NOTE: A large buffer cache can actually interfere with dedup 291 * operation because we dedup based on media physical buffers 292 * and not logical buffers. Try to make the DIO case large 293 * enough to avoid this problem, but also cap it. 294 */ 295 hammer2_dio_limit = nbuf * 2; 296 if (hammer2_dio_limit > 100000) 297 hammer2_dio_limit = 100000; 298 299 if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref)) 300 error = EINVAL; 301 if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data)) 302 error = EINVAL; 303 if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data)) 304 error = EINVAL; 305 306 if (error) 307 kprintf("HAMMER2 structure size mismatch; cannot continue.\n"); 308 309 margs_read.objsize = 65536; 310 margs_read.mtype = M_HAMMER2_DEBUFFER; 311 312 margs_write.objsize = 32768; 313 margs_write.mtype = M_HAMMER2_CBUFFER; 314 315 margs_vop.objsize = sizeof(hammer2_xop_t); 316 margs_vop.mtype = M_HAMMER2; 317 318 /* 319 * Note thaht for the XOPS cache we want backing store allocations 320 * to use M_ZERO. This is not allowed in objcache_get() (to avoid 321 * confusion), so use the backing store function that does it. This 322 * means that initial XOPS objects are zerod but REUSED objects are 323 * not. So we are responsible for cleaning the object up sufficiently 324 * for our needs before objcache_put()ing it back (typically just the 325 * FIFO indices). 326 */ 327 cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc, 328 0, 1, NULL, NULL, NULL, 329 objcache_malloc_alloc, 330 objcache_malloc_free, 331 &margs_read); 332 cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc, 333 0, 1, NULL, NULL, NULL, 334 objcache_malloc_alloc, 335 objcache_malloc_free, 336 &margs_write); 337 cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc, 338 0, 1, NULL, NULL, NULL, 339 objcache_malloc_alloc_zero, 340 objcache_malloc_free, 341 &margs_vop); 342 343 344 lockinit(&hammer2_mntlk, "mntlk", 0, 0); 345 TAILQ_INIT(&hammer2_mntlist); 346 TAILQ_INIT(&hammer2_pfslist); 347 TAILQ_INIT(&hammer2_spmplist); 348 349 hammer2_limit_dirty_chains = maxvnodes / 10; 350 if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS) 351 hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS; 352 if (hammer2_limit_dirty_chains < 1000) 353 hammer2_limit_dirty_chains = 1000; 354 355 hammer2_limit_dirty_inodes = maxvnodes / 25; 356 if (hammer2_limit_dirty_inodes < 100) 357 hammer2_limit_dirty_inodes = 100; 358 if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES) 359 hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES; 360 361 return (error); 362 } 363 364 static 365 int 366 hammer2_vfs_uninit(struct vfsconf *vfsp __unused) 367 { 368 objcache_destroy(cache_buffer_read); 369 objcache_destroy(cache_buffer_write); 370 objcache_destroy(cache_xops); 371 return 0; 372 } 373 374 /* 375 * Core PFS allocator. Used to allocate or reference the pmp structure 376 * for PFS cluster mounts and the spmp structure for media (hmp) structures. 377 * The pmp can be passed in or loaded by this function using the chain and 378 * inode data. 379 * 380 * pmp->modify_tid tracks new modify_tid transaction ids for front-end 381 * transactions. Note that synchronization does not use this field. 382 * (typically frontend operations and synchronization cannot run on the 383 * same PFS node at the same time). 384 * 385 * XXX check locking 386 */ 387 hammer2_pfs_t * 388 hammer2_pfsalloc(hammer2_chain_t *chain, 389 const hammer2_inode_data_t *ripdata, 390 hammer2_tid_t modify_tid, hammer2_dev_t *force_local) 391 { 392 hammer2_pfs_t *pmp; 393 hammer2_inode_t *iroot; 394 int count; 395 int i; 396 int j; 397 398 pmp = NULL; 399 400 /* 401 * Locate or create the PFS based on the cluster id. If ripdata 402 * is NULL this is a spmp which is unique and is always allocated. 403 * 404 * If the device is mounted in local mode all PFSs are considered 405 * independent and not part of any cluster (for debugging only). 406 */ 407 if (ripdata) { 408 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) { 409 if (force_local != pmp->force_local) 410 continue; 411 if (force_local == NULL && 412 bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid, 413 sizeof(pmp->pfs_clid)) == 0) { 414 break; 415 } else if (force_local && pmp->pfs_names[0] && 416 strcmp(pmp->pfs_names[0], ripdata->filename) == 0) { 417 break; 418 } 419 } 420 } 421 422 if (pmp == NULL) { 423 pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO); 424 pmp->force_local = force_local; 425 hammer2_trans_manage_init(pmp); 426 kmalloc_create(&pmp->minode, "HAMMER2-inodes"); 427 kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg"); 428 lockinit(&pmp->lock, "pfslk", 0, 0); 429 lockinit(&pmp->lock_nlink, "h2nlink", 0, 0); 430 spin_init(&pmp->inum_spin, "hm2pfsalloc_inum"); 431 spin_init(&pmp->xop_spin, "h2xop"); 432 spin_init(&pmp->lru_spin, "h2lru"); 433 RB_INIT(&pmp->inum_tree); 434 TAILQ_INIT(&pmp->syncq); 435 TAILQ_INIT(&pmp->depq); 436 TAILQ_INIT(&pmp->lru_list); 437 spin_init(&pmp->list_spin, "h2pfsalloc_list"); 438 439 /* 440 * Save the last media transaction id for the flusher. Set 441 * initial 442 */ 443 if (ripdata) { 444 pmp->pfs_clid = ripdata->meta.pfs_clid; 445 TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry); 446 } else { 447 pmp->flags |= HAMMER2_PMPF_SPMP; 448 TAILQ_INSERT_TAIL(&hammer2_spmplist, pmp, mntentry); 449 } 450 451 /* 452 * The synchronization thread may start too early, make 453 * sure it stays frozen until we are ready to let it go. 454 * XXX 455 */ 456 /* 457 pmp->primary_thr.flags = HAMMER2_THREAD_FROZEN | 458 HAMMER2_THREAD_REMASTER; 459 */ 460 } 461 462 /* 463 * Create the PFS's root inode and any missing XOP helper threads. 464 */ 465 if ((iroot = pmp->iroot) == NULL) { 466 iroot = hammer2_inode_get(pmp, NULL, 1, -1); 467 if (ripdata) 468 iroot->meta = ripdata->meta; 469 pmp->iroot = iroot; 470 hammer2_inode_ref(iroot); 471 hammer2_inode_unlock(iroot); 472 } 473 474 /* 475 * Stop here if no chain is passed in. 476 */ 477 if (chain == NULL) 478 goto done; 479 480 /* 481 * When a chain is passed in we must add it to the PFS's root 482 * inode, update pmp->pfs_types[], and update the syncronization 483 * threads. 484 * 485 * When forcing local mode, mark the PFS as a MASTER regardless. 486 * 487 * At the moment empty spots can develop due to removals or failures. 488 * Ultimately we want to re-fill these spots but doing so might 489 * confused running code. XXX 490 */ 491 hammer2_inode_ref(iroot); 492 hammer2_mtx_ex(&iroot->lock); 493 j = iroot->cluster.nchains; 494 495 if (j == HAMMER2_MAXCLUSTER) { 496 kprintf("hammer2_mount: cluster full!\n"); 497 /* XXX fatal error? */ 498 } else { 499 KKASSERT(chain->pmp == NULL); 500 chain->pmp = pmp; 501 hammer2_chain_ref(chain); 502 iroot->cluster.array[j].chain = chain; 503 if (force_local) 504 pmp->pfs_types[j] = HAMMER2_PFSTYPE_MASTER; 505 else 506 pmp->pfs_types[j] = ripdata->meta.pfs_type; 507 pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2); 508 pmp->pfs_hmps[j] = chain->hmp; 509 hammer2_spin_ex(&pmp->inum_spin); 510 pmp->pfs_iroot_blocksets[j] = chain->data->ipdata.u.blockset; 511 hammer2_spin_unex(&pmp->inum_spin); 512 513 /* 514 * If the PFS is already mounted we must account 515 * for the mount_count here. 516 */ 517 if (pmp->mp) 518 ++chain->hmp->mount_count; 519 520 /* 521 * May have to fixup dirty chain tracking. Previous 522 * pmp was NULL so nothing to undo. 523 */ 524 if (chain->flags & HAMMER2_CHAIN_MODIFIED) 525 hammer2_pfs_memory_inc(pmp); 526 ++j; 527 } 528 iroot->cluster.nchains = j; 529 530 /* 531 * Update nmasters from any PFS inode which is part of the cluster. 532 * It is possible that this will result in a value which is too 533 * high. MASTER PFSs are authoritative for pfs_nmasters and will 534 * override this value later on. 535 * 536 * (This informs us of masters that might not currently be 537 * discoverable by this mount). 538 */ 539 if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) { 540 pmp->pfs_nmasters = ripdata->meta.pfs_nmasters; 541 } 542 543 /* 544 * Count visible masters. Masters are usually added with 545 * ripdata->meta.pfs_nmasters set to 1. This detects when there 546 * are more (XXX and must update the master inodes). 547 */ 548 count = 0; 549 for (i = 0; i < iroot->cluster.nchains; ++i) { 550 if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) 551 ++count; 552 } 553 if (pmp->pfs_nmasters < count) 554 pmp->pfs_nmasters = count; 555 556 /* 557 * Create missing synchronization and support threads. 558 * 559 * Single-node masters (including snapshots) have nothing to 560 * synchronize and do not require this thread. 561 * 562 * Multi-node masters or any number of soft masters, slaves, copy, 563 * or other PFS types need the thread. 564 * 565 * Each thread is responsible for its particular cluster index. 566 * We use independent threads so stalls or mismatches related to 567 * any given target do not affect other targets. 568 */ 569 for (i = 0; i < iroot->cluster.nchains; ++i) { 570 /* 571 * Single-node masters (including snapshots) have nothing 572 * to synchronize and will make direct xops support calls, 573 * thus they do not require this thread. 574 * 575 * Note that there can be thousands of snapshots. We do not 576 * want to create thousands of threads. 577 */ 578 if (pmp->pfs_nmasters <= 1 && 579 pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) { 580 continue; 581 } 582 583 /* 584 * Sync support thread 585 */ 586 if (pmp->sync_thrs[i].td == NULL) { 587 hammer2_thr_create(&pmp->sync_thrs[i], pmp, NULL, 588 "h2nod", i, -1, 589 hammer2_primary_sync_thread); 590 } 591 } 592 593 /* 594 * Create missing Xop threads 595 * 596 * NOTE: We create helper threads for all mounted PFSs or any 597 * PFSs with 2+ nodes (so the sync thread can update them, 598 * even if not mounted). 599 */ 600 if (pmp->mp || iroot->cluster.nchains >= 2) 601 hammer2_xop_helper_create(pmp); 602 603 hammer2_mtx_unlock(&iroot->lock); 604 hammer2_inode_drop(iroot); 605 done: 606 return pmp; 607 } 608 609 /* 610 * Deallocate an element of a probed PFS. If destroying and this is a 611 * MASTER, adjust nmasters. 612 * 613 * This function does not physically destroy the PFS element in its device 614 * under the super-root (see hammer2_ioctl_pfs_delete()). 615 */ 616 void 617 hammer2_pfsdealloc(hammer2_pfs_t *pmp, int clindex, int destroying) 618 { 619 hammer2_inode_t *iroot; 620 hammer2_chain_t *chain; 621 int j; 622 623 /* 624 * Cleanup our reference on iroot. iroot is (should) not be needed 625 * by the flush code. 626 */ 627 iroot = pmp->iroot; 628 if (iroot) { 629 /* 630 * Stop synchronizing 631 * 632 * XXX flush after acquiring the iroot lock. 633 * XXX clean out the cluster index from all inode structures. 634 */ 635 hammer2_thr_delete(&pmp->sync_thrs[clindex]); 636 637 /* 638 * Remove the cluster index from the group. If destroying 639 * the PFS and this is a master, adjust pfs_nmasters. 640 */ 641 hammer2_mtx_ex(&iroot->lock); 642 chain = iroot->cluster.array[clindex].chain; 643 iroot->cluster.array[clindex].chain = NULL; 644 645 switch(pmp->pfs_types[clindex]) { 646 case HAMMER2_PFSTYPE_MASTER: 647 if (destroying && pmp->pfs_nmasters > 0) 648 --pmp->pfs_nmasters; 649 /* XXX adjust ripdata->meta.pfs_nmasters */ 650 break; 651 default: 652 break; 653 } 654 pmp->pfs_types[clindex] = HAMMER2_PFSTYPE_NONE; 655 656 hammer2_mtx_unlock(&iroot->lock); 657 658 /* 659 * Release the chain. 660 */ 661 if (chain) { 662 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 663 hammer2_chain_drop(chain); 664 } 665 666 /* 667 * Terminate all XOP threads for the cluster index. 668 */ 669 if (pmp->xop_groups) { 670 for (j = 0; j < hammer2_xopgroups; ++j) { 671 hammer2_thr_delete( 672 &pmp->xop_groups[j].thrs[clindex]); 673 } 674 } 675 } 676 } 677 678 /* 679 * Destroy a PFS, typically only occurs after the last mount on a device 680 * has gone away. 681 */ 682 static void 683 hammer2_pfsfree(hammer2_pfs_t *pmp) 684 { 685 hammer2_inode_t *iroot; 686 hammer2_chain_t *chain; 687 int chains_still_present = 0; 688 int i; 689 int j; 690 691 /* 692 * Cleanup our reference on iroot. iroot is (should) not be needed 693 * by the flush code. 694 */ 695 if (pmp->flags & HAMMER2_PMPF_SPMP) 696 TAILQ_REMOVE(&hammer2_spmplist, pmp, mntentry); 697 else 698 TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry); 699 700 /* 701 * Cleanup chains remaining on LRU list. 702 */ 703 hammer2_spin_ex(&pmp->lru_spin); 704 while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) { 705 KKASSERT(chain->flags & HAMMER2_CHAIN_ONLRU); 706 atomic_add_int(&pmp->lru_count, -1); 707 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 708 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 709 hammer2_chain_ref(chain); 710 hammer2_spin_unex(&pmp->lru_spin); 711 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 712 hammer2_chain_drop(chain); 713 hammer2_spin_ex(&pmp->lru_spin); 714 } 715 hammer2_spin_unex(&pmp->lru_spin); 716 717 /* 718 * Clean up iroot 719 */ 720 iroot = pmp->iroot; 721 if (iroot) { 722 for (i = 0; i < iroot->cluster.nchains; ++i) { 723 hammer2_thr_delete(&pmp->sync_thrs[i]); 724 if (pmp->xop_groups) { 725 for (j = 0; j < hammer2_xopgroups; ++j) 726 hammer2_thr_delete( 727 &pmp->xop_groups[j].thrs[i]); 728 } 729 chain = iroot->cluster.array[i].chain; 730 if (chain && !RB_EMPTY(&chain->core.rbtree)) { 731 kprintf("hammer2: Warning pmp %p still " 732 "has active chains\n", pmp); 733 chains_still_present = 1; 734 } 735 } 736 #if REPORT_REFS_ERRORS 737 if (iroot->refs != 1) 738 kprintf("PMP->IROOT %p REFS WRONG %d\n", 739 iroot, iroot->refs); 740 #else 741 KKASSERT(iroot->refs == 1); 742 #endif 743 /* ref for iroot */ 744 hammer2_inode_drop(iroot); 745 pmp->iroot = NULL; 746 } 747 748 /* 749 * Free remaining pmp resources 750 */ 751 if (chains_still_present) { 752 kprintf("hammer2: cannot free pmp %p, still in use\n", pmp); 753 } else { 754 kmalloc_destroy(&pmp->mmsg); 755 kmalloc_destroy(&pmp->minode); 756 kfree(pmp, M_HAMMER2); 757 } 758 } 759 760 /* 761 * Remove all references to hmp from the pfs list. Any PFS which becomes 762 * empty is terminated and freed. 763 * 764 * XXX inefficient. 765 */ 766 static void 767 hammer2_pfsfree_scan(hammer2_dev_t *hmp, int which) 768 { 769 hammer2_pfs_t *pmp; 770 hammer2_inode_t *iroot; 771 hammer2_chain_t *rchain; 772 int i; 773 int j; 774 struct hammer2_pfslist *wlist; 775 776 if (which == 0) 777 wlist = &hammer2_pfslist; 778 else 779 wlist = &hammer2_spmplist; 780 again: 781 TAILQ_FOREACH(pmp, wlist, mntentry) { 782 if ((iroot = pmp->iroot) == NULL) 783 continue; 784 785 /* 786 * Determine if this PFS is affected. If it is we must 787 * freeze all management threads and lock its iroot. 788 * 789 * Freezing a management thread forces it idle, operations 790 * in-progress will be aborted and it will have to start 791 * over again when unfrozen, or exit if told to exit. 792 */ 793 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 794 if (pmp->pfs_hmps[i] == hmp) 795 break; 796 } 797 if (i == HAMMER2_MAXCLUSTER) 798 continue; 799 800 hammer2_vfs_sync_pmp(pmp, MNT_WAIT); 801 802 /* 803 * Make sure all synchronization threads are locked 804 * down. 805 */ 806 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 807 if (pmp->pfs_hmps[i] == NULL) 808 continue; 809 hammer2_thr_freeze_async(&pmp->sync_thrs[i]); 810 if (pmp->xop_groups) { 811 for (j = 0; j < hammer2_xopgroups; ++j) { 812 hammer2_thr_freeze_async( 813 &pmp->xop_groups[j].thrs[i]); 814 } 815 } 816 } 817 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 818 if (pmp->pfs_hmps[i] == NULL) 819 continue; 820 hammer2_thr_freeze(&pmp->sync_thrs[i]); 821 if (pmp->xop_groups) { 822 for (j = 0; j < hammer2_xopgroups; ++j) { 823 hammer2_thr_freeze( 824 &pmp->xop_groups[j].thrs[i]); 825 } 826 } 827 } 828 829 /* 830 * Lock the inode and clean out matching chains. 831 * Note that we cannot use hammer2_inode_lock_*() 832 * here because that would attempt to validate the 833 * cluster that we are in the middle of ripping 834 * apart. 835 * 836 * WARNING! We are working directly on the inodes 837 * embedded cluster. 838 */ 839 hammer2_mtx_ex(&iroot->lock); 840 841 /* 842 * Remove the chain from matching elements of the PFS. 843 */ 844 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 845 if (pmp->pfs_hmps[i] != hmp) 846 continue; 847 hammer2_thr_delete(&pmp->sync_thrs[i]); 848 if (pmp->xop_groups) { 849 for (j = 0; j < hammer2_xopgroups; ++j) { 850 hammer2_thr_delete( 851 &pmp->xop_groups[j].thrs[i]); 852 } 853 } 854 rchain = iroot->cluster.array[i].chain; 855 iroot->cluster.array[i].chain = NULL; 856 pmp->pfs_types[i] = 0; 857 if (pmp->pfs_names[i]) { 858 kfree(pmp->pfs_names[i], M_HAMMER2); 859 pmp->pfs_names[i] = NULL; 860 } 861 if (rchain) { 862 hammer2_chain_drop(rchain); 863 /* focus hint */ 864 if (iroot->cluster.focus == rchain) 865 iroot->cluster.focus = NULL; 866 } 867 pmp->pfs_hmps[i] = NULL; 868 } 869 hammer2_mtx_unlock(&iroot->lock); 870 871 /* 872 * Cleanup trailing chains. Gaps may remain. 873 */ 874 for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) { 875 if (pmp->pfs_hmps[i]) 876 break; 877 } 878 iroot->cluster.nchains = i + 1; 879 880 /* 881 * If the PMP has no elements remaining we can destroy it. 882 * (this will transition management threads from frozen->exit). 883 */ 884 if (iroot->cluster.nchains == 0) { 885 /* 886 * If this was the hmp's spmp, we need to clean 887 * a little more stuff out. 888 */ 889 if (hmp->spmp == pmp) { 890 hmp->spmp = NULL; 891 hmp->vchain.pmp = NULL; 892 hmp->fchain.pmp = NULL; 893 } 894 895 /* 896 * Free the pmp and restart the loop 897 */ 898 KKASSERT(TAILQ_EMPTY(&pmp->syncq)); 899 KKASSERT(TAILQ_EMPTY(&pmp->depq)); 900 hammer2_pfsfree(pmp); 901 goto again; 902 } 903 904 /* 905 * If elements still remain we need to set the REMASTER 906 * flag and unfreeze it. 907 */ 908 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 909 if (pmp->pfs_hmps[i] == NULL) 910 continue; 911 hammer2_thr_remaster(&pmp->sync_thrs[i]); 912 hammer2_thr_unfreeze(&pmp->sync_thrs[i]); 913 if (pmp->xop_groups) { 914 for (j = 0; j < hammer2_xopgroups; ++j) { 915 hammer2_thr_remaster( 916 &pmp->xop_groups[j].thrs[i]); 917 hammer2_thr_unfreeze( 918 &pmp->xop_groups[j].thrs[i]); 919 } 920 } 921 } 922 } 923 } 924 925 /* 926 * Mount or remount HAMMER2 fileystem from physical media 927 * 928 * mountroot 929 * mp mount point structure 930 * path NULL 931 * data <unused> 932 * cred <unused> 933 * 934 * mount 935 * mp mount point structure 936 * path path to mount point 937 * data pointer to argument structure in user space 938 * volume volume path (device@LABEL form) 939 * hflags user mount flags 940 * cred user credentials 941 * 942 * RETURNS: 0 Success 943 * !0 error number 944 */ 945 static 946 int 947 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, 948 struct ucred *cred) 949 { 950 struct hammer2_mount_info info; 951 hammer2_pfs_t *pmp; 952 hammer2_pfs_t *spmp; 953 hammer2_dev_t *hmp; 954 hammer2_dev_t *force_local; 955 hammer2_key_t key_next; 956 hammer2_key_t key_dummy; 957 hammer2_key_t lhc; 958 struct vnode *devvp; 959 struct nlookupdata nd; 960 hammer2_chain_t *parent; 961 hammer2_chain_t *chain; 962 const hammer2_inode_data_t *ripdata; 963 hammer2_blockref_t bref; 964 struct file *fp; 965 char devstr[MNAMELEN]; 966 size_t size; 967 size_t done; 968 char *dev; 969 char *label; 970 int ronly = 1; 971 int error; 972 int i; 973 974 hmp = NULL; 975 pmp = NULL; 976 dev = NULL; 977 label = NULL; 978 devvp = NULL; 979 980 if (path == NULL) { 981 /* 982 * Root mount 983 */ 984 bzero(&info, sizeof(info)); 985 info.cluster_fd = -1; 986 ksnprintf(devstr, sizeof(devstr), "%s", 987 mp->mnt_stat.f_mntfromname); 988 kprintf("hammer2_mount: root '%s'\n", devstr); 989 done = strlen(devstr) + 1; 990 } else { 991 /* 992 * Non-root mount or updating a mount 993 */ 994 error = copyin(data, &info, sizeof(info)); 995 if (error) 996 return (error); 997 998 error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done); 999 if (error) 1000 return (error); 1001 kprintf("hammer2_mount: '%s'\n", devstr); 1002 } 1003 1004 /* 1005 * Extract device and label, automatically mount @BOOT, @ROOT, or @DATA 1006 * if no label specified, based on the partition id. Error out if no 1007 * label or device (with partition id) is specified. This is strictly 1008 * a convenience to match the default label created by newfs_hammer2, 1009 * our preference is that a label always be specified. 1010 * 1011 * NOTE: We allow 'mount @LABEL <blah>'... that is, a mount command 1012 * that does not specify a device, as long as some H2 label 1013 * has already been mounted from that device. This makes 1014 * mounting snapshots a lot easier. 1015 */ 1016 dev = devstr; 1017 label = strchr(devstr, '@'); 1018 if (label && ((label + 1) - dev) > done) { 1019 kprintf("hammer2: mount: bad label %s/%zd\n", 1020 devstr, done); 1021 return (EINVAL); 1022 } 1023 if (label == NULL || label[1] == 0) { 1024 char slice; 1025 1026 if (label == NULL) 1027 label = devstr + strlen(devstr); 1028 else 1029 *label = '\0'; /* clean up trailing @ */ 1030 1031 slice = label[-1]; 1032 switch(slice) { 1033 case 'a': 1034 label = "BOOT"; 1035 break; 1036 case 'd': 1037 label = "ROOT"; 1038 break; 1039 default: 1040 label = "DATA"; 1041 break; 1042 } 1043 } else { 1044 *label = '\0'; 1045 label++; 1046 } 1047 1048 kprintf("hammer2_mount: dev=\"%s\" label=\"%s\" rdonly=%d\n", 1049 dev, label, (mp->mnt_flag & MNT_RDONLY)); 1050 1051 if (mp->mnt_flag & MNT_UPDATE) { 1052 /* 1053 * Update mount. Note that pmp->iroot->cluster is 1054 * an inode-embedded cluster and thus cannot be 1055 * directly locked. 1056 * 1057 * XXX HAMMER2 needs to implement NFS export via 1058 * mountctl. 1059 */ 1060 hammer2_cluster_t *cluster; 1061 1062 pmp = MPTOPMP(mp); 1063 pmp->hflags = info.hflags; 1064 cluster = &pmp->iroot->cluster; 1065 for (i = 0; i < cluster->nchains; ++i) { 1066 if (cluster->array[i].chain == NULL) 1067 continue; 1068 hmp = cluster->array[i].chain->hmp; 1069 devvp = hmp->devvp; 1070 error = hammer2_remount(hmp, mp, path, 1071 devvp, cred); 1072 if (error) 1073 break; 1074 } 1075 1076 return error; 1077 } 1078 1079 /* 1080 * HMP device mount 1081 * 1082 * If a path is specified and dev is not an empty string, lookup the 1083 * name and verify that it referes to a block device. 1084 * 1085 * If a path is specified and dev is an empty string we fall through 1086 * and locate the label in the hmp search. 1087 */ 1088 if (path && *dev != 0) { 1089 error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW); 1090 if (error == 0) 1091 error = nlookup(&nd); 1092 if (error == 0) 1093 error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp); 1094 nlookup_done(&nd); 1095 } else if (path == NULL) { 1096 /* root mount */ 1097 cdev_t cdev = kgetdiskbyname(dev); 1098 error = bdevvp(cdev, &devvp); 1099 if (error) 1100 kprintf("hammer2: cannot find '%s'\n", dev); 1101 } else { 1102 /* 1103 * We will locate the hmp using the label in the hmp loop. 1104 */ 1105 error = 0; 1106 } 1107 1108 /* 1109 * Make sure its a block device. Do not check to see if it is 1110 * already mounted until we determine that its a fresh H2 device. 1111 */ 1112 if (error == 0 && devvp) { 1113 vn_isdisk(devvp, &error); 1114 } 1115 1116 /* 1117 * Determine if the device has already been mounted. After this 1118 * check hmp will be non-NULL if we are doing the second or more 1119 * hammer2 mounts from the same device. 1120 */ 1121 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE); 1122 if (devvp) { 1123 /* 1124 * Match the device. Due to the way devfs works, 1125 * we may not be able to directly match the vnode pointer, 1126 * so also check to see if the underlying device matches. 1127 */ 1128 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) { 1129 if (hmp->devvp == devvp) 1130 break; 1131 if (devvp->v_rdev && 1132 hmp->devvp->v_rdev == devvp->v_rdev) { 1133 break; 1134 } 1135 } 1136 1137 /* 1138 * If no match this may be a fresh H2 mount, make sure 1139 * the device is not mounted on anything else. 1140 */ 1141 if (hmp == NULL) 1142 error = vfs_mountedon(devvp); 1143 } else if (error == 0) { 1144 /* 1145 * Match the label to a pmp already probed. 1146 */ 1147 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) { 1148 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 1149 if (pmp->pfs_names[i] && 1150 strcmp(pmp->pfs_names[i], label) == 0) { 1151 hmp = pmp->pfs_hmps[i]; 1152 break; 1153 } 1154 } 1155 if (hmp) 1156 break; 1157 } 1158 if (hmp == NULL) 1159 error = ENOENT; 1160 } 1161 1162 /* 1163 * Open the device if this isn't a secondary mount and construct 1164 * the H2 device mount (hmp). 1165 */ 1166 if (hmp == NULL) { 1167 hammer2_chain_t *schain; 1168 hammer2_xid_t xid; 1169 hammer2_xop_head_t xop; 1170 1171 if (error == 0 && vcount(devvp) > 0) { 1172 kprintf("Primary device already has references\n"); 1173 error = EBUSY; 1174 } 1175 1176 /* 1177 * Now open the device 1178 */ 1179 if (error == 0) { 1180 ronly = ((mp->mnt_flag & MNT_RDONLY) != 0); 1181 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1182 error = vinvalbuf(devvp, V_SAVE, 0, 0); 1183 if (error == 0) { 1184 error = VOP_OPEN(devvp, 1185 (ronly ? FREAD : FREAD | FWRITE), 1186 FSCRED, NULL); 1187 } 1188 vn_unlock(devvp); 1189 } 1190 if (error && devvp) { 1191 vrele(devvp); 1192 devvp = NULL; 1193 } 1194 if (error) { 1195 lockmgr(&hammer2_mntlk, LK_RELEASE); 1196 return error; 1197 } 1198 hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO); 1199 ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev); 1200 hmp->ronly = ronly; 1201 hmp->devvp = devvp; 1202 hmp->hflags = info.hflags & HMNT2_DEVFLAGS; 1203 kmalloc_create(&hmp->mchain, "HAMMER2-chains"); 1204 TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry); 1205 RB_INIT(&hmp->iotree); 1206 spin_init(&hmp->io_spin, "h2mount_io"); 1207 spin_init(&hmp->list_spin, "h2mount_list"); 1208 1209 lockinit(&hmp->vollk, "h2vol", 0, 0); 1210 lockinit(&hmp->bulklk, "h2bulk", 0, 0); 1211 lockinit(&hmp->bflock, "h2bflk", 0, 0); 1212 1213 /* 1214 * vchain setup. vchain.data is embedded. 1215 * vchain.refs is initialized and will never drop to 0. 1216 * 1217 * NOTE! voldata is not yet loaded. 1218 */ 1219 hmp->vchain.hmp = hmp; 1220 hmp->vchain.refs = 1; 1221 hmp->vchain.data = (void *)&hmp->voldata; 1222 hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME; 1223 hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; 1224 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; 1225 1226 hammer2_chain_core_init(&hmp->vchain); 1227 /* hmp->vchain.u.xxx is left NULL */ 1228 1229 /* 1230 * fchain setup. fchain.data is embedded. 1231 * fchain.refs is initialized and will never drop to 0. 1232 * 1233 * The data is not used but needs to be initialized to 1234 * pass assertion muster. We use this chain primarily 1235 * as a placeholder for the freemap's top-level RBTREE 1236 * so it does not interfere with the volume's topology 1237 * RBTREE. 1238 */ 1239 hmp->fchain.hmp = hmp; 1240 hmp->fchain.refs = 1; 1241 hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset; 1242 hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP; 1243 hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; 1244 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; 1245 hmp->fchain.bref.methods = 1246 HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) | 1247 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE); 1248 1249 hammer2_chain_core_init(&hmp->fchain); 1250 /* hmp->fchain.u.xxx is left NULL */ 1251 1252 /* 1253 * Install the volume header and initialize fields from 1254 * voldata. 1255 */ 1256 error = hammer2_install_volume_header(hmp); 1257 if (error) { 1258 hammer2_unmount_helper(mp, NULL, hmp); 1259 lockmgr(&hammer2_mntlk, LK_RELEASE); 1260 hammer2_vfs_unmount(mp, MNT_FORCE); 1261 return error; 1262 } 1263 1264 /* 1265 * Really important to get these right or the flush and 1266 * teardown code will get confused. 1267 */ 1268 hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL); 1269 spmp = hmp->spmp; 1270 spmp->pfs_hmps[0] = hmp; 1271 1272 /* 1273 * Dummy-up vchain and fchain's modify_tid. mirror_tid 1274 * is inherited from the volume header. 1275 */ 1276 xid = 0; 1277 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; 1278 hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid; 1279 hmp->vchain.pmp = spmp; 1280 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; 1281 hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid; 1282 hmp->fchain.pmp = spmp; 1283 1284 /* 1285 * First locate the super-root inode, which is key 0 1286 * relative to the volume header's blockset. 1287 * 1288 * Then locate the root inode by scanning the directory keyspace 1289 * represented by the label. 1290 */ 1291 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 1292 schain = hammer2_chain_lookup(&parent, &key_dummy, 1293 HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY, 1294 &error, 0); 1295 hammer2_chain_lookup_done(parent); 1296 if (schain == NULL) { 1297 kprintf("hammer2_mount: invalid super-root\n"); 1298 hammer2_unmount_helper(mp, NULL, hmp); 1299 lockmgr(&hammer2_mntlk, LK_RELEASE); 1300 hammer2_vfs_unmount(mp, MNT_FORCE); 1301 return EINVAL; 1302 } 1303 if (schain->error) { 1304 kprintf("hammer2_mount: error %s reading super-root\n", 1305 hammer2_error_str(schain->error)); 1306 hammer2_chain_unlock(schain); 1307 hammer2_chain_drop(schain); 1308 schain = NULL; 1309 hammer2_unmount_helper(mp, NULL, hmp); 1310 lockmgr(&hammer2_mntlk, LK_RELEASE); 1311 hammer2_vfs_unmount(mp, MNT_FORCE); 1312 return EINVAL; 1313 } 1314 1315 /* 1316 * The super-root always uses an inode_tid of 1 when 1317 * creating PFSs. 1318 */ 1319 spmp->inode_tid = 1; 1320 spmp->modify_tid = schain->bref.modify_tid + 1; 1321 1322 /* 1323 * Sanity-check schain's pmp and finish initialization. 1324 * Any chain belonging to the super-root topology should 1325 * have a NULL pmp (not even set to spmp). 1326 */ 1327 ripdata = &hammer2_chain_rdata(schain)->ipdata; 1328 KKASSERT(schain->pmp == NULL); 1329 spmp->pfs_clid = ripdata->meta.pfs_clid; 1330 1331 /* 1332 * Replace the dummy spmp->iroot with a real one. It's 1333 * easier to just do a wholesale replacement than to try 1334 * to update the chain and fixup the iroot fields. 1335 * 1336 * The returned inode is locked with the supplied cluster. 1337 */ 1338 hammer2_dummy_xop_from_chain(&xop, schain); 1339 hammer2_inode_drop(spmp->iroot); 1340 spmp->iroot = NULL; 1341 spmp->iroot = hammer2_inode_get(spmp, &xop, -1, -1); 1342 spmp->spmp_hmp = hmp; 1343 spmp->pfs_types[0] = ripdata->meta.pfs_type; 1344 spmp->pfs_hmps[0] = hmp; 1345 hammer2_inode_ref(spmp->iroot); 1346 hammer2_inode_unlock(spmp->iroot); 1347 hammer2_cluster_unlock(&xop.cluster); 1348 hammer2_chain_drop(schain); 1349 /* do not call hammer2_cluster_drop() on an embedded cluster */ 1350 schain = NULL; /* now invalid */ 1351 /* leave spmp->iroot with one ref */ 1352 1353 if ((mp->mnt_flag & MNT_RDONLY) == 0) { 1354 error = hammer2_recovery(hmp); 1355 if (error == 0) 1356 error |= hammer2_fixup_pfses(hmp); 1357 /* XXX do something with error */ 1358 } 1359 hammer2_update_pmps(hmp); 1360 hammer2_iocom_init(hmp); 1361 hammer2_bulkfree_init(hmp); 1362 1363 /* 1364 * Ref the cluster management messaging descriptor. The mount 1365 * program deals with the other end of the communications pipe. 1366 * 1367 * Root mounts typically do not supply one. 1368 */ 1369 if (info.cluster_fd >= 0) { 1370 fp = holdfp(curthread, info.cluster_fd, -1); 1371 if (fp) { 1372 hammer2_cluster_reconnect(hmp, fp); 1373 } else { 1374 kprintf("hammer2_mount: bad cluster_fd!\n"); 1375 } 1376 } 1377 } else { 1378 spmp = hmp->spmp; 1379 if (info.hflags & HMNT2_DEVFLAGS) { 1380 kprintf("hammer2: Warning: mount flags pertaining " 1381 "to the whole device may only be specified " 1382 "on the first mount of the device: %08x\n", 1383 info.hflags & HMNT2_DEVFLAGS); 1384 } 1385 } 1386 1387 /* 1388 * Force local mount (disassociate all PFSs from their clusters). 1389 * Used primarily for debugging. 1390 */ 1391 force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL; 1392 1393 /* 1394 * Lookup the mount point under the media-localized super-root. 1395 * Scanning hammer2_pfslist doesn't help us because it represents 1396 * PFS cluster ids which can aggregate several named PFSs together. 1397 * 1398 * cluster->pmp will incorrectly point to spmp and must be fixed 1399 * up later on. 1400 */ 1401 hammer2_inode_lock(spmp->iroot, 0); 1402 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 1403 lhc = hammer2_dirhash(label, strlen(label)); 1404 chain = hammer2_chain_lookup(&parent, &key_next, 1405 lhc, lhc + HAMMER2_DIRHASH_LOMASK, 1406 &error, 0); 1407 while (chain) { 1408 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 1409 strcmp(label, chain->data->ipdata.filename) == 0) { 1410 break; 1411 } 1412 chain = hammer2_chain_next(&parent, chain, &key_next, 1413 key_next, 1414 lhc + HAMMER2_DIRHASH_LOMASK, 1415 &error, 0); 1416 } 1417 if (parent) { 1418 hammer2_chain_unlock(parent); 1419 hammer2_chain_drop(parent); 1420 } 1421 hammer2_inode_unlock(spmp->iroot); 1422 1423 /* 1424 * PFS could not be found? 1425 */ 1426 if (chain == NULL) { 1427 if (error) 1428 kprintf("hammer2_mount: PFS label I/O error\n"); 1429 else 1430 kprintf("hammer2_mount: PFS label not found\n"); 1431 hammer2_unmount_helper(mp, NULL, hmp); 1432 lockmgr(&hammer2_mntlk, LK_RELEASE); 1433 hammer2_vfs_unmount(mp, MNT_FORCE); 1434 1435 return EINVAL; 1436 } 1437 1438 /* 1439 * Acquire the pmp structure (it should have already been allocated 1440 * via hammer2_update_pmps() so do not pass cluster in to add to 1441 * available chains). 1442 * 1443 * Check if the cluster has already been mounted. A cluster can 1444 * only be mounted once, use null mounts to mount additional copies. 1445 */ 1446 if (chain->error) { 1447 kprintf("hammer2_mount: PFS label I/O error\n"); 1448 } else { 1449 ripdata = &chain->data->ipdata; 1450 bref = chain->bref; 1451 pmp = hammer2_pfsalloc(NULL, ripdata, 1452 bref.modify_tid, force_local); 1453 } 1454 hammer2_chain_unlock(chain); 1455 hammer2_chain_drop(chain); 1456 1457 /* 1458 * Finish the mount 1459 */ 1460 kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp); 1461 1462 if (pmp->mp) { 1463 kprintf("hammer2_mount: PFS already mounted!\n"); 1464 hammer2_unmount_helper(mp, NULL, hmp); 1465 lockmgr(&hammer2_mntlk, LK_RELEASE); 1466 hammer2_vfs_unmount(mp, MNT_FORCE); 1467 1468 return EBUSY; 1469 } 1470 1471 pmp->hflags = info.hflags; 1472 mp->mnt_flag |= MNT_LOCAL; 1473 mp->mnt_kern_flag |= MNTK_ALL_MPSAFE; /* all entry pts are SMP */ 1474 mp->mnt_kern_flag |= MNTK_THR_SYNC; /* new vsyncscan semantics */ 1475 1476 /* 1477 * required mount structure initializations 1478 */ 1479 mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE; 1480 mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE; 1481 1482 mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE; 1483 mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE; 1484 1485 /* 1486 * Optional fields 1487 */ 1488 mp->mnt_iosize_max = MAXPHYS; 1489 1490 /* 1491 * Connect up mount pointers. 1492 */ 1493 hammer2_mount_helper(mp, pmp); 1494 1495 lockmgr(&hammer2_mntlk, LK_RELEASE); 1496 1497 /* 1498 * Finish setup 1499 */ 1500 vfs_getnewfsid(mp); 1501 vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops); 1502 vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops); 1503 vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops); 1504 1505 if (path) { 1506 copyinstr(info.volume, mp->mnt_stat.f_mntfromname, 1507 MNAMELEN - 1, &size); 1508 bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); 1509 } /* else root mount, already in there */ 1510 1511 bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname)); 1512 if (path) { 1513 copyinstr(path, mp->mnt_stat.f_mntonname, 1514 sizeof(mp->mnt_stat.f_mntonname) - 1, 1515 &size); 1516 } else { 1517 /* root mount */ 1518 mp->mnt_stat.f_mntonname[0] = '/'; 1519 } 1520 1521 /* 1522 * Initial statfs to prime mnt_stat. 1523 */ 1524 hammer2_vfs_statfs(mp, &mp->mnt_stat, cred); 1525 1526 return 0; 1527 } 1528 1529 /* 1530 * Scan PFSs under the super-root and create hammer2_pfs structures. 1531 */ 1532 static 1533 void 1534 hammer2_update_pmps(hammer2_dev_t *hmp) 1535 { 1536 const hammer2_inode_data_t *ripdata; 1537 hammer2_chain_t *parent; 1538 hammer2_chain_t *chain; 1539 hammer2_blockref_t bref; 1540 hammer2_dev_t *force_local; 1541 hammer2_pfs_t *spmp; 1542 hammer2_pfs_t *pmp; 1543 hammer2_key_t key_next; 1544 int error; 1545 1546 /* 1547 * Force local mount (disassociate all PFSs from their clusters). 1548 * Used primarily for debugging. 1549 */ 1550 force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL; 1551 1552 /* 1553 * Lookup mount point under the media-localized super-root. 1554 * 1555 * cluster->pmp will incorrectly point to spmp and must be fixed 1556 * up later on. 1557 */ 1558 spmp = hmp->spmp; 1559 hammer2_inode_lock(spmp->iroot, 0); 1560 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 1561 chain = hammer2_chain_lookup(&parent, &key_next, 1562 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 1563 &error, 0); 1564 while (chain) { 1565 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) 1566 continue; 1567 if (chain->error) { 1568 kprintf("I/O error scanning PFS labels\n"); 1569 } else { 1570 ripdata = &chain->data->ipdata; 1571 bref = chain->bref; 1572 1573 pmp = hammer2_pfsalloc(chain, ripdata, 1574 bref.modify_tid, force_local); 1575 } 1576 chain = hammer2_chain_next(&parent, chain, &key_next, 1577 key_next, HAMMER2_KEY_MAX, 1578 &error, 0); 1579 } 1580 if (parent) { 1581 hammer2_chain_unlock(parent); 1582 hammer2_chain_drop(parent); 1583 } 1584 hammer2_inode_unlock(spmp->iroot); 1585 } 1586 1587 static 1588 int 1589 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path __unused, 1590 struct vnode *devvp, struct ucred *cred) 1591 { 1592 int error; 1593 1594 if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { 1595 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1596 VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, NULL); 1597 vn_unlock(devvp); 1598 error = hammer2_recovery(hmp); 1599 if (error == 0) 1600 error |= hammer2_fixup_pfses(hmp); 1601 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1602 if (error == 0) { 1603 VOP_CLOSE(devvp, FREAD, NULL); 1604 hmp->ronly = 0; 1605 } else { 1606 VOP_CLOSE(devvp, FREAD | FWRITE, NULL); 1607 } 1608 vn_unlock(devvp); 1609 } else { 1610 error = 0; 1611 } 1612 return error; 1613 } 1614 1615 static 1616 int 1617 hammer2_vfs_unmount(struct mount *mp, int mntflags) 1618 { 1619 hammer2_pfs_t *pmp; 1620 int flags; 1621 int error = 0; 1622 1623 pmp = MPTOPMP(mp); 1624 1625 if (pmp == NULL) 1626 return(0); 1627 1628 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE); 1629 1630 /* 1631 * If mount initialization proceeded far enough we must flush 1632 * its vnodes and sync the underlying mount points. Three syncs 1633 * are required to fully flush the filesystem (freemap updates lag 1634 * by one flush, and one extra for safety). 1635 */ 1636 if (mntflags & MNT_FORCE) 1637 flags = FORCECLOSE; 1638 else 1639 flags = 0; 1640 if (pmp->iroot) { 1641 error = vflush(mp, 0, flags); 1642 if (error) 1643 goto failed; 1644 hammer2_vfs_sync(mp, MNT_WAIT); 1645 hammer2_vfs_sync(mp, MNT_WAIT); 1646 hammer2_vfs_sync(mp, MNT_WAIT); 1647 } 1648 1649 /* 1650 * Cleanup the frontend support XOPS threads 1651 */ 1652 hammer2_xop_helper_cleanup(pmp); 1653 1654 if (pmp->mp) 1655 hammer2_unmount_helper(mp, pmp, NULL); 1656 1657 error = 0; 1658 failed: 1659 lockmgr(&hammer2_mntlk, LK_RELEASE); 1660 1661 return (error); 1662 } 1663 1664 /* 1665 * Mount helper, hook the system mount into our PFS. 1666 * The mount lock is held. 1667 * 1668 * We must bump the mount_count on related devices for any 1669 * mounted PFSs. 1670 */ 1671 static 1672 void 1673 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp) 1674 { 1675 hammer2_cluster_t *cluster; 1676 hammer2_chain_t *rchain; 1677 int i; 1678 1679 mp->mnt_data = (qaddr_t)pmp; 1680 pmp->mp = mp; 1681 1682 /* 1683 * After pmp->mp is set we have to adjust hmp->mount_count. 1684 */ 1685 cluster = &pmp->iroot->cluster; 1686 for (i = 0; i < cluster->nchains; ++i) { 1687 rchain = cluster->array[i].chain; 1688 if (rchain == NULL) 1689 continue; 1690 ++rchain->hmp->mount_count; 1691 } 1692 1693 /* 1694 * Create missing Xop threads 1695 */ 1696 hammer2_xop_helper_create(pmp); 1697 } 1698 1699 /* 1700 * Mount helper, unhook the system mount from our PFS. 1701 * The mount lock is held. 1702 * 1703 * If hmp is supplied a mount responsible for being the first to open 1704 * the block device failed and the block device and all PFSs using the 1705 * block device must be cleaned up. 1706 * 1707 * If pmp is supplied multiple devices might be backing the PFS and each 1708 * must be disconnected. This might not be the last PFS using some of the 1709 * underlying devices. Also, we have to adjust our hmp->mount_count 1710 * accounting for the devices backing the pmp which is now undergoing an 1711 * unmount. 1712 */ 1713 static 1714 void 1715 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp) 1716 { 1717 hammer2_cluster_t *cluster; 1718 hammer2_chain_t *rchain; 1719 struct vnode *devvp; 1720 int dumpcnt; 1721 int ronly; 1722 int i; 1723 1724 /* 1725 * If no device supplied this is a high-level unmount and we have to 1726 * to disconnect the mount, adjust mount_count, and locate devices 1727 * that might now have no mounts. 1728 */ 1729 if (pmp) { 1730 KKASSERT(hmp == NULL); 1731 KKASSERT((void *)(intptr_t)mp->mnt_data == pmp); 1732 pmp->mp = NULL; 1733 mp->mnt_data = NULL; 1734 1735 /* 1736 * After pmp->mp is cleared we have to account for 1737 * mount_count. 1738 */ 1739 cluster = &pmp->iroot->cluster; 1740 for (i = 0; i < cluster->nchains; ++i) { 1741 rchain = cluster->array[i].chain; 1742 if (rchain == NULL) 1743 continue; 1744 --rchain->hmp->mount_count; 1745 /* scrapping hmp now may invalidate the pmp */ 1746 } 1747 again: 1748 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) { 1749 if (hmp->mount_count == 0) { 1750 hammer2_unmount_helper(NULL, NULL, hmp); 1751 goto again; 1752 } 1753 } 1754 return; 1755 } 1756 1757 /* 1758 * Try to terminate the block device. We can't terminate it if 1759 * there are still PFSs referencing it. 1760 */ 1761 if (hmp->mount_count) 1762 return; 1763 1764 /* 1765 * Decomission the network before we start messing with the 1766 * device and PFS. 1767 */ 1768 hammer2_iocom_uninit(hmp); 1769 1770 hammer2_bulkfree_uninit(hmp); 1771 hammer2_pfsfree_scan(hmp, 0); 1772 #if 0 1773 hammer2_dev_exlock(hmp); /* XXX order */ 1774 #endif 1775 1776 /* 1777 * Cycle the volume data lock as a safety (probably not needed any 1778 * more). To ensure everything is out we need to flush at least 1779 * three times. (1) The running of the sideq can dirty the 1780 * filesystem, (2) A normal flush can dirty the freemap, and 1781 * (3) ensure that the freemap is fully synchronized. 1782 * 1783 * The next mount's recovery scan can clean everything up but we want 1784 * to leave the filesystem in a 100% clean state on a normal unmount. 1785 */ 1786 #if 0 1787 hammer2_voldata_lock(hmp); 1788 hammer2_voldata_unlock(hmp); 1789 #endif 1790 1791 /* 1792 * Flush whatever is left. Unmounted but modified PFS's might still 1793 * have some dirty chains on them. 1794 */ 1795 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS); 1796 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); 1797 1798 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) { 1799 hammer2_voldata_modify(hmp); 1800 hammer2_flush(&hmp->fchain, HAMMER2_FLUSH_TOP | 1801 HAMMER2_FLUSH_ALL); 1802 } 1803 hammer2_chain_unlock(&hmp->fchain); 1804 1805 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) { 1806 hammer2_flush(&hmp->vchain, HAMMER2_FLUSH_TOP | 1807 HAMMER2_FLUSH_ALL); 1808 } 1809 hammer2_chain_unlock(&hmp->vchain); 1810 1811 if ((hmp->vchain.flags | hmp->fchain.flags) & 1812 HAMMER2_CHAIN_FLUSH_MASK) { 1813 kprintf("hammer2_unmount: chains left over " 1814 "after final sync\n"); 1815 kprintf(" vchain %08x\n", hmp->vchain.flags); 1816 kprintf(" fchain %08x\n", hmp->fchain.flags); 1817 1818 if (hammer2_debug & 0x0010) 1819 Debugger("entered debugger"); 1820 } 1821 1822 hammer2_pfsfree_scan(hmp, 1); 1823 1824 KKASSERT(hmp->spmp == NULL); 1825 1826 /* 1827 * Finish up with the device vnode 1828 */ 1829 if ((devvp = hmp->devvp) != NULL) { 1830 ronly = hmp->ronly; 1831 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1832 kprintf("hammer2_unmount(A): devvp %s rbdirty %p ronly=%d\n", 1833 hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree), 1834 ronly); 1835 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0); 1836 kprintf("hammer2_unmount(B): devvp %s rbdirty %p\n", 1837 hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree)); 1838 hmp->devvp = NULL; 1839 VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL); 1840 vn_unlock(devvp); 1841 vrele(devvp); 1842 devvp = NULL; 1843 } 1844 1845 /* 1846 * Clear vchain/fchain flags that might prevent final cleanup 1847 * of these chains. 1848 */ 1849 if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) { 1850 atomic_add_long(&hammer2_count_modified_chains, -1); 1851 atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED); 1852 hammer2_pfs_memory_wakeup(hmp->vchain.pmp, -1); 1853 } 1854 if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) { 1855 atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_UPDATE); 1856 } 1857 1858 if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) { 1859 atomic_add_long(&hammer2_count_modified_chains, -1); 1860 atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_MODIFIED); 1861 hammer2_pfs_memory_wakeup(hmp->fchain.pmp, -1); 1862 } 1863 if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) { 1864 atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_UPDATE); 1865 } 1866 1867 /* 1868 * Final drop of embedded freemap root chain to 1869 * clean up fchain.core (fchain structure is not 1870 * flagged ALLOCATED so it is cleaned out and then 1871 * left to rot). 1872 */ 1873 hammer2_chain_drop(&hmp->fchain); 1874 1875 /* 1876 * Final drop of embedded volume root chain to clean 1877 * up vchain.core (vchain structure is not flagged 1878 * ALLOCATED so it is cleaned out and then left to 1879 * rot). 1880 */ 1881 dumpcnt = 50; 1882 hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1); 1883 dumpcnt = 50; 1884 hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1); 1885 #if 0 1886 hammer2_dev_unlock(hmp); 1887 #endif 1888 hammer2_chain_drop(&hmp->vchain); 1889 1890 hammer2_io_cleanup(hmp, &hmp->iotree); 1891 if (hmp->iofree_count) { 1892 kprintf("io_cleanup: %d I/O's left hanging\n", 1893 hmp->iofree_count); 1894 } 1895 1896 TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry); 1897 kmalloc_destroy(&hmp->mchain); 1898 kfree(hmp, M_HAMMER2); 1899 } 1900 1901 int 1902 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp, 1903 ino_t ino, struct vnode **vpp) 1904 { 1905 hammer2_xop_lookup_t *xop; 1906 hammer2_pfs_t *pmp; 1907 hammer2_inode_t *ip; 1908 hammer2_tid_t inum; 1909 int error; 1910 1911 inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK; 1912 1913 error = 0; 1914 pmp = MPTOPMP(mp); 1915 1916 /* 1917 * Easy if we already have it cached 1918 */ 1919 ip = hammer2_inode_lookup(pmp, inum); 1920 if (ip) { 1921 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 1922 *vpp = hammer2_igetv(ip, &error); 1923 hammer2_inode_unlock(ip); 1924 hammer2_inode_drop(ip); /* from lookup */ 1925 1926 return error; 1927 } 1928 1929 /* 1930 * Otherwise we have to find the inode 1931 */ 1932 xop = hammer2_xop_alloc(pmp->iroot, 0); 1933 xop->lhc = inum; 1934 hammer2_xop_start(&xop->head, &hammer2_lookup_desc); 1935 error = hammer2_xop_collect(&xop->head, 0); 1936 1937 if (error == 0) 1938 ip = hammer2_inode_get(pmp, &xop->head, -1, -1); 1939 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1940 1941 if (ip) { 1942 *vpp = hammer2_igetv(ip, &error); 1943 hammer2_inode_unlock(ip); 1944 } else { 1945 *vpp = NULL; 1946 error = ENOENT; 1947 } 1948 return (error); 1949 } 1950 1951 static 1952 int 1953 hammer2_vfs_root(struct mount *mp, struct vnode **vpp) 1954 { 1955 hammer2_pfs_t *pmp; 1956 struct vnode *vp; 1957 int error; 1958 1959 pmp = MPTOPMP(mp); 1960 if (pmp->iroot == NULL) { 1961 kprintf("hammer2 (%s): no root inode\n", 1962 mp->mnt_stat.f_mntfromname); 1963 *vpp = NULL; 1964 return EINVAL; 1965 } 1966 1967 error = 0; 1968 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 1969 1970 while (pmp->inode_tid == 0) { 1971 hammer2_xop_ipcluster_t *xop; 1972 const hammer2_inode_meta_t *meta; 1973 1974 xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING); 1975 hammer2_xop_start(&xop->head, &hammer2_ipcluster_desc); 1976 error = hammer2_xop_collect(&xop->head, 0); 1977 1978 if (error == 0) { 1979 meta = &hammer2_xop_gdata(&xop->head)->ipdata.meta; 1980 pmp->iroot->meta = *meta; 1981 pmp->inode_tid = meta->pfs_inum + 1; 1982 hammer2_xop_pdata(&xop->head); 1983 /* meta invalid */ 1984 1985 if (pmp->inode_tid < HAMMER2_INODE_START) 1986 pmp->inode_tid = HAMMER2_INODE_START; 1987 pmp->modify_tid = 1988 xop->head.cluster.focus->bref.modify_tid + 1; 1989 #if 0 1990 kprintf("PFS: Starting inode %jd\n", 1991 (intmax_t)pmp->inode_tid); 1992 kprintf("PMP focus good set nextino=%ld mod=%016jx\n", 1993 pmp->inode_tid, pmp->modify_tid); 1994 #endif 1995 wakeup(&pmp->iroot); 1996 1997 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 1998 1999 /* 2000 * Prime the mount info. 2001 */ 2002 hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL); 2003 break; 2004 } 2005 2006 /* 2007 * Loop, try again 2008 */ 2009 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2010 hammer2_inode_unlock(pmp->iroot); 2011 error = tsleep(&pmp->iroot, PCATCH, "h2root", hz); 2012 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED); 2013 if (error == EINTR) 2014 break; 2015 } 2016 2017 if (error) { 2018 hammer2_inode_unlock(pmp->iroot); 2019 *vpp = NULL; 2020 } else { 2021 vp = hammer2_igetv(pmp->iroot, &error); 2022 hammer2_inode_unlock(pmp->iroot); 2023 *vpp = vp; 2024 } 2025 2026 return (error); 2027 } 2028 2029 /* 2030 * Filesystem status 2031 * 2032 * XXX incorporate ipdata->meta.inode_quota and data_quota 2033 */ 2034 static 2035 int 2036 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) 2037 { 2038 hammer2_pfs_t *pmp; 2039 hammer2_dev_t *hmp; 2040 hammer2_blockref_t bref; 2041 struct statfs tmp; 2042 int i; 2043 2044 /* 2045 * NOTE: iroot might not have validated the cluster yet. 2046 */ 2047 pmp = MPTOPMP(mp); 2048 2049 bzero(&tmp, sizeof(tmp)); 2050 2051 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2052 hmp = pmp->pfs_hmps[i]; 2053 if (hmp == NULL) 2054 continue; 2055 if (pmp->iroot->cluster.array[i].chain) 2056 bref = pmp->iroot->cluster.array[i].chain->bref; 2057 else 2058 bzero(&bref, sizeof(bref)); 2059 2060 tmp.f_files = bref.embed.stats.inode_count; 2061 tmp.f_ffree = 0; 2062 tmp.f_blocks = hmp->voldata.allocator_size / 2063 mp->mnt_vstat.f_bsize; 2064 tmp.f_bfree = hmp->voldata.allocator_free / 2065 mp->mnt_vstat.f_bsize; 2066 tmp.f_bavail = tmp.f_bfree; 2067 2068 if (cred && cred->cr_uid != 0) { 2069 uint64_t adj; 2070 2071 /* 5% */ 2072 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2073 tmp.f_blocks -= adj; 2074 tmp.f_bfree -= adj; 2075 tmp.f_bavail -= adj; 2076 } 2077 2078 mp->mnt_stat.f_blocks = tmp.f_blocks; 2079 mp->mnt_stat.f_bfree = tmp.f_bfree; 2080 mp->mnt_stat.f_bavail = tmp.f_bavail; 2081 mp->mnt_stat.f_files = tmp.f_files; 2082 mp->mnt_stat.f_ffree = tmp.f_ffree; 2083 2084 *sbp = mp->mnt_stat; 2085 } 2086 return (0); 2087 } 2088 2089 static 2090 int 2091 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) 2092 { 2093 hammer2_pfs_t *pmp; 2094 hammer2_dev_t *hmp; 2095 hammer2_blockref_t bref; 2096 struct statvfs tmp; 2097 int i; 2098 2099 /* 2100 * NOTE: iroot might not have validated the cluster yet. 2101 */ 2102 pmp = MPTOPMP(mp); 2103 bzero(&tmp, sizeof(tmp)); 2104 2105 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 2106 hmp = pmp->pfs_hmps[i]; 2107 if (hmp == NULL) 2108 continue; 2109 if (pmp->iroot->cluster.array[i].chain) 2110 bref = pmp->iroot->cluster.array[i].chain->bref; 2111 else 2112 bzero(&bref, sizeof(bref)); 2113 2114 tmp.f_files = bref.embed.stats.inode_count; 2115 tmp.f_ffree = 0; 2116 tmp.f_blocks = hmp->voldata.allocator_size / 2117 mp->mnt_vstat.f_bsize; 2118 tmp.f_bfree = hmp->voldata.allocator_free / 2119 mp->mnt_vstat.f_bsize; 2120 tmp.f_bavail = tmp.f_bfree; 2121 2122 if (cred && cred->cr_uid != 0) { 2123 uint64_t adj; 2124 2125 /* 5% */ 2126 adj = hmp->free_reserved / mp->mnt_vstat.f_bsize; 2127 tmp.f_blocks -= adj; 2128 tmp.f_bfree -= adj; 2129 tmp.f_bavail -= adj; 2130 } 2131 2132 mp->mnt_vstat.f_blocks = tmp.f_blocks; 2133 mp->mnt_vstat.f_bfree = tmp.f_bfree; 2134 mp->mnt_vstat.f_bavail = tmp.f_bavail; 2135 mp->mnt_vstat.f_files = tmp.f_files; 2136 mp->mnt_vstat.f_ffree = tmp.f_ffree; 2137 2138 *sbp = mp->mnt_vstat; 2139 } 2140 return (0); 2141 } 2142 2143 /* 2144 * Mount-time recovery (RW mounts) 2145 * 2146 * Updates to the free block table are allowed to lag flushes by one 2147 * transaction. In case of a crash, then on a fresh mount we must do an 2148 * incremental scan of the last committed transaction id and make sure that 2149 * all related blocks have been marked allocated. 2150 * 2151 * The super-root topology and each PFS has its own transaction id domain, 2152 * so we must track PFS boundary transitions. 2153 */ 2154 struct hammer2_recovery_elm { 2155 TAILQ_ENTRY(hammer2_recovery_elm) entry; 2156 hammer2_chain_t *chain; 2157 hammer2_tid_t sync_tid; 2158 }; 2159 2160 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm); 2161 2162 struct hammer2_recovery_info { 2163 struct hammer2_recovery_list list; 2164 hammer2_tid_t mtid; 2165 int depth; 2166 }; 2167 2168 static int hammer2_recovery_scan(hammer2_dev_t *hmp, 2169 hammer2_chain_t *parent, 2170 struct hammer2_recovery_info *info, 2171 hammer2_tid_t sync_tid); 2172 2173 #define HAMMER2_RECOVERY_MAXDEPTH 10 2174 2175 static 2176 int 2177 hammer2_recovery(hammer2_dev_t *hmp) 2178 { 2179 struct hammer2_recovery_info info; 2180 struct hammer2_recovery_elm *elm; 2181 hammer2_chain_t *parent; 2182 hammer2_tid_t sync_tid; 2183 hammer2_tid_t mirror_tid; 2184 int error; 2185 2186 hammer2_trans_init(hmp->spmp, 0); 2187 2188 sync_tid = hmp->voldata.freemap_tid; 2189 mirror_tid = hmp->voldata.mirror_tid; 2190 2191 kprintf("hammer2 mount \"%s\": ", hmp->devrepname); 2192 if (sync_tid >= mirror_tid) { 2193 kprintf(" no recovery needed\n"); 2194 } else { 2195 kprintf(" freemap recovery %016jx-%016jx\n", 2196 sync_tid + 1, mirror_tid); 2197 } 2198 2199 TAILQ_INIT(&info.list); 2200 info.depth = 0; 2201 parent = hammer2_chain_lookup_init(&hmp->vchain, 0); 2202 error = hammer2_recovery_scan(hmp, parent, &info, sync_tid); 2203 hammer2_chain_lookup_done(parent); 2204 2205 while ((elm = TAILQ_FIRST(&info.list)) != NULL) { 2206 TAILQ_REMOVE(&info.list, elm, entry); 2207 parent = elm->chain; 2208 sync_tid = elm->sync_tid; 2209 kfree(elm, M_HAMMER2); 2210 2211 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2212 error |= hammer2_recovery_scan(hmp, parent, &info, 2213 hmp->voldata.freemap_tid); 2214 hammer2_chain_unlock(parent); 2215 hammer2_chain_drop(parent); /* drop elm->chain ref */ 2216 } 2217 2218 hammer2_trans_done(hmp->spmp, 0); 2219 2220 return error; 2221 } 2222 2223 static 2224 int 2225 hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent, 2226 struct hammer2_recovery_info *info, 2227 hammer2_tid_t sync_tid) 2228 { 2229 const hammer2_inode_data_t *ripdata; 2230 hammer2_chain_t *chain; 2231 hammer2_blockref_t bref; 2232 int tmp_error; 2233 int rup_error; 2234 int error; 2235 int first; 2236 2237 /* 2238 * Adjust freemap to ensure that the block(s) are marked allocated. 2239 */ 2240 if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) { 2241 hammer2_freemap_adjust(hmp, &parent->bref, 2242 HAMMER2_FREEMAP_DORECOVER); 2243 } 2244 2245 /* 2246 * Check type for recursive scan 2247 */ 2248 switch(parent->bref.type) { 2249 case HAMMER2_BREF_TYPE_VOLUME: 2250 /* data already instantiated */ 2251 break; 2252 case HAMMER2_BREF_TYPE_INODE: 2253 /* 2254 * Must instantiate data for DIRECTDATA test and also 2255 * for recursion. 2256 */ 2257 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2258 ripdata = &hammer2_chain_rdata(parent)->ipdata; 2259 if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) { 2260 /* not applicable to recovery scan */ 2261 hammer2_chain_unlock(parent); 2262 return 0; 2263 } 2264 hammer2_chain_unlock(parent); 2265 break; 2266 case HAMMER2_BREF_TYPE_INDIRECT: 2267 /* 2268 * Must instantiate data for recursion 2269 */ 2270 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2271 hammer2_chain_unlock(parent); 2272 break; 2273 case HAMMER2_BREF_TYPE_DIRENT: 2274 case HAMMER2_BREF_TYPE_DATA: 2275 case HAMMER2_BREF_TYPE_FREEMAP: 2276 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2277 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2278 /* not applicable to recovery scan */ 2279 return 0; 2280 break; 2281 default: 2282 return HAMMER2_ERROR_BADBREF; 2283 } 2284 2285 /* 2286 * Defer operation if depth limit reached or if we are crossing a 2287 * PFS boundary. 2288 */ 2289 if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) { 2290 struct hammer2_recovery_elm *elm; 2291 2292 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK); 2293 elm->chain = parent; 2294 elm->sync_tid = sync_tid; 2295 hammer2_chain_ref(parent); 2296 TAILQ_INSERT_TAIL(&info->list, elm, entry); 2297 /* unlocked by caller */ 2298 2299 return(0); 2300 } 2301 2302 2303 /* 2304 * Recursive scan of the last flushed transaction only. We are 2305 * doing this without pmp assignments so don't leave the chains 2306 * hanging around after we are done with them. 2307 * 2308 * error Cumulative error this level only 2309 * rup_error Cumulative error for recursion 2310 * tmp_error Specific non-cumulative recursion error 2311 */ 2312 chain = NULL; 2313 first = 1; 2314 rup_error = 0; 2315 error = 0; 2316 2317 for (;;) { 2318 error |= hammer2_chain_scan(parent, &chain, &bref, 2319 &first, 2320 HAMMER2_LOOKUP_NODATA); 2321 2322 /* 2323 * Problem during scan or EOF 2324 */ 2325 if (error) 2326 break; 2327 2328 /* 2329 * If this is a leaf 2330 */ 2331 if (chain == NULL) { 2332 if (bref.mirror_tid > sync_tid) { 2333 hammer2_freemap_adjust(hmp, &bref, 2334 HAMMER2_FREEMAP_DORECOVER); 2335 } 2336 continue; 2337 } 2338 2339 /* 2340 * This may or may not be a recursive node. 2341 */ 2342 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 2343 if (bref.mirror_tid > sync_tid) { 2344 ++info->depth; 2345 tmp_error = hammer2_recovery_scan(hmp, chain, 2346 info, sync_tid); 2347 --info->depth; 2348 } else { 2349 tmp_error = 0; 2350 } 2351 2352 /* 2353 * Flush the recovery at the PFS boundary to stage it for 2354 * the final flush of the super-root topology. 2355 */ 2356 if (tmp_error == 0 && 2357 (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) && 2358 (chain->flags & HAMMER2_CHAIN_ONFLUSH)) { 2359 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 2360 HAMMER2_FLUSH_ALL); 2361 } 2362 rup_error |= tmp_error; 2363 } 2364 return ((error | rup_error) & ~HAMMER2_ERROR_EOF); 2365 } 2366 2367 /* 2368 * This fixes up an error introduced in earlier H2 implementations where 2369 * moving a PFS inode into an indirect block wound up causing the 2370 * HAMMER2_BREF_FLAG_PFSROOT flag in the bref to get cleared. 2371 */ 2372 static 2373 int 2374 hammer2_fixup_pfses(hammer2_dev_t *hmp) 2375 { 2376 const hammer2_inode_data_t *ripdata; 2377 hammer2_chain_t *parent; 2378 hammer2_chain_t *chain; 2379 hammer2_key_t key_next; 2380 hammer2_pfs_t *spmp; 2381 int error; 2382 2383 error = 0; 2384 2385 /* 2386 * Lookup mount point under the media-localized super-root. 2387 * 2388 * cluster->pmp will incorrectly point to spmp and must be fixed 2389 * up later on. 2390 */ 2391 spmp = hmp->spmp; 2392 hammer2_inode_lock(spmp->iroot, 0); 2393 parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS); 2394 chain = hammer2_chain_lookup(&parent, &key_next, 2395 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 2396 &error, 0); 2397 while (chain) { 2398 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) 2399 continue; 2400 if (chain->error) { 2401 kprintf("I/O error scanning PFS labels\n"); 2402 error |= chain->error; 2403 } else if ((chain->bref.flags & 2404 HAMMER2_BREF_FLAG_PFSROOT) == 0) { 2405 int error2; 2406 2407 ripdata = &chain->data->ipdata; 2408 hammer2_trans_init(hmp->spmp, 0); 2409 error2 = hammer2_chain_modify(chain, 2410 chain->bref.modify_tid, 2411 0, 0); 2412 if (error2 == 0) { 2413 kprintf("hammer2: Correct mis-flagged PFS %s\n", 2414 ripdata->filename); 2415 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 2416 } else { 2417 error |= error2; 2418 } 2419 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 2420 HAMMER2_FLUSH_ALL); 2421 hammer2_trans_done(hmp->spmp, 0); 2422 } 2423 chain = hammer2_chain_next(&parent, chain, &key_next, 2424 key_next, HAMMER2_KEY_MAX, 2425 &error, 0); 2426 } 2427 if (parent) { 2428 hammer2_chain_unlock(parent); 2429 hammer2_chain_drop(parent); 2430 } 2431 hammer2_inode_unlock(spmp->iroot); 2432 2433 return error; 2434 } 2435 2436 /* 2437 * Sync a mount point; this is called periodically on a per-mount basis from 2438 * the filesystem syncer, and whenever a user issues a sync. 2439 */ 2440 int 2441 hammer2_vfs_sync(struct mount *mp, int waitfor) 2442 { 2443 int error; 2444 2445 error = hammer2_vfs_sync_pmp(MPTOPMP(mp), waitfor); 2446 2447 return error; 2448 } 2449 2450 /* 2451 * Because frontend operations lock vnodes before we get a chance to 2452 * lock the related inode, we can't just acquire a vnode lock without 2453 * risking a deadlock. The frontend may be holding a vnode lock while 2454 * also blocked on our SYNCQ flag while trying to get the inode lock. 2455 * 2456 * To deal with this situation we can check the vnode lock situation 2457 * after locking the inode and perform a work-around. 2458 */ 2459 int 2460 hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor) 2461 { 2462 struct mount *mp; 2463 /*hammer2_xop_flush_t *xop;*/ 2464 /*struct hammer2_sync_info info;*/ 2465 hammer2_inode_t *ip; 2466 hammer2_depend_t *depend; 2467 hammer2_depend_t *depend_next; 2468 struct vnode *vp; 2469 uint32_t pass2; 2470 int error; 2471 int wakecount; 2472 int dorestart; 2473 2474 mp = pmp->mp; 2475 2476 /* 2477 * Move all inodes on sideq to syncq. This will clear sideq. 2478 * This should represent all flushable inodes. These inodes 2479 * will already have refs due to being on syncq or sideq. We 2480 * must do this all at once with the spinlock held to ensure that 2481 * all inode dependencies are part of the same flush. 2482 * 2483 * We should be able to do this asynchronously from frontend 2484 * operations because we will be locking the inodes later on 2485 * to actually flush them, and that will partition any frontend 2486 * op using the same inode. Either it has already locked the 2487 * inode and we will block, or it has not yet locked the inode 2488 * and it will block until we are finished flushing that inode. 2489 * 2490 * When restarting, only move the inodes flagged as PASS2 from 2491 * SIDEQ to SYNCQ. PASS2 propagation by inode_lock4() and 2492 * inode_depend() are atomic with the spin-lock. 2493 */ 2494 hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH); 2495 #ifdef HAMMER2_DEBUG_SYNC 2496 kprintf("FILESYSTEM SYNC BOUNDARY\n"); 2497 #endif 2498 dorestart = 0; 2499 2500 /* 2501 * Move inodes from depq to syncq, releasing the related 2502 * depend structures. 2503 */ 2504 restart: 2505 #ifdef HAMMER2_DEBUG_SYNC 2506 kprintf("FILESYSTEM SYNC RESTART (%d)\n", dorestart); 2507 #endif 2508 hammer2_trans_setflags(pmp, 0/*HAMMER2_TRANS_COPYQ*/); 2509 hammer2_trans_clearflags(pmp, HAMMER2_TRANS_RESCAN); 2510 2511 /* 2512 * Move inodes from depq to syncq. When restarting, only depq's 2513 * marked pass2 are moved. 2514 */ 2515 hammer2_spin_ex(&pmp->list_spin); 2516 depend_next = TAILQ_FIRST(&pmp->depq); 2517 wakecount = 0; 2518 2519 while ((depend = depend_next) != NULL) { 2520 depend_next = TAILQ_NEXT(depend, entry); 2521 if (dorestart && depend->pass2 == 0) 2522 continue; 2523 TAILQ_FOREACH(ip, &depend->sideq, entry) { 2524 KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ); 2525 atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ); 2526 atomic_clear_int(&ip->flags, HAMMER2_INODE_SIDEQ); 2527 ip->depend = NULL; 2528 } 2529 2530 /* 2531 * NOTE: pmp->sideq_count includes both sideq and syncq 2532 */ 2533 TAILQ_CONCAT(&pmp->syncq, &depend->sideq, entry); 2534 2535 depend->count = 0; 2536 depend->pass2 = 0; 2537 TAILQ_REMOVE(&pmp->depq, depend, entry); 2538 } 2539 2540 hammer2_spin_unex(&pmp->list_spin); 2541 hammer2_trans_clearflags(pmp, /*HAMMER2_TRANS_COPYQ |*/ 2542 HAMMER2_TRANS_WAITING); 2543 dorestart = 0; 2544 2545 /* 2546 * sideq_count may have dropped enough to allow us to unstall 2547 * the frontend. 2548 */ 2549 hammer2_pfs_memory_wakeup(pmp, 0); 2550 2551 /* 2552 * Now run through all inodes on syncq. 2553 * 2554 * Flush transactions only interlock with other flush transactions. 2555 * Any conflicting frontend operations will block on the inode, but 2556 * may hold a vnode lock while doing so. 2557 */ 2558 hammer2_spin_ex(&pmp->list_spin); 2559 while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) { 2560 /* 2561 * Remove the inode from the SYNCQ, transfer the syncq ref 2562 * to us. We must clear SYNCQ to allow any potential 2563 * front-end deadlock to proceed. We must set PASS2 so 2564 * the dependency code knows what to do. 2565 */ 2566 pass2 = ip->flags; 2567 cpu_ccfence(); 2568 if (atomic_cmpset_int(&ip->flags, 2569 pass2, 2570 (pass2 & ~(HAMMER2_INODE_SYNCQ | 2571 HAMMER2_INODE_SYNCQ_WAKEUP)) | 2572 HAMMER2_INODE_SYNCQ_PASS2) == 0) { 2573 continue; 2574 } 2575 TAILQ_REMOVE(&pmp->syncq, ip, entry); 2576 --pmp->sideq_count; 2577 hammer2_spin_unex(&pmp->list_spin); 2578 2579 /* 2580 * Tickle anyone waiting on ip->flags or the hysteresis 2581 * on the dirty inode count. 2582 */ 2583 if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP) 2584 wakeup(&ip->flags); 2585 if (++wakecount >= hammer2_limit_dirty_inodes / 20 + 1) { 2586 wakecount = 0; 2587 hammer2_pfs_memory_wakeup(pmp, 0); 2588 } 2589 2590 /* 2591 * Relock the inode, and we inherit a ref from the above. 2592 * We will check for a race after we acquire the vnode. 2593 */ 2594 hammer2_mtx_ex(&ip->lock); 2595 2596 /* 2597 * We need the vp in order to vfsync() dirty buffers, so if 2598 * one isn't attached we can skip it. 2599 * 2600 * Ordering the inode lock and then the vnode lock has the 2601 * potential to deadlock. If we had left SYNCQ set that could 2602 * also deadlock us against the frontend even if we don't hold 2603 * any locks, but the latter is not a problem now since we 2604 * cleared it. igetv will temporarily release the inode lock 2605 * in a safe manner to work-around the deadlock. 2606 * 2607 * Unfortunately it is still possible to deadlock when the 2608 * frontend obtains multiple inode locks, because all the 2609 * related vnodes are already locked (nor can the vnode locks 2610 * be released and reacquired without messing up RECLAIM and 2611 * INACTIVE sequencing). 2612 * 2613 * The solution for now is to move the vp back onto SIDEQ 2614 * and set dorestart, which will restart the flush after we 2615 * exhaust the current SYNCQ. Note that additional 2616 * dependencies may build up, so we definitely need to move 2617 * the whole SIDEQ back to SYNCQ when we restart. 2618 */ 2619 vp = ip->vp; 2620 if (vp) { 2621 if (vget(vp, LK_EXCLUSIVE|LK_NOWAIT)) { 2622 /* 2623 * Failed to get the vnode, requeue the inode 2624 * (PASS2 is already set so it will be found 2625 * again on the restart). 2626 * 2627 * Then unlock, possibly sleep, and retry 2628 * later. We sleep if PASS2 was *previously* 2629 * set, before we set it again above. 2630 */ 2631 vp = NULL; 2632 dorestart = 1; 2633 #ifdef HAMMER2_DEBUG_SYNC 2634 kprintf("inum %ld (sync delayed by vnode)\n", 2635 (long)ip->meta.inum); 2636 #endif 2637 hammer2_inode_delayed_sideq(ip); 2638 2639 hammer2_mtx_unlock(&ip->lock); 2640 hammer2_inode_drop(ip); 2641 2642 if (pass2 & HAMMER2_INODE_SYNCQ_PASS2) { 2643 tsleep(&dorestart, 0, "h2syndel", 2); 2644 } 2645 hammer2_spin_ex(&pmp->list_spin); 2646 continue; 2647 } 2648 } else { 2649 vp = NULL; 2650 } 2651 2652 /* 2653 * If the inode wound up on a SIDEQ again it will already be 2654 * prepped for another PASS2. In this situation if we flush 2655 * it now we will just wind up flushing it again in the same 2656 * syncer run, so we might as well not flush it now. 2657 */ 2658 if (ip->flags & HAMMER2_INODE_SIDEQ) { 2659 hammer2_mtx_unlock(&ip->lock); 2660 hammer2_inode_drop(ip); 2661 if (vp) 2662 vput(vp); 2663 dorestart = 1; 2664 hammer2_spin_ex(&pmp->list_spin); 2665 continue; 2666 } 2667 2668 /* 2669 * Ok we have the inode exclusively locked and if vp is 2670 * not NULL that will also be exclusively locked. Do the 2671 * meat of the flush. 2672 * 2673 * vp token needed for v_rbdirty_tree check / vclrisdirty 2674 * sequencing. Though we hold the vnode exclusively so 2675 * we shouldn't need to hold the token also in this case. 2676 */ 2677 if (vp) { 2678 vfsync(vp, MNT_WAIT, 1, NULL, NULL); 2679 bio_track_wait(&vp->v_track_write, 0, 0); /* XXX */ 2680 } 2681 2682 /* 2683 * If the inode has not yet been inserted into the tree 2684 * we must do so. Then sync and flush it. The flush should 2685 * update the parent. 2686 */ 2687 if (ip->flags & HAMMER2_INODE_DELETING) { 2688 #ifdef HAMMER2_DEBUG_SYNC 2689 kprintf("inum %ld destroy\n", (long)ip->meta.inum); 2690 #endif 2691 hammer2_inode_chain_des(ip); 2692 atomic_add_long(&hammer2_iod_inode_deletes, 1); 2693 } else if (ip->flags & HAMMER2_INODE_CREATING) { 2694 #ifdef HAMMER2_DEBUG_SYNC 2695 kprintf("inum %ld insert\n", (long)ip->meta.inum); 2696 #endif 2697 hammer2_inode_chain_ins(ip); 2698 atomic_add_long(&hammer2_iod_inode_creates, 1); 2699 } 2700 #ifdef HAMMER2_DEBUG_SYNC 2701 kprintf("inum %ld chain-sync\n", (long)ip->meta.inum); 2702 #endif 2703 2704 /* 2705 * Because I kinda messed up the design and index the inodes 2706 * under the root inode, along side the directory entries, 2707 * we can't flush the inode index under the iroot until the 2708 * end. If we do it now we might miss effects created by 2709 * other inodes on the SYNCQ. 2710 * 2711 * Do a normal (non-FSSYNC) flush instead, which allows the 2712 * vnode code to work the same. We don't want to force iroot 2713 * back onto the SIDEQ, and we also don't want the flush code 2714 * to update pfs_iroot_blocksets until the final flush later. 2715 * 2716 * XXX at the moment this will likely result in a double-flush 2717 * of the iroot chain. 2718 */ 2719 hammer2_inode_chain_sync(ip); 2720 if (ip == pmp->iroot) { 2721 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP); 2722 } else { 2723 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP | 2724 HAMMER2_XOP_FSSYNC); 2725 } 2726 if (vp) { 2727 lwkt_gettoken(&vp->v_token); 2728 if ((ip->flags & (HAMMER2_INODE_MODIFIED | 2729 HAMMER2_INODE_RESIZED | 2730 HAMMER2_INODE_DIRTYDATA)) == 0 && 2731 RB_EMPTY(&vp->v_rbdirty_tree) && 2732 !bio_track_active(&vp->v_track_write)) { 2733 vclrisdirty(vp); 2734 } else { 2735 hammer2_inode_delayed_sideq(ip); 2736 } 2737 lwkt_reltoken(&vp->v_token); 2738 vput(vp); 2739 vp = NULL; /* safety */ 2740 } 2741 atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_PASS2); 2742 hammer2_inode_unlock(ip); /* unlock+drop */ 2743 /* ip pointer invalid */ 2744 2745 /* 2746 * If the inode got dirted after we dropped our locks, 2747 * it will have already been moved back to the SIDEQ. 2748 */ 2749 hammer2_spin_ex(&pmp->list_spin); 2750 } 2751 hammer2_spin_unex(&pmp->list_spin); 2752 hammer2_pfs_memory_wakeup(pmp, 0); 2753 2754 if (dorestart || (pmp->trans.flags & HAMMER2_TRANS_RESCAN)) { 2755 #ifdef HAMMER2_DEBUG_SYNC 2756 kprintf("FILESYSTEM SYNC STAGE 1 RESTART\n"); 2757 /*tsleep(&dorestart, 0, "h2STG1-R", hz*20);*/ 2758 #endif 2759 dorestart = 1; 2760 goto restart; 2761 } 2762 #ifdef HAMMER2_DEBUG_SYNC 2763 kprintf("FILESYSTEM SYNC STAGE 2 BEGIN\n"); 2764 /*tsleep(&dorestart, 0, "h2STG2", hz*20);*/ 2765 #endif 2766 2767 /* 2768 * We have to flush the PFS root last, even if it does not appear to 2769 * be dirty, because all the inodes in the PFS are indexed under it. 2770 * The normal flushing of iroot above would only occur if directory 2771 * entries under the root were changed. 2772 * 2773 * Specifying VOLHDR will cause an additionl flush of hmp->spmp 2774 * for the media making up the cluster. 2775 */ 2776 if ((ip = pmp->iroot) != NULL) { 2777 hammer2_inode_ref(ip); 2778 hammer2_mtx_ex(&ip->lock); 2779 hammer2_inode_chain_sync(ip); 2780 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP | 2781 HAMMER2_XOP_FSSYNC | 2782 HAMMER2_XOP_VOLHDR); 2783 hammer2_inode_unlock(ip); /* unlock+drop */ 2784 } 2785 #ifdef HAMMER2_DEBUG_SYNC 2786 kprintf("FILESYSTEM SYNC STAGE 2 DONE\n"); 2787 #endif 2788 2789 /* 2790 * device bioq sync 2791 */ 2792 hammer2_bioq_sync(pmp); 2793 2794 #if 0 2795 info.pass = 1; 2796 info.waitfor = MNT_WAIT; 2797 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2798 2799 info.pass = 2; 2800 info.waitfor = MNT_WAIT; 2801 vsyncscan(mp, flags, hammer2_sync_scan2, &info); 2802 #endif 2803 #if 0 2804 /* 2805 * Generally speaking we now want to flush the media topology from 2806 * the iroot through to the inodes. The flush stops at any inode 2807 * boundary, which allows the frontend to continue running concurrent 2808 * modifying operations on inodes (including kernel flushes of 2809 * buffers) without interfering with the main sync. 2810 * 2811 * Use the XOP interface to concurrently flush all nodes to 2812 * synchronize the PFSROOT subtopology to the media. A standard 2813 * end-of-scan ENOENT error indicates cluster sufficiency. 2814 * 2815 * Note that this flush will not be visible on crash recovery until 2816 * we flush the super-root topology in the next loop. 2817 * 2818 * XXX For now wait for all flushes to complete. 2819 */ 2820 if (mp && (ip = pmp->iroot) != NULL) { 2821 /* 2822 * If unmounting try to flush everything including any 2823 * sub-trees under inodes, just in case there is dangling 2824 * modified data, as a safety. Otherwise just flush up to 2825 * the inodes in this stage. 2826 */ 2827 kprintf("MP & IROOT\n"); 2828 #ifdef HAMMER2_DEBUG_SYNC 2829 kprintf("FILESYSTEM SYNC STAGE 3 IROOT BEGIN\n"); 2830 #endif 2831 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 2832 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | 2833 HAMMER2_XOP_VOLHDR | 2834 HAMMER2_XOP_FSSYNC | 2835 HAMMER2_XOP_INODE_STOP); 2836 } else { 2837 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | 2838 HAMMER2_XOP_INODE_STOP | 2839 HAMMER2_XOP_VOLHDR | 2840 HAMMER2_XOP_FSSYNC | 2841 HAMMER2_XOP_INODE_STOP); 2842 } 2843 hammer2_xop_start(&xop->head, &hammer2_inode_flush_desc); 2844 error = hammer2_xop_collect(&xop->head, 2845 HAMMER2_XOP_COLLECT_WAITALL); 2846 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 2847 #ifdef HAMMER2_DEBUG_SYNC 2848 kprintf("FILESYSTEM SYNC STAGE 3 IROOT END\n"); 2849 #endif 2850 if (error == HAMMER2_ERROR_ENOENT) 2851 error = 0; 2852 else 2853 error = hammer2_error_to_errno(error); 2854 } else { 2855 error = 0; 2856 } 2857 #endif 2858 error = 0; /* XXX */ 2859 hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH); 2860 2861 return (error); 2862 } 2863 2864 static 2865 int 2866 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp) 2867 { 2868 hammer2_inode_t *ip; 2869 2870 KKASSERT(MAXFIDSZ >= 16); 2871 ip = VTOI(vp); 2872 fhp->fid_len = offsetof(struct fid, fid_data[16]); 2873 fhp->fid_ext = 0; 2874 ((hammer2_tid_t *)fhp->fid_data)[0] = ip->meta.inum; 2875 ((hammer2_tid_t *)fhp->fid_data)[1] = 0; 2876 2877 return 0; 2878 } 2879 2880 static 2881 int 2882 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, 2883 struct fid *fhp, struct vnode **vpp) 2884 { 2885 hammer2_pfs_t *pmp; 2886 hammer2_tid_t inum; 2887 int error; 2888 2889 pmp = MPTOPMP(mp); 2890 inum = ((hammer2_tid_t *)fhp->fid_data)[0] & HAMMER2_DIRHASH_USERMSK; 2891 if (vpp) { 2892 if (inum == 1) 2893 error = hammer2_vfs_root(mp, vpp); 2894 else 2895 error = hammer2_vfs_vget(mp, NULL, inum, vpp); 2896 } else { 2897 error = 0; 2898 } 2899 if (error) 2900 kprintf("fhtovp: %016jx -> %p, %d\n", inum, *vpp, error); 2901 return error; 2902 } 2903 2904 static 2905 int 2906 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam, 2907 int *exflagsp, struct ucred **credanonp) 2908 { 2909 hammer2_pfs_t *pmp; 2910 struct netcred *np; 2911 int error; 2912 2913 pmp = MPTOPMP(mp); 2914 np = vfs_export_lookup(mp, &pmp->export, nam); 2915 if (np) { 2916 *exflagsp = np->netc_exflags; 2917 *credanonp = &np->netc_anon; 2918 error = 0; 2919 } else { 2920 error = EACCES; 2921 } 2922 return error; 2923 } 2924 2925 /* 2926 * Support code for hammer2_vfs_mount(). Read, verify, and install the volume 2927 * header into the HMP 2928 * 2929 * XXX read four volhdrs and use the one with the highest TID whos CRC 2930 * matches. 2931 * 2932 * XXX check iCRCs. 2933 * 2934 * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to 2935 * nonexistant locations. 2936 * 2937 * XXX Record selected volhdr and ring updates to each of 4 volhdrs 2938 */ 2939 static 2940 int 2941 hammer2_install_volume_header(hammer2_dev_t *hmp) 2942 { 2943 hammer2_volume_data_t *vd; 2944 struct buf *bp; 2945 hammer2_crc32_t crc0, crc, bcrc0, bcrc; 2946 int error_reported; 2947 int error; 2948 int valid; 2949 int i; 2950 2951 error_reported = 0; 2952 error = 0; 2953 valid = 0; 2954 bp = NULL; 2955 2956 /* 2957 * There are up to 4 copies of the volume header (syncs iterate 2958 * between them so there is no single master). We don't trust the 2959 * volu_size field so we don't know precisely how large the filesystem 2960 * is, so depend on the OS to return an error if we go beyond the 2961 * block device's EOF. 2962 */ 2963 for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) { 2964 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64, 2965 HAMMER2_VOLUME_BYTES, &bp); 2966 if (error) { 2967 brelse(bp); 2968 bp = NULL; 2969 continue; 2970 } 2971 2972 vd = (struct hammer2_volume_data *) bp->b_data; 2973 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) && 2974 (vd->magic != HAMMER2_VOLUME_ID_ABO)) { 2975 brelse(bp); 2976 bp = NULL; 2977 continue; 2978 } 2979 2980 if (vd->magic == HAMMER2_VOLUME_ID_ABO) { 2981 /* XXX: Reversed-endianness filesystem */ 2982 kprintf("hammer2: reverse-endian filesystem detected"); 2983 brelse(bp); 2984 bp = NULL; 2985 continue; 2986 } 2987 2988 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0]; 2989 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF, 2990 HAMMER2_VOLUME_ICRC0_SIZE); 2991 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1]; 2992 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF, 2993 HAMMER2_VOLUME_ICRC1_SIZE); 2994 if ((crc0 != crc) || (bcrc0 != bcrc)) { 2995 kprintf("hammer2 volume header crc " 2996 "mismatch copy #%d %08x/%08x\n", 2997 i, crc0, crc); 2998 error_reported = 1; 2999 brelse(bp); 3000 bp = NULL; 3001 continue; 3002 } 3003 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) { 3004 valid = 1; 3005 hmp->voldata = *vd; 3006 hmp->volhdrno = i; 3007 } 3008 brelse(bp); 3009 bp = NULL; 3010 } 3011 if (valid) { 3012 hmp->volsync = hmp->voldata; 3013 hmp->free_reserved = hmp->voldata.allocator_size / 20; 3014 error = 0; 3015 if (error_reported || bootverbose || 1) { /* 1/DEBUG */ 3016 kprintf("hammer2: using volume header #%d\n", 3017 hmp->volhdrno); 3018 } 3019 } else { 3020 error = EINVAL; 3021 kprintf("hammer2: no valid volume headers found!\n"); 3022 } 3023 return (error); 3024 } 3025 3026 /* 3027 * This handles hysteresis on regular file flushes. Because the BIOs are 3028 * routed to a thread it is possible for an excessive number to build up 3029 * and cause long front-end stalls long before the runningbuffspace limit 3030 * is hit, so we implement hammer2_flush_pipe to control the 3031 * hysteresis. 3032 * 3033 * This is a particular problem when compression is used. 3034 */ 3035 void 3036 hammer2_lwinprog_ref(hammer2_pfs_t *pmp) 3037 { 3038 atomic_add_int(&pmp->count_lwinprog, 1); 3039 } 3040 3041 void 3042 hammer2_lwinprog_drop(hammer2_pfs_t *pmp) 3043 { 3044 int lwinprog; 3045 3046 lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1); 3047 if ((lwinprog & HAMMER2_LWINPROG_WAITING) && 3048 (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) { 3049 atomic_clear_int(&pmp->count_lwinprog, 3050 HAMMER2_LWINPROG_WAITING); 3051 wakeup(&pmp->count_lwinprog); 3052 } 3053 if ((lwinprog & HAMMER2_LWINPROG_WAITING0) && 3054 (lwinprog & HAMMER2_LWINPROG_MASK) <= 0) { 3055 atomic_clear_int(&pmp->count_lwinprog, 3056 HAMMER2_LWINPROG_WAITING0); 3057 wakeup(&pmp->count_lwinprog); 3058 } 3059 } 3060 3061 void 3062 hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe) 3063 { 3064 int lwinprog; 3065 int lwflag = (flush_pipe) ? HAMMER2_LWINPROG_WAITING : 3066 HAMMER2_LWINPROG_WAITING0; 3067 3068 for (;;) { 3069 lwinprog = pmp->count_lwinprog; 3070 cpu_ccfence(); 3071 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 3072 break; 3073 tsleep_interlock(&pmp->count_lwinprog, 0); 3074 atomic_set_int(&pmp->count_lwinprog, lwflag); 3075 lwinprog = pmp->count_lwinprog; 3076 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe) 3077 break; 3078 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz); 3079 } 3080 } 3081 3082 /* 3083 * It is possible for an excessive number of dirty chains or dirty inodes 3084 * to build up. When this occurs we start an asynchronous filesystem sync. 3085 * If the level continues to build up, we stall, waiting for it to drop, 3086 * with some hysteresis. 3087 * 3088 * This relies on the kernel calling hammer2_vfs_modifying() prior to 3089 * obtaining any vnode locks before making a modifying VOP call. 3090 */ 3091 static int 3092 hammer2_vfs_modifying(struct mount *mp) 3093 { 3094 if (mp->mnt_flag & MNT_RDONLY) 3095 return EROFS; 3096 hammer2_pfs_memory_wait(MPTOPMP(mp)); 3097 3098 return 0; 3099 } 3100 3101 /* 3102 * Initiate an asynchronous filesystem sync and, with hysteresis, 3103 * stall if the internal data structure count becomes too bloated. 3104 */ 3105 void 3106 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp) 3107 { 3108 uint32_t waiting; 3109 int pcatch; 3110 int error; 3111 3112 if (pmp == NULL || pmp->mp == NULL) 3113 return; 3114 3115 for (;;) { 3116 waiting = pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK; 3117 cpu_ccfence(); 3118 3119 /* 3120 * Start the syncer running at 1/2 the limit 3121 */ 3122 if (waiting > hammer2_limit_dirty_chains / 2 || 3123 pmp->sideq_count > hammer2_limit_dirty_inodes / 2) { 3124 trigger_syncer(pmp->mp); 3125 } 3126 3127 /* 3128 * Stall at the limit waiting for the counts to drop. 3129 * This code will typically be woken up once the count 3130 * drops below 3/4 the limit, or in one second. 3131 */ 3132 if (waiting < hammer2_limit_dirty_chains && 3133 pmp->sideq_count < hammer2_limit_dirty_inodes) { 3134 break; 3135 } 3136 3137 pcatch = curthread->td_proc ? PCATCH : 0; 3138 3139 tsleep_interlock(&pmp->inmem_dirty_chains, pcatch); 3140 atomic_set_int(&pmp->inmem_dirty_chains, 3141 HAMMER2_DIRTYCHAIN_WAITING); 3142 if (waiting < hammer2_limit_dirty_chains && 3143 pmp->sideq_count < hammer2_limit_dirty_inodes) { 3144 break; 3145 } 3146 trigger_syncer(pmp->mp); 3147 error = tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED | pcatch, 3148 "h2memw", hz); 3149 if (error == ERESTART) 3150 break; 3151 } 3152 } 3153 3154 /* 3155 * Wake up any stalled frontend ops waiting, with hysteresis, using 3156 * 2/3 of the limit. 3157 */ 3158 void 3159 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp, int count) 3160 { 3161 uint32_t waiting; 3162 3163 if (pmp) { 3164 waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, count); 3165 /* don't need --waiting to test flag */ 3166 3167 if ((waiting & HAMMER2_DIRTYCHAIN_WAITING) && 3168 (pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK) <= 3169 hammer2_limit_dirty_chains * 2 / 3 && 3170 pmp->sideq_count <= hammer2_limit_dirty_inodes * 2 / 3) { 3171 atomic_clear_int(&pmp->inmem_dirty_chains, 3172 HAMMER2_DIRTYCHAIN_WAITING); 3173 wakeup(&pmp->inmem_dirty_chains); 3174 } 3175 } 3176 } 3177 3178 void 3179 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp) 3180 { 3181 if (pmp) { 3182 atomic_add_int(&pmp->inmem_dirty_chains, 1); 3183 } 3184 } 3185 3186 /* 3187 * Returns 0 if the filesystem has tons of free space 3188 * Returns 1 if the filesystem has less than 10% remaining 3189 * Returns 2 if the filesystem has less than 2%/5% (user/root) remaining. 3190 */ 3191 int 3192 hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred) 3193 { 3194 hammer2_pfs_t *pmp; 3195 hammer2_dev_t *hmp; 3196 hammer2_off_t free_reserved; 3197 hammer2_off_t free_nominal; 3198 int i; 3199 3200 pmp = ip->pmp; 3201 3202 if (pmp->free_ticks == 0 || pmp->free_ticks != ticks) { 3203 free_reserved = HAMMER2_SEGSIZE; 3204 free_nominal = 0x7FFFFFFFFFFFFFFFLLU; 3205 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) { 3206 hmp = pmp->pfs_hmps[i]; 3207 if (hmp == NULL) 3208 continue; 3209 if (pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER && 3210 pmp->pfs_types[i] != HAMMER2_PFSTYPE_SOFT_MASTER) 3211 continue; 3212 3213 if (free_nominal > hmp->voldata.allocator_free) 3214 free_nominal = hmp->voldata.allocator_free; 3215 if (free_reserved < hmp->free_reserved) 3216 free_reserved = hmp->free_reserved; 3217 } 3218 3219 /* 3220 * SMP races ok 3221 */ 3222 pmp->free_reserved = free_reserved; 3223 pmp->free_nominal = free_nominal; 3224 pmp->free_ticks = ticks; 3225 } else { 3226 free_reserved = pmp->free_reserved; 3227 free_nominal = pmp->free_nominal; 3228 } 3229 if (cred && cred->cr_uid != 0) { 3230 if ((int64_t)(free_nominal - bytes) < 3231 (int64_t)free_reserved) { 3232 return 2; 3233 } 3234 } else { 3235 if ((int64_t)(free_nominal - bytes) < 3236 (int64_t)free_reserved / 2) { 3237 return 2; 3238 } 3239 } 3240 if ((int64_t)(free_nominal - bytes) < (int64_t)free_reserved * 2) 3241 return 1; 3242 return 0; 3243 } 3244 3245 /* 3246 * Debugging 3247 */ 3248 void 3249 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx, 3250 u_int flags) 3251 { 3252 hammer2_chain_t *scan; 3253 hammer2_chain_t *parent; 3254 3255 --*countp; 3256 if (*countp == 0) { 3257 kprintf("%*.*s...\n", tab, tab, ""); 3258 return; 3259 } 3260 if (*countp < 0) 3261 return; 3262 kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n", 3263 tab, tab, "", pfx, 3264 chain, chain->bref.type, 3265 chain->bref.key, chain->bref.keybits, 3266 chain->bref.mirror_tid); 3267 3268 kprintf("%*.*s [%08x] (%s) refs=%d", 3269 tab, tab, "", 3270 chain->flags, 3271 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE && 3272 chain->data) ? (char *)chain->data->ipdata.filename : "?"), 3273 chain->refs); 3274 3275 parent = chain->parent; 3276 if (parent) 3277 kprintf("\n%*.*s p=%p [pflags %08x prefs %d", 3278 tab, tab, "", 3279 parent, parent->flags, parent->refs); 3280 if (RB_EMPTY(&chain->core.rbtree)) { 3281 kprintf("\n"); 3282 } else { 3283 kprintf(" {\n"); 3284 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) { 3285 if ((scan->flags & flags) || flags == (u_int)-1) { 3286 hammer2_dump_chain(scan, tab + 4, countp, 'a', 3287 flags); 3288 } 3289 } 3290 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data) 3291 kprintf("%*.*s}(%s)\n", tab, tab, "", 3292 chain->data->ipdata.filename); 3293 else 3294 kprintf("%*.*s}\n", tab, tab, ""); 3295 } 3296 } 3297