1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 24 */ 25 26 /* Portions Copyright 2007 Jeremy Teo */ 27 28 #ifdef _KERNEL 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/time.h> 32 #include <sys/sysmacros.h> 33 #include <sys/mntent.h> 34 #include <sys/u8_textprep.h> 35 #include <sys/dsl_dataset.h> 36 #include <sys/vfs.h> 37 #include <sys/vnode.h> 38 #include <sys/file.h> 39 #include <sys/kmem.h> 40 #include <sys/errno.h> 41 #include <sys/atomic.h> 42 #include <sys/zfs_dir.h> 43 #include <sys/zfs_acl.h> 44 #include <sys/zfs_ioctl.h> 45 #include <sys/zfs_rlock.h> 46 #include <sys/zfs_fuid.h> 47 #include <sys/zfs_vnops.h> 48 #include <sys/zfs_ctldir.h> 49 #include <sys/dnode.h> 50 #include <sys/fs/zfs.h> 51 #include <sys/zpl.h> 52 #endif /* _KERNEL */ 53 54 #include <sys/dmu.h> 55 #include <sys/dmu_objset.h> 56 #include <sys/dmu_tx.h> 57 #include <sys/zfs_refcount.h> 58 #include <sys/stat.h> 59 #include <sys/zap.h> 60 #include <sys/zfs_znode.h> 61 #include <sys/sa.h> 62 #include <sys/zfs_sa.h> 63 #include <sys/zfs_stat.h> 64 65 #include "zfs_prop.h" 66 #include "zfs_comutil.h" 67 68 /* 69 * Functions needed for userland (ie: libzpool) are not put under 70 * #ifdef_KERNEL; the rest of the functions have dependencies 71 * (such as VFS logic) that will not compile easily in userland. 72 */ 73 #ifdef _KERNEL 74 75 static kmem_cache_t *znode_cache = NULL; 76 static kmem_cache_t *znode_hold_cache = NULL; 77 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; 78 79 /* 80 * This is used by the test suite so that it can delay znodes from being 81 * freed in order to inspect the unlinked set. 82 */ 83 static int zfs_unlink_suspend_progress = 0; 84 85 /* 86 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on 87 * z_rangelock. It will modify the offset and length of the lock to reflect 88 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is 89 * called with the rangelock_t's rl_lock held, which avoids races. 90 */ 91 static void 92 zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) 93 { 94 znode_t *zp = arg; 95 96 /* 97 * If in append mode, convert to writer and lock starting at the 98 * current end of file. 99 */ 100 if (new->lr_type == RL_APPEND) { 101 new->lr_offset = zp->z_size; 102 new->lr_type = RL_WRITER; 103 } 104 105 /* 106 * If we need to grow the block size then lock the whole file range. 107 */ 108 uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); 109 if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || 110 zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { 111 new->lr_offset = 0; 112 new->lr_length = UINT64_MAX; 113 } 114 } 115 116 static int 117 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 118 { 119 (void) arg, (void) kmflags; 120 znode_t *zp = buf; 121 122 inode_init_once(ZTOI(zp)); 123 list_link_init(&zp->z_link_node); 124 125 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 126 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 127 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); 128 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 129 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); 130 131 zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); 132 133 zp->z_dirlocks = NULL; 134 zp->z_acl_cached = NULL; 135 zp->z_xattr_cached = NULL; 136 zp->z_xattr_parent = 0; 137 zp->z_sync_writes_cnt = 0; 138 zp->z_async_writes_cnt = 0; 139 140 return (0); 141 } 142 143 static void 144 zfs_znode_cache_destructor(void *buf, void *arg) 145 { 146 (void) arg; 147 znode_t *zp = buf; 148 149 ASSERT(!list_link_active(&zp->z_link_node)); 150 mutex_destroy(&zp->z_lock); 151 rw_destroy(&zp->z_parent_lock); 152 rw_destroy(&zp->z_name_lock); 153 mutex_destroy(&zp->z_acl_lock); 154 rw_destroy(&zp->z_xattr_lock); 155 zfs_rangelock_fini(&zp->z_rangelock); 156 157 ASSERT3P(zp->z_dirlocks, ==, NULL); 158 ASSERT3P(zp->z_acl_cached, ==, NULL); 159 ASSERT3P(zp->z_xattr_cached, ==, NULL); 160 161 ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); 162 ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); 163 } 164 165 static int 166 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) 167 { 168 (void) arg, (void) kmflags; 169 znode_hold_t *zh = buf; 170 171 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); 172 zh->zh_refcount = 0; 173 174 return (0); 175 } 176 177 static void 178 zfs_znode_hold_cache_destructor(void *buf, void *arg) 179 { 180 (void) arg; 181 znode_hold_t *zh = buf; 182 183 mutex_destroy(&zh->zh_lock); 184 } 185 186 void 187 zfs_znode_init(void) 188 { 189 /* 190 * Initialize zcache. The KMC_SLAB hint is used in order that it be 191 * backed by kmalloc() when on the Linux slab in order that any 192 * wait_on_bit() operations on the related inode operate properly. 193 */ 194 ASSERT(znode_cache == NULL); 195 znode_cache = kmem_cache_create("zfs_znode_cache", 196 sizeof (znode_t), 0, zfs_znode_cache_constructor, 197 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); 198 199 ASSERT(znode_hold_cache == NULL); 200 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", 201 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, 202 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); 203 } 204 205 void 206 zfs_znode_fini(void) 207 { 208 /* 209 * Cleanup zcache 210 */ 211 if (znode_cache) 212 kmem_cache_destroy(znode_cache); 213 znode_cache = NULL; 214 215 if (znode_hold_cache) 216 kmem_cache_destroy(znode_hold_cache); 217 znode_hold_cache = NULL; 218 } 219 220 /* 221 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to 222 * serialize access to a znode and its SA buffer while the object is being 223 * created or destroyed. This kind of locking would normally reside in the 224 * znode itself but in this case that's impossible because the znode and SA 225 * buffer may not yet exist. Therefore the locking is handled externally 226 * with an array of mutexes and AVLs trees which contain per-object locks. 227 * 228 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted 229 * in to the correct AVL tree and finally the per-object lock is held. In 230 * zfs_znode_hold_exit() the process is reversed. The per-object lock is 231 * released, removed from the AVL tree and destroyed if there are no waiters. 232 * 233 * This scheme has two important properties: 234 * 235 * 1) No memory allocations are performed while holding one of the z_hold_locks. 236 * This ensures evict(), which can be called from direct memory reclaim, will 237 * never block waiting on a z_hold_locks which just happens to have hashed 238 * to the same index. 239 * 240 * 2) All locks used to serialize access to an object are per-object and never 241 * shared. This minimizes lock contention without creating a large number 242 * of dedicated locks. 243 * 244 * On the downside it does require znode_lock_t structures to be frequently 245 * allocated and freed. However, because these are backed by a kmem cache 246 * and very short lived this cost is minimal. 247 */ 248 int 249 zfs_znode_hold_compare(const void *a, const void *b) 250 { 251 const znode_hold_t *zh_a = (const znode_hold_t *)a; 252 const znode_hold_t *zh_b = (const znode_hold_t *)b; 253 254 return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); 255 } 256 257 static boolean_t __maybe_unused 258 zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) 259 { 260 znode_hold_t *zh, search; 261 int i = ZFS_OBJ_HASH(zfsvfs, obj); 262 boolean_t held; 263 264 search.zh_obj = obj; 265 266 mutex_enter(&zfsvfs->z_hold_locks[i]); 267 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); 268 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; 269 mutex_exit(&zfsvfs->z_hold_locks[i]); 270 271 return (held); 272 } 273 274 znode_hold_t * 275 zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) 276 { 277 znode_hold_t *zh, *zh_new, search; 278 int i = ZFS_OBJ_HASH(zfsvfs, obj); 279 boolean_t found = B_FALSE; 280 281 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); 282 search.zh_obj = obj; 283 284 mutex_enter(&zfsvfs->z_hold_locks[i]); 285 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); 286 if (likely(zh == NULL)) { 287 zh = zh_new; 288 zh->zh_obj = obj; 289 avl_add(&zfsvfs->z_hold_trees[i], zh); 290 } else { 291 ASSERT3U(zh->zh_obj, ==, obj); 292 found = B_TRUE; 293 } 294 zh->zh_refcount++; 295 ASSERT3S(zh->zh_refcount, >, 0); 296 mutex_exit(&zfsvfs->z_hold_locks[i]); 297 298 if (found == B_TRUE) 299 kmem_cache_free(znode_hold_cache, zh_new); 300 301 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); 302 mutex_enter(&zh->zh_lock); 303 304 return (zh); 305 } 306 307 void 308 zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) 309 { 310 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); 311 boolean_t remove = B_FALSE; 312 313 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); 314 mutex_exit(&zh->zh_lock); 315 316 mutex_enter(&zfsvfs->z_hold_locks[i]); 317 ASSERT3S(zh->zh_refcount, >, 0); 318 if (--zh->zh_refcount == 0) { 319 avl_remove(&zfsvfs->z_hold_trees[i], zh); 320 remove = B_TRUE; 321 } 322 mutex_exit(&zfsvfs->z_hold_locks[i]); 323 324 if (remove == B_TRUE) 325 kmem_cache_free(znode_hold_cache, zh); 326 } 327 328 dev_t 329 zfs_cmpldev(uint64_t dev) 330 { 331 return (dev); 332 } 333 334 static void 335 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, 336 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) 337 { 338 ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); 339 340 mutex_enter(&zp->z_lock); 341 342 ASSERT(zp->z_sa_hdl == NULL); 343 ASSERT(zp->z_acl_cached == NULL); 344 if (sa_hdl == NULL) { 345 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, 346 SA_HDL_SHARED, &zp->z_sa_hdl)); 347 } else { 348 zp->z_sa_hdl = sa_hdl; 349 sa_set_userp(sa_hdl, zp); 350 } 351 352 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; 353 354 mutex_exit(&zp->z_lock); 355 } 356 357 void 358 zfs_znode_dmu_fini(znode_t *zp) 359 { 360 ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || 361 RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); 362 363 sa_handle_destroy(zp->z_sa_hdl); 364 zp->z_sa_hdl = NULL; 365 } 366 367 /* 368 * Called by new_inode() to allocate a new inode. 369 */ 370 int 371 zfs_inode_alloc(struct super_block *sb, struct inode **ip) 372 { 373 znode_t *zp; 374 375 zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 376 *ip = ZTOI(zp); 377 378 return (0); 379 } 380 381 /* 382 * Called in multiple places when an inode should be destroyed. 383 */ 384 void 385 zfs_inode_destroy(struct inode *ip) 386 { 387 znode_t *zp = ITOZ(ip); 388 zfsvfs_t *zfsvfs = ZTOZSB(zp); 389 390 mutex_enter(&zfsvfs->z_znodes_lock); 391 if (list_link_active(&zp->z_link_node)) { 392 list_remove(&zfsvfs->z_all_znodes, zp); 393 zfsvfs->z_nr_znodes--; 394 } 395 mutex_exit(&zfsvfs->z_znodes_lock); 396 397 if (zp->z_acl_cached) { 398 zfs_acl_free(zp->z_acl_cached); 399 zp->z_acl_cached = NULL; 400 } 401 402 if (zp->z_xattr_cached) { 403 nvlist_free(zp->z_xattr_cached); 404 zp->z_xattr_cached = NULL; 405 } 406 407 kmem_cache_free(znode_cache, zp); 408 } 409 410 static void 411 zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) 412 { 413 uint64_t rdev = 0; 414 415 switch (ip->i_mode & S_IFMT) { 416 case S_IFREG: 417 ip->i_op = &zpl_inode_operations; 418 ip->i_fop = &zpl_file_operations; 419 ip->i_mapping->a_ops = &zpl_address_space_operations; 420 break; 421 422 case S_IFDIR: 423 #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER 424 ip->i_flags |= S_IOPS_WRAPPER; 425 ip->i_op = &zpl_dir_inode_operations.ops; 426 #else 427 ip->i_op = &zpl_dir_inode_operations; 428 #endif 429 ip->i_fop = &zpl_dir_file_operations; 430 ITOZ(ip)->z_zn_prefetch = B_TRUE; 431 break; 432 433 case S_IFLNK: 434 ip->i_op = &zpl_symlink_inode_operations; 435 break; 436 437 /* 438 * rdev is only stored in a SA only for device files. 439 */ 440 case S_IFCHR: 441 case S_IFBLK: 442 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, 443 sizeof (rdev)); 444 zfs_fallthrough; 445 case S_IFIFO: 446 case S_IFSOCK: 447 init_special_inode(ip, ip->i_mode, rdev); 448 ip->i_op = &zpl_special_inode_operations; 449 break; 450 451 default: 452 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", 453 (u_longlong_t)ip->i_ino, ip->i_mode); 454 455 /* Assume the inode is a file and attempt to continue */ 456 ip->i_mode = S_IFREG | 0644; 457 ip->i_op = &zpl_inode_operations; 458 ip->i_fop = &zpl_file_operations; 459 ip->i_mapping->a_ops = &zpl_address_space_operations; 460 break; 461 } 462 } 463 464 static void 465 zfs_set_inode_flags(znode_t *zp, struct inode *ip) 466 { 467 /* 468 * Linux and Solaris have different sets of file attributes, so we 469 * restrict this conversion to the intersection of the two. 470 */ 471 #ifdef HAVE_INODE_SET_FLAGS 472 unsigned int flags = 0; 473 if (zp->z_pflags & ZFS_IMMUTABLE) 474 flags |= S_IMMUTABLE; 475 if (zp->z_pflags & ZFS_APPENDONLY) 476 flags |= S_APPEND; 477 478 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); 479 #else 480 if (zp->z_pflags & ZFS_IMMUTABLE) 481 ip->i_flags |= S_IMMUTABLE; 482 else 483 ip->i_flags &= ~S_IMMUTABLE; 484 485 if (zp->z_pflags & ZFS_APPENDONLY) 486 ip->i_flags |= S_APPEND; 487 else 488 ip->i_flags &= ~S_APPEND; 489 #endif 490 } 491 492 /* 493 * Update the embedded inode given the znode. 494 */ 495 void 496 zfs_znode_update_vfs(znode_t *zp) 497 { 498 struct inode *ip; 499 uint32_t blksize; 500 u_longlong_t i_blocks; 501 502 ASSERT(zp != NULL); 503 ip = ZTOI(zp); 504 505 /* Skip .zfs control nodes which do not exist on disk. */ 506 if (zfsctl_is_node(ip)) 507 return; 508 509 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); 510 511 spin_lock(&ip->i_lock); 512 ip->i_mode = zp->z_mode; 513 ip->i_blocks = i_blocks; 514 i_size_write(ip, zp->z_size); 515 spin_unlock(&ip->i_lock); 516 } 517 518 519 /* 520 * Construct a znode+inode and initialize. 521 * 522 * This does not do a call to dmu_set_user() that is 523 * up to the caller to do, in case you don't want to 524 * return the znode 525 */ 526 static znode_t * 527 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, 528 dmu_object_type_t obj_type, sa_handle_t *hdl) 529 { 530 znode_t *zp; 531 struct inode *ip; 532 uint64_t mode; 533 uint64_t parent; 534 uint64_t tmp_gen; 535 uint64_t links; 536 uint64_t z_uid, z_gid; 537 uint64_t atime[2], mtime[2], ctime[2], btime[2]; 538 uint64_t projid = ZFS_DEFAULT_PROJID; 539 sa_bulk_attr_t bulk[12]; 540 int count = 0; 541 542 ASSERT(zfsvfs != NULL); 543 544 ip = new_inode(zfsvfs->z_sb); 545 if (ip == NULL) 546 return (NULL); 547 548 zp = ITOZ(ip); 549 ASSERT(zp->z_dirlocks == NULL); 550 ASSERT3P(zp->z_acl_cached, ==, NULL); 551 ASSERT3P(zp->z_xattr_cached, ==, NULL); 552 zp->z_unlinked = B_FALSE; 553 zp->z_atime_dirty = B_FALSE; 554 #if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) 555 zp->z_is_mapped = B_FALSE; 556 #endif 557 zp->z_is_ctldir = B_FALSE; 558 zp->z_suspended = B_FALSE; 559 zp->z_sa_hdl = NULL; 560 zp->z_mapcnt = 0; 561 zp->z_id = db->db_object; 562 zp->z_blksz = blksz; 563 zp->z_seq = 0x7A4653; 564 zp->z_sync_cnt = 0; 565 zp->z_sync_writes_cnt = 0; 566 zp->z_async_writes_cnt = 0; 567 568 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); 569 570 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 571 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); 572 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 573 &zp->z_size, 8); 574 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); 575 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 576 &zp->z_pflags, 8); 577 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, 578 &parent, 8); 579 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); 580 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); 581 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 582 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 583 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 584 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); 585 586 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || 587 (dmu_objset_projectquota_enabled(zfsvfs->z_os) && 588 (zp->z_pflags & ZFS_PROJID) && 589 sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { 590 if (hdl == NULL) 591 sa_handle_destroy(zp->z_sa_hdl); 592 zp->z_sa_hdl = NULL; 593 goto error; 594 } 595 596 zp->z_projid = projid; 597 zp->z_mode = ip->i_mode = mode; 598 ip->i_generation = (uint32_t)tmp_gen; 599 ip->i_blkbits = SPA_MINBLOCKSHIFT; 600 set_nlink(ip, (uint32_t)links); 601 zfs_uid_write(ip, z_uid); 602 zfs_gid_write(ip, z_gid); 603 zfs_set_inode_flags(zp, ip); 604 605 /* Cache the xattr parent id */ 606 if (zp->z_pflags & ZFS_XATTR) 607 zp->z_xattr_parent = parent; 608 609 ZFS_TIME_DECODE(&ip->i_atime, atime); 610 ZFS_TIME_DECODE(&ip->i_mtime, mtime); 611 ZFS_TIME_DECODE(&ip->i_ctime, ctime); 612 ZFS_TIME_DECODE(&zp->z_btime, btime); 613 614 ip->i_ino = zp->z_id; 615 zfs_znode_update_vfs(zp); 616 zfs_inode_set_ops(zfsvfs, ip); 617 618 /* 619 * The only way insert_inode_locked() can fail is if the ip->i_ino 620 * number is already hashed for this super block. This can never 621 * happen because the inode numbers map 1:1 with the object numbers. 622 * 623 * Exceptions include rolling back a mounted file system, either 624 * from the zfs rollback or zfs recv command. 625 * 626 * Active inodes are unhashed during the rollback, but since zrele 627 * can happen asynchronously, we can't guarantee they've been 628 * unhashed. This can cause hash collisions in unlinked drain 629 * processing so do not hash unlinked znodes. 630 */ 631 if (links > 0) 632 VERIFY3S(insert_inode_locked(ip), ==, 0); 633 634 mutex_enter(&zfsvfs->z_znodes_lock); 635 list_insert_tail(&zfsvfs->z_all_znodes, zp); 636 zfsvfs->z_nr_znodes++; 637 mutex_exit(&zfsvfs->z_znodes_lock); 638 639 if (links > 0) 640 unlock_new_inode(ip); 641 return (zp); 642 643 error: 644 iput(ip); 645 return (NULL); 646 } 647 648 /* 649 * Safely mark an inode dirty. Inodes which are part of a read-only 650 * file system or snapshot may not be dirtied. 651 */ 652 void 653 zfs_mark_inode_dirty(struct inode *ip) 654 { 655 zfsvfs_t *zfsvfs = ITOZSB(ip); 656 657 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 658 return; 659 660 mark_inode_dirty(ip); 661 } 662 663 static uint64_t empty_xattr; 664 static uint64_t pad[4]; 665 static zfs_acl_phys_t acl_phys; 666 /* 667 * Create a new DMU object to hold a zfs znode. 668 * 669 * IN: dzp - parent directory for new znode 670 * vap - file attributes for new znode 671 * tx - dmu transaction id for zap operations 672 * cr - credentials of caller 673 * flag - flags: 674 * IS_ROOT_NODE - new object will be root 675 * IS_TMPFILE - new object is of O_TMPFILE 676 * IS_XATTR - new object is an attribute 677 * acl_ids - ACL related attributes 678 * 679 * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) 680 * 681 */ 682 void 683 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 684 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) 685 { 686 uint64_t crtime[2], atime[2], mtime[2], ctime[2]; 687 uint64_t mode, size, links, parent, pflags; 688 uint64_t projid = ZFS_DEFAULT_PROJID; 689 uint64_t rdev = 0; 690 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 691 dmu_buf_t *db; 692 inode_timespec_t now; 693 uint64_t gen, obj; 694 int bonuslen; 695 int dnodesize; 696 sa_handle_t *sa_hdl; 697 dmu_object_type_t obj_type; 698 sa_bulk_attr_t *sa_attrs; 699 int cnt = 0; 700 zfs_acl_locator_cb_t locate = { 0 }; 701 znode_hold_t *zh; 702 703 if (zfsvfs->z_replay) { 704 obj = vap->va_nodeid; 705 now = vap->va_ctime; /* see zfs_replay_create() */ 706 gen = vap->va_nblocks; /* ditto */ 707 dnodesize = vap->va_fsid; /* ditto */ 708 } else { 709 obj = 0; 710 gethrestime(&now); 711 gen = dmu_tx_get_txg(tx); 712 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); 713 } 714 715 if (dnodesize == 0) 716 dnodesize = DNODE_MIN_SIZE; 717 718 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; 719 720 bonuslen = (obj_type == DMU_OT_SA) ? 721 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; 722 723 /* 724 * Create a new DMU object. 725 */ 726 /* 727 * There's currently no mechanism for pre-reading the blocks that will 728 * be needed to allocate a new object, so we accept the small chance 729 * that there will be an i/o error and we will fail one of the 730 * assertions below. 731 */ 732 if (S_ISDIR(vap->va_mode)) { 733 if (zfsvfs->z_replay) { 734 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, 735 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 736 obj_type, bonuslen, dnodesize, tx)); 737 } else { 738 obj = zap_create_norm_dnsize(zfsvfs->z_os, 739 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 740 obj_type, bonuslen, dnodesize, tx); 741 } 742 } else { 743 if (zfsvfs->z_replay) { 744 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, 745 DMU_OT_PLAIN_FILE_CONTENTS, 0, 746 obj_type, bonuslen, dnodesize, tx)); 747 } else { 748 obj = dmu_object_alloc_dnsize(zfsvfs->z_os, 749 DMU_OT_PLAIN_FILE_CONTENTS, 0, 750 obj_type, bonuslen, dnodesize, tx); 751 } 752 } 753 754 zh = zfs_znode_hold_enter(zfsvfs, obj); 755 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); 756 757 /* 758 * If this is the root, fix up the half-initialized parent pointer 759 * to reference the just-allocated physical data area. 760 */ 761 if (flag & IS_ROOT_NODE) { 762 dzp->z_id = obj; 763 } 764 765 /* 766 * If parent is an xattr, so am I. 767 */ 768 if (dzp->z_pflags & ZFS_XATTR) { 769 flag |= IS_XATTR; 770 } 771 772 if (zfsvfs->z_use_fuids) 773 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 774 else 775 pflags = 0; 776 777 if (S_ISDIR(vap->va_mode)) { 778 size = 2; /* contents ("." and "..") */ 779 links = 2; 780 } else { 781 size = 0; 782 links = (flag & IS_TMPFILE) ? 0 : 1; 783 } 784 785 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) 786 rdev = vap->va_rdev; 787 788 parent = dzp->z_id; 789 mode = acl_ids->z_mode; 790 if (flag & IS_XATTR) 791 pflags |= ZFS_XATTR; 792 793 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { 794 /* 795 * With ZFS_PROJID flag, we can easily know whether there is 796 * project ID stored on disk or not. See zfs_space_delta_cb(). 797 */ 798 if (obj_type != DMU_OT_ZNODE && 799 dmu_objset_projectquota_enabled(zfsvfs->z_os)) 800 pflags |= ZFS_PROJID; 801 802 /* 803 * Inherit project ID from parent if required. 804 */ 805 projid = zfs_inherit_projid(dzp); 806 if (dzp->z_pflags & ZFS_PROJINHERIT) 807 pflags |= ZFS_PROJINHERIT; 808 } 809 810 /* 811 * No execs denied will be determined when zfs_mode_compute() is called. 812 */ 813 pflags |= acl_ids->z_aclp->z_hints & 814 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| 815 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); 816 817 ZFS_TIME_ENCODE(&now, crtime); 818 ZFS_TIME_ENCODE(&now, ctime); 819 820 if (vap->va_mask & ATTR_ATIME) { 821 ZFS_TIME_ENCODE(&vap->va_atime, atime); 822 } else { 823 ZFS_TIME_ENCODE(&now, atime); 824 } 825 826 if (vap->va_mask & ATTR_MTIME) { 827 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 828 } else { 829 ZFS_TIME_ENCODE(&now, mtime); 830 } 831 832 /* Now add in all of the "SA" attributes */ 833 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, 834 &sa_hdl)); 835 836 /* 837 * Setup the array of attributes to be replaced/set on the new file 838 * 839 * order for DMU_OT_ZNODE is critical since it needs to be constructed 840 * in the old znode_phys_t format. Don't change this ordering 841 */ 842 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); 843 844 if (obj_type == DMU_OT_ZNODE) { 845 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), 846 NULL, &atime, 16); 847 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), 848 NULL, &mtime, 16); 849 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), 850 NULL, &ctime, 16); 851 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), 852 NULL, &crtime, 16); 853 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), 854 NULL, &gen, 8); 855 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), 856 NULL, &mode, 8); 857 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), 858 NULL, &size, 8); 859 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), 860 NULL, &parent, 8); 861 } else { 862 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), 863 NULL, &mode, 8); 864 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), 865 NULL, &size, 8); 866 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), 867 NULL, &gen, 8); 868 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), 869 NULL, &acl_ids->z_fuid, 8); 870 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), 871 NULL, &acl_ids->z_fgid, 8); 872 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), 873 NULL, &parent, 8); 874 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), 875 NULL, &pflags, 8); 876 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), 877 NULL, &atime, 16); 878 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), 879 NULL, &mtime, 16); 880 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), 881 NULL, &ctime, 16); 882 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), 883 NULL, &crtime, 16); 884 } 885 886 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); 887 888 if (obj_type == DMU_OT_ZNODE) { 889 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, 890 &empty_xattr, 8); 891 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && 892 pflags & ZFS_PROJID) { 893 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), 894 NULL, &projid, 8); 895 } 896 if (obj_type == DMU_OT_ZNODE || 897 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { 898 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), 899 NULL, &rdev, 8); 900 } 901 if (obj_type == DMU_OT_ZNODE) { 902 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), 903 NULL, &pflags, 8); 904 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, 905 &acl_ids->z_fuid, 8); 906 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, 907 &acl_ids->z_fgid, 8); 908 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, 909 sizeof (uint64_t) * 4); 910 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, 911 &acl_phys, sizeof (zfs_acl_phys_t)); 912 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { 913 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, 914 &acl_ids->z_aclp->z_acl_count, 8); 915 locate.cb_aclp = acl_ids->z_aclp; 916 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), 917 zfs_acl_data_locator, &locate, 918 acl_ids->z_aclp->z_acl_bytes); 919 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, 920 acl_ids->z_fuid, acl_ids->z_fgid); 921 } 922 923 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); 924 925 if (!(flag & IS_ROOT_NODE)) { 926 /* 927 * The call to zfs_znode_alloc() may fail if memory is low 928 * via the call path: alloc_inode() -> inode_init_always() -> 929 * security_inode_alloc() -> inode_alloc_security(). Since 930 * the existing code is written such that zfs_mknode() can 931 * not fail retry until sufficient memory has been reclaimed. 932 */ 933 do { 934 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); 935 } while (*zpp == NULL); 936 937 VERIFY(*zpp != NULL); 938 VERIFY(dzp != NULL); 939 } else { 940 /* 941 * If we are creating the root node, the "parent" we 942 * passed in is the znode for the root. 943 */ 944 *zpp = dzp; 945 946 (*zpp)->z_sa_hdl = sa_hdl; 947 } 948 949 (*zpp)->z_pflags = pflags; 950 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; 951 (*zpp)->z_dnodesize = dnodesize; 952 (*zpp)->z_projid = projid; 953 954 if (obj_type == DMU_OT_ZNODE || 955 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { 956 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); 957 } 958 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); 959 zfs_znode_hold_exit(zfsvfs, zh); 960 } 961 962 /* 963 * Update in-core attributes. It is assumed the caller will be doing an 964 * sa_bulk_update to push the changes out. 965 */ 966 void 967 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) 968 { 969 xoptattr_t *xoap; 970 boolean_t update_inode = B_FALSE; 971 972 xoap = xva_getxoptattr(xvap); 973 ASSERT(xoap); 974 975 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 976 uint64_t times[2]; 977 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); 978 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), 979 ×, sizeof (times), tx); 980 XVA_SET_RTN(xvap, XAT_CREATETIME); 981 } 982 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 983 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, 984 zp->z_pflags, tx); 985 XVA_SET_RTN(xvap, XAT_READONLY); 986 } 987 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 988 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, 989 zp->z_pflags, tx); 990 XVA_SET_RTN(xvap, XAT_HIDDEN); 991 } 992 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 993 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, 994 zp->z_pflags, tx); 995 XVA_SET_RTN(xvap, XAT_SYSTEM); 996 } 997 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 998 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, 999 zp->z_pflags, tx); 1000 XVA_SET_RTN(xvap, XAT_ARCHIVE); 1001 } 1002 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 1003 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, 1004 zp->z_pflags, tx); 1005 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 1006 1007 update_inode = B_TRUE; 1008 } 1009 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 1010 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, 1011 zp->z_pflags, tx); 1012 XVA_SET_RTN(xvap, XAT_NOUNLINK); 1013 } 1014 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 1015 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, 1016 zp->z_pflags, tx); 1017 XVA_SET_RTN(xvap, XAT_APPENDONLY); 1018 1019 update_inode = B_TRUE; 1020 } 1021 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 1022 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, 1023 zp->z_pflags, tx); 1024 XVA_SET_RTN(xvap, XAT_NODUMP); 1025 } 1026 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 1027 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, 1028 zp->z_pflags, tx); 1029 XVA_SET_RTN(xvap, XAT_OPAQUE); 1030 } 1031 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 1032 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 1033 xoap->xoa_av_quarantined, zp->z_pflags, tx); 1034 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 1035 } 1036 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 1037 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, 1038 zp->z_pflags, tx); 1039 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 1040 } 1041 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 1042 zfs_sa_set_scanstamp(zp, xvap, tx); 1043 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 1044 } 1045 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 1046 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, 1047 zp->z_pflags, tx); 1048 XVA_SET_RTN(xvap, XAT_REPARSE); 1049 } 1050 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 1051 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, 1052 zp->z_pflags, tx); 1053 XVA_SET_RTN(xvap, XAT_OFFLINE); 1054 } 1055 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 1056 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, 1057 zp->z_pflags, tx); 1058 XVA_SET_RTN(xvap, XAT_SPARSE); 1059 } 1060 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 1061 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, 1062 zp->z_pflags, tx); 1063 XVA_SET_RTN(xvap, XAT_PROJINHERIT); 1064 } 1065 1066 if (update_inode) 1067 zfs_set_inode_flags(zp, ZTOI(zp)); 1068 } 1069 1070 int 1071 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 1072 { 1073 dmu_object_info_t doi; 1074 dmu_buf_t *db; 1075 znode_t *zp; 1076 znode_hold_t *zh; 1077 int err; 1078 sa_handle_t *hdl; 1079 1080 *zpp = NULL; 1081 1082 again: 1083 zh = zfs_znode_hold_enter(zfsvfs, obj_num); 1084 1085 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); 1086 if (err) { 1087 zfs_znode_hold_exit(zfsvfs, zh); 1088 return (err); 1089 } 1090 1091 dmu_object_info_from_db(db, &doi); 1092 if (doi.doi_bonus_type != DMU_OT_SA && 1093 (doi.doi_bonus_type != DMU_OT_ZNODE || 1094 (doi.doi_bonus_type == DMU_OT_ZNODE && 1095 doi.doi_bonus_size < sizeof (znode_phys_t)))) { 1096 sa_buf_rele(db, NULL); 1097 zfs_znode_hold_exit(zfsvfs, zh); 1098 return (SET_ERROR(EINVAL)); 1099 } 1100 1101 hdl = dmu_buf_get_user(db); 1102 if (hdl != NULL) { 1103 zp = sa_get_userdata(hdl); 1104 1105 1106 /* 1107 * Since "SA" does immediate eviction we 1108 * should never find a sa handle that doesn't 1109 * know about the znode. 1110 */ 1111 1112 ASSERT3P(zp, !=, NULL); 1113 1114 mutex_enter(&zp->z_lock); 1115 ASSERT3U(zp->z_id, ==, obj_num); 1116 /* 1117 * If zp->z_unlinked is set, the znode is already marked 1118 * for deletion and should not be discovered. Check this 1119 * after checking igrab() due to fsetxattr() & O_TMPFILE. 1120 * 1121 * If igrab() returns NULL the VFS has independently 1122 * determined the inode should be evicted and has 1123 * called iput_final() to start the eviction process. 1124 * The SA handle is still valid but because the VFS 1125 * requires that the eviction succeed we must drop 1126 * our locks and references to allow the eviction to 1127 * complete. The zfs_zget() may then be retried. 1128 * 1129 * This unlikely case could be optimized by registering 1130 * a sops->drop_inode() callback. The callback would 1131 * need to detect the active SA hold thereby informing 1132 * the VFS that this inode should not be evicted. 1133 */ 1134 if (igrab(ZTOI(zp)) == NULL) { 1135 if (zp->z_unlinked) 1136 err = SET_ERROR(ENOENT); 1137 else 1138 err = SET_ERROR(EAGAIN); 1139 } else { 1140 *zpp = zp; 1141 err = 0; 1142 } 1143 1144 mutex_exit(&zp->z_lock); 1145 sa_buf_rele(db, NULL); 1146 zfs_znode_hold_exit(zfsvfs, zh); 1147 1148 if (err == EAGAIN) { 1149 /* inode might need this to finish evict */ 1150 cond_resched(); 1151 goto again; 1152 } 1153 return (err); 1154 } 1155 1156 /* 1157 * Not found create new znode/vnode but only if file exists. 1158 * 1159 * There is a small window where zfs_vget() could 1160 * find this object while a file create is still in 1161 * progress. This is checked for in zfs_znode_alloc() 1162 * 1163 * if zfs_znode_alloc() fails it will drop the hold on the 1164 * bonus buffer. 1165 */ 1166 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, 1167 doi.doi_bonus_type, NULL); 1168 if (zp == NULL) { 1169 err = SET_ERROR(ENOENT); 1170 } else { 1171 *zpp = zp; 1172 } 1173 zfs_znode_hold_exit(zfsvfs, zh); 1174 return (err); 1175 } 1176 1177 int 1178 zfs_rezget(znode_t *zp) 1179 { 1180 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1181 dmu_object_info_t doi; 1182 dmu_buf_t *db; 1183 uint64_t obj_num = zp->z_id; 1184 uint64_t mode; 1185 uint64_t links; 1186 sa_bulk_attr_t bulk[11]; 1187 int err; 1188 int count = 0; 1189 uint64_t gen; 1190 uint64_t z_uid, z_gid; 1191 uint64_t atime[2], mtime[2], ctime[2], btime[2]; 1192 uint64_t projid = ZFS_DEFAULT_PROJID; 1193 znode_hold_t *zh; 1194 1195 /* 1196 * skip ctldir, otherwise they will always get invalidated. This will 1197 * cause funny behaviour for the mounted snapdirs. Especially for 1198 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent 1199 * anyone automount it again as long as someone is still using the 1200 * detached mount. 1201 */ 1202 if (zp->z_is_ctldir) 1203 return (0); 1204 1205 zh = zfs_znode_hold_enter(zfsvfs, obj_num); 1206 1207 mutex_enter(&zp->z_acl_lock); 1208 if (zp->z_acl_cached) { 1209 zfs_acl_free(zp->z_acl_cached); 1210 zp->z_acl_cached = NULL; 1211 } 1212 mutex_exit(&zp->z_acl_lock); 1213 1214 rw_enter(&zp->z_xattr_lock, RW_WRITER); 1215 if (zp->z_xattr_cached) { 1216 nvlist_free(zp->z_xattr_cached); 1217 zp->z_xattr_cached = NULL; 1218 } 1219 rw_exit(&zp->z_xattr_lock); 1220 1221 ASSERT(zp->z_sa_hdl == NULL); 1222 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); 1223 if (err) { 1224 zfs_znode_hold_exit(zfsvfs, zh); 1225 return (err); 1226 } 1227 1228 dmu_object_info_from_db(db, &doi); 1229 if (doi.doi_bonus_type != DMU_OT_SA && 1230 (doi.doi_bonus_type != DMU_OT_ZNODE || 1231 (doi.doi_bonus_type == DMU_OT_ZNODE && 1232 doi.doi_bonus_size < sizeof (znode_phys_t)))) { 1233 sa_buf_rele(db, NULL); 1234 zfs_znode_hold_exit(zfsvfs, zh); 1235 return (SET_ERROR(EINVAL)); 1236 } 1237 1238 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); 1239 1240 /* reload cached values */ 1241 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, 1242 &gen, sizeof (gen)); 1243 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 1244 &zp->z_size, sizeof (zp->z_size)); 1245 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, 1246 &links, sizeof (links)); 1247 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 1248 &zp->z_pflags, sizeof (zp->z_pflags)); 1249 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1250 &z_uid, sizeof (z_uid)); 1251 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 1252 &z_gid, sizeof (z_gid)); 1253 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 1254 &mode, sizeof (mode)); 1255 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 1256 &atime, 16); 1257 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 1258 &mtime, 16); 1259 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 1260 &ctime, 16); 1261 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); 1262 1263 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { 1264 zfs_znode_dmu_fini(zp); 1265 zfs_znode_hold_exit(zfsvfs, zh); 1266 return (SET_ERROR(EIO)); 1267 } 1268 1269 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { 1270 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), 1271 &projid, 8); 1272 if (err != 0 && err != ENOENT) { 1273 zfs_znode_dmu_fini(zp); 1274 zfs_znode_hold_exit(zfsvfs, zh); 1275 return (SET_ERROR(err)); 1276 } 1277 } 1278 1279 zp->z_projid = projid; 1280 zp->z_mode = ZTOI(zp)->i_mode = mode; 1281 zfs_uid_write(ZTOI(zp), z_uid); 1282 zfs_gid_write(ZTOI(zp), z_gid); 1283 1284 ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); 1285 ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); 1286 ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); 1287 ZFS_TIME_DECODE(&zp->z_btime, btime); 1288 1289 if ((uint32_t)gen != ZTOI(zp)->i_generation) { 1290 zfs_znode_dmu_fini(zp); 1291 zfs_znode_hold_exit(zfsvfs, zh); 1292 return (SET_ERROR(EIO)); 1293 } 1294 1295 set_nlink(ZTOI(zp), (uint32_t)links); 1296 zfs_set_inode_flags(zp, ZTOI(zp)); 1297 1298 zp->z_blksz = doi.doi_data_block_size; 1299 zp->z_atime_dirty = B_FALSE; 1300 zfs_znode_update_vfs(zp); 1301 1302 /* 1303 * If the file has zero links, then it has been unlinked on the send 1304 * side and it must be in the received unlinked set. 1305 * We call zfs_znode_dmu_fini() now to prevent any accesses to the 1306 * stale data and to prevent automatic removal of the file in 1307 * zfs_zinactive(). The file will be removed either when it is removed 1308 * on the send side and the next incremental stream is received or 1309 * when the unlinked set gets processed. 1310 */ 1311 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); 1312 if (zp->z_unlinked) 1313 zfs_znode_dmu_fini(zp); 1314 1315 zfs_znode_hold_exit(zfsvfs, zh); 1316 1317 return (0); 1318 } 1319 1320 void 1321 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 1322 { 1323 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1324 objset_t *os = zfsvfs->z_os; 1325 uint64_t obj = zp->z_id; 1326 uint64_t acl_obj = zfs_external_acl(zp); 1327 znode_hold_t *zh; 1328 1329 zh = zfs_znode_hold_enter(zfsvfs, obj); 1330 if (acl_obj) { 1331 VERIFY(!zp->z_is_sa); 1332 VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 1333 } 1334 VERIFY(0 == dmu_object_free(os, obj, tx)); 1335 zfs_znode_dmu_fini(zp); 1336 zfs_znode_hold_exit(zfsvfs, zh); 1337 } 1338 1339 void 1340 zfs_zinactive(znode_t *zp) 1341 { 1342 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1343 uint64_t z_id = zp->z_id; 1344 znode_hold_t *zh; 1345 1346 ASSERT(zp->z_sa_hdl); 1347 1348 /* 1349 * Don't allow a zfs_zget() while were trying to release this znode. 1350 */ 1351 zh = zfs_znode_hold_enter(zfsvfs, z_id); 1352 1353 mutex_enter(&zp->z_lock); 1354 1355 /* 1356 * If this was the last reference to a file with no links, remove 1357 * the file from the file system unless the file system is mounted 1358 * read-only. That can happen, for example, if the file system was 1359 * originally read-write, the file was opened, then unlinked and 1360 * the file system was made read-only before the file was finally 1361 * closed. The file will remain in the unlinked set. 1362 */ 1363 if (zp->z_unlinked) { 1364 ASSERT(!zfsvfs->z_issnap); 1365 if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { 1366 mutex_exit(&zp->z_lock); 1367 zfs_znode_hold_exit(zfsvfs, zh); 1368 zfs_rmnode(zp); 1369 return; 1370 } 1371 } 1372 1373 mutex_exit(&zp->z_lock); 1374 zfs_znode_dmu_fini(zp); 1375 1376 zfs_znode_hold_exit(zfsvfs, zh); 1377 } 1378 1379 #if defined(HAVE_INODE_TIMESPEC64_TIMES) 1380 #define zfs_compare_timespec timespec64_compare 1381 #else 1382 #define zfs_compare_timespec timespec_compare 1383 #endif 1384 1385 /* 1386 * Determine whether the znode's atime must be updated. The logic mostly 1387 * duplicates the Linux kernel's relatime_need_update() functionality. 1388 * This function is only called if the underlying filesystem actually has 1389 * atime updates enabled. 1390 */ 1391 boolean_t 1392 zfs_relatime_need_update(const struct inode *ip) 1393 { 1394 inode_timespec_t now; 1395 1396 gethrestime(&now); 1397 /* 1398 * In relatime mode, only update the atime if the previous atime 1399 * is earlier than either the ctime or mtime or if at least a day 1400 * has passed since the last update of atime. 1401 */ 1402 if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) 1403 return (B_TRUE); 1404 1405 if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) 1406 return (B_TRUE); 1407 1408 if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) 1409 return (B_TRUE); 1410 1411 return (B_FALSE); 1412 } 1413 1414 /* 1415 * Prepare to update znode time stamps. 1416 * 1417 * IN: zp - znode requiring timestamp update 1418 * flag - ATTR_MTIME, ATTR_CTIME flags 1419 * 1420 * OUT: zp - z_seq 1421 * mtime - new mtime 1422 * ctime - new ctime 1423 * 1424 * Note: We don't update atime here, because we rely on Linux VFS to do 1425 * atime updating. 1426 */ 1427 void 1428 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], 1429 uint64_t ctime[2]) 1430 { 1431 inode_timespec_t now; 1432 1433 gethrestime(&now); 1434 1435 zp->z_seq++; 1436 1437 if (flag & ATTR_MTIME) { 1438 ZFS_TIME_ENCODE(&now, mtime); 1439 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); 1440 if (ZTOZSB(zp)->z_use_fuids) { 1441 zp->z_pflags |= (ZFS_ARCHIVE | 1442 ZFS_AV_MODIFIED); 1443 } 1444 } 1445 1446 if (flag & ATTR_CTIME) { 1447 ZFS_TIME_ENCODE(&now, ctime); 1448 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); 1449 if (ZTOZSB(zp)->z_use_fuids) 1450 zp->z_pflags |= ZFS_ARCHIVE; 1451 } 1452 } 1453 1454 /* 1455 * Grow the block size for a file. 1456 * 1457 * IN: zp - znode of file to free data in. 1458 * size - requested block size 1459 * tx - open transaction. 1460 * 1461 * NOTE: this function assumes that the znode is write locked. 1462 */ 1463 void 1464 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1465 { 1466 int error; 1467 u_longlong_t dummy; 1468 1469 if (size <= zp->z_blksz) 1470 return; 1471 /* 1472 * If the file size is already greater than the current blocksize, 1473 * we will not grow. If there is more than one block in a file, 1474 * the blocksize cannot change. 1475 */ 1476 if (zp->z_blksz && zp->z_size > zp->z_blksz) 1477 return; 1478 1479 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, 1480 size, 0, tx); 1481 1482 if (error == ENOTSUP) 1483 return; 1484 ASSERT0(error); 1485 1486 /* What blocksize did we actually get? */ 1487 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); 1488 } 1489 1490 /* 1491 * Increase the file length 1492 * 1493 * IN: zp - znode of file to free data in. 1494 * end - new end-of-file 1495 * 1496 * RETURN: 0 on success, error code on failure 1497 */ 1498 static int 1499 zfs_extend(znode_t *zp, uint64_t end) 1500 { 1501 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1502 dmu_tx_t *tx; 1503 zfs_locked_range_t *lr; 1504 uint64_t newblksz; 1505 int error; 1506 1507 /* 1508 * We will change zp_size, lock the whole file. 1509 */ 1510 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); 1511 1512 /* 1513 * Nothing to do if file already at desired length. 1514 */ 1515 if (end <= zp->z_size) { 1516 zfs_rangelock_exit(lr); 1517 return (0); 1518 } 1519 tx = dmu_tx_create(zfsvfs->z_os); 1520 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1521 zfs_sa_upgrade_txholds(tx, zp); 1522 if (end > zp->z_blksz && 1523 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1524 /* 1525 * We are growing the file past the current block size. 1526 */ 1527 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { 1528 /* 1529 * File's blocksize is already larger than the 1530 * "recordsize" property. Only let it grow to 1531 * the next power of 2. 1532 */ 1533 ASSERT(!ISP2(zp->z_blksz)); 1534 newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); 1535 } else { 1536 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); 1537 } 1538 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1539 } else { 1540 newblksz = 0; 1541 } 1542 1543 error = dmu_tx_assign(tx, TXG_WAIT); 1544 if (error) { 1545 dmu_tx_abort(tx); 1546 zfs_rangelock_exit(lr); 1547 return (error); 1548 } 1549 1550 if (newblksz) 1551 zfs_grow_blocksize(zp, newblksz, tx); 1552 1553 zp->z_size = end; 1554 1555 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), 1556 &zp->z_size, sizeof (zp->z_size), tx)); 1557 1558 zfs_rangelock_exit(lr); 1559 1560 dmu_tx_commit(tx); 1561 1562 return (0); 1563 } 1564 1565 /* 1566 * zfs_zero_partial_page - Modeled after update_pages() but 1567 * with different arguments and semantics for use by zfs_freesp(). 1568 * 1569 * Zeroes a piece of a single page cache entry for zp at offset 1570 * start and length len. 1571 * 1572 * Caller must acquire a range lock on the file for the region 1573 * being zeroed in order that the ARC and page cache stay in sync. 1574 */ 1575 static void 1576 zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) 1577 { 1578 struct address_space *mp = ZTOI(zp)->i_mapping; 1579 struct page *pp; 1580 int64_t off; 1581 void *pb; 1582 1583 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); 1584 1585 off = start & (PAGE_SIZE - 1); 1586 start &= PAGE_MASK; 1587 1588 pp = find_lock_page(mp, start >> PAGE_SHIFT); 1589 if (pp) { 1590 if (mapping_writably_mapped(mp)) 1591 flush_dcache_page(pp); 1592 1593 pb = kmap(pp); 1594 memset(pb + off, 0, len); 1595 kunmap(pp); 1596 1597 if (mapping_writably_mapped(mp)) 1598 flush_dcache_page(pp); 1599 1600 mark_page_accessed(pp); 1601 SetPageUptodate(pp); 1602 ClearPageError(pp); 1603 unlock_page(pp); 1604 put_page(pp); 1605 } 1606 } 1607 1608 /* 1609 * Free space in a file. 1610 * 1611 * IN: zp - znode of file to free data in. 1612 * off - start of section to free. 1613 * len - length of section to free. 1614 * 1615 * RETURN: 0 on success, error code on failure 1616 */ 1617 static int 1618 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1619 { 1620 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1621 zfs_locked_range_t *lr; 1622 int error; 1623 1624 /* 1625 * Lock the range being freed. 1626 */ 1627 lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); 1628 1629 /* 1630 * Nothing to do if file already at desired length. 1631 */ 1632 if (off >= zp->z_size) { 1633 zfs_rangelock_exit(lr); 1634 return (0); 1635 } 1636 1637 if (off + len > zp->z_size) 1638 len = zp->z_size - off; 1639 1640 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1641 1642 /* 1643 * Zero partial page cache entries. This must be done under a 1644 * range lock in order to keep the ARC and page cache in sync. 1645 */ 1646 if (zn_has_cached_data(zp, off, off + len - 1)) { 1647 loff_t first_page, last_page, page_len; 1648 loff_t first_page_offset, last_page_offset; 1649 1650 /* first possible full page in hole */ 1651 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; 1652 /* last page of hole */ 1653 last_page = (off + len) >> PAGE_SHIFT; 1654 1655 /* offset of first_page */ 1656 first_page_offset = first_page << PAGE_SHIFT; 1657 /* offset of last_page */ 1658 last_page_offset = last_page << PAGE_SHIFT; 1659 1660 /* truncate whole pages */ 1661 if (last_page_offset > first_page_offset) { 1662 truncate_inode_pages_range(ZTOI(zp)->i_mapping, 1663 first_page_offset, last_page_offset - 1); 1664 } 1665 1666 /* truncate sub-page ranges */ 1667 if (first_page > last_page) { 1668 /* entire punched area within a single page */ 1669 zfs_zero_partial_page(zp, off, len); 1670 } else { 1671 /* beginning of punched area at the end of a page */ 1672 page_len = first_page_offset - off; 1673 if (page_len > 0) 1674 zfs_zero_partial_page(zp, off, page_len); 1675 1676 /* end of punched area at the beginning of a page */ 1677 page_len = off + len - last_page_offset; 1678 if (page_len > 0) 1679 zfs_zero_partial_page(zp, last_page_offset, 1680 page_len); 1681 } 1682 } 1683 zfs_rangelock_exit(lr); 1684 1685 return (error); 1686 } 1687 1688 /* 1689 * Truncate a file 1690 * 1691 * IN: zp - znode of file to free data in. 1692 * end - new end-of-file. 1693 * 1694 * RETURN: 0 on success, error code on failure 1695 */ 1696 static int 1697 zfs_trunc(znode_t *zp, uint64_t end) 1698 { 1699 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1700 dmu_tx_t *tx; 1701 zfs_locked_range_t *lr; 1702 int error; 1703 sa_bulk_attr_t bulk[2]; 1704 int count = 0; 1705 1706 /* 1707 * We will change zp_size, lock the whole file. 1708 */ 1709 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); 1710 1711 /* 1712 * Nothing to do if file already at desired length. 1713 */ 1714 if (end >= zp->z_size) { 1715 zfs_rangelock_exit(lr); 1716 return (0); 1717 } 1718 1719 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, 1720 DMU_OBJECT_END); 1721 if (error) { 1722 zfs_rangelock_exit(lr); 1723 return (error); 1724 } 1725 tx = dmu_tx_create(zfsvfs->z_os); 1726 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1727 zfs_sa_upgrade_txholds(tx, zp); 1728 dmu_tx_mark_netfree(tx); 1729 error = dmu_tx_assign(tx, TXG_WAIT); 1730 if (error) { 1731 dmu_tx_abort(tx); 1732 zfs_rangelock_exit(lr); 1733 return (error); 1734 } 1735 1736 zp->z_size = end; 1737 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), 1738 NULL, &zp->z_size, sizeof (zp->z_size)); 1739 1740 if (end == 0) { 1741 zp->z_pflags &= ~ZFS_SPARSE; 1742 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1743 NULL, &zp->z_pflags, 8); 1744 } 1745 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); 1746 1747 dmu_tx_commit(tx); 1748 zfs_rangelock_exit(lr); 1749 1750 return (0); 1751 } 1752 1753 /* 1754 * Free space in a file 1755 * 1756 * IN: zp - znode of file to free data in. 1757 * off - start of range 1758 * len - end of range (0 => EOF) 1759 * flag - current file open mode flags. 1760 * log - TRUE if this action should be logged 1761 * 1762 * RETURN: 0 on success, error code on failure 1763 */ 1764 int 1765 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1766 { 1767 dmu_tx_t *tx; 1768 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1769 zilog_t *zilog = zfsvfs->z_log; 1770 uint64_t mode; 1771 uint64_t mtime[2], ctime[2]; 1772 sa_bulk_attr_t bulk[3]; 1773 int count = 0; 1774 int error; 1775 1776 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, 1777 sizeof (mode))) != 0) 1778 return (error); 1779 1780 if (off > zp->z_size) { 1781 error = zfs_extend(zp, off+len); 1782 if (error == 0 && log) 1783 goto log; 1784 goto out; 1785 } 1786 1787 if (len == 0) { 1788 error = zfs_trunc(zp, off); 1789 } else { 1790 if ((error = zfs_free_range(zp, off, len)) == 0 && 1791 off + len > zp->z_size) 1792 error = zfs_extend(zp, off+len); 1793 } 1794 if (error || !log) 1795 goto out; 1796 log: 1797 tx = dmu_tx_create(zfsvfs->z_os); 1798 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1799 zfs_sa_upgrade_txholds(tx, zp); 1800 error = dmu_tx_assign(tx, TXG_WAIT); 1801 if (error) { 1802 dmu_tx_abort(tx); 1803 goto out; 1804 } 1805 1806 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); 1807 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); 1808 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1809 NULL, &zp->z_pflags, 8); 1810 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 1811 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1812 ASSERT(error == 0); 1813 1814 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1815 1816 dmu_tx_commit(tx); 1817 1818 zfs_znode_update_vfs(zp); 1819 error = 0; 1820 1821 out: 1822 /* 1823 * Truncate the page cache - for file truncate operations, use 1824 * the purpose-built API for truncations. For punching operations, 1825 * the truncation is handled under a range lock in zfs_free_range. 1826 */ 1827 if (len == 0) 1828 truncate_setsize(ZTOI(zp), off); 1829 return (error); 1830 } 1831 1832 void 1833 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1834 { 1835 struct super_block *sb; 1836 zfsvfs_t *zfsvfs; 1837 uint64_t moid, obj, sa_obj, version; 1838 uint64_t sense = ZFS_CASE_SENSITIVE; 1839 uint64_t norm = 0; 1840 nvpair_t *elem; 1841 int size; 1842 int error; 1843 int i; 1844 znode_t *rootzp = NULL; 1845 vattr_t vattr; 1846 znode_t *zp; 1847 zfs_acl_ids_t acl_ids; 1848 1849 /* 1850 * First attempt to create master node. 1851 */ 1852 /* 1853 * In an empty objset, there are no blocks to read and thus 1854 * there can be no i/o errors (which we assert below). 1855 */ 1856 moid = MASTER_NODE_OBJ; 1857 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1858 DMU_OT_NONE, 0, tx); 1859 ASSERT(error == 0); 1860 1861 /* 1862 * Set starting attributes. 1863 */ 1864 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); 1865 elem = NULL; 1866 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1867 /* For the moment we expect all zpl props to be uint64_ts */ 1868 uint64_t val; 1869 const char *name; 1870 1871 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1872 VERIFY(nvpair_value_uint64(elem, &val) == 0); 1873 name = nvpair_name(elem); 1874 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1875 if (val < version) 1876 version = val; 1877 } else { 1878 error = zap_update(os, moid, name, 8, 1, &val, tx); 1879 } 1880 ASSERT(error == 0); 1881 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1882 norm = val; 1883 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1884 sense = val; 1885 } 1886 ASSERT(version != 0); 1887 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); 1888 ASSERT(error == 0); 1889 1890 /* 1891 * Create zap object used for SA attribute registration 1892 */ 1893 1894 if (version >= ZPL_VERSION_SA) { 1895 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 1896 DMU_OT_NONE, 0, tx); 1897 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 1898 ASSERT(error == 0); 1899 } else { 1900 sa_obj = 0; 1901 } 1902 /* 1903 * Create a delete queue. 1904 */ 1905 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1906 1907 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); 1908 ASSERT(error == 0); 1909 1910 /* 1911 * Create root znode. Create minimal znode/inode/zfsvfs/sb 1912 * to allow zfs_mknode to work. 1913 */ 1914 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; 1915 vattr.va_mode = S_IFDIR|0755; 1916 vattr.va_uid = crgetuid(cr); 1917 vattr.va_gid = crgetgid(cr); 1918 1919 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1920 rootzp->z_unlinked = B_FALSE; 1921 rootzp->z_atime_dirty = B_FALSE; 1922 rootzp->z_is_sa = USE_SA(version, os); 1923 rootzp->z_pflags = 0; 1924 1925 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 1926 zfsvfs->z_os = os; 1927 zfsvfs->z_parent = zfsvfs; 1928 zfsvfs->z_version = version; 1929 zfsvfs->z_use_fuids = USE_FUIDS(version, os); 1930 zfsvfs->z_use_sa = USE_SA(version, os); 1931 zfsvfs->z_norm = norm; 1932 1933 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); 1934 sb->s_fs_info = zfsvfs; 1935 1936 ZTOI(rootzp)->i_sb = sb; 1937 1938 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 1939 &zfsvfs->z_attr_table); 1940 1941 ASSERT(error == 0); 1942 1943 /* 1944 * Fold case on file systems that are always or sometimes case 1945 * insensitive. 1946 */ 1947 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1948 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 1949 1950 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1951 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1952 offsetof(znode_t, z_link_node)); 1953 1954 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); 1955 zfsvfs->z_hold_size = size; 1956 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, 1957 KM_SLEEP); 1958 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); 1959 for (i = 0; i != size; i++) { 1960 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, 1961 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); 1962 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); 1963 } 1964 1965 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, 1966 cr, NULL, &acl_ids, zfs_init_idmap)); 1967 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); 1968 ASSERT3P(zp, ==, rootzp); 1969 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1970 ASSERT(error == 0); 1971 zfs_acl_ids_free(&acl_ids); 1972 1973 atomic_set(&ZTOI(rootzp)->i_count, 0); 1974 sa_handle_destroy(rootzp->z_sa_hdl); 1975 kmem_cache_free(znode_cache, rootzp); 1976 1977 for (i = 0; i != size; i++) { 1978 avl_destroy(&zfsvfs->z_hold_trees[i]); 1979 mutex_destroy(&zfsvfs->z_hold_locks[i]); 1980 } 1981 1982 mutex_destroy(&zfsvfs->z_znodes_lock); 1983 1984 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); 1985 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); 1986 kmem_free(sb, sizeof (struct super_block)); 1987 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1988 } 1989 #endif /* _KERNEL */ 1990 1991 static int 1992 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) 1993 { 1994 uint64_t sa_obj = 0; 1995 int error; 1996 1997 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); 1998 if (error != 0 && error != ENOENT) 1999 return (error); 2000 2001 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); 2002 return (error); 2003 } 2004 2005 static int 2006 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, 2007 dmu_buf_t **db, const void *tag) 2008 { 2009 dmu_object_info_t doi; 2010 int error; 2011 2012 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) 2013 return (error); 2014 2015 dmu_object_info_from_db(*db, &doi); 2016 if ((doi.doi_bonus_type != DMU_OT_SA && 2017 doi.doi_bonus_type != DMU_OT_ZNODE) || 2018 (doi.doi_bonus_type == DMU_OT_ZNODE && 2019 doi.doi_bonus_size < sizeof (znode_phys_t))) { 2020 sa_buf_rele(*db, tag); 2021 return (SET_ERROR(ENOTSUP)); 2022 } 2023 2024 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); 2025 if (error != 0) { 2026 sa_buf_rele(*db, tag); 2027 return (error); 2028 } 2029 2030 return (0); 2031 } 2032 2033 static void 2034 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag) 2035 { 2036 sa_handle_destroy(hdl); 2037 sa_buf_rele(db, tag); 2038 } 2039 2040 /* 2041 * Given an object number, return its parent object number and whether 2042 * or not the object is an extended attribute directory. 2043 */ 2044 static int 2045 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, 2046 uint64_t *pobjp, int *is_xattrdir) 2047 { 2048 uint64_t parent; 2049 uint64_t pflags; 2050 uint64_t mode; 2051 uint64_t parent_mode; 2052 sa_bulk_attr_t bulk[3]; 2053 sa_handle_t *sa_hdl; 2054 dmu_buf_t *sa_db; 2055 int count = 0; 2056 int error; 2057 2058 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, 2059 &parent, sizeof (parent)); 2060 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, 2061 &pflags, sizeof (pflags)); 2062 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, 2063 &mode, sizeof (mode)); 2064 2065 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) 2066 return (error); 2067 2068 /* 2069 * When a link is removed its parent pointer is not changed and will 2070 * be invalid. There are two cases where a link is removed but the 2071 * file stays around, when it goes to the delete queue and when there 2072 * are additional links. 2073 */ 2074 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); 2075 if (error != 0) 2076 return (error); 2077 2078 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); 2079 zfs_release_sa_handle(sa_hdl, sa_db, FTAG); 2080 if (error != 0) 2081 return (error); 2082 2083 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); 2084 2085 /* 2086 * Extended attributes can be applied to files, directories, etc. 2087 * Otherwise the parent must be a directory. 2088 */ 2089 if (!*is_xattrdir && !S_ISDIR(parent_mode)) 2090 return (SET_ERROR(EINVAL)); 2091 2092 *pobjp = parent; 2093 2094 return (0); 2095 } 2096 2097 /* 2098 * Given an object number, return some zpl level statistics 2099 */ 2100 static int 2101 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, 2102 zfs_stat_t *sb) 2103 { 2104 sa_bulk_attr_t bulk[4]; 2105 int count = 0; 2106 2107 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, 2108 &sb->zs_mode, sizeof (sb->zs_mode)); 2109 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, 2110 &sb->zs_gen, sizeof (sb->zs_gen)); 2111 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, 2112 &sb->zs_links, sizeof (sb->zs_links)); 2113 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, 2114 &sb->zs_ctime, sizeof (sb->zs_ctime)); 2115 2116 return (sa_bulk_lookup(hdl, bulk, count)); 2117 } 2118 2119 static int 2120 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, 2121 sa_attr_type_t *sa_table, char *buf, int len) 2122 { 2123 sa_handle_t *sa_hdl; 2124 sa_handle_t *prevhdl = NULL; 2125 dmu_buf_t *prevdb = NULL; 2126 dmu_buf_t *sa_db = NULL; 2127 char *path = buf + len - 1; 2128 int error; 2129 2130 *path = '\0'; 2131 sa_hdl = hdl; 2132 2133 uint64_t deleteq_obj; 2134 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, 2135 ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); 2136 error = zap_lookup_int(osp, deleteq_obj, obj); 2137 if (error == 0) { 2138 return (ESTALE); 2139 } else if (error != ENOENT) { 2140 return (error); 2141 } 2142 2143 for (;;) { 2144 uint64_t pobj = 0; 2145 char component[MAXNAMELEN + 2]; 2146 size_t complen; 2147 int is_xattrdir = 0; 2148 2149 if (prevdb) { 2150 ASSERT(prevhdl != NULL); 2151 zfs_release_sa_handle(prevhdl, prevdb, FTAG); 2152 } 2153 2154 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, 2155 &is_xattrdir)) != 0) 2156 break; 2157 2158 if (pobj == obj) { 2159 if (path[0] != '/') 2160 *--path = '/'; 2161 break; 2162 } 2163 2164 component[0] = '/'; 2165 if (is_xattrdir) { 2166 strcpy(component + 1, "<xattrdir>"); 2167 } else { 2168 error = zap_value_search(osp, pobj, obj, 2169 ZFS_DIRENT_OBJ(-1ULL), component + 1); 2170 if (error != 0) 2171 break; 2172 } 2173 2174 complen = strlen(component); 2175 path -= complen; 2176 ASSERT(path >= buf); 2177 memcpy(path, component, complen); 2178 obj = pobj; 2179 2180 if (sa_hdl != hdl) { 2181 prevhdl = sa_hdl; 2182 prevdb = sa_db; 2183 } 2184 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); 2185 if (error != 0) { 2186 sa_hdl = prevhdl; 2187 sa_db = prevdb; 2188 break; 2189 } 2190 } 2191 2192 if (sa_hdl != NULL && sa_hdl != hdl) { 2193 ASSERT(sa_db != NULL); 2194 zfs_release_sa_handle(sa_hdl, sa_db, FTAG); 2195 } 2196 2197 if (error == 0) 2198 (void) memmove(buf, path, buf + len - path); 2199 2200 return (error); 2201 } 2202 2203 int 2204 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 2205 { 2206 sa_attr_type_t *sa_table; 2207 sa_handle_t *hdl; 2208 dmu_buf_t *db; 2209 int error; 2210 2211 error = zfs_sa_setup(osp, &sa_table); 2212 if (error != 0) 2213 return (error); 2214 2215 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); 2216 if (error != 0) 2217 return (error); 2218 2219 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); 2220 2221 zfs_release_sa_handle(hdl, db, FTAG); 2222 return (error); 2223 } 2224 2225 int 2226 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, 2227 char *buf, int len) 2228 { 2229 char *path = buf + len - 1; 2230 sa_attr_type_t *sa_table; 2231 sa_handle_t *hdl; 2232 dmu_buf_t *db; 2233 int error; 2234 2235 *path = '\0'; 2236 2237 error = zfs_sa_setup(osp, &sa_table); 2238 if (error != 0) 2239 return (error); 2240 2241 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); 2242 if (error != 0) 2243 return (error); 2244 2245 error = zfs_obj_to_stats_impl(hdl, sa_table, sb); 2246 if (error != 0) { 2247 zfs_release_sa_handle(hdl, db, FTAG); 2248 return (error); 2249 } 2250 2251 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); 2252 2253 zfs_release_sa_handle(hdl, db, FTAG); 2254 return (error); 2255 } 2256 2257 /* 2258 * Read a property stored within the master node. 2259 */ 2260 int 2261 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2262 { 2263 uint64_t *cached_copy = NULL; 2264 2265 /* 2266 * Figure out where in the objset_t the cached copy would live, if it 2267 * is available for the requested property. 2268 */ 2269 if (os != NULL) { 2270 switch (prop) { 2271 case ZFS_PROP_VERSION: 2272 cached_copy = &os->os_version; 2273 break; 2274 case ZFS_PROP_NORMALIZE: 2275 cached_copy = &os->os_normalization; 2276 break; 2277 case ZFS_PROP_UTF8ONLY: 2278 cached_copy = &os->os_utf8only; 2279 break; 2280 case ZFS_PROP_CASE: 2281 cached_copy = &os->os_casesensitivity; 2282 break; 2283 default: 2284 break; 2285 } 2286 } 2287 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { 2288 *value = *cached_copy; 2289 return (0); 2290 } 2291 2292 /* 2293 * If the property wasn't cached, look up the file system's value for 2294 * the property. For the version property, we look up a slightly 2295 * different string. 2296 */ 2297 const char *pname; 2298 int error = ENOENT; 2299 if (prop == ZFS_PROP_VERSION) 2300 pname = ZPL_VERSION_STR; 2301 else 2302 pname = zfs_prop_to_name(prop); 2303 2304 if (os != NULL) { 2305 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); 2306 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2307 } 2308 2309 if (error == ENOENT) { 2310 /* No value set, use the default value */ 2311 switch (prop) { 2312 case ZFS_PROP_VERSION: 2313 *value = ZPL_VERSION; 2314 break; 2315 case ZFS_PROP_NORMALIZE: 2316 case ZFS_PROP_UTF8ONLY: 2317 *value = 0; 2318 break; 2319 case ZFS_PROP_CASE: 2320 *value = ZFS_CASE_SENSITIVE; 2321 break; 2322 case ZFS_PROP_ACLTYPE: 2323 *value = ZFS_ACLTYPE_OFF; 2324 break; 2325 default: 2326 return (error); 2327 } 2328 error = 0; 2329 } 2330 2331 /* 2332 * If one of the methods for getting the property value above worked, 2333 * copy it into the objset_t's cache. 2334 */ 2335 if (error == 0 && cached_copy != NULL) { 2336 *cached_copy = *value; 2337 } 2338 2339 return (error); 2340 } 2341 2342 #if defined(_KERNEL) 2343 EXPORT_SYMBOL(zfs_create_fs); 2344 EXPORT_SYMBOL(zfs_obj_to_path); 2345 2346 /* CSTYLED */ 2347 module_param(zfs_object_mutex_size, uint, 0644); 2348 MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); 2349 module_param(zfs_unlink_suspend_progress, int, 0644); 2350 MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " 2351 "(debug - leaks space into the unlinked set)"); 2352 #endif 2353