1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
*/

/* Portions Copyright 2007 Jeremy Teo */

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/zpl.h>
#endif /* _KERNEL */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/zfs_refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL

/*
 * kmem caches backing znode_t and znode_hold_t allocations.  Both are
 * created in zfs_znode_init() and destroyed (and reset to NULL) in
 * zfs_znode_fini().
 */
static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;

/*
 * NOTE(review): presumably the number of z_hold_locks hash buckets used by
 * ZFS_OBJ_HASH(); the consumer is not visible in this file section - confirm.
 */
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

/*
 * This is used by the test suite so that it can delay znodes from being
 * freed in order to inspect the unlinked set.
 */
static int zfs_unlink_suspend_progress = 0;

/*
 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
 * z_rangelock. It will modify the offset and length of the lock to reflect
 * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
 * called with the rangelock_t's rl_lock held, which avoids races.
90 */ 91 static void 92 zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) 93 { 94 znode_t *zp = arg; 95 96 /* 97 * If in append mode, convert to writer and lock starting at the 98 * current end of file. 99 */ 100 if (new->lr_type == RL_APPEND) { 101 new->lr_offset = zp->z_size; 102 new->lr_type = RL_WRITER; 103 } 104 105 /* 106 * If we need to grow the block size then lock the whole file range. 107 */ 108 uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); 109 if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || 110 zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { 111 new->lr_offset = 0; 112 new->lr_length = UINT64_MAX; 113 } 114 } 115 116 static int 117 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 118 { 119 (void) arg, (void) kmflags; 120 znode_t *zp = buf; 121 122 inode_init_once(ZTOI(zp)); 123 list_link_init(&zp->z_link_node); 124 125 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 126 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 127 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); 128 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 129 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); 130 131 zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); 132 133 zp->z_dirlocks = NULL; 134 zp->z_acl_cached = NULL; 135 zp->z_xattr_cached = NULL; 136 zp->z_xattr_parent = 0; 137 zp->z_sync_writes_cnt = 0; 138 zp->z_async_writes_cnt = 0; 139 140 return (0); 141 } 142 143 static void 144 zfs_znode_cache_destructor(void *buf, void *arg) 145 { 146 (void) arg; 147 znode_t *zp = buf; 148 149 ASSERT(!list_link_active(&zp->z_link_node)); 150 mutex_destroy(&zp->z_lock); 151 rw_destroy(&zp->z_parent_lock); 152 rw_destroy(&zp->z_name_lock); 153 mutex_destroy(&zp->z_acl_lock); 154 rw_destroy(&zp->z_xattr_lock); 155 zfs_rangelock_fini(&zp->z_rangelock); 156 157 ASSERT3P(zp->z_dirlocks, ==, NULL); 158 ASSERT3P(zp->z_acl_cached, ==, NULL); 159 ASSERT3P(zp->z_xattr_cached, ==, NULL); 160 161 
ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); 162 ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); 163 } 164 165 static int 166 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) 167 { 168 (void) arg, (void) kmflags; 169 znode_hold_t *zh = buf; 170 171 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); 172 zh->zh_refcount = 0; 173 174 return (0); 175 } 176 177 static void 178 zfs_znode_hold_cache_destructor(void *buf, void *arg) 179 { 180 (void) arg; 181 znode_hold_t *zh = buf; 182 183 mutex_destroy(&zh->zh_lock); 184 } 185 186 void 187 zfs_znode_init(void) 188 { 189 /* 190 * Initialize zcache. The KMC_SLAB hint is used in order that it be 191 * backed by kmalloc() when on the Linux slab in order that any 192 * wait_on_bit() operations on the related inode operate properly. 193 */ 194 ASSERT(znode_cache == NULL); 195 znode_cache = kmem_cache_create("zfs_znode_cache", 196 sizeof (znode_t), 0, zfs_znode_cache_constructor, 197 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); 198 199 ASSERT(znode_hold_cache == NULL); 200 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", 201 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, 202 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); 203 } 204 205 void 206 zfs_znode_fini(void) 207 { 208 /* 209 * Cleanup zcache 210 */ 211 if (znode_cache) 212 kmem_cache_destroy(znode_cache); 213 znode_cache = NULL; 214 215 if (znode_hold_cache) 216 kmem_cache_destroy(znode_hold_cache); 217 znode_hold_cache = NULL; 218 } 219 220 /* 221 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to 222 * serialize access to a znode and its SA buffer while the object is being 223 * created or destroyed. This kind of locking would normally reside in the 224 * znode itself but in this case that's impossible because the znode and SA 225 * buffer may not yet exist. 
* Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held.  In
 * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the z_hold_locks.
 *    This ensures evict(), which can be called from direct memory reclaim, will
 *    never block waiting on a z_hold_locks which just happens to have hashed
 *    to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and never
 *    shared.  This minimizes lock contention without creating a large number
 *    of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed.  However, because these are backed by a kmem cache
 * and very short lived this cost is minimal.
 */

/*
 * AVL comparator for znode_hold_t entries; orders them by zh_obj.
 */
int
zfs_znode_hold_compare(const void *a, const void *b)
{
	const znode_hold_t *zh_a = (const znode_hold_t *)a;
	const znode_hold_t *zh_b = (const znode_hold_t *)b;

	return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
}

/*
 * Return B_TRUE when a hold entry for 'obj' exists in its hash bucket and
 * MUTEX_HELD() reports that entry's zh_lock as held.  Only referenced from
 * ASSERT()s in this file, hence __maybe_unused.
 */
static boolean_t __maybe_unused
zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t held;

	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	return (held);
}

/*
 * Acquire the per-object hold for 'obj' and return it with zh_lock held.
 * The returned hold must be released with zfs_znode_hold_exit().
 */
znode_hold_t *
zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, *zh_new, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t found = B_FALSE;

	/*
	 * Allocate up front, before taking the bucket lock, so that no
	 * allocation happens while a z_hold_locks mutex is held.
	 */
	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	if (likely(zh == NULL)) {
		zh = zh_new;
		zh->zh_obj = obj;
		avl_add(&zfsvfs->z_hold_trees[i], zh);
	} else {
		ASSERT3U(zh->zh_obj, ==, obj);
		found = B_TRUE;
	}
	zh->zh_refcount++;
	ASSERT3S(zh->zh_refcount, >, 0);
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	/* An entry already existed; the speculative allocation is unused. */
	if (found == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh_new);

	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
	mutex_enter(&zh->zh_lock);

	return (zh);
}

/*
 * Release a hold obtained from zfs_znode_hold_enter().  The entry is removed
 * from its AVL tree and freed once its reference count drops to zero.
 */
void
zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
{
	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
	boolean_t remove = B_FALSE;

	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
	mutex_exit(&zh->zh_lock);

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	ASSERT3S(zh->zh_refcount, >, 0);
	if (--zh->zh_refcount == 0) {
		avl_remove(&zfsvfs->z_hold_trees[i], zh);
		remove = B_TRUE;
	}
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	/* Free outside of the bucket lock. */
	if (remove == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh);
}

/*
 * Returns 'dev' unchanged (converted to dev_t).
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	return (dev);
}

/*
 * Attach an SA handle to 'zp' under z_lock.  When 'sa_hdl' is NULL a shared
 * handle is obtained from 'db'; otherwise the supplied handle is adopted and
 * its user pointer is set to 'zp'.  Also records whether the object uses the
 * DMU_OT_SA bonus type.  The caller must hold the znode hold for zp->z_id.
 */
static void
zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	if (sa_hdl == NULL) {
		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
		    SA_HDL_SHARED, &zp->z_sa_hdl));
	} else {
		zp->z_sa_hdl = sa_hdl;
		sa_set_userp(sa_hdl, zp);
	}

	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;

	mutex_exit(&zp->z_lock);
}

/*
 * Destroy the znode's SA handle and clear the pointer.  The caller must hold
 * either the znode hold for this object or z_teardown_inactive_lock as
 * writer.
 */
void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}

/*
 * Called by new_inode() to allocate a new inode.
 *
 * The inode is the one embedded in a znode taken from znode_cache; 'sb' is
 * not used here.  Always returns 0.
 */
int
zfs_inode_alloc(struct super_block *sb, struct inode **ip)
{
	znode_t *zp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	*ip = ZTOI(zp);

	return (0);
}

/*
 * Called in multiple places when an inode should be destroyed.
 *
 * Unlinks the znode from z_all_znodes (if linked), releases the cached ACL
 * and xattr nvlist, and returns the znode to znode_cache.
 */
void
zfs_inode_destroy(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	if (list_link_active(&zp->z_link_node)) {
		list_remove(&zfsvfs->z_all_znodes, zp);
		zfsvfs->z_nr_znodes--;
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/* The cache destructor asserts these pointers are NULL. */
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);
}

/*
 * Install the inode/file/address-space operation tables that match the
 * file type encoded in ip->i_mode.
 */
static void
zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
{
	uint64_t rdev = 0;

	switch (ip->i_mode & S_IFMT) {
	case S_IFREG:
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;

	case S_IFDIR:
#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
		ip->i_flags |= S_IOPS_WRAPPER;
		ip->i_op = &zpl_dir_inode_operations.ops;
#else
		ip->i_op = &zpl_dir_inode_operations;
#endif
		ip->i_fop = &zpl_dir_file_operations;
		ITOZ(ip)->z_zn_prefetch = B_TRUE;
		break;

	case S_IFLNK:
		ip->i_op = &zpl_symlink_inode_operations;
		break;

	/*
	 * rdev is stored in an SA only for device files.  The lookup result
	 * is deliberately ignored; rdev stays 0 if the lookup fails.
	 */
	case S_IFCHR:
	case S_IFBLK:
		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
		    sizeof (rdev));
		zfs_fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		init_special_inode(ip, ip->i_mode, rdev);
		ip->i_op = &zpl_special_inode_operations;
		break;

	default:
		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
		    (u_longlong_t)ip->i_ino, ip->i_mode);

		/* Assume the inode is a file and attempt to continue */
		ip->i_mode = S_IFREG | 0644;
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;
	}
}

/*
 * Mirror the ZFS_IMMUTABLE and ZFS_APPENDONLY bits of zp->z_pflags into the
 * inode's S_IMMUTABLE and S_APPEND flags (setting or clearing each one).
 */
static void
zfs_set_inode_flags(znode_t *zp, struct inode *ip)
{
	/*
	 * Linux and Solaris have different sets of file attributes, so we
	 * restrict this conversion to the intersection of the two.
	 */
#ifdef HAVE_INODE_SET_FLAGS
	unsigned int flags = 0;
	if (zp->z_pflags & ZFS_IMMUTABLE)
		flags |= S_IMMUTABLE;
	if (zp->z_pflags & ZFS_APPENDONLY)
		flags |= S_APPEND;

	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
#else
	if (zp->z_pflags & ZFS_IMMUTABLE)
		ip->i_flags |= S_IMMUTABLE;
	else
		ip->i_flags &= ~S_IMMUTABLE;

	if (zp->z_pflags & ZFS_APPENDONLY)
		ip->i_flags |= S_APPEND;
	else
		ip->i_flags &= ~S_APPEND;
#endif
}

/*
 * Update the embedded inode given the znode.
 *
 * Copies the mode, the block count reported by the DMU and the file size
 * into the inode while holding i_lock.
 */
void
zfs_znode_update_vfs(znode_t *zp)
{
	struct inode	*ip;
	uint32_t	blksize;
	u_longlong_t	i_blocks;

	ASSERT(zp != NULL);
	ip = ZTOI(zp);

	/* Skip .zfs control nodes which do not exist on disk. */
	if (zfsctl_is_node(ip))
		return;

	/* blksize is a required out-parameter; only i_blocks is used. */
	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);

	spin_lock(&ip->i_lock);
	ip->i_mode = zp->z_mode;
	ip->i_blocks = i_blocks;
	i_size_write(ip, zp->z_size);
	spin_unlock(&ip->i_lock);
}


/*
 * Construct a znode+inode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 * Returns NULL if new_inode() fails, or if the SA attributes cannot be
 * looked up or the stored generation is 0.  In the lookup-failure case an SA
 * handle obtained here (hdl == NULL) is destroyed; a caller-supplied handle
 * is left for the caller.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t	*zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[12];	/* one slot per SA_ADD_BULK_ATTR below */
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	zp = ITOZ(ip);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
	zp->z_is_mapped = B_FALSE;
	zp->z_is_ctldir = B_FALSE;
	zp->z_suspended = B_FALSE;
	zp->z_sa_hdl = NULL;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	/*
	 * The project ID is looked up only when project quotas are enabled
	 * and the object carries ZFS_PROJID; otherwise the default is kept.
	 */
	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	zp->z_projid = projid;
	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	ZFS_TIME_DECODE(&ip->i_atime, atime);
	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
	ZFS_TIME_DECODE(&ip->i_ctime, ctime);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	ip->i_ino = zp->z_id;
	zfs_znode_update_vfs(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block.  This can never
	 * happen because the inode numbers map 1:1 with the object numbers.
	 *
	 * Exceptions include rolling back a mounted file system, either
	 * from the zfs rollback or zfs recv command.
	 *
	 * Active inodes are unhashed during the rollback, but since zrele
	 * can happen asynchronously, we can't guarantee they've been
	 * unhashed.  This can cause hash collisions in unlinked drain
	 * processing so do not hash unlinked znodes.
	 */
	if (links > 0)
		VERIFY3S(insert_inode_locked(ip), ==, 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	zfsvfs->z_nr_znodes++;
	mutex_exit(&zfsvfs->z_znodes_lock);

	/* Only inodes that were hashed above need to be unlocked. */
	if (links > 0)
		unlock_new_inode(ip);
	return (zp);

error:
	iput(ip);
	return (NULL);
}

/*
 * Safely mark an inode dirty.  Inodes which are part of a read-only
 * file system or snapshot may not be dirtied.
 */
void
zfs_mark_inode_dirty(struct inode *ip)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);

	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return;

	mark_inode_dirty(ip);
}

/*
 * Zero-filled (static storage) placeholder values that zfs_mknode() writes
 * for the SA_ZPL_XATTR, SA_ZPL_PAD and SA_ZPL_ZNODE_ACL attributes of
 * legacy DMU_OT_ZNODE objects.
 */
static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
666 * 667 * IN: dzp - parent directory for new znode 668 * vap - file attributes for new znode 669 * tx - dmu transaction id for zap operations 670 * cr - credentials of caller 671 * flag - flags: 672 * IS_ROOT_NODE - new object will be root 673 * IS_TMPFILE - new object is of O_TMPFILE 674 * IS_XATTR - new object is an attribute 675 * acl_ids - ACL related attributes 676 * 677 * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) 678 * 679 */ 680 void 681 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 682 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) 683 { 684 uint64_t crtime[2], atime[2], mtime[2], ctime[2]; 685 uint64_t mode, size, links, parent, pflags; 686 uint64_t projid = ZFS_DEFAULT_PROJID; 687 uint64_t rdev = 0; 688 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 689 dmu_buf_t *db; 690 inode_timespec_t now; 691 uint64_t gen, obj; 692 int bonuslen; 693 int dnodesize; 694 sa_handle_t *sa_hdl; 695 dmu_object_type_t obj_type; 696 sa_bulk_attr_t *sa_attrs; 697 int cnt = 0; 698 zfs_acl_locator_cb_t locate = { 0 }; 699 znode_hold_t *zh; 700 701 if (zfsvfs->z_replay) { 702 obj = vap->va_nodeid; 703 now = vap->va_ctime; /* see zfs_replay_create() */ 704 gen = vap->va_nblocks; /* ditto */ 705 dnodesize = vap->va_fsid; /* ditto */ 706 } else { 707 obj = 0; 708 gethrestime(&now); 709 gen = dmu_tx_get_txg(tx); 710 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); 711 } 712 713 if (dnodesize == 0) 714 dnodesize = DNODE_MIN_SIZE; 715 716 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; 717 718 bonuslen = (obj_type == DMU_OT_SA) ? 719 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; 720 721 /* 722 * Create a new DMU object. 723 */ 724 /* 725 * There's currently no mechanism for pre-reading the blocks that will 726 * be needed to allocate a new object, so we accept the small chance 727 * that there will be an i/o error and we will fail one of the 728 * assertions below. 
729 */ 730 if (S_ISDIR(vap->va_mode)) { 731 if (zfsvfs->z_replay) { 732 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, 733 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 734 obj_type, bonuslen, dnodesize, tx)); 735 } else { 736 obj = zap_create_norm_dnsize(zfsvfs->z_os, 737 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 738 obj_type, bonuslen, dnodesize, tx); 739 } 740 } else { 741 if (zfsvfs->z_replay) { 742 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, 743 DMU_OT_PLAIN_FILE_CONTENTS, 0, 744 obj_type, bonuslen, dnodesize, tx)); 745 } else { 746 obj = dmu_object_alloc_dnsize(zfsvfs->z_os, 747 DMU_OT_PLAIN_FILE_CONTENTS, 0, 748 obj_type, bonuslen, dnodesize, tx); 749 } 750 } 751 752 zh = zfs_znode_hold_enter(zfsvfs, obj); 753 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); 754 755 /* 756 * If this is the root, fix up the half-initialized parent pointer 757 * to reference the just-allocated physical data area. 758 */ 759 if (flag & IS_ROOT_NODE) { 760 dzp->z_id = obj; 761 } 762 763 /* 764 * If parent is an xattr, so am I. 765 */ 766 if (dzp->z_pflags & ZFS_XATTR) { 767 flag |= IS_XATTR; 768 } 769 770 if (zfsvfs->z_use_fuids) 771 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 772 else 773 pflags = 0; 774 775 if (S_ISDIR(vap->va_mode)) { 776 size = 2; /* contents ("." and "..") */ 777 links = 2; 778 } else { 779 size = 0; 780 links = (flag & IS_TMPFILE) ? 0 : 1; 781 } 782 783 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) 784 rdev = vap->va_rdev; 785 786 parent = dzp->z_id; 787 mode = acl_ids->z_mode; 788 if (flag & IS_XATTR) 789 pflags |= ZFS_XATTR; 790 791 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { 792 /* 793 * With ZFS_PROJID flag, we can easily know whether there is 794 * project ID stored on disk or not. See zfs_space_delta_cb(). 795 */ 796 if (obj_type != DMU_OT_ZNODE && 797 dmu_objset_projectquota_enabled(zfsvfs->z_os)) 798 pflags |= ZFS_PROJID; 799 800 /* 801 * Inherit project ID from parent if required. 
802 */ 803 projid = zfs_inherit_projid(dzp); 804 if (dzp->z_pflags & ZFS_PROJINHERIT) 805 pflags |= ZFS_PROJINHERIT; 806 } 807 808 /* 809 * No execs denied will be determined when zfs_mode_compute() is called. 810 */ 811 pflags |= acl_ids->z_aclp->z_hints & 812 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| 813 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); 814 815 ZFS_TIME_ENCODE(&now, crtime); 816 ZFS_TIME_ENCODE(&now, ctime); 817 818 if (vap->va_mask & ATTR_ATIME) { 819 ZFS_TIME_ENCODE(&vap->va_atime, atime); 820 } else { 821 ZFS_TIME_ENCODE(&now, atime); 822 } 823 824 if (vap->va_mask & ATTR_MTIME) { 825 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 826 } else { 827 ZFS_TIME_ENCODE(&now, mtime); 828 } 829 830 /* Now add in all of the "SA" attributes */ 831 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, 832 &sa_hdl)); 833 834 /* 835 * Setup the array of attributes to be replaced/set on the new file 836 * 837 * order for DMU_OT_ZNODE is critical since it needs to be constructed 838 * in the old znode_phys_t format. 
Don't change this ordering 839 */ 840 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); 841 842 if (obj_type == DMU_OT_ZNODE) { 843 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), 844 NULL, &atime, 16); 845 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), 846 NULL, &mtime, 16); 847 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), 848 NULL, &ctime, 16); 849 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), 850 NULL, &crtime, 16); 851 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), 852 NULL, &gen, 8); 853 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), 854 NULL, &mode, 8); 855 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), 856 NULL, &size, 8); 857 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), 858 NULL, &parent, 8); 859 } else { 860 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), 861 NULL, &mode, 8); 862 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), 863 NULL, &size, 8); 864 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), 865 NULL, &gen, 8); 866 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), 867 NULL, &acl_ids->z_fuid, 8); 868 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), 869 NULL, &acl_ids->z_fgid, 8); 870 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), 871 NULL, &parent, 8); 872 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), 873 NULL, &pflags, 8); 874 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), 875 NULL, &atime, 16); 876 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), 877 NULL, &mtime, 16); 878 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), 879 NULL, &ctime, 16); 880 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), 881 NULL, &crtime, 16); 882 } 883 884 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); 885 886 if (obj_type == DMU_OT_ZNODE) { 887 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, 888 &empty_xattr, 8); 889 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && 890 pflags & ZFS_PROJID) { 
891 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), 892 NULL, &projid, 8); 893 } 894 if (obj_type == DMU_OT_ZNODE || 895 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { 896 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), 897 NULL, &rdev, 8); 898 } 899 if (obj_type == DMU_OT_ZNODE) { 900 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), 901 NULL, &pflags, 8); 902 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, 903 &acl_ids->z_fuid, 8); 904 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, 905 &acl_ids->z_fgid, 8); 906 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, 907 sizeof (uint64_t) * 4); 908 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, 909 &acl_phys, sizeof (zfs_acl_phys_t)); 910 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { 911 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, 912 &acl_ids->z_aclp->z_acl_count, 8); 913 locate.cb_aclp = acl_ids->z_aclp; 914 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), 915 zfs_acl_data_locator, &locate, 916 acl_ids->z_aclp->z_acl_bytes); 917 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, 918 acl_ids->z_fuid, acl_ids->z_fgid); 919 } 920 921 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); 922 923 if (!(flag & IS_ROOT_NODE)) { 924 /* 925 * The call to zfs_znode_alloc() may fail if memory is low 926 * via the call path: alloc_inode() -> inode_init_always() -> 927 * security_inode_alloc() -> inode_alloc_security(). Since 928 * the existing code is written such that zfs_mknode() can 929 * not fail retry until sufficient memory has been reclaimed. 930 */ 931 do { 932 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); 933 } while (*zpp == NULL); 934 935 VERIFY(*zpp != NULL); 936 VERIFY(dzp != NULL); 937 } else { 938 /* 939 * If we are creating the root node, the "parent" we 940 * passed in is the znode for the root. 
941 */ 942 *zpp = dzp; 943 944 (*zpp)->z_sa_hdl = sa_hdl; 945 } 946 947 (*zpp)->z_pflags = pflags; 948 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; 949 (*zpp)->z_dnodesize = dnodesize; 950 (*zpp)->z_projid = projid; 951 952 if (obj_type == DMU_OT_ZNODE || 953 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { 954 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); 955 } 956 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); 957 zfs_znode_hold_exit(zfsvfs, zh); 958 } 959 960 /* 961 * Update in-core attributes. It is assumed the caller will be doing an 962 * sa_bulk_update to push the changes out. 963 */ 964 void 965 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) 966 { 967 xoptattr_t *xoap; 968 boolean_t update_inode = B_FALSE; 969 970 xoap = xva_getxoptattr(xvap); 971 ASSERT(xoap); 972 973 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 974 uint64_t times[2]; 975 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); 976 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), 977 ×, sizeof (times), tx); 978 XVA_SET_RTN(xvap, XAT_CREATETIME); 979 } 980 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 981 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, 982 zp->z_pflags, tx); 983 XVA_SET_RTN(xvap, XAT_READONLY); 984 } 985 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 986 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, 987 zp->z_pflags, tx); 988 XVA_SET_RTN(xvap, XAT_HIDDEN); 989 } 990 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 991 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, 992 zp->z_pflags, tx); 993 XVA_SET_RTN(xvap, XAT_SYSTEM); 994 } 995 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 996 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, 997 zp->z_pflags, tx); 998 XVA_SET_RTN(xvap, XAT_ARCHIVE); 999 } 1000 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 1001 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, 1002 zp->z_pflags, tx); 1003 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 1004 1005 update_inode = B_TRUE; 1006 } 1007 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 1008 ZFS_ATTR_SET(zp, 
ZFS_NOUNLINK, xoap->xoa_nounlink,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		zfs_sa_set_scanstamp(zp, xvap, tx);
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OFFLINE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
	}

	if (update_inode)
		zfs_set_inode_flags(zp, ZTOI(zp));
}

/*
 * Look up the znode for object obj_num, instantiating it if it is not
 * already in core.
 *
 *	IN:	zfsvfs	- file system the object belongs to
 *		obj_num	- object number to look up
 *
 *	OUT:	zpp	- set to the znode on success, NULL otherwise
 *
 *	RETURN:	0 on success.  Otherwise the error from sa_buf_hold(),
 *		EINVAL if the bonus buffer is neither an SA nor a
 *		sufficiently large znode_phys_t, or ENOENT if the znode is
 *		unlinked and being evicted or zfs_znode_alloc() fails.
 *
 * The per-object znode hold is taken for the whole lookup and dropped on
 * every return path.  When an in-core znode is found, an inode reference
 * is taken with igrab() before it is handed back.
 */
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t	*db;
	znode_t		*zp;
	znode_hold_t	*zh;
	int err;
	sa_handle_t	*hdl;

	*zpp = NULL;

again:
	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	/* Reject objects whose bonus buffer cannot describe a znode. */
	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	/* A user pointer on the dbuf means a znode is already in core. */
	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp = sa_get_userdata(hdl);


		/*
		 * Since "SA" does immediate eviction we
		 * should never find a sa handle that doesn't
		 * know about the znode.
		 */

		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		/*
		 * If zp->z_unlinked is set, the znode is already marked
		 * for deletion and should not be discovered. Check this
		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
		 *
		 * If igrab() returns NULL the VFS has independently
		 * determined the inode should be evicted and has
		 * called iput_final() to start the eviction process.
		 * The SA handle is still valid but because the VFS
		 * requires that the eviction succeed we must drop
		 * our locks and references to allow the eviction to
		 * complete.  The zfs_zget() may then be retried.
		 *
		 * This unlikely case could be optimized by registering
		 * a sops->drop_inode() callback.  The callback would
		 * need to detect the active SA hold thereby informing
		 * the VFS that this inode should not be evicted.
		 */
		if (igrab(ZTOI(zp)) == NULL) {
			if (zp->z_unlinked)
				err = SET_ERROR(ENOENT);
			else
				err = SET_ERROR(EAGAIN);
		} else {
			*zpp = zp;
			err = 0;
		}

		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);

		/* EAGAIN never escapes: retry once eviction can progress. */
		if (err == EAGAIN) {
			/* inode might need this to finish evict */
			cond_resched();
			goto again;
		}
		return (err);
	}

	/*
	 * Not found: create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress.  This is checked for in zfs_znode_alloc()
	 *
	 * If zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
	    doi.doi_bonus_type, NULL);
	if (zp == NULL) {
		err = SET_ERROR(ENOENT);
	} else {
		/* err is still 0 from the successful sa_buf_hold() above. */
		*zpp = zp;
	}
	zfs_znode_hold_exit(zfsvfs, zh);
	return (err);
}

/*
 * Re-attach an in-core znode to its on-disk object and reload the values
 * cached in the znode and its inode.
 *
 *	IN:	zp	- znode to refresh; its SA handle must already have
 *			  been released (z_sa_hdl == NULL is asserted)
 *
 *	RETURN:	0 on success, including the case where zp is a ctldir
 *		znode and is skipped.  Otherwise the error from
 *		sa_buf_hold() or the project id lookup, EINVAL if the bonus
 *		buffer is not a usable SA/znode_phys_t, or EIO if the bulk
 *		attribute lookup fails or the on-disk generation does not
 *		match the inode's.
 *
 * The cached ACL and xattr nvlist are dropped unconditionally.  On the
 * error paths taken after the SA handle has been set up, the handle is
 * torn down again with zfs_znode_dmu_fini().
 */
int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	uint64_t mode;
	uint64_t links;
	sa_bulk_attr_t bulk[11];
	int err;
	int count = 0;
	uint64_t gen;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	uint64_t projid = ZFS_DEFAULT_PROJID;
	znode_hold_t *zh;

	/*
	 * Skip ctldir znodes, otherwise they will always get invalidated.
	 * This will cause funny behaviour for the mounted snapdirs.
	 * Especially for Linux >= 3.18, d_invalidate will detach the
	 * mountpoint and prevent anyone from automounting it again as long
	 * as someone is still using the detached mount.
	 */
	if (zp->z_is_ctldir)
		return (0);

	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	/* Throw away cached ACL and xattr state; both may be stale. */
	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	rw_enter(&zp->z_xattr_lock, RW_WRITER);
	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}
	rw_exit(&zp->z_xattr_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	/* Same bonus-buffer sanity check as zfs_zget(). */
	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);

	/* reload cached values (11 attributes, matching bulk[11]) */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
	    &gen, sizeof (gen));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &z_uid, sizeof (z_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &z_gid, sizeof (z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
	    &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	/*
	 * The project id is optional: ENOENT leaves projid at
	 * ZFS_DEFAULT_PROJID, any other failure aborts the refresh.
	 */
	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
		    &projid, 8);
		if (err != 0 && err != ENOENT) {
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return (SET_ERROR(err));
		}
	}

	zp->z_projid = projid;
	zp->z_mode = ZTOI(zp)->i_mode = mode;
	zfs_uid_write(ZTOI(zp), z_uid);
	zfs_gid_write(ZTOI(zp), z_gid);

	ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
	ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
	ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	/* A different generation means the object is no longer this file. */
	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	set_nlink(ZTOI(zp), (uint32_t)links);
	zfs_set_inode_flags(zp, ZTOI(zp));

	zp->z_blksz = doi.doi_data_block_size;
	zp->z_atime_dirty = B_FALSE;
	zfs_znode_update_vfs(zp);

	/*
	 * If the file has zero links, then it has been unlinked on the send
	 * side and it must be in the received unlinked set.
	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
	 * stale data and to prevent automatic removal of the file in
	 * zfs_zinactive().  The file will be removed either when it is removed
	 * on the send side and the next incremental stream is received or
	 * when the unlinked set gets processed.
	 */
	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
	if (zp->z_unlinked)
		zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);

	return (0);
}

/*
 * Free the on-disk object backing zp, and its external ACL object if it
 * has one, as part of transaction tx, then tear down the znode's DMU
 * state.  A failure to free either object is fatal (VERIFY).
 *
 *	IN:	zp	- znode whose object is to be freed
 *		tx	- transaction the frees are charged to
 */
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	if (acl_obj) {
		/* An external ACL object is only expected on non-SA znodes. */
		VERIFY(!zp->z_is_sa);
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	}
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	zfs_znode_hold_exit(zfsvfs, zh);
}

/*
 * Release a znode that is going inactive: either remove the file via
 * zfs_rmnode() when it is unlinked and removal is permitted, or just drop
 * the znode's DMU state.
 *
 *	IN:	zp	- znode being released; must still have an SA handle
 */
void
zfs_zinactive(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t z_id = zp->z_id;
	znode_hold_t *zh;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
	 */
	zh = zfs_znode_hold_enter(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);

	/*
	 * If this was the last reference to a file with no links, remove
	 * the file from the file system unless the file system is mounted
	 * read-only.  That can happen, for example, if the file system was
	 * originally read-write, the file was opened, then unlinked and
	 * the file system was made read-only before the file was finally
	 * closed.  The file will remain in the unlinked set.
	 */
	if (zp->z_unlinked) {
		ASSERT(!zfsvfs->z_issnap);
		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
			/* Both the lock and the hold go before zfs_rmnode(). */
			mutex_exit(&zp->z_lock);
			zfs_znode_hold_exit(zfsvfs, zh);
			zfs_rmnode(zp);
			return;
		}
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);
}

/* Pick the timespec comparison helper matching the inode timestamp type. */
#if defined(HAVE_INODE_TIMESPEC64_TIMES)
#define	zfs_compare_timespec timespec64_compare
#else
#define	zfs_compare_timespec timespec_compare
#endif

/*
 * Determine whether the znode's atime must be updated.  The logic mostly
 * duplicates the Linux kernel's relatime_need_update() functionality.
 * This function is only called if the underlying filesystem actually has
 * atime updates enabled.
 *
 *	IN:	ip	- inode whose timestamps are examined
 *
 *	RETURN:	B_TRUE if atime should be updated, B_FALSE otherwise
 */
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
	inode_timespec_t now;

	gethrestime(&now);
	/*
	 * In relatime mode, only update the atime if the previous atime
	 * is not later than the mtime or the ctime, or if at least a day
	 * has passed since the last update of atime.
	 */
	if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
		return (B_TRUE);

	if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
		return (B_TRUE);

	if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Prepare to update znode time stamps.
 *
 *	IN:	zp	- znode requiring timestamp update
 *		flag	- ATTR_MTIME, ATTR_CTIME flags
 *
 *	OUT:	zp	- z_seq
 *		mtime	- new mtime
 *		ctime	- new ctime
 *
 *	Note: We don't update atime here, because we rely on Linux VFS to do
 *	atime updating.
1424 */ 1425 void 1426 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], 1427 uint64_t ctime[2]) 1428 { 1429 inode_timespec_t now; 1430 1431 gethrestime(&now); 1432 1433 zp->z_seq++; 1434 1435 if (flag & ATTR_MTIME) { 1436 ZFS_TIME_ENCODE(&now, mtime); 1437 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); 1438 if (ZTOZSB(zp)->z_use_fuids) { 1439 zp->z_pflags |= (ZFS_ARCHIVE | 1440 ZFS_AV_MODIFIED); 1441 } 1442 } 1443 1444 if (flag & ATTR_CTIME) { 1445 ZFS_TIME_ENCODE(&now, ctime); 1446 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); 1447 if (ZTOZSB(zp)->z_use_fuids) 1448 zp->z_pflags |= ZFS_ARCHIVE; 1449 } 1450 } 1451 1452 /* 1453 * Grow the block size for a file. 1454 * 1455 * IN: zp - znode of file to free data in. 1456 * size - requested block size 1457 * tx - open transaction. 1458 * 1459 * NOTE: this function assumes that the znode is write locked. 1460 */ 1461 void 1462 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1463 { 1464 int error; 1465 u_longlong_t dummy; 1466 1467 if (size <= zp->z_blksz) 1468 return; 1469 /* 1470 * If the file size is already greater than the current blocksize, 1471 * we will not grow. If there is more than one block in a file, 1472 * the blocksize cannot change. 1473 */ 1474 if (zp->z_blksz && zp->z_size > zp->z_blksz) 1475 return; 1476 1477 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, 1478 size, 0, tx); 1479 1480 if (error == ENOTSUP) 1481 return; 1482 ASSERT0(error); 1483 1484 /* What blocksize did we actually get? */ 1485 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); 1486 } 1487 1488 /* 1489 * Increase the file length 1490 * 1491 * IN: zp - znode of file to free data in. 
1492 * end - new end-of-file 1493 * 1494 * RETURN: 0 on success, error code on failure 1495 */ 1496 static int 1497 zfs_extend(znode_t *zp, uint64_t end) 1498 { 1499 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1500 dmu_tx_t *tx; 1501 zfs_locked_range_t *lr; 1502 uint64_t newblksz; 1503 int error; 1504 1505 /* 1506 * We will change zp_size, lock the whole file. 1507 */ 1508 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); 1509 1510 /* 1511 * Nothing to do if file already at desired length. 1512 */ 1513 if (end <= zp->z_size) { 1514 zfs_rangelock_exit(lr); 1515 return (0); 1516 } 1517 tx = dmu_tx_create(zfsvfs->z_os); 1518 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1519 zfs_sa_upgrade_txholds(tx, zp); 1520 if (end > zp->z_blksz && 1521 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1522 /* 1523 * We are growing the file past the current block size. 1524 */ 1525 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { 1526 /* 1527 * File's blocksize is already larger than the 1528 * "recordsize" property. Only let it grow to 1529 * the next power of 2. 1530 */ 1531 ASSERT(!ISP2(zp->z_blksz)); 1532 newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); 1533 } else { 1534 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); 1535 } 1536 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1537 } else { 1538 newblksz = 0; 1539 } 1540 1541 error = dmu_tx_assign(tx, TXG_WAIT); 1542 if (error) { 1543 dmu_tx_abort(tx); 1544 zfs_rangelock_exit(lr); 1545 return (error); 1546 } 1547 1548 if (newblksz) 1549 zfs_grow_blocksize(zp, newblksz, tx); 1550 1551 zp->z_size = end; 1552 1553 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), 1554 &zp->z_size, sizeof (zp->z_size), tx)); 1555 1556 zfs_rangelock_exit(lr); 1557 1558 dmu_tx_commit(tx); 1559 1560 return (0); 1561 } 1562 1563 /* 1564 * zfs_zero_partial_page - Modeled after update_pages() but 1565 * with different arguments and semantics for use by zfs_freesp(). 
1566 * 1567 * Zeroes a piece of a single page cache entry for zp at offset 1568 * start and length len. 1569 * 1570 * Caller must acquire a range lock on the file for the region 1571 * being zeroed in order that the ARC and page cache stay in sync. 1572 */ 1573 static void 1574 zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) 1575 { 1576 struct address_space *mp = ZTOI(zp)->i_mapping; 1577 struct page *pp; 1578 int64_t off; 1579 void *pb; 1580 1581 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); 1582 1583 off = start & (PAGE_SIZE - 1); 1584 start &= PAGE_MASK; 1585 1586 pp = find_lock_page(mp, start >> PAGE_SHIFT); 1587 if (pp) { 1588 if (mapping_writably_mapped(mp)) 1589 flush_dcache_page(pp); 1590 1591 pb = kmap(pp); 1592 memset(pb + off, 0, len); 1593 kunmap(pp); 1594 1595 if (mapping_writably_mapped(mp)) 1596 flush_dcache_page(pp); 1597 1598 mark_page_accessed(pp); 1599 SetPageUptodate(pp); 1600 ClearPageError(pp); 1601 unlock_page(pp); 1602 put_page(pp); 1603 } 1604 } 1605 1606 /* 1607 * Free space in a file. 1608 * 1609 * IN: zp - znode of file to free data in. 1610 * off - start of section to free. 1611 * len - length of section to free. 1612 * 1613 * RETURN: 0 on success, error code on failure 1614 */ 1615 static int 1616 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1617 { 1618 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1619 zfs_locked_range_t *lr; 1620 int error; 1621 1622 /* 1623 * Lock the range being freed. 1624 */ 1625 lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); 1626 1627 /* 1628 * Nothing to do if file already at desired length. 1629 */ 1630 if (off >= zp->z_size) { 1631 zfs_rangelock_exit(lr); 1632 return (0); 1633 } 1634 1635 if (off + len > zp->z_size) 1636 len = zp->z_size - off; 1637 1638 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1639 1640 /* 1641 * Zero partial page cache entries. This must be done under a 1642 * range lock in order to keep the ARC and page cache in sync. 
1643 */ 1644 if (zp->z_is_mapped) { 1645 loff_t first_page, last_page, page_len; 1646 loff_t first_page_offset, last_page_offset; 1647 1648 /* first possible full page in hole */ 1649 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; 1650 /* last page of hole */ 1651 last_page = (off + len) >> PAGE_SHIFT; 1652 1653 /* offset of first_page */ 1654 first_page_offset = first_page << PAGE_SHIFT; 1655 /* offset of last_page */ 1656 last_page_offset = last_page << PAGE_SHIFT; 1657 1658 /* truncate whole pages */ 1659 if (last_page_offset > first_page_offset) { 1660 truncate_inode_pages_range(ZTOI(zp)->i_mapping, 1661 first_page_offset, last_page_offset - 1); 1662 } 1663 1664 /* truncate sub-page ranges */ 1665 if (first_page > last_page) { 1666 /* entire punched area within a single page */ 1667 zfs_zero_partial_page(zp, off, len); 1668 } else { 1669 /* beginning of punched area at the end of a page */ 1670 page_len = first_page_offset - off; 1671 if (page_len > 0) 1672 zfs_zero_partial_page(zp, off, page_len); 1673 1674 /* end of punched area at the beginning of a page */ 1675 page_len = off + len - last_page_offset; 1676 if (page_len > 0) 1677 zfs_zero_partial_page(zp, last_page_offset, 1678 page_len); 1679 } 1680 } 1681 zfs_rangelock_exit(lr); 1682 1683 return (error); 1684 } 1685 1686 /* 1687 * Truncate a file 1688 * 1689 * IN: zp - znode of file to free data in. 1690 * end - new end-of-file. 1691 * 1692 * RETURN: 0 on success, error code on failure 1693 */ 1694 static int 1695 zfs_trunc(znode_t *zp, uint64_t end) 1696 { 1697 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1698 dmu_tx_t *tx; 1699 zfs_locked_range_t *lr; 1700 int error; 1701 sa_bulk_attr_t bulk[2]; 1702 int count = 0; 1703 1704 /* 1705 * We will change zp_size, lock the whole file. 1706 */ 1707 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); 1708 1709 /* 1710 * Nothing to do if file already at desired length. 
1711 */ 1712 if (end >= zp->z_size) { 1713 zfs_rangelock_exit(lr); 1714 return (0); 1715 } 1716 1717 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, 1718 DMU_OBJECT_END); 1719 if (error) { 1720 zfs_rangelock_exit(lr); 1721 return (error); 1722 } 1723 tx = dmu_tx_create(zfsvfs->z_os); 1724 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1725 zfs_sa_upgrade_txholds(tx, zp); 1726 dmu_tx_mark_netfree(tx); 1727 error = dmu_tx_assign(tx, TXG_WAIT); 1728 if (error) { 1729 dmu_tx_abort(tx); 1730 zfs_rangelock_exit(lr); 1731 return (error); 1732 } 1733 1734 zp->z_size = end; 1735 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), 1736 NULL, &zp->z_size, sizeof (zp->z_size)); 1737 1738 if (end == 0) { 1739 zp->z_pflags &= ~ZFS_SPARSE; 1740 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1741 NULL, &zp->z_pflags, 8); 1742 } 1743 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); 1744 1745 dmu_tx_commit(tx); 1746 zfs_rangelock_exit(lr); 1747 1748 return (0); 1749 } 1750 1751 /* 1752 * Free space in a file 1753 * 1754 * IN: zp - znode of file to free data in. 1755 * off - start of range 1756 * len - end of range (0 => EOF) 1757 * flag - current file open mode flags. 
1758 * log - TRUE if this action should be logged 1759 * 1760 * RETURN: 0 on success, error code on failure 1761 */ 1762 int 1763 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1764 { 1765 dmu_tx_t *tx; 1766 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1767 zilog_t *zilog = zfsvfs->z_log; 1768 uint64_t mode; 1769 uint64_t mtime[2], ctime[2]; 1770 sa_bulk_attr_t bulk[3]; 1771 int count = 0; 1772 int error; 1773 1774 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, 1775 sizeof (mode))) != 0) 1776 return (error); 1777 1778 if (off > zp->z_size) { 1779 error = zfs_extend(zp, off+len); 1780 if (error == 0 && log) 1781 goto log; 1782 goto out; 1783 } 1784 1785 if (len == 0) { 1786 error = zfs_trunc(zp, off); 1787 } else { 1788 if ((error = zfs_free_range(zp, off, len)) == 0 && 1789 off + len > zp->z_size) 1790 error = zfs_extend(zp, off+len); 1791 } 1792 if (error || !log) 1793 goto out; 1794 log: 1795 tx = dmu_tx_create(zfsvfs->z_os); 1796 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1797 zfs_sa_upgrade_txholds(tx, zp); 1798 error = dmu_tx_assign(tx, TXG_WAIT); 1799 if (error) { 1800 dmu_tx_abort(tx); 1801 goto out; 1802 } 1803 1804 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); 1805 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); 1806 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1807 NULL, &zp->z_pflags, 8); 1808 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 1809 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1810 ASSERT(error == 0); 1811 1812 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1813 1814 dmu_tx_commit(tx); 1815 1816 zfs_znode_update_vfs(zp); 1817 error = 0; 1818 1819 out: 1820 /* 1821 * Truncate the page cache - for file truncate operations, use 1822 * the purpose-built API for truncations. For punching operations, 1823 * the truncation is handled under a range lock in zfs_free_range. 
1824 */ 1825 if (len == 0) 1826 truncate_setsize(ZTOI(zp), off); 1827 return (error); 1828 } 1829 1830 void 1831 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1832 { 1833 struct super_block *sb; 1834 zfsvfs_t *zfsvfs; 1835 uint64_t moid, obj, sa_obj, version; 1836 uint64_t sense = ZFS_CASE_SENSITIVE; 1837 uint64_t norm = 0; 1838 nvpair_t *elem; 1839 int size; 1840 int error; 1841 int i; 1842 znode_t *rootzp = NULL; 1843 vattr_t vattr; 1844 znode_t *zp; 1845 zfs_acl_ids_t acl_ids; 1846 1847 /* 1848 * First attempt to create master node. 1849 */ 1850 /* 1851 * In an empty objset, there are no blocks to read and thus 1852 * there can be no i/o errors (which we assert below). 1853 */ 1854 moid = MASTER_NODE_OBJ; 1855 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1856 DMU_OT_NONE, 0, tx); 1857 ASSERT(error == 0); 1858 1859 /* 1860 * Set starting attributes. 1861 */ 1862 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); 1863 elem = NULL; 1864 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1865 /* For the moment we expect all zpl props to be uint64_ts */ 1866 uint64_t val; 1867 char *name; 1868 1869 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1870 VERIFY(nvpair_value_uint64(elem, &val) == 0); 1871 name = nvpair_name(elem); 1872 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1873 if (val < version) 1874 version = val; 1875 } else { 1876 error = zap_update(os, moid, name, 8, 1, &val, tx); 1877 } 1878 ASSERT(error == 0); 1879 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1880 norm = val; 1881 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1882 sense = val; 1883 } 1884 ASSERT(version != 0); 1885 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); 1886 ASSERT(error == 0); 1887 1888 /* 1889 * Create zap object used for SA attribute registration 1890 */ 1891 1892 if (version >= ZPL_VERSION_SA) { 1893 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 1894 
DMU_OT_NONE, 0, tx); 1895 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 1896 ASSERT(error == 0); 1897 } else { 1898 sa_obj = 0; 1899 } 1900 /* 1901 * Create a delete queue. 1902 */ 1903 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1904 1905 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); 1906 ASSERT(error == 0); 1907 1908 /* 1909 * Create root znode. Create minimal znode/inode/zfsvfs/sb 1910 * to allow zfs_mknode to work. 1911 */ 1912 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; 1913 vattr.va_mode = S_IFDIR|0755; 1914 vattr.va_uid = crgetuid(cr); 1915 vattr.va_gid = crgetgid(cr); 1916 1917 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1918 rootzp->z_unlinked = B_FALSE; 1919 rootzp->z_atime_dirty = B_FALSE; 1920 rootzp->z_is_sa = USE_SA(version, os); 1921 rootzp->z_pflags = 0; 1922 1923 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 1924 zfsvfs->z_os = os; 1925 zfsvfs->z_parent = zfsvfs; 1926 zfsvfs->z_version = version; 1927 zfsvfs->z_use_fuids = USE_FUIDS(version, os); 1928 zfsvfs->z_use_sa = USE_SA(version, os); 1929 zfsvfs->z_norm = norm; 1930 1931 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); 1932 sb->s_fs_info = zfsvfs; 1933 1934 ZTOI(rootzp)->i_sb = sb; 1935 1936 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 1937 &zfsvfs->z_attr_table); 1938 1939 ASSERT(error == 0); 1940 1941 /* 1942 * Fold case on file systems that are always or sometimes case 1943 * insensitive. 
1944 */ 1945 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1946 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 1947 1948 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1949 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1950 offsetof(znode_t, z_link_node)); 1951 1952 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); 1953 zfsvfs->z_hold_size = size; 1954 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, 1955 KM_SLEEP); 1956 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); 1957 for (i = 0; i != size; i++) { 1958 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, 1959 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); 1960 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); 1961 } 1962 1963 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, 1964 cr, NULL, &acl_ids, kcred->user_ns)); 1965 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); 1966 ASSERT3P(zp, ==, rootzp); 1967 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1968 ASSERT(error == 0); 1969 zfs_acl_ids_free(&acl_ids); 1970 1971 atomic_set(&ZTOI(rootzp)->i_count, 0); 1972 sa_handle_destroy(rootzp->z_sa_hdl); 1973 kmem_cache_free(znode_cache, rootzp); 1974 1975 for (i = 0; i != size; i++) { 1976 avl_destroy(&zfsvfs->z_hold_trees[i]); 1977 mutex_destroy(&zfsvfs->z_hold_locks[i]); 1978 } 1979 1980 mutex_destroy(&zfsvfs->z_znodes_lock); 1981 1982 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); 1983 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); 1984 kmem_free(sb, sizeof (struct super_block)); 1985 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1986 } 1987 #endif /* _KERNEL */ 1988 1989 static int 1990 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) 1991 { 1992 uint64_t sa_obj = 0; 1993 int error; 1994 1995 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); 1996 if (error != 0 && error != ENOENT) 1997 return (error); 
1998 1999 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); 2000 return (error); 2001 } 2002 2003 static int 2004 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, 2005 dmu_buf_t **db, const void *tag) 2006 { 2007 dmu_object_info_t doi; 2008 int error; 2009 2010 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) 2011 return (error); 2012 2013 dmu_object_info_from_db(*db, &doi); 2014 if ((doi.doi_bonus_type != DMU_OT_SA && 2015 doi.doi_bonus_type != DMU_OT_ZNODE) || 2016 (doi.doi_bonus_type == DMU_OT_ZNODE && 2017 doi.doi_bonus_size < sizeof (znode_phys_t))) { 2018 sa_buf_rele(*db, tag); 2019 return (SET_ERROR(ENOTSUP)); 2020 } 2021 2022 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); 2023 if (error != 0) { 2024 sa_buf_rele(*db, tag); 2025 return (error); 2026 } 2027 2028 return (0); 2029 } 2030 2031 static void 2032 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag) 2033 { 2034 sa_handle_destroy(hdl); 2035 sa_buf_rele(db, tag); 2036 } 2037 2038 /* 2039 * Given an object number, return its parent object number and whether 2040 * or not the object is an extended attribute directory. 2041 */ 2042 static int 2043 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, 2044 uint64_t *pobjp, int *is_xattrdir) 2045 { 2046 uint64_t parent; 2047 uint64_t pflags; 2048 uint64_t mode; 2049 uint64_t parent_mode; 2050 sa_bulk_attr_t bulk[3]; 2051 sa_handle_t *sa_hdl; 2052 dmu_buf_t *sa_db; 2053 int count = 0; 2054 int error; 2055 2056 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, 2057 &parent, sizeof (parent)); 2058 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, 2059 &pflags, sizeof (pflags)); 2060 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, 2061 &mode, sizeof (mode)); 2062 2063 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) 2064 return (error); 2065 2066 /* 2067 * When a link is removed its parent pointer is not changed and will 2068 * be invalid. 
There are two cases where a link is removed but the 2069 * file stays around, when it goes to the delete queue and when there 2070 * are additional links. 2071 */ 2072 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); 2073 if (error != 0) 2074 return (error); 2075 2076 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); 2077 zfs_release_sa_handle(sa_hdl, sa_db, FTAG); 2078 if (error != 0) 2079 return (error); 2080 2081 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); 2082 2083 /* 2084 * Extended attributes can be applied to files, directories, etc. 2085 * Otherwise the parent must be a directory. 2086 */ 2087 if (!*is_xattrdir && !S_ISDIR(parent_mode)) 2088 return (SET_ERROR(EINVAL)); 2089 2090 *pobjp = parent; 2091 2092 return (0); 2093 } 2094 2095 /* 2096 * Given an object number, return some zpl level statistics 2097 */ 2098 static int 2099 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, 2100 zfs_stat_t *sb) 2101 { 2102 sa_bulk_attr_t bulk[4]; 2103 int count = 0; 2104 2105 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, 2106 &sb->zs_mode, sizeof (sb->zs_mode)); 2107 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, 2108 &sb->zs_gen, sizeof (sb->zs_gen)); 2109 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, 2110 &sb->zs_links, sizeof (sb->zs_links)); 2111 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, 2112 &sb->zs_ctime, sizeof (sb->zs_ctime)); 2113 2114 return (sa_bulk_lookup(hdl, bulk, count)); 2115 } 2116 2117 static int 2118 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, 2119 sa_attr_type_t *sa_table, char *buf, int len) 2120 { 2121 sa_handle_t *sa_hdl; 2122 sa_handle_t *prevhdl = NULL; 2123 dmu_buf_t *prevdb = NULL; 2124 dmu_buf_t *sa_db = NULL; 2125 char *path = buf + len - 1; 2126 int error; 2127 2128 *path = '\0'; 2129 sa_hdl = hdl; 2130 2131 uint64_t deleteq_obj; 2132 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, 2133 ZFS_UNLINKED_SET, sizeof 
(uint64_t), 1, &deleteq_obj)); 2134 error = zap_lookup_int(osp, deleteq_obj, obj); 2135 if (error == 0) { 2136 return (ESTALE); 2137 } else if (error != ENOENT) { 2138 return (error); 2139 } 2140 2141 for (;;) { 2142 uint64_t pobj = 0; 2143 char component[MAXNAMELEN + 2]; 2144 size_t complen; 2145 int is_xattrdir = 0; 2146 2147 if (prevdb) { 2148 ASSERT(prevhdl != NULL); 2149 zfs_release_sa_handle(prevhdl, prevdb, FTAG); 2150 } 2151 2152 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, 2153 &is_xattrdir)) != 0) 2154 break; 2155 2156 if (pobj == obj) { 2157 if (path[0] != '/') 2158 *--path = '/'; 2159 break; 2160 } 2161 2162 component[0] = '/'; 2163 if (is_xattrdir) { 2164 strcpy(component + 1, "<xattrdir>"); 2165 } else { 2166 error = zap_value_search(osp, pobj, obj, 2167 ZFS_DIRENT_OBJ(-1ULL), component + 1); 2168 if (error != 0) 2169 break; 2170 } 2171 2172 complen = strlen(component); 2173 path -= complen; 2174 ASSERT(path >= buf); 2175 memcpy(path, component, complen); 2176 obj = pobj; 2177 2178 if (sa_hdl != hdl) { 2179 prevhdl = sa_hdl; 2180 prevdb = sa_db; 2181 } 2182 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); 2183 if (error != 0) { 2184 sa_hdl = prevhdl; 2185 sa_db = prevdb; 2186 break; 2187 } 2188 } 2189 2190 if (sa_hdl != NULL && sa_hdl != hdl) { 2191 ASSERT(sa_db != NULL); 2192 zfs_release_sa_handle(sa_hdl, sa_db, FTAG); 2193 } 2194 2195 if (error == 0) 2196 (void) memmove(buf, path, buf + len - path); 2197 2198 return (error); 2199 } 2200 2201 int 2202 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 2203 { 2204 sa_attr_type_t *sa_table; 2205 sa_handle_t *hdl; 2206 dmu_buf_t *db; 2207 int error; 2208 2209 error = zfs_sa_setup(osp, &sa_table); 2210 if (error != 0) 2211 return (error); 2212 2213 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); 2214 if (error != 0) 2215 return (error); 2216 2217 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); 2218 2219 zfs_release_sa_handle(hdl, db, FTAG); 
2220 return (error); 2221 } 2222 2223 int 2224 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, 2225 char *buf, int len) 2226 { 2227 char *path = buf + len - 1; 2228 sa_attr_type_t *sa_table; 2229 sa_handle_t *hdl; 2230 dmu_buf_t *db; 2231 int error; 2232 2233 *path = '\0'; 2234 2235 error = zfs_sa_setup(osp, &sa_table); 2236 if (error != 0) 2237 return (error); 2238 2239 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); 2240 if (error != 0) 2241 return (error); 2242 2243 error = zfs_obj_to_stats_impl(hdl, sa_table, sb); 2244 if (error != 0) { 2245 zfs_release_sa_handle(hdl, db, FTAG); 2246 return (error); 2247 } 2248 2249 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); 2250 2251 zfs_release_sa_handle(hdl, db, FTAG); 2252 return (error); 2253 } 2254 2255 #if defined(_KERNEL) 2256 EXPORT_SYMBOL(zfs_create_fs); 2257 EXPORT_SYMBOL(zfs_obj_to_path); 2258 2259 /* CSTYLED */ 2260 module_param(zfs_object_mutex_size, uint, 0644); 2261 MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); 2262 module_param(zfs_unlink_suspend_progress, int, 0644); 2263 MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " 2264 "(debug - leaks space into the unlinked set)"); 2265 #endif 2266