1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29 /* Portions Copyright 2007 Jeremy Teo */ 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/time.h> 36 #include <sys/sysmacros.h> 37 #include <sys/vfs.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/kmem.h> 41 #include <sys/taskq.h> 42 #include <sys/uio.h> 43 #include <sys/vmsystm.h> 44 #include <sys/atomic.h> 45 #include <sys/pathname.h> 46 #include <sys/cmn_err.h> 47 #include <sys/errno.h> 48 #include <sys/zfs_dir.h> 49 #include <sys/zfs_acl.h> 50 #include <sys/zfs_ioctl.h> 51 #include <sys/fs/zfs.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_objset.h> 54 #include <sys/spa.h> 55 #include <sys/txg.h> 56 #include <sys/dbuf.h> 57 #include <sys/zap.h> 58 #include <sys/sa.h> 59 #include <sys/policy.h> 60 #include <sys/sunddi.h> 61 #include <sys/sid.h> 62 #include <sys/zfs_ctldir.h> 63 #include <sys/zfs_fuid.h> 64 #include <sys/zfs_quota.h> 65 #include <sys/zfs_sa.h> 66 #include <sys/zfs_vnops.h> 67 #include <sys/zfs_rlock.h> 68 #include <sys/cred.h> 69 #include <sys/zpl.h> 70 #include <sys/zil.h> 71 #include <sys/sa_impl.h> 72 73 /* 74 * Programming rules. 75 * 76 * Each vnode op performs some logical unit of work. To do this, the ZPL must 77 * properly lock its in-core state, create a DMU transaction, do the work, 78 * record this work in the intent log (ZIL), commit the DMU transaction, 79 * and wait for the intent log to commit if it is a synchronous operation. 80 * Moreover, the vnode ops must work in both normal and log replay context. 81 * The ordering of events is important to avoid deadlocks and references 82 * to freed memory. The example below illustrates the following Big Rules: 83 * 84 * (1) A check must be made in each zfs thread for a mounted file system. 85 * This is done avoiding races using zfs_enter(zfsvfs). 86 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes 87 * must be checked with zfs_verify_zp(zp). 
Both of these macros 88 * can return EIO from the calling function. 89 * 90 * (2) zrele() should always be the last thing except for zil_commit() (if 91 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the 92 * last reference, the vnode/znode can be freed, so the zp may point to 93 * freed memory. Second, the last reference will call zfs_zinactive(), 94 * which may induce a lot of work -- pushing cached pages (which acquires 95 * range locks) and syncing out cached atime changes. Third, 96 * zfs_zinactive() may require a new tx, which could deadlock the system 97 * if you were already holding one. This deadlock occurs because the tx 98 * currently being operated on prevents a txg from syncing, which 99 * prevents the new tx from progressing, resulting in a deadlock. If you 100 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() 101 * is a synonym for zrele(). 102 * 103 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 104 * as they can span dmu_tx_assign() calls. 105 * 106 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 107 * dmu_tx_assign(). This is critical because we don't want to block 108 * while holding locks. 109 * 110 * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This 111 * reduces lock contention and CPU usage when we must wait (note that if 112 * throughput is constrained by the storage, nearly every transaction 113 * must wait). 114 * 115 * Note, in particular, that if a lock is sometimes acquired before 116 * the tx assigns, and sometimes after (e.g. z_lock), then failing 117 * to use a non-blocking assign can deadlock the system. The scenario: 118 * 119 * Thread A has grabbed a lock before calling dmu_tx_assign(). 120 * Thread B is in an already-assigned tx, and blocks for this lock. 121 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 122 * forever, because the previous txg can't quiesce until B's tx commits. 
123 * 124 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 125 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 126 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 127 * to indicate that this operation has already called dmu_tx_wait(). 128 * This will ensure that we don't retry forever, waiting a short bit 129 * each time. 130 * 131 * (5) If the operation succeeded, generate the intent log entry for it 132 * before dropping locks. This ensures that the ordering of events 133 * in the intent log matches the order in which they actually occurred. 134 * During ZIL replay the zfs_log_* functions will update the sequence 135 * number to indicate the zil transaction has replayed. 136 * 137 * (6) At the end of each vnode op, the DMU tx must always commit, 138 * regardless of whether there were any errors. 139 * 140 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 141 * to ensure that synchronous semantics are provided when necessary. 142 * 143 * In general, this is how things should be ordered in each vnode op: 144 * 145 * zfs_enter(zfsvfs); // exit if unmounted 146 * top: 147 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) 148 * rw_enter(...); // grab any other locks you need 149 * tx = dmu_tx_create(...); // get DMU tx 150 * dmu_tx_hold_*(); // hold each object you might modify 151 * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 152 * if (error) { 153 * rw_exit(...); // drop locks 154 * zfs_dirent_unlock(dl); // unlock directory entry 155 * zrele(...); // release held znodes 156 * if (error == ERESTART) { 157 * waited = B_TRUE; 158 * dmu_tx_wait(tx); 159 * dmu_tx_abort(tx); 160 * goto top; 161 * } 162 * dmu_tx_abort(tx); // abort DMU tx 163 * zfs_exit(zfsvfs); // finished in zfs 164 * return (error); // really out of space 165 * } 166 * error = do_real_work(); // do whatever this VOP does 167 * if (error == 0) 168 * zfs_log_*(...); // on success, make ZIL entry 169 * dmu_tx_commit(tx); // commit DMU tx -- error or not 170 * rw_exit(...); // drop locks 171 * zfs_dirent_unlock(dl); // unlock directory entry 172 * zrele(...); // release held znodes 173 * zil_commit(zilog, foid); // synchronous when necessary 174 * zfs_exit(zfsvfs); // finished in zfs 175 * return (error); // done, report error 176 */ 177 int 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) 179 { 180 (void) cr; 181 znode_t *zp = ITOZ(ip); 182 zfsvfs_t *zfsvfs = ITOZSB(ip); 183 int error; 184 185 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 186 return (error); 187 188 /* Honor ZFS_APPENDONLY file attribute */ 189 if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && 190 ((flag & O_APPEND) == 0)) { 191 zfs_exit(zfsvfs, FTAG); 192 return (SET_ERROR(EPERM)); 193 } 194 195 /* Keep a count of the synchronous opens in the znode */ 196 if (flag & O_SYNC) 197 atomic_inc_32(&zp->z_sync_cnt); 198 199 zfs_exit(zfsvfs, FTAG); 200 return (0); 201 } 202 203 int 204 zfs_close(struct inode *ip, int flag, cred_t *cr) 205 { 206 (void) cr; 207 znode_t *zp = ITOZ(ip); 208 zfsvfs_t *zfsvfs = ITOZSB(ip); 209 int error; 210 211 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 212 return (error); 213 214 /* Decrement the synchronous opens in the znode */ 215 if (flag & O_SYNC) 216 atomic_dec_32(&zp->z_sync_cnt); 217 218 zfs_exit(zfsvfs, FTAG); 219 return (0); 220 } 221 222 
#if defined(_KERNEL) 223 /* 224 * When a file is memory mapped, we must keep the IO data synchronized 225 * between the DMU cache and the memory mapped pages. What this means: 226 * 227 * On Write: If we find a memory mapped page, we write to *both* 228 * the page and the dmu buffer. 229 */ 230 void 231 update_pages(znode_t *zp, int64_t start, int len, objset_t *os) 232 { 233 struct inode *ip = ZTOI(zp); 234 struct address_space *mp = ip->i_mapping; 235 struct page *pp; 236 uint64_t nbytes; 237 int64_t off; 238 void *pb; 239 240 off = start & (PAGE_SIZE-1); 241 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 242 nbytes = MIN(PAGE_SIZE - off, len); 243 244 pp = find_lock_page(mp, start >> PAGE_SHIFT); 245 if (pp) { 246 if (mapping_writably_mapped(mp)) 247 flush_dcache_page(pp); 248 249 pb = kmap(pp); 250 (void) dmu_read(os, zp->z_id, start + off, nbytes, 251 pb + off, DMU_READ_PREFETCH); 252 kunmap(pp); 253 254 if (mapping_writably_mapped(mp)) 255 flush_dcache_page(pp); 256 257 mark_page_accessed(pp); 258 SetPageUptodate(pp); 259 ClearPageError(pp); 260 unlock_page(pp); 261 put_page(pp); 262 } 263 264 len -= nbytes; 265 off = 0; 266 } 267 } 268 269 /* 270 * When a file is memory mapped, we must keep the IO data synchronized 271 * between the DMU cache and the memory mapped pages. What this means: 272 * 273 * On Read: We "read" preferentially from memory mapped pages, 274 * else we default from the dmu buffer. 275 * 276 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 277 * the file is memory mapped. 
278 */ 279 int 280 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) 281 { 282 struct inode *ip = ZTOI(zp); 283 struct address_space *mp = ip->i_mapping; 284 struct page *pp; 285 int64_t start, off; 286 uint64_t bytes; 287 int len = nbytes; 288 int error = 0; 289 void *pb; 290 291 start = uio->uio_loffset; 292 off = start & (PAGE_SIZE-1); 293 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 294 bytes = MIN(PAGE_SIZE - off, len); 295 296 pp = find_lock_page(mp, start >> PAGE_SHIFT); 297 if (pp) { 298 ASSERT(PageUptodate(pp)); 299 unlock_page(pp); 300 301 pb = kmap(pp); 302 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); 303 kunmap(pp); 304 305 if (mapping_writably_mapped(mp)) 306 flush_dcache_page(pp); 307 308 mark_page_accessed(pp); 309 put_page(pp); 310 } else { 311 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 312 uio, bytes); 313 } 314 315 len -= bytes; 316 off = 0; 317 if (error) 318 break; 319 } 320 return (error); 321 } 322 #endif /* _KERNEL */ 323 324 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; 325 326 /* 327 * Write the bytes to a file. 328 * 329 * IN: zp - znode of file to be written to 330 * data - bytes to write 331 * len - number of bytes to write 332 * pos - offset to start writing at 333 * 334 * OUT: resid - remaining bytes to write 335 * 336 * RETURN: 0 if success 337 * positive error code if failure. EIO is returned 338 * for a short write when residp isn't provided. 
339 * 340 * Timestamps: 341 * zp - ctime|mtime updated if byte count > 0 342 */ 343 int 344 zfs_write_simple(znode_t *zp, const void *data, size_t len, 345 loff_t pos, size_t *residp) 346 { 347 fstrans_cookie_t cookie; 348 int error; 349 350 struct iovec iov; 351 iov.iov_base = (void *)data; 352 iov.iov_len = len; 353 354 zfs_uio_t uio; 355 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); 356 357 cookie = spl_fstrans_mark(); 358 error = zfs_write(zp, &uio, 0, kcred); 359 spl_fstrans_unmark(cookie); 360 361 if (error == 0) { 362 if (residp != NULL) 363 *residp = zfs_uio_resid(&uio); 364 else if (zfs_uio_resid(&uio) != 0) 365 error = SET_ERROR(EIO); 366 } 367 368 return (error); 369 } 370 371 static void 372 zfs_rele_async_task(void *arg) 373 { 374 iput(arg); 375 } 376 377 void 378 zfs_zrele_async(znode_t *zp) 379 { 380 struct inode *ip = ZTOI(zp); 381 objset_t *os = ITOZSB(ip)->z_os; 382 383 ASSERT(atomic_read(&ip->i_count) > 0); 384 ASSERT(os != NULL); 385 386 /* 387 * If decrementing the count would put us at 0, we can't do it inline 388 * here, because that would be synchronous. Instead, dispatch an iput 389 * to run later. 390 * 391 * For more information on the dangers of a synchronous iput, see the 392 * header comment of this file. 393 */ 394 if (!atomic_add_unless(&ip->i_count, -1, 1)) { 395 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), 396 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); 397 } 398 } 399 400 401 /* 402 * Lookup an entry in a directory, or an extended attribute directory. 403 * If it exists, return a held inode reference for it. 404 * 405 * IN: zdp - znode of directory to search. 406 * nm - name of entry to lookup. 407 * flags - LOOKUP_XATTR set if looking for an attribute. 408 * cr - credentials of caller. 409 * direntflags - directory lookup flags 410 * realpnp - returned pathname. 411 * 412 * OUT: zpp - znode of located entry, NULL if not found. 413 * 414 * RETURN: 0 on success, error code on failure. 
415 * 416 * Timestamps: 417 * NA 418 */ 419 int 420 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, 421 int *direntflags, pathname_t *realpnp) 422 { 423 zfsvfs_t *zfsvfs = ZTOZSB(zdp); 424 int error = 0; 425 426 /* 427 * Fast path lookup, however we must skip DNLC lookup 428 * for case folding or normalizing lookups because the 429 * DNLC code only stores the passed in name. This means 430 * creating 'a' and removing 'A' on a case insensitive 431 * file system would work, but DNLC still thinks 'a' 432 * exists and won't let you create it again on the next 433 * pass through fast path. 434 */ 435 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { 436 437 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 438 return (SET_ERROR(ENOTDIR)); 439 } else if (zdp->z_sa_hdl == NULL) { 440 return (SET_ERROR(EIO)); 441 } 442 443 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { 444 error = zfs_fastaccesschk_execute(zdp, cr); 445 if (!error) { 446 *zpp = zdp; 447 zhold(*zpp); 448 return (0); 449 } 450 return (error); 451 } 452 } 453 454 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) 455 return (error); 456 457 *zpp = NULL; 458 459 if (flags & LOOKUP_XATTR) { 460 /* 461 * We don't allow recursive attributes.. 462 * Maybe someday we will. 463 */ 464 if (zdp->z_pflags & ZFS_XATTR) { 465 zfs_exit(zfsvfs, FTAG); 466 return (SET_ERROR(EINVAL)); 467 } 468 469 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { 470 zfs_exit(zfsvfs, FTAG); 471 return (error); 472 } 473 474 /* 475 * Do we have permission to get into attribute directory? 476 */ 477 478 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, 479 B_TRUE, cr, kcred->user_ns))) { 480 zrele(*zpp); 481 *zpp = NULL; 482 } 483 484 zfs_exit(zfsvfs, FTAG); 485 return (error); 486 } 487 488 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 489 zfs_exit(zfsvfs, FTAG); 490 return (SET_ERROR(ENOTDIR)); 491 } 492 493 /* 494 * Check accessibility of directory. 
495 */ 496 497 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, 498 kcred->user_ns))) { 499 zfs_exit(zfsvfs, FTAG); 500 return (error); 501 } 502 503 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 504 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 505 zfs_exit(zfsvfs, FTAG); 506 return (SET_ERROR(EILSEQ)); 507 } 508 509 error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); 510 if ((error == 0) && (*zpp)) 511 zfs_znode_update_vfs(*zpp); 512 513 zfs_exit(zfsvfs, FTAG); 514 return (error); 515 } 516 517 /* 518 * Attempt to create a new entry in a directory. If the entry 519 * already exists, truncate the file if permissible, else return 520 * an error. Return the ip of the created or trunc'd file. 521 * 522 * IN: dzp - znode of directory to put new file entry in. 523 * name - name of new file entry. 524 * vap - attributes of new file. 525 * excl - flag indicating exclusive or non-exclusive mode. 526 * mode - mode to open file with. 527 * cr - credentials of caller. 528 * flag - file flag. 529 * vsecp - ACL to be set 530 * mnt_ns - user namespace of the mount 531 * 532 * OUT: zpp - znode of created or trunc'd entry. 533 * 534 * RETURN: 0 on success, error code on failure. 535 * 536 * Timestamps: 537 * dzp - ctime|mtime updated if new entry created 538 * zp - ctime|mtime always, atime if new 539 */ 540 int 541 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, 542 int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, 543 zuserns_t *mnt_ns) 544 { 545 znode_t *zp; 546 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 547 zilog_t *zilog; 548 objset_t *os; 549 zfs_dirlock_t *dl; 550 dmu_tx_t *tx; 551 int error; 552 uid_t uid; 553 gid_t gid; 554 zfs_acl_ids_t acl_ids; 555 boolean_t fuid_dirtied; 556 boolean_t have_acl = B_FALSE; 557 boolean_t waited = B_FALSE; 558 boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 559 560 /* 561 * If we have an ephemeral id, ACL, or XVATTR then 562 * make sure file system is at proper version 563 */ 564 565 gid = crgetgid(cr); 566 uid = crgetuid(cr); 567 568 if (zfsvfs->z_use_fuids == B_FALSE && 569 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 570 return (SET_ERROR(EINVAL)); 571 572 if (name == NULL) 573 return (SET_ERROR(EINVAL)); 574 575 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 576 return (error); 577 os = zfsvfs->z_os; 578 zilog = zfsvfs->z_log; 579 580 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 581 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 582 zfs_exit(zfsvfs, FTAG); 583 return (SET_ERROR(EILSEQ)); 584 } 585 586 if (vap->va_mask & ATTR_XVATTR) { 587 if ((error = secpolicy_xvattr((xvattr_t *)vap, 588 crgetuid(cr), cr, vap->va_mode)) != 0) { 589 zfs_exit(zfsvfs, FTAG); 590 return (error); 591 } 592 } 593 594 top: 595 *zpp = NULL; 596 if (*name == '\0') { 597 /* 598 * Null component name refers to the directory itself. 599 */ 600 zhold(dzp); 601 zp = dzp; 602 dl = NULL; 603 error = 0; 604 } else { 605 /* possible igrab(zp) */ 606 int zflg = 0; 607 608 if (flag & FIGNORECASE) 609 zflg |= ZCILOOK; 610 611 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 612 NULL, NULL); 613 if (error) { 614 if (have_acl) 615 zfs_acl_ids_free(&acl_ids); 616 if (strcmp(name, "..") == 0) 617 error = SET_ERROR(EISDIR); 618 zfs_exit(zfsvfs, FTAG); 619 return (error); 620 } 621 } 622 623 if (zp == NULL) { 624 uint64_t txtype; 625 uint64_t projid = ZFS_DEFAULT_PROJID; 626 627 /* 628 * Create a new file object and update the directory 629 * to reference it. 630 */ 631 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, 632 mnt_ns))) { 633 if (have_acl) 634 zfs_acl_ids_free(&acl_ids); 635 goto out; 636 } 637 638 /* 639 * We only support the creation of regular files in 640 * extended attribute directories. 
641 */ 642 643 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { 644 if (have_acl) 645 zfs_acl_ids_free(&acl_ids); 646 error = SET_ERROR(EINVAL); 647 goto out; 648 } 649 650 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 651 cr, vsecp, &acl_ids, mnt_ns)) != 0) 652 goto out; 653 have_acl = B_TRUE; 654 655 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 656 projid = zfs_inherit_projid(dzp); 657 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 658 zfs_acl_ids_free(&acl_ids); 659 error = SET_ERROR(EDQUOT); 660 goto out; 661 } 662 663 tx = dmu_tx_create(os); 664 665 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 666 ZFS_SA_BASE_ATTR_SIZE); 667 668 fuid_dirtied = zfsvfs->z_fuid_dirty; 669 if (fuid_dirtied) 670 zfs_fuid_txhold(zfsvfs, tx); 671 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 672 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 673 if (!zfsvfs->z_use_sa && 674 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 675 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 676 0, acl_ids.z_aclp->z_acl_bytes); 677 } 678 679 error = dmu_tx_assign(tx, 680 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 681 if (error) { 682 zfs_dirent_unlock(dl); 683 if (error == ERESTART) { 684 waited = B_TRUE; 685 dmu_tx_wait(tx); 686 dmu_tx_abort(tx); 687 goto top; 688 } 689 zfs_acl_ids_free(&acl_ids); 690 dmu_tx_abort(tx); 691 zfs_exit(zfsvfs, FTAG); 692 return (error); 693 } 694 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 695 696 error = zfs_link_create(dl, zp, tx, ZNEW); 697 if (error != 0) { 698 /* 699 * Since, we failed to add the directory entry for it, 700 * delete the newly created dnode. 
701 */ 702 zfs_znode_delete(zp, tx); 703 remove_inode_hash(ZTOI(zp)); 704 zfs_acl_ids_free(&acl_ids); 705 dmu_tx_commit(tx); 706 goto out; 707 } 708 709 if (fuid_dirtied) 710 zfs_fuid_sync(zfsvfs, tx); 711 712 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 713 if (flag & FIGNORECASE) 714 txtype |= TX_CI; 715 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 716 vsecp, acl_ids.z_fuidp, vap); 717 zfs_acl_ids_free(&acl_ids); 718 dmu_tx_commit(tx); 719 } else { 720 int aflags = (flag & O_APPEND) ? V_APPEND : 0; 721 722 if (have_acl) 723 zfs_acl_ids_free(&acl_ids); 724 725 /* 726 * A directory entry already exists for this name. 727 */ 728 /* 729 * Can't truncate an existing file if in exclusive mode. 730 */ 731 if (excl) { 732 error = SET_ERROR(EEXIST); 733 goto out; 734 } 735 /* 736 * Can't open a directory for writing. 737 */ 738 if (S_ISDIR(ZTOI(zp)->i_mode)) { 739 error = SET_ERROR(EISDIR); 740 goto out; 741 } 742 /* 743 * Verify requested access to file. 744 */ 745 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, 746 mnt_ns))) { 747 goto out; 748 } 749 750 mutex_enter(&dzp->z_lock); 751 dzp->z_seq++; 752 mutex_exit(&dzp->z_lock); 753 754 /* 755 * Truncate regular files if requested. 
756 */ 757 if (S_ISREG(ZTOI(zp)->i_mode) && 758 (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { 759 /* we can't hold any locks when calling zfs_freesp() */ 760 if (dl) { 761 zfs_dirent_unlock(dl); 762 dl = NULL; 763 } 764 error = zfs_freesp(zp, 0, 0, mode, TRUE); 765 } 766 } 767 out: 768 769 if (dl) 770 zfs_dirent_unlock(dl); 771 772 if (error) { 773 if (zp) 774 zrele(zp); 775 } else { 776 zfs_znode_update_vfs(dzp); 777 zfs_znode_update_vfs(zp); 778 *zpp = zp; 779 } 780 781 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 782 zil_commit(zilog, 0); 783 784 zfs_exit(zfsvfs, FTAG); 785 return (error); 786 } 787 788 int 789 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, 790 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, 791 zuserns_t *mnt_ns) 792 { 793 (void) excl, (void) mode, (void) flag; 794 znode_t *zp = NULL, *dzp = ITOZ(dip); 795 zfsvfs_t *zfsvfs = ITOZSB(dip); 796 objset_t *os; 797 dmu_tx_t *tx; 798 int error; 799 uid_t uid; 800 gid_t gid; 801 zfs_acl_ids_t acl_ids; 802 uint64_t projid = ZFS_DEFAULT_PROJID; 803 boolean_t fuid_dirtied; 804 boolean_t have_acl = B_FALSE; 805 boolean_t waited = B_FALSE; 806 807 /* 808 * If we have an ephemeral id, ACL, or XVATTR then 809 * make sure file system is at proper version 810 */ 811 812 gid = crgetgid(cr); 813 uid = crgetuid(cr); 814 815 if (zfsvfs->z_use_fuids == B_FALSE && 816 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 817 return (SET_ERROR(EINVAL)); 818 819 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 820 return (error); 821 os = zfsvfs->z_os; 822 823 if (vap->va_mask & ATTR_XVATTR) { 824 if ((error = secpolicy_xvattr((xvattr_t *)vap, 825 crgetuid(cr), cr, vap->va_mode)) != 0) { 826 zfs_exit(zfsvfs, FTAG); 827 return (error); 828 } 829 } 830 831 top: 832 *ipp = NULL; 833 834 /* 835 * Create a new file object and update the directory 836 * to reference it. 
837 */ 838 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 839 if (have_acl) 840 zfs_acl_ids_free(&acl_ids); 841 goto out; 842 } 843 844 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 845 cr, vsecp, &acl_ids, mnt_ns)) != 0) 846 goto out; 847 have_acl = B_TRUE; 848 849 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 850 projid = zfs_inherit_projid(dzp); 851 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 852 zfs_acl_ids_free(&acl_ids); 853 error = SET_ERROR(EDQUOT); 854 goto out; 855 } 856 857 tx = dmu_tx_create(os); 858 859 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 860 ZFS_SA_BASE_ATTR_SIZE); 861 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 862 863 fuid_dirtied = zfsvfs->z_fuid_dirty; 864 if (fuid_dirtied) 865 zfs_fuid_txhold(zfsvfs, tx); 866 if (!zfsvfs->z_use_sa && 867 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 868 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 869 0, acl_ids.z_aclp->z_acl_bytes); 870 } 871 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 872 if (error) { 873 if (error == ERESTART) { 874 waited = B_TRUE; 875 dmu_tx_wait(tx); 876 dmu_tx_abort(tx); 877 goto top; 878 } 879 zfs_acl_ids_free(&acl_ids); 880 dmu_tx_abort(tx); 881 zfs_exit(zfsvfs, FTAG); 882 return (error); 883 } 884 zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); 885 886 if (fuid_dirtied) 887 zfs_fuid_sync(zfsvfs, tx); 888 889 /* Add to unlinked set */ 890 zp->z_unlinked = B_TRUE; 891 zfs_unlinked_add(zp, tx); 892 zfs_acl_ids_free(&acl_ids); 893 dmu_tx_commit(tx); 894 out: 895 896 if (error) { 897 if (zp) 898 zrele(zp); 899 } else { 900 zfs_znode_update_vfs(dzp); 901 zfs_znode_update_vfs(zp); 902 *ipp = ZTOI(zp); 903 } 904 905 zfs_exit(zfsvfs, FTAG); 906 return (error); 907 } 908 909 /* 910 * Remove an entry from a directory. 911 * 912 * IN: dzp - znode of directory to remove entry from. 913 * name - name of entry to remove. 914 * cr - credentials of caller. 915 * flags - case flags. 
916 * 917 * RETURN: 0 if success 918 * error code if failure 919 * 920 * Timestamps: 921 * dzp - ctime|mtime 922 * ip - ctime (if nlink > 0) 923 */ 924 925 static uint64_t null_xattr = 0; 926 927 int 928 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) 929 { 930 znode_t *zp; 931 znode_t *xzp; 932 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 933 zilog_t *zilog; 934 uint64_t acl_obj, xattr_obj; 935 uint64_t xattr_obj_unlinked = 0; 936 uint64_t obj = 0; 937 uint64_t links; 938 zfs_dirlock_t *dl; 939 dmu_tx_t *tx; 940 boolean_t may_delete_now, delete_now = FALSE; 941 boolean_t unlinked, toobig = FALSE; 942 uint64_t txtype; 943 pathname_t *realnmp = NULL; 944 pathname_t realnm; 945 int error; 946 int zflg = ZEXISTS; 947 boolean_t waited = B_FALSE; 948 949 if (name == NULL) 950 return (SET_ERROR(EINVAL)); 951 952 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 953 return (error); 954 zilog = zfsvfs->z_log; 955 956 if (flags & FIGNORECASE) { 957 zflg |= ZCILOOK; 958 pn_alloc(&realnm); 959 realnmp = &realnm; 960 } 961 962 top: 963 xattr_obj = 0; 964 xzp = NULL; 965 /* 966 * Attempt to lock directory; fail if entry doesn't exist. 967 */ 968 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 969 NULL, realnmp))) { 970 if (realnmp) 971 pn_free(realnmp); 972 zfs_exit(zfsvfs, FTAG); 973 return (error); 974 } 975 976 if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { 977 goto out; 978 } 979 980 /* 981 * Need to use rmdir for removing directories. 982 */ 983 if (S_ISDIR(ZTOI(zp)->i_mode)) { 984 error = SET_ERROR(EPERM); 985 goto out; 986 } 987 988 mutex_enter(&zp->z_lock); 989 may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && 990 !(zp->z_is_mapped); 991 mutex_exit(&zp->z_lock); 992 993 /* 994 * We may delete the znode now, or we may put it in the unlinked set; 995 * it depends on whether we're the last link, and on whether there are 996 * other holds on the inode. So we dmu_tx_hold() the right things to 997 * allow for either case. 
998 */ 999 obj = zp->z_id; 1000 tx = dmu_tx_create(zfsvfs->z_os); 1001 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1002 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1003 zfs_sa_upgrade_txholds(tx, zp); 1004 zfs_sa_upgrade_txholds(tx, dzp); 1005 if (may_delete_now) { 1006 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; 1007 /* if the file is too big, only hold_free a token amount */ 1008 dmu_tx_hold_free(tx, zp->z_id, 0, 1009 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1010 } 1011 1012 /* are there any extended attributes? */ 1013 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1014 &xattr_obj, sizeof (xattr_obj)); 1015 if (error == 0 && xattr_obj) { 1016 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 1017 ASSERT0(error); 1018 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1019 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 1020 } 1021 1022 mutex_enter(&zp->z_lock); 1023 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) 1024 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1025 mutex_exit(&zp->z_lock); 1026 1027 /* charge as an update -- would be nice not to charge at all */ 1028 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1029 1030 /* 1031 * Mark this transaction as typically resulting in a net free of space 1032 */ 1033 dmu_tx_mark_netfree(tx); 1034 1035 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1036 if (error) { 1037 zfs_dirent_unlock(dl); 1038 if (error == ERESTART) { 1039 waited = B_TRUE; 1040 dmu_tx_wait(tx); 1041 dmu_tx_abort(tx); 1042 zrele(zp); 1043 if (xzp) 1044 zrele(xzp); 1045 goto top; 1046 } 1047 if (realnmp) 1048 pn_free(realnmp); 1049 dmu_tx_abort(tx); 1050 zrele(zp); 1051 if (xzp) 1052 zrele(xzp); 1053 zfs_exit(zfsvfs, FTAG); 1054 return (error); 1055 } 1056 1057 /* 1058 * Remove the directory entry. 
1059 */ 1060 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1061 1062 if (error) { 1063 dmu_tx_commit(tx); 1064 goto out; 1065 } 1066 1067 if (unlinked) { 1068 /* 1069 * Hold z_lock so that we can make sure that the ACL obj 1070 * hasn't changed. Could have been deleted due to 1071 * zfs_sa_upgrade(). 1072 */ 1073 mutex_enter(&zp->z_lock); 1074 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1075 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); 1076 delete_now = may_delete_now && !toobig && 1077 atomic_read(&ZTOI(zp)->i_count) == 1 && 1078 !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && 1079 zfs_external_acl(zp) == acl_obj; 1080 } 1081 1082 if (delete_now) { 1083 if (xattr_obj_unlinked) { 1084 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); 1085 mutex_enter(&xzp->z_lock); 1086 xzp->z_unlinked = B_TRUE; 1087 clear_nlink(ZTOI(xzp)); 1088 links = 0; 1089 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 1090 &links, sizeof (links), tx); 1091 ASSERT3U(error, ==, 0); 1092 mutex_exit(&xzp->z_lock); 1093 zfs_unlinked_add(xzp, tx); 1094 1095 if (zp->z_is_sa) 1096 error = sa_remove(zp->z_sa_hdl, 1097 SA_ZPL_XATTR(zfsvfs), tx); 1098 else 1099 error = sa_update(zp->z_sa_hdl, 1100 SA_ZPL_XATTR(zfsvfs), &null_xattr, 1101 sizeof (uint64_t), tx); 1102 ASSERT0(error); 1103 } 1104 /* 1105 * Add to the unlinked set because a new reference could be 1106 * taken concurrently resulting in a deferred destruction. 
1107 */ 1108 zfs_unlinked_add(zp, tx); 1109 mutex_exit(&zp->z_lock); 1110 } else if (unlinked) { 1111 mutex_exit(&zp->z_lock); 1112 zfs_unlinked_add(zp, tx); 1113 } 1114 1115 txtype = TX_REMOVE; 1116 if (flags & FIGNORECASE) 1117 txtype |= TX_CI; 1118 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); 1119 1120 dmu_tx_commit(tx); 1121 out: 1122 if (realnmp) 1123 pn_free(realnmp); 1124 1125 zfs_dirent_unlock(dl); 1126 zfs_znode_update_vfs(dzp); 1127 zfs_znode_update_vfs(zp); 1128 1129 if (delete_now) 1130 zrele(zp); 1131 else 1132 zfs_zrele_async(zp); 1133 1134 if (xzp) { 1135 zfs_znode_update_vfs(xzp); 1136 zfs_zrele_async(xzp); 1137 } 1138 1139 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1140 zil_commit(zilog, 0); 1141 1142 zfs_exit(zfsvfs, FTAG); 1143 return (error); 1144 } 1145 1146 /* 1147 * Create a new directory and insert it into dzp using the name 1148 * provided. Return a pointer to the inserted directory. 1149 * 1150 * IN: dzp - znode of directory to add subdir to. 1151 * dirname - name of new directory. 1152 * vap - attributes of new directory. 1153 * cr - credentials of caller. 1154 * flags - case flags. 1155 * vsecp - ACL to be set 1156 * mnt_ns - user namespace of the mount 1157 * 1158 * OUT: zpp - znode of created directory. 
1159 * 1160 * RETURN: 0 if success 1161 * error code if failure 1162 * 1163 * Timestamps: 1164 * dzp - ctime|mtime updated 1165 * zpp - ctime|mtime|atime updated 1166 */ 1167 int 1168 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, 1169 cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns) 1170 { 1171 znode_t *zp; 1172 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1173 zilog_t *zilog; 1174 zfs_dirlock_t *dl; 1175 uint64_t txtype; 1176 dmu_tx_t *tx; 1177 int error; 1178 int zf = ZNEW; 1179 uid_t uid; 1180 gid_t gid = crgetgid(cr); 1181 zfs_acl_ids_t acl_ids; 1182 boolean_t fuid_dirtied; 1183 boolean_t waited = B_FALSE; 1184 1185 ASSERT(S_ISDIR(vap->va_mode)); 1186 1187 /* 1188 * If we have an ephemeral id, ACL, or XVATTR then 1189 * make sure file system is at proper version 1190 */ 1191 1192 uid = crgetuid(cr); 1193 if (zfsvfs->z_use_fuids == B_FALSE && 1194 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1195 return (SET_ERROR(EINVAL)); 1196 1197 if (dirname == NULL) 1198 return (SET_ERROR(EINVAL)); 1199 1200 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1201 return (error); 1202 zilog = zfsvfs->z_log; 1203 1204 if (dzp->z_pflags & ZFS_XATTR) { 1205 zfs_exit(zfsvfs, FTAG); 1206 return (SET_ERROR(EINVAL)); 1207 } 1208 1209 if (zfsvfs->z_utf8 && u8_validate(dirname, 1210 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1211 zfs_exit(zfsvfs, FTAG); 1212 return (SET_ERROR(EILSEQ)); 1213 } 1214 if (flags & FIGNORECASE) 1215 zf |= ZCILOOK; 1216 1217 if (vap->va_mask & ATTR_XVATTR) { 1218 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1219 crgetuid(cr), cr, vap->va_mode)) != 0) { 1220 zfs_exit(zfsvfs, FTAG); 1221 return (error); 1222 } 1223 } 1224 1225 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 1226 vsecp, &acl_ids, mnt_ns)) != 0) { 1227 zfs_exit(zfsvfs, FTAG); 1228 return (error); 1229 } 1230 /* 1231 * First make sure the new directory doesn't exist. 
1232 * 1233 * Existence is checked first to make sure we don't return 1234 * EACCES instead of EEXIST which can cause some applications 1235 * to fail. 1236 */ 1237 top: 1238 *zpp = NULL; 1239 1240 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1241 NULL, NULL))) { 1242 zfs_acl_ids_free(&acl_ids); 1243 zfs_exit(zfsvfs, FTAG); 1244 return (error); 1245 } 1246 1247 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, 1248 mnt_ns))) { 1249 zfs_acl_ids_free(&acl_ids); 1250 zfs_dirent_unlock(dl); 1251 zfs_exit(zfsvfs, FTAG); 1252 return (error); 1253 } 1254 1255 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { 1256 zfs_acl_ids_free(&acl_ids); 1257 zfs_dirent_unlock(dl); 1258 zfs_exit(zfsvfs, FTAG); 1259 return (SET_ERROR(EDQUOT)); 1260 } 1261 1262 /* 1263 * Add a new entry to the directory. 1264 */ 1265 tx = dmu_tx_create(zfsvfs->z_os); 1266 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1267 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1268 fuid_dirtied = zfsvfs->z_fuid_dirty; 1269 if (fuid_dirtied) 1270 zfs_fuid_txhold(zfsvfs, tx); 1271 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1272 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1273 acl_ids.z_aclp->z_acl_bytes); 1274 } 1275 1276 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1277 ZFS_SA_BASE_ATTR_SIZE); 1278 1279 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1280 if (error) { 1281 zfs_dirent_unlock(dl); 1282 if (error == ERESTART) { 1283 waited = B_TRUE; 1284 dmu_tx_wait(tx); 1285 dmu_tx_abort(tx); 1286 goto top; 1287 } 1288 zfs_acl_ids_free(&acl_ids); 1289 dmu_tx_abort(tx); 1290 zfs_exit(zfsvfs, FTAG); 1291 return (error); 1292 } 1293 1294 /* 1295 * Create new node. 1296 */ 1297 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1298 1299 /* 1300 * Now put new name in parent dir. 
1301 */ 1302 error = zfs_link_create(dl, zp, tx, ZNEW); 1303 if (error != 0) { 1304 zfs_znode_delete(zp, tx); 1305 remove_inode_hash(ZTOI(zp)); 1306 goto out; 1307 } 1308 1309 if (fuid_dirtied) 1310 zfs_fuid_sync(zfsvfs, tx); 1311 1312 *zpp = zp; 1313 1314 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1315 if (flags & FIGNORECASE) 1316 txtype |= TX_CI; 1317 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 1318 acl_ids.z_fuidp, vap); 1319 1320 out: 1321 zfs_acl_ids_free(&acl_ids); 1322 1323 dmu_tx_commit(tx); 1324 1325 zfs_dirent_unlock(dl); 1326 1327 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1328 zil_commit(zilog, 0); 1329 1330 if (error != 0) { 1331 zrele(zp); 1332 } else { 1333 zfs_znode_update_vfs(dzp); 1334 zfs_znode_update_vfs(zp); 1335 } 1336 zfs_exit(zfsvfs, FTAG); 1337 return (error); 1338 } 1339 1340 /* 1341 * Remove a directory subdir entry. If the current working 1342 * directory is the same as the subdir to be removed, the 1343 * remove will fail. 1344 * 1345 * IN: dzp - znode of directory to remove from. 1346 * name - name of directory to be removed. 1347 * cwd - inode of current working directory. 1348 * cr - credentials of caller. 1349 * flags - case flags 1350 * 1351 * RETURN: 0 on success, error code on failure. 1352 * 1353 * Timestamps: 1354 * dzp - ctime|mtime updated 1355 */ 1356 int 1357 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, 1358 int flags) 1359 { 1360 znode_t *zp; 1361 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1362 zilog_t *zilog; 1363 zfs_dirlock_t *dl; 1364 dmu_tx_t *tx; 1365 int error; 1366 int zflg = ZEXISTS; 1367 boolean_t waited = B_FALSE; 1368 1369 if (name == NULL) 1370 return (SET_ERROR(EINVAL)); 1371 1372 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1373 return (error); 1374 zilog = zfsvfs->z_log; 1375 1376 if (flags & FIGNORECASE) 1377 zflg |= ZCILOOK; 1378 top: 1379 zp = NULL; 1380 1381 /* 1382 * Attempt to lock directory; fail if entry doesn't exist. 
1383 */ 1384 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1385 NULL, NULL))) { 1386 zfs_exit(zfsvfs, FTAG); 1387 return (error); 1388 } 1389 1390 if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { 1391 goto out; 1392 } 1393 1394 if (!S_ISDIR(ZTOI(zp)->i_mode)) { 1395 error = SET_ERROR(ENOTDIR); 1396 goto out; 1397 } 1398 1399 if (zp == cwd) { 1400 error = SET_ERROR(EINVAL); 1401 goto out; 1402 } 1403 1404 /* 1405 * Grab a lock on the directory to make sure that no one is 1406 * trying to add (or lookup) entries while we are removing it. 1407 */ 1408 rw_enter(&zp->z_name_lock, RW_WRITER); 1409 1410 /* 1411 * Grab a lock on the parent pointer to make sure we play well 1412 * with the treewalk and directory rename code. 1413 */ 1414 rw_enter(&zp->z_parent_lock, RW_WRITER); 1415 1416 tx = dmu_tx_create(zfsvfs->z_os); 1417 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1418 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1419 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1420 zfs_sa_upgrade_txholds(tx, zp); 1421 zfs_sa_upgrade_txholds(tx, dzp); 1422 dmu_tx_mark_netfree(tx); 1423 error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1424 if (error) { 1425 rw_exit(&zp->z_parent_lock); 1426 rw_exit(&zp->z_name_lock); 1427 zfs_dirent_unlock(dl); 1428 if (error == ERESTART) { 1429 waited = B_TRUE; 1430 dmu_tx_wait(tx); 1431 dmu_tx_abort(tx); 1432 zrele(zp); 1433 goto top; 1434 } 1435 dmu_tx_abort(tx); 1436 zrele(zp); 1437 zfs_exit(zfsvfs, FTAG); 1438 return (error); 1439 } 1440 1441 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1442 1443 if (error == 0) { 1444 uint64_t txtype = TX_RMDIR; 1445 if (flags & FIGNORECASE) 1446 txtype |= TX_CI; 1447 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, 1448 B_FALSE); 1449 } 1450 1451 dmu_tx_commit(tx); 1452 1453 rw_exit(&zp->z_parent_lock); 1454 rw_exit(&zp->z_name_lock); 1455 out: 1456 zfs_dirent_unlock(dl); 1457 1458 zfs_znode_update_vfs(dzp); 1459 zfs_znode_update_vfs(zp); 1460 zrele(zp); 1461 1462 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1463 zil_commit(zilog, 0); 1464 1465 zfs_exit(zfsvfs, FTAG); 1466 return (error); 1467 } 1468 1469 /* 1470 * Read directory entries from the given directory cursor position and emit 1471 * name and position for each entry. 1472 * 1473 * IN: ip - inode of directory to read. 1474 * ctx - directory entry context. 1475 * cr - credentials of caller. 1476 * 1477 * RETURN: 0 if success 1478 * error code if failure 1479 * 1480 * Timestamps: 1481 * ip - atime updated 1482 * 1483 * Note that the low 4 bits of the cookie returned by zap is always zero. 1484 * This allows us to use the low range for "special" directory entries: 1485 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 1486 * we use the offset 2 for the '.zfs' directory. 
1487 */ 1488 int 1489 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) 1490 { 1491 (void) cr; 1492 znode_t *zp = ITOZ(ip); 1493 zfsvfs_t *zfsvfs = ITOZSB(ip); 1494 objset_t *os; 1495 zap_cursor_t zc; 1496 zap_attribute_t zap; 1497 int error; 1498 uint8_t prefetch; 1499 uint8_t type; 1500 int done = 0; 1501 uint64_t parent; 1502 uint64_t offset; /* must be unsigned; checks for < 1 */ 1503 1504 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1505 return (error); 1506 1507 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1508 &parent, sizeof (parent))) != 0) 1509 goto out; 1510 1511 /* 1512 * Quit if directory has been removed (posix) 1513 */ 1514 if (zp->z_unlinked) 1515 goto out; 1516 1517 error = 0; 1518 os = zfsvfs->z_os; 1519 offset = ctx->pos; 1520 prefetch = zp->z_zn_prefetch; 1521 1522 /* 1523 * Initialize the iterator cursor. 1524 */ 1525 if (offset <= 3) { 1526 /* 1527 * Start iteration from the beginning of the directory. 1528 */ 1529 zap_cursor_init(&zc, os, zp->z_id); 1530 } else { 1531 /* 1532 * The offset is a serialized cursor. 1533 */ 1534 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 1535 } 1536 1537 /* 1538 * Transform to file-system independent format 1539 */ 1540 while (!done) { 1541 uint64_t objnum; 1542 /* 1543 * Special case `.', `..', and `.zfs'. 1544 */ 1545 if (offset == 0) { 1546 (void) strcpy(zap.za_name, "."); 1547 zap.za_normalization_conflict = 0; 1548 objnum = zp->z_id; 1549 type = DT_DIR; 1550 } else if (offset == 1) { 1551 (void) strcpy(zap.za_name, ".."); 1552 zap.za_normalization_conflict = 0; 1553 objnum = parent; 1554 type = DT_DIR; 1555 } else if (offset == 2 && zfs_show_ctldir(zp)) { 1556 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 1557 zap.za_normalization_conflict = 0; 1558 objnum = ZFSCTL_INO_ROOT; 1559 type = DT_DIR; 1560 } else { 1561 /* 1562 * Grab next entry. 
1563 */ 1564 if ((error = zap_cursor_retrieve(&zc, &zap))) { 1565 if (error == ENOENT) 1566 break; 1567 else 1568 goto update; 1569 } 1570 1571 /* 1572 * Allow multiple entries provided the first entry is 1573 * the object id. Non-zpl consumers may safely make 1574 * use of the additional space. 1575 * 1576 * XXX: This should be a feature flag for compatibility 1577 */ 1578 if (zap.za_integer_length != 8 || 1579 zap.za_num_integers == 0) { 1580 cmn_err(CE_WARN, "zap_readdir: bad directory " 1581 "entry, obj = %lld, offset = %lld, " 1582 "length = %d, num = %lld\n", 1583 (u_longlong_t)zp->z_id, 1584 (u_longlong_t)offset, 1585 zap.za_integer_length, 1586 (u_longlong_t)zap.za_num_integers); 1587 error = SET_ERROR(ENXIO); 1588 goto update; 1589 } 1590 1591 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 1592 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 1593 } 1594 1595 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), 1596 objnum, type); 1597 if (done) 1598 break; 1599 1600 /* Prefetch znode */ 1601 if (prefetch) { 1602 dmu_prefetch(os, objnum, 0, 0, 0, 1603 ZIO_PRIORITY_SYNC_READ); 1604 } 1605 1606 /* 1607 * Move to the next entry, fill in the previous offset. 1608 */ 1609 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 1610 zap_cursor_advance(&zc); 1611 offset = zap_cursor_serialize(&zc); 1612 } else { 1613 offset += 1; 1614 } 1615 ctx->pos = offset; 1616 } 1617 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 1618 1619 update: 1620 zap_cursor_fini(&zc); 1621 if (error == ENOENT) 1622 error = 0; 1623 out: 1624 zfs_exit(zfsvfs, FTAG); 1625 1626 return (error); 1627 } 1628 1629 /* 1630 * Get the basic file attributes and place them in the provided kstat 1631 * structure. The inode is assumed to be the authoritative source 1632 * for most of the attributes. However, the znode currently has the 1633 * authoritative atime, blksize, and block count. 1634 * 1635 * IN: ip - inode of file. 1636 * 1637 * OUT: sp - kstat values. 
1638 * 1639 * RETURN: 0 (always succeeds) 1640 */ 1641 int 1642 zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, 1643 struct kstat *sp) 1644 { 1645 znode_t *zp = ITOZ(ip); 1646 zfsvfs_t *zfsvfs = ITOZSB(ip); 1647 uint32_t blksize; 1648 u_longlong_t nblocks; 1649 int error; 1650 1651 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1652 return (error); 1653 1654 mutex_enter(&zp->z_lock); 1655 1656 zpl_generic_fillattr(user_ns, ip, sp); 1657 /* 1658 * +1 link count for root inode with visible '.zfs' directory. 1659 */ 1660 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) 1661 if (sp->nlink < ZFS_LINK_MAX) 1662 sp->nlink++; 1663 1664 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1665 sp->blksize = blksize; 1666 sp->blocks = nblocks; 1667 1668 if (unlikely(zp->z_blksz == 0)) { 1669 /* 1670 * Block size hasn't been set; suggest maximal I/O transfers. 1671 */ 1672 sp->blksize = zfsvfs->z_max_blksz; 1673 } 1674 1675 mutex_exit(&zp->z_lock); 1676 1677 /* 1678 * Required to prevent NFS client from detecting different inode 1679 * numbers of snapshot root dentry before and after snapshot mount. 1680 */ 1681 if (zfsvfs->z_issnap) { 1682 if (ip->i_sb->s_root->d_inode == ip) 1683 sp->ino = ZFSCTL_INO_SNAPDIRS - 1684 dmu_objset_id(zfsvfs->z_os); 1685 } 1686 1687 zfs_exit(zfsvfs, FTAG); 1688 1689 return (0); 1690 } 1691 1692 /* 1693 * For the operation of changing file's user/group/project, we need to 1694 * handle not only the main object that is assigned to the file directly, 1695 * but also the ones that are used by the file via hidden xattr directory. 1696 * 1697 * Because the xattr directory may contains many EA entries, as to it may 1698 * be impossible to change all of them via the transaction of changing the 1699 * main object's user/group/project attributes. Then we have to change them 1700 * via other multiple independent transactions one by one. It may be not good 1701 * solution, but we have no better idea yet. 
1702 */ 1703 static int 1704 zfs_setattr_dir(znode_t *dzp) 1705 { 1706 struct inode *dxip = ZTOI(dzp); 1707 struct inode *xip = NULL; 1708 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1709 objset_t *os = zfsvfs->z_os; 1710 zap_cursor_t zc; 1711 zap_attribute_t zap; 1712 zfs_dirlock_t *dl; 1713 znode_t *zp = NULL; 1714 dmu_tx_t *tx = NULL; 1715 uint64_t uid, gid; 1716 sa_bulk_attr_t bulk[4]; 1717 int count; 1718 int err; 1719 1720 zap_cursor_init(&zc, os, dzp->z_id); 1721 while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { 1722 count = 0; 1723 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { 1724 err = ENXIO; 1725 break; 1726 } 1727 1728 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, 1729 ZEXISTS, NULL, NULL); 1730 if (err == ENOENT) 1731 goto next; 1732 if (err) 1733 break; 1734 1735 xip = ZTOI(zp); 1736 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && 1737 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && 1738 zp->z_projid == dzp->z_projid) 1739 goto next; 1740 1741 tx = dmu_tx_create(os); 1742 if (!(zp->z_pflags & ZFS_PROJID)) 1743 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1744 else 1745 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1746 1747 err = dmu_tx_assign(tx, TXG_WAIT); 1748 if (err) 1749 break; 1750 1751 mutex_enter(&dzp->z_lock); 1752 1753 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { 1754 xip->i_uid = dxip->i_uid; 1755 uid = zfs_uid_read(dxip); 1756 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1757 &uid, sizeof (uid)); 1758 } 1759 1760 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { 1761 xip->i_gid = dxip->i_gid; 1762 gid = zfs_gid_read(dxip); 1763 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 1764 &gid, sizeof (gid)); 1765 } 1766 1767 if (zp->z_projid != dzp->z_projid) { 1768 if (!(zp->z_pflags & ZFS_PROJID)) { 1769 zp->z_pflags |= ZFS_PROJID; 1770 SA_ADD_BULK_ATTR(bulk, count, 1771 SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 1772 sizeof (zp->z_pflags)); 1773 } 1774 1775 
zp->z_projid = dzp->z_projid; 1776 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), 1777 NULL, &zp->z_projid, sizeof (zp->z_projid)); 1778 } 1779 1780 mutex_exit(&dzp->z_lock); 1781 1782 if (likely(count > 0)) { 1783 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1784 dmu_tx_commit(tx); 1785 } else { 1786 dmu_tx_abort(tx); 1787 } 1788 tx = NULL; 1789 if (err != 0 && err != ENOENT) 1790 break; 1791 1792 next: 1793 if (zp) { 1794 zrele(zp); 1795 zp = NULL; 1796 zfs_dirent_unlock(dl); 1797 } 1798 zap_cursor_advance(&zc); 1799 } 1800 1801 if (tx) 1802 dmu_tx_abort(tx); 1803 if (zp) { 1804 zrele(zp); 1805 zfs_dirent_unlock(dl); 1806 } 1807 zap_cursor_fini(&zc); 1808 1809 return (err == ENOENT ? 0 : err); 1810 } 1811 1812 /* 1813 * Set the file attributes to the values contained in the 1814 * vattr structure. 1815 * 1816 * IN: zp - znode of file to be modified. 1817 * vap - new attribute values. 1818 * If ATTR_XVATTR set, then optional attrs are being set 1819 * flags - ATTR_UTIME set if non-default time values provided. 1820 * - ATTR_NOACLCHECK (CIFS context only). 1821 * cr - credentials of caller. 1822 * mnt_ns - user namespace of the mount 1823 * 1824 * RETURN: 0 if success 1825 * error code if failure 1826 * 1827 * Timestamps: 1828 * ip - ctime updated, mtime updated if size changed. 
1829 */ 1830 int 1831 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns) 1832 { 1833 struct inode *ip; 1834 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1835 objset_t *os = zfsvfs->z_os; 1836 zilog_t *zilog; 1837 dmu_tx_t *tx; 1838 vattr_t oldva; 1839 xvattr_t *tmpxvattr; 1840 uint_t mask = vap->va_mask; 1841 uint_t saved_mask = 0; 1842 int trim_mask = 0; 1843 uint64_t new_mode; 1844 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; 1845 uint64_t xattr_obj; 1846 uint64_t mtime[2], ctime[2], atime[2]; 1847 uint64_t projid = ZFS_INVALID_PROJID; 1848 znode_t *attrzp; 1849 int need_policy = FALSE; 1850 int err, err2 = 0; 1851 zfs_fuid_info_t *fuidp = NULL; 1852 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 1853 xoptattr_t *xoap; 1854 zfs_acl_t *aclp; 1855 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1856 boolean_t fuid_dirtied = B_FALSE; 1857 boolean_t handle_eadir = B_FALSE; 1858 sa_bulk_attr_t *bulk, *xattr_bulk; 1859 int count = 0, xattr_count = 0, bulks = 8; 1860 1861 if (mask == 0) 1862 return (0); 1863 1864 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1865 return (err); 1866 ip = ZTOI(zp); 1867 1868 /* 1869 * If this is a xvattr_t, then get a pointer to the structure of 1870 * optional attributes. If this is NULL, then we have a vattr_t. 
1871 */ 1872 xoap = xva_getxoptattr(xvap); 1873 if (xoap != NULL && (mask & ATTR_XVATTR)) { 1874 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 1875 if (!dmu_objset_projectquota_enabled(os) || 1876 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { 1877 zfs_exit(zfsvfs, FTAG); 1878 return (SET_ERROR(ENOTSUP)); 1879 } 1880 1881 projid = xoap->xoa_projid; 1882 if (unlikely(projid == ZFS_INVALID_PROJID)) { 1883 zfs_exit(zfsvfs, FTAG); 1884 return (SET_ERROR(EINVAL)); 1885 } 1886 1887 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 1888 projid = ZFS_INVALID_PROJID; 1889 else 1890 need_policy = TRUE; 1891 } 1892 1893 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 1894 (xoap->xoa_projinherit != 1895 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && 1896 (!dmu_objset_projectquota_enabled(os) || 1897 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { 1898 zfs_exit(zfsvfs, FTAG); 1899 return (SET_ERROR(ENOTSUP)); 1900 } 1901 } 1902 1903 zilog = zfsvfs->z_log; 1904 1905 /* 1906 * Make sure that if we have ephemeral uid/gid or xvattr specified 1907 * that file system is at proper version level 1908 */ 1909 1910 if (zfsvfs->z_use_fuids == B_FALSE && 1911 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || 1912 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || 1913 (mask & ATTR_XVATTR))) { 1914 zfs_exit(zfsvfs, FTAG); 1915 return (SET_ERROR(EINVAL)); 1916 } 1917 1918 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { 1919 zfs_exit(zfsvfs, FTAG); 1920 return (SET_ERROR(EISDIR)); 1921 } 1922 1923 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { 1924 zfs_exit(zfsvfs, FTAG); 1925 return (SET_ERROR(EINVAL)); 1926 } 1927 1928 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); 1929 xva_init(tmpxvattr); 1930 1931 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1932 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1933 1934 /* 1935 * Immutable files can only alter immutable bit and atime 1936 */ 1937 if ((zp->z_pflags & ZFS_IMMUTABLE) && 
1938 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || 1939 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 1940 err = SET_ERROR(EPERM); 1941 goto out3; 1942 } 1943 1944 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 1945 err = SET_ERROR(EPERM); 1946 goto out3; 1947 } 1948 1949 /* 1950 * Verify timestamps doesn't overflow 32 bits. 1951 * ZFS can handle large timestamps, but 32bit syscalls can't 1952 * handle times greater than 2039. This check should be removed 1953 * once large timestamps are fully supported. 1954 */ 1955 if (mask & (ATTR_ATIME | ATTR_MTIME)) { 1956 if (((mask & ATTR_ATIME) && 1957 TIMESPEC_OVERFLOW(&vap->va_atime)) || 1958 ((mask & ATTR_MTIME) && 1959 TIMESPEC_OVERFLOW(&vap->va_mtime))) { 1960 err = SET_ERROR(EOVERFLOW); 1961 goto out3; 1962 } 1963 } 1964 1965 top: 1966 attrzp = NULL; 1967 aclp = NULL; 1968 1969 /* Can this be moved to before the top label? */ 1970 if (zfs_is_readonly(zfsvfs)) { 1971 err = SET_ERROR(EROFS); 1972 goto out3; 1973 } 1974 1975 /* 1976 * First validate permissions 1977 */ 1978 1979 if (mask & ATTR_SIZE) { 1980 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, 1981 mnt_ns); 1982 if (err) 1983 goto out3; 1984 1985 /* 1986 * XXX - Note, we are not providing any open 1987 * mode flags here (like FNDELAY), so we may 1988 * block if there are locks present... this 1989 * should be addressed in openat(). 1990 */ 1991 /* XXX - would it be OK to generate a log record here? 
*/ 1992 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 1993 if (err) 1994 goto out3; 1995 } 1996 1997 if (mask & (ATTR_ATIME|ATTR_MTIME) || 1998 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 1999 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2000 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2001 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2002 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2003 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2004 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2005 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2006 skipaclchk, cr, mnt_ns); 2007 } 2008 2009 if (mask & (ATTR_UID|ATTR_GID)) { 2010 int idmask = (mask & (ATTR_UID|ATTR_GID)); 2011 int take_owner; 2012 int take_group; 2013 uid_t uid; 2014 gid_t gid; 2015 2016 /* 2017 * NOTE: even if a new mode is being set, 2018 * we may clear S_ISUID/S_ISGID bits. 2019 */ 2020 2021 if (!(mask & ATTR_MODE)) 2022 vap->va_mode = zp->z_mode; 2023 2024 /* 2025 * Take ownership or chgrp to group we are a member of 2026 */ 2027 2028 uid = zfs_uid_to_vfsuid((struct user_namespace *)mnt_ns, 2029 zfs_i_user_ns(ip), vap->va_uid); 2030 gid = zfs_gid_to_vfsgid((struct user_namespace *)mnt_ns, 2031 zfs_i_user_ns(ip), vap->va_gid); 2032 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); 2033 take_group = (mask & ATTR_GID) && 2034 zfs_groupmember(zfsvfs, gid, cr); 2035 2036 /* 2037 * If both ATTR_UID and ATTR_GID are set then take_owner and 2038 * take_group must both be set in order to allow taking 2039 * ownership. 
2040 * 2041 * Otherwise, send the check through secpolicy_vnode_setattr() 2042 * 2043 */ 2044 2045 if (((idmask == (ATTR_UID|ATTR_GID)) && 2046 take_owner && take_group) || 2047 ((idmask == ATTR_UID) && take_owner) || 2048 ((idmask == ATTR_GID) && take_group)) { 2049 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2050 skipaclchk, cr, mnt_ns) == 0) { 2051 /* 2052 * Remove setuid/setgid for non-privileged users 2053 */ 2054 (void) secpolicy_setid_clear(vap, cr); 2055 trim_mask = (mask & (ATTR_UID|ATTR_GID)); 2056 } else { 2057 need_policy = TRUE; 2058 } 2059 } else { 2060 need_policy = TRUE; 2061 } 2062 } 2063 2064 mutex_enter(&zp->z_lock); 2065 oldva.va_mode = zp->z_mode; 2066 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2067 if (mask & ATTR_XVATTR) { 2068 /* 2069 * Update xvattr mask to include only those attributes 2070 * that are actually changing. 2071 * 2072 * the bits will be restored prior to actually setting 2073 * the attributes so the caller thinks they were set. 2074 */ 2075 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2076 if (xoap->xoa_appendonly != 2077 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2078 need_policy = TRUE; 2079 } else { 2080 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2081 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); 2082 } 2083 } 2084 2085 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2086 if (xoap->xoa_projinherit != 2087 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 2088 need_policy = TRUE; 2089 } else { 2090 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 2091 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); 2092 } 2093 } 2094 2095 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2096 if (xoap->xoa_nounlink != 2097 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2098 need_policy = TRUE; 2099 } else { 2100 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2101 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); 2102 } 2103 } 2104 2105 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2106 if (xoap->xoa_immutable != 2107 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2108 need_policy = TRUE; 2109 } else { 2110 XVA_CLR_REQ(xvap, 
XAT_IMMUTABLE); 2111 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); 2112 } 2113 } 2114 2115 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2116 if (xoap->xoa_nodump != 2117 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2118 need_policy = TRUE; 2119 } else { 2120 XVA_CLR_REQ(xvap, XAT_NODUMP); 2121 XVA_SET_REQ(tmpxvattr, XAT_NODUMP); 2122 } 2123 } 2124 2125 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2126 if (xoap->xoa_av_modified != 2127 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2128 need_policy = TRUE; 2129 } else { 2130 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2131 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); 2132 } 2133 } 2134 2135 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2136 if ((!S_ISREG(ip->i_mode) && 2137 xoap->xoa_av_quarantined) || 2138 xoap->xoa_av_quarantined != 2139 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2140 need_policy = TRUE; 2141 } else { 2142 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2143 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); 2144 } 2145 } 2146 2147 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2148 mutex_exit(&zp->z_lock); 2149 err = SET_ERROR(EPERM); 2150 goto out3; 2151 } 2152 2153 if (need_policy == FALSE && 2154 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2155 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2156 need_policy = TRUE; 2157 } 2158 } 2159 2160 mutex_exit(&zp->z_lock); 2161 2162 if (mask & ATTR_MODE) { 2163 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, 2164 mnt_ns) == 0) { 2165 err = secpolicy_setid_setsticky_clear(ip, vap, 2166 &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); 2167 if (err) 2168 goto out3; 2169 trim_mask |= ATTR_MODE; 2170 } else { 2171 need_policy = TRUE; 2172 } 2173 } 2174 2175 if (need_policy) { 2176 /* 2177 * If trim_mask is set then take ownership 2178 * has been granted or write_acl is present and user 2179 * has the ability to modify mode. In that case remove 2180 * UID|GID and or MODE from mask so that 2181 * secpolicy_vnode_setattr() doesn't revoke it. 
2182 */ 2183 2184 if (trim_mask) { 2185 saved_mask = vap->va_mask; 2186 vap->va_mask &= ~trim_mask; 2187 } 2188 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, 2189 zfs_zaccess_unix, zp); 2190 if (err) 2191 goto out3; 2192 2193 if (trim_mask) 2194 vap->va_mask |= saved_mask; 2195 } 2196 2197 /* 2198 * secpolicy_vnode_setattr, or take ownership may have 2199 * changed va_mask 2200 */ 2201 mask = vap->va_mask; 2202 2203 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { 2204 handle_eadir = B_TRUE; 2205 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2206 &xattr_obj, sizeof (xattr_obj)); 2207 2208 if (err == 0 && xattr_obj) { 2209 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); 2210 if (err) 2211 goto out2; 2212 } 2213 if (mask & ATTR_UID) { 2214 new_kuid = zfs_fuid_create(zfsvfs, 2215 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2216 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && 2217 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 2218 new_kuid)) { 2219 if (attrzp) 2220 zrele(attrzp); 2221 err = SET_ERROR(EDQUOT); 2222 goto out2; 2223 } 2224 } 2225 2226 if (mask & ATTR_GID) { 2227 new_kgid = zfs_fuid_create(zfsvfs, 2228 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); 2229 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && 2230 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 2231 new_kgid)) { 2232 if (attrzp) 2233 zrele(attrzp); 2234 err = SET_ERROR(EDQUOT); 2235 goto out2; 2236 } 2237 } 2238 2239 if (projid != ZFS_INVALID_PROJID && 2240 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 2241 if (attrzp) 2242 zrele(attrzp); 2243 err = EDQUOT; 2244 goto out2; 2245 } 2246 } 2247 tx = dmu_tx_create(os); 2248 2249 if (mask & ATTR_MODE) { 2250 uint64_t pmode = zp->z_mode; 2251 uint64_t acl_obj; 2252 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2253 2254 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && 2255 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 2256 err = EPERM; 2257 goto out; 2258 } 2259 2260 if ((err = 
zfs_acl_chmod_setattr(zp, &aclp, new_mode))) 2261 goto out; 2262 2263 mutex_enter(&zp->z_lock); 2264 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 2265 /* 2266 * Are we upgrading ACL from old V0 format 2267 * to V1 format? 2268 */ 2269 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 2270 zfs_znode_acl_version(zp) == 2271 ZFS_ACL_VERSION_INITIAL) { 2272 dmu_tx_hold_free(tx, acl_obj, 0, 2273 DMU_OBJECT_END); 2274 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2275 0, aclp->z_acl_bytes); 2276 } else { 2277 dmu_tx_hold_write(tx, acl_obj, 0, 2278 aclp->z_acl_bytes); 2279 } 2280 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2281 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2282 0, aclp->z_acl_bytes); 2283 } 2284 mutex_exit(&zp->z_lock); 2285 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2286 } else { 2287 if (((mask & ATTR_XVATTR) && 2288 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2289 (projid != ZFS_INVALID_PROJID && 2290 !(zp->z_pflags & ZFS_PROJID))) 2291 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2292 else 2293 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2294 } 2295 2296 if (attrzp) { 2297 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 2298 } 2299 2300 fuid_dirtied = zfsvfs->z_fuid_dirty; 2301 if (fuid_dirtied) 2302 zfs_fuid_txhold(zfsvfs, tx); 2303 2304 zfs_sa_upgrade_txholds(tx, zp); 2305 2306 err = dmu_tx_assign(tx, TXG_WAIT); 2307 if (err) 2308 goto out; 2309 2310 count = 0; 2311 /* 2312 * Set each attribute requested. 2313 * We group settings according to the locks they need to acquire. 2314 * 2315 * Note: you cannot set ctime directly, although it will be 2316 * updated as a side-effect of calling this function. 2317 */ 2318 2319 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 2320 /* 2321 * For the existed object that is upgraded from old system, 2322 * its on-disk layout has no slot for the project ID attribute. 2323 * But quota accounting logic needs to access related slots by 2324 * offset directly. 
So we need to adjust old objects' layout 2325 * to make the project ID to some unified and fixed offset. 2326 */ 2327 if (attrzp) 2328 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 2329 if (err == 0) 2330 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 2331 2332 if (unlikely(err == EEXIST)) 2333 err = 0; 2334 else if (err != 0) 2335 goto out; 2336 else 2337 projid = ZFS_INVALID_PROJID; 2338 } 2339 2340 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2341 mutex_enter(&zp->z_acl_lock); 2342 mutex_enter(&zp->z_lock); 2343 2344 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 2345 &zp->z_pflags, sizeof (zp->z_pflags)); 2346 2347 if (attrzp) { 2348 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2349 mutex_enter(&attrzp->z_acl_lock); 2350 mutex_enter(&attrzp->z_lock); 2351 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2352 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 2353 sizeof (attrzp->z_pflags)); 2354 if (projid != ZFS_INVALID_PROJID) { 2355 attrzp->z_projid = projid; 2356 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2357 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 2358 sizeof (attrzp->z_projid)); 2359 } 2360 } 2361 2362 if (mask & (ATTR_UID|ATTR_GID)) { 2363 2364 if (mask & ATTR_UID) { 2365 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); 2366 new_uid = zfs_uid_read(ZTOI(zp)); 2367 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2368 &new_uid, sizeof (new_uid)); 2369 if (attrzp) { 2370 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2371 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 2372 sizeof (new_uid)); 2373 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); 2374 } 2375 } 2376 2377 if (mask & ATTR_GID) { 2378 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); 2379 new_gid = zfs_gid_read(ZTOI(zp)); 2380 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 2381 NULL, &new_gid, sizeof (new_gid)); 2382 if (attrzp) { 2383 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2384 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 2385 sizeof (new_gid)); 2386 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); 2387 } 2388 } 2389 if 
(!(mask & ATTR_MODE)) { 2390 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 2391 NULL, &new_mode, sizeof (new_mode)); 2392 new_mode = zp->z_mode; 2393 } 2394 err = zfs_acl_chown_setattr(zp); 2395 ASSERT(err == 0); 2396 if (attrzp) { 2397 err = zfs_acl_chown_setattr(attrzp); 2398 ASSERT(err == 0); 2399 } 2400 } 2401 2402 if (mask & ATTR_MODE) { 2403 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 2404 &new_mode, sizeof (new_mode)); 2405 zp->z_mode = ZTOI(zp)->i_mode = new_mode; 2406 ASSERT3P(aclp, !=, NULL); 2407 err = zfs_aclset_common(zp, aclp, cr, tx); 2408 ASSERT0(err); 2409 if (zp->z_acl_cached) 2410 zfs_acl_free(zp->z_acl_cached); 2411 zp->z_acl_cached = aclp; 2412 aclp = NULL; 2413 } 2414 2415 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { 2416 zp->z_atime_dirty = B_FALSE; 2417 ZFS_TIME_ENCODE(&ip->i_atime, atime); 2418 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 2419 &atime, sizeof (atime)); 2420 } 2421 2422 if (mask & (ATTR_MTIME | ATTR_SIZE)) { 2423 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 2424 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( 2425 vap->va_mtime, ZTOI(zp)); 2426 2427 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 2428 mtime, sizeof (mtime)); 2429 } 2430 2431 if (mask & (ATTR_CTIME | ATTR_SIZE)) { 2432 ZFS_TIME_ENCODE(&vap->va_ctime, ctime); 2433 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, 2434 ZTOI(zp)); 2435 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 2436 ctime, sizeof (ctime)); 2437 } 2438 2439 if (projid != ZFS_INVALID_PROJID) { 2440 zp->z_projid = projid; 2441 SA_ADD_BULK_ATTR(bulk, count, 2442 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 2443 sizeof (zp->z_projid)); 2444 } 2445 2446 if (attrzp && mask) { 2447 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2448 SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 2449 sizeof (ctime)); 2450 } 2451 2452 /* 2453 * Do this after setting timestamps to prevent timestamp 2454 * update from toggling bit 2455 */ 2456 2457 if (xoap && (mask & 
ATTR_XVATTR)) { 2458 2459 /* 2460 * restore trimmed off masks 2461 * so that return masks can be set for caller. 2462 */ 2463 2464 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { 2465 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2466 } 2467 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { 2468 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2469 } 2470 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { 2471 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2472 } 2473 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { 2474 XVA_SET_REQ(xvap, XAT_NODUMP); 2475 } 2476 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { 2477 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2478 } 2479 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { 2480 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2481 } 2482 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { 2483 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 2484 } 2485 2486 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 2487 ASSERT(S_ISREG(ip->i_mode)); 2488 2489 zfs_xvattr_set(zp, xvap, tx); 2490 } 2491 2492 if (fuid_dirtied) 2493 zfs_fuid_sync(zfsvfs, tx); 2494 2495 if (mask != 0) 2496 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2497 2498 mutex_exit(&zp->z_lock); 2499 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2500 mutex_exit(&zp->z_acl_lock); 2501 2502 if (attrzp) { 2503 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2504 mutex_exit(&attrzp->z_acl_lock); 2505 mutex_exit(&attrzp->z_lock); 2506 } 2507 out: 2508 if (err == 0 && xattr_count > 0) { 2509 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 2510 xattr_count, tx); 2511 ASSERT(err2 == 0); 2512 } 2513 2514 if (aclp) 2515 zfs_acl_free(aclp); 2516 2517 if (fuidp) { 2518 zfs_fuid_info_free(fuidp); 2519 fuidp = NULL; 2520 } 2521 2522 if (err) { 2523 dmu_tx_abort(tx); 2524 if (attrzp) 2525 zrele(attrzp); 2526 if (err == ERESTART) 2527 goto top; 2528 } else { 2529 if (count > 0) 2530 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2531 dmu_tx_commit(tx); 2532 if (attrzp) { 2533 if (err2 == 0 && handle_eadir) 2534 err = zfs_setattr_dir(attrzp); 2535 
zrele(attrzp);
		}
		zfs_znode_update_vfs(zp);
	}

out2:
	if (os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

out3:
	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
	kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
	kmem_free(tmpxvattr, sizeof (xvattr_t));
	zfs_exit(zfsvfs, FTAG);
	return (err);
}

/*
 * One node of the singly linked list of parent locks that is built by
 * zfs_rename_lock() and torn down by zfs_rename_unlock().
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
 *
 *	IN/OUT:	zlpp	- Head of the lock list.  Every node is unlocked and
 *			  freed; *zlpp is NULL when this function returns.
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		/* Only nodes added on read-lock passes carry a held znode. */
		if (zl->zl_znode != NULL)
			zfs_zrele_async(zl->zl_znode);
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed.
 *		tdzp	- Target directory; the walk starts here.
 *		sdzp	- Source directory; the walk stops when it is reached.
 *
 *	IN/OUT:	zlpp	- Head of the list of locks taken.  Each lock acquired
 *			  is pushed onto the front of this list.
 *
 *	RETURN:	0 when the root or sdzp is reached,
 *		EINVAL when tdzp is szp or one of its descendants,
 *		or the error returned by zfs_zget().
 *
 * This function does not release anything on an error return: the locks
 * taken so far stay on *zlpp and must be dropped by the caller with
 * zfs_rename_unlock().
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = ZTOZSB(zp)->z_root;
	uint64_t	oidp = zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 *
				 * rw is RW_READER only after at least one
				 * pass has completed, so zl is the node most
				 * recently pushed, i.e. the head of *zlpp.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock just taken at the front of the list. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (oidp == szp->z_id)		/* We're a descendant of szp */
			return (SET_ERROR(EINVAL));

		if (oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
			if (error)
				return (error);
			/* The hold from zfs_zget() is dropped at unlock time */
			zl->zl_znode = zp;
		}
		/* Step up: oidp becomes the object id of zp's parent. */
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
		    &oidp, sizeof (oidp));
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdzp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdzp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		flags	- case flags
 *		rflags  - RENAME_* flags
 *		wo_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
 *		mnt_ns	- user namespace of the mount
 *
 *	RETURN:	0 on success, error code on failure.
2664 * 2665 * Timestamps: 2666 * sdzp,tdzp - ctime|mtime updated 2667 */ 2668 int 2669 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, 2670 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) 2671 { 2672 znode_t *szp, *tzp; 2673 zfsvfs_t *zfsvfs = ZTOZSB(sdzp); 2674 zilog_t *zilog; 2675 zfs_dirlock_t *sdl, *tdl; 2676 dmu_tx_t *tx; 2677 zfs_zlock_t *zl; 2678 int cmp, serr, terr; 2679 int error = 0; 2680 int zflg = 0; 2681 boolean_t waited = B_FALSE; 2682 /* Needed for whiteout inode creation. */ 2683 boolean_t fuid_dirtied; 2684 zfs_acl_ids_t acl_ids; 2685 boolean_t have_acl = B_FALSE; 2686 znode_t *wzp = NULL; 2687 2688 2689 if (snm == NULL || tnm == NULL) 2690 return (SET_ERROR(EINVAL)); 2691 2692 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2693 return (SET_ERROR(EINVAL)); 2694 2695 /* Already checked by Linux VFS, but just to make sure. */ 2696 if (rflags & RENAME_EXCHANGE && 2697 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) 2698 return (SET_ERROR(EINVAL)); 2699 2700 /* 2701 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the 2702 * right kind of vattr_t for the whiteout file. These are set 2703 * internally by ZFS so should never be incorrect. 2704 */ 2705 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); 2706 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); 2707 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); 2708 2709 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) 2710 return (error); 2711 zilog = zfsvfs->z_log; 2712 2713 if ((error = zfs_verify_zp(tdzp)) != 0) { 2714 zfs_exit(zfsvfs, FTAG); 2715 return (error); 2716 } 2717 2718 /* 2719 * We check i_sb because snapshots and the ctldir must have different 2720 * super blocks. 
2721 */ 2722 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || 2723 zfsctl_is_node(ZTOI(tdzp))) { 2724 zfs_exit(zfsvfs, FTAG); 2725 return (SET_ERROR(EXDEV)); 2726 } 2727 2728 if (zfsvfs->z_utf8 && u8_validate(tnm, 2729 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2730 zfs_exit(zfsvfs, FTAG); 2731 return (SET_ERROR(EILSEQ)); 2732 } 2733 2734 if (flags & FIGNORECASE) 2735 zflg |= ZCILOOK; 2736 2737 top: 2738 szp = NULL; 2739 tzp = NULL; 2740 zl = NULL; 2741 2742 /* 2743 * This is to prevent the creation of links into attribute space 2744 * by renaming a linked file into/outof an attribute directory. 2745 * See the comment in zfs_link() for why this is considered bad. 2746 */ 2747 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 2748 zfs_exit(zfsvfs, FTAG); 2749 return (SET_ERROR(EINVAL)); 2750 } 2751 2752 /* 2753 * Lock source and target directory entries. To prevent deadlock, 2754 * a lock ordering must be defined. We lock the directory with 2755 * the smallest object id first, or if it's a tie, the one with 2756 * the lexically first name. 2757 */ 2758 if (sdzp->z_id < tdzp->z_id) { 2759 cmp = -1; 2760 } else if (sdzp->z_id > tdzp->z_id) { 2761 cmp = 1; 2762 } else { 2763 /* 2764 * First compare the two name arguments without 2765 * considering any case folding. 2766 */ 2767 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 2768 2769 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 2770 ASSERT(error == 0 || !zfsvfs->z_utf8); 2771 if (cmp == 0) { 2772 /* 2773 * POSIX: "If the old argument and the new argument 2774 * both refer to links to the same existing file, 2775 * the rename() function shall return successfully 2776 * and perform no other action." 2777 */ 2778 zfs_exit(zfsvfs, FTAG); 2779 return (0); 2780 } 2781 /* 2782 * If the file system is case-folding, then we may 2783 * have some more checking to do. 
A case-folding file 2784 * system is either supporting mixed case sensitivity 2785 * access or is completely case-insensitive. Note 2786 * that the file system is always case preserving. 2787 * 2788 * In mixed sensitivity mode case sensitive behavior 2789 * is the default. FIGNORECASE must be used to 2790 * explicitly request case insensitive behavior. 2791 * 2792 * If the source and target names provided differ only 2793 * by case (e.g., a request to rename 'tim' to 'Tim'), 2794 * we will treat this as a special case in the 2795 * case-insensitive mode: as long as the source name 2796 * is an exact match, we will allow this to proceed as 2797 * a name-change request. 2798 */ 2799 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 2800 (zfsvfs->z_case == ZFS_CASE_MIXED && 2801 flags & FIGNORECASE)) && 2802 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 2803 &error) == 0) { 2804 /* 2805 * case preserving rename request, require exact 2806 * name matches 2807 */ 2808 zflg |= ZCIEXACT; 2809 zflg &= ~ZCILOOK; 2810 } 2811 } 2812 2813 /* 2814 * If the source and destination directories are the same, we should 2815 * grab the z_name_lock of that directory only once. 2816 */ 2817 if (sdzp == tdzp) { 2818 zflg |= ZHAVELOCK; 2819 rw_enter(&sdzp->z_name_lock, RW_READER); 2820 } 2821 2822 if (cmp < 0) { 2823 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 2824 ZEXISTS | zflg, NULL, NULL); 2825 terr = zfs_dirent_lock(&tdl, 2826 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 2827 } else { 2828 terr = zfs_dirent_lock(&tdl, 2829 tdzp, tnm, &tzp, zflg, NULL, NULL); 2830 serr = zfs_dirent_lock(&sdl, 2831 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 2832 NULL, NULL); 2833 } 2834 2835 if (serr) { 2836 /* 2837 * Source entry invalid or not there. 
2838 */ 2839 if (!terr) { 2840 zfs_dirent_unlock(tdl); 2841 if (tzp) 2842 zrele(tzp); 2843 } 2844 2845 if (sdzp == tdzp) 2846 rw_exit(&sdzp->z_name_lock); 2847 2848 if (strcmp(snm, "..") == 0) 2849 serr = EINVAL; 2850 zfs_exit(zfsvfs, FTAG); 2851 return (serr); 2852 } 2853 if (terr) { 2854 zfs_dirent_unlock(sdl); 2855 zrele(szp); 2856 2857 if (sdzp == tdzp) 2858 rw_exit(&sdzp->z_name_lock); 2859 2860 if (strcmp(tnm, "..") == 0) 2861 terr = EINVAL; 2862 zfs_exit(zfsvfs, FTAG); 2863 return (terr); 2864 } 2865 2866 /* 2867 * If we are using project inheritance, means if the directory has 2868 * ZFS_PROJINHERIT set, then its descendant directories will inherit 2869 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 2870 * such case, we only allow renames into our tree when the project 2871 * IDs are the same. 2872 */ 2873 if (tdzp->z_pflags & ZFS_PROJINHERIT && 2874 tdzp->z_projid != szp->z_projid) { 2875 error = SET_ERROR(EXDEV); 2876 goto out; 2877 } 2878 2879 /* 2880 * Must have write access at the source to remove the old entry 2881 * and write access at the target to create the new entry. 2882 * Note that if target and source are the same, this can be 2883 * done in a single check. 2884 */ 2885 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) 2886 goto out; 2887 2888 if (S_ISDIR(ZTOI(szp)->i_mode)) { 2889 /* 2890 * Check to make sure rename is valid. 2891 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2892 */ 2893 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) 2894 goto out; 2895 } 2896 2897 /* 2898 * Does target exist? 2899 */ 2900 if (tzp) { 2901 if (rflags & RENAME_NOREPLACE) { 2902 error = SET_ERROR(EEXIST); 2903 goto out; 2904 } 2905 /* 2906 * Source and target must be the same type (unless exchanging). 
2907 */ 2908 if (!(rflags & RENAME_EXCHANGE)) { 2909 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; 2910 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; 2911 2912 if (s_is_dir != t_is_dir) { 2913 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); 2914 goto out; 2915 } 2916 } 2917 /* 2918 * POSIX dictates that when the source and target 2919 * entries refer to the same file object, rename 2920 * must do nothing and exit without error. 2921 */ 2922 if (szp->z_id == tzp->z_id) { 2923 error = 0; 2924 goto out; 2925 } 2926 } else if (rflags & RENAME_EXCHANGE) { 2927 /* Target must exist for RENAME_EXCHANGE. */ 2928 error = SET_ERROR(ENOENT); 2929 goto out; 2930 } 2931 2932 /* Set up inode creation for RENAME_WHITEOUT. */ 2933 if (rflags & RENAME_WHITEOUT) { 2934 /* 2935 * Whiteout files are not regular files or directories, so to 2936 * match zfs_create() we do not inherit the project id. 2937 */ 2938 uint64_t wo_projid = ZFS_DEFAULT_PROJID; 2939 2940 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); 2941 if (error) 2942 goto out; 2943 2944 if (!have_acl) { 2945 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, 2946 &acl_ids, mnt_ns); 2947 if (error) 2948 goto out; 2949 have_acl = B_TRUE; 2950 } 2951 2952 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { 2953 error = SET_ERROR(EDQUOT); 2954 goto out; 2955 } 2956 } 2957 2958 tx = dmu_tx_create(zfsvfs->z_os); 2959 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 2960 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 2961 dmu_tx_hold_zap(tx, sdzp->z_id, 2962 (rflags & RENAME_EXCHANGE) ? 
TRUE : FALSE, snm); 2963 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 2964 if (sdzp != tdzp) { 2965 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 2966 zfs_sa_upgrade_txholds(tx, tdzp); 2967 } 2968 if (tzp) { 2969 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 2970 zfs_sa_upgrade_txholds(tx, tzp); 2971 } 2972 if (rflags & RENAME_WHITEOUT) { 2973 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2974 ZFS_SA_BASE_ATTR_SIZE); 2975 2976 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); 2977 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 2978 if (!zfsvfs->z_use_sa && 2979 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2980 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2981 0, acl_ids.z_aclp->z_acl_bytes); 2982 } 2983 } 2984 fuid_dirtied = zfsvfs->z_fuid_dirty; 2985 if (fuid_dirtied) 2986 zfs_fuid_txhold(zfsvfs, tx); 2987 zfs_sa_upgrade_txholds(tx, szp); 2988 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2989 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 2990 if (error) { 2991 if (zl != NULL) 2992 zfs_rename_unlock(&zl); 2993 zfs_dirent_unlock(sdl); 2994 zfs_dirent_unlock(tdl); 2995 2996 if (sdzp == tdzp) 2997 rw_exit(&sdzp->z_name_lock); 2998 2999 if (error == ERESTART) { 3000 waited = B_TRUE; 3001 dmu_tx_wait(tx); 3002 dmu_tx_abort(tx); 3003 zrele(szp); 3004 if (tzp) 3005 zrele(tzp); 3006 goto top; 3007 } 3008 dmu_tx_abort(tx); 3009 zrele(szp); 3010 if (tzp) 3011 zrele(tzp); 3012 zfs_exit(zfsvfs, FTAG); 3013 return (error); 3014 } 3015 3016 /* 3017 * Unlink the source. 3018 */ 3019 szp->z_pflags |= ZFS_AV_MODIFIED; 3020 if (tdzp->z_pflags & ZFS_PROJINHERIT) 3021 szp->z_pflags |= ZFS_PROJINHERIT; 3022 3023 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3024 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3025 VERIFY0(error); 3026 3027 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3028 if (error) 3029 goto commit; 3030 3031 /* 3032 * Unlink the target. 
3033 */ 3034 if (tzp) { 3035 int tzflg = zflg; 3036 3037 if (rflags & RENAME_EXCHANGE) { 3038 /* This inode will be re-linked soon. */ 3039 tzflg |= ZRENAMING; 3040 3041 tzp->z_pflags |= ZFS_AV_MODIFIED; 3042 if (sdzp->z_pflags & ZFS_PROJINHERIT) 3043 tzp->z_pflags |= ZFS_PROJINHERIT; 3044 3045 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3046 (void *)&tzp->z_pflags, sizeof (uint64_t), tx); 3047 ASSERT0(error); 3048 } 3049 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); 3050 if (error) 3051 goto commit_link_szp; 3052 } 3053 3054 /* 3055 * Create the new target links: 3056 * * We always link the target. 3057 * * RENAME_EXCHANGE: Link the old target to the source. 3058 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. 3059 */ 3060 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3061 if (error) { 3062 /* 3063 * If we have removed the existing target, a subsequent call to 3064 * zfs_link_create() to add back the same entry, but with a new 3065 * dnode (szp), should not fail. 3066 */ 3067 ASSERT3P(tzp, ==, NULL); 3068 goto commit_link_tzp; 3069 } 3070 3071 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3072 case RENAME_EXCHANGE: 3073 error = zfs_link_create(sdl, tzp, tx, ZRENAMING); 3074 /* 3075 * The same argument as zfs_link_create() failing for 3076 * szp applies here, since the source directory must 3077 * have had an entry we are replacing. 3078 */ 3079 ASSERT0(error); 3080 if (error) 3081 goto commit_unlink_td_szp; 3082 break; 3083 case RENAME_WHITEOUT: 3084 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); 3085 error = zfs_link_create(sdl, wzp, tx, ZNEW); 3086 if (error) { 3087 zfs_znode_delete(wzp, tx); 3088 remove_inode_hash(ZTOI(wzp)); 3089 goto commit_unlink_td_szp; 3090 } 3091 break; 3092 } 3093 3094 if (fuid_dirtied) 3095 zfs_fuid_sync(zfsvfs, tx); 3096 3097 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3098 case RENAME_EXCHANGE: 3099 zfs_log_rename_exchange(zilog, tx, 3100 (flags & FIGNORECASE ? 
TX_CI : 0), sdzp, sdl->dl_name, 3101 tdzp, tdl->dl_name, szp); 3102 break; 3103 case RENAME_WHITEOUT: 3104 zfs_log_rename_whiteout(zilog, tx, 3105 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3106 tdzp, tdl->dl_name, szp, wzp); 3107 break; 3108 default: 3109 ASSERT0(rflags & ~RENAME_NOREPLACE); 3110 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), 3111 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3112 break; 3113 } 3114 3115 commit: 3116 dmu_tx_commit(tx); 3117 out: 3118 if (have_acl) 3119 zfs_acl_ids_free(&acl_ids); 3120 3121 zfs_znode_update_vfs(sdzp); 3122 if (sdzp == tdzp) 3123 rw_exit(&sdzp->z_name_lock); 3124 3125 if (sdzp != tdzp) 3126 zfs_znode_update_vfs(tdzp); 3127 3128 zfs_znode_update_vfs(szp); 3129 zrele(szp); 3130 if (wzp) { 3131 zfs_znode_update_vfs(wzp); 3132 zrele(wzp); 3133 } 3134 if (tzp) { 3135 zfs_znode_update_vfs(tzp); 3136 zrele(tzp); 3137 } 3138 3139 if (zl != NULL) 3140 zfs_rename_unlock(&zl); 3141 3142 zfs_dirent_unlock(sdl); 3143 zfs_dirent_unlock(tdl); 3144 3145 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3146 zil_commit(zilog, 0); 3147 3148 zfs_exit(zfsvfs, FTAG); 3149 return (error); 3150 3151 /* 3152 * Clean-up path for broken link state. 3153 * 3154 * At this point we are in a (very) bad state, so we need to do our 3155 * best to correct the state. In particular, all of the nlinks are 3156 * wrong because we were destroying and creating links with ZRENAMING. 3157 * 3158 * In some form, all of these operations have to resolve the state: 3159 * 3160 * * link_destroy() *must* succeed. Fortunately, this is very likely 3161 * since we only just created it. 3162 * 3163 * * link_create()s are allowed to fail (though they shouldn't because 3164 * we only just unlinked them and are putting the entries back 3165 * during clean-up). 
But if they fail, we can just forcefully drop 3166 * the nlink value to (at the very least) avoid broken nlink values 3167 * -- though in the case of non-empty directories we will have to 3168 * panic (otherwise we'd have a leaked directory with a broken ..). 3169 */ 3170 commit_unlink_td_szp: 3171 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); 3172 commit_link_tzp: 3173 if (tzp) { 3174 if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) 3175 VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); 3176 } 3177 commit_link_szp: 3178 if (zfs_link_create(sdl, szp, tx, ZRENAMING)) 3179 VERIFY0(zfs_drop_nlink(szp, tx, NULL)); 3180 goto commit; 3181 } 3182 3183 /* 3184 * Insert the indicated symbolic reference entry into the directory. 3185 * 3186 * IN: dzp - Directory to contain new symbolic link. 3187 * name - Name of directory entry in dip. 3188 * vap - Attributes of new entry. 3189 * link - Name for new symlink entry. 3190 * cr - credentials of caller. 3191 * flags - case flags 3192 * mnt_ns - user namespace of the mount 3193 * 3194 * OUT: zpp - Znode for new symbolic link. 3195 * 3196 * RETURN: 0 on success, error code on failure. 
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 */
int
zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
    znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns)
{
	znode_t		*zp;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	boolean_t	waited = B_FALSE;

	ASSERT(S_ISLNK(vap->va_mode));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	if (len > MAXPATHLEN) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENAMETOOLONG));
	}

	/*
	 * acl_ids is created once, before the retry label, and is reused on
	 * every ERESTART retry.  Each return below this point frees it.
	 */
	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
top:
	*zpp = NULL;

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EDQUOT));
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		/* The dirent lock is dropped before waiting or failing. */
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datasets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * NOTE(review): the sa_update() result stored in error here is
	 * overwritten by zfs_link_create() below without being checked.
	 */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);
	mutex_exit(&zp->z_lock);

	/* The size of a symlink is the length of its target string. */
	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	error = zfs_link_create(dl, zp, tx, ZNEW);
	if (error != 0) {
		/* Undo the node creation inside the same transaction. */
		zfs_znode_delete(zp, tx);
		remove_inode_hash(ZTOI(zp));
	} else {
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);

		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
	}

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error == 0) {
		/* The new znode is handed to the caller only on success. */
		*zpp = zp;

		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
			zil_commit(zilog, 0);
	} else {
		zrele(zp);
	}

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by ip.
 *
 *	IN:	ip	- inode of symbolic link
 *		uio	- structure to contain the link path.
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - atime updated
 */
int
zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
{
	(void) cr;
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	int		error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* The link target is read under z_lock from either storage form. */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);
	mutex_exit(&zp->z_lock);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Insert a new entry into directory tdzp referencing szp.
 *
 *	IN:	tdzp	- Directory to contain new entry.
 *		szp	- znode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		flags	- case flags.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdzp - ctime|mtime updated
 *	 szp - ctime updated
 *
 * When HAVE_TMPFILE is defined and szp is an unlinked but linkable inode
 * (i_nlink == 0 with I_LINKABLE set), the link is not logged to the ZIL;
 * instead szp is removed from z_unlinkedobj and the transaction group is
 * waited on, unless sync is disabled.
 */
int
zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
    int flags)
{
	struct inode *sip = ZTOI(szp);
	znode_t		*tzp;
	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;
	boolean_t	waited = B_FALSE;
	boolean_t	is_tmpfile = 0;
	uint64_t	txg;
#ifdef HAVE_TMPFILE
	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
#endif
	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (S_ISDIR(sip->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((error = zfs_verify_zp(szp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * If we are using project inheritance, means if the directory has
	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
	 * such case, we only allow hard link creation in our tree when the
	 * project IDs are the same.
	 */
	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
	    tdzp->z_projid != szp->z_projid) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * We check i_sb because snapshots and the ctldir must have different
	 * super blocks.
	 */
	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* A caller that is not the owner must pass the basic link policy. */
	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
	    cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
	    kcred->user_ns))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
	if (is_tmpfile)
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, tdzp);
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		/* The dirent lock is dropped before waiting or failing. */
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	/* unmark z_unlinked so zfs_link_create will not reject */
	if (is_tmpfile)
		szp->z_unlinked = B_FALSE;
	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		/*
		 * tmpfile is created to be in z_unlinkedobj, so remove it.
		 * Also, we don't log in ZIL, because all previous file
		 * operation on the tmpfile are ignored by ZIL. Instead we
		 * always wait for txg to sync to make sure all previous
		 * operation are sync safe.
		 */
		if (is_tmpfile) {
			VERIFY(zap_remove_int(zfsvfs->z_os,
			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
		} else {
			if (flags & FIGNORECASE)
				txtype |= TX_CI;
			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
		}
	} else if (is_tmpfile) {
		/* restore z_unlinked since when linking failed */
		szp->z_unlinked = B_TRUE;
	}
	/* Capture the txg before commit; it is needed for the wait below. */
	txg = dmu_tx_get_txg(tx);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);

	zfs_znode_update_vfs(tdzp);
	zfs_znode_update_vfs(szp);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Commit callback for a page whose writeback the caller waits on
 * synchronously: clear the page's error flag and end its writeback.
 * arg is the struct page.
 */
static void
zfs_putpage_sync_commit_cb(void *arg)
{
	struct page *pp = arg;

	ClearPageError(pp);
	end_page_writeback(pp);
}

/*
 * Commit callback for an asynchronously written page: as the sync variant,
 * and additionally decrement the owning znode's z_async_writes_cnt.
 * arg is the struct page.
 */
static void
zfs_putpage_async_commit_cb(void *arg)
{
	struct page *pp = arg;
	znode_t *zp = ITOZ(pp->mapping->host);

	ClearPageError(pp);
	end_page_writeback(pp);
	atomic_dec_32(&zp->z_async_writes_cnt);
}

/*
 * Push a page out to disk, once the page is on stable storage the
 * registered commit callback will be run as notification of completion.
 *
 *	IN:	ip	 - page mapped for inode.
 *		pp	 - page to push (page is locked)
 *		wbc	 - writeback control data
 *		for_sync - does the caller intend to wait synchronously for the
 *			   page writeback to complete?
3619 * 3620 * RETURN: 0 if success 3621 * error code if failure 3622 * 3623 * Timestamps: 3624 * ip - ctime|mtime updated 3625 */ 3626 int 3627 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, 3628 boolean_t for_sync) 3629 { 3630 znode_t *zp = ITOZ(ip); 3631 zfsvfs_t *zfsvfs = ITOZSB(ip); 3632 loff_t offset; 3633 loff_t pgoff; 3634 unsigned int pglen; 3635 dmu_tx_t *tx; 3636 caddr_t va; 3637 int err = 0; 3638 uint64_t mtime[2], ctime[2]; 3639 sa_bulk_attr_t bulk[3]; 3640 int cnt = 0; 3641 struct address_space *mapping; 3642 3643 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3644 return (err); 3645 3646 ASSERT(PageLocked(pp)); 3647 3648 pgoff = page_offset(pp); /* Page byte-offset in file */ 3649 offset = i_size_read(ip); /* File length in bytes */ 3650 pglen = MIN(PAGE_SIZE, /* Page length in bytes */ 3651 P2ROUNDUP(offset, PAGE_SIZE)-pgoff); 3652 3653 /* Page is beyond end of file */ 3654 if (pgoff >= offset) { 3655 unlock_page(pp); 3656 zfs_exit(zfsvfs, FTAG); 3657 return (0); 3658 } 3659 3660 /* Truncate page length to end of file */ 3661 if (pgoff + pglen > offset) 3662 pglen = offset - pgoff; 3663 3664 #if 0 3665 /* 3666 * FIXME: Allow mmap writes past its quota. The correct fix 3667 * is to register a page_mkwrite() handler to count the page 3668 * against its quota when it is about to be dirtied. 3669 */ 3670 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, 3671 KUID_TO_SUID(ip->i_uid)) || 3672 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, 3673 KGID_TO_SGID(ip->i_gid)) || 3674 (zp->z_projid != ZFS_DEFAULT_PROJID && 3675 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 3676 zp->z_projid))) { 3677 err = EDQUOT; 3678 } 3679 #endif 3680 3681 /* 3682 * The ordering here is critical and must adhere to the following 3683 * rules in order to avoid deadlocking in either zfs_read() or 3684 * zfs_free_range() due to a lock inversion. 3685 * 3686 * 1) The page must be unlocked prior to acquiring the range lock. 
3687 * This is critical because zfs_read() calls find_lock_page() 3688 * which may block on the page lock while holding the range lock. 3689 * 3690 * 2) Before setting or clearing write back on a page the range lock 3691 * must be held in order to prevent a lock inversion with the 3692 * zfs_free_range() function. 3693 * 3694 * This presents a problem because upon entering this function the 3695 * page lock is already held. To safely acquire the range lock the 3696 * page lock must be dropped. This creates a window where another 3697 * process could truncate, invalidate, dirty, or write out the page. 3698 * 3699 * Therefore, after successfully reacquiring the range and page locks 3700 * the current page state is checked. In the common case everything 3701 * will be as is expected and it can be written out. However, if 3702 * the page state has changed it must be handled accordingly. 3703 */ 3704 mapping = pp->mapping; 3705 redirty_page_for_writepage(wbc, pp); 3706 unlock_page(pp); 3707 3708 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, 3709 pgoff, pglen, RL_WRITER); 3710 lock_page(pp); 3711 3712 /* Page mapping changed or it was no longer dirty, we're done */ 3713 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { 3714 unlock_page(pp); 3715 zfs_rangelock_exit(lr); 3716 zfs_exit(zfsvfs, FTAG); 3717 return (0); 3718 } 3719 3720 /* Another process started write block if required */ 3721 if (PageWriteback(pp)) { 3722 unlock_page(pp); 3723 zfs_rangelock_exit(lr); 3724 3725 if (wbc->sync_mode != WB_SYNC_NONE) { 3726 /* 3727 * Speed up any non-sync page writebacks since 3728 * they may take several seconds to complete. 3729 * Refer to the comment in zpl_fsync() (when 3730 * HAVE_FSYNC_RANGE is defined) for details. 
3731 */ 3732 if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { 3733 zil_commit(zfsvfs->z_log, zp->z_id); 3734 } 3735 3736 if (PageWriteback(pp)) 3737 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT 3738 folio_wait_bit(page_folio(pp), PG_writeback); 3739 #else 3740 wait_on_page_bit(pp, PG_writeback); 3741 #endif 3742 } 3743 3744 zfs_exit(zfsvfs, FTAG); 3745 return (0); 3746 } 3747 3748 /* Clear the dirty flag the required locks are held */ 3749 if (!clear_page_dirty_for_io(pp)) { 3750 unlock_page(pp); 3751 zfs_rangelock_exit(lr); 3752 zfs_exit(zfsvfs, FTAG); 3753 return (0); 3754 } 3755 3756 /* 3757 * Counterpart for redirty_page_for_writepage() above. This page 3758 * was in fact not skipped and should not be counted as if it were. 3759 */ 3760 wbc->pages_skipped--; 3761 if (!for_sync) 3762 atomic_inc_32(&zp->z_async_writes_cnt); 3763 set_page_writeback(pp); 3764 unlock_page(pp); 3765 3766 tx = dmu_tx_create(zfsvfs->z_os); 3767 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); 3768 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3769 zfs_sa_upgrade_txholds(tx, zp); 3770 3771 err = dmu_tx_assign(tx, TXG_NOWAIT); 3772 if (err != 0) { 3773 if (err == ERESTART) 3774 dmu_tx_wait(tx); 3775 3776 dmu_tx_abort(tx); 3777 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO 3778 filemap_dirty_folio(page_mapping(pp), page_folio(pp)); 3779 #else 3780 __set_page_dirty_nobuffers(pp); 3781 #endif 3782 ClearPageError(pp); 3783 end_page_writeback(pp); 3784 if (!for_sync) 3785 atomic_dec_32(&zp->z_async_writes_cnt); 3786 zfs_rangelock_exit(lr); 3787 zfs_exit(zfsvfs, FTAG); 3788 return (err); 3789 } 3790 3791 va = kmap(pp); 3792 ASSERT3U(pglen, <=, PAGE_SIZE); 3793 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); 3794 kunmap(pp); 3795 3796 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3797 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3798 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, 3799 &zp->z_pflags, 8); 3800 3801 /* Preserve the mtime and ctime provided 
by the inode */ 3802 ZFS_TIME_ENCODE(&ip->i_mtime, mtime); 3803 ZFS_TIME_ENCODE(&ip->i_ctime, ctime); 3804 zp->z_atime_dirty = B_FALSE; 3805 zp->z_seq++; 3806 3807 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3808 3809 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, 3810 for_sync ? zfs_putpage_sync_commit_cb : 3811 zfs_putpage_async_commit_cb, pp); 3812 3813 dmu_tx_commit(tx); 3814 3815 zfs_rangelock_exit(lr); 3816 3817 if (wbc->sync_mode != WB_SYNC_NONE) { 3818 /* 3819 * Note that this is rarely called under writepages(), because 3820 * writepages() normally handles the entire commit for 3821 * performance reasons. 3822 */ 3823 zil_commit(zfsvfs->z_log, zp->z_id); 3824 } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { 3825 /* 3826 * If the caller does not intend to wait synchronously 3827 * for this page writeback to complete and there are active 3828 * synchronous calls on this file, do a commit so that 3829 * the latter don't accidentally end up waiting for 3830 * our writeback to complete. Refer to the comment in 3831 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. 3832 */ 3833 zil_commit(zfsvfs->z_log, zp->z_id); 3834 } 3835 3836 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); 3837 3838 zfs_exit(zfsvfs, FTAG); 3839 return (err); 3840 } 3841 3842 /* 3843 * Update the system attributes when the inode has been dirtied. For the 3844 * moment we only update the mode, atime, mtime, and ctime. 
3845 */ 3846 int 3847 zfs_dirty_inode(struct inode *ip, int flags) 3848 { 3849 znode_t *zp = ITOZ(ip); 3850 zfsvfs_t *zfsvfs = ITOZSB(ip); 3851 dmu_tx_t *tx; 3852 uint64_t mode, atime[2], mtime[2], ctime[2]; 3853 sa_bulk_attr_t bulk[4]; 3854 int error = 0; 3855 int cnt = 0; 3856 3857 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 3858 return (0); 3859 3860 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3861 return (error); 3862 3863 #ifdef I_DIRTY_TIME 3864 /* 3865 * This is the lazytime semantic introduced in Linux 4.0 3866 * This flag will only be called from update_time when lazytime is set. 3867 * (Note, I_DIRTY_SYNC will also set if not lazytime) 3868 * Fortunately mtime and ctime are managed within ZFS itself, so we 3869 * only need to dirty atime. 3870 */ 3871 if (flags == I_DIRTY_TIME) { 3872 zp->z_atime_dirty = B_TRUE; 3873 goto out; 3874 } 3875 #endif 3876 3877 tx = dmu_tx_create(zfsvfs->z_os); 3878 3879 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3880 zfs_sa_upgrade_txholds(tx, zp); 3881 3882 error = dmu_tx_assign(tx, TXG_WAIT); 3883 if (error) { 3884 dmu_tx_abort(tx); 3885 goto out; 3886 } 3887 3888 mutex_enter(&zp->z_lock); 3889 zp->z_atime_dirty = B_FALSE; 3890 3891 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 3892 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 3893 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3894 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3895 3896 /* Preserve the mode, mtime and ctime provided by the inode */ 3897 ZFS_TIME_ENCODE(&ip->i_atime, atime); 3898 ZFS_TIME_ENCODE(&ip->i_mtime, mtime); 3899 ZFS_TIME_ENCODE(&ip->i_ctime, ctime); 3900 mode = ip->i_mode; 3901 3902 zp->z_mode = mode; 3903 3904 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3905 mutex_exit(&zp->z_lock); 3906 3907 dmu_tx_commit(tx); 3908 out: 3909 zfs_exit(zfsvfs, FTAG); 3910 return (error); 3911 } 3912 3913 void 3914 
zfs_inactive(struct inode *ip) 3915 { 3916 znode_t *zp = ITOZ(ip); 3917 zfsvfs_t *zfsvfs = ITOZSB(ip); 3918 uint64_t atime[2]; 3919 int error; 3920 int need_unlock = 0; 3921 3922 /* Only read lock if we haven't already write locked, e.g. rollback */ 3923 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { 3924 need_unlock = 1; 3925 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3926 } 3927 if (zp->z_sa_hdl == NULL) { 3928 if (need_unlock) 3929 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3930 return; 3931 } 3932 3933 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { 3934 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3935 3936 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3937 zfs_sa_upgrade_txholds(tx, zp); 3938 error = dmu_tx_assign(tx, TXG_WAIT); 3939 if (error) { 3940 dmu_tx_abort(tx); 3941 } else { 3942 ZFS_TIME_ENCODE(&ip->i_atime, atime); 3943 mutex_enter(&zp->z_lock); 3944 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 3945 (void *)&atime, sizeof (atime), tx); 3946 zp->z_atime_dirty = B_FALSE; 3947 mutex_exit(&zp->z_lock); 3948 dmu_tx_commit(tx); 3949 } 3950 } 3951 3952 zfs_zinactive(zp); 3953 if (need_unlock) 3954 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3955 } 3956 3957 /* 3958 * Fill pages with data from the disk. 3959 */ 3960 static int 3961 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) 3962 { 3963 znode_t *zp = ITOZ(ip); 3964 zfsvfs_t *zfsvfs = ITOZSB(ip); 3965 objset_t *os; 3966 struct page *cur_pp; 3967 u_offset_t io_off, total; 3968 size_t io_len; 3969 loff_t i_size; 3970 unsigned page_idx; 3971 int err; 3972 3973 os = zfsvfs->z_os; 3974 io_len = nr_pages << PAGE_SHIFT; 3975 i_size = i_size_read(ip); 3976 io_off = page_offset(pl[0]); 3977 3978 if (io_off + io_len > i_size) 3979 io_len = i_size - io_off; 3980 3981 /* 3982 * Iterate over list of pages and read each page individually. 
3983 */ 3984 page_idx = 0; 3985 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 3986 caddr_t va; 3987 3988 cur_pp = pl[page_idx++]; 3989 va = kmap(cur_pp); 3990 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, 3991 DMU_READ_PREFETCH); 3992 kunmap(cur_pp); 3993 if (err) { 3994 /* convert checksum errors into IO errors */ 3995 if (err == ECKSUM) 3996 err = SET_ERROR(EIO); 3997 return (err); 3998 } 3999 } 4000 4001 return (0); 4002 } 4003 4004 /* 4005 * Uses zfs_fillpage to read data from the file and fill the pages. 4006 * 4007 * IN: ip - inode of file to get data from. 4008 * pl - list of pages to read 4009 * nr_pages - number of pages to read 4010 * 4011 * RETURN: 0 on success, error code on failure. 4012 * 4013 * Timestamps: 4014 * vp - atime updated 4015 */ 4016 int 4017 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) 4018 { 4019 znode_t *zp = ITOZ(ip); 4020 zfsvfs_t *zfsvfs = ITOZSB(ip); 4021 int err; 4022 4023 if (pl == NULL) 4024 return (0); 4025 4026 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4027 return (err); 4028 4029 err = zfs_fillpage(ip, pl, nr_pages); 4030 4031 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); 4032 4033 zfs_exit(zfsvfs, FTAG); 4034 return (err); 4035 } 4036 4037 /* 4038 * Check ZFS specific permissions to memory map a section of a file. 
4039 * 4040 * IN: ip - inode of the file to mmap 4041 * off - file offset 4042 * addrp - start address in memory region 4043 * len - length of memory region 4044 * vm_flags- address flags 4045 * 4046 * RETURN: 0 if success 4047 * error code if failure 4048 */ 4049 int 4050 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, 4051 unsigned long vm_flags) 4052 { 4053 (void) addrp; 4054 znode_t *zp = ITOZ(ip); 4055 zfsvfs_t *zfsvfs = ITOZSB(ip); 4056 int error; 4057 4058 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4059 return (error); 4060 4061 if ((vm_flags & VM_WRITE) && (zp->z_pflags & 4062 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 4063 zfs_exit(zfsvfs, FTAG); 4064 return (SET_ERROR(EPERM)); 4065 } 4066 4067 if ((vm_flags & (VM_READ | VM_EXEC)) && 4068 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 4069 zfs_exit(zfsvfs, FTAG); 4070 return (SET_ERROR(EACCES)); 4071 } 4072 4073 if (off < 0 || len > MAXOFFSET_T - off) { 4074 zfs_exit(zfsvfs, FTAG); 4075 return (SET_ERROR(ENXIO)); 4076 } 4077 4078 zfs_exit(zfsvfs, FTAG); 4079 return (0); 4080 } 4081 4082 /* 4083 * Free or allocate space in a file. Currently, this function only 4084 * supports the `F_FREESP' command. However, this command is somewhat 4085 * misnamed, as its functionality includes the ability to allocate as 4086 * well as free space. 4087 * 4088 * IN: zp - znode of file to free data in. 4089 * cmd - action to take (only F_FREESP supported). 4090 * bfp - section of file to free/alloc. 4091 * flag - current file open mode flags. 4092 * offset - current file offset. 4093 * cr - credentials of caller. 4094 * 4095 * RETURN: 0 on success, error code on failure. 
4096 * 4097 * Timestamps: 4098 * zp - ctime|mtime updated 4099 */ 4100 int 4101 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, 4102 offset_t offset, cred_t *cr) 4103 { 4104 (void) offset; 4105 zfsvfs_t *zfsvfs = ZTOZSB(zp); 4106 uint64_t off, len; 4107 int error; 4108 4109 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4110 return (error); 4111 4112 if (cmd != F_FREESP) { 4113 zfs_exit(zfsvfs, FTAG); 4114 return (SET_ERROR(EINVAL)); 4115 } 4116 4117 /* 4118 * Callers might not be able to detect properly that we are read-only, 4119 * so check it explicitly here. 4120 */ 4121 if (zfs_is_readonly(zfsvfs)) { 4122 zfs_exit(zfsvfs, FTAG); 4123 return (SET_ERROR(EROFS)); 4124 } 4125 4126 if (bfp->l_len < 0) { 4127 zfs_exit(zfsvfs, FTAG); 4128 return (SET_ERROR(EINVAL)); 4129 } 4130 4131 /* 4132 * Permissions aren't checked on Solaris because on this OS 4133 * zfs_space() can only be called with an opened file handle. 4134 * On Linux we can get here through truncate_range() which 4135 * operates directly on inodes, so we need to check access rights. 
4136 */ 4137 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 4138 kcred->user_ns))) { 4139 zfs_exit(zfsvfs, FTAG); 4140 return (error); 4141 } 4142 4143 off = bfp->l_start; 4144 len = bfp->l_len; /* 0 means from off to end of file */ 4145 4146 error = zfs_freesp(zp, off, len, flag, TRUE); 4147 4148 zfs_exit(zfsvfs, FTAG); 4149 return (error); 4150 } 4151 4152 int 4153 zfs_fid(struct inode *ip, fid_t *fidp) 4154 { 4155 znode_t *zp = ITOZ(ip); 4156 zfsvfs_t *zfsvfs = ITOZSB(ip); 4157 uint32_t gen; 4158 uint64_t gen64; 4159 uint64_t object = zp->z_id; 4160 zfid_short_t *zfid; 4161 int size, i, error; 4162 4163 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 4164 return (error); 4165 4166 if (fidp->fid_len < SHORT_FID_LEN) { 4167 fidp->fid_len = SHORT_FID_LEN; 4168 zfs_exit(zfsvfs, FTAG); 4169 return (SET_ERROR(ENOSPC)); 4170 } 4171 4172 if ((error = zfs_verify_zp(zp)) != 0) { 4173 zfs_exit(zfsvfs, FTAG); 4174 return (error); 4175 } 4176 4177 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4178 &gen64, sizeof (uint64_t))) != 0) { 4179 zfs_exit(zfsvfs, FTAG); 4180 return (error); 4181 } 4182 4183 gen = (uint32_t)gen64; 4184 4185 size = SHORT_FID_LEN; 4186 4187 zfid = (zfid_short_t *)fidp; 4188 4189 zfid->zf_len = size; 4190 4191 for (i = 0; i < sizeof (zfid->zf_object); i++) 4192 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4193 4194 /* Must have a non-zero generation number to distinguish from .zfs */ 4195 if (gen == 0) 4196 gen = 1; 4197 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4198 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4199 4200 zfs_exit(zfsvfs, FTAG); 4201 return (0); 4202 } 4203 4204 #if defined(_KERNEL) 4205 EXPORT_SYMBOL(zfs_open); 4206 EXPORT_SYMBOL(zfs_close); 4207 EXPORT_SYMBOL(zfs_lookup); 4208 EXPORT_SYMBOL(zfs_create); 4209 EXPORT_SYMBOL(zfs_tmpfile); 4210 EXPORT_SYMBOL(zfs_remove); 4211 EXPORT_SYMBOL(zfs_mkdir); 4212 EXPORT_SYMBOL(zfs_rmdir); 4213 EXPORT_SYMBOL(zfs_readdir); 4214 EXPORT_SYMBOL(zfs_getattr_fast); 4215 
EXPORT_SYMBOL(zfs_setattr); 4216 EXPORT_SYMBOL(zfs_rename); 4217 EXPORT_SYMBOL(zfs_symlink); 4218 EXPORT_SYMBOL(zfs_readlink); 4219 EXPORT_SYMBOL(zfs_link); 4220 EXPORT_SYMBOL(zfs_inactive); 4221 EXPORT_SYMBOL(zfs_space); 4222 EXPORT_SYMBOL(zfs_fid); 4223 EXPORT_SYMBOL(zfs_getpage); 4224 EXPORT_SYMBOL(zfs_putpage); 4225 EXPORT_SYMBOL(zfs_dirty_inode); 4226 EXPORT_SYMBOL(zfs_map); 4227 4228 /* CSTYLED */ 4229 module_param(zfs_delete_blocks, ulong, 0644); 4230 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); 4231 4232 #endif 4233