1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 27 */ 28 29 /* Portions Copyright 2007 Jeremy Teo */ 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/time.h> 36 #include <sys/sysmacros.h> 37 #include <sys/vfs.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/kmem.h> 41 #include <sys/taskq.h> 42 #include <sys/uio.h> 43 #include <sys/vmsystm.h> 44 #include <sys/atomic.h> 45 #include <sys/pathname.h> 46 #include <sys/cmn_err.h> 47 #include <sys/errno.h> 48 #include <sys/zfs_dir.h> 49 #include <sys/zfs_acl.h> 50 #include <sys/zfs_ioctl.h> 51 #include <sys/fs/zfs.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_objset.h> 54 #include <sys/spa.h> 55 #include <sys/txg.h> 56 #include <sys/dbuf.h> 57 #include <sys/zap.h> 58 #include <sys/sa.h> 59 #include <sys/policy.h> 60 #include <sys/sunddi.h> 61 #include <sys/sid.h> 62 #include <sys/zfs_ctldir.h> 63 #include <sys/zfs_fuid.h> 64 #include <sys/zfs_quota.h> 65 #include <sys/zfs_sa.h> 66 #include <sys/zfs_vnops.h> 67 #include <sys/zfs_rlock.h> 68 #include <sys/cred.h> 69 #include <sys/zpl.h> 70 #include <sys/zil.h> 71 #include <sys/sa_impl.h> 72 73 /* 74 * Programming rules. 75 * 76 * Each vnode op performs some logical unit of work. To do this, the ZPL must 77 * properly lock its in-core state, create a DMU transaction, do the work, 78 * record this work in the intent log (ZIL), commit the DMU transaction, 79 * and wait for the intent log to commit if it is a synchronous operation. 80 * Moreover, the vnode ops must work in both normal and log replay context. 81 * The ordering of events is important to avoid deadlocks and references 82 * to freed memory. The example below illustrates the following Big Rules: 83 * 84 * (1) A check must be made in each zfs thread for a mounted file system. 85 * This is done avoiding races using zfs_enter(zfsvfs). 86 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes 87 * must be checked with zfs_verify_zp(zp). Both of these macros 88 * can return EIO from the calling function. 89 * 90 * (2) zrele() should always be the last thing except for zil_commit() (if 91 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the 92 * last reference, the vnode/znode can be freed, so the zp may point to 93 * freed memory. Second, the last reference will call zfs_zinactive(), 94 * which may induce a lot of work -- pushing cached pages (which acquires 95 * range locks) and syncing out cached atime changes. Third, 96 * zfs_zinactive() may require a new tx, which could deadlock the system 97 * if you were already holding one. This deadlock occurs because the tx 98 * currently being operated on prevents a txg from syncing, which 99 * prevents the new tx from progressing, resulting in a deadlock. If you 100 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() 101 * is a synonym for zrele(). 102 * 103 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 104 * as they can span dmu_tx_assign() calls. 105 * 106 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 107 * dmu_tx_assign(). This is critical because we don't want to block 108 * while holding locks. 109 * 110 * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This 111 * reduces lock contention and CPU usage when we must wait (note that if 112 * throughput is constrained by the storage, nearly every transaction 113 * must wait). 114 * 115 * Note, in particular, that if a lock is sometimes acquired before 116 * the tx assigns, and sometimes after (e.g. z_lock), then failing 117 * to use a non-blocking assign can deadlock the system. The scenario: 118 * 119 * Thread A has grabbed a lock before calling dmu_tx_assign(). 120 * Thread B is in an already-assigned tx, and blocks for this lock. 121 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 122 * forever, because the previous txg can't quiesce until B's tx commits. 123 * 124 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 125 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 126 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 127 * to indicate that this operation has already called dmu_tx_wait(). 128 * This will ensure that we don't retry forever, waiting a short bit 129 * each time. 130 * 131 * (5) If the operation succeeded, generate the intent log entry for it 132 * before dropping locks. This ensures that the ordering of events 133 * in the intent log matches the order in which they actually occurred. 134 * During ZIL replay the zfs_log_* functions will update the sequence 135 * number to indicate the zil transaction has replayed. 136 * 137 * (6) At the end of each vnode op, the DMU tx must always commit, 138 * regardless of whether there were any errors. 139 * 140 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 141 * to ensure that synchronous semantics are provided when necessary. 142 * 143 * In general, this is how things should be ordered in each vnode op: 144 * 145 * zfs_enter(zfsvfs); // exit if unmounted 146 * top: 147 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) 148 * rw_enter(...); // grab any other locks you need 149 * tx = dmu_tx_create(...); // get DMU tx 150 * dmu_tx_hold_*(); // hold each object you might modify 151 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 152 * if (error) { 153 * rw_exit(...); // drop locks 154 * zfs_dirent_unlock(dl); // unlock directory entry 155 * zrele(...); // release held znodes 156 * if (error == ERESTART) { 157 * waited = B_TRUE; 158 * dmu_tx_wait(tx); 159 * dmu_tx_abort(tx); 160 * goto top; 161 * } 162 * dmu_tx_abort(tx); // abort DMU tx 163 * zfs_exit(zfsvfs); // finished in zfs 164 * return (error); // really out of space 165 * } 166 * error = do_real_work(); // do whatever this VOP does 167 * if (error == 0) 168 * zfs_log_*(...); // on success, make ZIL entry 169 * dmu_tx_commit(tx); // commit DMU tx -- error or not 170 * rw_exit(...); // drop locks 171 * zfs_dirent_unlock(dl); // unlock directory entry 172 * zrele(...); // release held znodes 173 * zil_commit(zilog, foid); // synchronous when necessary 174 * zfs_exit(zfsvfs); // finished in zfs 175 * return (error); // done, report error 176 */ 177 int 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) 179 { 180 (void) cr; 181 znode_t *zp = ITOZ(ip); 182 zfsvfs_t *zfsvfs = ITOZSB(ip); 183 int error; 184 185 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 186 return (error); 187 188 /* Honor ZFS_APPENDONLY file attribute */ 189 if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && 190 ((flag & O_APPEND) == 0)) { 191 zfs_exit(zfsvfs, FTAG); 192 return (SET_ERROR(EPERM)); 193 } 194 195 /* Keep a count of the synchronous opens in the znode */ 196 if (flag & O_SYNC) 197 atomic_inc_32(&zp->z_sync_cnt); 198 199 zfs_exit(zfsvfs, FTAG); 200 return (0); 201 } 202 203 int 204 zfs_close(struct inode *ip, int flag, cred_t *cr) 205 { 206 (void) cr; 207 znode_t *zp = ITOZ(ip); 208 zfsvfs_t *zfsvfs = ITOZSB(ip); 209 int error; 210 211 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 212 return (error); 213 214 /* Decrement the synchronous opens in the znode */ 215 if (flag & O_SYNC) 216 atomic_dec_32(&zp->z_sync_cnt); 217 218 zfs_exit(zfsvfs, FTAG); 219 return (0); 220 } 221 222 #if defined(_KERNEL) 223 224 static int zfs_fillpage(struct inode *ip, struct page *pp); 225 226 /* 227 * When a file is memory mapped, we must keep the IO data synchronized 228 * between the DMU cache and the memory mapped pages. Update all mapped 229 * pages with the contents of the coresponding dmu buffer. 230 */ 231 void 232 update_pages(znode_t *zp, int64_t start, int len, objset_t *os) 233 { 234 struct address_space *mp = ZTOI(zp)->i_mapping; 235 int64_t off = start & (PAGE_SIZE - 1); 236 237 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 238 uint64_t nbytes = MIN(PAGE_SIZE - off, len); 239 240 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); 241 if (pp) { 242 if (mapping_writably_mapped(mp)) 243 flush_dcache_page(pp); 244 245 void *pb = kmap(pp); 246 int error = dmu_read(os, zp->z_id, start + off, 247 nbytes, pb + off, DMU_READ_PREFETCH); 248 kunmap(pp); 249 250 if (error) { 251 SetPageError(pp); 252 ClearPageUptodate(pp); 253 } else { 254 ClearPageError(pp); 255 SetPageUptodate(pp); 256 257 if (mapping_writably_mapped(mp)) 258 flush_dcache_page(pp); 259 260 mark_page_accessed(pp); 261 } 262 263 unlock_page(pp); 264 put_page(pp); 265 } 266 267 len -= nbytes; 268 off = 0; 269 } 270 } 271 272 /* 273 * When a file is memory mapped, we must keep the I/O data synchronized 274 * between the DMU cache and the memory mapped pages. Preferentially read 275 * from memory mapped pages, otherwise fallback to reading through the dmu. 276 */ 277 int 278 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) 279 { 280 struct inode *ip = ZTOI(zp); 281 struct address_space *mp = ip->i_mapping; 282 int64_t start = uio->uio_loffset; 283 int64_t off = start & (PAGE_SIZE - 1); 284 int len = nbytes; 285 int error = 0; 286 287 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 288 uint64_t bytes = MIN(PAGE_SIZE - off, len); 289 290 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); 291 if (pp) { 292 /* 293 * If filemap_fault() retries there exists a window 294 * where the page will be unlocked and not up to date. 295 * In this case we must try and fill the page. 296 */ 297 if (unlikely(!PageUptodate(pp))) { 298 error = zfs_fillpage(ip, pp); 299 if (error) { 300 unlock_page(pp); 301 put_page(pp); 302 return (error); 303 } 304 } 305 306 ASSERT(PageUptodate(pp) || PageDirty(pp)); 307 308 unlock_page(pp); 309 310 void *pb = kmap(pp); 311 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); 312 kunmap(pp); 313 314 if (mapping_writably_mapped(mp)) 315 flush_dcache_page(pp); 316 317 mark_page_accessed(pp); 318 put_page(pp); 319 } else { 320 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 321 uio, bytes); 322 } 323 324 len -= bytes; 325 off = 0; 326 327 if (error) 328 break; 329 } 330 331 return (error); 332 } 333 #endif /* _KERNEL */ 334 335 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; 336 337 /* 338 * Write the bytes to a file. 339 * 340 * IN: zp - znode of file to be written to 341 * data - bytes to write 342 * len - number of bytes to write 343 * pos - offset to start writing at 344 * 345 * OUT: resid - remaining bytes to write 346 * 347 * RETURN: 0 if success 348 * positive error code if failure. EIO is returned 349 * for a short write when residp isn't provided. 350 * 351 * Timestamps: 352 * zp - ctime|mtime updated if byte count > 0 353 */ 354 int 355 zfs_write_simple(znode_t *zp, const void *data, size_t len, 356 loff_t pos, size_t *residp) 357 { 358 fstrans_cookie_t cookie; 359 int error; 360 361 struct iovec iov; 362 iov.iov_base = (void *)data; 363 iov.iov_len = len; 364 365 zfs_uio_t uio; 366 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); 367 368 cookie = spl_fstrans_mark(); 369 error = zfs_write(zp, &uio, 0, kcred); 370 spl_fstrans_unmark(cookie); 371 372 if (error == 0) { 373 if (residp != NULL) 374 *residp = zfs_uio_resid(&uio); 375 else if (zfs_uio_resid(&uio) != 0) 376 error = SET_ERROR(EIO); 377 } 378 379 return (error); 380 } 381 382 static void 383 zfs_rele_async_task(void *arg) 384 { 385 iput(arg); 386 } 387 388 void 389 zfs_zrele_async(znode_t *zp) 390 { 391 struct inode *ip = ZTOI(zp); 392 objset_t *os = ITOZSB(ip)->z_os; 393 394 ASSERT(atomic_read(&ip->i_count) > 0); 395 ASSERT(os != NULL); 396 397 /* 398 * If decrementing the count would put us at 0, we can't do it inline 399 * here, because that would be synchronous. Instead, dispatch an iput 400 * to run later. 401 * 402 * For more information on the dangers of a synchronous iput, see the 403 * header comment of this file. 404 */ 405 if (!atomic_add_unless(&ip->i_count, -1, 1)) { 406 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), 407 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); 408 } 409 } 410 411 412 /* 413 * Lookup an entry in a directory, or an extended attribute directory. 414 * If it exists, return a held inode reference for it. 415 * 416 * IN: zdp - znode of directory to search. 417 * nm - name of entry to lookup. 418 * flags - LOOKUP_XATTR set if looking for an attribute. 419 * cr - credentials of caller. 420 * direntflags - directory lookup flags 421 * realpnp - returned pathname. 422 * 423 * OUT: zpp - znode of located entry, NULL if not found. 424 * 425 * RETURN: 0 on success, error code on failure. 426 * 427 * Timestamps: 428 * NA 429 */ 430 int 431 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, 432 int *direntflags, pathname_t *realpnp) 433 { 434 zfsvfs_t *zfsvfs = ZTOZSB(zdp); 435 int error = 0; 436 437 /* 438 * Fast path lookup, however we must skip DNLC lookup 439 * for case folding or normalizing lookups because the 440 * DNLC code only stores the passed in name. This means 441 * creating 'a' and removing 'A' on a case insensitive 442 * file system would work, but DNLC still thinks 'a' 443 * exists and won't let you create it again on the next 444 * pass through fast path. 445 */ 446 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { 447 448 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 449 return (SET_ERROR(ENOTDIR)); 450 } else if (zdp->z_sa_hdl == NULL) { 451 return (SET_ERROR(EIO)); 452 } 453 454 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { 455 error = zfs_fastaccesschk_execute(zdp, cr); 456 if (!error) { 457 *zpp = zdp; 458 zhold(*zpp); 459 return (0); 460 } 461 return (error); 462 } 463 } 464 465 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) 466 return (error); 467 468 *zpp = NULL; 469 470 if (flags & LOOKUP_XATTR) { 471 /* 472 * We don't allow recursive attributes.. 473 * Maybe someday we will. 474 */ 475 if (zdp->z_pflags & ZFS_XATTR) { 476 zfs_exit(zfsvfs, FTAG); 477 return (SET_ERROR(EINVAL)); 478 } 479 480 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { 481 zfs_exit(zfsvfs, FTAG); 482 return (error); 483 } 484 485 /* 486 * Do we have permission to get into attribute directory? 487 */ 488 489 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, 490 B_TRUE, cr, zfs_init_idmap))) { 491 zrele(*zpp); 492 *zpp = NULL; 493 } 494 495 zfs_exit(zfsvfs, FTAG); 496 return (error); 497 } 498 499 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 500 zfs_exit(zfsvfs, FTAG); 501 return (SET_ERROR(ENOTDIR)); 502 } 503 504 /* 505 * Check accessibility of directory. 506 */ 507 508 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, 509 zfs_init_idmap))) { 510 zfs_exit(zfsvfs, FTAG); 511 return (error); 512 } 513 514 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 515 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 516 zfs_exit(zfsvfs, FTAG); 517 return (SET_ERROR(EILSEQ)); 518 } 519 520 error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); 521 if ((error == 0) && (*zpp)) 522 zfs_znode_update_vfs(*zpp); 523 524 zfs_exit(zfsvfs, FTAG); 525 return (error); 526 } 527 528 /* 529 * Attempt to create a new entry in a directory. If the entry 530 * already exists, truncate the file if permissible, else return 531 * an error. Return the ip of the created or trunc'd file. 532 * 533 * IN: dzp - znode of directory to put new file entry in. 534 * name - name of new file entry. 535 * vap - attributes of new file. 536 * excl - flag indicating exclusive or non-exclusive mode. 537 * mode - mode to open file with. 538 * cr - credentials of caller. 539 * flag - file flag. 540 * vsecp - ACL to be set 541 * mnt_ns - user namespace of the mount 542 * 543 * OUT: zpp - znode of created or trunc'd entry. 544 * 545 * RETURN: 0 on success, error code on failure. 546 * 547 * Timestamps: 548 * dzp - ctime|mtime updated if new entry created 549 * zp - ctime|mtime always, atime if new 550 */ 551 int 552 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, 553 int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, 554 zidmap_t *mnt_ns) 555 { 556 znode_t *zp; 557 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 558 zilog_t *zilog; 559 objset_t *os; 560 zfs_dirlock_t *dl; 561 dmu_tx_t *tx; 562 int error; 563 uid_t uid; 564 gid_t gid; 565 zfs_acl_ids_t acl_ids; 566 boolean_t fuid_dirtied; 567 boolean_t have_acl = B_FALSE; 568 boolean_t waited = B_FALSE; 569 boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 570 571 /* 572 * If we have an ephemeral id, ACL, or XVATTR then 573 * make sure file system is at proper version 574 */ 575 576 gid = crgetgid(cr); 577 uid = crgetuid(cr); 578 579 if (zfsvfs->z_use_fuids == B_FALSE && 580 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 581 return (SET_ERROR(EINVAL)); 582 583 if (name == NULL) 584 return (SET_ERROR(EINVAL)); 585 586 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 587 return (error); 588 os = zfsvfs->z_os; 589 zilog = zfsvfs->z_log; 590 591 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 592 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 593 zfs_exit(zfsvfs, FTAG); 594 return (SET_ERROR(EILSEQ)); 595 } 596 597 if (vap->va_mask & ATTR_XVATTR) { 598 if ((error = secpolicy_xvattr((xvattr_t *)vap, 599 crgetuid(cr), cr, vap->va_mode)) != 0) { 600 zfs_exit(zfsvfs, FTAG); 601 return (error); 602 } 603 } 604 605 top: 606 *zpp = NULL; 607 if (*name == '\0') { 608 /* 609 * Null component name refers to the directory itself. 610 */ 611 zhold(dzp); 612 zp = dzp; 613 dl = NULL; 614 error = 0; 615 } else { 616 /* possible igrab(zp) */ 617 int zflg = 0; 618 619 if (flag & FIGNORECASE) 620 zflg |= ZCILOOK; 621 622 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 623 NULL, NULL); 624 if (error) { 625 if (have_acl) 626 zfs_acl_ids_free(&acl_ids); 627 if (strcmp(name, "..") == 0) 628 error = SET_ERROR(EISDIR); 629 zfs_exit(zfsvfs, FTAG); 630 return (error); 631 } 632 } 633 634 if (zp == NULL) { 635 uint64_t txtype; 636 uint64_t projid = ZFS_DEFAULT_PROJID; 637 638 /* 639 * Create a new file object and update the directory 640 * to reference it. 641 */ 642 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, 643 mnt_ns))) { 644 if (have_acl) 645 zfs_acl_ids_free(&acl_ids); 646 goto out; 647 } 648 649 /* 650 * We only support the creation of regular files in 651 * extended attribute directories. 652 */ 653 654 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { 655 if (have_acl) 656 zfs_acl_ids_free(&acl_ids); 657 error = SET_ERROR(EINVAL); 658 goto out; 659 } 660 661 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 662 cr, vsecp, &acl_ids, mnt_ns)) != 0) 663 goto out; 664 have_acl = B_TRUE; 665 666 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 667 projid = zfs_inherit_projid(dzp); 668 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 669 zfs_acl_ids_free(&acl_ids); 670 error = SET_ERROR(EDQUOT); 671 goto out; 672 } 673 674 tx = dmu_tx_create(os); 675 676 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 677 ZFS_SA_BASE_ATTR_SIZE); 678 679 fuid_dirtied = zfsvfs->z_fuid_dirty; 680 if (fuid_dirtied) 681 zfs_fuid_txhold(zfsvfs, tx); 682 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 683 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 684 if (!zfsvfs->z_use_sa && 685 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 686 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 687 0, acl_ids.z_aclp->z_acl_bytes); 688 } 689 690 error = dmu_tx_assign(tx, 691 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 692 if (error) { 693 zfs_dirent_unlock(dl); 694 if (error == ERESTART) { 695 waited = B_TRUE; 696 dmu_tx_wait(tx); 697 dmu_tx_abort(tx); 698 goto top; 699 } 700 zfs_acl_ids_free(&acl_ids); 701 dmu_tx_abort(tx); 702 zfs_exit(zfsvfs, FTAG); 703 return (error); 704 } 705 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 706 707 error = zfs_link_create(dl, zp, tx, ZNEW); 708 if (error != 0) { 709 /* 710 * Since, we failed to add the directory entry for it, 711 * delete the newly created dnode. 712 */ 713 zfs_znode_delete(zp, tx); 714 remove_inode_hash(ZTOI(zp)); 715 zfs_acl_ids_free(&acl_ids); 716 dmu_tx_commit(tx); 717 goto out; 718 } 719 720 if (fuid_dirtied) 721 zfs_fuid_sync(zfsvfs, tx); 722 723 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 724 if (flag & FIGNORECASE) 725 txtype |= TX_CI; 726 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 727 vsecp, acl_ids.z_fuidp, vap); 728 zfs_acl_ids_free(&acl_ids); 729 dmu_tx_commit(tx); 730 } else { 731 int aflags = (flag & O_APPEND) ? V_APPEND : 0; 732 733 if (have_acl) 734 zfs_acl_ids_free(&acl_ids); 735 736 /* 737 * A directory entry already exists for this name. 738 */ 739 /* 740 * Can't truncate an existing file if in exclusive mode. 741 */ 742 if (excl) { 743 error = SET_ERROR(EEXIST); 744 goto out; 745 } 746 /* 747 * Can't open a directory for writing. 748 */ 749 if (S_ISDIR(ZTOI(zp)->i_mode)) { 750 error = SET_ERROR(EISDIR); 751 goto out; 752 } 753 /* 754 * Verify requested access to file. 755 */ 756 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, 757 mnt_ns))) { 758 goto out; 759 } 760 761 mutex_enter(&dzp->z_lock); 762 dzp->z_seq++; 763 mutex_exit(&dzp->z_lock); 764 765 /* 766 * Truncate regular files if requested. 767 */ 768 if (S_ISREG(ZTOI(zp)->i_mode) && 769 (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { 770 /* we can't hold any locks when calling zfs_freesp() */ 771 if (dl) { 772 zfs_dirent_unlock(dl); 773 dl = NULL; 774 } 775 error = zfs_freesp(zp, 0, 0, mode, TRUE); 776 } 777 } 778 out: 779 780 if (dl) 781 zfs_dirent_unlock(dl); 782 783 if (error) { 784 if (zp) 785 zrele(zp); 786 } else { 787 zfs_znode_update_vfs(dzp); 788 zfs_znode_update_vfs(zp); 789 *zpp = zp; 790 } 791 792 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 793 zil_commit(zilog, 0); 794 795 zfs_exit(zfsvfs, FTAG); 796 return (error); 797 } 798 799 int 800 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, 801 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, 802 zidmap_t *mnt_ns) 803 { 804 (void) excl, (void) mode, (void) flag; 805 znode_t *zp = NULL, *dzp = ITOZ(dip); 806 zfsvfs_t *zfsvfs = ITOZSB(dip); 807 objset_t *os; 808 dmu_tx_t *tx; 809 int error; 810 uid_t uid; 811 gid_t gid; 812 zfs_acl_ids_t acl_ids; 813 uint64_t projid = ZFS_DEFAULT_PROJID; 814 boolean_t fuid_dirtied; 815 boolean_t have_acl = B_FALSE; 816 boolean_t waited = B_FALSE; 817 818 /* 819 * If we have an ephemeral id, ACL, or XVATTR then 820 * make sure file system is at proper version 821 */ 822 823 gid = crgetgid(cr); 824 uid = crgetuid(cr); 825 826 if (zfsvfs->z_use_fuids == B_FALSE && 827 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 828 return (SET_ERROR(EINVAL)); 829 830 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 831 return (error); 832 os = zfsvfs->z_os; 833 834 if (vap->va_mask & ATTR_XVATTR) { 835 if ((error = secpolicy_xvattr((xvattr_t *)vap, 836 crgetuid(cr), cr, vap->va_mode)) != 0) { 837 zfs_exit(zfsvfs, FTAG); 838 return (error); 839 } 840 } 841 842 top: 843 *ipp = NULL; 844 845 /* 846 * Create a new file object and update the directory 847 * to reference it. 848 */ 849 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 850 if (have_acl) 851 zfs_acl_ids_free(&acl_ids); 852 goto out; 853 } 854 855 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 856 cr, vsecp, &acl_ids, mnt_ns)) != 0) 857 goto out; 858 have_acl = B_TRUE; 859 860 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 861 projid = zfs_inherit_projid(dzp); 862 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 863 zfs_acl_ids_free(&acl_ids); 864 error = SET_ERROR(EDQUOT); 865 goto out; 866 } 867 868 tx = dmu_tx_create(os); 869 870 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 871 ZFS_SA_BASE_ATTR_SIZE); 872 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 873 874 fuid_dirtied = zfsvfs->z_fuid_dirty; 875 if (fuid_dirtied) 876 zfs_fuid_txhold(zfsvfs, tx); 877 if (!zfsvfs->z_use_sa && 878 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 879 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 880 0, acl_ids.z_aclp->z_acl_bytes); 881 } 882 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 883 if (error) { 884 if (error == ERESTART) { 885 waited = B_TRUE; 886 dmu_tx_wait(tx); 887 dmu_tx_abort(tx); 888 goto top; 889 } 890 zfs_acl_ids_free(&acl_ids); 891 dmu_tx_abort(tx); 892 zfs_exit(zfsvfs, FTAG); 893 return (error); 894 } 895 zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); 896 897 if (fuid_dirtied) 898 zfs_fuid_sync(zfsvfs, tx); 899 900 /* Add to unlinked set */ 901 zp->z_unlinked = B_TRUE; 902 zfs_unlinked_add(zp, tx); 903 zfs_acl_ids_free(&acl_ids); 904 dmu_tx_commit(tx); 905 out: 906 907 if (error) { 908 if (zp) 909 zrele(zp); 910 } else { 911 zfs_znode_update_vfs(dzp); 912 zfs_znode_update_vfs(zp); 913 *ipp = ZTOI(zp); 914 } 915 916 zfs_exit(zfsvfs, FTAG); 917 return (error); 918 } 919 920 /* 921 * Remove an entry from a directory. 922 * 923 * IN: dzp - znode of directory to remove entry from. 924 * name - name of entry to remove. 925 * cr - credentials of caller. 926 * flags - case flags. 927 * 928 * RETURN: 0 if success 929 * error code if failure 930 * 931 * Timestamps: 932 * dzp - ctime|mtime 933 * ip - ctime (if nlink > 0) 934 */ 935 936 static uint64_t null_xattr = 0; 937 938 int 939 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) 940 { 941 znode_t *zp; 942 znode_t *xzp; 943 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 944 zilog_t *zilog; 945 uint64_t acl_obj, xattr_obj; 946 uint64_t xattr_obj_unlinked = 0; 947 uint64_t obj = 0; 948 uint64_t links; 949 zfs_dirlock_t *dl; 950 dmu_tx_t *tx; 951 boolean_t may_delete_now, delete_now = FALSE; 952 boolean_t unlinked, toobig = FALSE; 953 uint64_t txtype; 954 pathname_t *realnmp = NULL; 955 pathname_t realnm; 956 int error; 957 int zflg = ZEXISTS; 958 boolean_t waited = B_FALSE; 959 960 if (name == NULL) 961 return (SET_ERROR(EINVAL)); 962 963 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 964 return (error); 965 zilog = zfsvfs->z_log; 966 967 if (flags & FIGNORECASE) { 968 zflg |= ZCILOOK; 969 pn_alloc(&realnm); 970 realnmp = &realnm; 971 } 972 973 top: 974 xattr_obj = 0; 975 xzp = NULL; 976 /* 977 * Attempt to lock directory; fail if entry doesn't exist. 978 */ 979 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 980 NULL, realnmp))) { 981 if (realnmp) 982 pn_free(realnmp); 983 zfs_exit(zfsvfs, FTAG); 984 return (error); 985 } 986 987 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { 988 goto out; 989 } 990 991 /* 992 * Need to use rmdir for removing directories. 993 */ 994 if (S_ISDIR(ZTOI(zp)->i_mode)) { 995 error = SET_ERROR(EPERM); 996 goto out; 997 } 998 999 mutex_enter(&zp->z_lock); 1000 may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && 1001 !zn_has_cached_data(zp, 0, LLONG_MAX); 1002 mutex_exit(&zp->z_lock); 1003 1004 /* 1005 * We may delete the znode now, or we may put it in the unlinked set; 1006 * it depends on whether we're the last link, and on whether there are 1007 * other holds on the inode. So we dmu_tx_hold() the right things to 1008 * allow for either case. 1009 */ 1010 obj = zp->z_id; 1011 tx = dmu_tx_create(zfsvfs->z_os); 1012 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1013 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1014 zfs_sa_upgrade_txholds(tx, zp); 1015 zfs_sa_upgrade_txholds(tx, dzp); 1016 if (may_delete_now) { 1017 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; 1018 /* if the file is too big, only hold_free a token amount */ 1019 dmu_tx_hold_free(tx, zp->z_id, 0, 1020 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1021 } 1022 1023 /* are there any extended attributes? */ 1024 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1025 &xattr_obj, sizeof (xattr_obj)); 1026 if (error == 0 && xattr_obj) { 1027 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 1028 ASSERT0(error); 1029 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1030 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 1031 } 1032 1033 mutex_enter(&zp->z_lock); 1034 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) 1035 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1036 mutex_exit(&zp->z_lock); 1037 1038 /* charge as an update -- would be nice not to charge at all */ 1039 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1040 1041 /* 1042 * Mark this transaction as typically resulting in a net free of space 1043 */ 1044 dmu_tx_mark_netfree(tx); 1045 1046 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1047 if (error) { 1048 zfs_dirent_unlock(dl); 1049 if (error == ERESTART) { 1050 waited = B_TRUE; 1051 dmu_tx_wait(tx); 1052 dmu_tx_abort(tx); 1053 zrele(zp); 1054 if (xzp) 1055 zrele(xzp); 1056 goto top; 1057 } 1058 if (realnmp) 1059 pn_free(realnmp); 1060 dmu_tx_abort(tx); 1061 zrele(zp); 1062 if (xzp) 1063 zrele(xzp); 1064 zfs_exit(zfsvfs, FTAG); 1065 return (error); 1066 } 1067 1068 /* 1069 * Remove the directory entry. 1070 */ 1071 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1072 1073 if (error) { 1074 dmu_tx_commit(tx); 1075 goto out; 1076 } 1077 1078 if (unlinked) { 1079 /* 1080 * Hold z_lock so that we can make sure that the ACL obj 1081 * hasn't changed. Could have been deleted due to 1082 * zfs_sa_upgrade(). 1083 */ 1084 mutex_enter(&zp->z_lock); 1085 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1086 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); 1087 delete_now = may_delete_now && !toobig && 1088 atomic_read(&ZTOI(zp)->i_count) == 1 && 1089 !zn_has_cached_data(zp, 0, LLONG_MAX) && 1090 xattr_obj == xattr_obj_unlinked && 1091 zfs_external_acl(zp) == acl_obj; 1092 VERIFY_IMPLY(xattr_obj_unlinked, xzp); 1093 } 1094 1095 if (delete_now) { 1096 if (xattr_obj_unlinked) { 1097 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); 1098 mutex_enter(&xzp->z_lock); 1099 xzp->z_unlinked = B_TRUE; 1100 clear_nlink(ZTOI(xzp)); 1101 links = 0; 1102 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 1103 &links, sizeof (links), tx); 1104 ASSERT3U(error, ==, 0); 1105 mutex_exit(&xzp->z_lock); 1106 zfs_unlinked_add(xzp, tx); 1107 1108 if (zp->z_is_sa) 1109 error = sa_remove(zp->z_sa_hdl, 1110 SA_ZPL_XATTR(zfsvfs), tx); 1111 else 1112 error = sa_update(zp->z_sa_hdl, 1113 SA_ZPL_XATTR(zfsvfs), &null_xattr, 1114 sizeof (uint64_t), tx); 1115 ASSERT0(error); 1116 } 1117 /* 1118 * Add to the unlinked set because a new reference could be 1119 * taken concurrently resulting in a deferred destruction. 1120 */ 1121 zfs_unlinked_add(zp, tx); 1122 mutex_exit(&zp->z_lock); 1123 } else if (unlinked) { 1124 mutex_exit(&zp->z_lock); 1125 zfs_unlinked_add(zp, tx); 1126 } 1127 1128 txtype = TX_REMOVE; 1129 if (flags & FIGNORECASE) 1130 txtype |= TX_CI; 1131 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); 1132 1133 dmu_tx_commit(tx); 1134 out: 1135 if (realnmp) 1136 pn_free(realnmp); 1137 1138 zfs_dirent_unlock(dl); 1139 zfs_znode_update_vfs(dzp); 1140 zfs_znode_update_vfs(zp); 1141 1142 if (delete_now) 1143 zrele(zp); 1144 else 1145 zfs_zrele_async(zp); 1146 1147 if (xzp) { 1148 zfs_znode_update_vfs(xzp); 1149 zfs_zrele_async(xzp); 1150 } 1151 1152 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1153 zil_commit(zilog, 0); 1154 1155 zfs_exit(zfsvfs, FTAG); 1156 return (error); 1157 } 1158 1159 /* 1160 * Create a new directory and insert it into dzp using the name 1161 * provided. Return a pointer to the inserted directory. 1162 * 1163 * IN: dzp - znode of directory to add subdir to. 1164 * dirname - name of new directory. 1165 * vap - attributes of new directory. 1166 * cr - credentials of caller. 1167 * flags - case flags. 1168 * vsecp - ACL to be set 1169 * mnt_ns - user namespace of the mount 1170 * 1171 * OUT: zpp - znode of created directory. 1172 * 1173 * RETURN: 0 if success 1174 * error code if failure 1175 * 1176 * Timestamps: 1177 * dzp - ctime|mtime updated 1178 * zpp - ctime|mtime|atime updated 1179 */ 1180 int 1181 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, 1182 cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) 1183 { 1184 znode_t *zp; 1185 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1186 zilog_t *zilog; 1187 zfs_dirlock_t *dl; 1188 uint64_t txtype; 1189 dmu_tx_t *tx; 1190 int error; 1191 int zf = ZNEW; 1192 uid_t uid; 1193 gid_t gid = crgetgid(cr); 1194 zfs_acl_ids_t acl_ids; 1195 boolean_t fuid_dirtied; 1196 boolean_t waited = B_FALSE; 1197 1198 ASSERT(S_ISDIR(vap->va_mode)); 1199 1200 /* 1201 * If we have an ephemeral id, ACL, or XVATTR then 1202 * make sure file system is at proper version 1203 */ 1204 1205 uid = crgetuid(cr); 1206 if (zfsvfs->z_use_fuids == B_FALSE && 1207 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1208 return (SET_ERROR(EINVAL)); 1209 1210 if (dirname == NULL) 1211 return (SET_ERROR(EINVAL)); 1212 1213 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1214 return (error); 1215 zilog = zfsvfs->z_log; 1216 1217 if (dzp->z_pflags & ZFS_XATTR) { 1218 zfs_exit(zfsvfs, FTAG); 1219 return (SET_ERROR(EINVAL)); 1220 } 1221 1222 if (zfsvfs->z_utf8 && u8_validate(dirname, 1223 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1224 zfs_exit(zfsvfs, FTAG); 1225 return (SET_ERROR(EILSEQ)); 1226 } 1227 if (flags & FIGNORECASE) 1228 zf |= ZCILOOK; 1229 1230 if (vap->va_mask & ATTR_XVATTR) { 1231 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1232 crgetuid(cr), cr, vap->va_mode)) != 0) { 1233 zfs_exit(zfsvfs, FTAG); 1234 return (error); 1235 } 1236 } 1237 1238 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 1239 vsecp, &acl_ids, mnt_ns)) != 0) { 1240 zfs_exit(zfsvfs, FTAG); 1241 return (error); 1242 } 1243 /* 1244 * First make sure the new directory doesn't exist. 1245 * 1246 * Existence is checked first to make sure we don't return 1247 * EACCES instead of EEXIST which can cause some applications 1248 * to fail. 1249 */ 1250 top: 1251 *zpp = NULL; 1252 1253 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1254 NULL, NULL))) { 1255 zfs_acl_ids_free(&acl_ids); 1256 zfs_exit(zfsvfs, FTAG); 1257 return (error); 1258 } 1259 1260 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, 1261 mnt_ns))) { 1262 zfs_acl_ids_free(&acl_ids); 1263 zfs_dirent_unlock(dl); 1264 zfs_exit(zfsvfs, FTAG); 1265 return (error); 1266 } 1267 1268 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { 1269 zfs_acl_ids_free(&acl_ids); 1270 zfs_dirent_unlock(dl); 1271 zfs_exit(zfsvfs, FTAG); 1272 return (SET_ERROR(EDQUOT)); 1273 } 1274 1275 /* 1276 * Add a new entry to the directory. 1277 */ 1278 tx = dmu_tx_create(zfsvfs->z_os); 1279 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1280 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1281 fuid_dirtied = zfsvfs->z_fuid_dirty; 1282 if (fuid_dirtied) 1283 zfs_fuid_txhold(zfsvfs, tx); 1284 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1285 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1286 acl_ids.z_aclp->z_acl_bytes); 1287 } 1288 1289 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1290 ZFS_SA_BASE_ATTR_SIZE); 1291 1292 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1293 if (error) { 1294 zfs_dirent_unlock(dl); 1295 if (error == ERESTART) { 1296 waited = B_TRUE; 1297 dmu_tx_wait(tx); 1298 dmu_tx_abort(tx); 1299 goto top; 1300 } 1301 zfs_acl_ids_free(&acl_ids); 1302 dmu_tx_abort(tx); 1303 zfs_exit(zfsvfs, FTAG); 1304 return (error); 1305 } 1306 1307 /* 1308 * Create new node. 1309 */ 1310 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1311 1312 /* 1313 * Now put new name in parent dir. 1314 */ 1315 error = zfs_link_create(dl, zp, tx, ZNEW); 1316 if (error != 0) { 1317 zfs_znode_delete(zp, tx); 1318 remove_inode_hash(ZTOI(zp)); 1319 goto out; 1320 } 1321 1322 if (fuid_dirtied) 1323 zfs_fuid_sync(zfsvfs, tx); 1324 1325 *zpp = zp; 1326 1327 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1328 if (flags & FIGNORECASE) 1329 txtype |= TX_CI; 1330 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 1331 acl_ids.z_fuidp, vap); 1332 1333 out: 1334 zfs_acl_ids_free(&acl_ids); 1335 1336 dmu_tx_commit(tx); 1337 1338 zfs_dirent_unlock(dl); 1339 1340 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1341 zil_commit(zilog, 0); 1342 1343 if (error != 0) { 1344 zrele(zp); 1345 } else { 1346 zfs_znode_update_vfs(dzp); 1347 zfs_znode_update_vfs(zp); 1348 } 1349 zfs_exit(zfsvfs, FTAG); 1350 return (error); 1351 } 1352 1353 /* 1354 * Remove a directory subdir entry. If the current working 1355 * directory is the same as the subdir to be removed, the 1356 * remove will fail. 1357 * 1358 * IN: dzp - znode of directory to remove from. 1359 * name - name of directory to be removed. 1360 * cwd - inode of current working directory. 1361 * cr - credentials of caller. 1362 * flags - case flags 1363 * 1364 * RETURN: 0 on success, error code on failure. 1365 * 1366 * Timestamps: 1367 * dzp - ctime|mtime updated 1368 */ 1369 int 1370 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, 1371 int flags) 1372 { 1373 znode_t *zp; 1374 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1375 zilog_t *zilog; 1376 zfs_dirlock_t *dl; 1377 dmu_tx_t *tx; 1378 int error; 1379 int zflg = ZEXISTS; 1380 boolean_t waited = B_FALSE; 1381 1382 if (name == NULL) 1383 return (SET_ERROR(EINVAL)); 1384 1385 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1386 return (error); 1387 zilog = zfsvfs->z_log; 1388 1389 if (flags & FIGNORECASE) 1390 zflg |= ZCILOOK; 1391 top: 1392 zp = NULL; 1393 1394 /* 1395 * Attempt to lock directory; fail if entry doesn't exist. 1396 */ 1397 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1398 NULL, NULL))) { 1399 zfs_exit(zfsvfs, FTAG); 1400 return (error); 1401 } 1402 1403 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { 1404 goto out; 1405 } 1406 1407 if (!S_ISDIR(ZTOI(zp)->i_mode)) { 1408 error = SET_ERROR(ENOTDIR); 1409 goto out; 1410 } 1411 1412 if (zp == cwd) { 1413 error = SET_ERROR(EINVAL); 1414 goto out; 1415 } 1416 1417 /* 1418 * Grab a lock on the directory to make sure that no one is 1419 * trying to add (or lookup) entries while we are removing it. 1420 */ 1421 rw_enter(&zp->z_name_lock, RW_WRITER); 1422 1423 /* 1424 * Grab a lock on the parent pointer to make sure we play well 1425 * with the treewalk and directory rename code. 1426 */ 1427 rw_enter(&zp->z_parent_lock, RW_WRITER); 1428 1429 tx = dmu_tx_create(zfsvfs->z_os); 1430 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1431 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1432 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1433 zfs_sa_upgrade_txholds(tx, zp); 1434 zfs_sa_upgrade_txholds(tx, dzp); 1435 dmu_tx_mark_netfree(tx); 1436 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1437 if (error) { 1438 rw_exit(&zp->z_parent_lock); 1439 rw_exit(&zp->z_name_lock); 1440 zfs_dirent_unlock(dl); 1441 if (error == ERESTART) { 1442 waited = B_TRUE; 1443 dmu_tx_wait(tx); 1444 dmu_tx_abort(tx); 1445 zrele(zp); 1446 goto top; 1447 } 1448 dmu_tx_abort(tx); 1449 zrele(zp); 1450 zfs_exit(zfsvfs, FTAG); 1451 return (error); 1452 } 1453 1454 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1455 1456 if (error == 0) { 1457 uint64_t txtype = TX_RMDIR; 1458 if (flags & FIGNORECASE) 1459 txtype |= TX_CI; 1460 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, 1461 B_FALSE); 1462 } 1463 1464 dmu_tx_commit(tx); 1465 1466 rw_exit(&zp->z_parent_lock); 1467 rw_exit(&zp->z_name_lock); 1468 out: 1469 zfs_dirent_unlock(dl); 1470 1471 zfs_znode_update_vfs(dzp); 1472 zfs_znode_update_vfs(zp); 1473 zrele(zp); 1474 1475 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1476 zil_commit(zilog, 0); 1477 1478 zfs_exit(zfsvfs, FTAG); 1479 return (error); 1480 } 1481 1482 /* 1483 * Read directory entries from the given directory cursor position and emit 1484 * name and position for each entry. 1485 * 1486 * IN: ip - inode of directory to read. 1487 * ctx - directory entry context. 1488 * cr - credentials of caller. 1489 * 1490 * RETURN: 0 if success 1491 * error code if failure 1492 * 1493 * Timestamps: 1494 * ip - atime updated 1495 * 1496 * Note that the low 4 bits of the cookie returned by zap is always zero. 1497 * This allows us to use the low range for "special" directory entries: 1498 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 1499 * we use the offset 2 for the '.zfs' directory. 1500 */ 1501 int 1502 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) 1503 { 1504 (void) cr; 1505 znode_t *zp = ITOZ(ip); 1506 zfsvfs_t *zfsvfs = ITOZSB(ip); 1507 objset_t *os; 1508 zap_cursor_t zc; 1509 zap_attribute_t zap; 1510 int error; 1511 uint8_t prefetch; 1512 uint8_t type; 1513 int done = 0; 1514 uint64_t parent; 1515 uint64_t offset; /* must be unsigned; checks for < 1 */ 1516 1517 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1518 return (error); 1519 1520 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1521 &parent, sizeof (parent))) != 0) 1522 goto out; 1523 1524 /* 1525 * Quit if directory has been removed (posix) 1526 */ 1527 if (zp->z_unlinked) 1528 goto out; 1529 1530 error = 0; 1531 os = zfsvfs->z_os; 1532 offset = ctx->pos; 1533 prefetch = zp->z_zn_prefetch; 1534 1535 /* 1536 * Initialize the iterator cursor. 1537 */ 1538 if (offset <= 3) { 1539 /* 1540 * Start iteration from the beginning of the directory. 1541 */ 1542 zap_cursor_init(&zc, os, zp->z_id); 1543 } else { 1544 /* 1545 * The offset is a serialized cursor. 1546 */ 1547 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 1548 } 1549 1550 /* 1551 * Transform to file-system independent format 1552 */ 1553 while (!done) { 1554 uint64_t objnum; 1555 /* 1556 * Special case `.', `..', and `.zfs'. 1557 */ 1558 if (offset == 0) { 1559 (void) strcpy(zap.za_name, "."); 1560 zap.za_normalization_conflict = 0; 1561 objnum = zp->z_id; 1562 type = DT_DIR; 1563 } else if (offset == 1) { 1564 (void) strcpy(zap.za_name, ".."); 1565 zap.za_normalization_conflict = 0; 1566 objnum = parent; 1567 type = DT_DIR; 1568 } else if (offset == 2 && zfs_show_ctldir(zp)) { 1569 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 1570 zap.za_normalization_conflict = 0; 1571 objnum = ZFSCTL_INO_ROOT; 1572 type = DT_DIR; 1573 } else { 1574 /* 1575 * Grab next entry. 1576 */ 1577 if ((error = zap_cursor_retrieve(&zc, &zap))) { 1578 if (error == ENOENT) 1579 break; 1580 else 1581 goto update; 1582 } 1583 1584 /* 1585 * Allow multiple entries provided the first entry is 1586 * the object id. Non-zpl consumers may safely make 1587 * use of the additional space. 1588 * 1589 * XXX: This should be a feature flag for compatibility 1590 */ 1591 if (zap.za_integer_length != 8 || 1592 zap.za_num_integers == 0) { 1593 cmn_err(CE_WARN, "zap_readdir: bad directory " 1594 "entry, obj = %lld, offset = %lld, " 1595 "length = %d, num = %lld\n", 1596 (u_longlong_t)zp->z_id, 1597 (u_longlong_t)offset, 1598 zap.za_integer_length, 1599 (u_longlong_t)zap.za_num_integers); 1600 error = SET_ERROR(ENXIO); 1601 goto update; 1602 } 1603 1604 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 1605 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 1606 } 1607 1608 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), 1609 objnum, type); 1610 if (done) 1611 break; 1612 1613 if (prefetch) 1614 dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); 1615 1616 /* 1617 * Move to the next entry, fill in the previous offset. 1618 */ 1619 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 1620 zap_cursor_advance(&zc); 1621 offset = zap_cursor_serialize(&zc); 1622 } else { 1623 offset += 1; 1624 } 1625 ctx->pos = offset; 1626 } 1627 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 1628 1629 update: 1630 zap_cursor_fini(&zc); 1631 if (error == ENOENT) 1632 error = 0; 1633 out: 1634 zfs_exit(zfsvfs, FTAG); 1635 1636 return (error); 1637 } 1638 1639 /* 1640 * Get the basic file attributes and place them in the provided kstat 1641 * structure. The inode is assumed to be the authoritative source 1642 * for most of the attributes. However, the znode currently has the 1643 * authoritative atime, blksize, and block count. 1644 * 1645 * IN: ip - inode of file. 1646 * 1647 * OUT: sp - kstat values. 1648 * 1649 * RETURN: 0 (always succeeds) 1650 */ 1651 int 1652 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) 1653 { 1654 znode_t *zp = ITOZ(ip); 1655 zfsvfs_t *zfsvfs = ITOZSB(ip); 1656 uint32_t blksize; 1657 u_longlong_t nblocks; 1658 int error; 1659 1660 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1661 return (error); 1662 1663 mutex_enter(&zp->z_lock); 1664 1665 zpl_generic_fillattr(user_ns, ip, sp); 1666 /* 1667 * +1 link count for root inode with visible '.zfs' directory. 1668 */ 1669 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) 1670 if (sp->nlink < ZFS_LINK_MAX) 1671 sp->nlink++; 1672 1673 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1674 sp->blksize = blksize; 1675 sp->blocks = nblocks; 1676 1677 if (unlikely(zp->z_blksz == 0)) { 1678 /* 1679 * Block size hasn't been set; suggest maximal I/O transfers. 1680 */ 1681 sp->blksize = zfsvfs->z_max_blksz; 1682 } 1683 1684 mutex_exit(&zp->z_lock); 1685 1686 /* 1687 * Required to prevent NFS client from detecting different inode 1688 * numbers of snapshot root dentry before and after snapshot mount. 1689 */ 1690 if (zfsvfs->z_issnap) { 1691 if (ip->i_sb->s_root->d_inode == ip) 1692 sp->ino = ZFSCTL_INO_SNAPDIRS - 1693 dmu_objset_id(zfsvfs->z_os); 1694 } 1695 1696 zfs_exit(zfsvfs, FTAG); 1697 1698 return (0); 1699 } 1700 1701 /* 1702 * For the operation of changing file's user/group/project, we need to 1703 * handle not only the main object that is assigned to the file directly, 1704 * but also the ones that are used by the file via hidden xattr directory. 1705 * 1706 * Because the xattr directory may contains many EA entries, as to it may 1707 * be impossible to change all of them via the transaction of changing the 1708 * main object's user/group/project attributes. Then we have to change them 1709 * via other multiple independent transactions one by one. It may be not good 1710 * solution, but we have no better idea yet. 1711 */ 1712 static int 1713 zfs_setattr_dir(znode_t *dzp) 1714 { 1715 struct inode *dxip = ZTOI(dzp); 1716 struct inode *xip = NULL; 1717 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1718 objset_t *os = zfsvfs->z_os; 1719 zap_cursor_t zc; 1720 zap_attribute_t zap; 1721 zfs_dirlock_t *dl; 1722 znode_t *zp = NULL; 1723 dmu_tx_t *tx = NULL; 1724 uint64_t uid, gid; 1725 sa_bulk_attr_t bulk[4]; 1726 int count; 1727 int err; 1728 1729 zap_cursor_init(&zc, os, dzp->z_id); 1730 while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { 1731 count = 0; 1732 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { 1733 err = ENXIO; 1734 break; 1735 } 1736 1737 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, 1738 ZEXISTS, NULL, NULL); 1739 if (err == ENOENT) 1740 goto next; 1741 if (err) 1742 break; 1743 1744 xip = ZTOI(zp); 1745 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && 1746 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && 1747 zp->z_projid == dzp->z_projid) 1748 goto next; 1749 1750 tx = dmu_tx_create(os); 1751 if (!(zp->z_pflags & ZFS_PROJID)) 1752 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1753 else 1754 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1755 1756 err = dmu_tx_assign(tx, TXG_WAIT); 1757 if (err) 1758 break; 1759 1760 mutex_enter(&dzp->z_lock); 1761 1762 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { 1763 xip->i_uid = dxip->i_uid; 1764 uid = zfs_uid_read(dxip); 1765 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1766 &uid, sizeof (uid)); 1767 } 1768 1769 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { 1770 xip->i_gid = dxip->i_gid; 1771 gid = zfs_gid_read(dxip); 1772 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 1773 &gid, sizeof (gid)); 1774 } 1775 1776 if (zp->z_projid != dzp->z_projid) { 1777 if (!(zp->z_pflags & ZFS_PROJID)) { 1778 zp->z_pflags |= ZFS_PROJID; 1779 SA_ADD_BULK_ATTR(bulk, count, 1780 SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 1781 sizeof (zp->z_pflags)); 1782 } 1783 1784 zp->z_projid = dzp->z_projid; 1785 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), 1786 NULL, &zp->z_projid, sizeof (zp->z_projid)); 1787 } 1788 1789 mutex_exit(&dzp->z_lock); 1790 1791 if (likely(count > 0)) { 1792 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1793 dmu_tx_commit(tx); 1794 } else { 1795 dmu_tx_abort(tx); 1796 } 1797 tx = NULL; 1798 if (err != 0 && err != ENOENT) 1799 break; 1800 1801 next: 1802 if (zp) { 1803 zrele(zp); 1804 zp = NULL; 1805 zfs_dirent_unlock(dl); 1806 } 1807 zap_cursor_advance(&zc); 1808 } 1809 1810 if (tx) 1811 dmu_tx_abort(tx); 1812 if (zp) { 1813 zrele(zp); 1814 zfs_dirent_unlock(dl); 1815 } 1816 zap_cursor_fini(&zc); 1817 1818 return (err == ENOENT ? 0 : err); 1819 } 1820 1821 /* 1822 * Set the file attributes to the values contained in the 1823 * vattr structure. 1824 * 1825 * IN: zp - znode of file to be modified. 1826 * vap - new attribute values. 1827 * If ATTR_XVATTR set, then optional attrs are being set 1828 * flags - ATTR_UTIME set if non-default time values provided. 1829 * - ATTR_NOACLCHECK (CIFS context only). 1830 * cr - credentials of caller. 1831 * mnt_ns - user namespace of the mount 1832 * 1833 * RETURN: 0 if success 1834 * error code if failure 1835 * 1836 * Timestamps: 1837 * ip - ctime updated, mtime updated if size changed. 1838 */ 1839 int 1840 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) 1841 { 1842 struct inode *ip; 1843 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1844 objset_t *os = zfsvfs->z_os; 1845 zilog_t *zilog; 1846 dmu_tx_t *tx; 1847 vattr_t oldva; 1848 xvattr_t *tmpxvattr; 1849 uint_t mask = vap->va_mask; 1850 uint_t saved_mask = 0; 1851 int trim_mask = 0; 1852 uint64_t new_mode; 1853 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; 1854 uint64_t xattr_obj; 1855 uint64_t mtime[2], ctime[2], atime[2]; 1856 uint64_t projid = ZFS_INVALID_PROJID; 1857 znode_t *attrzp; 1858 int need_policy = FALSE; 1859 int err, err2 = 0; 1860 zfs_fuid_info_t *fuidp = NULL; 1861 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 1862 xoptattr_t *xoap; 1863 zfs_acl_t *aclp; 1864 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1865 boolean_t fuid_dirtied = B_FALSE; 1866 boolean_t handle_eadir = B_FALSE; 1867 sa_bulk_attr_t *bulk, *xattr_bulk; 1868 int count = 0, xattr_count = 0, bulks = 8; 1869 1870 if (mask == 0) 1871 return (0); 1872 1873 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1874 return (err); 1875 ip = ZTOI(zp); 1876 1877 /* 1878 * If this is a xvattr_t, then get a pointer to the structure of 1879 * optional attributes. If this is NULL, then we have a vattr_t. 1880 */ 1881 xoap = xva_getxoptattr(xvap); 1882 if (xoap != NULL && (mask & ATTR_XVATTR)) { 1883 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 1884 if (!dmu_objset_projectquota_enabled(os) || 1885 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { 1886 zfs_exit(zfsvfs, FTAG); 1887 return (SET_ERROR(ENOTSUP)); 1888 } 1889 1890 projid = xoap->xoa_projid; 1891 if (unlikely(projid == ZFS_INVALID_PROJID)) { 1892 zfs_exit(zfsvfs, FTAG); 1893 return (SET_ERROR(EINVAL)); 1894 } 1895 1896 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 1897 projid = ZFS_INVALID_PROJID; 1898 else 1899 need_policy = TRUE; 1900 } 1901 1902 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 1903 (xoap->xoa_projinherit != 1904 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && 1905 (!dmu_objset_projectquota_enabled(os) || 1906 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { 1907 zfs_exit(zfsvfs, FTAG); 1908 return (SET_ERROR(ENOTSUP)); 1909 } 1910 } 1911 1912 zilog = zfsvfs->z_log; 1913 1914 /* 1915 * Make sure that if we have ephemeral uid/gid or xvattr specified 1916 * that file system is at proper version level 1917 */ 1918 1919 if (zfsvfs->z_use_fuids == B_FALSE && 1920 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || 1921 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || 1922 (mask & ATTR_XVATTR))) { 1923 zfs_exit(zfsvfs, FTAG); 1924 return (SET_ERROR(EINVAL)); 1925 } 1926 1927 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { 1928 zfs_exit(zfsvfs, FTAG); 1929 return (SET_ERROR(EISDIR)); 1930 } 1931 1932 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { 1933 zfs_exit(zfsvfs, FTAG); 1934 return (SET_ERROR(EINVAL)); 1935 } 1936 1937 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); 1938 xva_init(tmpxvattr); 1939 1940 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1941 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1942 1943 /* 1944 * Immutable files can only alter immutable bit and atime 1945 */ 1946 if ((zp->z_pflags & ZFS_IMMUTABLE) && 1947 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || 1948 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 1949 err = SET_ERROR(EPERM); 1950 goto out3; 1951 } 1952 1953 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 1954 err = SET_ERROR(EPERM); 1955 goto out3; 1956 } 1957 1958 /* 1959 * Verify timestamps doesn't overflow 32 bits. 1960 * ZFS can handle large timestamps, but 32bit syscalls can't 1961 * handle times greater than 2039. This check should be removed 1962 * once large timestamps are fully supported. 1963 */ 1964 if (mask & (ATTR_ATIME | ATTR_MTIME)) { 1965 if (((mask & ATTR_ATIME) && 1966 TIMESPEC_OVERFLOW(&vap->va_atime)) || 1967 ((mask & ATTR_MTIME) && 1968 TIMESPEC_OVERFLOW(&vap->va_mtime))) { 1969 err = SET_ERROR(EOVERFLOW); 1970 goto out3; 1971 } 1972 } 1973 1974 top: 1975 attrzp = NULL; 1976 aclp = NULL; 1977 1978 /* Can this be moved to before the top label? */ 1979 if (zfs_is_readonly(zfsvfs)) { 1980 err = SET_ERROR(EROFS); 1981 goto out3; 1982 } 1983 1984 /* 1985 * First validate permissions 1986 */ 1987 1988 if (mask & ATTR_SIZE) { 1989 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, 1990 mnt_ns); 1991 if (err) 1992 goto out3; 1993 1994 /* 1995 * XXX - Note, we are not providing any open 1996 * mode flags here (like FNDELAY), so we may 1997 * block if there are locks present... this 1998 * should be addressed in openat(). 1999 */ 2000 /* XXX - would it be OK to generate a log record here? */ 2001 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2002 if (err) 2003 goto out3; 2004 } 2005 2006 if (mask & (ATTR_ATIME|ATTR_MTIME) || 2007 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2008 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2009 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2010 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2011 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2012 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2013 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2014 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2015 skipaclchk, cr, mnt_ns); 2016 } 2017 2018 if (mask & (ATTR_UID|ATTR_GID)) { 2019 int idmask = (mask & (ATTR_UID|ATTR_GID)); 2020 int take_owner; 2021 int take_group; 2022 uid_t uid; 2023 gid_t gid; 2024 2025 /* 2026 * NOTE: even if a new mode is being set, 2027 * we may clear S_ISUID/S_ISGID bits. 2028 */ 2029 2030 if (!(mask & ATTR_MODE)) 2031 vap->va_mode = zp->z_mode; 2032 2033 /* 2034 * Take ownership or chgrp to group we are a member of 2035 */ 2036 2037 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), 2038 vap->va_uid); 2039 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), 2040 vap->va_gid); 2041 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); 2042 take_group = (mask & ATTR_GID) && 2043 zfs_groupmember(zfsvfs, gid, cr); 2044 2045 /* 2046 * If both ATTR_UID and ATTR_GID are set then take_owner and 2047 * take_group must both be set in order to allow taking 2048 * ownership. 2049 * 2050 * Otherwise, send the check through secpolicy_vnode_setattr() 2051 * 2052 */ 2053 2054 if (((idmask == (ATTR_UID|ATTR_GID)) && 2055 take_owner && take_group) || 2056 ((idmask == ATTR_UID) && take_owner) || 2057 ((idmask == ATTR_GID) && take_group)) { 2058 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2059 skipaclchk, cr, mnt_ns) == 0) { 2060 /* 2061 * Remove setuid/setgid for non-privileged users 2062 */ 2063 (void) secpolicy_setid_clear(vap, cr); 2064 trim_mask = (mask & (ATTR_UID|ATTR_GID)); 2065 } else { 2066 need_policy = TRUE; 2067 } 2068 } else { 2069 need_policy = TRUE; 2070 } 2071 } 2072 2073 mutex_enter(&zp->z_lock); 2074 oldva.va_mode = zp->z_mode; 2075 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2076 if (mask & ATTR_XVATTR) { 2077 /* 2078 * Update xvattr mask to include only those attributes 2079 * that are actually changing. 2080 * 2081 * the bits will be restored prior to actually setting 2082 * the attributes so the caller thinks they were set. 2083 */ 2084 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2085 if (xoap->xoa_appendonly != 2086 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2087 need_policy = TRUE; 2088 } else { 2089 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2090 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); 2091 } 2092 } 2093 2094 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2095 if (xoap->xoa_projinherit != 2096 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 2097 need_policy = TRUE; 2098 } else { 2099 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 2100 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); 2101 } 2102 } 2103 2104 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2105 if (xoap->xoa_nounlink != 2106 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2107 need_policy = TRUE; 2108 } else { 2109 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2110 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); 2111 } 2112 } 2113 2114 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2115 if (xoap->xoa_immutable != 2116 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2117 need_policy = TRUE; 2118 } else { 2119 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 2120 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); 2121 } 2122 } 2123 2124 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2125 if (xoap->xoa_nodump != 2126 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2127 need_policy = TRUE; 2128 } else { 2129 XVA_CLR_REQ(xvap, XAT_NODUMP); 2130 XVA_SET_REQ(tmpxvattr, XAT_NODUMP); 2131 } 2132 } 2133 2134 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2135 if (xoap->xoa_av_modified != 2136 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2137 need_policy = TRUE; 2138 } else { 2139 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2140 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); 2141 } 2142 } 2143 2144 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2145 if ((!S_ISREG(ip->i_mode) && 2146 xoap->xoa_av_quarantined) || 2147 xoap->xoa_av_quarantined != 2148 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2149 need_policy = TRUE; 2150 } else { 2151 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2152 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); 2153 } 2154 } 2155 2156 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2157 mutex_exit(&zp->z_lock); 2158 err = SET_ERROR(EPERM); 2159 goto out3; 2160 } 2161 2162 if (need_policy == FALSE && 2163 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2164 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2165 need_policy = TRUE; 2166 } 2167 } 2168 2169 mutex_exit(&zp->z_lock); 2170 2171 if (mask & ATTR_MODE) { 2172 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, 2173 mnt_ns) == 0) { 2174 err = secpolicy_setid_setsticky_clear(ip, vap, 2175 &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); 2176 if (err) 2177 goto out3; 2178 trim_mask |= ATTR_MODE; 2179 } else { 2180 need_policy = TRUE; 2181 } 2182 } 2183 2184 if (need_policy) { 2185 /* 2186 * If trim_mask is set then take ownership 2187 * has been granted or write_acl is present and user 2188 * has the ability to modify mode. In that case remove 2189 * UID|GID and or MODE from mask so that 2190 * secpolicy_vnode_setattr() doesn't revoke it. 2191 */ 2192 2193 if (trim_mask) { 2194 saved_mask = vap->va_mask; 2195 vap->va_mask &= ~trim_mask; 2196 } 2197 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, 2198 zfs_zaccess_unix, zp); 2199 if (err) 2200 goto out3; 2201 2202 if (trim_mask) 2203 vap->va_mask |= saved_mask; 2204 } 2205 2206 /* 2207 * secpolicy_vnode_setattr, or take ownership may have 2208 * changed va_mask 2209 */ 2210 mask = vap->va_mask; 2211 2212 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { 2213 handle_eadir = B_TRUE; 2214 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2215 &xattr_obj, sizeof (xattr_obj)); 2216 2217 if (err == 0 && xattr_obj) { 2218 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); 2219 if (err) 2220 goto out2; 2221 } 2222 if (mask & ATTR_UID) { 2223 new_kuid = zfs_fuid_create(zfsvfs, 2224 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2225 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && 2226 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 2227 new_kuid)) { 2228 if (attrzp) 2229 zrele(attrzp); 2230 err = SET_ERROR(EDQUOT); 2231 goto out2; 2232 } 2233 } 2234 2235 if (mask & ATTR_GID) { 2236 new_kgid = zfs_fuid_create(zfsvfs, 2237 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); 2238 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && 2239 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 2240 new_kgid)) { 2241 if (attrzp) 2242 zrele(attrzp); 2243 err = SET_ERROR(EDQUOT); 2244 goto out2; 2245 } 2246 } 2247 2248 if (projid != ZFS_INVALID_PROJID && 2249 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 2250 if (attrzp) 2251 zrele(attrzp); 2252 err = EDQUOT; 2253 goto out2; 2254 } 2255 } 2256 tx = dmu_tx_create(os); 2257 2258 if (mask & ATTR_MODE) { 2259 uint64_t pmode = zp->z_mode; 2260 uint64_t acl_obj; 2261 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2262 2263 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && 2264 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 2265 err = EPERM; 2266 goto out; 2267 } 2268 2269 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) 2270 goto out; 2271 2272 mutex_enter(&zp->z_lock); 2273 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 2274 /* 2275 * Are we upgrading ACL from old V0 format 2276 * to V1 format? 2277 */ 2278 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 2279 zfs_znode_acl_version(zp) == 2280 ZFS_ACL_VERSION_INITIAL) { 2281 dmu_tx_hold_free(tx, acl_obj, 0, 2282 DMU_OBJECT_END); 2283 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2284 0, aclp->z_acl_bytes); 2285 } else { 2286 dmu_tx_hold_write(tx, acl_obj, 0, 2287 aclp->z_acl_bytes); 2288 } 2289 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2290 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2291 0, aclp->z_acl_bytes); 2292 } 2293 mutex_exit(&zp->z_lock); 2294 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2295 } else { 2296 if (((mask & ATTR_XVATTR) && 2297 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2298 (projid != ZFS_INVALID_PROJID && 2299 !(zp->z_pflags & ZFS_PROJID))) 2300 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2301 else 2302 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2303 } 2304 2305 if (attrzp) { 2306 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 2307 } 2308 2309 fuid_dirtied = zfsvfs->z_fuid_dirty; 2310 if (fuid_dirtied) 2311 zfs_fuid_txhold(zfsvfs, tx); 2312 2313 zfs_sa_upgrade_txholds(tx, zp); 2314 2315 err = dmu_tx_assign(tx, TXG_WAIT); 2316 if (err) 2317 goto out; 2318 2319 count = 0; 2320 /* 2321 * Set each attribute requested. 2322 * We group settings according to the locks they need to acquire. 2323 * 2324 * Note: you cannot set ctime directly, although it will be 2325 * updated as a side-effect of calling this function. 2326 */ 2327 2328 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 2329 /* 2330 * For the existed object that is upgraded from old system, 2331 * its on-disk layout has no slot for the project ID attribute. 2332 * But quota accounting logic needs to access related slots by 2333 * offset directly. So we need to adjust old objects' layout 2334 * to make the project ID to some unified and fixed offset. 2335 */ 2336 if (attrzp) 2337 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 2338 if (err == 0) 2339 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 2340 2341 if (unlikely(err == EEXIST)) 2342 err = 0; 2343 else if (err != 0) 2344 goto out; 2345 else 2346 projid = ZFS_INVALID_PROJID; 2347 } 2348 2349 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2350 mutex_enter(&zp->z_acl_lock); 2351 mutex_enter(&zp->z_lock); 2352 2353 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 2354 &zp->z_pflags, sizeof (zp->z_pflags)); 2355 2356 if (attrzp) { 2357 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2358 mutex_enter(&attrzp->z_acl_lock); 2359 mutex_enter(&attrzp->z_lock); 2360 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2361 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 2362 sizeof (attrzp->z_pflags)); 2363 if (projid != ZFS_INVALID_PROJID) { 2364 attrzp->z_projid = projid; 2365 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2366 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 2367 sizeof (attrzp->z_projid)); 2368 } 2369 } 2370 2371 if (mask & (ATTR_UID|ATTR_GID)) { 2372 2373 if (mask & ATTR_UID) { 2374 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); 2375 new_uid = zfs_uid_read(ZTOI(zp)); 2376 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2377 &new_uid, sizeof (new_uid)); 2378 if (attrzp) { 2379 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2380 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 2381 sizeof (new_uid)); 2382 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); 2383 } 2384 } 2385 2386 if (mask & ATTR_GID) { 2387 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); 2388 new_gid = zfs_gid_read(ZTOI(zp)); 2389 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 2390 NULL, &new_gid, sizeof (new_gid)); 2391 if (attrzp) { 2392 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2393 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 2394 sizeof (new_gid)); 2395 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); 2396 } 2397 } 2398 if (!(mask & ATTR_MODE)) { 2399 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 2400 NULL, &new_mode, sizeof (new_mode)); 2401 new_mode = zp->z_mode; 2402 } 2403 err = zfs_acl_chown_setattr(zp); 2404 ASSERT(err == 0); 2405 if (attrzp) { 2406 err = zfs_acl_chown_setattr(attrzp); 2407 ASSERT(err == 0); 2408 } 2409 } 2410 2411 if (mask & ATTR_MODE) { 2412 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 2413 &new_mode, sizeof (new_mode)); 2414 zp->z_mode = ZTOI(zp)->i_mode = new_mode; 2415 ASSERT3P(aclp, !=, NULL); 2416 err = zfs_aclset_common(zp, aclp, cr, tx); 2417 ASSERT0(err); 2418 if (zp->z_acl_cached) 2419 zfs_acl_free(zp->z_acl_cached); 2420 zp->z_acl_cached = aclp; 2421 aclp = NULL; 2422 } 2423 2424 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { 2425 zp->z_atime_dirty = B_FALSE; 2426 ZFS_TIME_ENCODE(&ip->i_atime, atime); 2427 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 2428 &atime, sizeof (atime)); 2429 } 2430 2431 if (mask & (ATTR_MTIME | ATTR_SIZE)) { 2432 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 2433 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( 2434 vap->va_mtime, ZTOI(zp)); 2435 2436 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 2437 mtime, sizeof (mtime)); 2438 } 2439 2440 if (mask & (ATTR_CTIME | ATTR_SIZE)) { 2441 ZFS_TIME_ENCODE(&vap->va_ctime, ctime); 2442 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, 2443 ZTOI(zp)); 2444 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 2445 ctime, sizeof (ctime)); 2446 } 2447 2448 if (projid != ZFS_INVALID_PROJID) { 2449 zp->z_projid = projid; 2450 SA_ADD_BULK_ATTR(bulk, count, 2451 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 2452 sizeof (zp->z_projid)); 2453 } 2454 2455 if (attrzp && mask) { 2456 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2457 SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 2458 sizeof (ctime)); 2459 } 2460 2461 /* 2462 * Do this after setting timestamps to prevent timestamp 2463 * update from toggling bit 2464 */ 2465 2466 if (xoap && (mask & ATTR_XVATTR)) { 2467 2468 /* 2469 * restore trimmed off masks 2470 * so that return masks can be set for caller. 2471 */ 2472 2473 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { 2474 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2475 } 2476 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { 2477 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2478 } 2479 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { 2480 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2481 } 2482 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { 2483 XVA_SET_REQ(xvap, XAT_NODUMP); 2484 } 2485 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { 2486 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2487 } 2488 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { 2489 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2490 } 2491 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { 2492 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 2493 } 2494 2495 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 2496 ASSERT(S_ISREG(ip->i_mode)); 2497 2498 zfs_xvattr_set(zp, xvap, tx); 2499 } 2500 2501 if (fuid_dirtied) 2502 zfs_fuid_sync(zfsvfs, tx); 2503 2504 if (mask != 0) 2505 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2506 2507 mutex_exit(&zp->z_lock); 2508 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2509 mutex_exit(&zp->z_acl_lock); 2510 2511 if (attrzp) { 2512 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2513 mutex_exit(&attrzp->z_acl_lock); 2514 mutex_exit(&attrzp->z_lock); 2515 } 2516 out: 2517 if (err == 0 && xattr_count > 0) { 2518 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 2519 xattr_count, tx); 2520 ASSERT(err2 == 0); 2521 } 2522 2523 if (aclp) 2524 zfs_acl_free(aclp); 2525 2526 if (fuidp) { 2527 zfs_fuid_info_free(fuidp); 2528 fuidp = NULL; 2529 } 2530 2531 if (err) { 2532 dmu_tx_abort(tx); 2533 if (attrzp) 2534 zrele(attrzp); 2535 if (err == ERESTART) 2536 goto top; 2537 } else { 2538 if (count > 0) 2539 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2540 dmu_tx_commit(tx); 2541 if (attrzp) { 2542 if (err2 == 0 && handle_eadir) 2543 err = zfs_setattr_dir(attrzp); 2544 zrele(attrzp); 2545 } 2546 zfs_znode_update_vfs(zp); 2547 } 2548 2549 out2: 2550 if (os->os_sync == ZFS_SYNC_ALWAYS) 2551 zil_commit(zilog, 0); 2552 2553 out3: 2554 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); 2555 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); 2556 kmem_free(tmpxvattr, sizeof (xvattr_t)); 2557 zfs_exit(zfsvfs, FTAG); 2558 return (err); 2559 } 2560 2561 typedef struct zfs_zlock { 2562 krwlock_t *zl_rwlock; /* lock we acquired */ 2563 znode_t *zl_znode; /* znode we held */ 2564 struct zfs_zlock *zl_next; /* next in list */ 2565 } zfs_zlock_t; 2566 2567 /* 2568 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2569 */ 2570 static void 2571 zfs_rename_unlock(zfs_zlock_t **zlpp) 2572 { 2573 zfs_zlock_t *zl; 2574 2575 while ((zl = *zlpp) != NULL) { 2576 if (zl->zl_znode != NULL) 2577 zfs_zrele_async(zl->zl_znode); 2578 rw_exit(zl->zl_rwlock); 2579 *zlpp = zl->zl_next; 2580 kmem_free(zl, sizeof (*zl)); 2581 } 2582 } 2583 2584 /* 2585 * Search back through the directory tree, using the ".." entries. 2586 * Lock each directory in the chain to prevent concurrent renames. 2587 * Fail any attempt to move a directory into one of its own descendants. 2588 * XXX - z_parent_lock can overlap with map or grow locks 2589 */ 2590 static int 2591 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 2592 { 2593 zfs_zlock_t *zl; 2594 znode_t *zp = tdzp; 2595 uint64_t rootid = ZTOZSB(zp)->z_root; 2596 uint64_t oidp = zp->z_id; 2597 krwlock_t *rwlp = &szp->z_parent_lock; 2598 krw_t rw = RW_WRITER; 2599 2600 /* 2601 * First pass write-locks szp and compares to zp->z_id. 2602 * Later passes read-lock zp and compare to zp->z_parent. 2603 */ 2604 do { 2605 if (!rw_tryenter(rwlp, rw)) { 2606 /* 2607 * Another thread is renaming in this path. 2608 * Note that if we are a WRITER, we don't have any 2609 * parent_locks held yet. 2610 */ 2611 if (rw == RW_READER && zp->z_id > szp->z_id) { 2612 /* 2613 * Drop our locks and restart 2614 */ 2615 zfs_rename_unlock(&zl); 2616 *zlpp = NULL; 2617 zp = tdzp; 2618 oidp = zp->z_id; 2619 rwlp = &szp->z_parent_lock; 2620 rw = RW_WRITER; 2621 continue; 2622 } else { 2623 /* 2624 * Wait for other thread to drop its locks 2625 */ 2626 rw_enter(rwlp, rw); 2627 } 2628 } 2629 2630 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 2631 zl->zl_rwlock = rwlp; 2632 zl->zl_znode = NULL; 2633 zl->zl_next = *zlpp; 2634 *zlpp = zl; 2635 2636 if (oidp == szp->z_id) /* We're a descendant of szp */ 2637 return (SET_ERROR(EINVAL)); 2638 2639 if (oidp == rootid) /* We've hit the top */ 2640 return (0); 2641 2642 if (rw == RW_READER) { /* i.e. not the first pass */ 2643 int error = zfs_zget(ZTOZSB(zp), oidp, &zp); 2644 if (error) 2645 return (error); 2646 zl->zl_znode = zp; 2647 } 2648 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), 2649 &oidp, sizeof (oidp)); 2650 rwlp = &zp->z_parent_lock; 2651 rw = RW_READER; 2652 2653 } while (zp->z_id != sdzp->z_id); 2654 2655 return (0); 2656 } 2657 2658 /* 2659 * Move an entry from the provided source directory to the target 2660 * directory. Change the entry name as indicated. 2661 * 2662 * IN: sdzp - Source directory containing the "old entry". 2663 * snm - Old entry name. 2664 * tdzp - Target directory to contain the "new entry". 2665 * tnm - New entry name. 2666 * cr - credentials of caller. 2667 * flags - case flags 2668 * rflags - RENAME_* flags 2669 * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). 2670 * mnt_ns - user namespace of the mount 2671 * 2672 * RETURN: 0 on success, error code on failure. 2673 * 2674 * Timestamps: 2675 * sdzp,tdzp - ctime|mtime updated 2676 */ 2677 int 2678 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, 2679 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) 2680 { 2681 znode_t *szp, *tzp; 2682 zfsvfs_t *zfsvfs = ZTOZSB(sdzp); 2683 zilog_t *zilog; 2684 zfs_dirlock_t *sdl, *tdl; 2685 dmu_tx_t *tx; 2686 zfs_zlock_t *zl; 2687 int cmp, serr, terr; 2688 int error = 0; 2689 int zflg = 0; 2690 boolean_t waited = B_FALSE; 2691 /* Needed for whiteout inode creation. */ 2692 boolean_t fuid_dirtied; 2693 zfs_acl_ids_t acl_ids; 2694 boolean_t have_acl = B_FALSE; 2695 znode_t *wzp = NULL; 2696 2697 2698 if (snm == NULL || tnm == NULL) 2699 return (SET_ERROR(EINVAL)); 2700 2701 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2702 return (SET_ERROR(EINVAL)); 2703 2704 /* Already checked by Linux VFS, but just to make sure. */ 2705 if (rflags & RENAME_EXCHANGE && 2706 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) 2707 return (SET_ERROR(EINVAL)); 2708 2709 /* 2710 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the 2711 * right kind of vattr_t for the whiteout file. These are set 2712 * internally by ZFS so should never be incorrect. 2713 */ 2714 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); 2715 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); 2716 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); 2717 2718 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) 2719 return (error); 2720 zilog = zfsvfs->z_log; 2721 2722 if ((error = zfs_verify_zp(tdzp)) != 0) { 2723 zfs_exit(zfsvfs, FTAG); 2724 return (error); 2725 } 2726 2727 /* 2728 * We check i_sb because snapshots and the ctldir must have different 2729 * super blocks. 2730 */ 2731 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || 2732 zfsctl_is_node(ZTOI(tdzp))) { 2733 zfs_exit(zfsvfs, FTAG); 2734 return (SET_ERROR(EXDEV)); 2735 } 2736 2737 if (zfsvfs->z_utf8 && u8_validate(tnm, 2738 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2739 zfs_exit(zfsvfs, FTAG); 2740 return (SET_ERROR(EILSEQ)); 2741 } 2742 2743 if (flags & FIGNORECASE) 2744 zflg |= ZCILOOK; 2745 2746 top: 2747 szp = NULL; 2748 tzp = NULL; 2749 zl = NULL; 2750 2751 /* 2752 * This is to prevent the creation of links into attribute space 2753 * by renaming a linked file into/outof an attribute directory. 2754 * See the comment in zfs_link() for why this is considered bad. 2755 */ 2756 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 2757 zfs_exit(zfsvfs, FTAG); 2758 return (SET_ERROR(EINVAL)); 2759 } 2760 2761 /* 2762 * Lock source and target directory entries. To prevent deadlock, 2763 * a lock ordering must be defined. We lock the directory with 2764 * the smallest object id first, or if it's a tie, the one with 2765 * the lexically first name. 2766 */ 2767 if (sdzp->z_id < tdzp->z_id) { 2768 cmp = -1; 2769 } else if (sdzp->z_id > tdzp->z_id) { 2770 cmp = 1; 2771 } else { 2772 /* 2773 * First compare the two name arguments without 2774 * considering any case folding. 2775 */ 2776 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 2777 2778 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 2779 ASSERT(error == 0 || !zfsvfs->z_utf8); 2780 if (cmp == 0) { 2781 /* 2782 * POSIX: "If the old argument and the new argument 2783 * both refer to links to the same existing file, 2784 * the rename() function shall return successfully 2785 * and perform no other action." 2786 */ 2787 zfs_exit(zfsvfs, FTAG); 2788 return (0); 2789 } 2790 /* 2791 * If the file system is case-folding, then we may 2792 * have some more checking to do. A case-folding file 2793 * system is either supporting mixed case sensitivity 2794 * access or is completely case-insensitive. Note 2795 * that the file system is always case preserving. 2796 * 2797 * In mixed sensitivity mode case sensitive behavior 2798 * is the default. FIGNORECASE must be used to 2799 * explicitly request case insensitive behavior. 2800 * 2801 * If the source and target names provided differ only 2802 * by case (e.g., a request to rename 'tim' to 'Tim'), 2803 * we will treat this as a special case in the 2804 * case-insensitive mode: as long as the source name 2805 * is an exact match, we will allow this to proceed as 2806 * a name-change request. 2807 */ 2808 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 2809 (zfsvfs->z_case == ZFS_CASE_MIXED && 2810 flags & FIGNORECASE)) && 2811 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 2812 &error) == 0) { 2813 /* 2814 * case preserving rename request, require exact 2815 * name matches 2816 */ 2817 zflg |= ZCIEXACT; 2818 zflg &= ~ZCILOOK; 2819 } 2820 } 2821 2822 /* 2823 * If the source and destination directories are the same, we should 2824 * grab the z_name_lock of that directory only once. 2825 */ 2826 if (sdzp == tdzp) { 2827 zflg |= ZHAVELOCK; 2828 rw_enter(&sdzp->z_name_lock, RW_READER); 2829 } 2830 2831 if (cmp < 0) { 2832 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 2833 ZEXISTS | zflg, NULL, NULL); 2834 terr = zfs_dirent_lock(&tdl, 2835 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 2836 } else { 2837 terr = zfs_dirent_lock(&tdl, 2838 tdzp, tnm, &tzp, zflg, NULL, NULL); 2839 serr = zfs_dirent_lock(&sdl, 2840 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 2841 NULL, NULL); 2842 } 2843 2844 if (serr) { 2845 /* 2846 * Source entry invalid or not there. 2847 */ 2848 if (!terr) { 2849 zfs_dirent_unlock(tdl); 2850 if (tzp) 2851 zrele(tzp); 2852 } 2853 2854 if (sdzp == tdzp) 2855 rw_exit(&sdzp->z_name_lock); 2856 2857 if (strcmp(snm, "..") == 0) 2858 serr = EINVAL; 2859 zfs_exit(zfsvfs, FTAG); 2860 return (serr); 2861 } 2862 if (terr) { 2863 zfs_dirent_unlock(sdl); 2864 zrele(szp); 2865 2866 if (sdzp == tdzp) 2867 rw_exit(&sdzp->z_name_lock); 2868 2869 if (strcmp(tnm, "..") == 0) 2870 terr = EINVAL; 2871 zfs_exit(zfsvfs, FTAG); 2872 return (terr); 2873 } 2874 2875 /* 2876 * If we are using project inheritance, means if the directory has 2877 * ZFS_PROJINHERIT set, then its descendant directories will inherit 2878 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 2879 * such case, we only allow renames into our tree when the project 2880 * IDs are the same. 2881 */ 2882 if (tdzp->z_pflags & ZFS_PROJINHERIT && 2883 tdzp->z_projid != szp->z_projid) { 2884 error = SET_ERROR(EXDEV); 2885 goto out; 2886 } 2887 2888 /* 2889 * Must have write access at the source to remove the old entry 2890 * and write access at the target to create the new entry. 2891 * Note that if target and source are the same, this can be 2892 * done in a single check. 2893 */ 2894 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) 2895 goto out; 2896 2897 if (S_ISDIR(ZTOI(szp)->i_mode)) { 2898 /* 2899 * Check to make sure rename is valid. 2900 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2901 */ 2902 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) 2903 goto out; 2904 } 2905 2906 /* 2907 * Does target exist? 2908 */ 2909 if (tzp) { 2910 if (rflags & RENAME_NOREPLACE) { 2911 error = SET_ERROR(EEXIST); 2912 goto out; 2913 } 2914 /* 2915 * Source and target must be the same type (unless exchanging). 2916 */ 2917 if (!(rflags & RENAME_EXCHANGE)) { 2918 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; 2919 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; 2920 2921 if (s_is_dir != t_is_dir) { 2922 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); 2923 goto out; 2924 } 2925 } 2926 /* 2927 * POSIX dictates that when the source and target 2928 * entries refer to the same file object, rename 2929 * must do nothing and exit without error. 2930 */ 2931 if (szp->z_id == tzp->z_id) { 2932 error = 0; 2933 goto out; 2934 } 2935 } else if (rflags & RENAME_EXCHANGE) { 2936 /* Target must exist for RENAME_EXCHANGE. */ 2937 error = SET_ERROR(ENOENT); 2938 goto out; 2939 } 2940 2941 /* Set up inode creation for RENAME_WHITEOUT. */ 2942 if (rflags & RENAME_WHITEOUT) { 2943 /* 2944 * Whiteout files are not regular files or directories, so to 2945 * match zfs_create() we do not inherit the project id. 2946 */ 2947 uint64_t wo_projid = ZFS_DEFAULT_PROJID; 2948 2949 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); 2950 if (error) 2951 goto out; 2952 2953 if (!have_acl) { 2954 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, 2955 &acl_ids, mnt_ns); 2956 if (error) 2957 goto out; 2958 have_acl = B_TRUE; 2959 } 2960 2961 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { 2962 error = SET_ERROR(EDQUOT); 2963 goto out; 2964 } 2965 } 2966 2967 tx = dmu_tx_create(zfsvfs->z_os); 2968 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 2969 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 2970 dmu_tx_hold_zap(tx, sdzp->z_id, 2971 (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); 2972 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 2973 if (sdzp != tdzp) { 2974 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 2975 zfs_sa_upgrade_txholds(tx, tdzp); 2976 } 2977 if (tzp) { 2978 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 2979 zfs_sa_upgrade_txholds(tx, tzp); 2980 } 2981 if (rflags & RENAME_WHITEOUT) { 2982 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2983 ZFS_SA_BASE_ATTR_SIZE); 2984 2985 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); 2986 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 2987 if (!zfsvfs->z_use_sa && 2988 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2989 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2990 0, acl_ids.z_aclp->z_acl_bytes); 2991 } 2992 } 2993 fuid_dirtied = zfsvfs->z_fuid_dirty; 2994 if (fuid_dirtied) 2995 zfs_fuid_txhold(zfsvfs, tx); 2996 zfs_sa_upgrade_txholds(tx, szp); 2997 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2998 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 2999 if (error) { 3000 if (zl != NULL) 3001 zfs_rename_unlock(&zl); 3002 zfs_dirent_unlock(sdl); 3003 zfs_dirent_unlock(tdl); 3004 3005 if (sdzp == tdzp) 3006 rw_exit(&sdzp->z_name_lock); 3007 3008 if (error == ERESTART) { 3009 waited = B_TRUE; 3010 dmu_tx_wait(tx); 3011 dmu_tx_abort(tx); 3012 zrele(szp); 3013 if (tzp) 3014 zrele(tzp); 3015 goto top; 3016 } 3017 dmu_tx_abort(tx); 3018 zrele(szp); 3019 if (tzp) 3020 zrele(tzp); 3021 zfs_exit(zfsvfs, FTAG); 3022 return (error); 3023 } 3024 3025 /* 3026 * Unlink the source. 3027 */ 3028 szp->z_pflags |= ZFS_AV_MODIFIED; 3029 if (tdzp->z_pflags & ZFS_PROJINHERIT) 3030 szp->z_pflags |= ZFS_PROJINHERIT; 3031 3032 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3033 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3034 VERIFY0(error); 3035 3036 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3037 if (error) 3038 goto commit; 3039 3040 /* 3041 * Unlink the target. 3042 */ 3043 if (tzp) { 3044 int tzflg = zflg; 3045 3046 if (rflags & RENAME_EXCHANGE) { 3047 /* This inode will be re-linked soon. */ 3048 tzflg |= ZRENAMING; 3049 3050 tzp->z_pflags |= ZFS_AV_MODIFIED; 3051 if (sdzp->z_pflags & ZFS_PROJINHERIT) 3052 tzp->z_pflags |= ZFS_PROJINHERIT; 3053 3054 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3055 (void *)&tzp->z_pflags, sizeof (uint64_t), tx); 3056 ASSERT0(error); 3057 } 3058 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); 3059 if (error) 3060 goto commit_link_szp; 3061 } 3062 3063 /* 3064 * Create the new target links: 3065 * * We always link the target. 3066 * * RENAME_EXCHANGE: Link the old target to the source. 3067 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. 3068 */ 3069 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3070 if (error) { 3071 /* 3072 * If we have removed the existing target, a subsequent call to 3073 * zfs_link_create() to add back the same entry, but with a new 3074 * dnode (szp), should not fail. 3075 */ 3076 ASSERT3P(tzp, ==, NULL); 3077 goto commit_link_tzp; 3078 } 3079 3080 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3081 case RENAME_EXCHANGE: 3082 error = zfs_link_create(sdl, tzp, tx, ZRENAMING); 3083 /* 3084 * The same argument as zfs_link_create() failing for 3085 * szp applies here, since the source directory must 3086 * have had an entry we are replacing. 3087 */ 3088 ASSERT0(error); 3089 if (error) 3090 goto commit_unlink_td_szp; 3091 break; 3092 case RENAME_WHITEOUT: 3093 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); 3094 error = zfs_link_create(sdl, wzp, tx, ZNEW); 3095 if (error) { 3096 zfs_znode_delete(wzp, tx); 3097 remove_inode_hash(ZTOI(wzp)); 3098 goto commit_unlink_td_szp; 3099 } 3100 break; 3101 } 3102 3103 if (fuid_dirtied) 3104 zfs_fuid_sync(zfsvfs, tx); 3105 3106 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3107 case RENAME_EXCHANGE: 3108 zfs_log_rename_exchange(zilog, tx, 3109 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3110 tdzp, tdl->dl_name, szp); 3111 break; 3112 case RENAME_WHITEOUT: 3113 zfs_log_rename_whiteout(zilog, tx, 3114 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3115 tdzp, tdl->dl_name, szp, wzp); 3116 break; 3117 default: 3118 ASSERT0(rflags & ~RENAME_NOREPLACE); 3119 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), 3120 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3121 break; 3122 } 3123 3124 commit: 3125 dmu_tx_commit(tx); 3126 out: 3127 if (have_acl) 3128 zfs_acl_ids_free(&acl_ids); 3129 3130 zfs_znode_update_vfs(sdzp); 3131 if (sdzp == tdzp) 3132 rw_exit(&sdzp->z_name_lock); 3133 3134 if (sdzp != tdzp) 3135 zfs_znode_update_vfs(tdzp); 3136 3137 zfs_znode_update_vfs(szp); 3138 zrele(szp); 3139 if (wzp) { 3140 zfs_znode_update_vfs(wzp); 3141 zrele(wzp); 3142 } 3143 if (tzp) { 3144 zfs_znode_update_vfs(tzp); 3145 zrele(tzp); 3146 } 3147 3148 if (zl != NULL) 3149 zfs_rename_unlock(&zl); 3150 3151 zfs_dirent_unlock(sdl); 3152 zfs_dirent_unlock(tdl); 3153 3154 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3155 zil_commit(zilog, 0); 3156 3157 zfs_exit(zfsvfs, FTAG); 3158 return (error); 3159 3160 /* 3161 * Clean-up path for broken link state. 3162 * 3163 * At this point we are in a (very) bad state, so we need to do our 3164 * best to correct the state. In particular, all of the nlinks are 3165 * wrong because we were destroying and creating links with ZRENAMING. 3166 * 3167 * In some form, all of these operations have to resolve the state: 3168 * 3169 * * link_destroy() *must* succeed. Fortunately, this is very likely 3170 * since we only just created it. 3171 * 3172 * * link_create()s are allowed to fail (though they shouldn't because 3173 * we only just unlinked them and are putting the entries back 3174 * during clean-up). But if they fail, we can just forcefully drop 3175 * the nlink value to (at the very least) avoid broken nlink values 3176 * -- though in the case of non-empty directories we will have to 3177 * panic (otherwise we'd have a leaked directory with a broken ..). 3178 */ 3179 commit_unlink_td_szp: 3180 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); 3181 commit_link_tzp: 3182 if (tzp) { 3183 if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) 3184 VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); 3185 } 3186 commit_link_szp: 3187 if (zfs_link_create(sdl, szp, tx, ZRENAMING)) 3188 VERIFY0(zfs_drop_nlink(szp, tx, NULL)); 3189 goto commit; 3190 } 3191 3192 /* 3193 * Insert the indicated symbolic reference entry into the directory. 3194 * 3195 * IN: dzp - Directory to contain new symbolic link. 3196 * name - Name of directory entry in dip. 3197 * vap - Attributes of new entry. 3198 * link - Name for new symlink entry. 3199 * cr - credentials of caller. 3200 * flags - case flags 3201 * mnt_ns - user namespace of the mount 3202 * 3203 * OUT: zpp - Znode for new symbolic link. 3204 * 3205 * RETURN: 0 on success, error code on failure. 3206 * 3207 * Timestamps: 3208 * dip - ctime|mtime updated 3209 */ 3210 int 3211 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, 3212 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) 3213 { 3214 znode_t *zp; 3215 zfs_dirlock_t *dl; 3216 dmu_tx_t *tx; 3217 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 3218 zilog_t *zilog; 3219 uint64_t len = strlen(link); 3220 int error; 3221 int zflg = ZNEW; 3222 zfs_acl_ids_t acl_ids; 3223 boolean_t fuid_dirtied; 3224 uint64_t txtype = TX_SYMLINK; 3225 boolean_t waited = B_FALSE; 3226 3227 ASSERT(S_ISLNK(vap->va_mode)); 3228 3229 if (name == NULL) 3230 return (SET_ERROR(EINVAL)); 3231 3232 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 3233 return (error); 3234 zilog = zfsvfs->z_log; 3235 3236 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3237 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3238 zfs_exit(zfsvfs, FTAG); 3239 return (SET_ERROR(EILSEQ)); 3240 } 3241 if (flags & FIGNORECASE) 3242 zflg |= ZCILOOK; 3243 3244 if (len > MAXPATHLEN) { 3245 zfs_exit(zfsvfs, FTAG); 3246 return (SET_ERROR(ENAMETOOLONG)); 3247 } 3248 3249 if ((error = zfs_acl_ids_create(dzp, 0, 3250 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { 3251 zfs_exit(zfsvfs, FTAG); 3252 return (error); 3253 } 3254 top: 3255 *zpp = NULL; 3256 3257 /* 3258 * Attempt to lock directory; fail if entry already exists. 3259 */ 3260 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3261 if (error) { 3262 zfs_acl_ids_free(&acl_ids); 3263 zfs_exit(zfsvfs, FTAG); 3264 return (error); 3265 } 3266 3267 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 3268 zfs_acl_ids_free(&acl_ids); 3269 zfs_dirent_unlock(dl); 3270 zfs_exit(zfsvfs, FTAG); 3271 return (error); 3272 } 3273 3274 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 3275 zfs_acl_ids_free(&acl_ids); 3276 zfs_dirent_unlock(dl); 3277 zfs_exit(zfsvfs, FTAG); 3278 return (SET_ERROR(EDQUOT)); 3279 } 3280 tx = dmu_tx_create(zfsvfs->z_os); 3281 fuid_dirtied = zfsvfs->z_fuid_dirty; 3282 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3283 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3284 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3285 ZFS_SA_BASE_ATTR_SIZE + len); 3286 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3287 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3288 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3289 acl_ids.z_aclp->z_acl_bytes); 3290 } 3291 if (fuid_dirtied) 3292 zfs_fuid_txhold(zfsvfs, tx); 3293 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3294 if (error) { 3295 zfs_dirent_unlock(dl); 3296 if (error == ERESTART) { 3297 waited = B_TRUE; 3298 dmu_tx_wait(tx); 3299 dmu_tx_abort(tx); 3300 goto top; 3301 } 3302 zfs_acl_ids_free(&acl_ids); 3303 dmu_tx_abort(tx); 3304 zfs_exit(zfsvfs, FTAG); 3305 return (error); 3306 } 3307 3308 /* 3309 * Create a new object for the symlink. 3310 * for version 4 ZPL datasets the symlink will be an SA attribute 3311 */ 3312 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 3313 3314 if (fuid_dirtied) 3315 zfs_fuid_sync(zfsvfs, tx); 3316 3317 mutex_enter(&zp->z_lock); 3318 if (zp->z_is_sa) 3319 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 3320 link, len, tx); 3321 else 3322 zfs_sa_symlink(zp, link, len, tx); 3323 mutex_exit(&zp->z_lock); 3324 3325 zp->z_size = len; 3326 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 3327 &zp->z_size, sizeof (zp->z_size), tx); 3328 /* 3329 * Insert the new object into the directory. 3330 */ 3331 error = zfs_link_create(dl, zp, tx, ZNEW); 3332 if (error != 0) { 3333 zfs_znode_delete(zp, tx); 3334 remove_inode_hash(ZTOI(zp)); 3335 } else { 3336 if (flags & FIGNORECASE) 3337 txtype |= TX_CI; 3338 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3339 3340 zfs_znode_update_vfs(dzp); 3341 zfs_znode_update_vfs(zp); 3342 } 3343 3344 zfs_acl_ids_free(&acl_ids); 3345 3346 dmu_tx_commit(tx); 3347 3348 zfs_dirent_unlock(dl); 3349 3350 if (error == 0) { 3351 *zpp = zp; 3352 3353 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3354 zil_commit(zilog, 0); 3355 } else { 3356 zrele(zp); 3357 } 3358 3359 zfs_exit(zfsvfs, FTAG); 3360 return (error); 3361 } 3362 3363 /* 3364 * Return, in the buffer contained in the provided uio structure, 3365 * the symbolic path referred to by ip. 3366 * 3367 * IN: ip - inode of symbolic link 3368 * uio - structure to contain the link path. 3369 * cr - credentials of caller. 3370 * 3371 * RETURN: 0 if success 3372 * error code if failure 3373 * 3374 * Timestamps: 3375 * ip - atime updated 3376 */ 3377 int 3378 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) 3379 { 3380 (void) cr; 3381 znode_t *zp = ITOZ(ip); 3382 zfsvfs_t *zfsvfs = ITOZSB(ip); 3383 int error; 3384 3385 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3386 return (error); 3387 3388 mutex_enter(&zp->z_lock); 3389 if (zp->z_is_sa) 3390 error = sa_lookup_uio(zp->z_sa_hdl, 3391 SA_ZPL_SYMLINK(zfsvfs), uio); 3392 else 3393 error = zfs_sa_readlink(zp, uio); 3394 mutex_exit(&zp->z_lock); 3395 3396 zfs_exit(zfsvfs, FTAG); 3397 return (error); 3398 } 3399 3400 /* 3401 * Insert a new entry into directory tdzp referencing szp. 3402 * 3403 * IN: tdzp - Directory to contain new entry. 3404 * szp - znode of new entry. 3405 * name - name of new entry. 3406 * cr - credentials of caller. 3407 * flags - case flags. 3408 * 3409 * RETURN: 0 if success 3410 * error code if failure 3411 * 3412 * Timestamps: 3413 * tdzp - ctime|mtime updated 3414 * szp - ctime updated 3415 */ 3416 int 3417 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, 3418 int flags) 3419 { 3420 struct inode *sip = ZTOI(szp); 3421 znode_t *tzp; 3422 zfsvfs_t *zfsvfs = ZTOZSB(tdzp); 3423 zilog_t *zilog; 3424 zfs_dirlock_t *dl; 3425 dmu_tx_t *tx; 3426 int error; 3427 int zf = ZNEW; 3428 uint64_t parent; 3429 uid_t owner; 3430 boolean_t waited = B_FALSE; 3431 boolean_t is_tmpfile = 0; 3432 uint64_t txg; 3433 #ifdef HAVE_TMPFILE 3434 is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); 3435 #endif 3436 ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); 3437 3438 if (name == NULL) 3439 return (SET_ERROR(EINVAL)); 3440 3441 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) 3442 return (error); 3443 zilog = zfsvfs->z_log; 3444 3445 /* 3446 * POSIX dictates that we return EPERM here. 3447 * Better choices include ENOTSUP or EISDIR. 3448 */ 3449 if (S_ISDIR(sip->i_mode)) { 3450 zfs_exit(zfsvfs, FTAG); 3451 return (SET_ERROR(EPERM)); 3452 } 3453 3454 if ((error = zfs_verify_zp(szp)) != 0) { 3455 zfs_exit(zfsvfs, FTAG); 3456 return (error); 3457 } 3458 3459 /* 3460 * If we are using project inheritance, means if the directory has 3461 * ZFS_PROJINHERIT set, then its descendant directories will inherit 3462 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 3463 * such case, we only allow hard link creation in our tree when the 3464 * project IDs are the same. 3465 */ 3466 if (tdzp->z_pflags & ZFS_PROJINHERIT && 3467 tdzp->z_projid != szp->z_projid) { 3468 zfs_exit(zfsvfs, FTAG); 3469 return (SET_ERROR(EXDEV)); 3470 } 3471 3472 /* 3473 * We check i_sb because snapshots and the ctldir must have different 3474 * super blocks. 3475 */ 3476 if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { 3477 zfs_exit(zfsvfs, FTAG); 3478 return (SET_ERROR(EXDEV)); 3479 } 3480 3481 /* Prevent links to .zfs/shares files */ 3482 3483 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 3484 &parent, sizeof (uint64_t))) != 0) { 3485 zfs_exit(zfsvfs, FTAG); 3486 return (error); 3487 } 3488 if (parent == zfsvfs->z_shares_dir) { 3489 zfs_exit(zfsvfs, FTAG); 3490 return (SET_ERROR(EPERM)); 3491 } 3492 3493 if (zfsvfs->z_utf8 && u8_validate(name, 3494 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3495 zfs_exit(zfsvfs, FTAG); 3496 return (SET_ERROR(EILSEQ)); 3497 } 3498 if (flags & FIGNORECASE) 3499 zf |= ZCILOOK; 3500 3501 /* 3502 * We do not support links between attributes and non-attributes 3503 * because of the potential security risk of creating links 3504 * into "normal" file space in order to circumvent restrictions 3505 * imposed in attribute space. 3506 */ 3507 if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { 3508 zfs_exit(zfsvfs, FTAG); 3509 return (SET_ERROR(EINVAL)); 3510 } 3511 3512 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), 3513 cr, ZFS_OWNER); 3514 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 3515 zfs_exit(zfsvfs, FTAG); 3516 return (SET_ERROR(EPERM)); 3517 } 3518 3519 if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, 3520 zfs_init_idmap))) { 3521 zfs_exit(zfsvfs, FTAG); 3522 return (error); 3523 } 3524 3525 top: 3526 /* 3527 * Attempt to lock directory; fail if entry already exists. 3528 */ 3529 error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); 3530 if (error) { 3531 zfs_exit(zfsvfs, FTAG); 3532 return (error); 3533 } 3534 3535 tx = dmu_tx_create(zfsvfs->z_os); 3536 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3537 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); 3538 if (is_tmpfile) 3539 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3540 3541 zfs_sa_upgrade_txholds(tx, szp); 3542 zfs_sa_upgrade_txholds(tx, tdzp); 3543 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3544 if (error) { 3545 zfs_dirent_unlock(dl); 3546 if (error == ERESTART) { 3547 waited = B_TRUE; 3548 dmu_tx_wait(tx); 3549 dmu_tx_abort(tx); 3550 goto top; 3551 } 3552 dmu_tx_abort(tx); 3553 zfs_exit(zfsvfs, FTAG); 3554 return (error); 3555 } 3556 /* unmark z_unlinked so zfs_link_create will not reject */ 3557 if (is_tmpfile) 3558 szp->z_unlinked = B_FALSE; 3559 error = zfs_link_create(dl, szp, tx, 0); 3560 3561 if (error == 0) { 3562 uint64_t txtype = TX_LINK; 3563 /* 3564 * tmpfile is created to be in z_unlinkedobj, so remove it. 3565 * Also, we don't log in ZIL, because all previous file 3566 * operation on the tmpfile are ignored by ZIL. Instead we 3567 * always wait for txg to sync to make sure all previous 3568 * operation are sync safe. 3569 */ 3570 if (is_tmpfile) { 3571 VERIFY(zap_remove_int(zfsvfs->z_os, 3572 zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); 3573 } else { 3574 if (flags & FIGNORECASE) 3575 txtype |= TX_CI; 3576 zfs_log_link(zilog, tx, txtype, tdzp, szp, name); 3577 } 3578 } else if (is_tmpfile) { 3579 /* restore z_unlinked since when linking failed */ 3580 szp->z_unlinked = B_TRUE; 3581 } 3582 txg = dmu_tx_get_txg(tx); 3583 dmu_tx_commit(tx); 3584 3585 zfs_dirent_unlock(dl); 3586 3587 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3588 zil_commit(zilog, 0); 3589 3590 if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) 3591 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); 3592 3593 zfs_znode_update_vfs(tdzp); 3594 zfs_znode_update_vfs(szp); 3595 zfs_exit(zfsvfs, FTAG); 3596 return (error); 3597 } 3598 3599 static void 3600 zfs_putpage_sync_commit_cb(void *arg) 3601 { 3602 struct page *pp = arg; 3603 3604 ClearPageError(pp); 3605 end_page_writeback(pp); 3606 } 3607 3608 static void 3609 zfs_putpage_async_commit_cb(void *arg) 3610 { 3611 struct page *pp = arg; 3612 znode_t *zp = ITOZ(pp->mapping->host); 3613 3614 ClearPageError(pp); 3615 end_page_writeback(pp); 3616 atomic_dec_32(&zp->z_async_writes_cnt); 3617 } 3618 3619 /* 3620 * Push a page out to disk, once the page is on stable storage the 3621 * registered commit callback will be run as notification of completion. 3622 * 3623 * IN: ip - page mapped for inode. 3624 * pp - page to push (page is locked) 3625 * wbc - writeback control data 3626 * for_sync - does the caller intend to wait synchronously for the 3627 * page writeback to complete? 3628 * 3629 * RETURN: 0 if success 3630 * error code if failure 3631 * 3632 * Timestamps: 3633 * ip - ctime|mtime updated 3634 */ 3635 int 3636 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, 3637 boolean_t for_sync) 3638 { 3639 znode_t *zp = ITOZ(ip); 3640 zfsvfs_t *zfsvfs = ITOZSB(ip); 3641 loff_t offset; 3642 loff_t pgoff; 3643 unsigned int pglen; 3644 dmu_tx_t *tx; 3645 caddr_t va; 3646 int err = 0; 3647 uint64_t mtime[2], ctime[2]; 3648 sa_bulk_attr_t bulk[3]; 3649 int cnt = 0; 3650 struct address_space *mapping; 3651 3652 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3653 return (err); 3654 3655 ASSERT(PageLocked(pp)); 3656 3657 pgoff = page_offset(pp); /* Page byte-offset in file */ 3658 offset = i_size_read(ip); /* File length in bytes */ 3659 pglen = MIN(PAGE_SIZE, /* Page length in bytes */ 3660 P2ROUNDUP(offset, PAGE_SIZE)-pgoff); 3661 3662 /* Page is beyond end of file */ 3663 if (pgoff >= offset) { 3664 unlock_page(pp); 3665 zfs_exit(zfsvfs, FTAG); 3666 return (0); 3667 } 3668 3669 /* Truncate page length to end of file */ 3670 if (pgoff + pglen > offset) 3671 pglen = offset - pgoff; 3672 3673 #if 0 3674 /* 3675 * FIXME: Allow mmap writes past its quota. The correct fix 3676 * is to register a page_mkwrite() handler to count the page 3677 * against its quota when it is about to be dirtied. 3678 */ 3679 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, 3680 KUID_TO_SUID(ip->i_uid)) || 3681 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, 3682 KGID_TO_SGID(ip->i_gid)) || 3683 (zp->z_projid != ZFS_DEFAULT_PROJID && 3684 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 3685 zp->z_projid))) { 3686 err = EDQUOT; 3687 } 3688 #endif 3689 3690 /* 3691 * The ordering here is critical and must adhere to the following 3692 * rules in order to avoid deadlocking in either zfs_read() or 3693 * zfs_free_range() due to a lock inversion. 3694 * 3695 * 1) The page must be unlocked prior to acquiring the range lock. 3696 * This is critical because zfs_read() calls find_lock_page() 3697 * which may block on the page lock while holding the range lock. 3698 * 3699 * 2) Before setting or clearing write back on a page the range lock 3700 * must be held in order to prevent a lock inversion with the 3701 * zfs_free_range() function. 3702 * 3703 * This presents a problem because upon entering this function the 3704 * page lock is already held. To safely acquire the range lock the 3705 * page lock must be dropped. This creates a window where another 3706 * process could truncate, invalidate, dirty, or write out the page. 3707 * 3708 * Therefore, after successfully reacquiring the range and page locks 3709 * the current page state is checked. In the common case everything 3710 * will be as is expected and it can be written out. However, if 3711 * the page state has changed it must be handled accordingly. 3712 */ 3713 mapping = pp->mapping; 3714 redirty_page_for_writepage(wbc, pp); 3715 unlock_page(pp); 3716 3717 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, 3718 pgoff, pglen, RL_WRITER); 3719 lock_page(pp); 3720 3721 /* Page mapping changed or it was no longer dirty, we're done */ 3722 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { 3723 unlock_page(pp); 3724 zfs_rangelock_exit(lr); 3725 zfs_exit(zfsvfs, FTAG); 3726 return (0); 3727 } 3728 3729 /* Another process started write block if required */ 3730 if (PageWriteback(pp)) { 3731 unlock_page(pp); 3732 zfs_rangelock_exit(lr); 3733 3734 if (wbc->sync_mode != WB_SYNC_NONE) { 3735 /* 3736 * Speed up any non-sync page writebacks since 3737 * they may take several seconds to complete. 3738 * Refer to the comment in zpl_fsync() (when 3739 * HAVE_FSYNC_RANGE is defined) for details. 3740 */ 3741 if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { 3742 zil_commit(zfsvfs->z_log, zp->z_id); 3743 } 3744 3745 if (PageWriteback(pp)) 3746 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT 3747 folio_wait_bit(page_folio(pp), PG_writeback); 3748 #else 3749 wait_on_page_bit(pp, PG_writeback); 3750 #endif 3751 } 3752 3753 zfs_exit(zfsvfs, FTAG); 3754 return (0); 3755 } 3756 3757 /* Clear the dirty flag the required locks are held */ 3758 if (!clear_page_dirty_for_io(pp)) { 3759 unlock_page(pp); 3760 zfs_rangelock_exit(lr); 3761 zfs_exit(zfsvfs, FTAG); 3762 return (0); 3763 } 3764 3765 /* 3766 * Counterpart for redirty_page_for_writepage() above. This page 3767 * was in fact not skipped and should not be counted as if it were. 3768 */ 3769 wbc->pages_skipped--; 3770 if (!for_sync) 3771 atomic_inc_32(&zp->z_async_writes_cnt); 3772 set_page_writeback(pp); 3773 unlock_page(pp); 3774 3775 tx = dmu_tx_create(zfsvfs->z_os); 3776 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); 3777 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3778 zfs_sa_upgrade_txholds(tx, zp); 3779 3780 err = dmu_tx_assign(tx, TXG_NOWAIT); 3781 if (err != 0) { 3782 if (err == ERESTART) 3783 dmu_tx_wait(tx); 3784 3785 dmu_tx_abort(tx); 3786 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO 3787 filemap_dirty_folio(page_mapping(pp), page_folio(pp)); 3788 #else 3789 __set_page_dirty_nobuffers(pp); 3790 #endif 3791 ClearPageError(pp); 3792 end_page_writeback(pp); 3793 if (!for_sync) 3794 atomic_dec_32(&zp->z_async_writes_cnt); 3795 zfs_rangelock_exit(lr); 3796 zfs_exit(zfsvfs, FTAG); 3797 return (err); 3798 } 3799 3800 va = kmap(pp); 3801 ASSERT3U(pglen, <=, PAGE_SIZE); 3802 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); 3803 kunmap(pp); 3804 3805 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3806 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3807 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, 3808 &zp->z_pflags, 8); 3809 3810 /* Preserve the mtime and ctime provided by the inode */ 3811 ZFS_TIME_ENCODE(&ip->i_mtime, mtime); 3812 ZFS_TIME_ENCODE(&ip->i_ctime, ctime); 3813 zp->z_atime_dirty = B_FALSE; 3814 zp->z_seq++; 3815 3816 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3817 3818 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, 3819 for_sync ? zfs_putpage_sync_commit_cb : 3820 zfs_putpage_async_commit_cb, pp); 3821 3822 dmu_tx_commit(tx); 3823 3824 zfs_rangelock_exit(lr); 3825 3826 if (wbc->sync_mode != WB_SYNC_NONE) { 3827 /* 3828 * Note that this is rarely called under writepages(), because 3829 * writepages() normally handles the entire commit for 3830 * performance reasons. 3831 */ 3832 zil_commit(zfsvfs->z_log, zp->z_id); 3833 } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { 3834 /* 3835 * If the caller does not intend to wait synchronously 3836 * for this page writeback to complete and there are active 3837 * synchronous calls on this file, do a commit so that 3838 * the latter don't accidentally end up waiting for 3839 * our writeback to complete. Refer to the comment in 3840 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. 3841 */ 3842 zil_commit(zfsvfs->z_log, zp->z_id); 3843 } 3844 3845 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); 3846 3847 zfs_exit(zfsvfs, FTAG); 3848 return (err); 3849 } 3850 3851 /* 3852 * Update the system attributes when the inode has been dirtied. For the 3853 * moment we only update the mode, atime, mtime, and ctime. 3854 */ 3855 int 3856 zfs_dirty_inode(struct inode *ip, int flags) 3857 { 3858 znode_t *zp = ITOZ(ip); 3859 zfsvfs_t *zfsvfs = ITOZSB(ip); 3860 dmu_tx_t *tx; 3861 uint64_t mode, atime[2], mtime[2], ctime[2]; 3862 sa_bulk_attr_t bulk[4]; 3863 int error = 0; 3864 int cnt = 0; 3865 3866 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 3867 return (0); 3868 3869 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3870 return (error); 3871 3872 #ifdef I_DIRTY_TIME 3873 /* 3874 * This is the lazytime semantic introduced in Linux 4.0 3875 * This flag will only be called from update_time when lazytime is set. 3876 * (Note, I_DIRTY_SYNC will also set if not lazytime) 3877 * Fortunately mtime and ctime are managed within ZFS itself, so we 3878 * only need to dirty atime. 3879 */ 3880 if (flags == I_DIRTY_TIME) { 3881 zp->z_atime_dirty = B_TRUE; 3882 goto out; 3883 } 3884 #endif 3885 3886 tx = dmu_tx_create(zfsvfs->z_os); 3887 3888 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3889 zfs_sa_upgrade_txholds(tx, zp); 3890 3891 error = dmu_tx_assign(tx, TXG_WAIT); 3892 if (error) { 3893 dmu_tx_abort(tx); 3894 goto out; 3895 } 3896 3897 mutex_enter(&zp->z_lock); 3898 zp->z_atime_dirty = B_FALSE; 3899 3900 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 3901 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 3902 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3903 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3904 3905 /* Preserve the mode, mtime and ctime provided by the inode */ 3906 ZFS_TIME_ENCODE(&ip->i_atime, atime); 3907 ZFS_TIME_ENCODE(&ip->i_mtime, mtime); 3908 ZFS_TIME_ENCODE(&ip->i_ctime, ctime); 3909 mode = ip->i_mode; 3910 3911 zp->z_mode = mode; 3912 3913 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3914 mutex_exit(&zp->z_lock); 3915 3916 dmu_tx_commit(tx); 3917 out: 3918 zfs_exit(zfsvfs, FTAG); 3919 return (error); 3920 } 3921 3922 void 3923 zfs_inactive(struct inode *ip) 3924 { 3925 znode_t *zp = ITOZ(ip); 3926 zfsvfs_t *zfsvfs = ITOZSB(ip); 3927 uint64_t atime[2]; 3928 int error; 3929 int need_unlock = 0; 3930 3931 /* Only read lock if we haven't already write locked, e.g. rollback */ 3932 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { 3933 need_unlock = 1; 3934 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3935 } 3936 if (zp->z_sa_hdl == NULL) { 3937 if (need_unlock) 3938 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3939 return; 3940 } 3941 3942 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { 3943 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3944 3945 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3946 zfs_sa_upgrade_txholds(tx, zp); 3947 error = dmu_tx_assign(tx, TXG_WAIT); 3948 if (error) { 3949 dmu_tx_abort(tx); 3950 } else { 3951 ZFS_TIME_ENCODE(&ip->i_atime, atime); 3952 mutex_enter(&zp->z_lock); 3953 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 3954 (void *)&atime, sizeof (atime), tx); 3955 zp->z_atime_dirty = B_FALSE; 3956 mutex_exit(&zp->z_lock); 3957 dmu_tx_commit(tx); 3958 } 3959 } 3960 3961 zfs_zinactive(zp); 3962 if (need_unlock) 3963 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3964 } 3965 3966 /* 3967 * Fill pages with data from the disk. 3968 */ 3969 static int 3970 zfs_fillpage(struct inode *ip, struct page *pp) 3971 { 3972 zfsvfs_t *zfsvfs = ITOZSB(ip); 3973 loff_t i_size = i_size_read(ip); 3974 u_offset_t io_off = page_offset(pp); 3975 size_t io_len = PAGE_SIZE; 3976 3977 ASSERT3U(io_off, <, i_size); 3978 3979 if (io_off + io_len > i_size) 3980 io_len = i_size - io_off; 3981 3982 void *va = kmap(pp); 3983 int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, 3984 io_len, va, DMU_READ_PREFETCH); 3985 if (io_len != PAGE_SIZE) 3986 memset((char *)va + io_len, 0, PAGE_SIZE - io_len); 3987 kunmap(pp); 3988 3989 if (error) { 3990 /* convert checksum errors into IO errors */ 3991 if (error == ECKSUM) 3992 error = SET_ERROR(EIO); 3993 3994 SetPageError(pp); 3995 ClearPageUptodate(pp); 3996 } else { 3997 ClearPageError(pp); 3998 SetPageUptodate(pp); 3999 } 4000 4001 return (error); 4002 } 4003 4004 /* 4005 * Uses zfs_fillpage to read data from the file and fill the page. 4006 * 4007 * IN: ip - inode of file to get data from. 4008 * pp - page to read 4009 * 4010 * RETURN: 0 on success, error code on failure. 4011 * 4012 * Timestamps: 4013 * vp - atime updated 4014 */ 4015 int 4016 zfs_getpage(struct inode *ip, struct page *pp) 4017 { 4018 zfsvfs_t *zfsvfs = ITOZSB(ip); 4019 znode_t *zp = ITOZ(ip); 4020 int error; 4021 4022 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4023 return (error); 4024 4025 error = zfs_fillpage(ip, pp); 4026 if (error == 0) 4027 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); 4028 4029 zfs_exit(zfsvfs, FTAG); 4030 4031 return (error); 4032 } 4033 4034 /* 4035 * Check ZFS specific permissions to memory map a section of a file. 4036 * 4037 * IN: ip - inode of the file to mmap 4038 * off - file offset 4039 * addrp - start address in memory region 4040 * len - length of memory region 4041 * vm_flags- address flags 4042 * 4043 * RETURN: 0 if success 4044 * error code if failure 4045 */ 4046 int 4047 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, 4048 unsigned long vm_flags) 4049 { 4050 (void) addrp; 4051 znode_t *zp = ITOZ(ip); 4052 zfsvfs_t *zfsvfs = ITOZSB(ip); 4053 int error; 4054 4055 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4056 return (error); 4057 4058 if ((vm_flags & VM_WRITE) && (zp->z_pflags & 4059 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 4060 zfs_exit(zfsvfs, FTAG); 4061 return (SET_ERROR(EPERM)); 4062 } 4063 4064 if ((vm_flags & (VM_READ | VM_EXEC)) && 4065 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 4066 zfs_exit(zfsvfs, FTAG); 4067 return (SET_ERROR(EACCES)); 4068 } 4069 4070 if (off < 0 || len > MAXOFFSET_T - off) { 4071 zfs_exit(zfsvfs, FTAG); 4072 return (SET_ERROR(ENXIO)); 4073 } 4074 4075 zfs_exit(zfsvfs, FTAG); 4076 return (0); 4077 } 4078 4079 /* 4080 * Free or allocate space in a file. Currently, this function only 4081 * supports the `F_FREESP' command. However, this command is somewhat 4082 * misnamed, as its functionality includes the ability to allocate as 4083 * well as free space. 4084 * 4085 * IN: zp - znode of file to free data in. 4086 * cmd - action to take (only F_FREESP supported). 4087 * bfp - section of file to free/alloc. 4088 * flag - current file open mode flags. 4089 * offset - current file offset. 4090 * cr - credentials of caller. 4091 * 4092 * RETURN: 0 on success, error code on failure. 4093 * 4094 * Timestamps: 4095 * zp - ctime|mtime updated 4096 */ 4097 int 4098 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, 4099 offset_t offset, cred_t *cr) 4100 { 4101 (void) offset; 4102 zfsvfs_t *zfsvfs = ZTOZSB(zp); 4103 uint64_t off, len; 4104 int error; 4105 4106 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4107 return (error); 4108 4109 if (cmd != F_FREESP) { 4110 zfs_exit(zfsvfs, FTAG); 4111 return (SET_ERROR(EINVAL)); 4112 } 4113 4114 /* 4115 * Callers might not be able to detect properly that we are read-only, 4116 * so check it explicitly here. 4117 */ 4118 if (zfs_is_readonly(zfsvfs)) { 4119 zfs_exit(zfsvfs, FTAG); 4120 return (SET_ERROR(EROFS)); 4121 } 4122 4123 if (bfp->l_len < 0) { 4124 zfs_exit(zfsvfs, FTAG); 4125 return (SET_ERROR(EINVAL)); 4126 } 4127 4128 /* 4129 * Permissions aren't checked on Solaris because on this OS 4130 * zfs_space() can only be called with an opened file handle. 4131 * On Linux we can get here through truncate_range() which 4132 * operates directly on inodes, so we need to check access rights. 4133 */ 4134 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 4135 zfs_init_idmap))) { 4136 zfs_exit(zfsvfs, FTAG); 4137 return (error); 4138 } 4139 4140 off = bfp->l_start; 4141 len = bfp->l_len; /* 0 means from off to end of file */ 4142 4143 error = zfs_freesp(zp, off, len, flag, TRUE); 4144 4145 zfs_exit(zfsvfs, FTAG); 4146 return (error); 4147 } 4148 4149 int 4150 zfs_fid(struct inode *ip, fid_t *fidp) 4151 { 4152 znode_t *zp = ITOZ(ip); 4153 zfsvfs_t *zfsvfs = ITOZSB(ip); 4154 uint32_t gen; 4155 uint64_t gen64; 4156 uint64_t object = zp->z_id; 4157 zfid_short_t *zfid; 4158 int size, i, error; 4159 4160 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 4161 return (error); 4162 4163 if (fidp->fid_len < SHORT_FID_LEN) { 4164 fidp->fid_len = SHORT_FID_LEN; 4165 zfs_exit(zfsvfs, FTAG); 4166 return (SET_ERROR(ENOSPC)); 4167 } 4168 4169 if ((error = zfs_verify_zp(zp)) != 0) { 4170 zfs_exit(zfsvfs, FTAG); 4171 return (error); 4172 } 4173 4174 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4175 &gen64, sizeof (uint64_t))) != 0) { 4176 zfs_exit(zfsvfs, FTAG); 4177 return (error); 4178 } 4179 4180 gen = (uint32_t)gen64; 4181 4182 size = SHORT_FID_LEN; 4183 4184 zfid = (zfid_short_t *)fidp; 4185 4186 zfid->zf_len = size; 4187 4188 for (i = 0; i < sizeof (zfid->zf_object); i++) 4189 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4190 4191 /* Must have a non-zero generation number to distinguish from .zfs */ 4192 if (gen == 0) 4193 gen = 1; 4194 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4195 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4196 4197 zfs_exit(zfsvfs, FTAG); 4198 return (0); 4199 } 4200 4201 #if defined(_KERNEL) 4202 EXPORT_SYMBOL(zfs_open); 4203 EXPORT_SYMBOL(zfs_close); 4204 EXPORT_SYMBOL(zfs_lookup); 4205 EXPORT_SYMBOL(zfs_create); 4206 EXPORT_SYMBOL(zfs_tmpfile); 4207 EXPORT_SYMBOL(zfs_remove); 4208 EXPORT_SYMBOL(zfs_mkdir); 4209 EXPORT_SYMBOL(zfs_rmdir); 4210 EXPORT_SYMBOL(zfs_readdir); 4211 EXPORT_SYMBOL(zfs_getattr_fast); 4212 EXPORT_SYMBOL(zfs_setattr); 4213 EXPORT_SYMBOL(zfs_rename); 4214 EXPORT_SYMBOL(zfs_symlink); 4215 EXPORT_SYMBOL(zfs_readlink); 4216 EXPORT_SYMBOL(zfs_link); 4217 EXPORT_SYMBOL(zfs_inactive); 4218 EXPORT_SYMBOL(zfs_space); 4219 EXPORT_SYMBOL(zfs_fid); 4220 EXPORT_SYMBOL(zfs_getpage); 4221 EXPORT_SYMBOL(zfs_putpage); 4222 EXPORT_SYMBOL(zfs_dirty_inode); 4223 EXPORT_SYMBOL(zfs_map); 4224 4225 /* CSTYLED */ 4226 module_param(zfs_delete_blocks, ulong, 0644); 4227 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); 4228 4229 #endif 4230