1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29 /* Portions Copyright 2007 Jeremy Teo */ 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/time.h> 36 #include <sys/sysmacros.h> 37 #include <sys/vfs.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/kmem.h> 41 #include <sys/taskq.h> 42 #include <sys/uio.h> 43 #include <sys/vmsystm.h> 44 #include <sys/atomic.h> 45 #include <sys/pathname.h> 46 #include <sys/cmn_err.h> 47 #include <sys/errno.h> 48 #include <sys/zfs_dir.h> 49 #include <sys/zfs_acl.h> 50 #include <sys/zfs_ioctl.h> 51 #include <sys/fs/zfs.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_objset.h> 54 #include <sys/spa.h> 55 #include <sys/txg.h> 56 #include <sys/dbuf.h> 57 #include <sys/zap.h> 58 #include <sys/sa.h> 59 #include <sys/policy.h> 60 #include <sys/sunddi.h> 61 #include <sys/sid.h> 62 #include <sys/zfs_ctldir.h> 63 #include <sys/zfs_fuid.h> 64 #include <sys/zfs_quota.h> 65 #include <sys/zfs_sa.h> 66 #include <sys/zfs_vnops.h> 67 #include <sys/zfs_rlock.h> 68 #include <sys/cred.h> 69 #include <sys/zpl.h> 70 #include <sys/zil.h> 71 #include <sys/sa_impl.h> 72 73 /* 74 * Programming rules. 75 * 76 * Each vnode op performs some logical unit of work. To do this, the ZPL must 77 * properly lock its in-core state, create a DMU transaction, do the work, 78 * record this work in the intent log (ZIL), commit the DMU transaction, 79 * and wait for the intent log to commit if it is a synchronous operation. 80 * Moreover, the vnode ops must work in both normal and log replay context. 81 * The ordering of events is important to avoid deadlocks and references 82 * to freed memory. The example below illustrates the following Big Rules: 83 * 84 * (1) A check must be made in each zfs thread for a mounted file system. 85 * This is done avoiding races using zfs_enter(zfsvfs). 86 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes 87 * must be checked with zfs_verify_zp(zp). 
Both of these macros 88 * can return EIO from the calling function. 89 * 90 * (2) zrele() should always be the last thing except for zil_commit() (if 91 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the 92 * last reference, the vnode/znode can be freed, so the zp may point to 93 * freed memory. Second, the last reference will call zfs_zinactive(), 94 * which may induce a lot of work -- pushing cached pages (which acquires 95 * range locks) and syncing out cached atime changes. Third, 96 * zfs_zinactive() may require a new tx, which could deadlock the system 97 * if you were already holding one. This deadlock occurs because the tx 98 * currently being operated on prevents a txg from syncing, which 99 * prevents the new tx from progressing, resulting in a deadlock. If you 100 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() 101 * is a synonym for zrele(). 102 * 103 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 104 * as they can span dmu_tx_assign() calls. 105 * 106 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 107 * dmu_tx_assign(). This is critical because we don't want to block 108 * while holding locks. 109 * 110 * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This 111 * reduces lock contention and CPU usage when we must wait (note that if 112 * throughput is constrained by the storage, nearly every transaction 113 * must wait). 114 * 115 * Note, in particular, that if a lock is sometimes acquired before 116 * the tx assigns, and sometimes after (e.g. z_lock), then failing 117 * to use a non-blocking assign can deadlock the system. The scenario: 118 * 119 * Thread A has grabbed a lock before calling dmu_tx_assign(). 120 * Thread B is in an already-assigned tx, and blocks for this lock. 121 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 122 * forever, because the previous txg can't quiesce until B's tx commits. 
123 * 124 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 125 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 126 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 127 * to indicate that this operation has already called dmu_tx_wait(). 128 * This will ensure that we don't retry forever, waiting a short bit 129 * each time. 130 * 131 * (5) If the operation succeeded, generate the intent log entry for it 132 * before dropping locks. This ensures that the ordering of events 133 * in the intent log matches the order in which they actually occurred. 134 * During ZIL replay the zfs_log_* functions will update the sequence 135 * number to indicate the zil transaction has replayed. 136 * 137 * (6) At the end of each vnode op, the DMU tx must always commit, 138 * regardless of whether there were any errors. 139 * 140 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 141 * to ensure that synchronous semantics are provided when necessary. 142 * 143 * In general, this is how things should be ordered in each vnode op: 144 * 145 * zfs_enter(zfsvfs); // exit if unmounted 146 * top: 147 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) 148 * rw_enter(...); // grab any other locks you need 149 * tx = dmu_tx_create(...); // get DMU tx 150 * dmu_tx_hold_*(); // hold each object you might modify 151 * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 152 * if (error) { 153 * rw_exit(...); // drop locks 154 * zfs_dirent_unlock(dl); // unlock directory entry 155 * zrele(...); // release held znodes 156 * if (error == ERESTART) { 157 * waited = B_TRUE; 158 * dmu_tx_wait(tx); 159 * dmu_tx_abort(tx); 160 * goto top; 161 * } 162 * dmu_tx_abort(tx); // abort DMU tx 163 * zfs_exit(zfsvfs); // finished in zfs 164 * return (error); // really out of space 165 * } 166 * error = do_real_work(); // do whatever this VOP does 167 * if (error == 0) 168 * zfs_log_*(...); // on success, make ZIL entry 169 * dmu_tx_commit(tx); // commit DMU tx -- error or not 170 * rw_exit(...); // drop locks 171 * zfs_dirent_unlock(dl); // unlock directory entry 172 * zrele(...); // release held znodes 173 * zil_commit(zilog, foid); // synchronous when necessary 174 * zfs_exit(zfsvfs); // finished in zfs 175 * return (error); // done, report error 176 */ 177 int 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) 179 { 180 (void) cr; 181 znode_t *zp = ITOZ(ip); 182 zfsvfs_t *zfsvfs = ITOZSB(ip); 183 int error; 184 185 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 186 return (error); 187 188 /* Honor ZFS_APPENDONLY file attribute */ 189 if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && 190 ((flag & O_APPEND) == 0)) { 191 zfs_exit(zfsvfs, FTAG); 192 return (SET_ERROR(EPERM)); 193 } 194 195 /* 196 * Keep a count of the synchronous opens in the znode. On first 197 * synchronous open we must convert all previous async transactions 198 * into sync to keep correct ordering. 
199 */ 200 if (flag & O_SYNC) { 201 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) 202 zil_async_to_sync(zfsvfs->z_log, zp->z_id); 203 } 204 205 zfs_exit(zfsvfs, FTAG); 206 return (0); 207 } 208 209 int 210 zfs_close(struct inode *ip, int flag, cred_t *cr) 211 { 212 (void) cr; 213 znode_t *zp = ITOZ(ip); 214 zfsvfs_t *zfsvfs = ITOZSB(ip); 215 int error; 216 217 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 218 return (error); 219 220 /* Decrement the synchronous opens in the znode */ 221 if (flag & O_SYNC) 222 atomic_dec_32(&zp->z_sync_cnt); 223 224 zfs_exit(zfsvfs, FTAG); 225 return (0); 226 } 227 228 #if defined(_KERNEL) 229 230 static int zfs_fillpage(struct inode *ip, struct page *pp); 231 232 /* 233 * When a file is memory mapped, we must keep the IO data synchronized 234 * between the DMU cache and the memory mapped pages. Update all mapped 235 * pages with the contents of the coresponding dmu buffer. 236 */ 237 void 238 update_pages(znode_t *zp, int64_t start, int len, objset_t *os) 239 { 240 struct address_space *mp = ZTOI(zp)->i_mapping; 241 int64_t off = start & (PAGE_SIZE - 1); 242 243 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 244 uint64_t nbytes = MIN(PAGE_SIZE - off, len); 245 246 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); 247 if (pp) { 248 if (mapping_writably_mapped(mp)) 249 flush_dcache_page(pp); 250 251 void *pb = kmap(pp); 252 int error = dmu_read(os, zp->z_id, start + off, 253 nbytes, pb + off, DMU_READ_PREFETCH); 254 kunmap(pp); 255 256 if (error) { 257 SetPageError(pp); 258 ClearPageUptodate(pp); 259 } else { 260 ClearPageError(pp); 261 SetPageUptodate(pp); 262 263 if (mapping_writably_mapped(mp)) 264 flush_dcache_page(pp); 265 266 mark_page_accessed(pp); 267 } 268 269 unlock_page(pp); 270 put_page(pp); 271 } 272 273 len -= nbytes; 274 off = 0; 275 } 276 } 277 278 /* 279 * When a file is memory mapped, we must keep the I/O data synchronized 280 * between the DMU cache and the memory mapped pages. 
Preferentially read 281 * from memory mapped pages, otherwise fallback to reading through the dmu. 282 */ 283 int 284 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) 285 { 286 struct inode *ip = ZTOI(zp); 287 struct address_space *mp = ip->i_mapping; 288 int64_t start = uio->uio_loffset; 289 int64_t off = start & (PAGE_SIZE - 1); 290 int len = nbytes; 291 int error = 0; 292 293 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 294 uint64_t bytes = MIN(PAGE_SIZE - off, len); 295 296 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); 297 if (pp) { 298 /* 299 * If filemap_fault() retries there exists a window 300 * where the page will be unlocked and not up to date. 301 * In this case we must try and fill the page. 302 */ 303 if (unlikely(!PageUptodate(pp))) { 304 error = zfs_fillpage(ip, pp); 305 if (error) { 306 unlock_page(pp); 307 put_page(pp); 308 return (error); 309 } 310 } 311 312 ASSERT(PageUptodate(pp) || PageDirty(pp)); 313 314 unlock_page(pp); 315 316 void *pb = kmap(pp); 317 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); 318 kunmap(pp); 319 320 if (mapping_writably_mapped(mp)) 321 flush_dcache_page(pp); 322 323 mark_page_accessed(pp); 324 put_page(pp); 325 } else { 326 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 327 uio, bytes); 328 } 329 330 len -= bytes; 331 off = 0; 332 333 if (error) 334 break; 335 } 336 337 return (error); 338 } 339 #endif /* _KERNEL */ 340 341 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; 342 343 /* 344 * Write the bytes to a file. 345 * 346 * IN: zp - znode of file to be written to 347 * data - bytes to write 348 * len - number of bytes to write 349 * pos - offset to start writing at 350 * 351 * OUT: resid - remaining bytes to write 352 * 353 * RETURN: 0 if success 354 * positive error code if failure. EIO is returned 355 * for a short write when residp isn't provided. 
356 * 357 * Timestamps: 358 * zp - ctime|mtime updated if byte count > 0 359 */ 360 int 361 zfs_write_simple(znode_t *zp, const void *data, size_t len, 362 loff_t pos, size_t *residp) 363 { 364 fstrans_cookie_t cookie; 365 int error; 366 367 struct iovec iov; 368 iov.iov_base = (void *)data; 369 iov.iov_len = len; 370 371 zfs_uio_t uio; 372 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); 373 374 cookie = spl_fstrans_mark(); 375 error = zfs_write(zp, &uio, 0, kcred); 376 spl_fstrans_unmark(cookie); 377 378 if (error == 0) { 379 if (residp != NULL) 380 *residp = zfs_uio_resid(&uio); 381 else if (zfs_uio_resid(&uio) != 0) 382 error = SET_ERROR(EIO); 383 } 384 385 return (error); 386 } 387 388 static void 389 zfs_rele_async_task(void *arg) 390 { 391 iput(arg); 392 } 393 394 void 395 zfs_zrele_async(znode_t *zp) 396 { 397 struct inode *ip = ZTOI(zp); 398 objset_t *os = ITOZSB(ip)->z_os; 399 400 ASSERT(atomic_read(&ip->i_count) > 0); 401 ASSERT(os != NULL); 402 403 /* 404 * If decrementing the count would put us at 0, we can't do it inline 405 * here, because that would be synchronous. Instead, dispatch an iput 406 * to run later. 407 * 408 * For more information on the dangers of a synchronous iput, see the 409 * header comment of this file. 410 */ 411 if (!atomic_add_unless(&ip->i_count, -1, 1)) { 412 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), 413 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); 414 } 415 } 416 417 418 /* 419 * Lookup an entry in a directory, or an extended attribute directory. 420 * If it exists, return a held inode reference for it. 421 * 422 * IN: zdp - znode of directory to search. 423 * nm - name of entry to lookup. 424 * flags - LOOKUP_XATTR set if looking for an attribute. 425 * cr - credentials of caller. 426 * direntflags - directory lookup flags 427 * realpnp - returned pathname. 428 * 429 * OUT: zpp - znode of located entry, NULL if not found. 430 * 431 * RETURN: 0 on success, error code on failure. 
432 * 433 * Timestamps: 434 * NA 435 */ 436 int 437 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, 438 int *direntflags, pathname_t *realpnp) 439 { 440 zfsvfs_t *zfsvfs = ZTOZSB(zdp); 441 int error = 0; 442 443 /* 444 * Fast path lookup, however we must skip DNLC lookup 445 * for case folding or normalizing lookups because the 446 * DNLC code only stores the passed in name. This means 447 * creating 'a' and removing 'A' on a case insensitive 448 * file system would work, but DNLC still thinks 'a' 449 * exists and won't let you create it again on the next 450 * pass through fast path. 451 */ 452 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { 453 454 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 455 return (SET_ERROR(ENOTDIR)); 456 } else if (zdp->z_sa_hdl == NULL) { 457 return (SET_ERROR(EIO)); 458 } 459 460 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { 461 error = zfs_fastaccesschk_execute(zdp, cr); 462 if (!error) { 463 *zpp = zdp; 464 zhold(*zpp); 465 return (0); 466 } 467 return (error); 468 } 469 } 470 471 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) 472 return (error); 473 474 *zpp = NULL; 475 476 if (flags & LOOKUP_XATTR) { 477 /* 478 * We don't allow recursive attributes.. 479 * Maybe someday we will. 480 */ 481 if (zdp->z_pflags & ZFS_XATTR) { 482 zfs_exit(zfsvfs, FTAG); 483 return (SET_ERROR(EINVAL)); 484 } 485 486 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { 487 zfs_exit(zfsvfs, FTAG); 488 return (error); 489 } 490 491 /* 492 * Do we have permission to get into attribute directory? 493 */ 494 495 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, 496 B_TRUE, cr, zfs_init_idmap))) { 497 zrele(*zpp); 498 *zpp = NULL; 499 } 500 501 zfs_exit(zfsvfs, FTAG); 502 return (error); 503 } 504 505 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 506 zfs_exit(zfsvfs, FTAG); 507 return (SET_ERROR(ENOTDIR)); 508 } 509 510 /* 511 * Check accessibility of directory. 
512 */ 513 514 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, 515 zfs_init_idmap))) { 516 zfs_exit(zfsvfs, FTAG); 517 return (error); 518 } 519 520 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 521 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 522 zfs_exit(zfsvfs, FTAG); 523 return (SET_ERROR(EILSEQ)); 524 } 525 526 error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); 527 if ((error == 0) && (*zpp)) 528 zfs_znode_update_vfs(*zpp); 529 530 zfs_exit(zfsvfs, FTAG); 531 return (error); 532 } 533 534 /* 535 * Attempt to create a new entry in a directory. If the entry 536 * already exists, truncate the file if permissible, else return 537 * an error. Return the ip of the created or trunc'd file. 538 * 539 * IN: dzp - znode of directory to put new file entry in. 540 * name - name of new file entry. 541 * vap - attributes of new file. 542 * excl - flag indicating exclusive or non-exclusive mode. 543 * mode - mode to open file with. 544 * cr - credentials of caller. 545 * flag - file flag. 546 * vsecp - ACL to be set 547 * mnt_ns - user namespace of the mount 548 * 549 * OUT: zpp - znode of created or trunc'd entry. 550 * 551 * RETURN: 0 on success, error code on failure. 552 * 553 * Timestamps: 554 * dzp - ctime|mtime updated if new entry created 555 * zp - ctime|mtime always, atime if new 556 */ 557 int 558 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, 559 int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, 560 zidmap_t *mnt_ns) 561 { 562 znode_t *zp; 563 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 564 zilog_t *zilog; 565 objset_t *os; 566 zfs_dirlock_t *dl; 567 dmu_tx_t *tx; 568 int error; 569 uid_t uid; 570 gid_t gid; 571 zfs_acl_ids_t acl_ids; 572 boolean_t fuid_dirtied; 573 boolean_t have_acl = B_FALSE; 574 boolean_t waited = B_FALSE; 575 boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 576 577 /* 578 * If we have an ephemeral id, ACL, or XVATTR then 579 * make sure file system is at proper version 580 */ 581 582 gid = crgetgid(cr); 583 uid = crgetuid(cr); 584 585 if (zfsvfs->z_use_fuids == B_FALSE && 586 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 587 return (SET_ERROR(EINVAL)); 588 589 if (name == NULL) 590 return (SET_ERROR(EINVAL)); 591 592 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 593 return (error); 594 os = zfsvfs->z_os; 595 zilog = zfsvfs->z_log; 596 597 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 598 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 599 zfs_exit(zfsvfs, FTAG); 600 return (SET_ERROR(EILSEQ)); 601 } 602 603 if (vap->va_mask & ATTR_XVATTR) { 604 if ((error = secpolicy_xvattr((xvattr_t *)vap, 605 crgetuid(cr), cr, vap->va_mode)) != 0) { 606 zfs_exit(zfsvfs, FTAG); 607 return (error); 608 } 609 } 610 611 top: 612 *zpp = NULL; 613 if (*name == '\0') { 614 /* 615 * Null component name refers to the directory itself. 616 */ 617 zhold(dzp); 618 zp = dzp; 619 dl = NULL; 620 error = 0; 621 } else { 622 /* possible igrab(zp) */ 623 int zflg = 0; 624 625 if (flag & FIGNORECASE) 626 zflg |= ZCILOOK; 627 628 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 629 NULL, NULL); 630 if (error) { 631 if (have_acl) 632 zfs_acl_ids_free(&acl_ids); 633 if (strcmp(name, "..") == 0) 634 error = SET_ERROR(EISDIR); 635 zfs_exit(zfsvfs, FTAG); 636 return (error); 637 } 638 } 639 640 if (zp == NULL) { 641 uint64_t txtype; 642 uint64_t projid = ZFS_DEFAULT_PROJID; 643 644 /* 645 * Create a new file object and update the directory 646 * to reference it. 647 */ 648 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, 649 mnt_ns))) { 650 if (have_acl) 651 zfs_acl_ids_free(&acl_ids); 652 goto out; 653 } 654 655 /* 656 * We only support the creation of regular files in 657 * extended attribute directories. 
658 */ 659 660 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { 661 if (have_acl) 662 zfs_acl_ids_free(&acl_ids); 663 error = SET_ERROR(EINVAL); 664 goto out; 665 } 666 667 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 668 cr, vsecp, &acl_ids, mnt_ns)) != 0) 669 goto out; 670 have_acl = B_TRUE; 671 672 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 673 projid = zfs_inherit_projid(dzp); 674 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 675 zfs_acl_ids_free(&acl_ids); 676 error = SET_ERROR(EDQUOT); 677 goto out; 678 } 679 680 tx = dmu_tx_create(os); 681 682 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 683 ZFS_SA_BASE_ATTR_SIZE); 684 685 fuid_dirtied = zfsvfs->z_fuid_dirty; 686 if (fuid_dirtied) 687 zfs_fuid_txhold(zfsvfs, tx); 688 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 689 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 690 if (!zfsvfs->z_use_sa && 691 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 692 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 693 0, acl_ids.z_aclp->z_acl_bytes); 694 } 695 696 error = dmu_tx_assign(tx, 697 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 698 if (error) { 699 zfs_dirent_unlock(dl); 700 if (error == ERESTART) { 701 waited = B_TRUE; 702 dmu_tx_wait(tx); 703 dmu_tx_abort(tx); 704 goto top; 705 } 706 zfs_acl_ids_free(&acl_ids); 707 dmu_tx_abort(tx); 708 zfs_exit(zfsvfs, FTAG); 709 return (error); 710 } 711 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 712 713 error = zfs_link_create(dl, zp, tx, ZNEW); 714 if (error != 0) { 715 /* 716 * Since, we failed to add the directory entry for it, 717 * delete the newly created dnode. 
718 */ 719 zfs_znode_delete(zp, tx); 720 remove_inode_hash(ZTOI(zp)); 721 zfs_acl_ids_free(&acl_ids); 722 dmu_tx_commit(tx); 723 goto out; 724 } 725 726 if (fuid_dirtied) 727 zfs_fuid_sync(zfsvfs, tx); 728 729 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 730 if (flag & FIGNORECASE) 731 txtype |= TX_CI; 732 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 733 vsecp, acl_ids.z_fuidp, vap); 734 zfs_acl_ids_free(&acl_ids); 735 dmu_tx_commit(tx); 736 } else { 737 int aflags = (flag & O_APPEND) ? V_APPEND : 0; 738 739 if (have_acl) 740 zfs_acl_ids_free(&acl_ids); 741 742 /* 743 * A directory entry already exists for this name. 744 */ 745 /* 746 * Can't truncate an existing file if in exclusive mode. 747 */ 748 if (excl) { 749 error = SET_ERROR(EEXIST); 750 goto out; 751 } 752 /* 753 * Can't open a directory for writing. 754 */ 755 if (S_ISDIR(ZTOI(zp)->i_mode)) { 756 error = SET_ERROR(EISDIR); 757 goto out; 758 } 759 /* 760 * Verify requested access to file. 761 */ 762 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, 763 mnt_ns))) { 764 goto out; 765 } 766 767 mutex_enter(&dzp->z_lock); 768 dzp->z_seq++; 769 mutex_exit(&dzp->z_lock); 770 771 /* 772 * Truncate regular files if requested. 
773 */ 774 if (S_ISREG(ZTOI(zp)->i_mode) && 775 (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { 776 /* we can't hold any locks when calling zfs_freesp() */ 777 if (dl) { 778 zfs_dirent_unlock(dl); 779 dl = NULL; 780 } 781 error = zfs_freesp(zp, 0, 0, mode, TRUE); 782 } 783 } 784 out: 785 786 if (dl) 787 zfs_dirent_unlock(dl); 788 789 if (error) { 790 if (zp) 791 zrele(zp); 792 } else { 793 zfs_znode_update_vfs(dzp); 794 zfs_znode_update_vfs(zp); 795 *zpp = zp; 796 } 797 798 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 799 zil_commit(zilog, 0); 800 801 zfs_exit(zfsvfs, FTAG); 802 return (error); 803 } 804 805 int 806 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, 807 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, 808 zidmap_t *mnt_ns) 809 { 810 (void) excl, (void) mode, (void) flag; 811 znode_t *zp = NULL, *dzp = ITOZ(dip); 812 zfsvfs_t *zfsvfs = ITOZSB(dip); 813 objset_t *os; 814 dmu_tx_t *tx; 815 int error; 816 uid_t uid; 817 gid_t gid; 818 zfs_acl_ids_t acl_ids; 819 uint64_t projid = ZFS_DEFAULT_PROJID; 820 boolean_t fuid_dirtied; 821 boolean_t have_acl = B_FALSE; 822 boolean_t waited = B_FALSE; 823 824 /* 825 * If we have an ephemeral id, ACL, or XVATTR then 826 * make sure file system is at proper version 827 */ 828 829 gid = crgetgid(cr); 830 uid = crgetuid(cr); 831 832 if (zfsvfs->z_use_fuids == B_FALSE && 833 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 834 return (SET_ERROR(EINVAL)); 835 836 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 837 return (error); 838 os = zfsvfs->z_os; 839 840 if (vap->va_mask & ATTR_XVATTR) { 841 if ((error = secpolicy_xvattr((xvattr_t *)vap, 842 crgetuid(cr), cr, vap->va_mode)) != 0) { 843 zfs_exit(zfsvfs, FTAG); 844 return (error); 845 } 846 } 847 848 top: 849 *ipp = NULL; 850 851 /* 852 * Create a new file object and update the directory 853 * to reference it. 
854 */ 855 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 856 if (have_acl) 857 zfs_acl_ids_free(&acl_ids); 858 goto out; 859 } 860 861 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 862 cr, vsecp, &acl_ids, mnt_ns)) != 0) 863 goto out; 864 have_acl = B_TRUE; 865 866 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 867 projid = zfs_inherit_projid(dzp); 868 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 869 zfs_acl_ids_free(&acl_ids); 870 error = SET_ERROR(EDQUOT); 871 goto out; 872 } 873 874 tx = dmu_tx_create(os); 875 876 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 877 ZFS_SA_BASE_ATTR_SIZE); 878 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 879 880 fuid_dirtied = zfsvfs->z_fuid_dirty; 881 if (fuid_dirtied) 882 zfs_fuid_txhold(zfsvfs, tx); 883 if (!zfsvfs->z_use_sa && 884 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 885 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 886 0, acl_ids.z_aclp->z_acl_bytes); 887 } 888 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 889 if (error) { 890 if (error == ERESTART) { 891 waited = B_TRUE; 892 dmu_tx_wait(tx); 893 dmu_tx_abort(tx); 894 goto top; 895 } 896 zfs_acl_ids_free(&acl_ids); 897 dmu_tx_abort(tx); 898 zfs_exit(zfsvfs, FTAG); 899 return (error); 900 } 901 zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); 902 903 if (fuid_dirtied) 904 zfs_fuid_sync(zfsvfs, tx); 905 906 /* Add to unlinked set */ 907 zp->z_unlinked = B_TRUE; 908 zfs_unlinked_add(zp, tx); 909 zfs_acl_ids_free(&acl_ids); 910 dmu_tx_commit(tx); 911 out: 912 913 if (error) { 914 if (zp) 915 zrele(zp); 916 } else { 917 zfs_znode_update_vfs(dzp); 918 zfs_znode_update_vfs(zp); 919 *ipp = ZTOI(zp); 920 } 921 922 zfs_exit(zfsvfs, FTAG); 923 return (error); 924 } 925 926 /* 927 * Remove an entry from a directory. 928 * 929 * IN: dzp - znode of directory to remove entry from. 930 * name - name of entry to remove. 931 * cr - credentials of caller. 932 * flags - case flags. 
933 * 934 * RETURN: 0 if success 935 * error code if failure 936 * 937 * Timestamps: 938 * dzp - ctime|mtime 939 * ip - ctime (if nlink > 0) 940 */ 941 942 static uint64_t null_xattr = 0; 943 944 int 945 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) 946 { 947 znode_t *zp; 948 znode_t *xzp; 949 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 950 zilog_t *zilog; 951 uint64_t acl_obj, xattr_obj; 952 uint64_t xattr_obj_unlinked = 0; 953 uint64_t obj = 0; 954 uint64_t links; 955 zfs_dirlock_t *dl; 956 dmu_tx_t *tx; 957 boolean_t may_delete_now, delete_now = FALSE; 958 boolean_t unlinked, toobig = FALSE; 959 uint64_t txtype; 960 pathname_t *realnmp = NULL; 961 pathname_t realnm; 962 int error; 963 int zflg = ZEXISTS; 964 boolean_t waited = B_FALSE; 965 966 if (name == NULL) 967 return (SET_ERROR(EINVAL)); 968 969 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 970 return (error); 971 zilog = zfsvfs->z_log; 972 973 if (flags & FIGNORECASE) { 974 zflg |= ZCILOOK; 975 pn_alloc(&realnm); 976 realnmp = &realnm; 977 } 978 979 top: 980 xattr_obj = 0; 981 xzp = NULL; 982 /* 983 * Attempt to lock directory; fail if entry doesn't exist. 984 */ 985 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 986 NULL, realnmp))) { 987 if (realnmp) 988 pn_free(realnmp); 989 zfs_exit(zfsvfs, FTAG); 990 return (error); 991 } 992 993 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { 994 goto out; 995 } 996 997 /* 998 * Need to use rmdir for removing directories. 999 */ 1000 if (S_ISDIR(ZTOI(zp)->i_mode)) { 1001 error = SET_ERROR(EPERM); 1002 goto out; 1003 } 1004 1005 mutex_enter(&zp->z_lock); 1006 may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && 1007 !zn_has_cached_data(zp, 0, LLONG_MAX); 1008 mutex_exit(&zp->z_lock); 1009 1010 /* 1011 * We may delete the znode now, or we may put it in the unlinked set; 1012 * it depends on whether we're the last link, and on whether there are 1013 * other holds on the inode. 
So we dmu_tx_hold() the right things to 1014 * allow for either case. 1015 */ 1016 obj = zp->z_id; 1017 tx = dmu_tx_create(zfsvfs->z_os); 1018 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1019 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1020 zfs_sa_upgrade_txholds(tx, zp); 1021 zfs_sa_upgrade_txholds(tx, dzp); 1022 if (may_delete_now) { 1023 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; 1024 /* if the file is too big, only hold_free a token amount */ 1025 dmu_tx_hold_free(tx, zp->z_id, 0, 1026 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1027 } 1028 1029 /* are there any extended attributes? */ 1030 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1031 &xattr_obj, sizeof (xattr_obj)); 1032 if (error == 0 && xattr_obj) { 1033 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 1034 ASSERT0(error); 1035 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1036 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 1037 } 1038 1039 mutex_enter(&zp->z_lock); 1040 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) 1041 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1042 mutex_exit(&zp->z_lock); 1043 1044 /* charge as an update -- would be nice not to charge at all */ 1045 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1046 1047 /* 1048 * Mark this transaction as typically resulting in a net free of space 1049 */ 1050 dmu_tx_mark_netfree(tx); 1051 1052 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1053 if (error) { 1054 zfs_dirent_unlock(dl); 1055 if (error == ERESTART) { 1056 waited = B_TRUE; 1057 dmu_tx_wait(tx); 1058 dmu_tx_abort(tx); 1059 zrele(zp); 1060 if (xzp) 1061 zrele(xzp); 1062 goto top; 1063 } 1064 if (realnmp) 1065 pn_free(realnmp); 1066 dmu_tx_abort(tx); 1067 zrele(zp); 1068 if (xzp) 1069 zrele(xzp); 1070 zfs_exit(zfsvfs, FTAG); 1071 return (error); 1072 } 1073 1074 /* 1075 * Remove the directory entry. 
1076 */ 1077 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1078 1079 if (error) { 1080 dmu_tx_commit(tx); 1081 goto out; 1082 } 1083 1084 if (unlinked) { 1085 /* 1086 * Hold z_lock so that we can make sure that the ACL obj 1087 * hasn't changed. Could have been deleted due to 1088 * zfs_sa_upgrade(). 1089 */ 1090 mutex_enter(&zp->z_lock); 1091 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1092 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); 1093 delete_now = may_delete_now && !toobig && 1094 atomic_read(&ZTOI(zp)->i_count) == 1 && 1095 !zn_has_cached_data(zp, 0, LLONG_MAX) && 1096 xattr_obj == xattr_obj_unlinked && 1097 zfs_external_acl(zp) == acl_obj; 1098 VERIFY_IMPLY(xattr_obj_unlinked, xzp); 1099 } 1100 1101 if (delete_now) { 1102 if (xattr_obj_unlinked) { 1103 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); 1104 mutex_enter(&xzp->z_lock); 1105 xzp->z_unlinked = B_TRUE; 1106 clear_nlink(ZTOI(xzp)); 1107 links = 0; 1108 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 1109 &links, sizeof (links), tx); 1110 ASSERT3U(error, ==, 0); 1111 mutex_exit(&xzp->z_lock); 1112 zfs_unlinked_add(xzp, tx); 1113 1114 if (zp->z_is_sa) 1115 error = sa_remove(zp->z_sa_hdl, 1116 SA_ZPL_XATTR(zfsvfs), tx); 1117 else 1118 error = sa_update(zp->z_sa_hdl, 1119 SA_ZPL_XATTR(zfsvfs), &null_xattr, 1120 sizeof (uint64_t), tx); 1121 ASSERT0(error); 1122 } 1123 /* 1124 * Add to the unlinked set because a new reference could be 1125 * taken concurrently resulting in a deferred destruction. 
1126 */ 1127 zfs_unlinked_add(zp, tx); 1128 mutex_exit(&zp->z_lock); 1129 } else if (unlinked) { 1130 mutex_exit(&zp->z_lock); 1131 zfs_unlinked_add(zp, tx); 1132 } 1133 1134 txtype = TX_REMOVE; 1135 if (flags & FIGNORECASE) 1136 txtype |= TX_CI; 1137 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); 1138 1139 dmu_tx_commit(tx); 1140 out: 1141 if (realnmp) 1142 pn_free(realnmp); 1143 1144 zfs_dirent_unlock(dl); 1145 zfs_znode_update_vfs(dzp); 1146 zfs_znode_update_vfs(zp); 1147 1148 if (delete_now) 1149 zrele(zp); 1150 else 1151 zfs_zrele_async(zp); 1152 1153 if (xzp) { 1154 zfs_znode_update_vfs(xzp); 1155 zfs_zrele_async(xzp); 1156 } 1157 1158 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1159 zil_commit(zilog, 0); 1160 1161 zfs_exit(zfsvfs, FTAG); 1162 return (error); 1163 } 1164 1165 /* 1166 * Create a new directory and insert it into dzp using the name 1167 * provided. Return a pointer to the inserted directory. 1168 * 1169 * IN: dzp - znode of directory to add subdir to. 1170 * dirname - name of new directory. 1171 * vap - attributes of new directory. 1172 * cr - credentials of caller. 1173 * flags - case flags. 1174 * vsecp - ACL to be set 1175 * mnt_ns - user namespace of the mount 1176 * 1177 * OUT: zpp - znode of created directory. 
1178 * 1179 * RETURN: 0 if success 1180 * error code if failure 1181 * 1182 * Timestamps: 1183 * dzp - ctime|mtime updated 1184 * zpp - ctime|mtime|atime updated 1185 */ 1186 int 1187 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, 1188 cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) 1189 { 1190 znode_t *zp; 1191 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1192 zilog_t *zilog; 1193 zfs_dirlock_t *dl; 1194 uint64_t txtype; 1195 dmu_tx_t *tx; 1196 int error; 1197 int zf = ZNEW; 1198 uid_t uid; 1199 gid_t gid = crgetgid(cr); 1200 zfs_acl_ids_t acl_ids; 1201 boolean_t fuid_dirtied; 1202 boolean_t waited = B_FALSE; 1203 1204 ASSERT(S_ISDIR(vap->va_mode)); 1205 1206 /* 1207 * If we have an ephemeral id, ACL, or XVATTR then 1208 * make sure file system is at proper version 1209 */ 1210 1211 uid = crgetuid(cr); 1212 if (zfsvfs->z_use_fuids == B_FALSE && 1213 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1214 return (SET_ERROR(EINVAL)); 1215 1216 if (dirname == NULL) 1217 return (SET_ERROR(EINVAL)); 1218 1219 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1220 return (error); 1221 zilog = zfsvfs->z_log; 1222 1223 if (dzp->z_pflags & ZFS_XATTR) { 1224 zfs_exit(zfsvfs, FTAG); 1225 return (SET_ERROR(EINVAL)); 1226 } 1227 1228 if (zfsvfs->z_utf8 && u8_validate(dirname, 1229 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1230 zfs_exit(zfsvfs, FTAG); 1231 return (SET_ERROR(EILSEQ)); 1232 } 1233 if (flags & FIGNORECASE) 1234 zf |= ZCILOOK; 1235 1236 if (vap->va_mask & ATTR_XVATTR) { 1237 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1238 crgetuid(cr), cr, vap->va_mode)) != 0) { 1239 zfs_exit(zfsvfs, FTAG); 1240 return (error); 1241 } 1242 } 1243 1244 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 1245 vsecp, &acl_ids, mnt_ns)) != 0) { 1246 zfs_exit(zfsvfs, FTAG); 1247 return (error); 1248 } 1249 /* 1250 * First make sure the new directory doesn't exist. 
1251 * 1252 * Existence is checked first to make sure we don't return 1253 * EACCES instead of EEXIST which can cause some applications 1254 * to fail. 1255 */ 1256 top: 1257 *zpp = NULL; 1258 1259 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1260 NULL, NULL))) { 1261 zfs_acl_ids_free(&acl_ids); 1262 zfs_exit(zfsvfs, FTAG); 1263 return (error); 1264 } 1265 1266 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, 1267 mnt_ns))) { 1268 zfs_acl_ids_free(&acl_ids); 1269 zfs_dirent_unlock(dl); 1270 zfs_exit(zfsvfs, FTAG); 1271 return (error); 1272 } 1273 1274 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { 1275 zfs_acl_ids_free(&acl_ids); 1276 zfs_dirent_unlock(dl); 1277 zfs_exit(zfsvfs, FTAG); 1278 return (SET_ERROR(EDQUOT)); 1279 } 1280 1281 /* 1282 * Add a new entry to the directory. 1283 */ 1284 tx = dmu_tx_create(zfsvfs->z_os); 1285 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1286 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1287 fuid_dirtied = zfsvfs->z_fuid_dirty; 1288 if (fuid_dirtied) 1289 zfs_fuid_txhold(zfsvfs, tx); 1290 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1291 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1292 acl_ids.z_aclp->z_acl_bytes); 1293 } 1294 1295 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1296 ZFS_SA_BASE_ATTR_SIZE); 1297 1298 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1299 if (error) { 1300 zfs_dirent_unlock(dl); 1301 if (error == ERESTART) { 1302 waited = B_TRUE; 1303 dmu_tx_wait(tx); 1304 dmu_tx_abort(tx); 1305 goto top; 1306 } 1307 zfs_acl_ids_free(&acl_ids); 1308 dmu_tx_abort(tx); 1309 zfs_exit(zfsvfs, FTAG); 1310 return (error); 1311 } 1312 1313 /* 1314 * Create new node. 1315 */ 1316 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1317 1318 /* 1319 * Now put new name in parent dir. 
1320 */ 1321 error = zfs_link_create(dl, zp, tx, ZNEW); 1322 if (error != 0) { 1323 zfs_znode_delete(zp, tx); 1324 remove_inode_hash(ZTOI(zp)); 1325 goto out; 1326 } 1327 1328 if (fuid_dirtied) 1329 zfs_fuid_sync(zfsvfs, tx); 1330 1331 *zpp = zp; 1332 1333 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1334 if (flags & FIGNORECASE) 1335 txtype |= TX_CI; 1336 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 1337 acl_ids.z_fuidp, vap); 1338 1339 out: 1340 zfs_acl_ids_free(&acl_ids); 1341 1342 dmu_tx_commit(tx); 1343 1344 zfs_dirent_unlock(dl); 1345 1346 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1347 zil_commit(zilog, 0); 1348 1349 if (error != 0) { 1350 zrele(zp); 1351 } else { 1352 zfs_znode_update_vfs(dzp); 1353 zfs_znode_update_vfs(zp); 1354 } 1355 zfs_exit(zfsvfs, FTAG); 1356 return (error); 1357 } 1358 1359 /* 1360 * Remove a directory subdir entry. If the current working 1361 * directory is the same as the subdir to be removed, the 1362 * remove will fail. 1363 * 1364 * IN: dzp - znode of directory to remove from. 1365 * name - name of directory to be removed. 1366 * cwd - inode of current working directory. 1367 * cr - credentials of caller. 1368 * flags - case flags 1369 * 1370 * RETURN: 0 on success, error code on failure. 1371 * 1372 * Timestamps: 1373 * dzp - ctime|mtime updated 1374 */ 1375 int 1376 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, 1377 int flags) 1378 { 1379 znode_t *zp; 1380 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1381 zilog_t *zilog; 1382 zfs_dirlock_t *dl; 1383 dmu_tx_t *tx; 1384 int error; 1385 int zflg = ZEXISTS; 1386 boolean_t waited = B_FALSE; 1387 1388 if (name == NULL) 1389 return (SET_ERROR(EINVAL)); 1390 1391 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1392 return (error); 1393 zilog = zfsvfs->z_log; 1394 1395 if (flags & FIGNORECASE) 1396 zflg |= ZCILOOK; 1397 top: 1398 zp = NULL; 1399 1400 /* 1401 * Attempt to lock directory; fail if entry doesn't exist. 
1402 */ 1403 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1404 NULL, NULL))) { 1405 zfs_exit(zfsvfs, FTAG); 1406 return (error); 1407 } 1408 1409 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { 1410 goto out; 1411 } 1412 1413 if (!S_ISDIR(ZTOI(zp)->i_mode)) { 1414 error = SET_ERROR(ENOTDIR); 1415 goto out; 1416 } 1417 1418 if (zp == cwd) { 1419 error = SET_ERROR(EINVAL); 1420 goto out; 1421 } 1422 1423 /* 1424 * Grab a lock on the directory to make sure that no one is 1425 * trying to add (or lookup) entries while we are removing it. 1426 */ 1427 rw_enter(&zp->z_name_lock, RW_WRITER); 1428 1429 /* 1430 * Grab a lock on the parent pointer to make sure we play well 1431 * with the treewalk and directory rename code. 1432 */ 1433 rw_enter(&zp->z_parent_lock, RW_WRITER); 1434 1435 tx = dmu_tx_create(zfsvfs->z_os); 1436 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1437 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1438 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1439 zfs_sa_upgrade_txholds(tx, zp); 1440 zfs_sa_upgrade_txholds(tx, dzp); 1441 dmu_tx_mark_netfree(tx); 1442 error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1443 if (error) { 1444 rw_exit(&zp->z_parent_lock); 1445 rw_exit(&zp->z_name_lock); 1446 zfs_dirent_unlock(dl); 1447 if (error == ERESTART) { 1448 waited = B_TRUE; 1449 dmu_tx_wait(tx); 1450 dmu_tx_abort(tx); 1451 zrele(zp); 1452 goto top; 1453 } 1454 dmu_tx_abort(tx); 1455 zrele(zp); 1456 zfs_exit(zfsvfs, FTAG); 1457 return (error); 1458 } 1459 1460 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1461 1462 if (error == 0) { 1463 uint64_t txtype = TX_RMDIR; 1464 if (flags & FIGNORECASE) 1465 txtype |= TX_CI; 1466 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, 1467 B_FALSE); 1468 } 1469 1470 dmu_tx_commit(tx); 1471 1472 rw_exit(&zp->z_parent_lock); 1473 rw_exit(&zp->z_name_lock); 1474 out: 1475 zfs_dirent_unlock(dl); 1476 1477 zfs_znode_update_vfs(dzp); 1478 zfs_znode_update_vfs(zp); 1479 zrele(zp); 1480 1481 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1482 zil_commit(zilog, 0); 1483 1484 zfs_exit(zfsvfs, FTAG); 1485 return (error); 1486 } 1487 1488 /* 1489 * Read directory entries from the given directory cursor position and emit 1490 * name and position for each entry. 1491 * 1492 * IN: ip - inode of directory to read. 1493 * ctx - directory entry context. 1494 * cr - credentials of caller. 1495 * 1496 * RETURN: 0 if success 1497 * error code if failure 1498 * 1499 * Timestamps: 1500 * ip - atime updated 1501 * 1502 * Note that the low 4 bits of the cookie returned by zap is always zero. 1503 * This allows us to use the low range for "special" directory entries: 1504 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 1505 * we use the offset 2 for the '.zfs' directory. 
1506 */ 1507 int 1508 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) 1509 { 1510 (void) cr; 1511 znode_t *zp = ITOZ(ip); 1512 zfsvfs_t *zfsvfs = ITOZSB(ip); 1513 objset_t *os; 1514 zap_cursor_t zc; 1515 zap_attribute_t zap; 1516 int error; 1517 uint8_t prefetch; 1518 uint8_t type; 1519 int done = 0; 1520 uint64_t parent; 1521 uint64_t offset; /* must be unsigned; checks for < 1 */ 1522 1523 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1524 return (error); 1525 1526 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1527 &parent, sizeof (parent))) != 0) 1528 goto out; 1529 1530 /* 1531 * Quit if directory has been removed (posix) 1532 */ 1533 if (zp->z_unlinked) 1534 goto out; 1535 1536 error = 0; 1537 os = zfsvfs->z_os; 1538 offset = ctx->pos; 1539 prefetch = zp->z_zn_prefetch; 1540 1541 /* 1542 * Initialize the iterator cursor. 1543 */ 1544 if (offset <= 3) { 1545 /* 1546 * Start iteration from the beginning of the directory. 1547 */ 1548 zap_cursor_init(&zc, os, zp->z_id); 1549 } else { 1550 /* 1551 * The offset is a serialized cursor. 1552 */ 1553 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 1554 } 1555 1556 /* 1557 * Transform to file-system independent format 1558 */ 1559 while (!done) { 1560 uint64_t objnum; 1561 /* 1562 * Special case `.', `..', and `.zfs'. 1563 */ 1564 if (offset == 0) { 1565 (void) strcpy(zap.za_name, "."); 1566 zap.za_normalization_conflict = 0; 1567 objnum = zp->z_id; 1568 type = DT_DIR; 1569 } else if (offset == 1) { 1570 (void) strcpy(zap.za_name, ".."); 1571 zap.za_normalization_conflict = 0; 1572 objnum = parent; 1573 type = DT_DIR; 1574 } else if (offset == 2 && zfs_show_ctldir(zp)) { 1575 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 1576 zap.za_normalization_conflict = 0; 1577 objnum = ZFSCTL_INO_ROOT; 1578 type = DT_DIR; 1579 } else { 1580 /* 1581 * Grab next entry. 
1582 */ 1583 if ((error = zap_cursor_retrieve(&zc, &zap))) { 1584 if (error == ENOENT) 1585 break; 1586 else 1587 goto update; 1588 } 1589 1590 /* 1591 * Allow multiple entries provided the first entry is 1592 * the object id. Non-zpl consumers may safely make 1593 * use of the additional space. 1594 * 1595 * XXX: This should be a feature flag for compatibility 1596 */ 1597 if (zap.za_integer_length != 8 || 1598 zap.za_num_integers == 0) { 1599 cmn_err(CE_WARN, "zap_readdir: bad directory " 1600 "entry, obj = %lld, offset = %lld, " 1601 "length = %d, num = %lld\n", 1602 (u_longlong_t)zp->z_id, 1603 (u_longlong_t)offset, 1604 zap.za_integer_length, 1605 (u_longlong_t)zap.za_num_integers); 1606 error = SET_ERROR(ENXIO); 1607 goto update; 1608 } 1609 1610 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 1611 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 1612 } 1613 1614 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), 1615 objnum, type); 1616 if (done) 1617 break; 1618 1619 if (prefetch) 1620 dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); 1621 1622 /* 1623 * Move to the next entry, fill in the previous offset. 1624 */ 1625 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 1626 zap_cursor_advance(&zc); 1627 offset = zap_cursor_serialize(&zc); 1628 } else { 1629 offset += 1; 1630 } 1631 ctx->pos = offset; 1632 } 1633 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 1634 1635 update: 1636 zap_cursor_fini(&zc); 1637 if (error == ENOENT) 1638 error = 0; 1639 out: 1640 zfs_exit(zfsvfs, FTAG); 1641 1642 return (error); 1643 } 1644 1645 /* 1646 * Get the basic file attributes and place them in the provided kstat 1647 * structure. The inode is assumed to be the authoritative source 1648 * for most of the attributes. However, the znode currently has the 1649 * authoritative atime, blksize, and block count. 1650 * 1651 * IN: ip - inode of file. 1652 * 1653 * OUT: sp - kstat values. 
1654 * 1655 * RETURN: 0 (always succeeds) 1656 */ 1657 int 1658 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK 1659 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, 1660 struct kstat *sp) 1661 #else 1662 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) 1663 #endif 1664 { 1665 znode_t *zp = ITOZ(ip); 1666 zfsvfs_t *zfsvfs = ITOZSB(ip); 1667 uint32_t blksize; 1668 u_longlong_t nblocks; 1669 int error; 1670 1671 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1672 return (error); 1673 1674 mutex_enter(&zp->z_lock); 1675 1676 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK 1677 zpl_generic_fillattr(user_ns, request_mask, ip, sp); 1678 #else 1679 zpl_generic_fillattr(user_ns, ip, sp); 1680 #endif 1681 /* 1682 * +1 link count for root inode with visible '.zfs' directory. 1683 */ 1684 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) 1685 if (sp->nlink < ZFS_LINK_MAX) 1686 sp->nlink++; 1687 1688 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1689 sp->blksize = blksize; 1690 sp->blocks = nblocks; 1691 1692 if (unlikely(zp->z_blksz == 0)) { 1693 /* 1694 * Block size hasn't been set; suggest maximal I/O transfers. 1695 */ 1696 sp->blksize = zfsvfs->z_max_blksz; 1697 } 1698 1699 mutex_exit(&zp->z_lock); 1700 1701 /* 1702 * Required to prevent NFS client from detecting different inode 1703 * numbers of snapshot root dentry before and after snapshot mount. 1704 */ 1705 if (zfsvfs->z_issnap) { 1706 if (ip->i_sb->s_root->d_inode == ip) 1707 sp->ino = ZFSCTL_INO_SNAPDIRS - 1708 dmu_objset_id(zfsvfs->z_os); 1709 } 1710 1711 zfs_exit(zfsvfs, FTAG); 1712 1713 return (0); 1714 } 1715 1716 /* 1717 * For the operation of changing file's user/group/project, we need to 1718 * handle not only the main object that is assigned to the file directly, 1719 * but also the ones that are used by the file via hidden xattr directory. 
1720 * 1721 * Because the xattr directory may contains many EA entries, as to it may 1722 * be impossible to change all of them via the transaction of changing the 1723 * main object's user/group/project attributes. Then we have to change them 1724 * via other multiple independent transactions one by one. It may be not good 1725 * solution, but we have no better idea yet. 1726 */ 1727 static int 1728 zfs_setattr_dir(znode_t *dzp) 1729 { 1730 struct inode *dxip = ZTOI(dzp); 1731 struct inode *xip = NULL; 1732 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1733 objset_t *os = zfsvfs->z_os; 1734 zap_cursor_t zc; 1735 zap_attribute_t zap; 1736 zfs_dirlock_t *dl; 1737 znode_t *zp = NULL; 1738 dmu_tx_t *tx = NULL; 1739 uint64_t uid, gid; 1740 sa_bulk_attr_t bulk[4]; 1741 int count; 1742 int err; 1743 1744 zap_cursor_init(&zc, os, dzp->z_id); 1745 while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { 1746 count = 0; 1747 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { 1748 err = ENXIO; 1749 break; 1750 } 1751 1752 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, 1753 ZEXISTS, NULL, NULL); 1754 if (err == ENOENT) 1755 goto next; 1756 if (err) 1757 break; 1758 1759 xip = ZTOI(zp); 1760 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && 1761 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && 1762 zp->z_projid == dzp->z_projid) 1763 goto next; 1764 1765 tx = dmu_tx_create(os); 1766 if (!(zp->z_pflags & ZFS_PROJID)) 1767 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1768 else 1769 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1770 1771 err = dmu_tx_assign(tx, TXG_WAIT); 1772 if (err) 1773 break; 1774 1775 mutex_enter(&dzp->z_lock); 1776 1777 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { 1778 xip->i_uid = dxip->i_uid; 1779 uid = zfs_uid_read(dxip); 1780 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1781 &uid, sizeof (uid)); 1782 } 1783 1784 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { 1785 xip->i_gid = dxip->i_gid; 1786 
gid = zfs_gid_read(dxip); 1787 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 1788 &gid, sizeof (gid)); 1789 } 1790 1791 if (zp->z_projid != dzp->z_projid) { 1792 if (!(zp->z_pflags & ZFS_PROJID)) { 1793 zp->z_pflags |= ZFS_PROJID; 1794 SA_ADD_BULK_ATTR(bulk, count, 1795 SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 1796 sizeof (zp->z_pflags)); 1797 } 1798 1799 zp->z_projid = dzp->z_projid; 1800 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), 1801 NULL, &zp->z_projid, sizeof (zp->z_projid)); 1802 } 1803 1804 mutex_exit(&dzp->z_lock); 1805 1806 if (likely(count > 0)) { 1807 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1808 dmu_tx_commit(tx); 1809 } else { 1810 dmu_tx_abort(tx); 1811 } 1812 tx = NULL; 1813 if (err != 0 && err != ENOENT) 1814 break; 1815 1816 next: 1817 if (zp) { 1818 zrele(zp); 1819 zp = NULL; 1820 zfs_dirent_unlock(dl); 1821 } 1822 zap_cursor_advance(&zc); 1823 } 1824 1825 if (tx) 1826 dmu_tx_abort(tx); 1827 if (zp) { 1828 zrele(zp); 1829 zfs_dirent_unlock(dl); 1830 } 1831 zap_cursor_fini(&zc); 1832 1833 return (err == ENOENT ? 0 : err); 1834 } 1835 1836 /* 1837 * Set the file attributes to the values contained in the 1838 * vattr structure. 1839 * 1840 * IN: zp - znode of file to be modified. 1841 * vap - new attribute values. 1842 * If ATTR_XVATTR set, then optional attrs are being set 1843 * flags - ATTR_UTIME set if non-default time values provided. 1844 * - ATTR_NOACLCHECK (CIFS context only). 1845 * cr - credentials of caller. 1846 * mnt_ns - user namespace of the mount 1847 * 1848 * RETURN: 0 if success 1849 * error code if failure 1850 * 1851 * Timestamps: 1852 * ip - ctime updated, mtime updated if size changed. 
1853 */ 1854 int 1855 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) 1856 { 1857 struct inode *ip; 1858 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1859 objset_t *os; 1860 zilog_t *zilog; 1861 dmu_tx_t *tx; 1862 vattr_t oldva; 1863 xvattr_t *tmpxvattr; 1864 uint_t mask = vap->va_mask; 1865 uint_t saved_mask = 0; 1866 int trim_mask = 0; 1867 uint64_t new_mode; 1868 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; 1869 uint64_t xattr_obj; 1870 uint64_t mtime[2], ctime[2], atime[2]; 1871 uint64_t projid = ZFS_INVALID_PROJID; 1872 znode_t *attrzp; 1873 int need_policy = FALSE; 1874 int err, err2 = 0; 1875 zfs_fuid_info_t *fuidp = NULL; 1876 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 1877 xoptattr_t *xoap; 1878 zfs_acl_t *aclp; 1879 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1880 boolean_t fuid_dirtied = B_FALSE; 1881 boolean_t handle_eadir = B_FALSE; 1882 sa_bulk_attr_t *bulk, *xattr_bulk; 1883 int count = 0, xattr_count = 0, bulks = 8; 1884 1885 if (mask == 0) 1886 return (0); 1887 1888 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1889 return (err); 1890 ip = ZTOI(zp); 1891 os = zfsvfs->z_os; 1892 1893 /* 1894 * If this is a xvattr_t, then get a pointer to the structure of 1895 * optional attributes. If this is NULL, then we have a vattr_t. 
1896 */ 1897 xoap = xva_getxoptattr(xvap); 1898 if (xoap != NULL && (mask & ATTR_XVATTR)) { 1899 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 1900 if (!dmu_objset_projectquota_enabled(os) || 1901 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { 1902 zfs_exit(zfsvfs, FTAG); 1903 return (SET_ERROR(ENOTSUP)); 1904 } 1905 1906 projid = xoap->xoa_projid; 1907 if (unlikely(projid == ZFS_INVALID_PROJID)) { 1908 zfs_exit(zfsvfs, FTAG); 1909 return (SET_ERROR(EINVAL)); 1910 } 1911 1912 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 1913 projid = ZFS_INVALID_PROJID; 1914 else 1915 need_policy = TRUE; 1916 } 1917 1918 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 1919 (xoap->xoa_projinherit != 1920 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && 1921 (!dmu_objset_projectquota_enabled(os) || 1922 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { 1923 zfs_exit(zfsvfs, FTAG); 1924 return (SET_ERROR(ENOTSUP)); 1925 } 1926 } 1927 1928 zilog = zfsvfs->z_log; 1929 1930 /* 1931 * Make sure that if we have ephemeral uid/gid or xvattr specified 1932 * that file system is at proper version level 1933 */ 1934 1935 if (zfsvfs->z_use_fuids == B_FALSE && 1936 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || 1937 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || 1938 (mask & ATTR_XVATTR))) { 1939 zfs_exit(zfsvfs, FTAG); 1940 return (SET_ERROR(EINVAL)); 1941 } 1942 1943 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { 1944 zfs_exit(zfsvfs, FTAG); 1945 return (SET_ERROR(EISDIR)); 1946 } 1947 1948 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { 1949 zfs_exit(zfsvfs, FTAG); 1950 return (SET_ERROR(EINVAL)); 1951 } 1952 1953 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); 1954 xva_init(tmpxvattr); 1955 1956 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1957 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1958 1959 /* 1960 * Immutable files can only alter immutable bit and atime 1961 */ 1962 if ((zp->z_pflags & ZFS_IMMUTABLE) && 
1963 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || 1964 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 1965 err = SET_ERROR(EPERM); 1966 goto out3; 1967 } 1968 1969 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 1970 err = SET_ERROR(EPERM); 1971 goto out3; 1972 } 1973 1974 /* 1975 * Verify timestamps doesn't overflow 32 bits. 1976 * ZFS can handle large timestamps, but 32bit syscalls can't 1977 * handle times greater than 2039. This check should be removed 1978 * once large timestamps are fully supported. 1979 */ 1980 if (mask & (ATTR_ATIME | ATTR_MTIME)) { 1981 if (((mask & ATTR_ATIME) && 1982 TIMESPEC_OVERFLOW(&vap->va_atime)) || 1983 ((mask & ATTR_MTIME) && 1984 TIMESPEC_OVERFLOW(&vap->va_mtime))) { 1985 err = SET_ERROR(EOVERFLOW); 1986 goto out3; 1987 } 1988 } 1989 1990 top: 1991 attrzp = NULL; 1992 aclp = NULL; 1993 1994 /* Can this be moved to before the top label? */ 1995 if (zfs_is_readonly(zfsvfs)) { 1996 err = SET_ERROR(EROFS); 1997 goto out3; 1998 } 1999 2000 /* 2001 * First validate permissions 2002 */ 2003 2004 if (mask & ATTR_SIZE) { 2005 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, 2006 mnt_ns); 2007 if (err) 2008 goto out3; 2009 2010 /* 2011 * XXX - Note, we are not providing any open 2012 * mode flags here (like FNDELAY), so we may 2013 * block if there are locks present... this 2014 * should be addressed in openat(). 2015 */ 2016 /* XXX - would it be OK to generate a log record here? 
*/ 2017 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2018 if (err) 2019 goto out3; 2020 } 2021 2022 if (mask & (ATTR_ATIME|ATTR_MTIME) || 2023 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2024 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2025 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2026 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2027 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2028 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2029 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2030 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2031 skipaclchk, cr, mnt_ns); 2032 } 2033 2034 if (mask & (ATTR_UID|ATTR_GID)) { 2035 int idmask = (mask & (ATTR_UID|ATTR_GID)); 2036 int take_owner; 2037 int take_group; 2038 uid_t uid; 2039 gid_t gid; 2040 2041 /* 2042 * NOTE: even if a new mode is being set, 2043 * we may clear S_ISUID/S_ISGID bits. 2044 */ 2045 2046 if (!(mask & ATTR_MODE)) 2047 vap->va_mode = zp->z_mode; 2048 2049 /* 2050 * Take ownership or chgrp to group we are a member of 2051 */ 2052 2053 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), 2054 vap->va_uid); 2055 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), 2056 vap->va_gid); 2057 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); 2058 take_group = (mask & ATTR_GID) && 2059 zfs_groupmember(zfsvfs, gid, cr); 2060 2061 /* 2062 * If both ATTR_UID and ATTR_GID are set then take_owner and 2063 * take_group must both be set in order to allow taking 2064 * ownership. 
2065 * 2066 * Otherwise, send the check through secpolicy_vnode_setattr() 2067 * 2068 */ 2069 2070 if (((idmask == (ATTR_UID|ATTR_GID)) && 2071 take_owner && take_group) || 2072 ((idmask == ATTR_UID) && take_owner) || 2073 ((idmask == ATTR_GID) && take_group)) { 2074 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2075 skipaclchk, cr, mnt_ns) == 0) { 2076 /* 2077 * Remove setuid/setgid for non-privileged users 2078 */ 2079 (void) secpolicy_setid_clear(vap, cr); 2080 trim_mask = (mask & (ATTR_UID|ATTR_GID)); 2081 } else { 2082 need_policy = TRUE; 2083 } 2084 } else { 2085 need_policy = TRUE; 2086 } 2087 } 2088 2089 mutex_enter(&zp->z_lock); 2090 oldva.va_mode = zp->z_mode; 2091 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2092 if (mask & ATTR_XVATTR) { 2093 /* 2094 * Update xvattr mask to include only those attributes 2095 * that are actually changing. 2096 * 2097 * the bits will be restored prior to actually setting 2098 * the attributes so the caller thinks they were set. 2099 */ 2100 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2101 if (xoap->xoa_appendonly != 2102 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2103 need_policy = TRUE; 2104 } else { 2105 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2106 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); 2107 } 2108 } 2109 2110 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2111 if (xoap->xoa_projinherit != 2112 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 2113 need_policy = TRUE; 2114 } else { 2115 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 2116 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); 2117 } 2118 } 2119 2120 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2121 if (xoap->xoa_nounlink != 2122 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2123 need_policy = TRUE; 2124 } else { 2125 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2126 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); 2127 } 2128 } 2129 2130 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2131 if (xoap->xoa_immutable != 2132 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2133 need_policy = TRUE; 2134 } else { 2135 XVA_CLR_REQ(xvap, 
XAT_IMMUTABLE); 2136 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); 2137 } 2138 } 2139 2140 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2141 if (xoap->xoa_nodump != 2142 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2143 need_policy = TRUE; 2144 } else { 2145 XVA_CLR_REQ(xvap, XAT_NODUMP); 2146 XVA_SET_REQ(tmpxvattr, XAT_NODUMP); 2147 } 2148 } 2149 2150 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2151 if (xoap->xoa_av_modified != 2152 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2153 need_policy = TRUE; 2154 } else { 2155 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2156 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); 2157 } 2158 } 2159 2160 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2161 if ((!S_ISREG(ip->i_mode) && 2162 xoap->xoa_av_quarantined) || 2163 xoap->xoa_av_quarantined != 2164 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2165 need_policy = TRUE; 2166 } else { 2167 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2168 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); 2169 } 2170 } 2171 2172 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2173 mutex_exit(&zp->z_lock); 2174 err = SET_ERROR(EPERM); 2175 goto out3; 2176 } 2177 2178 if (need_policy == FALSE && 2179 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2180 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2181 need_policy = TRUE; 2182 } 2183 } 2184 2185 mutex_exit(&zp->z_lock); 2186 2187 if (mask & ATTR_MODE) { 2188 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, 2189 mnt_ns) == 0) { 2190 err = secpolicy_setid_setsticky_clear(ip, vap, 2191 &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); 2192 if (err) 2193 goto out3; 2194 trim_mask |= ATTR_MODE; 2195 } else { 2196 need_policy = TRUE; 2197 } 2198 } 2199 2200 if (need_policy) { 2201 /* 2202 * If trim_mask is set then take ownership 2203 * has been granted or write_acl is present and user 2204 * has the ability to modify mode. In that case remove 2205 * UID|GID and or MODE from mask so that 2206 * secpolicy_vnode_setattr() doesn't revoke it. 
2207 */ 2208 2209 if (trim_mask) { 2210 saved_mask = vap->va_mask; 2211 vap->va_mask &= ~trim_mask; 2212 } 2213 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, 2214 zfs_zaccess_unix, zp); 2215 if (err) 2216 goto out3; 2217 2218 if (trim_mask) 2219 vap->va_mask |= saved_mask; 2220 } 2221 2222 /* 2223 * secpolicy_vnode_setattr, or take ownership may have 2224 * changed va_mask 2225 */ 2226 mask = vap->va_mask; 2227 2228 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { 2229 handle_eadir = B_TRUE; 2230 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2231 &xattr_obj, sizeof (xattr_obj)); 2232 2233 if (err == 0 && xattr_obj) { 2234 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); 2235 if (err) 2236 goto out2; 2237 } 2238 if (mask & ATTR_UID) { 2239 new_kuid = zfs_fuid_create(zfsvfs, 2240 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2241 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && 2242 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 2243 new_kuid)) { 2244 if (attrzp) 2245 zrele(attrzp); 2246 err = SET_ERROR(EDQUOT); 2247 goto out2; 2248 } 2249 } 2250 2251 if (mask & ATTR_GID) { 2252 new_kgid = zfs_fuid_create(zfsvfs, 2253 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); 2254 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && 2255 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 2256 new_kgid)) { 2257 if (attrzp) 2258 zrele(attrzp); 2259 err = SET_ERROR(EDQUOT); 2260 goto out2; 2261 } 2262 } 2263 2264 if (projid != ZFS_INVALID_PROJID && 2265 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 2266 if (attrzp) 2267 zrele(attrzp); 2268 err = EDQUOT; 2269 goto out2; 2270 } 2271 } 2272 tx = dmu_tx_create(os); 2273 2274 if (mask & ATTR_MODE) { 2275 uint64_t pmode = zp->z_mode; 2276 uint64_t acl_obj; 2277 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2278 2279 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && 2280 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 2281 err = EPERM; 2282 goto out; 2283 } 2284 2285 if ((err = 
zfs_acl_chmod_setattr(zp, &aclp, new_mode))) 2286 goto out; 2287 2288 mutex_enter(&zp->z_lock); 2289 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 2290 /* 2291 * Are we upgrading ACL from old V0 format 2292 * to V1 format? 2293 */ 2294 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 2295 zfs_znode_acl_version(zp) == 2296 ZFS_ACL_VERSION_INITIAL) { 2297 dmu_tx_hold_free(tx, acl_obj, 0, 2298 DMU_OBJECT_END); 2299 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2300 0, aclp->z_acl_bytes); 2301 } else { 2302 dmu_tx_hold_write(tx, acl_obj, 0, 2303 aclp->z_acl_bytes); 2304 } 2305 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2306 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2307 0, aclp->z_acl_bytes); 2308 } 2309 mutex_exit(&zp->z_lock); 2310 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2311 } else { 2312 if (((mask & ATTR_XVATTR) && 2313 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2314 (projid != ZFS_INVALID_PROJID && 2315 !(zp->z_pflags & ZFS_PROJID))) 2316 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2317 else 2318 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2319 } 2320 2321 if (attrzp) { 2322 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 2323 } 2324 2325 fuid_dirtied = zfsvfs->z_fuid_dirty; 2326 if (fuid_dirtied) 2327 zfs_fuid_txhold(zfsvfs, tx); 2328 2329 zfs_sa_upgrade_txholds(tx, zp); 2330 2331 err = dmu_tx_assign(tx, TXG_WAIT); 2332 if (err) 2333 goto out; 2334 2335 count = 0; 2336 /* 2337 * Set each attribute requested. 2338 * We group settings according to the locks they need to acquire. 2339 * 2340 * Note: you cannot set ctime directly, although it will be 2341 * updated as a side-effect of calling this function. 2342 */ 2343 2344 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 2345 /* 2346 * For the existed object that is upgraded from old system, 2347 * its on-disk layout has no slot for the project ID attribute. 2348 * But quota accounting logic needs to access related slots by 2349 * offset directly. 
So we need to adjust old objects' layout 2350 * to make the project ID to some unified and fixed offset. 2351 */ 2352 if (attrzp) 2353 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 2354 if (err == 0) 2355 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 2356 2357 if (unlikely(err == EEXIST)) 2358 err = 0; 2359 else if (err != 0) 2360 goto out; 2361 else 2362 projid = ZFS_INVALID_PROJID; 2363 } 2364 2365 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2366 mutex_enter(&zp->z_acl_lock); 2367 mutex_enter(&zp->z_lock); 2368 2369 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 2370 &zp->z_pflags, sizeof (zp->z_pflags)); 2371 2372 if (attrzp) { 2373 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2374 mutex_enter(&attrzp->z_acl_lock); 2375 mutex_enter(&attrzp->z_lock); 2376 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2377 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 2378 sizeof (attrzp->z_pflags)); 2379 if (projid != ZFS_INVALID_PROJID) { 2380 attrzp->z_projid = projid; 2381 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2382 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 2383 sizeof (attrzp->z_projid)); 2384 } 2385 } 2386 2387 if (mask & (ATTR_UID|ATTR_GID)) { 2388 2389 if (mask & ATTR_UID) { 2390 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); 2391 new_uid = zfs_uid_read(ZTOI(zp)); 2392 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2393 &new_uid, sizeof (new_uid)); 2394 if (attrzp) { 2395 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2396 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 2397 sizeof (new_uid)); 2398 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); 2399 } 2400 } 2401 2402 if (mask & ATTR_GID) { 2403 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); 2404 new_gid = zfs_gid_read(ZTOI(zp)); 2405 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 2406 NULL, &new_gid, sizeof (new_gid)); 2407 if (attrzp) { 2408 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2409 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 2410 sizeof (new_gid)); 2411 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); 2412 } 2413 } 2414 if 
(!(mask & ATTR_MODE)) { 2415 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 2416 NULL, &new_mode, sizeof (new_mode)); 2417 new_mode = zp->z_mode; 2418 } 2419 err = zfs_acl_chown_setattr(zp); 2420 ASSERT(err == 0); 2421 if (attrzp) { 2422 err = zfs_acl_chown_setattr(attrzp); 2423 ASSERT(err == 0); 2424 } 2425 } 2426 2427 if (mask & ATTR_MODE) { 2428 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 2429 &new_mode, sizeof (new_mode)); 2430 zp->z_mode = ZTOI(zp)->i_mode = new_mode; 2431 ASSERT3P(aclp, !=, NULL); 2432 err = zfs_aclset_common(zp, aclp, cr, tx); 2433 ASSERT0(err); 2434 if (zp->z_acl_cached) 2435 zfs_acl_free(zp->z_acl_cached); 2436 zp->z_acl_cached = aclp; 2437 aclp = NULL; 2438 } 2439 2440 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { 2441 zp->z_atime_dirty = B_FALSE; 2442 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); 2443 ZFS_TIME_ENCODE(&tmp_atime, atime); 2444 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 2445 &atime, sizeof (atime)); 2446 } 2447 2448 if (mask & (ATTR_MTIME | ATTR_SIZE)) { 2449 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 2450 zpl_inode_set_mtime_to_ts(ZTOI(zp), 2451 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); 2452 2453 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 2454 mtime, sizeof (mtime)); 2455 } 2456 2457 if (mask & (ATTR_CTIME | ATTR_SIZE)) { 2458 ZFS_TIME_ENCODE(&vap->va_ctime, ctime); 2459 zpl_inode_set_ctime_to_ts(ZTOI(zp), 2460 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); 2461 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 2462 ctime, sizeof (ctime)); 2463 } 2464 2465 if (projid != ZFS_INVALID_PROJID) { 2466 zp->z_projid = projid; 2467 SA_ADD_BULK_ATTR(bulk, count, 2468 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 2469 sizeof (zp->z_projid)); 2470 } 2471 2472 if (attrzp && mask) { 2473 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2474 SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 2475 sizeof (ctime)); 2476 } 2477 2478 /* 2479 * Do this after setting timestamps to 
prevent timestamp 2480 * update from toggling bit 2481 */ 2482 2483 if (xoap && (mask & ATTR_XVATTR)) { 2484 2485 /* 2486 * restore trimmed off masks 2487 * so that return masks can be set for caller. 2488 */ 2489 2490 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { 2491 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2492 } 2493 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { 2494 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2495 } 2496 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { 2497 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2498 } 2499 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { 2500 XVA_SET_REQ(xvap, XAT_NODUMP); 2501 } 2502 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { 2503 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2504 } 2505 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { 2506 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2507 } 2508 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { 2509 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 2510 } 2511 2512 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 2513 ASSERT(S_ISREG(ip->i_mode)); 2514 2515 zfs_xvattr_set(zp, xvap, tx); 2516 } 2517 2518 if (fuid_dirtied) 2519 zfs_fuid_sync(zfsvfs, tx); 2520 2521 if (mask != 0) 2522 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2523 2524 mutex_exit(&zp->z_lock); 2525 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2526 mutex_exit(&zp->z_acl_lock); 2527 2528 if (attrzp) { 2529 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2530 mutex_exit(&attrzp->z_acl_lock); 2531 mutex_exit(&attrzp->z_lock); 2532 } 2533 out: 2534 if (err == 0 && xattr_count > 0) { 2535 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 2536 xattr_count, tx); 2537 ASSERT(err2 == 0); 2538 } 2539 2540 if (aclp) 2541 zfs_acl_free(aclp); 2542 2543 if (fuidp) { 2544 zfs_fuid_info_free(fuidp); 2545 fuidp = NULL; 2546 } 2547 2548 if (err) { 2549 dmu_tx_abort(tx); 2550 if (attrzp) 2551 zrele(attrzp); 2552 if (err == ERESTART) 2553 goto top; 2554 } else { 2555 if (count > 0) 2556 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2557 dmu_tx_commit(tx); 2558 if 
(attrzp) { 2559 if (err2 == 0 && handle_eadir) 2560 err = zfs_setattr_dir(attrzp); 2561 zrele(attrzp); 2562 } 2563 zfs_znode_update_vfs(zp); 2564 } 2565 2566 out2: 2567 if (os->os_sync == ZFS_SYNC_ALWAYS) 2568 zil_commit(zilog, 0); 2569 2570 out3: 2571 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); 2572 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); 2573 kmem_free(tmpxvattr, sizeof (xvattr_t)); 2574 zfs_exit(zfsvfs, FTAG); 2575 return (err); 2576 } 2577 2578 typedef struct zfs_zlock { 2579 krwlock_t *zl_rwlock; /* lock we acquired */ 2580 znode_t *zl_znode; /* znode we held */ 2581 struct zfs_zlock *zl_next; /* next in list */ 2582 } zfs_zlock_t; 2583 2584 /* 2585 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2586 */ 2587 static void 2588 zfs_rename_unlock(zfs_zlock_t **zlpp) 2589 { 2590 zfs_zlock_t *zl; 2591 2592 while ((zl = *zlpp) != NULL) { 2593 if (zl->zl_znode != NULL) 2594 zfs_zrele_async(zl->zl_znode); 2595 rw_exit(zl->zl_rwlock); 2596 *zlpp = zl->zl_next; 2597 kmem_free(zl, sizeof (*zl)); 2598 } 2599 } 2600 2601 /* 2602 * Search back through the directory tree, using the ".." entries. 2603 * Lock each directory in the chain to prevent concurrent renames. 2604 * Fail any attempt to move a directory into one of its own descendants. 2605 * XXX - z_parent_lock can overlap with map or grow locks 2606 */ 2607 static int 2608 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 2609 { 2610 zfs_zlock_t *zl; 2611 znode_t *zp = tdzp; 2612 uint64_t rootid = ZTOZSB(zp)->z_root; 2613 uint64_t oidp = zp->z_id; 2614 krwlock_t *rwlp = &szp->z_parent_lock; 2615 krw_t rw = RW_WRITER; 2616 2617 /* 2618 * First pass write-locks szp and compares to zp->z_id. 2619 * Later passes read-lock zp and compare to zp->z_parent. 2620 */ 2621 do { 2622 if (!rw_tryenter(rwlp, rw)) { 2623 /* 2624 * Another thread is renaming in this path. 2625 * Note that if we are a WRITER, we don't have any 2626 * parent_locks held yet. 
2627 */ 2628 if (rw == RW_READER && zp->z_id > szp->z_id) { 2629 /* 2630 * Drop our locks and restart 2631 */ 2632 zfs_rename_unlock(&zl); 2633 *zlpp = NULL; 2634 zp = tdzp; 2635 oidp = zp->z_id; 2636 rwlp = &szp->z_parent_lock; 2637 rw = RW_WRITER; 2638 continue; 2639 } else { 2640 /* 2641 * Wait for other thread to drop its locks 2642 */ 2643 rw_enter(rwlp, rw); 2644 } 2645 } 2646 2647 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 2648 zl->zl_rwlock = rwlp; 2649 zl->zl_znode = NULL; 2650 zl->zl_next = *zlpp; 2651 *zlpp = zl; 2652 2653 if (oidp == szp->z_id) /* We're a descendant of szp */ 2654 return (SET_ERROR(EINVAL)); 2655 2656 if (oidp == rootid) /* We've hit the top */ 2657 return (0); 2658 2659 if (rw == RW_READER) { /* i.e. not the first pass */ 2660 int error = zfs_zget(ZTOZSB(zp), oidp, &zp); 2661 if (error) 2662 return (error); 2663 zl->zl_znode = zp; 2664 } 2665 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), 2666 &oidp, sizeof (oidp)); 2667 rwlp = &zp->z_parent_lock; 2668 rw = RW_READER; 2669 2670 } while (zp->z_id != sdzp->z_id); 2671 2672 return (0); 2673 } 2674 2675 /* 2676 * Move an entry from the provided source directory to the target 2677 * directory. Change the entry name as indicated. 2678 * 2679 * IN: sdzp - Source directory containing the "old entry". 2680 * snm - Old entry name. 2681 * tdzp - Target directory to contain the "new entry". 2682 * tnm - New entry name. 2683 * cr - credentials of caller. 2684 * flags - case flags 2685 * rflags - RENAME_* flags 2686 * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). 2687 * mnt_ns - user namespace of the mount 2688 * 2689 * RETURN: 0 on success, error code on failure. 
2690 * 2691 * Timestamps: 2692 * sdzp,tdzp - ctime|mtime updated 2693 */ 2694 int 2695 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, 2696 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) 2697 { 2698 znode_t *szp, *tzp; 2699 zfsvfs_t *zfsvfs = ZTOZSB(sdzp); 2700 zilog_t *zilog; 2701 zfs_dirlock_t *sdl, *tdl; 2702 dmu_tx_t *tx; 2703 zfs_zlock_t *zl; 2704 int cmp, serr, terr; 2705 int error = 0; 2706 int zflg = 0; 2707 boolean_t waited = B_FALSE; 2708 /* Needed for whiteout inode creation. */ 2709 boolean_t fuid_dirtied; 2710 zfs_acl_ids_t acl_ids; 2711 boolean_t have_acl = B_FALSE; 2712 znode_t *wzp = NULL; 2713 2714 2715 if (snm == NULL || tnm == NULL) 2716 return (SET_ERROR(EINVAL)); 2717 2718 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2719 return (SET_ERROR(EINVAL)); 2720 2721 /* Already checked by Linux VFS, but just to make sure. */ 2722 if (rflags & RENAME_EXCHANGE && 2723 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) 2724 return (SET_ERROR(EINVAL)); 2725 2726 /* 2727 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the 2728 * right kind of vattr_t for the whiteout file. These are set 2729 * internally by ZFS so should never be incorrect. 2730 */ 2731 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); 2732 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); 2733 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); 2734 2735 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) 2736 return (error); 2737 zilog = zfsvfs->z_log; 2738 2739 if ((error = zfs_verify_zp(tdzp)) != 0) { 2740 zfs_exit(zfsvfs, FTAG); 2741 return (error); 2742 } 2743 2744 /* 2745 * We check i_sb because snapshots and the ctldir must have different 2746 * super blocks. 
2747 */ 2748 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || 2749 zfsctl_is_node(ZTOI(tdzp))) { 2750 zfs_exit(zfsvfs, FTAG); 2751 return (SET_ERROR(EXDEV)); 2752 } 2753 2754 if (zfsvfs->z_utf8 && u8_validate(tnm, 2755 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2756 zfs_exit(zfsvfs, FTAG); 2757 return (SET_ERROR(EILSEQ)); 2758 } 2759 2760 if (flags & FIGNORECASE) 2761 zflg |= ZCILOOK; 2762 2763 top: 2764 szp = NULL; 2765 tzp = NULL; 2766 zl = NULL; 2767 2768 /* 2769 * This is to prevent the creation of links into attribute space 2770 * by renaming a linked file into/outof an attribute directory. 2771 * See the comment in zfs_link() for why this is considered bad. 2772 */ 2773 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 2774 zfs_exit(zfsvfs, FTAG); 2775 return (SET_ERROR(EINVAL)); 2776 } 2777 2778 /* 2779 * Lock source and target directory entries. To prevent deadlock, 2780 * a lock ordering must be defined. We lock the directory with 2781 * the smallest object id first, or if it's a tie, the one with 2782 * the lexically first name. 2783 */ 2784 if (sdzp->z_id < tdzp->z_id) { 2785 cmp = -1; 2786 } else if (sdzp->z_id > tdzp->z_id) { 2787 cmp = 1; 2788 } else { 2789 /* 2790 * First compare the two name arguments without 2791 * considering any case folding. 2792 */ 2793 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 2794 2795 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 2796 ASSERT(error == 0 || !zfsvfs->z_utf8); 2797 if (cmp == 0) { 2798 /* 2799 * POSIX: "If the old argument and the new argument 2800 * both refer to links to the same existing file, 2801 * the rename() function shall return successfully 2802 * and perform no other action." 2803 */ 2804 zfs_exit(zfsvfs, FTAG); 2805 return (0); 2806 } 2807 /* 2808 * If the file system is case-folding, then we may 2809 * have some more checking to do. 
A case-folding file 2810 * system is either supporting mixed case sensitivity 2811 * access or is completely case-insensitive. Note 2812 * that the file system is always case preserving. 2813 * 2814 * In mixed sensitivity mode case sensitive behavior 2815 * is the default. FIGNORECASE must be used to 2816 * explicitly request case insensitive behavior. 2817 * 2818 * If the source and target names provided differ only 2819 * by case (e.g., a request to rename 'tim' to 'Tim'), 2820 * we will treat this as a special case in the 2821 * case-insensitive mode: as long as the source name 2822 * is an exact match, we will allow this to proceed as 2823 * a name-change request. 2824 */ 2825 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 2826 (zfsvfs->z_case == ZFS_CASE_MIXED && 2827 flags & FIGNORECASE)) && 2828 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 2829 &error) == 0) { 2830 /* 2831 * case preserving rename request, require exact 2832 * name matches 2833 */ 2834 zflg |= ZCIEXACT; 2835 zflg &= ~ZCILOOK; 2836 } 2837 } 2838 2839 /* 2840 * If the source and destination directories are the same, we should 2841 * grab the z_name_lock of that directory only once. 2842 */ 2843 if (sdzp == tdzp) { 2844 zflg |= ZHAVELOCK; 2845 rw_enter(&sdzp->z_name_lock, RW_READER); 2846 } 2847 2848 if (cmp < 0) { 2849 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 2850 ZEXISTS | zflg, NULL, NULL); 2851 terr = zfs_dirent_lock(&tdl, 2852 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 2853 } else { 2854 terr = zfs_dirent_lock(&tdl, 2855 tdzp, tnm, &tzp, zflg, NULL, NULL); 2856 serr = zfs_dirent_lock(&sdl, 2857 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 2858 NULL, NULL); 2859 } 2860 2861 if (serr) { 2862 /* 2863 * Source entry invalid or not there. 
2864 */ 2865 if (!terr) { 2866 zfs_dirent_unlock(tdl); 2867 if (tzp) 2868 zrele(tzp); 2869 } 2870 2871 if (sdzp == tdzp) 2872 rw_exit(&sdzp->z_name_lock); 2873 2874 if (strcmp(snm, "..") == 0) 2875 serr = EINVAL; 2876 zfs_exit(zfsvfs, FTAG); 2877 return (serr); 2878 } 2879 if (terr) { 2880 zfs_dirent_unlock(sdl); 2881 zrele(szp); 2882 2883 if (sdzp == tdzp) 2884 rw_exit(&sdzp->z_name_lock); 2885 2886 if (strcmp(tnm, "..") == 0) 2887 terr = EINVAL; 2888 zfs_exit(zfsvfs, FTAG); 2889 return (terr); 2890 } 2891 2892 /* 2893 * If we are using project inheritance, means if the directory has 2894 * ZFS_PROJINHERIT set, then its descendant directories will inherit 2895 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 2896 * such case, we only allow renames into our tree when the project 2897 * IDs are the same. 2898 */ 2899 if (tdzp->z_pflags & ZFS_PROJINHERIT && 2900 tdzp->z_projid != szp->z_projid) { 2901 error = SET_ERROR(EXDEV); 2902 goto out; 2903 } 2904 2905 /* 2906 * Must have write access at the source to remove the old entry 2907 * and write access at the target to create the new entry. 2908 * Note that if target and source are the same, this can be 2909 * done in a single check. 2910 */ 2911 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) 2912 goto out; 2913 2914 if (S_ISDIR(ZTOI(szp)->i_mode)) { 2915 /* 2916 * Check to make sure rename is valid. 2917 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2918 */ 2919 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) 2920 goto out; 2921 } 2922 2923 /* 2924 * Does target exist? 2925 */ 2926 if (tzp) { 2927 if (rflags & RENAME_NOREPLACE) { 2928 error = SET_ERROR(EEXIST); 2929 goto out; 2930 } 2931 /* 2932 * Source and target must be the same type (unless exchanging). 
2933 */ 2934 if (!(rflags & RENAME_EXCHANGE)) { 2935 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; 2936 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; 2937 2938 if (s_is_dir != t_is_dir) { 2939 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); 2940 goto out; 2941 } 2942 } 2943 /* 2944 * POSIX dictates that when the source and target 2945 * entries refer to the same file object, rename 2946 * must do nothing and exit without error. 2947 */ 2948 if (szp->z_id == tzp->z_id) { 2949 error = 0; 2950 goto out; 2951 } 2952 } else if (rflags & RENAME_EXCHANGE) { 2953 /* Target must exist for RENAME_EXCHANGE. */ 2954 error = SET_ERROR(ENOENT); 2955 goto out; 2956 } 2957 2958 /* Set up inode creation for RENAME_WHITEOUT. */ 2959 if (rflags & RENAME_WHITEOUT) { 2960 /* 2961 * Whiteout files are not regular files or directories, so to 2962 * match zfs_create() we do not inherit the project id. 2963 */ 2964 uint64_t wo_projid = ZFS_DEFAULT_PROJID; 2965 2966 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); 2967 if (error) 2968 goto out; 2969 2970 if (!have_acl) { 2971 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, 2972 &acl_ids, mnt_ns); 2973 if (error) 2974 goto out; 2975 have_acl = B_TRUE; 2976 } 2977 2978 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { 2979 error = SET_ERROR(EDQUOT); 2980 goto out; 2981 } 2982 } 2983 2984 tx = dmu_tx_create(zfsvfs->z_os); 2985 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 2986 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 2987 dmu_tx_hold_zap(tx, sdzp->z_id, 2988 (rflags & RENAME_EXCHANGE) ? 
TRUE : FALSE, snm); 2989 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 2990 if (sdzp != tdzp) { 2991 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 2992 zfs_sa_upgrade_txholds(tx, tdzp); 2993 } 2994 if (tzp) { 2995 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 2996 zfs_sa_upgrade_txholds(tx, tzp); 2997 } 2998 if (rflags & RENAME_WHITEOUT) { 2999 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3000 ZFS_SA_BASE_ATTR_SIZE); 3001 3002 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); 3003 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3004 if (!zfsvfs->z_use_sa && 3005 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3006 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3007 0, acl_ids.z_aclp->z_acl_bytes); 3008 } 3009 } 3010 fuid_dirtied = zfsvfs->z_fuid_dirty; 3011 if (fuid_dirtied) 3012 zfs_fuid_txhold(zfsvfs, tx); 3013 zfs_sa_upgrade_txholds(tx, szp); 3014 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3015 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3016 if (error) { 3017 if (zl != NULL) 3018 zfs_rename_unlock(&zl); 3019 zfs_dirent_unlock(sdl); 3020 zfs_dirent_unlock(tdl); 3021 3022 if (sdzp == tdzp) 3023 rw_exit(&sdzp->z_name_lock); 3024 3025 if (error == ERESTART) { 3026 waited = B_TRUE; 3027 dmu_tx_wait(tx); 3028 dmu_tx_abort(tx); 3029 zrele(szp); 3030 if (tzp) 3031 zrele(tzp); 3032 goto top; 3033 } 3034 dmu_tx_abort(tx); 3035 zrele(szp); 3036 if (tzp) 3037 zrele(tzp); 3038 zfs_exit(zfsvfs, FTAG); 3039 return (error); 3040 } 3041 3042 /* 3043 * Unlink the source. 3044 */ 3045 szp->z_pflags |= ZFS_AV_MODIFIED; 3046 if (tdzp->z_pflags & ZFS_PROJINHERIT) 3047 szp->z_pflags |= ZFS_PROJINHERIT; 3048 3049 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3050 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3051 VERIFY0(error); 3052 3053 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3054 if (error) 3055 goto commit; 3056 3057 /* 3058 * Unlink the target. 
3059 */ 3060 if (tzp) { 3061 int tzflg = zflg; 3062 3063 if (rflags & RENAME_EXCHANGE) { 3064 /* This inode will be re-linked soon. */ 3065 tzflg |= ZRENAMING; 3066 3067 tzp->z_pflags |= ZFS_AV_MODIFIED; 3068 if (sdzp->z_pflags & ZFS_PROJINHERIT) 3069 tzp->z_pflags |= ZFS_PROJINHERIT; 3070 3071 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3072 (void *)&tzp->z_pflags, sizeof (uint64_t), tx); 3073 ASSERT0(error); 3074 } 3075 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); 3076 if (error) 3077 goto commit_link_szp; 3078 } 3079 3080 /* 3081 * Create the new target links: 3082 * * We always link the target. 3083 * * RENAME_EXCHANGE: Link the old target to the source. 3084 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. 3085 */ 3086 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3087 if (error) { 3088 /* 3089 * If we have removed the existing target, a subsequent call to 3090 * zfs_link_create() to add back the same entry, but with a new 3091 * dnode (szp), should not fail. 3092 */ 3093 ASSERT3P(tzp, ==, NULL); 3094 goto commit_link_tzp; 3095 } 3096 3097 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3098 case RENAME_EXCHANGE: 3099 error = zfs_link_create(sdl, tzp, tx, ZRENAMING); 3100 /* 3101 * The same argument as zfs_link_create() failing for 3102 * szp applies here, since the source directory must 3103 * have had an entry we are replacing. 3104 */ 3105 ASSERT0(error); 3106 if (error) 3107 goto commit_unlink_td_szp; 3108 break; 3109 case RENAME_WHITEOUT: 3110 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); 3111 error = zfs_link_create(sdl, wzp, tx, ZNEW); 3112 if (error) { 3113 zfs_znode_delete(wzp, tx); 3114 remove_inode_hash(ZTOI(wzp)); 3115 goto commit_unlink_td_szp; 3116 } 3117 break; 3118 } 3119 3120 if (fuid_dirtied) 3121 zfs_fuid_sync(zfsvfs, tx); 3122 3123 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3124 case RENAME_EXCHANGE: 3125 zfs_log_rename_exchange(zilog, tx, 3126 (flags & FIGNORECASE ? 
TX_CI : 0), sdzp, sdl->dl_name, 3127 tdzp, tdl->dl_name, szp); 3128 break; 3129 case RENAME_WHITEOUT: 3130 zfs_log_rename_whiteout(zilog, tx, 3131 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3132 tdzp, tdl->dl_name, szp, wzp); 3133 break; 3134 default: 3135 ASSERT0(rflags & ~RENAME_NOREPLACE); 3136 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), 3137 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3138 break; 3139 } 3140 3141 commit: 3142 dmu_tx_commit(tx); 3143 out: 3144 if (have_acl) 3145 zfs_acl_ids_free(&acl_ids); 3146 3147 zfs_znode_update_vfs(sdzp); 3148 if (sdzp == tdzp) 3149 rw_exit(&sdzp->z_name_lock); 3150 3151 if (sdzp != tdzp) 3152 zfs_znode_update_vfs(tdzp); 3153 3154 zfs_znode_update_vfs(szp); 3155 zrele(szp); 3156 if (wzp) { 3157 zfs_znode_update_vfs(wzp); 3158 zrele(wzp); 3159 } 3160 if (tzp) { 3161 zfs_znode_update_vfs(tzp); 3162 zrele(tzp); 3163 } 3164 3165 if (zl != NULL) 3166 zfs_rename_unlock(&zl); 3167 3168 zfs_dirent_unlock(sdl); 3169 zfs_dirent_unlock(tdl); 3170 3171 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3172 zil_commit(zilog, 0); 3173 3174 zfs_exit(zfsvfs, FTAG); 3175 return (error); 3176 3177 /* 3178 * Clean-up path for broken link state. 3179 * 3180 * At this point we are in a (very) bad state, so we need to do our 3181 * best to correct the state. In particular, all of the nlinks are 3182 * wrong because we were destroying and creating links with ZRENAMING. 3183 * 3184 * In some form, all of these operations have to resolve the state: 3185 * 3186 * * link_destroy() *must* succeed. Fortunately, this is very likely 3187 * since we only just created it. 3188 * 3189 * * link_create()s are allowed to fail (though they shouldn't because 3190 * we only just unlinked them and are putting the entries back 3191 * during clean-up). 
But if they fail, we can just forcefully drop 3192 * the nlink value to (at the very least) avoid broken nlink values 3193 * -- though in the case of non-empty directories we will have to 3194 * panic (otherwise we'd have a leaked directory with a broken ..). 3195 */ 3196 commit_unlink_td_szp: 3197 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); 3198 commit_link_tzp: 3199 if (tzp) { 3200 if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) 3201 VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); 3202 } 3203 commit_link_szp: 3204 if (zfs_link_create(sdl, szp, tx, ZRENAMING)) 3205 VERIFY0(zfs_drop_nlink(szp, tx, NULL)); 3206 goto commit; 3207 } 3208 3209 /* 3210 * Insert the indicated symbolic reference entry into the directory. 3211 * 3212 * IN: dzp - Directory to contain new symbolic link. 3213 * name - Name of directory entry in dip. 3214 * vap - Attributes of new entry. 3215 * link - Name for new symlink entry. 3216 * cr - credentials of caller. 3217 * flags - case flags 3218 * mnt_ns - user namespace of the mount 3219 * 3220 * OUT: zpp - Znode for new symbolic link. 3221 * 3222 * RETURN: 0 on success, error code on failure. 
3223 * 3224 * Timestamps: 3225 * dip - ctime|mtime updated 3226 */ 3227 int 3228 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, 3229 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) 3230 { 3231 znode_t *zp; 3232 zfs_dirlock_t *dl; 3233 dmu_tx_t *tx; 3234 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 3235 zilog_t *zilog; 3236 uint64_t len = strlen(link); 3237 int error; 3238 int zflg = ZNEW; 3239 zfs_acl_ids_t acl_ids; 3240 boolean_t fuid_dirtied; 3241 uint64_t txtype = TX_SYMLINK; 3242 boolean_t waited = B_FALSE; 3243 3244 ASSERT(S_ISLNK(vap->va_mode)); 3245 3246 if (name == NULL) 3247 return (SET_ERROR(EINVAL)); 3248 3249 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 3250 return (error); 3251 zilog = zfsvfs->z_log; 3252 3253 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3254 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3255 zfs_exit(zfsvfs, FTAG); 3256 return (SET_ERROR(EILSEQ)); 3257 } 3258 if (flags & FIGNORECASE) 3259 zflg |= ZCILOOK; 3260 3261 if (len > MAXPATHLEN) { 3262 zfs_exit(zfsvfs, FTAG); 3263 return (SET_ERROR(ENAMETOOLONG)); 3264 } 3265 3266 if ((error = zfs_acl_ids_create(dzp, 0, 3267 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { 3268 zfs_exit(zfsvfs, FTAG); 3269 return (error); 3270 } 3271 top: 3272 *zpp = NULL; 3273 3274 /* 3275 * Attempt to lock directory; fail if entry already exists. 
3276 */ 3277 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3278 if (error) { 3279 zfs_acl_ids_free(&acl_ids); 3280 zfs_exit(zfsvfs, FTAG); 3281 return (error); 3282 } 3283 3284 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 3285 zfs_acl_ids_free(&acl_ids); 3286 zfs_dirent_unlock(dl); 3287 zfs_exit(zfsvfs, FTAG); 3288 return (error); 3289 } 3290 3291 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 3292 zfs_acl_ids_free(&acl_ids); 3293 zfs_dirent_unlock(dl); 3294 zfs_exit(zfsvfs, FTAG); 3295 return (SET_ERROR(EDQUOT)); 3296 } 3297 tx = dmu_tx_create(zfsvfs->z_os); 3298 fuid_dirtied = zfsvfs->z_fuid_dirty; 3299 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3300 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3301 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3302 ZFS_SA_BASE_ATTR_SIZE + len); 3303 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3304 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3305 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3306 acl_ids.z_aclp->z_acl_bytes); 3307 } 3308 if (fuid_dirtied) 3309 zfs_fuid_txhold(zfsvfs, tx); 3310 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3311 if (error) { 3312 zfs_dirent_unlock(dl); 3313 if (error == ERESTART) { 3314 waited = B_TRUE; 3315 dmu_tx_wait(tx); 3316 dmu_tx_abort(tx); 3317 goto top; 3318 } 3319 zfs_acl_ids_free(&acl_ids); 3320 dmu_tx_abort(tx); 3321 zfs_exit(zfsvfs, FTAG); 3322 return (error); 3323 } 3324 3325 /* 3326 * Create a new object for the symlink. 
3327 * for version 4 ZPL datasets the symlink will be an SA attribute 3328 */ 3329 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 3330 3331 if (fuid_dirtied) 3332 zfs_fuid_sync(zfsvfs, tx); 3333 3334 mutex_enter(&zp->z_lock); 3335 if (zp->z_is_sa) 3336 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 3337 link, len, tx); 3338 else 3339 zfs_sa_symlink(zp, link, len, tx); 3340 mutex_exit(&zp->z_lock); 3341 3342 zp->z_size = len; 3343 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 3344 &zp->z_size, sizeof (zp->z_size), tx); 3345 /* 3346 * Insert the new object into the directory. 3347 */ 3348 error = zfs_link_create(dl, zp, tx, ZNEW); 3349 if (error != 0) { 3350 zfs_znode_delete(zp, tx); 3351 remove_inode_hash(ZTOI(zp)); 3352 } else { 3353 if (flags & FIGNORECASE) 3354 txtype |= TX_CI; 3355 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3356 3357 zfs_znode_update_vfs(dzp); 3358 zfs_znode_update_vfs(zp); 3359 } 3360 3361 zfs_acl_ids_free(&acl_ids); 3362 3363 dmu_tx_commit(tx); 3364 3365 zfs_dirent_unlock(dl); 3366 3367 if (error == 0) { 3368 *zpp = zp; 3369 3370 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3371 zil_commit(zilog, 0); 3372 } else { 3373 zrele(zp); 3374 } 3375 3376 zfs_exit(zfsvfs, FTAG); 3377 return (error); 3378 } 3379 3380 /* 3381 * Return, in the buffer contained in the provided uio structure, 3382 * the symbolic path referred to by ip. 3383 * 3384 * IN: ip - inode of symbolic link 3385 * uio - structure to contain the link path. 3386 * cr - credentials of caller. 
*
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - atime updated
 */
int
zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
{
	(void) cr;
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	int		error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/*
	 * z_lock is held across the lookup.  When z_is_sa is set the
	 * SA_ZPL_SYMLINK attribute is read straight into the uio; otherwise
	 * the read goes through zfs_sa_readlink().
	 */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);
	mutex_exit(&zp->z_lock);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Insert a new entry into directory tdzp referencing szp.
 *
 *	IN:	tdzp	- Directory to contain new entry.
 *		szp	- znode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		flags	- case flags.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdzp - ctime|mtime updated
 *	 szp - ctime updated
 *
 * Errors produced directly by this function (errors from the helpers it
 * calls are passed through unchanged):
 *	EINVAL	- name is NULL, or exactly one of szp/tdzp has ZFS_XATTR set.
 *	EPERM	- szp is a directory, szp's parent is the shares directory,
 *		  or the caller is not the owner of szp and
 *		  secpolicy_basic_link() refuses.
 *	EXDEV	- tdzp has ZFS_PROJINHERIT set and the project IDs differ,
 *		  szp lives on a different super block, or szp is a
 *		  control-directory node.
 *	EILSEQ	- the file system requires UTF-8 names and name fails
 *		  u8_validate().
 */
int
zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
    int flags)
{
	struct inode *sip = ZTOI(szp);
	znode_t		*tzp;
	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;
	boolean_t	waited = B_FALSE;
	boolean_t	is_tmpfile = 0;
	uint64_t	txg;
#ifdef HAVE_TMPFILE
	/* A linkable inode with no links is treated as a tmpfile below. */
	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
#endif
	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));

	/* Checked before zfs_enter, so no zfs_exit is needed on this path. */
	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (S_ISDIR(sip->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* Only tdzp was verified on entry; verify the source znode too. */
	if ((error = zfs_verify_zp(szp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * With project inheritance, i.e. when the directory has
	 * ZFS_PROJINHERIT set, its descendant directories inherit not only
	 * the project ID but also the ZFS_PROJINHERIT flag.  In that case
	 * we only allow hard link creation in our tree when the project IDs
	 * are the same.
	 */
	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
	    tdzp->z_projid != szp->z_projid) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * We check i_sb because snapshots and the ctldir must have different
	 * super blocks.
	 */
	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* Name validation only applies when the file system is UTF-8 only. */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	/* Case-insensitive callers add ZCILOOK to the dirent lock flags. */
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* A caller who does not own szp needs secpolicy_basic_link(). */
	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
	    cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* The caller must be allowed to add a file to the target directory. */
	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
	    zfs_init_idmap))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 * This label is re-entered after an ERESTART from dmu_tx_assign(),
	 * so the directory entry lock is re-acquired on every attempt.
	 */
	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
	/* A tmpfile is also removed from the unlinked set (see below). */
	if (is_tmpfile)
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, tdzp);
	/* Once we have waited, retries add TXG_NOTHROTTLE. */
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		/* The dirent lock is dropped before waiting or failing. */
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	/* unmark z_unlinked so zfs_link_create will not reject */
	if (is_tmpfile)
		szp->z_unlinked = B_FALSE;
	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		/*
		 * tmpfile is created to be in z_unlinkedobj, so remove it.
		 * Also, we don't log in ZIL, because all previous file
		 * operations on the tmpfile are ignored by ZIL.  Instead we
		 * always wait for txg to sync to make sure all previous
		 * operations are sync safe.
		 */
		if (is_tmpfile) {
			VERIFY(zap_remove_int(zfsvfs->z_os,
			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
		} else {
			if (flags & FIGNORECASE)
				txtype |= TX_CI;
			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
		}
	} else if (is_tmpfile) {
		/* restore z_unlinked since linking failed */
		szp->z_unlinked = B_TRUE;
	}
	/* Record the txg before committing; the tmpfile wait below uses it. */
	txg = dmu_tx_get_txg(tx);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	/* Regular links: commit the ZIL when the dataset is sync=always. */
	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	/*
	 * tmpfile links: wait for the txg to sync unless sync is disabled.
	 * NOTE(review): this wait is not conditional on error, so it also
	 * runs when zfs_link_create() failed -- confirm that is intended.
	 */
	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);

	zfs_znode_update_vfs(tdzp);
	zfs_znode_update_vfs(szp);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Commit callback passed to zfs_log_write() by zfs_putpage() when for_sync
 * is set.  arg is the page that was written: its error flag is cleared and
 * writeback on it is ended.
 */
static void
zfs_putpage_sync_commit_cb(void *arg)
{
	struct page *pp = arg;

	ClearPageError(pp);
	end_page_writeback(pp);
}

/*
 * Commit callback passed to zfs_log_write() by zfs_putpage() when for_sync
 * is not set.  Does the same page handling as the sync variant and also
 * drops the z_async_writes_cnt reference that zfs_putpage() took on the
 * znode owning the page's mapping.
 */
static void
zfs_putpage_async_commit_cb(void *arg)
{
	struct page *pp = arg;
	znode_t *zp = ITOZ(pp->mapping->host);

	ClearPageError(pp);
	end_page_writeback(pp);
	atomic_dec_32(&zp->z_async_writes_cnt);
}

/*
 * Push a page out to disk, once the page is on stable storage the
 * registered commit callback will be run as notification of completion.
 *
 *	IN:	ip	 - page mapped for inode.
 *		pp	 - page to push (page is locked)
 *		wbc	 - writeback control data
 *		for_sync - does the caller intend to wait synchronously for the
 *			   page writeback to complete?
*
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime|mtime updated
 *
 * The page lock held on entry is released on every return path.  The
 * function returns 0 without writing anything when the page lies beyond
 * end of file, when its mapping changed or it is no longer dirty after the
 * locks were retaken, when it is already under writeback, or when
 * clear_page_dirty_for_io() reports nothing to write.
 *
 * When for_sync is not set, z_async_writes_cnt is incremented before the
 * write and decremented again either by the async commit callback or on the
 * transaction-assignment failure path.
 */
int
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
    boolean_t for_sync)
{
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	loff_t		offset;
	loff_t		pgoff;
	unsigned int	pglen;
	dmu_tx_t	*tx;
	caddr_t		va;
	int		err = 0;
	uint64_t	mtime[2], ctime[2];
	inode_timespec_t tmp_ts;
	sa_bulk_attr_t	bulk[3];
	int		cnt = 0;
	struct address_space *mapping;

	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (err);

	ASSERT(PageLocked(pp));

	pgoff = page_offset(pp);	/* Page byte-offset in file */
	offset = i_size_read(ip);	/* File length in bytes */
	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);

	/* Page is beyond end of file */
	if (pgoff >= offset) {
		unlock_page(pp);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Truncate page length to end of file */
	if (pgoff + pglen > offset)
		pglen = offset - pgoff;

#if 0
	/*
	 * FIXME: Allow mmap writes past its quota.  The correct fix
	 * is to register a page_mkwrite() handler to count the page
	 * against its quota when it is about to be dirtied.
	 */
	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
	    KUID_TO_SUID(ip->i_uid)) ||
	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
	    KGID_TO_SGID(ip->i_gid)) ||
	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
	    zp->z_projid))) {
		err = EDQUOT;
	}
#endif

	/*
	 * The ordering here is critical and must adhere to the following
	 * rules in order to avoid deadlocking in either zfs_read() or
	 * zfs_free_range() due to a lock inversion.
	 *
	 * 1) The page must be unlocked prior to acquiring the range lock.
	 *    This is critical because zfs_read() calls find_lock_page()
	 *    which may block on the page lock while holding the range lock.
	 *
	 * 2) Before setting or clearing write back on a page the range lock
	 *    must be held in order to prevent a lock inversion with the
	 *    zfs_free_range() function.
	 *
	 * This presents a problem because upon entering this function the
	 * page lock is already held.  To safely acquire the range lock the
	 * page lock must be dropped.  This creates a window where another
	 * process could truncate, invalidate, dirty, or write out the page.
	 *
	 * Therefore, after successfully reacquiring the range and page locks
	 * the current page state is checked.  In the common case everything
	 * will be as is expected and it can be written out.  However, if
	 * the page state has changed it must be handled accordingly.
	 */
	/* Remember the mapping so a change across the unlocked window shows. */
	mapping = pp->mapping;
	redirty_page_for_writepage(wbc, pp);
	unlock_page(pp);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    pgoff, pglen, RL_WRITER);
	lock_page(pp);

	/* Page mapping changed or it was no longer dirty, we're done */
	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Another process started writeback; block on it if required */
	if (PageWriteback(pp)) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);

		if (wbc->sync_mode != WB_SYNC_NONE) {
			/*
			 * Speed up any non-sync page writebacks since
			 * they may take several seconds to complete.
			 * Refer to the comment in zpl_fsync() (when
			 * HAVE_FSYNC_RANGE is defined) for details.
			 */
			if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
				zil_commit(zfsvfs->z_log, zp->z_id);
			}

			/* Wait only if writeback is still in progress. */
			if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
				folio_wait_bit(page_folio(pp), PG_writeback);
#else
				wait_on_page_bit(pp, PG_writeback);
#endif
		}

		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Clear the dirty flag now that the required locks are held */
	if (!clear_page_dirty_for_io(pp)) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/*
	 * Counterpart for redirty_page_for_writepage() above.  This page
	 * was in fact not skipped and should not be counted as if it were.
	 */
	wbc->pages_skipped--;
	/* Async writers are counted until their commit callback has run. */
	if (!for_sync)
		atomic_inc_32(&zp->z_async_writes_cnt);
	set_page_writeback(pp);
	unlock_page(pp);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err != 0) {
		/*
		 * The write is not retried here.  On ERESTART we wait on the
		 * transaction first; in every case the page is marked dirty
		 * again, writeback is ended, the async count taken above is
		 * dropped and the error is returned to the caller.
		 */
		if (err == ERESTART)
			dmu_tx_wait(tx);

		dmu_tx_abort(tx);
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
#else
		__set_page_dirty_nobuffers(pp);
#endif
		ClearPageError(pp);
		end_page_writeback(pp);
		if (!for_sync)
			atomic_dec_32(&zp->z_async_writes_cnt);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (err);
	}

	/* Copy the (possibly EOF-truncated) page contents into the DMU. */
	va = kmap(pp);
	ASSERT3U(pglen, <=, PAGE_SIZE);
	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
	kunmap(pp);

	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/* Preserve the mtime and ctime provided by the inode */
	tmp_ts = zpl_inode_get_mtime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, mtime);
	tmp_ts = zpl_inode_get_ctime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, ctime);
	zp->z_atime_dirty = B_FALSE;
	zp->z_seq++;

	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

	boolean_t commit = B_FALSE;
	if (wbc->sync_mode != WB_SYNC_NONE) {
		/*
		 * Note that this is rarely called under writepages(), because
		 * writepages() normally handles the entire commit for
		 * performance reasons.
		 */
		commit = B_TRUE;
	} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
		/*
		 * If the caller does not intend to wait synchronously
		 * for this page writeback to complete and there are active
		 * synchronous calls on this file, do a commit so that
		 * the latter don't accidentally end up waiting for
		 * our writeback to complete.  Refer to the comment in
		 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
		 */
		commit = B_TRUE;
	}

	/* The callback matching for_sync ends writeback on the page later. */
	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
	    for_sync ? zfs_putpage_sync_commit_cb :
	    zfs_putpage_async_commit_cb, pp);

	dmu_tx_commit(tx);

	zfs_rangelock_exit(lr);

	/* The ZIL commit is issued only after the range lock is dropped. */
	if (commit)
		zil_commit(zfsvfs->z_log, zp->z_id);

	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);

	zfs_exit(zfsvfs, FTAG);
	return (err);
}

/*
 * Update the system attributes when the inode has been dirtied.  For the
 * moment we only update the mode, atime, mtime, and ctime.
3878 */ 3879 int 3880 zfs_dirty_inode(struct inode *ip, int flags) 3881 { 3882 znode_t *zp = ITOZ(ip); 3883 zfsvfs_t *zfsvfs = ITOZSB(ip); 3884 dmu_tx_t *tx; 3885 uint64_t mode, atime[2], mtime[2], ctime[2]; 3886 inode_timespec_t tmp_ts; 3887 sa_bulk_attr_t bulk[4]; 3888 int error = 0; 3889 int cnt = 0; 3890 3891 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 3892 return (0); 3893 3894 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3895 return (error); 3896 3897 #ifdef I_DIRTY_TIME 3898 /* 3899 * This is the lazytime semantic introduced in Linux 4.0 3900 * This flag will only be called from update_time when lazytime is set. 3901 * (Note, I_DIRTY_SYNC will also set if not lazytime) 3902 * Fortunately mtime and ctime are managed within ZFS itself, so we 3903 * only need to dirty atime. 3904 */ 3905 if (flags == I_DIRTY_TIME) { 3906 zp->z_atime_dirty = B_TRUE; 3907 goto out; 3908 } 3909 #endif 3910 3911 tx = dmu_tx_create(zfsvfs->z_os); 3912 3913 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3914 zfs_sa_upgrade_txholds(tx, zp); 3915 3916 error = dmu_tx_assign(tx, TXG_WAIT); 3917 if (error) { 3918 dmu_tx_abort(tx); 3919 goto out; 3920 } 3921 3922 mutex_enter(&zp->z_lock); 3923 zp->z_atime_dirty = B_FALSE; 3924 3925 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 3926 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 3927 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3928 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3929 3930 /* Preserve the mode, mtime and ctime provided by the inode */ 3931 tmp_ts = zpl_inode_get_atime(ip); 3932 ZFS_TIME_ENCODE(&tmp_ts, atime); 3933 tmp_ts = zpl_inode_get_mtime(ip); 3934 ZFS_TIME_ENCODE(&tmp_ts, mtime); 3935 tmp_ts = zpl_inode_get_ctime(ip); 3936 ZFS_TIME_ENCODE(&tmp_ts, ctime); 3937 mode = ip->i_mode; 3938 3939 zp->z_mode = mode; 3940 3941 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3942 mutex_exit(&zp->z_lock); 
3943 3944 dmu_tx_commit(tx); 3945 out: 3946 zfs_exit(zfsvfs, FTAG); 3947 return (error); 3948 } 3949 3950 void 3951 zfs_inactive(struct inode *ip) 3952 { 3953 znode_t *zp = ITOZ(ip); 3954 zfsvfs_t *zfsvfs = ITOZSB(ip); 3955 uint64_t atime[2]; 3956 int error; 3957 int need_unlock = 0; 3958 3959 /* Only read lock if we haven't already write locked, e.g. rollback */ 3960 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { 3961 need_unlock = 1; 3962 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3963 } 3964 if (zp->z_sa_hdl == NULL) { 3965 if (need_unlock) 3966 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3967 return; 3968 } 3969 3970 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { 3971 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3972 3973 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3974 zfs_sa_upgrade_txholds(tx, zp); 3975 error = dmu_tx_assign(tx, TXG_WAIT); 3976 if (error) { 3977 dmu_tx_abort(tx); 3978 } else { 3979 inode_timespec_t tmp_atime; 3980 tmp_atime = zpl_inode_get_atime(ip); 3981 ZFS_TIME_ENCODE(&tmp_atime, atime); 3982 mutex_enter(&zp->z_lock); 3983 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 3984 (void *)&atime, sizeof (atime), tx); 3985 zp->z_atime_dirty = B_FALSE; 3986 mutex_exit(&zp->z_lock); 3987 dmu_tx_commit(tx); 3988 } 3989 } 3990 3991 zfs_zinactive(zp); 3992 if (need_unlock) 3993 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3994 } 3995 3996 /* 3997 * Fill pages with data from the disk. 
3998 */ 3999 static int 4000 zfs_fillpage(struct inode *ip, struct page *pp) 4001 { 4002 zfsvfs_t *zfsvfs = ITOZSB(ip); 4003 loff_t i_size = i_size_read(ip); 4004 u_offset_t io_off = page_offset(pp); 4005 size_t io_len = PAGE_SIZE; 4006 4007 ASSERT3U(io_off, <, i_size); 4008 4009 if (io_off + io_len > i_size) 4010 io_len = i_size - io_off; 4011 4012 void *va = kmap(pp); 4013 int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, 4014 io_len, va, DMU_READ_PREFETCH); 4015 if (io_len != PAGE_SIZE) 4016 memset((char *)va + io_len, 0, PAGE_SIZE - io_len); 4017 kunmap(pp); 4018 4019 if (error) { 4020 /* convert checksum errors into IO errors */ 4021 if (error == ECKSUM) 4022 error = SET_ERROR(EIO); 4023 4024 SetPageError(pp); 4025 ClearPageUptodate(pp); 4026 } else { 4027 ClearPageError(pp); 4028 SetPageUptodate(pp); 4029 } 4030 4031 return (error); 4032 } 4033 4034 /* 4035 * Uses zfs_fillpage to read data from the file and fill the page. 4036 * 4037 * IN: ip - inode of file to get data from. 4038 * pp - page to read 4039 * 4040 * RETURN: 0 on success, error code on failure. 4041 * 4042 * Timestamps: 4043 * vp - atime updated 4044 */ 4045 int 4046 zfs_getpage(struct inode *ip, struct page *pp) 4047 { 4048 zfsvfs_t *zfsvfs = ITOZSB(ip); 4049 znode_t *zp = ITOZ(ip); 4050 int error; 4051 4052 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4053 return (error); 4054 4055 error = zfs_fillpage(ip, pp); 4056 if (error == 0) 4057 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); 4058 4059 zfs_exit(zfsvfs, FTAG); 4060 4061 return (error); 4062 } 4063 4064 /* 4065 * Check ZFS specific permissions to memory map a section of a file. 
4066 * 4067 * IN: ip - inode of the file to mmap 4068 * off - file offset 4069 * addrp - start address in memory region 4070 * len - length of memory region 4071 * vm_flags- address flags 4072 * 4073 * RETURN: 0 if success 4074 * error code if failure 4075 */ 4076 int 4077 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, 4078 unsigned long vm_flags) 4079 { 4080 (void) addrp; 4081 znode_t *zp = ITOZ(ip); 4082 zfsvfs_t *zfsvfs = ITOZSB(ip); 4083 int error; 4084 4085 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4086 return (error); 4087 4088 if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && 4089 (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 4090 zfs_exit(zfsvfs, FTAG); 4091 return (SET_ERROR(EPERM)); 4092 } 4093 4094 if ((vm_flags & (VM_READ | VM_EXEC)) && 4095 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 4096 zfs_exit(zfsvfs, FTAG); 4097 return (SET_ERROR(EACCES)); 4098 } 4099 4100 if (off < 0 || len > MAXOFFSET_T - off) { 4101 zfs_exit(zfsvfs, FTAG); 4102 return (SET_ERROR(ENXIO)); 4103 } 4104 4105 zfs_exit(zfsvfs, FTAG); 4106 return (0); 4107 } 4108 4109 /* 4110 * Free or allocate space in a file. Currently, this function only 4111 * supports the `F_FREESP' command. However, this command is somewhat 4112 * misnamed, as its functionality includes the ability to allocate as 4113 * well as free space. 4114 * 4115 * IN: zp - znode of file to free data in. 4116 * cmd - action to take (only F_FREESP supported). 4117 * bfp - section of file to free/alloc. 4118 * flag - current file open mode flags. 4119 * offset - current file offset. 4120 * cr - credentials of caller. 4121 * 4122 * RETURN: 0 on success, error code on failure. 
4123 * 4124 * Timestamps: 4125 * zp - ctime|mtime updated 4126 */ 4127 int 4128 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, 4129 offset_t offset, cred_t *cr) 4130 { 4131 (void) offset; 4132 zfsvfs_t *zfsvfs = ZTOZSB(zp); 4133 uint64_t off, len; 4134 int error; 4135 4136 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4137 return (error); 4138 4139 if (cmd != F_FREESP) { 4140 zfs_exit(zfsvfs, FTAG); 4141 return (SET_ERROR(EINVAL)); 4142 } 4143 4144 /* 4145 * Callers might not be able to detect properly that we are read-only, 4146 * so check it explicitly here. 4147 */ 4148 if (zfs_is_readonly(zfsvfs)) { 4149 zfs_exit(zfsvfs, FTAG); 4150 return (SET_ERROR(EROFS)); 4151 } 4152 4153 if (bfp->l_len < 0) { 4154 zfs_exit(zfsvfs, FTAG); 4155 return (SET_ERROR(EINVAL)); 4156 } 4157 4158 /* 4159 * Permissions aren't checked on Solaris because on this OS 4160 * zfs_space() can only be called with an opened file handle. 4161 * On Linux we can get here through truncate_range() which 4162 * operates directly on inodes, so we need to check access rights. 
4163 */ 4164 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 4165 zfs_init_idmap))) { 4166 zfs_exit(zfsvfs, FTAG); 4167 return (error); 4168 } 4169 4170 off = bfp->l_start; 4171 len = bfp->l_len; /* 0 means from off to end of file */ 4172 4173 error = zfs_freesp(zp, off, len, flag, TRUE); 4174 4175 zfs_exit(zfsvfs, FTAG); 4176 return (error); 4177 } 4178 4179 int 4180 zfs_fid(struct inode *ip, fid_t *fidp) 4181 { 4182 znode_t *zp = ITOZ(ip); 4183 zfsvfs_t *zfsvfs = ITOZSB(ip); 4184 uint32_t gen; 4185 uint64_t gen64; 4186 uint64_t object = zp->z_id; 4187 zfid_short_t *zfid; 4188 int size, i, error; 4189 4190 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 4191 return (error); 4192 4193 if (fidp->fid_len < SHORT_FID_LEN) { 4194 fidp->fid_len = SHORT_FID_LEN; 4195 zfs_exit(zfsvfs, FTAG); 4196 return (SET_ERROR(ENOSPC)); 4197 } 4198 4199 if ((error = zfs_verify_zp(zp)) != 0) { 4200 zfs_exit(zfsvfs, FTAG); 4201 return (error); 4202 } 4203 4204 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4205 &gen64, sizeof (uint64_t))) != 0) { 4206 zfs_exit(zfsvfs, FTAG); 4207 return (error); 4208 } 4209 4210 gen = (uint32_t)gen64; 4211 4212 size = SHORT_FID_LEN; 4213 4214 zfid = (zfid_short_t *)fidp; 4215 4216 zfid->zf_len = size; 4217 4218 for (i = 0; i < sizeof (zfid->zf_object); i++) 4219 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4220 4221 /* Must have a non-zero generation number to distinguish from .zfs */ 4222 if (gen == 0) 4223 gen = 1; 4224 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4225 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4226 4227 zfs_exit(zfsvfs, FTAG); 4228 return (0); 4229 } 4230 4231 #if defined(_KERNEL) 4232 EXPORT_SYMBOL(zfs_open); 4233 EXPORT_SYMBOL(zfs_close); 4234 EXPORT_SYMBOL(zfs_lookup); 4235 EXPORT_SYMBOL(zfs_create); 4236 EXPORT_SYMBOL(zfs_tmpfile); 4237 EXPORT_SYMBOL(zfs_remove); 4238 EXPORT_SYMBOL(zfs_mkdir); 4239 EXPORT_SYMBOL(zfs_rmdir); 4240 EXPORT_SYMBOL(zfs_readdir); 4241 EXPORT_SYMBOL(zfs_getattr_fast); 4242 
EXPORT_SYMBOL(zfs_setattr); 4243 EXPORT_SYMBOL(zfs_rename); 4244 EXPORT_SYMBOL(zfs_symlink); 4245 EXPORT_SYMBOL(zfs_readlink); 4246 EXPORT_SYMBOL(zfs_link); 4247 EXPORT_SYMBOL(zfs_inactive); 4248 EXPORT_SYMBOL(zfs_space); 4249 EXPORT_SYMBOL(zfs_fid); 4250 EXPORT_SYMBOL(zfs_getpage); 4251 EXPORT_SYMBOL(zfs_putpage); 4252 EXPORT_SYMBOL(zfs_dirty_inode); 4253 EXPORT_SYMBOL(zfs_map); 4254 4255 /* CSTYLED */ 4256 module_param(zfs_delete_blocks, ulong, 0644); 4257 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); 4258 #endif 4259