1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 28 * All rights reserved. 29 */ 30 31 /* 32 * Copyright 2018 Nexenta Systems, Inc. 33 */ 34 35 #include <sys/param.h> 36 #include <sys/types.h> 37 #include <sys/systm.h> 38 #include <sys/thread.h> 39 #include <sys/t_lock.h> 40 #include <sys/time.h> 41 #include <sys/vnode.h> 42 #include <sys/vfs.h> 43 #include <sys/errno.h> 44 #include <sys/buf.h> 45 #include <sys/stat.h> 46 #include <sys/cred.h> 47 #include <sys/kmem.h> 48 #include <sys/debug.h> 49 #include <sys/dnlc.h> 50 #include <sys/vmsystm.h> 51 #include <sys/flock.h> 52 #include <sys/share.h> 53 #include <sys/cmn_err.h> 54 #include <sys/tiuser.h> 55 #include <sys/sysmacros.h> 56 #include <sys/callb.h> 57 #include <sys/acl.h> 58 #include <sys/kstat.h> 59 #include <sys/signal.h> 60 #include <sys/list.h> 61 #include <sys/zone.h> 62 63 #include <rpc/types.h> 64 #include <rpc/xdr.h> 65 #include <rpc/auth.h> 66 #include <rpc/clnt.h> 67 68 #include <nfs/nfs.h> 69 #include <nfs/nfs_clnt.h> 70 #include <nfs/nfs_cmd.h> 71 72 #include <nfs/rnode.h> 73 #include <nfs/nfs_acl.h> 74 #include <nfs/lm.h> 75 76 #include <vm/hat.h> 77 #include <vm/as.h> 78 #include <vm/page.h> 79 #include <vm/pvn.h> 80 #include <vm/seg.h> 81 #include <vm/seg_map.h> 82 #include <vm/seg_vn.h> 83 84 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t, 85 cred_t *); 86 static int nfs_getattr_cache(vnode_t *, struct vattr *); 87 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *); 88 89 struct mi_globals { 90 kmutex_t mig_lock; /* lock protecting mig_list */ 91 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */ 92 boolean_t mig_destructor_called; 93 }; 94 95 static zone_key_t mi_list_key; 96 97 /* Debugging flag for PC file shares. */ 98 extern int share_debug; 99 100 /* 101 * Attributes caching: 102 * 103 * Attributes are cached in the rnode in struct vattr form. 104 * There is a time associated with the cached attributes (r_attrtime) 105 * which tells whether the attributes are valid. The time is initialized 106 * to the difference between current time and the modify time of the vnode 107 * when new attributes are cached. This allows the attributes for 108 * files that have changed recently to be timed out sooner than for files 109 * that have not changed for a long time. There are minimum and maximum 110 * timeout values that can be set per mount point. 
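 *
 * As a rough sketch of the intended behaviour (assuming the typical mount
 * defaults of acregmin=3s and acregmax=60s for regular files):
 *
 *	change detected 1s ago    -> attributes cached for 3s (clamped up)
 *	change detected 20s ago   -> attributes cached for 20s
 *	change detected 5min ago  -> attributes cached for 60s (clamped down)
 *
 * The actual computation lives in nfs_attrcache_va() below.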
 */

int
nfs_waitfor_purge_complete(vnode_t *vp)
{
	rnode_t *rp;
	k_sigset_t smask;

	rp = VTOR(vp);
	if (rp->r_serial != NULL && rp->r_serial != curthread) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs_getattr_otw(vp, &va, cr));
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs3_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs3_getattr_otw(vp, &va, cr));
}

/*
 * Purge all of the various NFS `data' caches.
 */
void
nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
{
	rnode_t *rp;
	char *contents;
	int size;
	int error;

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
	 */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
	    !(rp->r_flags & RINDNLCPURGE)) {
		/*
		 * Set the RINDNLCPURGE flag to prevent recursive entry
		 * into dnlc_purge_vp()
		 */
		if (vp->v_type == VDIR)
			rp->r_flags |= RINDNLCPURGE;
		mutex_exit(&rp->r_statelock);
		dnlc_purge_vp(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & RINDNLCPURGE)
			rp->r_flags &= ~RINDNLCPURGE;
	}

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Flush the page cache.
238 */ 239 if (vn_has_cached_data(vp)) { 240 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 241 if (error && (error == ENOSPC || error == EDQUOT)) { 242 mutex_enter(&rp->r_statelock); 243 if (!rp->r_error) 244 rp->r_error = error; 245 mutex_exit(&rp->r_statelock); 246 } 247 } 248 249 /* 250 * Flush the readdir response cache. 251 */ 252 if (HAVE_RDDIR_CACHE(rp)) 253 nfs_purge_rddir_cache(vp); 254 } 255 256 /* 257 * Purge the readdir cache of all entries 258 */ 259 void 260 nfs_purge_rddir_cache(vnode_t *vp) 261 { 262 rnode_t *rp; 263 rddir_cache *rdc; 264 rddir_cache *nrdc; 265 266 rp = VTOR(vp); 267 top: 268 mutex_enter(&rp->r_statelock); 269 rp->r_direof = NULL; 270 rp->r_flags &= ~RLOOKUP; 271 rp->r_flags |= RREADDIRPLUS; 272 rdc = avl_first(&rp->r_dir); 273 while (rdc != NULL) { 274 nrdc = AVL_NEXT(&rp->r_dir, rdc); 275 avl_remove(&rp->r_dir, rdc); 276 rddir_cache_rele(rdc); 277 rdc = nrdc; 278 } 279 mutex_exit(&rp->r_statelock); 280 } 281 282 /* 283 * Do a cache check based on the post-operation attributes. 284 * Then make them the new cached attributes. If no attributes 285 * were returned, then mark the attributes as timed out. 286 */ 287 void 288 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr) 289 { 290 vattr_t attr; 291 292 if (!poap->attributes) { 293 PURGE_ATTRCACHE(vp); 294 return; 295 } 296 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr); 297 } 298 299 /* 300 * Same as above, but using a vattr 301 */ 302 void 303 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t, 304 cred_t *cr) 305 { 306 if (!poap->attributes) { 307 PURGE_ATTRCACHE(vp); 308 return; 309 } 310 nfs_attr_cache(vp, poap->fres.vap, t, cr); 311 } 312 313 /* 314 * Do a cache check based on the weak cache consistency attributes. 315 * These consist of a small set of pre-operation attributes and the 316 * full set of post-operation attributes. 317 * 318 * If we are given the pre-operation attributes, then use them to 319 * check the validity of the various caches. Then, if we got the 320 * post-operation attributes, make them the new cached attributes. 321 * If we didn't get the post-operation attributes, then mark the 322 * attribute cache as timed out so that the next reference will 323 * cause a GETATTR to the server to refresh with the current 324 * attributes. 325 * 326 * Otherwise, if we didn't get the pre-operation attributes, but 327 * we did get the post-operation attributes, then use these 328 * attributes to check the validity of the various caches. This 329 * will probably cause a flush of the caches because if the 330 * operation succeeded, the attributes of the object were changed 331 * in some way from the old post-operation attributes. This 332 * should be okay because it is the safe thing to do. After 333 * checking the data caches, then we make these the new cached 334 * attributes. 335 * 336 * Otherwise, we didn't get either the pre- or post-operation 337 * attributes. Simply mark the attribute cache as timed out so 338 * the next reference will cause a GETATTR to the server to 339 * refresh with the current attributes. 340 * 341 * If an error occurred trying to convert the over the wire 342 * attributes to a vattr, then simply mark the attribute cache as 343 * timed out. 
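 *
 * In outline, the handling of the wcc_data reduces to:
 *
 *	if (post-op attributes present) {
 *		if (conversion to vattr fails)
 *			purge the attribute cache;
 *		else if (pre-op attributes present)
 *			validate caches against pre-op, cache post-op;
 *		else
 *			validate caches against post-op, cache post-op;
 *	} else {
 *		purge the attribute cache;	(forces a fresh GETATTR later)
 *	}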
 */
void
nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
{
	vattr_t bva;
	vattr_t ava;

	if (wccp->after.attributes) {
		if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
			PURGE_ATTRCACHE(vp);
			return;
		}
		if (wccp->before.attributes) {
			bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
			bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
			bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
			bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
			bva.va_size = wccp->before.attr.size;
			nfs3_attr_cache(vp, &bva, &ava, t, cr);
		} else
			nfs_attr_cache(vp, &ava, t, cr);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Set attributes cache for given vnode using nfsattr.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
{
	rnode_t *rp;
	struct vattr va;

	if (!nattr_to_vattr(vp, na, &va)) {
		rp = VTOR(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Set attributes cache for given vnode using fattr3.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
{
	rnode_t *rp;
	struct vattr va;

	if (!fattr3_to_vattr(vp, na, &va)) {
		rp = VTOR(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Do a cache check based on attributes returned over the wire. The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
    cred_t *cr)
{
	int error;

	error = nattr_to_vattr(vp, na, vap);
	if (error)
		return (error);
	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Do a cache check based on attributes returned over the wire. The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
{
	int error;

	error = fattr3_to_vattr(vp, na, vap);
	if (error)
		return (error);
	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
471 * 472 * The cache validation and caching of the new attributes is done 473 * atomically via the use of the mutex, r_statelock. If required, 474 * the cache invalidation is done atomically w.r.t. the cache 475 * validation and caching of the attributes via the pseudo lock, 476 * r_serial. 477 * 478 * This routine is used to do cache validation and attributes caching 479 * for operations with a single set of post operation attributes. 480 */ 481 void 482 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr) 483 { 484 rnode_t *rp; 485 int mtime_changed = 0; 486 int ctime_changed = 0; 487 vsecattr_t *vsp; 488 int was_serial; 489 len_t preattr_rsize; 490 boolean_t writeattr_set = B_FALSE; 491 boolean_t cachepurge_set = B_FALSE; 492 493 rp = VTOR(vp); 494 495 mutex_enter(&rp->r_statelock); 496 497 if (rp->r_serial != curthread) { 498 klwp_t *lwp = ttolwp(curthread); 499 500 was_serial = 0; 501 if (lwp != NULL) 502 lwp->lwp_nostop++; 503 while (rp->r_serial != NULL) { 504 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 505 mutex_exit(&rp->r_statelock); 506 if (lwp != NULL) 507 lwp->lwp_nostop--; 508 return; 509 } 510 } 511 if (lwp != NULL) 512 lwp->lwp_nostop--; 513 } else 514 was_serial = 1; 515 516 if (rp->r_mtime > t) { 517 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 518 PURGE_ATTRCACHE_LOCKED(rp); 519 mutex_exit(&rp->r_statelock); 520 return; 521 } 522 523 /* 524 * Write thread after writing data to file on remote server, 525 * will always set RWRITEATTR to indicate that file on remote 526 * server was modified with a WRITE operation and would have 527 * marked attribute cache as timed out. If RWRITEATTR 528 * is set, then do not check for mtime and ctime change. 529 */ 530 if (!(rp->r_flags & RWRITEATTR)) { 531 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 532 mtime_changed = 1; 533 534 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec || 535 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec) 536 ctime_changed = 1; 537 } else { 538 writeattr_set = B_TRUE; 539 } 540 541 preattr_rsize = rp->r_size; 542 543 nfs_attrcache_va(vp, vap); 544 545 /* 546 * If we have updated filesize in nfs_attrcache_va, as soon as we 547 * drop statelock we will be in transition of purging all 548 * our caches and updating them. It is possible for another 549 * thread to pick this new file size and read in zeroed data. 550 * stall other threads till cache purge is complete. 551 */ 552 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 553 /* 554 * If RWRITEATTR was set and we have updated the file 555 * size, Server's returned file size need not necessarily 556 * be because of this Client's WRITE. We need to purge 557 * all caches. 
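		 *
		 * Concretely, the gate works as follows: RINCACHEPURGE is set
		 * here (cachepurge_set records that this thread set it), the
		 * caches are purged after r_statelock is dropped, and the
		 * flag is cleared and r_cv broadcast once nfs_purge_caches()
		 * has returned.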
558 */ 559 if (writeattr_set) 560 mtime_changed = 1; 561 562 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 563 rp->r_flags |= RINCACHEPURGE; 564 cachepurge_set = B_TRUE; 565 } 566 } 567 568 if (!mtime_changed && !ctime_changed) { 569 mutex_exit(&rp->r_statelock); 570 return; 571 } 572 573 rp->r_serial = curthread; 574 575 mutex_exit(&rp->r_statelock); 576 577 if (mtime_changed) 578 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 579 580 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 581 mutex_enter(&rp->r_statelock); 582 rp->r_flags &= ~RINCACHEPURGE; 583 cv_broadcast(&rp->r_cv); 584 mutex_exit(&rp->r_statelock); 585 cachepurge_set = B_FALSE; 586 } 587 588 if (ctime_changed) { 589 (void) nfs_access_purge_rp(rp); 590 if (rp->r_secattr != NULL) { 591 mutex_enter(&rp->r_statelock); 592 vsp = rp->r_secattr; 593 rp->r_secattr = NULL; 594 mutex_exit(&rp->r_statelock); 595 if (vsp != NULL) 596 nfs_acl_free(vsp); 597 } 598 } 599 600 if (!was_serial) { 601 mutex_enter(&rp->r_statelock); 602 rp->r_serial = NULL; 603 cv_broadcast(&rp->r_cv); 604 mutex_exit(&rp->r_statelock); 605 } 606 } 607 608 /* 609 * Use the passed in "before" virtual attributes to check to see 610 * whether the data and metadata caches are valid, cache the "after" 611 * new attributes, and then do the cache invalidation if required. 612 * 613 * The cache validation and caching of the new attributes is done 614 * atomically via the use of the mutex, r_statelock. If required, 615 * the cache invalidation is done atomically w.r.t. the cache 616 * validation and caching of the attributes via the pseudo lock, 617 * r_serial. 618 * 619 * This routine is used to do cache validation and attributes caching 620 * for operations with both pre operation attributes and post operation 621 * attributes. 622 */ 623 static void 624 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t, 625 cred_t *cr) 626 { 627 rnode_t *rp; 628 int mtime_changed = 0; 629 int ctime_changed = 0; 630 vsecattr_t *vsp; 631 int was_serial; 632 len_t preattr_rsize; 633 boolean_t writeattr_set = B_FALSE; 634 boolean_t cachepurge_set = B_FALSE; 635 636 rp = VTOR(vp); 637 638 mutex_enter(&rp->r_statelock); 639 640 if (rp->r_serial != curthread) { 641 klwp_t *lwp = ttolwp(curthread); 642 643 was_serial = 0; 644 if (lwp != NULL) 645 lwp->lwp_nostop++; 646 while (rp->r_serial != NULL) { 647 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 648 mutex_exit(&rp->r_statelock); 649 if (lwp != NULL) 650 lwp->lwp_nostop--; 651 return; 652 } 653 } 654 if (lwp != NULL) 655 lwp->lwp_nostop--; 656 } else 657 was_serial = 1; 658 659 if (rp->r_mtime > t) { 660 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size)) 661 PURGE_ATTRCACHE_LOCKED(rp); 662 mutex_exit(&rp->r_statelock); 663 return; 664 } 665 666 /* 667 * Write thread after writing data to file on remote server, 668 * will always set RWRITEATTR to indicate that file on remote 669 * server was modified with a WRITE operation and would have 670 * marked attribute cache as timed out. If RWRITEATTR 671 * is set, then do not check for mtime and ctime change. 
672 */ 673 if (!(rp->r_flags & RWRITEATTR)) { 674 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size)) 675 mtime_changed = 1; 676 677 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec || 678 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec) 679 ctime_changed = 1; 680 } else { 681 writeattr_set = B_TRUE; 682 } 683 684 preattr_rsize = rp->r_size; 685 686 nfs_attrcache_va(vp, avap); 687 688 /* 689 * If we have updated filesize in nfs_attrcache_va, as soon as we 690 * drop statelock we will be in transition of purging all 691 * our caches and updating them. It is possible for another 692 * thread to pick this new file size and read in zeroed data. 693 * stall other threads till cache purge is complete. 694 */ 695 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 696 /* 697 * If RWRITEATTR was set and we have updated the file 698 * size, Server's returned file size need not necessarily 699 * be because of this Client's WRITE. We need to purge 700 * all caches. 701 */ 702 if (writeattr_set) 703 mtime_changed = 1; 704 705 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 706 rp->r_flags |= RINCACHEPURGE; 707 cachepurge_set = B_TRUE; 708 } 709 } 710 711 if (!mtime_changed && !ctime_changed) { 712 mutex_exit(&rp->r_statelock); 713 return; 714 } 715 716 rp->r_serial = curthread; 717 718 mutex_exit(&rp->r_statelock); 719 720 if (mtime_changed) 721 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 722 723 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 724 mutex_enter(&rp->r_statelock); 725 rp->r_flags &= ~RINCACHEPURGE; 726 cv_broadcast(&rp->r_cv); 727 mutex_exit(&rp->r_statelock); 728 cachepurge_set = B_FALSE; 729 } 730 731 if (ctime_changed) { 732 (void) nfs_access_purge_rp(rp); 733 if (rp->r_secattr != NULL) { 734 mutex_enter(&rp->r_statelock); 735 vsp = rp->r_secattr; 736 rp->r_secattr = NULL; 737 mutex_exit(&rp->r_statelock); 738 if (vsp != NULL) 739 nfs_acl_free(vsp); 740 } 741 } 742 743 if (!was_serial) { 744 mutex_enter(&rp->r_statelock); 745 rp->r_serial = NULL; 746 cv_broadcast(&rp->r_cv); 747 mutex_exit(&rp->r_statelock); 748 } 749 } 750 751 /* 752 * Set attributes cache for given vnode using virtual attributes. 753 * 754 * Set the timeout value on the attribute cache and fill it 755 * with the passed in attributes. 756 * 757 * The caller must be holding r_statelock. 758 */ 759 void 760 nfs_attrcache_va(vnode_t *vp, struct vattr *va) 761 { 762 rnode_t *rp; 763 mntinfo_t *mi; 764 hrtime_t delta; 765 hrtime_t now; 766 767 rp = VTOR(vp); 768 769 ASSERT(MUTEX_HELD(&rp->r_statelock)); 770 771 now = gethrtime(); 772 773 mi = VTOMI(vp); 774 775 /* 776 * Delta is the number of nanoseconds that we will 777 * cache the attributes of the file. It is based on 778 * the number of nanoseconds since the last time that 779 * we detected a change. The assumption is that files 780 * that changed recently are likely to change again. 781 * There is a minimum and a maximum for regular files 782 * and for directories which is enforced though. 783 * 784 * Using the time since last change was detected 785 * eliminates direct comparison or calculation 786 * using mixed client and server times. NFS does 787 * not make any assumptions regarding the client 788 * and server clocks being synchronized. 
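	 *
	 * In effect, for a regular file:
	 *
	 *	r_attrtime = now + clamp(now - r_mtime,
	 *	    mi_acregmin, mi_acregmax)
	 *
	 * (mi_acdirmin/mi_acdirmax for directories, and a delta of zero
	 * when attribute caching is disabled via noac or VNOCACHE).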
789 */ 790 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 791 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 792 va->va_size != rp->r_attr.va_size) 793 rp->r_mtime = now; 794 795 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE)) 796 delta = 0; 797 else { 798 delta = now - rp->r_mtime; 799 if (vp->v_type == VDIR) { 800 if (delta < mi->mi_acdirmin) 801 delta = mi->mi_acdirmin; 802 else if (delta > mi->mi_acdirmax) 803 delta = mi->mi_acdirmax; 804 } else { 805 if (delta < mi->mi_acregmin) 806 delta = mi->mi_acregmin; 807 else if (delta > mi->mi_acregmax) 808 delta = mi->mi_acregmax; 809 } 810 } 811 rp->r_attrtime = now + delta; 812 rp->r_attr = *va; 813 /* 814 * Update the size of the file if there is no cached data or if 815 * the cached data is clean and there is no data being written 816 * out. 817 */ 818 if (rp->r_size != va->va_size && 819 (!vn_has_cached_data(vp) || 820 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) 821 rp->r_size = va->va_size; 822 nfs_setswaplike(vp, va); 823 rp->r_flags &= ~RWRITEATTR; 824 } 825 826 /* 827 * Fill in attribute from the cache. 828 * If valid, then return 0 to indicate that no error occurred, 829 * otherwise return 1 to indicate that an error occurred. 830 */ 831 static int 832 nfs_getattr_cache(vnode_t *vp, struct vattr *vap) 833 { 834 rnode_t *rp; 835 uint_t mask = vap->va_mask; 836 837 rp = VTOR(vp); 838 mutex_enter(&rp->r_statelock); 839 if (ATTRCACHE_VALID(vp)) { 840 /* 841 * Cached attributes are valid 842 */ 843 *vap = rp->r_attr; 844 /* 845 * Set the caller's va_mask to the set of attributes 846 * that were requested ANDed with the attributes that 847 * are available. If attributes were requested that 848 * are not available, those bits must be turned off 849 * in the callers va_mask. 850 */ 851 vap->va_mask &= mask; 852 mutex_exit(&rp->r_statelock); 853 return (0); 854 } 855 mutex_exit(&rp->r_statelock); 856 return (1); 857 } 858 859 /* 860 * Get attributes over-the-wire and update attributes cache 861 * if no error occurred in the over-the-wire operation. 862 * Return 0 if successful, otherwise error. 863 */ 864 int 865 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 866 { 867 int error; 868 struct nfsattrstat ns; 869 int douprintf; 870 mntinfo_t *mi; 871 failinfo_t fi; 872 hrtime_t t; 873 874 mi = VTOMI(vp); 875 fi.vp = vp; 876 fi.fhp = NULL; /* no need to update, filehandle not copied */ 877 fi.copyproc = nfscopyfh; 878 fi.lookupproc = nfslookup; 879 fi.xattrdirproc = acl_getxattrdir2; 880 881 if (mi->mi_flags & MI_ACL) { 882 error = acl_getattr2_otw(vp, vap, cr); 883 if (mi->mi_flags & MI_ACL) 884 return (error); 885 } 886 887 douprintf = 1; 888 889 t = gethrtime(); 890 891 error = rfs2call(mi, RFS_GETATTR, 892 xdr_fhandle, (caddr_t)VTOFH(vp), 893 xdr_attrstat, (caddr_t)&ns, cr, 894 &douprintf, &ns.ns_status, 0, &fi); 895 896 if (!error) { 897 error = geterrno(ns.ns_status); 898 if (!error) 899 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr); 900 else { 901 PURGE_STALE_FH(error, vp, cr); 902 } 903 } 904 905 return (error); 906 } 907 908 /* 909 * Return either cached ot remote attributes. If get remote attr 910 * use them to check and invalidate caches, then cache the new attributes. 911 */ 912 int 913 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 914 { 915 int error; 916 rnode_t *rp; 917 918 /* 919 * If we've got cached attributes, we're done, otherwise go 920 * to the server to get attributes, which will update the cache 921 * in the process. 
922 */ 923 error = nfs_getattr_cache(vp, vap); 924 if (error) 925 error = nfs_getattr_otw(vp, vap, cr); 926 927 /* Return the client's view of file size */ 928 rp = VTOR(vp); 929 mutex_enter(&rp->r_statelock); 930 vap->va_size = rp->r_size; 931 mutex_exit(&rp->r_statelock); 932 933 return (error); 934 } 935 936 /* 937 * Get attributes over-the-wire and update attributes cache 938 * if no error occurred in the over-the-wire operation. 939 * Return 0 if successful, otherwise error. 940 */ 941 int 942 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 943 { 944 int error; 945 GETATTR3args args; 946 GETATTR3vres res; 947 int douprintf; 948 failinfo_t fi; 949 hrtime_t t; 950 951 args.object = *VTOFH3(vp); 952 fi.vp = vp; 953 fi.fhp = (caddr_t)&args.object; 954 fi.copyproc = nfs3copyfh; 955 fi.lookupproc = nfs3lookup; 956 fi.xattrdirproc = acl_getxattrdir3; 957 res.fres.vp = vp; 958 res.fres.vap = vap; 959 960 douprintf = 1; 961 962 t = gethrtime(); 963 964 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR, 965 xdr_nfs_fh3, (caddr_t)&args, 966 xdr_GETATTR3vres, (caddr_t)&res, cr, 967 &douprintf, &res.status, 0, &fi); 968 969 if (error) 970 return (error); 971 972 error = geterrno3(res.status); 973 if (error) { 974 PURGE_STALE_FH(error, vp, cr); 975 return (error); 976 } 977 978 /* 979 * Catch status codes that indicate fattr3 to vattr translation failure 980 */ 981 if (res.fres.status) 982 return (res.fres.status); 983 984 nfs_attr_cache(vp, vap, t, cr); 985 return (0); 986 } 987 988 /* 989 * Return either cached or remote attributes. If get remote attr 990 * use them to check and invalidate caches, then cache the new attributes. 991 */ 992 int 993 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 994 { 995 int error; 996 rnode_t *rp; 997 998 /* 999 * If we've got cached attributes, we're done, otherwise go 1000 * to the server to get attributes, which will update the cache 1001 * in the process. 1002 */ 1003 error = nfs_getattr_cache(vp, vap); 1004 if (error) 1005 error = nfs3_getattr_otw(vp, vap, cr); 1006 1007 /* Return the client's view of file size */ 1008 rp = VTOR(vp); 1009 mutex_enter(&rp->r_statelock); 1010 vap->va_size = rp->r_size; 1011 mutex_exit(&rp->r_statelock); 1012 1013 return (error); 1014 } 1015 1016 vtype_t nf_to_vt[] = { 1017 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK 1018 }; 1019 /* 1020 * Convert NFS Version 2 over the network attributes to the local 1021 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1022 * network representation and the local representation is done here. 1023 * Returns 0 for success, error if failed due to overflow. 1024 */ 1025 int 1026 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap) 1027 { 1028 /* overflow in time attributes? */ 1029 #ifndef _LP64 1030 if (!NFS2_FATTR_TIME_OK(na)) 1031 return (EOVERFLOW); 1032 #endif 1033 1034 vap->va_mask = AT_ALL; 1035 1036 if (na->na_type < NFNON || na->na_type > NFSOC) 1037 vap->va_type = VBAD; 1038 else 1039 vap->va_type = nf_to_vt[na->na_type]; 1040 vap->va_mode = na->na_mode; 1041 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid; 1042 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid; 1043 vap->va_fsid = vp->v_vfsp->vfs_dev; 1044 vap->va_nodeid = na->na_nodeid; 1045 vap->va_nlink = na->na_nlink; 1046 vap->va_size = na->na_size; /* keep for cache validation */ 1047 /* 1048 * nfs protocol defines times as unsigned so don't extend sign, 1049 * unless sysadmin set nfs_allow_preepoch_time. 
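	 *
	 * Note that the v2 wire format carries microseconds, so the
	 * sub-second part is scaled to nanoseconds below, e.g.
	 * tv_usec = 250000 becomes tv_nsec = 250000000.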
1050 */ 1051 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec); 1052 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000); 1053 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec); 1054 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000); 1055 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec); 1056 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000); 1057 /* 1058 * Shannon's law - uncompress the received dev_t 1059 * if the top half of is zero indicating a response 1060 * from an `older style' OS. Except for when it is a 1061 * `new style' OS sending the maj device of zero, 1062 * in which case the algorithm still works because the 1063 * fact that it is a new style server 1064 * is hidden by the minor device not being greater 1065 * than 255 (a requirement in this case). 1066 */ 1067 if ((na->na_rdev & 0xffff0000) == 0) 1068 vap->va_rdev = nfsv2_expdev(na->na_rdev); 1069 else 1070 vap->va_rdev = expldev(na->na_rdev); 1071 1072 vap->va_nblocks = na->na_blocks; 1073 switch (na->na_type) { 1074 case NFBLK: 1075 vap->va_blksize = DEV_BSIZE; 1076 break; 1077 1078 case NFCHR: 1079 vap->va_blksize = MAXBSIZE; 1080 break; 1081 1082 case NFSOC: 1083 default: 1084 vap->va_blksize = na->na_blocksize; 1085 break; 1086 } 1087 /* 1088 * This bit of ugliness is a hack to preserve the 1089 * over-the-wire protocols for named-pipe vnodes. 1090 * It remaps the special over-the-wire type to the 1091 * VFIFO type. (see note in nfs.h) 1092 */ 1093 if (NA_ISFIFO(na)) { 1094 vap->va_type = VFIFO; 1095 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO; 1096 vap->va_rdev = 0; 1097 vap->va_blksize = na->na_blocksize; 1098 } 1099 vap->va_seq = 0; 1100 return (0); 1101 } 1102 1103 /* 1104 * Convert NFS Version 3 over the network attributes to the local 1105 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1106 * network representation and the local representation is done here. 1107 */ 1108 vtype_t nf3_to_vt[] = { 1109 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO 1110 }; 1111 1112 int 1113 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap) 1114 { 1115 1116 #ifndef _LP64 1117 /* overflow in time attributes? */ 1118 if (!NFS3_FATTR_TIME_OK(na)) 1119 return (EOVERFLOW); 1120 #endif 1121 if (!NFS3_SIZE_OK(na->size)) 1122 /* file too big */ 1123 return (EFBIG); 1124 1125 vap->va_mask = AT_ALL; 1126 1127 if (na->type < NF3REG || na->type > NF3FIFO) 1128 vap->va_type = VBAD; 1129 else 1130 vap->va_type = nf3_to_vt[na->type]; 1131 vap->va_mode = na->mode; 1132 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid; 1133 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid; 1134 vap->va_fsid = vp->v_vfsp->vfs_dev; 1135 vap->va_nodeid = na->fileid; 1136 vap->va_nlink = na->nlink; 1137 vap->va_size = na->size; 1138 1139 /* 1140 * nfs protocol defines times as unsigned so don't extend sign, 1141 * unless sysadmin set nfs_allow_preepoch_time. 
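	 *
	 * The v3 wire format already carries nanoseconds, so nseconds is
	 * used as-is below (no scaling, unlike the v2 microsecond case).
	 * For regular files, directories and symlinks, va_nblocks is then
	 * derived from the server's `used' byte count rounded up to
	 * DEV_BSIZE (512-byte) units, e.g. used = 1536 -> 3 blocks.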
 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;

	switch (na->type) {
	case NF3BLK:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = DEV_BSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3CHR:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3REG:
	case NF3DIR:
	case NF3LNK:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = (u_longlong_t)
		    ((na->used + (size3)DEV_BSIZE - (size3)1) /
		    (size3)DEV_BSIZE);
		break;
	case NF3SOCK:
	case NF3FIFO:
	default:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount. The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies. See nfs_async_putpage and nfs_async_start.
 */

int nfs_async_timeout = -1;	/* uninitialized */

static void	nfs_async_start(struct vfs *);
static void	nfs_async_pgops_start(struct vfs *);
static void	nfs_async_common_start(struct vfs *, int);

static void
free_async_args(struct nfs_async_reqs *args)
{
	rnode_t *rp;

	if (args->a_io != NFS_INACTIVE) {
		rp = VTOR(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS_PUTAPAGE ||
		    args->a_io == NFS_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done. It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
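 *
 * Every nfs_async_*() producer below follows the same basic pattern
 * (sketched here, with the error and "no async" fallback paths omitted):
 *
 *	args = kmem_alloc(sizeof (*args), KM_NOSLEEP);
 *	... fill in a_vp, a_cred, a_io and the op-specific fields ...
 *	mutex_enter(&mi->mi_async_lock);
 *	... append args to mi->mi_async_reqs[]/mi_async_tail[] ...
 *	mi->mi_async_req_count++;
 *	cv_signal(&mi->mi_async_reqs_cv);	(wakes the manager)
 *	mutex_exit(&mi->mi_async_lock);
 *
 * The manager then makes sure a worker thread exists and wakes it via
 * mi_async_work_cv; the workers themselves run nfs_async_common_start().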
1236 */ 1237 void 1238 nfs_async_manager(vfs_t *vfsp) 1239 { 1240 callb_cpr_t cprinfo; 1241 mntinfo_t *mi; 1242 uint_t max_threads; 1243 1244 mi = VFTOMI(vfsp); 1245 1246 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1247 "nfs_async_manager"); 1248 1249 mutex_enter(&mi->mi_async_lock); 1250 /* 1251 * We want to stash the max number of threads that this mount was 1252 * allowed so we can use it later when the variable is set to zero as 1253 * part of the zone/mount going away. 1254 * 1255 * We want to be able to create at least one thread to handle 1256 * asynchronous inactive calls. 1257 */ 1258 max_threads = MAX(mi->mi_max_threads, 1); 1259 /* 1260 * We don't want to wait for mi_max_threads to go to zero, since that 1261 * happens as part of a failed unmount, but this thread should only 1262 * exit when the mount/zone is really going away. 1263 * 1264 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be 1265 * attempted: the various _async_*() functions know to do things 1266 * inline if mi_max_threads == 0. Henceforth we just drain out the 1267 * outstanding requests. 1268 * 1269 * Note that we still create zthreads even if we notice the zone is 1270 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone 1271 * shutdown sequence to take slightly longer in some cases, but 1272 * doesn't violate the protocol, as all threads will exit as soon as 1273 * they're done processing the remaining requests. 1274 */ 1275 for (;;) { 1276 while (mi->mi_async_req_count > 0) { 1277 /* 1278 * Paranoia: If the mount started out having 1279 * (mi->mi_max_threads == 0), and the value was 1280 * later changed (via a debugger or somesuch), 1281 * we could be confused since we will think we 1282 * can't create any threads, and the calling 1283 * code (which looks at the current value of 1284 * mi->mi_max_threads, now non-zero) thinks we 1285 * can. 1286 * 1287 * So, because we're paranoid, we create threads 1288 * up to the maximum of the original and the 1289 * current value. This means that future 1290 * (debugger-induced) lowerings of 1291 * mi->mi_max_threads are ignored for our 1292 * purposes, but who told them they could change 1293 * random values on a live kernel anyhow? 1294 */ 1295 if (mi->mi_threads[NFS_ASYNC_QUEUE] < 1296 MAX(mi->mi_max_threads, max_threads)) { 1297 mi->mi_threads[NFS_ASYNC_QUEUE]++; 1298 mutex_exit(&mi->mi_async_lock); 1299 VFS_HOLD(vfsp); /* hold for new thread */ 1300 (void) zthread_create(NULL, 0, nfs_async_start, 1301 vfsp, 0, minclsyspri); 1302 mutex_enter(&mi->mi_async_lock); 1303 } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] < 1304 NUM_ASYNC_PGOPS_THREADS) { 1305 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++; 1306 mutex_exit(&mi->mi_async_lock); 1307 VFS_HOLD(vfsp); /* hold for new thread */ 1308 (void) zthread_create(NULL, 0, 1309 nfs_async_pgops_start, vfsp, 0, 1310 minclsyspri); 1311 mutex_enter(&mi->mi_async_lock); 1312 } 1313 NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv); 1314 ASSERT(mi->mi_async_req_count != 0); 1315 mi->mi_async_req_count--; 1316 } 1317 1318 mutex_enter(&mi->mi_lock); 1319 if (mi->mi_flags & MI_ASYNC_MGR_STOP) { 1320 mutex_exit(&mi->mi_lock); 1321 break; 1322 } 1323 mutex_exit(&mi->mi_lock); 1324 1325 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1326 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1327 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1328 } 1329 /* 1330 * Let everyone know we're done. 
1331 */ 1332 mi->mi_manager_thread = NULL; 1333 cv_broadcast(&mi->mi_async_cv); 1334 1335 /* 1336 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1337 * since CALLB_CPR_EXIT is actually responsible for releasing 1338 * 'mi_async_lock'. 1339 */ 1340 CALLB_CPR_EXIT(&cprinfo); 1341 VFS_RELE(vfsp); /* release thread's hold */ 1342 zthread_exit(); 1343 } 1344 1345 /* 1346 * Signal (and wait for) the async manager thread to clean up and go away. 1347 */ 1348 void 1349 nfs_async_manager_stop(vfs_t *vfsp) 1350 { 1351 mntinfo_t *mi = VFTOMI(vfsp); 1352 1353 mutex_enter(&mi->mi_async_lock); 1354 mutex_enter(&mi->mi_lock); 1355 mi->mi_flags |= MI_ASYNC_MGR_STOP; 1356 mutex_exit(&mi->mi_lock); 1357 cv_broadcast(&mi->mi_async_reqs_cv); 1358 while (mi->mi_manager_thread != NULL) 1359 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1360 mutex_exit(&mi->mi_async_lock); 1361 } 1362 1363 int 1364 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1365 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1366 u_offset_t, caddr_t, struct seg *, cred_t *)) 1367 { 1368 rnode_t *rp; 1369 mntinfo_t *mi; 1370 struct nfs_async_reqs *args; 1371 1372 rp = VTOR(vp); 1373 ASSERT(rp->r_freef == NULL); 1374 1375 mi = VTOMI(vp); 1376 1377 /* 1378 * If addr falls in a different segment, don't bother doing readahead. 1379 */ 1380 if (addr >= seg->s_base + seg->s_size) 1381 return (-1); 1382 1383 /* 1384 * If we can't allocate a request structure, punt on the readahead. 1385 */ 1386 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1387 return (-1); 1388 1389 /* 1390 * If a lock operation is pending, don't initiate any new 1391 * readaheads. Otherwise, bump r_count to indicate the new 1392 * asynchronous I/O. 1393 */ 1394 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1395 kmem_free(args, sizeof (*args)); 1396 return (-1); 1397 } 1398 mutex_enter(&rp->r_statelock); 1399 rp->r_count++; 1400 mutex_exit(&rp->r_statelock); 1401 nfs_rw_exit(&rp->r_lkserlock); 1402 1403 args->a_next = NULL; 1404 #ifdef DEBUG 1405 args->a_queuer = curthread; 1406 #endif 1407 VN_HOLD(vp); 1408 args->a_vp = vp; 1409 ASSERT(cr != NULL); 1410 crhold(cr); 1411 args->a_cred = cr; 1412 args->a_io = NFS_READ_AHEAD; 1413 args->a_nfs_readahead = readahead; 1414 args->a_nfs_blkoff = blkoff; 1415 args->a_nfs_seg = seg; 1416 args->a_nfs_addr = addr; 1417 1418 mutex_enter(&mi->mi_async_lock); 1419 1420 /* 1421 * If asyncio has been disabled, don't bother readahead. 1422 */ 1423 if (mi->mi_max_threads == 0) { 1424 mutex_exit(&mi->mi_async_lock); 1425 goto noasync; 1426 } 1427 1428 /* 1429 * Link request structure into the async list and 1430 * wakeup async thread to do the i/o. 
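	 *
	 * The list is a simple per-type FIFO: mi_async_reqs[NFS_READ_AHEAD]
	 * points at the head and mi_async_tail[NFS_READ_AHEAD] at the last
	 * entry, so an append is either "head = tail = args" for an empty
	 * queue or "tail->a_next = args; tail = args" otherwise.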
1431 */ 1432 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) { 1433 mi->mi_async_reqs[NFS_READ_AHEAD] = args; 1434 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1435 } else { 1436 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args; 1437 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1438 } 1439 1440 if (mi->mi_io_kstats) { 1441 mutex_enter(&mi->mi_lock); 1442 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1443 mutex_exit(&mi->mi_lock); 1444 } 1445 1446 mi->mi_async_req_count++; 1447 ASSERT(mi->mi_async_req_count != 0); 1448 cv_signal(&mi->mi_async_reqs_cv); 1449 mutex_exit(&mi->mi_async_lock); 1450 return (0); 1451 1452 noasync: 1453 mutex_enter(&rp->r_statelock); 1454 rp->r_count--; 1455 cv_broadcast(&rp->r_cv); 1456 mutex_exit(&rp->r_statelock); 1457 VN_RELE(vp); 1458 crfree(cr); 1459 kmem_free(args, sizeof (*args)); 1460 return (-1); 1461 } 1462 1463 int 1464 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 1465 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *, 1466 u_offset_t, size_t, int, cred_t *)) 1467 { 1468 rnode_t *rp; 1469 mntinfo_t *mi; 1470 struct nfs_async_reqs *args; 1471 1472 ASSERT(flags & B_ASYNC); 1473 ASSERT(vp->v_vfsp != NULL); 1474 1475 rp = VTOR(vp); 1476 ASSERT(rp->r_count > 0); 1477 1478 mi = VTOMI(vp); 1479 1480 /* 1481 * If we can't allocate a request structure, do the putpage 1482 * operation synchronously in this thread's context. 1483 */ 1484 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1485 goto noasync; 1486 1487 args->a_next = NULL; 1488 #ifdef DEBUG 1489 args->a_queuer = curthread; 1490 #endif 1491 VN_HOLD(vp); 1492 args->a_vp = vp; 1493 ASSERT(cr != NULL); 1494 crhold(cr); 1495 args->a_cred = cr; 1496 args->a_io = NFS_PUTAPAGE; 1497 args->a_nfs_putapage = putapage; 1498 args->a_nfs_pp = pp; 1499 args->a_nfs_off = off; 1500 args->a_nfs_len = (uint_t)len; 1501 args->a_nfs_flags = flags; 1502 1503 mutex_enter(&mi->mi_async_lock); 1504 1505 /* 1506 * If asyncio has been disabled, then make a synchronous request. 1507 * This check is done a second time in case async io was diabled 1508 * while this thread was blocked waiting for memory pressure to 1509 * reduce or for the queue to drain. 1510 */ 1511 if (mi->mi_max_threads == 0) { 1512 mutex_exit(&mi->mi_async_lock); 1513 goto noasync; 1514 } 1515 1516 /* 1517 * Link request structure into the async list and 1518 * wakeup async thread to do the i/o. 1519 */ 1520 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) { 1521 mi->mi_async_reqs[NFS_PUTAPAGE] = args; 1522 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1523 } else { 1524 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args; 1525 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1526 } 1527 1528 mutex_enter(&rp->r_statelock); 1529 rp->r_count++; 1530 rp->r_awcount++; 1531 mutex_exit(&rp->r_statelock); 1532 1533 if (mi->mi_io_kstats) { 1534 mutex_enter(&mi->mi_lock); 1535 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1536 mutex_exit(&mi->mi_lock); 1537 } 1538 1539 mi->mi_async_req_count++; 1540 ASSERT(mi->mi_async_req_count != 0); 1541 cv_signal(&mi->mi_async_reqs_cv); 1542 mutex_exit(&mi->mi_async_lock); 1543 return (0); 1544 1545 noasync: 1546 if (args != NULL) { 1547 VN_RELE(vp); 1548 crfree(cr); 1549 kmem_free(args, sizeof (*args)); 1550 } 1551 1552 if (curproc == proc_pageout || curproc == proc_fsflush) { 1553 /* 1554 * If we get here in the context of the pageout/fsflush, 1555 * we refuse to do a sync write, because this may hang 1556 * pageout (and the machine). 
In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set. We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done(). However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}
	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync putpage. We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*putapage)(vp, pp, off, len, flags, cr));
}

int
nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PAGEIO;
	args->a_nfs_pageio = pageio;
	args->a_nfs_pp = pp;
	args->a_nfs_off = io_off;
	args->a_nfs_len = (uint_t)io_len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
1641 */ 1642 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) { 1643 mi->mi_async_reqs[NFS_PAGEIO] = args; 1644 mi->mi_async_tail[NFS_PAGEIO] = args; 1645 } else { 1646 mi->mi_async_tail[NFS_PAGEIO]->a_next = args; 1647 mi->mi_async_tail[NFS_PAGEIO] = args; 1648 } 1649 1650 mutex_enter(&rp->r_statelock); 1651 rp->r_count++; 1652 rp->r_awcount++; 1653 mutex_exit(&rp->r_statelock); 1654 1655 if (mi->mi_io_kstats) { 1656 mutex_enter(&mi->mi_lock); 1657 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1658 mutex_exit(&mi->mi_lock); 1659 } 1660 1661 mi->mi_async_req_count++; 1662 ASSERT(mi->mi_async_req_count != 0); 1663 cv_signal(&mi->mi_async_reqs_cv); 1664 mutex_exit(&mi->mi_async_lock); 1665 return (0); 1666 1667 noasync: 1668 if (args != NULL) { 1669 VN_RELE(vp); 1670 crfree(cr); 1671 kmem_free(args, sizeof (*args)); 1672 } 1673 1674 /* 1675 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1676 * the page list), for writes we do it synchronously, except for 1677 * proc_pageout/proc_fsflush as described below. 1678 */ 1679 if (flags & B_READ) { 1680 pvn_read_done(pp, flags | B_ERROR); 1681 return (0); 1682 } 1683 1684 if (curproc == proc_pageout || curproc == proc_fsflush) { 1685 /* 1686 * If we get here in the context of the pageout/fsflush, 1687 * we refuse to do a sync write, because this may hang 1688 * pageout/fsflush (and the machine). In this case, we just 1689 * re-mark the page as dirty and punt on the page. 1690 * 1691 * Make sure B_FORCE isn't set. We can re-mark the 1692 * pages as dirty and unlock the pages in one swoop by 1693 * passing in B_ERROR to pvn_write_done(). However, 1694 * we should make sure B_FORCE isn't set - we don't 1695 * want the page tossed before it gets written out. 1696 */ 1697 if (flags & B_FORCE) 1698 flags &= ~(B_INVAL | B_FORCE); 1699 pvn_write_done(pp, flags | B_ERROR); 1700 return (0); 1701 } 1702 1703 if (nfs_zone() != mi->mi_zone) { 1704 /* 1705 * So this was a cross-zone sync pageio. We pass in B_ERROR 1706 * to pvn_write_done() to re-mark the pages as dirty and unlock 1707 * them. 1708 * 1709 * We don't want to clear B_FORCE here as the caller presumably 1710 * knows what they're doing if they set it. 1711 */ 1712 pvn_write_done(pp, flags | B_ERROR); 1713 return (EPERM); 1714 } 1715 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1716 } 1717 1718 void 1719 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr, 1720 int (*readdir)(vnode_t *, rddir_cache *, cred_t *)) 1721 { 1722 rnode_t *rp; 1723 mntinfo_t *mi; 1724 struct nfs_async_reqs *args; 1725 1726 rp = VTOR(vp); 1727 ASSERT(rp->r_freef == NULL); 1728 1729 mi = VTOMI(vp); 1730 1731 /* 1732 * If we can't allocate a request structure, do the readdir 1733 * operation synchronously in this thread's context. 1734 */ 1735 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1736 goto noasync; 1737 1738 args->a_next = NULL; 1739 #ifdef DEBUG 1740 args->a_queuer = curthread; 1741 #endif 1742 VN_HOLD(vp); 1743 args->a_vp = vp; 1744 ASSERT(cr != NULL); 1745 crhold(cr); 1746 args->a_cred = cr; 1747 args->a_io = NFS_READDIR; 1748 args->a_nfs_readdir = readdir; 1749 args->a_nfs_rdc = rdc; 1750 1751 mutex_enter(&mi->mi_async_lock); 1752 1753 /* 1754 * If asyncio has been disabled, then make a synchronous request. 1755 */ 1756 if (mi->mi_max_threads == 0) { 1757 mutex_exit(&mi->mi_async_lock); 1758 goto noasync; 1759 } 1760 1761 /* 1762 * Link request structure into the async list and 1763 * wakeup async thread to do the i/o. 
1764 */ 1765 if (mi->mi_async_reqs[NFS_READDIR] == NULL) { 1766 mi->mi_async_reqs[NFS_READDIR] = args; 1767 mi->mi_async_tail[NFS_READDIR] = args; 1768 } else { 1769 mi->mi_async_tail[NFS_READDIR]->a_next = args; 1770 mi->mi_async_tail[NFS_READDIR] = args; 1771 } 1772 1773 mutex_enter(&rp->r_statelock); 1774 rp->r_count++; 1775 mutex_exit(&rp->r_statelock); 1776 1777 if (mi->mi_io_kstats) { 1778 mutex_enter(&mi->mi_lock); 1779 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1780 mutex_exit(&mi->mi_lock); 1781 } 1782 1783 mi->mi_async_req_count++; 1784 ASSERT(mi->mi_async_req_count != 0); 1785 cv_signal(&mi->mi_async_reqs_cv); 1786 mutex_exit(&mi->mi_async_lock); 1787 return; 1788 1789 noasync: 1790 if (args != NULL) { 1791 VN_RELE(vp); 1792 crfree(cr); 1793 kmem_free(args, sizeof (*args)); 1794 } 1795 1796 rdc->entries = NULL; 1797 mutex_enter(&rp->r_statelock); 1798 ASSERT(rdc->flags & RDDIR); 1799 rdc->flags &= ~RDDIR; 1800 rdc->flags |= RDDIRREQ; 1801 /* 1802 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT 1803 * is set, wakeup the thread sleeping in cv_wait_sig(). 1804 * The woken up thread will reset the flag to RDDIR and will 1805 * continue with the readdir opeartion. 1806 */ 1807 if (rdc->flags & RDDIRWAIT) { 1808 rdc->flags &= ~RDDIRWAIT; 1809 cv_broadcast(&rdc->cv); 1810 } 1811 mutex_exit(&rp->r_statelock); 1812 rddir_cache_rele(rdc); 1813 } 1814 1815 void 1816 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 1817 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *)) 1818 { 1819 rnode_t *rp; 1820 mntinfo_t *mi; 1821 struct nfs_async_reqs *args; 1822 page_t *pp; 1823 1824 rp = VTOR(vp); 1825 mi = VTOMI(vp); 1826 1827 /* 1828 * If we can't allocate a request structure, do the commit 1829 * operation synchronously in this thread's context. 1830 */ 1831 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1832 goto noasync; 1833 1834 args->a_next = NULL; 1835 #ifdef DEBUG 1836 args->a_queuer = curthread; 1837 #endif 1838 VN_HOLD(vp); 1839 args->a_vp = vp; 1840 ASSERT(cr != NULL); 1841 crhold(cr); 1842 args->a_cred = cr; 1843 args->a_io = NFS_COMMIT; 1844 args->a_nfs_commit = commit; 1845 args->a_nfs_plist = plist; 1846 args->a_nfs_offset = offset; 1847 args->a_nfs_count = count; 1848 1849 mutex_enter(&mi->mi_async_lock); 1850 1851 /* 1852 * If asyncio has been disabled, then make a synchronous request. 1853 * This check is done a second time in case async io was diabled 1854 * while this thread was blocked waiting for memory pressure to 1855 * reduce or for the queue to drain. 1856 */ 1857 if (mi->mi_max_threads == 0) { 1858 mutex_exit(&mi->mi_async_lock); 1859 goto noasync; 1860 } 1861 1862 /* 1863 * Link request structure into the async list and 1864 * wakeup async thread to do the i/o. 
1865 */ 1866 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) { 1867 mi->mi_async_reqs[NFS_COMMIT] = args; 1868 mi->mi_async_tail[NFS_COMMIT] = args; 1869 } else { 1870 mi->mi_async_tail[NFS_COMMIT]->a_next = args; 1871 mi->mi_async_tail[NFS_COMMIT] = args; 1872 } 1873 1874 mutex_enter(&rp->r_statelock); 1875 rp->r_count++; 1876 mutex_exit(&rp->r_statelock); 1877 1878 if (mi->mi_io_kstats) { 1879 mutex_enter(&mi->mi_lock); 1880 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1881 mutex_exit(&mi->mi_lock); 1882 } 1883 1884 mi->mi_async_req_count++; 1885 ASSERT(mi->mi_async_req_count != 0); 1886 cv_signal(&mi->mi_async_reqs_cv); 1887 mutex_exit(&mi->mi_async_lock); 1888 return; 1889 1890 noasync: 1891 if (args != NULL) { 1892 VN_RELE(vp); 1893 crfree(cr); 1894 kmem_free(args, sizeof (*args)); 1895 } 1896 1897 if (curproc == proc_pageout || curproc == proc_fsflush || 1898 nfs_zone() != mi->mi_zone) { 1899 while (plist != NULL) { 1900 pp = plist; 1901 page_sub(&plist, pp); 1902 pp->p_fsdata = C_COMMIT; 1903 page_unlock(pp); 1904 } 1905 return; 1906 } 1907 (*commit)(vp, plist, offset, count, cr); 1908 } 1909 1910 void 1911 nfs_async_inactive(vnode_t *vp, cred_t *cr, 1912 void (*inactive)(vnode_t *, cred_t *, caller_context_t *)) 1913 { 1914 mntinfo_t *mi; 1915 struct nfs_async_reqs *args; 1916 1917 mi = VTOMI(vp); 1918 1919 args = kmem_alloc(sizeof (*args), KM_SLEEP); 1920 args->a_next = NULL; 1921 #ifdef DEBUG 1922 args->a_queuer = curthread; 1923 #endif 1924 args->a_vp = vp; 1925 ASSERT(cr != NULL); 1926 crhold(cr); 1927 args->a_cred = cr; 1928 args->a_io = NFS_INACTIVE; 1929 args->a_nfs_inactive = inactive; 1930 1931 /* 1932 * Note that we don't check mi->mi_max_threads here, since we 1933 * *need* to get rid of this vnode regardless of whether someone 1934 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system. 1935 * 1936 * The manager thread knows about this and is willing to create 1937 * at least one thread to accommodate us. 1938 */ 1939 mutex_enter(&mi->mi_async_lock); 1940 if (mi->mi_manager_thread == NULL) { 1941 rnode_t *rp = VTOR(vp); 1942 1943 mutex_exit(&mi->mi_async_lock); 1944 crfree(cr); /* drop our reference */ 1945 kmem_free(args, sizeof (*args)); 1946 /* 1947 * We can't do an over-the-wire call since we're in the wrong 1948 * zone, so we need to clean up state as best we can and then 1949 * throw away the vnode. 1950 */ 1951 mutex_enter(&rp->r_statelock); 1952 if (rp->r_unldvp != NULL) { 1953 vnode_t *unldvp; 1954 char *unlname; 1955 cred_t *unlcred; 1956 1957 unldvp = rp->r_unldvp; 1958 rp->r_unldvp = NULL; 1959 unlname = rp->r_unlname; 1960 rp->r_unlname = NULL; 1961 unlcred = rp->r_unlcred; 1962 rp->r_unlcred = NULL; 1963 mutex_exit(&rp->r_statelock); 1964 1965 VN_RELE(unldvp); 1966 kmem_free(unlname, MAXNAMELEN); 1967 crfree(unlcred); 1968 } else { 1969 mutex_exit(&rp->r_statelock); 1970 } 1971 /* 1972 * No need to explicitly throw away any cached pages. The 1973 * eventual rinactive() will attempt a synchronous 1974 * VOP_PUTPAGE() which will immediately fail since the request 1975 * is coming from the wrong zone, and then will proceed to call 1976 * nfs_invalidate_pages() which will clean things up for us. 1977 */ 1978 rp_addfree(VTOR(vp), cr); 1979 return; 1980 } 1981 1982 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) { 1983 mi->mi_async_reqs[NFS_INACTIVE] = args; 1984 } else { 1985 mi->mi_async_tail[NFS_INACTIVE]->a_next = args; 1986 } 1987 mi->mi_async_tail[NFS_INACTIVE] = args; 1988 /* 1989 * Don't increment r_count, since we're trying to get rid of the vnode. 
1990 */ 1991 1992 mi->mi_async_req_count++; 1993 ASSERT(mi->mi_async_req_count != 0); 1994 cv_signal(&mi->mi_async_reqs_cv); 1995 mutex_exit(&mi->mi_async_lock); 1996 } 1997 1998 static void 1999 nfs_async_start(struct vfs *vfsp) 2000 { 2001 nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE); 2002 } 2003 2004 static void 2005 nfs_async_pgops_start(struct vfs *vfsp) 2006 { 2007 nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE); 2008 } 2009 2010 /* 2011 * The async queues for each mounted file system are arranged as a 2012 * set of queues, one for each async i/o type. Requests are taken 2013 * from the queues in a round-robin fashion. A number of consecutive 2014 * requests are taken from each queue before moving on to the next 2015 * queue. This functionality may allow the NFS Version 2 server to do 2016 * write clustering, even if the client is mixing writes and reads 2017 * because it will take multiple write requests from the queue 2018 * before processing any of the other async i/o types. 2019 * 2020 * XXX The nfs_async_common_start thread is unsafe in the light of the present 2021 * model defined by cpr to suspend the system. Specifically, over-the-wire 2022 * calls are cpr-unsafe. The thread should be reevaluated in 2023 * case of future updates to the cpr model. 2024 */ 2025 static void 2026 nfs_async_common_start(struct vfs *vfsp, int async_queue) 2027 { 2028 struct nfs_async_reqs *args; 2029 mntinfo_t *mi = VFTOMI(vfsp); 2030 clock_t time_left = 1; 2031 callb_cpr_t cprinfo; 2032 int i; 2033 int async_types; 2034 kcondvar_t *async_work_cv; 2035 2036 if (async_queue == NFS_ASYNC_QUEUE) { 2037 async_types = NFS_ASYNC_TYPES; 2038 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE]; 2039 } else { 2040 async_types = NFS_ASYNC_PGOPS_TYPES; 2041 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]; 2042 } 2043 2044 /* 2045 * Dynamic initialization of nfs_async_timeout to allow nfs to be 2046 * built in an implementation independent manner. 2047 */ 2048 if (nfs_async_timeout == -1) 2049 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 2050 2051 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 2052 2053 mutex_enter(&mi->mi_async_lock); 2054 for (;;) { 2055 /* 2056 * Find the next queue containing an entry. We start 2057 * at the current queue pointer and then round robin 2058 * through all of them until we either find a non-empty 2059 * queue or have looked through all of them. 2060 */ 2061 for (i = 0; i < async_types; i++) { 2062 args = *mi->mi_async_curr[async_queue]; 2063 if (args != NULL) 2064 break; 2065 mi->mi_async_curr[async_queue]++; 2066 if (mi->mi_async_curr[async_queue] == 2067 &mi->mi_async_reqs[async_types]) { 2068 mi->mi_async_curr[async_queue] = 2069 &mi->mi_async_reqs[0]; 2070 } 2071 } 2072 /* 2073 * If we didn't find an entry, then block until woken up 2074 * again and then look through the queues again. 2075 */ 2076 if (args == NULL) { 2077 /* 2078 * Exiting is considered to be safe for CPR as well 2079 */ 2080 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2081 2082 /* 2083 * Wakeup thread waiting to unmount the file 2084 * system only if all async threads are inactive. 2085 * 2086 * If we've timed out and there's nothing to do, 2087 * then get rid of this thread.
2088 */ 2089 if (mi->mi_max_threads == 0 || time_left <= 0) { 2090 --mi->mi_threads[async_queue]; 2091 2092 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 && 2093 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0) 2094 cv_signal(&mi->mi_async_cv); 2095 CALLB_CPR_EXIT(&cprinfo); 2096 VFS_RELE(vfsp); /* release thread's hold */ 2097 zthread_exit(); 2098 /* NOTREACHED */ 2099 } 2100 time_left = cv_reltimedwait(async_work_cv, 2101 &mi->mi_async_lock, nfs_async_timeout, 2102 TR_CLOCK_TICK); 2103 2104 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 2105 2106 continue; 2107 } 2108 time_left = 1; 2109 2110 /* 2111 * Remove the request from the async queue and then 2112 * update the current async request queue pointer. If 2113 * the current queue is empty or we have removed enough 2114 * consecutive entries from it, then reset the counter 2115 * for this queue and then move the current pointer to 2116 * the next queue. 2117 */ 2118 *mi->mi_async_curr[async_queue] = args->a_next; 2119 if (*mi->mi_async_curr[async_queue] == NULL || 2120 --mi->mi_async_clusters[args->a_io] == 0) { 2121 mi->mi_async_clusters[args->a_io] = 2122 mi->mi_async_init_clusters; 2123 mi->mi_async_curr[async_queue]++; 2124 if (mi->mi_async_curr[async_queue] == 2125 &mi->mi_async_reqs[async_types]) { 2126 mi->mi_async_curr[async_queue] = 2127 &mi->mi_async_reqs[0]; 2128 } 2129 } 2130 2131 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) { 2132 mutex_enter(&mi->mi_lock); 2133 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 2134 mutex_exit(&mi->mi_lock); 2135 } 2136 2137 mutex_exit(&mi->mi_async_lock); 2138 2139 /* 2140 * Obtain arguments from the async request structure. 2141 */ 2142 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) { 2143 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff, 2144 args->a_nfs_addr, args->a_nfs_seg, 2145 args->a_cred); 2146 } else if (args->a_io == NFS_PUTAPAGE) { 2147 (void) (*args->a_nfs_putapage)(args->a_vp, 2148 args->a_nfs_pp, args->a_nfs_off, 2149 args->a_nfs_len, args->a_nfs_flags, 2150 args->a_cred); 2151 } else if (args->a_io == NFS_PAGEIO) { 2152 (void) (*args->a_nfs_pageio)(args->a_vp, 2153 args->a_nfs_pp, args->a_nfs_off, 2154 args->a_nfs_len, args->a_nfs_flags, 2155 args->a_cred); 2156 } else if (args->a_io == NFS_READDIR) { 2157 (void) ((*args->a_nfs_readdir)(args->a_vp, 2158 args->a_nfs_rdc, args->a_cred)); 2159 } else if (args->a_io == NFS_COMMIT) { 2160 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist, 2161 args->a_nfs_offset, args->a_nfs_count, 2162 args->a_cred); 2163 } else if (args->a_io == NFS_INACTIVE) { 2164 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL); 2165 } 2166 2167 /* 2168 * Now, release the vnode and free the credentials 2169 * structure. 2170 */ 2171 free_async_args(args); 2172 /* 2173 * Reacquire the mutex because it will be needed above. 2174 */ 2175 mutex_enter(&mi->mi_async_lock); 2176 } 2177 } 2178 2179 void 2180 nfs_async_stop(struct vfs *vfsp) 2181 { 2182 mntinfo_t *mi = VFTOMI(vfsp); 2183 2184 /* 2185 * Wait for all outstanding async operations to complete and for the 2186 * worker threads to exit. 2187 */ 2188 mutex_enter(&mi->mi_async_lock); 2189 mi->mi_max_threads = 0; 2190 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2191 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2192 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) 2193 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2194 mutex_exit(&mi->mi_async_lock); 2195 } 2196 2197 /* 2198 * nfs_async_stop_sig: 2199 * Wait for all outstanding putpage operation to complete. 
If a signal 2200 * is delivered we will abort and return non-zero. If we can put all the 2201 * pages we will return 0. This routine is called from nfs_unmount and 2202 * nfs3_unmount to make these operations interruptible. 2203 */ 2204 int 2205 nfs_async_stop_sig(struct vfs *vfsp) 2206 { 2207 mntinfo_t *mi = VFTOMI(vfsp); 2208 ushort_t omax; 2209 int rval; 2210 2211 /* 2212 * Wait for all outstanding async operations to complete and for the 2213 * worker threads to exit. 2214 */ 2215 mutex_enter(&mi->mi_async_lock); 2216 omax = mi->mi_max_threads; 2217 mi->mi_max_threads = 0; 2218 /* 2219 * Tell all the worker threads to exit. 2220 */ 2221 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2222 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2223 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) { 2224 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) 2225 break; 2226 } 2227 rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2228 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */ 2229 if (rval) 2230 mi->mi_max_threads = omax; 2231 mutex_exit(&mi->mi_async_lock); 2232 2233 return (rval); 2234 } 2235 2236 int 2237 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2238 { 2239 int pagecreate; 2240 int n; 2241 int saved_n; 2242 caddr_t saved_base; 2243 u_offset_t offset; 2244 int error; 2245 int sm_error; 2246 vnode_t *vp = RTOV(rp); 2247 2248 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2249 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2250 if (!vpm_enable) { 2251 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2252 } 2253 2254 /* 2255 * Move bytes in at most PAGESIZE chunks. We must avoid 2256 * spanning pages in uiomove() because page faults may cause 2257 * the cache to be invalidated out from under us. The r_size is not 2258 * updated until after the uiomove. If we push the last page of a 2259 * file before r_size is correct, we will lose the data written past 2260 * the current (and invalid) r_size. 2261 */ 2262 do { 2263 offset = uio->uio_loffset; 2264 pagecreate = 0; 2265 2266 /* 2267 * n is the number of bytes required to satisfy the request 2268 * or the number of bytes to fill out the page. 2269 */ 2270 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2271 2272 /* 2273 * Check to see if we can skip reading in the page 2274 * and just allocate the memory. We can do this 2275 * if we are going to rewrite the entire mapping 2276 * or if we are going to write to or beyond the current 2277 * end of file from the beginning of the mapping. 2278 * 2279 * The read of r_size is now protected by r_statelock. 2280 */ 2281 mutex_enter(&rp->r_statelock); 2282 /* 2283 * When pgcreated is nonzero the caller has already done 2284 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2285 * segkpm this means we already have at least one page 2286 * created and mapped at base. 2287 */ 2288 pagecreate = pgcreated || 2289 ((offset & PAGEOFFSET) == 0 && 2290 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2291 2292 mutex_exit(&rp->r_statelock); 2293 if (!vpm_enable && pagecreate) { 2294 /* 2295 * The last argument tells segmap_pagecreate() to 2296 * always lock the page, as opposed to sometimes 2297 * returning with the page locked. This way we avoid a 2298 * fault on the ensuing uiomove(), but also 2299 * more importantly (to fix bug 1094402) we can 2300 * call segmap_fault() to unlock the page in all 2301 * cases.
An alternative would be to modify 2302 * segmap_pagecreate() to tell us when it is 2303 * locking a page, but that's a fairly major 2304 * interface change. 2305 */ 2306 if (pgcreated == 0) 2307 (void) segmap_pagecreate(segkmap, base, 2308 (uint_t)n, 1); 2309 saved_base = base; 2310 saved_n = n; 2311 } 2312 2313 /* 2314 * The number of bytes of data in the last page cannot 2315 * be accurately determined while the page is being 2316 * uiomove'd to and the size of the file is being updated. 2317 * Thus, inform threads which need to know accurately 2318 * how much data is in the last page of the file. They 2319 * will not do the i/o immediately, but will arrange for 2320 * the i/o to happen later when this modify operation 2321 * will have finished. 2322 */ 2323 ASSERT(!(rp->r_flags & RMODINPROGRESS)); 2324 mutex_enter(&rp->r_statelock); 2325 rp->r_flags |= RMODINPROGRESS; 2326 rp->r_modaddr = (offset & MAXBMASK); 2327 mutex_exit(&rp->r_statelock); 2328 2329 if (vpm_enable) { 2330 /* 2331 * Copy data. If new pages are created, part of 2332 * the page that is not written will be initialized 2333 * with zeros. 2334 */ 2335 error = vpm_data_copy(vp, offset, n, uio, 2336 !pagecreate, NULL, 0, S_WRITE); 2337 } else { 2338 error = uiomove(base, n, UIO_WRITE, uio); 2339 } 2340 2341 /* 2342 * r_size is the maximum number of 2343 * bytes known to be in the file. 2344 * Make sure it is at least as high as the 2345 * first unwritten byte pointed to by uio_loffset. 2346 */ 2347 mutex_enter(&rp->r_statelock); 2348 if (rp->r_size < uio->uio_loffset) 2349 rp->r_size = uio->uio_loffset; 2350 rp->r_flags &= ~RMODINPROGRESS; 2351 rp->r_flags |= RDIRTY; 2352 mutex_exit(&rp->r_statelock); 2353 2354 /* n = # of bytes written */ 2355 n = (int)(uio->uio_loffset - offset); 2356 2357 if (!vpm_enable) { 2358 base += n; 2359 } 2360 tcount -= n; 2361 /* 2362 * If we created pages w/o initializing them completely, 2363 * we need to zero the part that wasn't set up. 2364 * This happens in most EOF write cases and if 2365 * we had some sort of error during the uiomove. 2366 */ 2367 if (!vpm_enable && pagecreate) { 2368 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2369 (void) kzero(base, PAGESIZE - n); 2370 2371 if (pgcreated) { 2372 /* 2373 * Caller is responsible for this page, 2374 * it was not created in this loop. 2375 */ 2376 pgcreated = 0; 2377 } else { 2378 /* 2379 * For bug 1094402: segmap_pagecreate locks 2380 * page. Unlock it. This also unlocks the 2381 * pages allocated by page_create_va() in 2382 * segmap_pagecreate(). 2383 */ 2384 sm_error = segmap_fault(kas.a_hat, segkmap, 2385 saved_base, saved_n, 2386 F_SOFTUNLOCK, S_WRITE); 2387 if (error == 0) 2388 error = sm_error; 2389 } 2390 } 2391 } while (tcount > 0 && error == 0); 2392 2393 return (error); 2394 } 2395 2396 int 2397 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2398 { 2399 rnode_t *rp; 2400 page_t *pp; 2401 u_offset_t eoff; 2402 u_offset_t io_off; 2403 size_t io_len; 2404 int error; 2405 int rdirty; 2406 int err; 2407 2408 rp = VTOR(vp); 2409 ASSERT(rp->r_count > 0); 2410 2411 if (!vn_has_cached_data(vp)) 2412 return (0); 2413 2414 ASSERT(vp->v_type != VCHR); 2415 2416 /* 2417 * If ROUTOFSPACE is set, then all writes turn into B_INVAL 2418 * writes. B_FORCE is set to force the VM system to actually 2419 * invalidate the pages, even if the i/o failed.
The pages 2420 * need to get invalidated because they can't be written out 2421 * because there isn't any space left on either the server's 2422 * file system or in the user's disk quota. The B_FREE bit 2423 * is cleared to avoid confusion as to whether this is a 2424 * request to place the page on the freelist or to destroy 2425 * it. 2426 */ 2427 if ((rp->r_flags & ROUTOFSPACE) || 2428 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2429 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2430 2431 if (len == 0) { 2432 /* 2433 * If doing a full file synchronous operation, then clear 2434 * the RDIRTY bit. If a page gets dirtied while the flush 2435 * is happening, then RDIRTY will get set again. The 2436 * RDIRTY bit must get cleared before the flush so that 2437 * we don't lose this information. 2438 * 2439 * If there are no full file async write operations 2440 * pending and RDIRTY bit is set, clear it. 2441 */ 2442 if (off == (u_offset_t)0 && 2443 !(flags & B_ASYNC) && 2444 (rp->r_flags & RDIRTY)) { 2445 mutex_enter(&rp->r_statelock); 2446 rdirty = (rp->r_flags & RDIRTY); 2447 rp->r_flags &= ~RDIRTY; 2448 mutex_exit(&rp->r_statelock); 2449 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2450 mutex_enter(&rp->r_statelock); 2451 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) { 2452 rdirty = (rp->r_flags & RDIRTY); 2453 rp->r_flags &= ~RDIRTY; 2454 } 2455 mutex_exit(&rp->r_statelock); 2456 } else 2457 rdirty = 0; 2458 2459 /* 2460 * Search the entire vp list for pages >= off, and flush 2461 * the dirty pages. 2462 */ 2463 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2464 flags, cr); 2465 2466 /* 2467 * If an error occurred and the file was marked as dirty 2468 * before and we aren't forcibly invalidating pages, then 2469 * reset the RDIRTY flag. 2470 */ 2471 if (error && rdirty && 2472 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2473 mutex_enter(&rp->r_statelock); 2474 rp->r_flags |= RDIRTY; 2475 mutex_exit(&rp->r_statelock); 2476 } 2477 } else { 2478 /* 2479 * Do a range from [off...off + len) looking for pages 2480 * to deal with. 2481 */ 2482 error = 0; 2483 #ifdef lint 2484 io_len = 0; 2485 #endif 2486 eoff = off + len; 2487 mutex_enter(&rp->r_statelock); 2488 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2489 io_off += io_len) { 2490 mutex_exit(&rp->r_statelock); 2491 /* 2492 * If we are not invalidating, synchronously 2493 * freeing or writing pages use the routine 2494 * page_lookup_nowait() to prevent reclaiming 2495 * them from the free list. 2496 */ 2497 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2498 pp = page_lookup(vp, io_off, 2499 (flags & (B_INVAL | B_FREE)) ? 2500 SE_EXCL : SE_SHARED); 2501 } else { 2502 pp = page_lookup_nowait(vp, io_off, 2503 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2504 } 2505 2506 if (pp == NULL || !pvn_getdirty(pp, flags)) 2507 io_len = PAGESIZE; 2508 else { 2509 err = (*rp->r_putapage)(vp, pp, &io_off, 2510 &io_len, flags, cr); 2511 if (!error) 2512 error = err; 2513 /* 2514 * "io_off" and "io_len" are returned as 2515 * the range of pages we actually wrote. 2516 * This allows us to skip ahead more quickly 2517 * since several pages may've been dealt 2518 * with by this iteration of the loop. 
2519 */ 2520 } 2521 mutex_enter(&rp->r_statelock); 2522 } 2523 mutex_exit(&rp->r_statelock); 2524 } 2525 2526 return (error); 2527 } 2528 2529 void 2530 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2531 { 2532 rnode_t *rp; 2533 2534 rp = VTOR(vp); 2535 mutex_enter(&rp->r_statelock); 2536 while (rp->r_flags & RTRUNCATE) 2537 cv_wait(&rp->r_cv, &rp->r_statelock); 2538 rp->r_flags |= RTRUNCATE; 2539 if (off == (u_offset_t)0) { 2540 rp->r_flags &= ~RDIRTY; 2541 if (!(rp->r_flags & RSTALE)) 2542 rp->r_error = 0; 2543 } 2544 rp->r_truncaddr = off; 2545 mutex_exit(&rp->r_statelock); 2546 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2547 B_INVAL | B_TRUNC, cr); 2548 mutex_enter(&rp->r_statelock); 2549 rp->r_flags &= ~RTRUNCATE; 2550 cv_broadcast(&rp->r_cv); 2551 mutex_exit(&rp->r_statelock); 2552 } 2553 2554 static int nfs_write_error_to_cons_only = 0; 2555 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1) 2556 2557 /* 2558 * Print a file handle 2559 */ 2560 void 2561 nfs_printfhandle(nfs_fhandle *fhp) 2562 { 2563 int *ip; 2564 char *buf; 2565 size_t bufsize; 2566 char *cp; 2567 2568 /* 2569 * 13 == "(file handle:" 2570 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2571 * 1 == ' ' 2572 * 8 == maximum strlen of "%x" 2573 * 3 == ")\n\0" 2574 */ 2575 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2576 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2577 if (buf == NULL) 2578 return; 2579 2580 cp = buf; 2581 (void) strcpy(cp, "(file handle:"); 2582 while (*cp != '\0') 2583 cp++; 2584 for (ip = (int *)fhp->fh_buf; 2585 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2586 ip++) { 2587 (void) sprintf(cp, " %x", *ip); 2588 while (*cp != '\0') 2589 cp++; 2590 } 2591 (void) strcpy(cp, ")\n"); 2592 2593 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2594 2595 kmem_free(buf, bufsize); 2596 } 2597 2598 /* 2599 * Notify the system administrator that an NFS write error has 2600 * occurred. 2601 */ 2602 2603 /* seconds between ENOSPC/EDQUOT messages */ 2604 clock_t nfs_write_error_interval = 5; 2605 2606 void 2607 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2608 { 2609 mntinfo_t *mi; 2610 clock_t now; 2611 2612 mi = VTOMI(vp); 2613 /* 2614 * In case of forced unmount or zone shutdown, do not print any 2615 * messages since it can flood the console with error messages. 2616 */ 2617 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2618 return; 2619 2620 /* 2621 * No use in flooding the console with ENOSPC 2622 * messages from the same file system. 
2623 */ 2624 now = ddi_get_lbolt(); 2625 if ((error != ENOSPC && error != EDQUOT) || 2626 now - mi->mi_printftime > 0) { 2627 zoneid_t zoneid = mi->mi_zone->zone_id; 2628 2629 #ifdef DEBUG 2630 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2631 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2632 #else 2633 nfs_perror(error, "NFS write error on host %s: %m.\n", 2634 VTOR(vp)->r_server->sv_hostname, NULL); 2635 #endif 2636 if (error == ENOSPC || error == EDQUOT) { 2637 zcmn_err(zoneid, CE_CONT, 2638 MSG("^File: userid=%d, groupid=%d\n"), 2639 crgetuid(cr), crgetgid(cr)); 2640 if (crgetuid(CRED()) != crgetuid(cr) || 2641 crgetgid(CRED()) != crgetgid(cr)) { 2642 zcmn_err(zoneid, CE_CONT, 2643 MSG("^User: userid=%d, groupid=%d\n"), 2644 crgetuid(CRED()), crgetgid(CRED())); 2645 } 2646 mi->mi_printftime = now + 2647 nfs_write_error_interval * hz; 2648 } 2649 nfs_printfhandle(&VTOR(vp)->r_fh); 2650 #ifdef DEBUG 2651 if (error == EACCES) { 2652 zcmn_err(zoneid, CE_CONT, 2653 MSG("^nfs_bio: cred is%s kcred\n"), 2654 cr == kcred ? "" : " not"); 2655 } 2656 #endif 2657 } 2658 } 2659 2660 /* ARGSUSED */ 2661 static void * 2662 nfs_mi_init(zoneid_t zoneid) 2663 { 2664 struct mi_globals *mig; 2665 2666 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2667 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2668 list_create(&mig->mig_list, sizeof (mntinfo_t), 2669 offsetof(mntinfo_t, mi_zone_node)); 2670 mig->mig_destructor_called = B_FALSE; 2671 return (mig); 2672 } 2673 2674 /* 2675 * Callback routine to tell all NFS mounts in the zone to stop creating new 2676 * threads. Existing threads should exit. 2677 */ 2678 /* ARGSUSED */ 2679 static void 2680 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2681 { 2682 struct mi_globals *mig = data; 2683 mntinfo_t *mi; 2684 2685 ASSERT(mig != NULL); 2686 again: 2687 mutex_enter(&mig->mig_lock); 2688 for (mi = list_head(&mig->mig_list); mi != NULL; 2689 mi = list_next(&mig->mig_list, mi)) { 2690 2691 /* 2692 * If we've done the shutdown work for this FS, skip. 2693 * Once we go off the end of the list, we're done. 2694 */ 2695 if (mi->mi_flags & MI_DEAD) 2696 continue; 2697 2698 /* 2699 * We will do work, so not done. Get a hold on the FS. 2700 */ 2701 VFS_HOLD(mi->mi_vfsp); 2702 2703 /* 2704 * purge the DNLC for this filesystem 2705 */ 2706 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2707 2708 mutex_enter(&mi->mi_async_lock); 2709 /* 2710 * Tell existing async worker threads to exit. 2711 */ 2712 mi->mi_max_threads = 0; 2713 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2714 /* 2715 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2716 * getting ready to exit when it's done with its current work. 2717 * Also set MI_DEAD to note we've acted on this FS. 2718 */ 2719 mutex_enter(&mi->mi_lock); 2720 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD); 2721 mutex_exit(&mi->mi_lock); 2722 /* 2723 * Wake up the async manager thread. 2724 */ 2725 cv_broadcast(&mi->mi_async_reqs_cv); 2726 mutex_exit(&mi->mi_async_lock); 2727 2728 /* 2729 * Drop lock and release FS, which may change list, then repeat. 2730 * We're done when every mi has been done or the list is empty. 
2731 */ 2732 mutex_exit(&mig->mig_lock); 2733 VFS_RELE(mi->mi_vfsp); 2734 goto again; 2735 } 2736 mutex_exit(&mig->mig_lock); 2737 } 2738 2739 static void 2740 nfs_mi_free_globals(struct mi_globals *mig) 2741 { 2742 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2743 mutex_destroy(&mig->mig_lock); 2744 kmem_free(mig, sizeof (*mig)); 2745 2746 } 2747 2748 /* ARGSUSED */ 2749 static void 2750 nfs_mi_destroy(zoneid_t zoneid, void *data) 2751 { 2752 struct mi_globals *mig = data; 2753 2754 ASSERT(mig != NULL); 2755 mutex_enter(&mig->mig_lock); 2756 if (list_head(&mig->mig_list) != NULL) { 2757 /* Still waiting for VFS_FREEVFS() */ 2758 mig->mig_destructor_called = B_TRUE; 2759 mutex_exit(&mig->mig_lock); 2760 return; 2761 } 2762 nfs_mi_free_globals(mig); 2763 } 2764 2765 /* 2766 * Add an NFS mount to the per-zone list of NFS mounts. 2767 */ 2768 void 2769 nfs_mi_zonelist_add(mntinfo_t *mi) 2770 { 2771 struct mi_globals *mig; 2772 2773 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2774 mutex_enter(&mig->mig_lock); 2775 list_insert_head(&mig->mig_list, mi); 2776 mutex_exit(&mig->mig_lock); 2777 } 2778 2779 /* 2780 * Remove an NFS mount from the per-zone list of NFS mounts. 2781 */ 2782 static void 2783 nfs_mi_zonelist_remove(mntinfo_t *mi) 2784 { 2785 struct mi_globals *mig; 2786 2787 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2788 mutex_enter(&mig->mig_lock); 2789 list_remove(&mig->mig_list, mi); 2790 /* 2791 * We can be called asynchronously by VFS_FREEVFS() after the zone 2792 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2793 * mi globals. 2794 */ 2795 if (list_head(&mig->mig_list) == NULL && 2796 mig->mig_destructor_called == B_TRUE) { 2797 nfs_mi_free_globals(mig); 2798 return; 2799 } 2800 mutex_exit(&mig->mig_lock); 2801 } 2802 2803 /* 2804 * NFS Client initialization routine. This routine should only be called 2805 * once. It performs the following tasks: 2806 * - Initialize all global locks 2807 * - Call sub-initialization routines (localize access to variables) 2808 */ 2809 int 2810 nfs_clntinit(void) 2811 { 2812 #ifdef DEBUG 2813 static boolean_t nfs_clntup = B_FALSE; 2814 #endif 2815 int error; 2816 2817 #ifdef DEBUG 2818 ASSERT(nfs_clntup == B_FALSE); 2819 #endif 2820 2821 error = nfs_subrinit(); 2822 if (error) 2823 return (error); 2824 2825 error = nfs_vfsinit(); 2826 if (error) { 2827 /* 2828 * Clean up nfs_subrinit() work 2829 */ 2830 nfs_subrfini(); 2831 return (error); 2832 } 2833 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown, 2834 nfs_mi_destroy); 2835 2836 nfs4_clnt_init(); 2837 2838 nfscmd_init(); 2839 2840 #ifdef DEBUG 2841 nfs_clntup = B_TRUE; 2842 #endif 2843 2844 return (0); 2845 } 2846 2847 /* 2848 * This routine is only called if the NFS Client has been initialized but 2849 * the module failed to be installed. This routine will clean up the previously 2850 * allocated/initialized work. 2851 */ 2852 void 2853 nfs_clntfini(void) 2854 { 2855 (void) zone_key_delete(mi_list_key); 2856 nfs_subrfini(); 2857 nfs_vfsfini(); 2858 nfs4_clnt_fini(); 2859 nfscmd_fini(); 2860 } 2861 2862 /* 2863 * nfs_lockrelease: 2864 * 2865 * Release any locks on the given vnode that are held by the current 2866 * process.
2867 */ 2868 void 2869 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 2870 { 2871 flock64_t ld; 2872 struct shrlock shr; 2873 char *buf; 2874 int remote_lock_possible; 2875 int ret; 2876 2877 ASSERT((uintptr_t)vp > KERNELBASE); 2878 2879 /* 2880 * Generate an explicit unlock operation for the entire file. As a 2881 * partial optimization, only generate the unlock if there is a 2882 * lock registered for the file. We could check whether this 2883 * particular process has any locks on the file, but that would 2884 * require the local locking code to provide yet another query 2885 * routine. Note that no explicit synchronization is needed here. 2886 * At worst, flk_has_remote_locks() will return a false positive, 2887 * in which case the unlock call wastes time but doesn't harm 2888 * correctness. 2889 * 2890 * In addition, an unlock request is generated if the process 2891 * is listed as possibly having a lock on the file because the 2892 * server and client lock managers may have gotten out of sync. 2893 * N.B. It is important to make sure nfs_remove_locking_id() is 2894 * called here even if flk_has_remote_locks(vp) reports true. 2895 * If it is not called and there is an entry on the process id 2896 * list, that entry will never get removed. 2897 */ 2898 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID, 2899 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2900 if (remote_lock_possible || flk_has_remote_locks(vp)) { 2901 ld.l_type = F_UNLCK; /* set to unlock entire file */ 2902 ld.l_whence = 0; /* unlock from start of file */ 2903 ld.l_start = 0; 2904 ld.l_len = 0; /* do entire file */ 2905 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr, 2906 NULL); 2907 2908 if (ret != 0) { 2909 /* 2910 * If VOP_FRLOCK fails, make sure we unregister 2911 * local locks before we continue. 2912 */ 2913 ld.l_pid = ttoproc(curthread)->p_pid; 2914 lm_register_lock_locally(vp, NULL, &ld, flag, offset); 2915 #ifdef DEBUG 2916 nfs_perror(ret, 2917 "NFS lock release error on vp %p: %m.\n", 2918 (void *)vp, NULL); 2919 #endif 2920 } 2921 2922 /* 2923 * The call to VOP_FRLOCK may put the pid back on the 2924 * list. We need to remove it. 2925 */ 2926 (void) nfs_remove_locking_id(vp, RLMPL_PID, 2927 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2928 } 2929 2930 /* 2931 * As long as the vp has a share matching our pid, 2932 * pluck it off and unshare it. There are circumstances in 2933 * which the call to nfs_remove_locking_id() may put the 2934 * owner back on the list, in which case we simply do a 2935 * redundant and harmless unshare. 2936 */ 2937 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP); 2938 while (nfs_remove_locking_id(vp, RLMPL_OWNER, 2939 (char *)NULL, buf, &shr.s_own_len)) { 2940 shr.s_owner = buf; 2941 shr.s_access = 0; 2942 shr.s_deny = 0; 2943 shr.s_sysid = 0; 2944 shr.s_pid = curproc->p_pid; 2945 2946 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL); 2947 #ifdef DEBUG 2948 if (ret != 0) { 2949 nfs_perror(ret, 2950 "NFS share release error on vp %p: %m.\n", 2951 (void *)vp, NULL); 2952 } 2953 #endif 2954 } 2955 kmem_free(buf, MAX_SHR_OWNER_LEN); 2956 } 2957 2958 /* 2959 * nfs_lockcompletion: 2960 * 2961 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2962 * as non cachable (set VNOCACHE bit). 
2963 */ 2964 2965 void 2966 nfs_lockcompletion(vnode_t *vp, int cmd) 2967 { 2968 #ifdef DEBUG 2969 rnode_t *rp = VTOR(vp); 2970 2971 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2972 #endif 2973 2974 if (cmd == F_SETLK || cmd == F_SETLKW) { 2975 if (!lm_safemap(vp)) { 2976 mutex_enter(&vp->v_lock); 2977 vp->v_flag |= VNOCACHE; 2978 mutex_exit(&vp->v_lock); 2979 } else { 2980 mutex_enter(&vp->v_lock); 2981 vp->v_flag &= ~VNOCACHE; 2982 mutex_exit(&vp->v_lock); 2983 } 2984 } 2985 /* 2986 * The cached attributes of the file are stale after acquiring 2987 * the lock on the file. They were updated when the file was 2988 * opened, but not updated when the lock was acquired. Therefore the 2989 * cached attributes are invalidated after the lock is obtained. 2990 */ 2991 PURGE_ATTRCACHE(vp); 2992 } 2993 2994 /* 2995 * The lock manager holds state making it possible for the client 2996 * and server to be out of sync. For example, if the response from 2997 * the server granting a lock request is lost, the server will think 2998 * the lock is granted and the client will think the lock is lost. 2999 * The client can tell when it is not certain that it is in sync with 3000 * the server. 3001 * 3002 * To deal with this, a list of processes for which the client is 3003 * not sure if the server holds a lock is attached to the rnode. 3004 * When such a process closes the rnode, an unlock request is sent 3005 * to the server to unlock the entire file. 3006 * 3007 * The list is kept as a singly linked, NULL-terminated list. 3008 * Because it is only added to under extreme error conditions, the 3009 * list shouldn't get very big. DEBUG kernels print a message if 3010 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily 3011 * chosen to be 8, but can be tuned at runtime. 3012 */ 3013 #ifdef DEBUG 3014 /* int nfs_lmpl_high_water = 8; */ 3015 int nfs_lmpl_high_water = 128; 3016 int nfs_cnt_add_locking_id = 0; 3017 int nfs_len_add_locking_id = 0; 3018 #endif /* DEBUG */ 3019 3020 /* 3021 * Record that the nfs lock manager server may be holding a lock on 3022 * a vnode for a process. 3023 * 3024 * Because the nfs lock manager server holds state, it is possible 3025 * for the server to get out of sync with the client. This routine is called 3026 * from the client when it is no longer sure if the server is in sync 3027 * with the client.
nfs_lockrelease() will then notice this and send 3028 * an unlock request when the file is closed. 3029 */ 3030 void 3031 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 3032 { 3033 rnode_t *rp; 3034 lmpl_t *new; 3035 lmpl_t *cur; 3036 lmpl_t **lmplp; 3037 #ifdef DEBUG 3038 int list_len = 1; 3039 #endif /* DEBUG */ 3040 3041 #ifdef DEBUG 3042 ++nfs_cnt_add_locking_id; 3043 #endif /* DEBUG */ 3044 /* 3045 * Allocate the new lmpl_t now so we don't sleep 3046 * later after grabbing mutexes. 3047 */ 3048 ASSERT(len < MAX_SHR_OWNER_LEN); 3049 new = kmem_alloc(sizeof (*new), KM_SLEEP); 3050 new->lmpl_type = type; 3051 new->lmpl_pid = pid; 3052 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 3053 bcopy(id, new->lmpl_owner, len); 3054 new->lmpl_own_len = len; 3055 new->lmpl_next = (lmpl_t *)NULL; 3056 #ifdef DEBUG 3057 if (type == RLMPL_PID) { 3058 ASSERT(len == sizeof (pid_t)); 3059 ASSERT(pid == *(pid_t *)new->lmpl_owner); 3060 } else { 3061 ASSERT(type == RLMPL_OWNER); 3062 } 3063 #endif 3064 3065 rp = VTOR(vp); 3066 mutex_enter(&rp->r_statelock); 3067 3068 /* 3069 * Add this id to the list for this rnode only if the 3070 * rnode is active and the id is not already there. 3071 */ 3072 ASSERT(rp->r_flags & RHASHED); 3073 lmplp = &(rp->r_lmpl); 3074 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3075 if (cur->lmpl_pid == pid && 3076 cur->lmpl_type == type && 3077 cur->lmpl_own_len == len && 3078 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 3079 kmem_free(new->lmpl_owner, len); 3080 kmem_free(new, sizeof (*new)); 3081 break; 3082 } 3083 lmplp = &cur->lmpl_next; 3084 #ifdef DEBUG 3085 ++list_len; 3086 #endif /* DEBUG */ 3087 } 3088 if (cur == (lmpl_t *)NULL) { 3089 *lmplp = new; 3090 #ifdef DEBUG 3091 if (list_len > nfs_len_add_locking_id) { 3092 nfs_len_add_locking_id = list_len; 3093 } 3094 if (list_len > nfs_lmpl_high_water) { 3095 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 3096 "vp=%p is %d", (void *)vp, list_len); 3097 } 3098 #endif /* DEBUG */ 3099 } 3100 3101 #ifdef DEBUG 3102 if (share_debug) { 3103 int nitems = 0; 3104 int npids = 0; 3105 int nowners = 0; 3106 3107 /* 3108 * Count the number of things on r_lmpl after the add. 3109 */ 3110 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3111 cur = cur->lmpl_next) { 3112 nitems++; 3113 if (cur->lmpl_type == RLMPL_PID) { 3114 npids++; 3115 } else if (cur->lmpl_type == RLMPL_OWNER) { 3116 nowners++; 3117 } else { 3118 cmn_err(CE_PANIC, "nfs_add_locking_id: " 3119 "unrecognized lmpl_type %d", 3120 cur->lmpl_type); 3121 } 3122 } 3123 3124 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 3125 "OWNs = %d items left on r_lmpl\n", 3126 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 3127 } 3128 #endif 3129 3130 mutex_exit(&rp->r_statelock); 3131 } 3132 3133 /* 3134 * Remove an id from the lock manager id list. 3135 * 3136 * If the id is not in the list return 0. If it was found and 3137 * removed, return 1. 3138 */ 3139 static int 3140 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 3141 { 3142 lmpl_t *cur; 3143 lmpl_t **lmplp; 3144 rnode_t *rp; 3145 int rv = 0; 3146 3147 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 3148 3149 rp = VTOR(vp); 3150 3151 mutex_enter(&rp->r_statelock); 3152 ASSERT(rp->r_flags & RHASHED); 3153 lmplp = &(rp->r_lmpl); 3154 3155 /* 3156 * Search through the list and remove the entry for this id 3157 * if it is there.
The special case id == NULL allows removal 3158 * of the first share on the r_lmpl list belonging to the 3159 * current process (if any), without regard to further details 3160 * of its identity. 3161 */ 3162 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3163 if (cur->lmpl_type == type && 3164 cur->lmpl_pid == curproc->p_pid && 3165 (id == (char *)NULL || 3166 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 3167 *lmplp = cur->lmpl_next; 3168 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 3169 if (rid != NULL) { 3170 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 3171 *rlen = cur->lmpl_own_len; 3172 } 3173 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 3174 kmem_free(cur, sizeof (*cur)); 3175 rv = 1; 3176 break; 3177 } 3178 lmplp = &cur->lmpl_next; 3179 } 3180 3181 #ifdef DEBUG 3182 if (share_debug) { 3183 int nitems = 0; 3184 int npids = 0; 3185 int nowners = 0; 3186 3187 /* 3188 * Count the number of things left on r_lmpl after the remove. 3189 */ 3190 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3191 cur = cur->lmpl_next) { 3192 nitems++; 3193 if (cur->lmpl_type == RLMPL_PID) { 3194 npids++; 3195 } else if (cur->lmpl_type == RLMPL_OWNER) { 3196 nowners++; 3197 } else { 3198 cmn_err(CE_PANIC, 3199 "nrli: unrecognized lmpl_type %d", 3200 cur->lmpl_type); 3201 } 3202 } 3203 3204 cmn_err(CE_CONT, 3205 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3206 (type == RLMPL_PID) ? "P" : "O", 3207 npids, 3208 nowners, 3209 nitems); 3210 } 3211 #endif 3212 3213 mutex_exit(&rp->r_statelock); 3214 return (rv); 3215 } 3216 3217 void 3218 nfs_free_mi(mntinfo_t *mi) 3219 { 3220 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3221 ASSERT(mi->mi_manager_thread == NULL); 3222 ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 && 3223 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0); 3224 3225 /* 3226 * Remove the node from the global list before we start tearing it down. 3227 */ 3228 nfs_mi_zonelist_remove(mi); 3229 if (mi->mi_klmconfig) { 3230 lm_free_config(mi->mi_klmconfig); 3231 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3232 } 3233 mutex_destroy(&mi->mi_lock); 3234 mutex_destroy(&mi->mi_remap_lock); 3235 mutex_destroy(&mi->mi_async_lock); 3236 mutex_destroy(&mi->mi_rnodes_lock); 3237 cv_destroy(&mi->mi_failover_cv); 3238 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]); 3239 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]); 3240 cv_destroy(&mi->mi_async_reqs_cv); 3241 cv_destroy(&mi->mi_async_cv); 3242 list_destroy(&mi->mi_rnodes); 3243 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS); 3244 kmem_free(mi, sizeof (*mi)); 3245 } 3246 3247 static int 3248 mnt_kstat_update(kstat_t *ksp, int rw) 3249 { 3250 mntinfo_t *mi; 3251 struct mntinfo_kstat *mik; 3252 vfs_t *vfsp; 3253 int i; 3254 3255 /* this is a read-only kstat. Bail out on a write */ 3256 if (rw == KSTAT_WRITE) 3257 return (EACCES); 3258 3259 /* 3260 * We don't want to wait here as kstat_chain_lock could be held by 3261 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3262 * and thus could lead to a deadlock. 
3263 */ 3264 vfsp = (struct vfs *)ksp->ks_private; 3265 3266 3267 mi = VFTOMI(vfsp); 3268 3269 mik = (struct mntinfo_kstat *)ksp->ks_data; 3270 3271 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3272 mik->mik_vers = (uint32_t)mi->mi_vers; 3273 mik->mik_flags = mi->mi_flags; 3274 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3275 mik->mik_curread = (uint32_t)mi->mi_curread; 3276 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3277 mik->mik_retrans = mi->mi_retrans; 3278 mik->mik_timeo = mi->mi_timeo; 3279 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3280 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3281 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3282 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3283 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3284 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3285 mik->mik_timers[i].deviate = 3286 (uint32_t)mi->mi_timers[i].rt_deviate; 3287 mik->mik_timers[i].rtxcur = 3288 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3289 } 3290 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3291 mik->mik_failover = (uint32_t)mi->mi_failover; 3292 mik->mik_remap = (uint32_t)mi->mi_remap; 3293 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3294 3295 return (0); 3296 } 3297 3298 void 3299 nfs_mnt_kstat_init(struct vfs *vfsp) 3300 { 3301 mntinfo_t *mi = VFTOMI(vfsp); 3302 3303 /* 3304 * Create the version specific kstats. 3305 * 3306 * PSARC 2001/697 Contract Private Interface 3307 * All nfs kstats are under SunMC contract 3308 * Please refer to the PSARC listed above and contact 3309 * SunMC before making any changes! 3310 * 3311 * Changes must be reviewed by Solaris File Sharing 3312 * Changes must be communicated to contract-2001-697@sun.com 3313 * 3314 */ 3315 3316 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3317 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3318 if (mi->mi_io_kstats) { 3319 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3320 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3321 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3322 kstat_install(mi->mi_io_kstats); 3323 } 3324 3325 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3326 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3327 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3328 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3329 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3330 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3331 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3332 kstat_install(mi->mi_ro_kstats); 3333 } 3334 } 3335 3336 nfs_delmapcall_t * 3337 nfs_init_delmapcall() 3338 { 3339 nfs_delmapcall_t *delmap_call; 3340 3341 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3342 delmap_call->call_id = curthread; 3343 delmap_call->error = 0; 3344 3345 return (delmap_call); 3346 } 3347 3348 void 3349 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3350 { 3351 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3352 } 3353 3354 /* 3355 * Searches for the current delmap caller (based on curthread) in the list of 3356 * callers. If it is found, we remove it and free the delmap caller. 3357 * Returns: 3358 * 0 if the caller wasn't found 3359 * 1 if the caller was found, removed and freed. *errp is set to what 3360 * the result of the delmap was. 
3361 */ 3362 int 3363 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp) 3364 { 3365 nfs_delmapcall_t *delmap_call; 3366 3367 /* 3368 * If the list doesn't exist yet, we create it and return 3369 * that the caller wasn't found. No list = no callers. 3370 */ 3371 mutex_enter(&rp->r_statelock); 3372 if (!(rp->r_flags & RDELMAPLIST)) { 3373 /* The list does not exist */ 3374 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t), 3375 offsetof(nfs_delmapcall_t, call_node)); 3376 rp->r_flags |= RDELMAPLIST; 3377 mutex_exit(&rp->r_statelock); 3378 return (0); 3379 } else { 3380 /* The list exists so search it */ 3381 for (delmap_call = list_head(&rp->r_indelmap); 3382 delmap_call != NULL; 3383 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 3384 if (delmap_call->call_id == curthread) { 3385 /* current caller is in the list */ 3386 *errp = delmap_call->error; 3387 list_remove(&rp->r_indelmap, delmap_call); 3388 mutex_exit(&rp->r_statelock); 3389 nfs_free_delmapcall(delmap_call); 3390 return (1); 3391 } 3392 } 3393 } 3394 mutex_exit(&rp->r_statelock); 3395 return (0); 3396 } 3397
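/*
 * Illustrative sketch (an assumption, not part of this file and not
 * compiled): how the delmap bookkeeping above is typically consumed by a
 * delmap entry point.  The routine first asks
 * nfs_find_and_delete_delmapcall() whether this thread already has a
 * completed delmap recorded on the rnode; if so, it returns that result.
 * Otherwise it registers itself with nfs_init_delmapcall() and dispatches
 * the real work, whose completion routine would record its error in the
 * nfs_delmapcall_t before the caller comes back through the entry point.
 * The function name and the "dispatch" step are placeholders, not the
 * actual nfs_delmap()/nfs3_delmap() code.
 */
#if 0
static int
example_delmap(vnode_t *vp)
{
	rnode_t *rp = VTOR(vp);
	nfs_delmapcall_t *delmap_call;
	int error;

	/*
	 * A previous pass from this thread has finished and stored its
	 * result; hand that result back and drop the record.
	 */
	if (nfs_find_and_delete_delmapcall(rp, &error))
		return (error);

	/*
	 * First pass from this thread: register ourselves on r_indelmap.
	 * nfs_find_and_delete_delmapcall() above guaranteed that the list
	 * exists and that RDELMAPLIST is set.
	 */
	delmap_call = nfs_init_delmapcall();
	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	/*
	 * Dispatch the real delmap work here (elided); its completion
	 * routine would set delmap_call->error so a later call from this
	 * thread can retrieve it via nfs_find_and_delete_delmapcall().
	 */
	return (0);
}
#endif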