1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 28 * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
29 */ 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/cred.h> 35 #include <sys/proc.h> 36 #include <sys/user.h> 37 #include <sys/time.h> 38 #include <sys/buf.h> 39 #include <sys/vfs.h> 40 #include <sys/vnode.h> 41 #include <sys/socket.h> 42 #include <sys/uio.h> 43 #include <sys/tiuser.h> 44 #include <sys/swap.h> 45 #include <sys/errno.h> 46 #include <sys/debug.h> 47 #include <sys/kmem.h> 48 #include <sys/kstat.h> 49 #include <sys/cmn_err.h> 50 #include <sys/vtrace.h> 51 #include <sys/session.h> 52 #include <sys/dnlc.h> 53 #include <sys/bitmap.h> 54 #include <sys/acl.h> 55 #include <sys/ddi.h> 56 #include <sys/pathname.h> 57 #include <sys/flock.h> 58 #include <sys/dirent.h> 59 #include <sys/flock.h> 60 #include <sys/callb.h> 61 #include <sys/atomic.h> 62 #include <sys/list.h> 63 #include <sys/tsol/tnet.h> 64 #include <sys/priv.h> 65 #include <sys/sdt.h> 66 #include <sys/attr.h> 67 68 #include <inet/ip6.h> 69 70 #include <rpc/types.h> 71 #include <rpc/xdr.h> 72 #include <rpc/auth.h> 73 #include <rpc/clnt.h> 74 75 #include <nfs/nfs.h> 76 #include <nfs/nfs4.h> 77 #include <nfs/nfs_clnt.h> 78 #include <nfs/rnode.h> 79 #include <nfs/nfs_acl.h> 80 81 #include <sys/tsol/label.h> 82 83 /* 84 * The hash queues for the access to active and cached rnodes 85 * are organized as doubly linked lists. A reader/writer lock 86 * for each hash bucket is used to control access and to synchronize 87 * lookups, additions, and deletions from the hash queue. 88 * 89 * The rnode freelist is organized as a doubly linked list with 90 * a head pointer. Additions and deletions are synchronized via 91 * a single mutex. 92 * 93 * In order to add an rnode to the free list, it must be hashed into 94 * a hash queue and the exclusive lock to the hash queue be held. 95 * If an rnode is not hashed into a hash queue, then it is destroyed 96 * because it represents no valuable information that can be reused 97 * about the file. 
The exclusive lock to the hash queue must be 98 * held in order to prevent a lookup in the hash queue from finding 99 * the rnode and using it and assuming that the rnode is not on the 100 * freelist. The lookup in the hash queue will have the hash queue 101 * locked, either exclusive or shared. 102 * 103 * The vnode reference count for each rnode is not allowed to drop 104 * below 1. This prevents external entities, such as the VM 105 * subsystem, from acquiring references to vnodes already on the 106 * freelist and then trying to place them back on the freelist 107 * when their reference is released. This means that the when an 108 * rnode is looked up in the hash queues, then either the rnode 109 * is removed from the freelist and that reference is transferred to 110 * the new reference or the vnode reference count must be incremented 111 * accordingly. The mutex for the freelist must be held in order to 112 * accurately test to see if the rnode is on the freelist or not. 113 * The hash queue lock might be held shared and it is possible that 114 * two different threads may race to remove the rnode from the 115 * freelist. This race can be resolved by holding the mutex for the 116 * freelist. Please note that the mutex for the freelist does not 117 * need to held if the rnode is not on the freelist. It can not be 118 * placed on the freelist due to the requirement that the thread 119 * putting the rnode on the freelist must hold the exclusive lock 120 * to the hash queue and the thread doing the lookup in the hash 121 * queue is holding either a shared or exclusive lock to the hash 122 * queue. 
123 * 124 * The lock ordering is: 125 * 126 * hash bucket lock -> vnode lock 127 * hash bucket lock -> freelist lock 128 */ 129 static rhashq_t *rtable; 130 131 static kmutex_t rpfreelist_lock; 132 static rnode_t *rpfreelist = NULL; 133 static long rnew = 0; 134 long nrnode = 0; 135 136 static int rtablesize; 137 static int rtablemask; 138 139 static int hashlen = 4; 140 141 static struct kmem_cache *rnode_cache; 142 143 /* 144 * Mutex to protect the following variables: 145 * nfs_major 146 * nfs_minor 147 */ 148 kmutex_t nfs_minor_lock; 149 int nfs_major; 150 int nfs_minor; 151 152 /* Do we allow preepoch (negative) time values otw? */ 153 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */ 154 155 /* 156 * Access cache 157 */ 158 static acache_hash_t *acache; 159 static long nacache; /* used strictly to size the number of hash queues */ 160 161 static int acachesize; 162 static int acachemask; 163 static struct kmem_cache *acache_cache; 164 165 /* 166 * Client side utilities 167 */ 168 169 /* 170 * client side statistics 171 */ 172 static const struct clstat clstat_tmpl = { 173 { "calls", KSTAT_DATA_UINT64 }, 174 { "badcalls", KSTAT_DATA_UINT64 }, 175 { "clgets", KSTAT_DATA_UINT64 }, 176 { "cltoomany", KSTAT_DATA_UINT64 }, 177 #ifdef DEBUG 178 { "clalloc", KSTAT_DATA_UINT64 }, 179 { "noresponse", KSTAT_DATA_UINT64 }, 180 { "failover", KSTAT_DATA_UINT64 }, 181 { "remap", KSTAT_DATA_UINT64 }, 182 #endif 183 }; 184 185 /* 186 * The following are statistics that describe behavior of the system as a whole 187 * and doesn't correspond to any one particular zone. 
188 */ 189 #ifdef DEBUG 190 static struct clstat_debug { 191 kstat_named_t nrnode; /* number of allocated rnodes */ 192 kstat_named_t access; /* size of access cache */ 193 kstat_named_t dirent; /* size of readdir cache */ 194 kstat_named_t dirents; /* size of readdir buf cache */ 195 kstat_named_t reclaim; /* number of reclaims */ 196 kstat_named_t clreclaim; /* number of cl reclaims */ 197 kstat_named_t f_reclaim; /* number of free reclaims */ 198 kstat_named_t a_reclaim; /* number of active reclaims */ 199 kstat_named_t r_reclaim; /* number of rnode reclaims */ 200 kstat_named_t rpath; /* bytes used to store rpaths */ 201 } clstat_debug = { 202 { "nrnode", KSTAT_DATA_UINT64 }, 203 { "access", KSTAT_DATA_UINT64 }, 204 { "dirent", KSTAT_DATA_UINT64 }, 205 { "dirents", KSTAT_DATA_UINT64 }, 206 { "reclaim", KSTAT_DATA_UINT64 }, 207 { "clreclaim", KSTAT_DATA_UINT64 }, 208 { "f_reclaim", KSTAT_DATA_UINT64 }, 209 { "a_reclaim", KSTAT_DATA_UINT64 }, 210 { "r_reclaim", KSTAT_DATA_UINT64 }, 211 { "r_path", KSTAT_DATA_UINT64 }, 212 }; 213 #endif /* DEBUG */ 214 215 /* 216 * We keep a global list of per-zone client data, so we can clean up all zones 217 * if we get low on memory. 218 */ 219 static list_t nfs_clnt_list; 220 static kmutex_t nfs_clnt_list_lock; 221 static zone_key_t nfsclnt_zone_key; 222 223 static struct kmem_cache *chtab_cache; 224 225 /* 226 * Some servers do not properly update the attributes of the 227 * directory when changes are made. 
To allow interoperability 228 * with these broken servers, the nfs_disable_rddir_cache 229 * parameter must be set in /etc/system 230 */ 231 int nfs_disable_rddir_cache = 0; 232 233 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 234 struct chtab **); 235 void clfree(CLIENT *, struct chtab *); 236 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 237 struct chtab **, struct nfs_clnt *); 238 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 239 struct chtab **, struct nfs_clnt *); 240 static void clreclaim(void *); 241 static int nfs_feedback(int, int, mntinfo_t *); 242 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 243 caddr_t, cred_t *, int *, enum clnt_stat *, int, 244 failinfo_t *); 245 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 246 caddr_t, cred_t *, int *, int, failinfo_t *); 247 static void rinactive(rnode_t *, cred_t *); 248 static int rtablehash(nfs_fhandle *); 249 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 250 struct vnodeops *, 251 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 252 cred_t *), 253 int (*)(const void *, const void *), int *, cred_t *, 254 char *, char *); 255 static void rp_rmfree(rnode_t *); 256 static void rp_addhash(rnode_t *); 257 static void rp_rmhash_locked(rnode_t *); 258 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 259 static void destroy_rnode(rnode_t *); 260 static void rddir_cache_free(rddir_cache *); 261 static int nfs_free_data_reclaim(rnode_t *); 262 static int nfs_active_data_reclaim(rnode_t *); 263 static int nfs_free_reclaim(void); 264 static int nfs_active_reclaim(void); 265 static int nfs_rnode_reclaim(void); 266 static void nfs_reclaim(void *); 267 static int failover_safe(failinfo_t *); 268 static void failover_newserver(mntinfo_t *mi); 269 static void failover_thread(mntinfo_t *mi); 270 static int failover_wait(mntinfo_t *); 271 static int 
failover_remap(failinfo_t *);
static int failover_lookup(char *, vnode_t *,
    int (*)(vnode_t *, char *, vnode_t **,
	struct pathname *, int, vnode_t *, cred_t *, int),
    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
    vnode_t **);
static void nfs_free_r_path(rnode_t *);
static void nfs_set_vroot(vnode_t *);
static char *nfs_getsrvnames(mntinfo_t *, size_t *);

/*
 * from rpcsec module (common/rpcsec)
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * used in mount policy
 */
extern ts_label_t *getflabel_cipso(vfs_t *);

/*
 * EIO or EINTR are not recoverable errors.
 */
#define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))

#ifdef DEBUG
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#else
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#endif
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 *
 * On success, returns 0 with *newcl set to an initialized CLIENT (with a
 * fresh auth handle from sec_clnt_geth()) and *chp set to its cache entry;
 * the caller returns the pair via clfree()/clfree_impl().  On failure the
 * out-parameters are left NULL and an errno is returned.  Client handles
 * are cached per (program, version, transport dev_t, protocol family)
 * quadruple under nfscl->nfscl_chtable_lock.
 */
static int
clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
	struct chtab **chp, struct nfs_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	plistp = &nfscl->nfscl_chtable;
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			/* allocate outside the lock, then retry the lookup */
			mutex_exit(&nfscl->nfscl_chtable_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable_lock);
		/* the preallocated (now unneeded) head can be discarded */
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
		    &cp->ch_client->cl_auth);
		if (error || cp->ch_client->cl_auth == NULL) {
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
	mutex_exit(&nfscl->nfscl_chtable_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
	cp->ch_head = ch;

	/* allow signals to interrupt the create only for MI_INT mounts */
	sigintr(&smask, (int)ci->cl_flags & MI_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	/* replace the default auth with one from the mount's sec data */
	auth_destroy(cp->ch_client->cl_auth);
	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
	    &cp->ch_client->cl_auth);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}

/*
 * Exported wrapper around clget_impl(): resolves the calling zone's
 * per-zone NFS client state and forwards.
 */
int
clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
	struct chtab **chp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
}

/*
 * Get a client handle for the NFS_ACL_PROGRAM side protocol, retrying
 * clget_impl() on timeout/reset for hard, non-failover mounts.
 */
static int
acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
	struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If soft mount and server is down just try once.
	 * meaning: do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = NFS_ACL_PROGRAM;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g.
 * server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}

/*
 * Get a client handle for the mount's main NFS program (mi_prog),
 * sized and retried the same way as acl_clget() above: retry
 * clget_impl() on timeout/reset only for hard, non-failover mounts.
 */
static int
nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
	struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If soft mount and server is down just try once.
	 * meaning: do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}

/*
 * Return a client handle obtained from clget_impl() to its cache list,
 * releasing the auth handle and timestamping the entry so clreclaim_zone()
 * can later age it out.
 */
static void
clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
{
	if (cl->cl_auth != NULL) {
		sec_clnt_freeh(cl->cl_auth);
		cl->cl_auth = NULL;
	}

	/*
	 * Timestamp this cache entry so that we know when it was last
	 * used.
	 */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Add the free client handle to the front of the list.
	 * This way, the list will be sorted in youngest to oldest
	 * order.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable_lock);
}

/*
 * Exported wrapper around clfree_impl(): resolves the calling zone's
 * per-zone NFS client state and forwards.
 */
void
clfree(CLIENT *cl, struct chtab *cp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	clfree_impl(cl, cp, nfscl);
}

#define	CL_HOLDTIME	60	/* time to hold client handles */

/*
 * Reclaim free client handles from one zone's cache that have been idle
 * for more than cl_holdtime seconds.  Handles are unlinked under
 * nfscl_chtable_lock, then destroyed after the lock is dropped.
 */
static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.
	 * When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older then
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			/* detach the old tail and splice it onto cp */
			*cpp = NULL;
			if (cp != NULL) {
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/*
 * kmem_cache reclaim callback for chtab_cache: reclaim idle client
 * handles from every zone on the system.
 */
/* ARGSUSED */
static void
clreclaim(void *all)
{
	struct nfs_clnt *nfscl;

#ifdef DEBUG
	clstat_debug.clreclaim.value.ui64++;
#endif
	/*
	 * The system is low on memory; go through and try to reclaim some from
	 * every zone on the system.
	 */
	mutex_enter(&nfs_clnt_list_lock);
	nfscl = list_head(&nfs_clnt_list);
	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
		clreclaim_zone(nfscl, CL_HOLDTIME);
	mutex_exit(&nfs_clnt_list_lock);
}

/*
 * Minimum time-out values indexed by call type
 * These units are in "eights" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

#define	MIN_NFS_TSIZE	512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME	(hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME	(hz/3*8) /* srtt we try to keep under (scaled*8) */

/*
 * Function called when rfscall notices that we have been
 * re-transmitting, or when we get a response without retransmissions.
 * Return 1 if the transfer size was adjusted down - 0 if no change.
 */
static int
nfs_feedback(int flag, int which, mntinfo_t *mi)
{
	int kind;
	int r = 0;

	mutex_enter(&mi->mi_lock);
	if (flag == FEEDBACK_REXMIT1) {
		/*
		 * Retransmitting: halve the current read/write transfer
		 * sizes (never below MIN_NFS_TSIZE), unless the current
		 * retransmit timer is already short enough.
		 */
		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
			goto done;
		if (mi->mi_curread > MIN_NFS_TSIZE) {
			mi->mi_curread /= 2;
			if (mi->mi_curread < MIN_NFS_TSIZE)
				mi->mi_curread = MIN_NFS_TSIZE;
			r = 1;
		}

		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
			mi->mi_curwrite /= 2;
			if (mi->mi_curwrite < MIN_NFS_TSIZE)
				mi->mi_curwrite = MIN_NFS_TSIZE;
			r = 1;
		}
	} else if (flag == FEEDBACK_OK) {
		/*
		 * Response without retransmission: grow the transfer size
		 * for this timer kind (1 = read, 2 = write) back toward the
		 * mount's maximum, as long as the smoothed RTT stays low.
		 */
		kind = mi->mi_timer_type[which];
		if (kind == 0 ||
		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
			goto done;
		if (kind == 1) {
			if (mi->mi_curread >= mi->mi_tsize)
				goto done;
			mi->mi_curread += MIN_NFS_TSIZE;
			if (mi->mi_curread > mi->mi_tsize/2)
				mi->mi_curread = mi->mi_tsize;
		} else if (kind == 2) {
			if (mi->mi_curwrite >= mi->mi_stsize)
				goto done;
			mi->mi_curwrite += MIN_NFS_TSIZE;
			if (mi->mi_curwrite > mi->mi_stsize/2)
				mi->mi_curwrite = mi->mi_stsize;
		}
	}
done:
	mutex_exit(&mi->mi_lock);
	return (r);
}

#ifdef DEBUG
static int rfs2call_hits = 0;
static int rfs2call_misses = 0;
#endif

/*
 * NFS Version 2 RPC wrapper around rfscall().  On NFSERR_ACCES it may
 * retry once with a cred adjusted by crnetadjust(); RPC_PROCUNAVAIL is
 * mapped to NFSERR_OPNOTSUPP with a zero return.  Returns the rfscall()
 * errno; the NFS status is delivered through *statusp (must be non-NULL).
 */
int
rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
	xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
	enum nfsstat *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	enum clnt_stat rpc_status;

	ASSERT(statusp != NULL);

	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
	    cr, douprintf, &rpc_status, flags, fi);
	if (!rpcerror) {
		/*
		 * See crnetadjust() for comments.
		 */
		if (*statusp == NFSERR_ACCES &&
		    (cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
			rfs2call_hits++;
#endif
			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
			    resp, cr, douprintf, NULL, flags, fi);
			crfree(cr);
#ifdef DEBUG
			if (*statusp == NFSERR_ACCES)
				rfs2call_misses++;
#endif
		}
	} else if (rpc_status == RPC_PROCUNAVAIL) {
		*statusp = NFSERR_OPNOTSUPP;
		rpcerror = 0;
	}

	return (rpcerror);
}

#define	NFS3_JUKEBOX_DELAY	10 * hz

static clock_t nfs3_jukebox_delay = 0;

#ifdef DEBUG
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif

/*
 * NFS Version 3 RPC wrapper around rfscall().  Loops while the server
 * returns NFS3ERR_JUKEBOX (file temporarily unavailable), delaying
 * nfs3_jukebox_delay between attempts and notifying the user once;
 * system processes (p0) get EAGAIN instead of blocking.
 */
int
rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
	xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
	nfsstat3 *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	int user_informed;

	user_informed = 0;
	do {
		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
		    cr, douprintf, NULL, flags, fi);
		if (!rpcerror) {
			cred_t *crr;
			if (*statusp == NFS3ERR_JUKEBOX) {
				if (ttoproc(curthread) == &p0) {
					rpcerror = EAGAIN;
					break;
				}
				if (!user_informed) {
					user_informed = 1;
					uprintf(
"file temporarily unavailable on the server, retrying...\n");
				}
				delay(nfs3_jukebox_delay);
			}
			/*
			 * See crnetadjust() for comments.
902 */ 903 else if (*statusp == NFS3ERR_ACCES && 904 (crr = crnetadjust(cr)) != NULL) { 905 #ifdef DEBUG 906 rfs3call_hits++; 907 #endif 908 rpcerror = rfscall(mi, which, xdrargs, argsp, 909 xdrres, resp, crr, douprintf, 910 NULL, flags, fi); 911 912 crfree(crr); 913 #ifdef DEBUG 914 if (*statusp == NFS3ERR_ACCES) 915 rfs3call_misses++; 916 #endif 917 } 918 } 919 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 920 921 return (rpcerror); 922 } 923 924 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 925 #define INC_READERS(mi) { \ 926 mi->mi_readers++; \ 927 } 928 #define DEC_READERS(mi) { \ 929 mi->mi_readers--; \ 930 if (mi->mi_readers == 0) \ 931 cv_broadcast(&mi->mi_failover_cv); \ 932 } 933 934 static int 935 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 936 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 937 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 938 { 939 CLIENT *client; 940 struct chtab *ch; 941 cred_t *cr = icr; 942 enum clnt_stat status; 943 struct rpc_err rpcerr, rpcerr_tmp; 944 struct timeval wait; 945 int timeo; /* in units of hz */ 946 int my_rsize, my_wsize; 947 bool_t tryagain; 948 bool_t cred_cloned = FALSE; 949 k_sigset_t smask; 950 servinfo_t *svp; 951 struct nfs_clnt *nfscl; 952 zoneid_t zoneid = getzoneid(); 953 char *msg; 954 #ifdef DEBUG 955 char *bufp; 956 #endif 957 958 959 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 960 "rfscall_start:which %d mi %p", which, mi); 961 962 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 963 ASSERT(nfscl != NULL); 964 965 nfscl->nfscl_stat.calls.value.ui64++; 966 mi->mi_reqs[which].value.ui64++; 967 968 rpcerr.re_status = RPC_SUCCESS; 969 970 /* 971 * In case of forced unmount or zone shutdown, return EIO. 
972 */ 973 974 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 975 rpcerr.re_status = RPC_FAILED; 976 rpcerr.re_errno = EIO; 977 return (rpcerr.re_errno); 978 } 979 980 /* 981 * Remember the transfer sizes in case 982 * nfs_feedback changes them underneath us. 983 */ 984 my_rsize = mi->mi_curread; 985 my_wsize = mi->mi_curwrite; 986 987 /* 988 * NFS client failover support 989 * 990 * If this rnode is not in sync with the current server (VALID_FH), 991 * we'd like to do a remap to get in sync. We can be interrupted 992 * in failover_remap(), and if so we'll bail. Otherwise, we'll 993 * use the best info we have to try the RPC. Part of that is 994 * unconditionally updating the filehandle copy kept for V3. 995 * 996 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 997 * rw_enter(); we're trying to keep the current server from being 998 * changed on us until we're done with the remapping and have a 999 * matching client handle. We don't want to sending a filehandle 1000 * to the wrong host. 1001 */ 1002 failoverretry: 1003 if (FAILOVER_MOUNT(mi)) { 1004 mutex_enter(&mi->mi_lock); 1005 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1006 if (failover_wait(mi)) { 1007 mutex_exit(&mi->mi_lock); 1008 return (EINTR); 1009 } 1010 } 1011 INC_READERS(mi); 1012 mutex_exit(&mi->mi_lock); 1013 if (fi) { 1014 if (!VALID_FH(fi) && 1015 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1016 int remaperr; 1017 1018 svp = mi->mi_curr_serv; 1019 remaperr = failover_remap(fi); 1020 if (remaperr != 0) { 1021 #ifdef DEBUG 1022 if (remaperr != EINTR) 1023 nfs_cmn_err(remaperr, CE_WARN, 1024 "rfscall couldn't failover: %m"); 1025 #endif 1026 mutex_enter(&mi->mi_lock); 1027 DEC_READERS(mi); 1028 mutex_exit(&mi->mi_lock); 1029 /* 1030 * If failover_remap returns ETIMEDOUT 1031 * and the filesystem is hard mounted 1032 * we have to retry the call with a new 1033 * server. 
1034 */ 1035 if ((mi->mi_flags & MI_HARD) && 1036 IS_RECOVERABLE_ERROR(remaperr)) { 1037 if (svp == mi->mi_curr_serv) 1038 failover_newserver(mi); 1039 rpcerr.re_status = RPC_SUCCESS; 1040 goto failoverretry; 1041 } 1042 rpcerr.re_errno = remaperr; 1043 return (remaperr); 1044 } 1045 } 1046 if (fi->fhp && fi->copyproc) 1047 (*fi->copyproc)(fi->fhp, fi->vp); 1048 } 1049 } 1050 1051 /* For TSOL, use a new cred which has net_mac_aware flag */ 1052 if (!cred_cloned && is_system_labeled()) { 1053 cred_cloned = TRUE; 1054 cr = crdup(icr); 1055 (void) setpflags(NET_MAC_AWARE, 1, cr); 1056 } 1057 1058 /* 1059 * clget() calls clnt_tli_kinit() which clears the xid, so we 1060 * are guaranteed to reprocess the retry as a new request. 1061 */ 1062 svp = mi->mi_curr_serv; 1063 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1064 1065 if (FAILOVER_MOUNT(mi)) { 1066 mutex_enter(&mi->mi_lock); 1067 DEC_READERS(mi); 1068 mutex_exit(&mi->mi_lock); 1069 1070 if ((rpcerr.re_errno == ETIMEDOUT || 1071 rpcerr.re_errno == ECONNRESET) && 1072 failover_safe(fi)) { 1073 if (svp == mi->mi_curr_serv) 1074 failover_newserver(mi); 1075 goto failoverretry; 1076 } 1077 } 1078 if (rpcerr.re_errno != 0) 1079 return (rpcerr.re_errno); 1080 1081 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1082 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1083 timeo = (mi->mi_timeo * hz) / 10; 1084 } else { 1085 mutex_enter(&mi->mi_lock); 1086 timeo = CLNT_SETTIMERS(client, 1087 &(mi->mi_timers[mi->mi_timer_type[which]]), 1088 &(mi->mi_timers[NFS_CALLTYPES]), 1089 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1090 (void (*)())NULL, (caddr_t)mi, 0); 1091 mutex_exit(&mi->mi_lock); 1092 } 1093 1094 /* 1095 * If hard mounted fs, retry call forever unless hard error occurs. 
1096 */ 1097 do { 1098 tryagain = FALSE; 1099 1100 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1101 status = RPC_FAILED; 1102 rpcerr.re_status = RPC_FAILED; 1103 rpcerr.re_errno = EIO; 1104 break; 1105 } 1106 1107 TICK_TO_TIMEVAL(timeo, &wait); 1108 1109 /* 1110 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1111 * and SIGTERM. (Preserving the existing masks). 1112 * Mask out SIGINT if mount option nointr is specified. 1113 */ 1114 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1115 if (!(mi->mi_flags & MI_INT)) 1116 client->cl_nosignal = TRUE; 1117 1118 /* 1119 * If there is a current signal, then don't bother 1120 * even trying to send out the request because we 1121 * won't be able to block waiting for the response. 1122 * Simply assume RPC_INTR and get on with it. 1123 */ 1124 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1125 status = RPC_INTR; 1126 else { 1127 status = CLNT_CALL(client, which, xdrargs, argsp, 1128 xdrres, resp, wait); 1129 } 1130 1131 if (!(mi->mi_flags & MI_INT)) 1132 client->cl_nosignal = FALSE; 1133 /* 1134 * restore original signal mask 1135 */ 1136 sigunintr(&smask); 1137 1138 switch (status) { 1139 case RPC_SUCCESS: 1140 if ((mi->mi_flags & MI_DYNAMIC) && 1141 mi->mi_timer_type[which] != 0 && 1142 (mi->mi_curread != my_rsize || 1143 mi->mi_curwrite != my_wsize)) 1144 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1145 break; 1146 1147 case RPC_INTR: 1148 /* 1149 * There is no way to recover from this error, 1150 * even if mount option nointr is specified. 1151 * SIGKILL, for example, cannot be blocked. 1152 */ 1153 rpcerr.re_status = RPC_INTR; 1154 rpcerr.re_errno = EINTR; 1155 break; 1156 1157 case RPC_UDERROR: 1158 /* 1159 * If the NFS server is local (vold) and 1160 * it goes away then we get RPC_UDERROR. 1161 * This is a retryable error, so we would 1162 * loop, so check to see if the specific 1163 * error was ECONNRESET, indicating that 1164 * target did not exist at all. 
If so, 1165 * return with RPC_PROGUNAVAIL and 1166 * ECONNRESET to indicate why. 1167 */ 1168 CLNT_GETERR(client, &rpcerr); 1169 if (rpcerr.re_errno == ECONNRESET) { 1170 rpcerr.re_status = RPC_PROGUNAVAIL; 1171 rpcerr.re_errno = ECONNRESET; 1172 break; 1173 } 1174 /*FALLTHROUGH*/ 1175 1176 default: /* probably RPC_TIMEDOUT */ 1177 if (IS_UNRECOVERABLE_RPC(status)) 1178 break; 1179 1180 /* 1181 * increment server not responding count 1182 */ 1183 mutex_enter(&mi->mi_lock); 1184 mi->mi_noresponse++; 1185 mutex_exit(&mi->mi_lock); 1186 #ifdef DEBUG 1187 nfscl->nfscl_stat.noresponse.value.ui64++; 1188 #endif 1189 1190 if (!(mi->mi_flags & MI_HARD)) { 1191 if (!(mi->mi_flags & MI_SEMISOFT) || 1192 (mi->mi_ss_call_type[which] == 0)) 1193 break; 1194 } 1195 1196 /* 1197 * The call is in progress (over COTS). 1198 * Try the CLNT_CALL again, but don't 1199 * print a noisy error message. 1200 */ 1201 if (status == RPC_INPROGRESS) { 1202 tryagain = TRUE; 1203 break; 1204 } 1205 1206 if (flags & RFSCALL_SOFT) 1207 break; 1208 1209 /* 1210 * On zone shutdown, just move on. 1211 */ 1212 if (zone_status_get(curproc->p_zone) >= 1213 ZONE_IS_SHUTTING_DOWN) { 1214 rpcerr.re_status = RPC_FAILED; 1215 rpcerr.re_errno = EIO; 1216 break; 1217 } 1218 1219 /* 1220 * NFS client failover support 1221 * 1222 * If the current server just failed us, we'll 1223 * start the process of finding a new server. 1224 * After that, we can just retry. 
1225 */ 1226 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1227 if (svp == mi->mi_curr_serv) 1228 failover_newserver(mi); 1229 clfree_impl(client, ch, nfscl); 1230 goto failoverretry; 1231 } 1232 1233 tryagain = TRUE; 1234 timeo = backoff(timeo); 1235 1236 CLNT_GETERR(client, &rpcerr_tmp); 1237 if ((status == RPC_CANTSEND) && 1238 (rpcerr_tmp.re_errno == ENOBUFS)) 1239 msg = SRV_QFULL_MSG; 1240 else 1241 msg = SRV_NOTRESP_MSG; 1242 1243 mutex_enter(&mi->mi_lock); 1244 if (!(mi->mi_flags & MI_PRINTED)) { 1245 mi->mi_flags |= MI_PRINTED; 1246 mutex_exit(&mi->mi_lock); 1247 #ifdef DEBUG 1248 zprintf(zoneid, msg, mi->mi_vers, 1249 svp->sv_hostname); 1250 #else 1251 zprintf(zoneid, msg, svp->sv_hostname); 1252 #endif 1253 } else 1254 mutex_exit(&mi->mi_lock); 1255 if (*douprintf && nfs_has_ctty()) { 1256 *douprintf = 0; 1257 if (!(mi->mi_flags & MI_NOPRINT)) 1258 #ifdef DEBUG 1259 uprintf(msg, mi->mi_vers, 1260 svp->sv_hostname); 1261 #else 1262 uprintf(msg, svp->sv_hostname); 1263 #endif 1264 } 1265 1266 /* 1267 * If doing dynamic adjustment of transfer 1268 * size and if it's a read or write call 1269 * and if the transfer size changed while 1270 * retransmitting or if the feedback routine 1271 * changed the transfer size, 1272 * then exit rfscall so that the transfer 1273 * size can be adjusted at the vnops level. 1274 */ 1275 if ((mi->mi_flags & MI_DYNAMIC) && 1276 mi->mi_timer_type[which] != 0 && 1277 (mi->mi_curread != my_rsize || 1278 mi->mi_curwrite != my_wsize || 1279 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1280 /* 1281 * On read or write calls, return 1282 * back to the vnode ops level if 1283 * the transfer size changed. 1284 */ 1285 clfree_impl(client, ch, nfscl); 1286 if (cred_cloned) 1287 crfree(cr); 1288 return (ENFS_TRYAGAIN); 1289 } 1290 } 1291 } while (tryagain); 1292 1293 if (status != RPC_SUCCESS) { 1294 /* 1295 * Let soft mounts use the timed out message. 
1296 */ 1297 if (status == RPC_INPROGRESS) 1298 status = RPC_TIMEDOUT; 1299 nfscl->nfscl_stat.badcalls.value.ui64++; 1300 if (status != RPC_INTR) { 1301 mutex_enter(&mi->mi_lock); 1302 mi->mi_flags |= MI_DOWN; 1303 mutex_exit(&mi->mi_lock); 1304 CLNT_GETERR(client, &rpcerr); 1305 #ifdef DEBUG 1306 bufp = clnt_sperror(client, svp->sv_hostname); 1307 zprintf(zoneid, "NFS%d %s failed for %s\n", 1308 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1309 if (nfs_has_ctty()) { 1310 if (!(mi->mi_flags & MI_NOPRINT)) { 1311 uprintf("NFS%d %s failed for %s\n", 1312 mi->mi_vers, mi->mi_rfsnames[which], 1313 bufp); 1314 } 1315 } 1316 kmem_free(bufp, MAXPATHLEN); 1317 #else 1318 zprintf(zoneid, 1319 "NFS %s failed for server %s: error %d (%s)\n", 1320 mi->mi_rfsnames[which], svp->sv_hostname, 1321 status, clnt_sperrno(status)); 1322 if (nfs_has_ctty()) { 1323 if (!(mi->mi_flags & MI_NOPRINT)) { 1324 uprintf( 1325 "NFS %s failed for server %s: error %d (%s)\n", 1326 mi->mi_rfsnames[which], 1327 svp->sv_hostname, status, 1328 clnt_sperrno(status)); 1329 } 1330 } 1331 #endif 1332 /* 1333 * when CLNT_CALL() fails with RPC_AUTHERROR, 1334 * re_errno is set appropriately depending on 1335 * the authentication error 1336 */ 1337 if (status == RPC_VERSMISMATCH || 1338 status == RPC_PROGVERSMISMATCH) 1339 rpcerr.re_errno = EIO; 1340 } 1341 } else { 1342 /* 1343 * Test the value of mi_down and mi_printed without 1344 * holding the mi_lock mutex. If they are both zero, 1345 * then it is okay to skip the down and printed 1346 * processing. This saves on a mutex_enter and 1347 * mutex_exit pair for a normal, successful RPC. 1348 * This was just complete overhead. 
1349 */ 1350 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1351 mutex_enter(&mi->mi_lock); 1352 mi->mi_flags &= ~MI_DOWN; 1353 if (mi->mi_flags & MI_PRINTED) { 1354 mi->mi_flags &= ~MI_PRINTED; 1355 mutex_exit(&mi->mi_lock); 1356 #ifdef DEBUG 1357 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1358 zprintf(zoneid, "NFS%d server %s ok\n", 1359 mi->mi_vers, svp->sv_hostname); 1360 #else 1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1362 zprintf(zoneid, "NFS server %s ok\n", 1363 svp->sv_hostname); 1364 #endif 1365 } else 1366 mutex_exit(&mi->mi_lock); 1367 } 1368 1369 if (*douprintf == 0) { 1370 if (!(mi->mi_flags & MI_NOPRINT)) 1371 #ifdef DEBUG 1372 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1373 uprintf("NFS%d server %s ok\n", 1374 mi->mi_vers, svp->sv_hostname); 1375 #else 1376 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1377 uprintf("NFS server %s ok\n", svp->sv_hostname); 1378 #endif 1379 *douprintf = 1; 1380 } 1381 } 1382 1383 clfree_impl(client, ch, nfscl); 1384 if (cred_cloned) 1385 crfree(cr); 1386 1387 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1388 1389 if (rpc_status != NULL) 1390 *rpc_status = rpcerr.re_status; 1391 1392 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1393 rpcerr.re_errno); 1394 1395 return (rpcerr.re_errno); 1396 } 1397 1398 #ifdef DEBUG 1399 static int acl2call_hits = 0; 1400 static int acl2call_misses = 0; 1401 #endif 1402 1403 int 1404 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1405 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1406 enum nfsstat *statusp, int flags, failinfo_t *fi) 1407 { 1408 int rpcerror; 1409 1410 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1411 cr, douprintf, flags, fi); 1412 if (!rpcerror) { 1413 /* 1414 * See comments with crnetadjust(). 
1415 */ 1416 if (*statusp == NFSERR_ACCES && 1417 (cr = crnetadjust(cr)) != NULL) { 1418 #ifdef DEBUG 1419 acl2call_hits++; 1420 #endif 1421 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1422 resp, cr, douprintf, flags, fi); 1423 crfree(cr); 1424 #ifdef DEBUG 1425 if (*statusp == NFSERR_ACCES) 1426 acl2call_misses++; 1427 #endif 1428 } 1429 } 1430 1431 return (rpcerror); 1432 } 1433 1434 #ifdef DEBUG 1435 static int acl3call_hits = 0; 1436 static int acl3call_misses = 0; 1437 #endif 1438 1439 int 1440 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1441 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1442 nfsstat3 *statusp, int flags, failinfo_t *fi) 1443 { 1444 int rpcerror; 1445 int user_informed; 1446 1447 user_informed = 0; 1448 1449 do { 1450 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1451 cr, douprintf, flags, fi); 1452 if (!rpcerror) { 1453 cred_t *crr; 1454 if (*statusp == NFS3ERR_JUKEBOX) { 1455 if (!user_informed) { 1456 user_informed = 1; 1457 uprintf( 1458 "file temporarily unavailable on the server, retrying...\n"); 1459 } 1460 delay(nfs3_jukebox_delay); 1461 } 1462 /* 1463 * See crnetadjust() for comments. 
1464 */ 1465 else if (*statusp == NFS3ERR_ACCES && 1466 (crr = crnetadjust(cr)) != NULL) { 1467 #ifdef DEBUG 1468 acl3call_hits++; 1469 #endif 1470 rpcerror = aclcall(mi, which, xdrargs, argsp, 1471 xdrres, resp, crr, douprintf, flags, fi); 1472 1473 crfree(crr); 1474 #ifdef DEBUG 1475 if (*statusp == NFS3ERR_ACCES) 1476 acl3call_misses++; 1477 #endif 1478 } 1479 } 1480 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1481 1482 return (rpcerror); 1483 } 1484 1485 static int 1486 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1487 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 1488 int flags, failinfo_t *fi) 1489 { 1490 CLIENT *client; 1491 struct chtab *ch; 1492 cred_t *cr = icr; 1493 bool_t cred_cloned = FALSE; 1494 enum clnt_stat status; 1495 struct rpc_err rpcerr; 1496 struct timeval wait; 1497 int timeo; /* in units of hz */ 1498 #if 0 /* notyet */ 1499 int my_rsize, my_wsize; 1500 #endif 1501 bool_t tryagain; 1502 k_sigset_t smask; 1503 servinfo_t *svp; 1504 struct nfs_clnt *nfscl; 1505 zoneid_t zoneid = getzoneid(); 1506 #ifdef DEBUG 1507 char *bufp; 1508 #endif 1509 1510 #if 0 /* notyet */ 1511 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1512 "rfscall_start:which %d mi %p", which, mi); 1513 #endif 1514 1515 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 1516 ASSERT(nfscl != NULL); 1517 1518 nfscl->nfscl_stat.calls.value.ui64++; 1519 mi->mi_aclreqs[which].value.ui64++; 1520 1521 rpcerr.re_status = RPC_SUCCESS; 1522 1523 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1524 rpcerr.re_status = RPC_FAILED; 1525 rpcerr.re_errno = EIO; 1526 return (rpcerr.re_errno); 1527 } 1528 1529 #if 0 /* notyet */ 1530 /* 1531 * Remember the transfer sizes in case 1532 * nfs_feedback changes them underneath us. 
1533 */ 1534 my_rsize = mi->mi_curread; 1535 my_wsize = mi->mi_curwrite; 1536 #endif 1537 1538 /* 1539 * NFS client failover support 1540 * 1541 * If this rnode is not in sync with the current server (VALID_FH), 1542 * we'd like to do a remap to get in sync. We can be interrupted 1543 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1544 * use the best info we have to try the RPC. Part of that is 1545 * unconditionally updating the filehandle copy kept for V3. 1546 * 1547 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 1548 * rw_enter(); we're trying to keep the current server from being 1549 * changed on us until we're done with the remapping and have a 1550 * matching client handle. We don't want to sending a filehandle 1551 * to the wrong host. 1552 */ 1553 failoverretry: 1554 if (FAILOVER_MOUNT(mi)) { 1555 mutex_enter(&mi->mi_lock); 1556 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1557 if (failover_wait(mi)) { 1558 mutex_exit(&mi->mi_lock); 1559 return (EINTR); 1560 } 1561 } 1562 INC_READERS(mi); 1563 mutex_exit(&mi->mi_lock); 1564 if (fi) { 1565 if (!VALID_FH(fi) && 1566 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1567 int remaperr; 1568 1569 svp = mi->mi_curr_serv; 1570 remaperr = failover_remap(fi); 1571 if (remaperr != 0) { 1572 #ifdef DEBUG 1573 if (remaperr != EINTR) 1574 nfs_cmn_err(remaperr, CE_WARN, 1575 "aclcall couldn't failover: %m"); 1576 #endif 1577 mutex_enter(&mi->mi_lock); 1578 DEC_READERS(mi); 1579 mutex_exit(&mi->mi_lock); 1580 1581 /* 1582 * If failover_remap returns ETIMEDOUT 1583 * and the filesystem is hard mounted 1584 * we have to retry the call with a new 1585 * server. 
1586 */ 1587 if ((mi->mi_flags & MI_HARD) && 1588 IS_RECOVERABLE_ERROR(remaperr)) { 1589 if (svp == mi->mi_curr_serv) 1590 failover_newserver(mi); 1591 rpcerr.re_status = RPC_SUCCESS; 1592 goto failoverretry; 1593 } 1594 return (remaperr); 1595 } 1596 } 1597 if (fi->fhp && fi->copyproc) 1598 (*fi->copyproc)(fi->fhp, fi->vp); 1599 } 1600 } 1601 1602 /* For TSOL, use a new cred which has net_mac_aware flag */ 1603 if (!cred_cloned && is_system_labeled()) { 1604 cred_cloned = TRUE; 1605 cr = crdup(icr); 1606 (void) setpflags(NET_MAC_AWARE, 1, cr); 1607 } 1608 1609 /* 1610 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1611 * are guaranteed to reprocess the retry as a new request. 1612 */ 1613 svp = mi->mi_curr_serv; 1614 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1615 if (FAILOVER_MOUNT(mi)) { 1616 mutex_enter(&mi->mi_lock); 1617 DEC_READERS(mi); 1618 mutex_exit(&mi->mi_lock); 1619 1620 if ((rpcerr.re_errno == ETIMEDOUT || 1621 rpcerr.re_errno == ECONNRESET) && 1622 failover_safe(fi)) { 1623 if (svp == mi->mi_curr_serv) 1624 failover_newserver(mi); 1625 goto failoverretry; 1626 } 1627 } 1628 if (rpcerr.re_errno != 0) { 1629 if (cred_cloned) 1630 crfree(cr); 1631 return (rpcerr.re_errno); 1632 } 1633 1634 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1635 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1636 timeo = (mi->mi_timeo * hz) / 10; 1637 } else { 1638 mutex_enter(&mi->mi_lock); 1639 timeo = CLNT_SETTIMERS(client, 1640 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1641 &(mi->mi_timers[NFS_CALLTYPES]), 1642 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1643 (void (*)()) 0, (caddr_t)mi, 0); 1644 mutex_exit(&mi->mi_lock); 1645 } 1646 1647 /* 1648 * If hard mounted fs, retry call forever unless hard error occurs. 
1649 */ 1650 do { 1651 tryagain = FALSE; 1652 1653 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1654 status = RPC_FAILED; 1655 rpcerr.re_status = RPC_FAILED; 1656 rpcerr.re_errno = EIO; 1657 break; 1658 } 1659 1660 TICK_TO_TIMEVAL(timeo, &wait); 1661 1662 /* 1663 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1664 * and SIGTERM. (Preserving the existing masks). 1665 * Mask out SIGINT if mount option nointr is specified. 1666 */ 1667 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1668 if (!(mi->mi_flags & MI_INT)) 1669 client->cl_nosignal = TRUE; 1670 1671 /* 1672 * If there is a current signal, then don't bother 1673 * even trying to send out the request because we 1674 * won't be able to block waiting for the response. 1675 * Simply assume RPC_INTR and get on with it. 1676 */ 1677 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1678 status = RPC_INTR; 1679 else { 1680 status = CLNT_CALL(client, which, xdrargs, argsp, 1681 xdrres, resp, wait); 1682 } 1683 1684 if (!(mi->mi_flags & MI_INT)) 1685 client->cl_nosignal = FALSE; 1686 /* 1687 * restore original signal mask 1688 */ 1689 sigunintr(&smask); 1690 1691 switch (status) { 1692 case RPC_SUCCESS: 1693 #if 0 /* notyet */ 1694 if ((mi->mi_flags & MI_DYNAMIC) && 1695 mi->mi_timer_type[which] != 0 && 1696 (mi->mi_curread != my_rsize || 1697 mi->mi_curwrite != my_wsize)) 1698 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1699 #endif 1700 break; 1701 1702 /* 1703 * Unfortunately, there are servers in the world which 1704 * are not coded correctly. They are not prepared to 1705 * handle RPC requests to the NFS port which are not 1706 * NFS requests. Thus, they may try to process the 1707 * NFS_ACL request as if it were an NFS request. This 1708 * does not work. Generally, an error will be generated 1709 * on the client because it will not be able to decode 1710 * the response from the server. However, it seems 1711 * possible that the server may not be able to decode 1712 * the arguments. 
Thus, the criteria for deciding 1713 * whether the server supports NFS_ACL or not is whether 1714 * the following RPC errors are returned from CLNT_CALL. 1715 */ 1716 case RPC_CANTDECODERES: 1717 case RPC_PROGUNAVAIL: 1718 case RPC_CANTDECODEARGS: 1719 case RPC_PROGVERSMISMATCH: 1720 mutex_enter(&mi->mi_lock); 1721 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1722 mutex_exit(&mi->mi_lock); 1723 break; 1724 1725 /* 1726 * If the server supports NFS_ACL but not the new ops 1727 * for extended attributes, make sure we don't retry. 1728 */ 1729 case RPC_PROCUNAVAIL: 1730 mutex_enter(&mi->mi_lock); 1731 mi->mi_flags &= ~MI_EXTATTR; 1732 mutex_exit(&mi->mi_lock); 1733 break; 1734 1735 case RPC_INTR: 1736 /* 1737 * There is no way to recover from this error, 1738 * even if mount option nointr is specified. 1739 * SIGKILL, for example, cannot be blocked. 1740 */ 1741 rpcerr.re_status = RPC_INTR; 1742 rpcerr.re_errno = EINTR; 1743 break; 1744 1745 case RPC_UDERROR: 1746 /* 1747 * If the NFS server is local (vold) and 1748 * it goes away then we get RPC_UDERROR. 1749 * This is a retryable error, so we would 1750 * loop, so check to see if the specific 1751 * error was ECONNRESET, indicating that 1752 * target did not exist at all. If so, 1753 * return with RPC_PROGUNAVAIL and 1754 * ECONNRESET to indicate why. 
1755 */ 1756 CLNT_GETERR(client, &rpcerr); 1757 if (rpcerr.re_errno == ECONNRESET) { 1758 rpcerr.re_status = RPC_PROGUNAVAIL; 1759 rpcerr.re_errno = ECONNRESET; 1760 break; 1761 } 1762 /*FALLTHROUGH*/ 1763 1764 default: /* probably RPC_TIMEDOUT */ 1765 if (IS_UNRECOVERABLE_RPC(status)) 1766 break; 1767 1768 /* 1769 * increment server not responding count 1770 */ 1771 mutex_enter(&mi->mi_lock); 1772 mi->mi_noresponse++; 1773 mutex_exit(&mi->mi_lock); 1774 #ifdef DEBUG 1775 nfscl->nfscl_stat.noresponse.value.ui64++; 1776 #endif 1777 1778 if (!(mi->mi_flags & MI_HARD)) { 1779 if (!(mi->mi_flags & MI_SEMISOFT) || 1780 (mi->mi_acl_ss_call_type[which] == 0)) 1781 break; 1782 } 1783 1784 /* 1785 * The call is in progress (over COTS). 1786 * Try the CLNT_CALL again, but don't 1787 * print a noisy error message. 1788 */ 1789 if (status == RPC_INPROGRESS) { 1790 tryagain = TRUE; 1791 break; 1792 } 1793 1794 if (flags & RFSCALL_SOFT) 1795 break; 1796 1797 /* 1798 * On zone shutdown, just move on. 1799 */ 1800 if (zone_status_get(curproc->p_zone) >= 1801 ZONE_IS_SHUTTING_DOWN) { 1802 rpcerr.re_status = RPC_FAILED; 1803 rpcerr.re_errno = EIO; 1804 break; 1805 } 1806 1807 /* 1808 * NFS client failover support 1809 * 1810 * If the current server just failed us, we'll 1811 * start the process of finding a new server. 1812 * After that, we can just retry. 
1813 */ 1814 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1815 if (svp == mi->mi_curr_serv) 1816 failover_newserver(mi); 1817 clfree_impl(client, ch, nfscl); 1818 goto failoverretry; 1819 } 1820 1821 tryagain = TRUE; 1822 timeo = backoff(timeo); 1823 mutex_enter(&mi->mi_lock); 1824 if (!(mi->mi_flags & MI_PRINTED)) { 1825 mi->mi_flags |= MI_PRINTED; 1826 mutex_exit(&mi->mi_lock); 1827 #ifdef DEBUG 1828 zprintf(zoneid, 1829 "NFS_ACL%d server %s not responding still trying\n", 1830 mi->mi_vers, svp->sv_hostname); 1831 #else 1832 zprintf(zoneid, 1833 "NFS server %s not responding still trying\n", 1834 svp->sv_hostname); 1835 #endif 1836 } else 1837 mutex_exit(&mi->mi_lock); 1838 if (*douprintf && nfs_has_ctty()) { 1839 *douprintf = 0; 1840 if (!(mi->mi_flags & MI_NOPRINT)) 1841 #ifdef DEBUG 1842 uprintf( 1843 "NFS_ACL%d server %s not responding still trying\n", 1844 mi->mi_vers, svp->sv_hostname); 1845 #else 1846 uprintf( 1847 "NFS server %s not responding still trying\n", 1848 svp->sv_hostname); 1849 #endif 1850 } 1851 1852 #if 0 /* notyet */ 1853 /* 1854 * If doing dynamic adjustment of transfer 1855 * size and if it's a read or write call 1856 * and if the transfer size changed while 1857 * retransmitting or if the feedback routine 1858 * changed the transfer size, 1859 * then exit rfscall so that the transfer 1860 * size can be adjusted at the vnops level. 1861 */ 1862 if ((mi->mi_flags & MI_DYNAMIC) && 1863 mi->mi_acl_timer_type[which] != 0 && 1864 (mi->mi_curread != my_rsize || 1865 mi->mi_curwrite != my_wsize || 1866 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1867 /* 1868 * On read or write calls, return 1869 * back to the vnode ops level if 1870 * the transfer size changed. 1871 */ 1872 clfree_impl(client, ch, nfscl); 1873 if (cred_cloned) 1874 crfree(cr); 1875 return (ENFS_TRYAGAIN); 1876 } 1877 #endif 1878 } 1879 } while (tryagain); 1880 1881 if (status != RPC_SUCCESS) { 1882 /* 1883 * Let soft mounts use the timed out message. 
1884 */ 1885 if (status == RPC_INPROGRESS) 1886 status = RPC_TIMEDOUT; 1887 nfscl->nfscl_stat.badcalls.value.ui64++; 1888 if (status == RPC_CANTDECODERES || 1889 status == RPC_PROGUNAVAIL || 1890 status == RPC_PROCUNAVAIL || 1891 status == RPC_CANTDECODEARGS || 1892 status == RPC_PROGVERSMISMATCH) 1893 CLNT_GETERR(client, &rpcerr); 1894 else if (status != RPC_INTR) { 1895 mutex_enter(&mi->mi_lock); 1896 mi->mi_flags |= MI_DOWN; 1897 mutex_exit(&mi->mi_lock); 1898 CLNT_GETERR(client, &rpcerr); 1899 #ifdef DEBUG 1900 bufp = clnt_sperror(client, svp->sv_hostname); 1901 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1902 mi->mi_vers, mi->mi_aclnames[which], bufp); 1903 if (nfs_has_ctty()) { 1904 if (!(mi->mi_flags & MI_NOPRINT)) { 1905 uprintf("NFS_ACL%d %s failed for %s\n", 1906 mi->mi_vers, mi->mi_aclnames[which], 1907 bufp); 1908 } 1909 } 1910 kmem_free(bufp, MAXPATHLEN); 1911 #else 1912 zprintf(zoneid, 1913 "NFS %s failed for server %s: error %d (%s)\n", 1914 mi->mi_aclnames[which], svp->sv_hostname, 1915 status, clnt_sperrno(status)); 1916 if (nfs_has_ctty()) { 1917 if (!(mi->mi_flags & MI_NOPRINT)) 1918 uprintf( 1919 "NFS %s failed for server %s: error %d (%s)\n", 1920 mi->mi_aclnames[which], 1921 svp->sv_hostname, status, 1922 clnt_sperrno(status)); 1923 } 1924 #endif 1925 /* 1926 * when CLNT_CALL() fails with RPC_AUTHERROR, 1927 * re_errno is set appropriately depending on 1928 * the authentication error 1929 */ 1930 if (status == RPC_VERSMISMATCH || 1931 status == RPC_PROGVERSMISMATCH) 1932 rpcerr.re_errno = EIO; 1933 } 1934 } else { 1935 /* 1936 * Test the value of mi_down and mi_printed without 1937 * holding the mi_lock mutex. If they are both zero, 1938 * then it is okay to skip the down and printed 1939 * processing. This saves on a mutex_enter and 1940 * mutex_exit pair for a normal, successful RPC. 1941 * This was just complete overhead. 
1942 */ 1943 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1944 mutex_enter(&mi->mi_lock); 1945 mi->mi_flags &= ~MI_DOWN; 1946 if (mi->mi_flags & MI_PRINTED) { 1947 mi->mi_flags &= ~MI_PRINTED; 1948 mutex_exit(&mi->mi_lock); 1949 #ifdef DEBUG 1950 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1951 mi->mi_vers, svp->sv_hostname); 1952 #else 1953 zprintf(zoneid, "NFS server %s ok\n", 1954 svp->sv_hostname); 1955 #endif 1956 } else 1957 mutex_exit(&mi->mi_lock); 1958 } 1959 1960 if (*douprintf == 0) { 1961 if (!(mi->mi_flags & MI_NOPRINT)) 1962 #ifdef DEBUG 1963 uprintf("NFS_ACL%d server %s ok\n", 1964 mi->mi_vers, svp->sv_hostname); 1965 #else 1966 uprintf("NFS server %s ok\n", svp->sv_hostname); 1967 #endif 1968 *douprintf = 1; 1969 } 1970 } 1971 1972 clfree_impl(client, ch, nfscl); 1973 if (cred_cloned) 1974 crfree(cr); 1975 1976 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1977 1978 #if 0 /* notyet */ 1979 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1980 rpcerr.re_errno); 1981 #endif 1982 1983 return (rpcerr.re_errno); 1984 } 1985 1986 int 1987 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1988 { 1989 uint_t mask = vap->va_mask; 1990 1991 if (!(mask & AT_MODE)) 1992 sa->sa_mode = (uint32_t)-1; 1993 else 1994 sa->sa_mode = vap->va_mode; 1995 if (!(mask & AT_UID)) 1996 sa->sa_uid = (uint32_t)-1; 1997 else 1998 sa->sa_uid = (uint32_t)vap->va_uid; 1999 if (!(mask & AT_GID)) 2000 sa->sa_gid = (uint32_t)-1; 2001 else 2002 sa->sa_gid = (uint32_t)vap->va_gid; 2003 if (!(mask & AT_SIZE)) 2004 sa->sa_size = (uint32_t)-1; 2005 else 2006 sa->sa_size = (uint32_t)vap->va_size; 2007 if (!(mask & AT_ATIME)) 2008 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 2009 else { 2010 /* check time validity */ 2011 if (! 
NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2012 return (EOVERFLOW); 2013 } 2014 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2015 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2016 } 2017 if (!(mask & AT_MTIME)) 2018 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2019 else { 2020 /* check time validity */ 2021 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2022 return (EOVERFLOW); 2023 } 2024 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2025 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2026 } 2027 return (0); 2028 } 2029 2030 int 2031 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2032 { 2033 uint_t mask = vap->va_mask; 2034 2035 if (!(mask & AT_MODE)) 2036 sa->mode.set_it = FALSE; 2037 else { 2038 sa->mode.set_it = TRUE; 2039 sa->mode.mode = (mode3)vap->va_mode; 2040 } 2041 if (!(mask & AT_UID)) 2042 sa->uid.set_it = FALSE; 2043 else { 2044 sa->uid.set_it = TRUE; 2045 sa->uid.uid = (uid3)vap->va_uid; 2046 } 2047 if (!(mask & AT_GID)) 2048 sa->gid.set_it = FALSE; 2049 else { 2050 sa->gid.set_it = TRUE; 2051 sa->gid.gid = (gid3)vap->va_gid; 2052 } 2053 if (!(mask & AT_SIZE)) 2054 sa->size.set_it = FALSE; 2055 else { 2056 sa->size.set_it = TRUE; 2057 sa->size.size = (size3)vap->va_size; 2058 } 2059 if (!(mask & AT_ATIME)) 2060 sa->atime.set_it = DONT_CHANGE; 2061 else { 2062 /* check time validity */ 2063 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2064 return (EOVERFLOW); 2065 } 2066 sa->atime.set_it = SET_TO_CLIENT_TIME; 2067 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2068 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2069 } 2070 if (!(mask & AT_MTIME)) 2071 sa->mtime.set_it = DONT_CHANGE; 2072 else { 2073 /* check time validity */ 2074 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2075 return (EOVERFLOW); 2076 } 2077 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2078 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2079 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2080 } 2081 return (0); 2082 } 2083 2084 void 2085 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2086 { 2087 2088 da->da_fhandle = VTOFH(dvp); 2089 da->da_name = nm; 2090 da->da_flags = 0; 2091 } 2092 2093 void 2094 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2095 { 2096 2097 da->dirp = VTOFH3(dvp); 2098 da->name = nm; 2099 } 2100 2101 int 2102 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2103 { 2104 int error; 2105 rnode_t *rp; 2106 struct vattr va; 2107 2108 va.va_mask = AT_MODE | AT_GID; 2109 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2110 if (error) 2111 return (error); 2112 2113 /* 2114 * To determine the expected group-id of the created file: 2115 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2116 * GRPID option, and the directory's set-gid bit is clear, 2117 * then use the process's gid. 2118 * 2) Otherwise, set the group-id to the gid of the parent directory. 2119 */ 2120 rp = VTOR(dvp); 2121 mutex_enter(&rp->r_statelock); 2122 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2123 *gidp = crgetgid(cr); 2124 else 2125 *gidp = va.va_gid; 2126 mutex_exit(&rp->r_statelock); 2127 return (0); 2128 } 2129 2130 int 2131 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2132 { 2133 int error; 2134 struct vattr va; 2135 2136 va.va_mask = AT_MODE; 2137 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2138 if (error) 2139 return (error); 2140 2141 /* 2142 * Modify the expected mode (om) so that the set-gid bit matches 2143 * that of the parent directory (dvp). 
2144 */ 2145 if (va.va_mode & VSGID) 2146 *omp |= VSGID; 2147 else 2148 *omp &= ~VSGID; 2149 return (0); 2150 } 2151 2152 void 2153 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2154 { 2155 2156 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2157 if (!(vp->v_flag & VSWAPLIKE)) { 2158 mutex_enter(&vp->v_lock); 2159 vp->v_flag |= VSWAPLIKE; 2160 mutex_exit(&vp->v_lock); 2161 } 2162 } else { 2163 if (vp->v_flag & VSWAPLIKE) { 2164 mutex_enter(&vp->v_lock); 2165 vp->v_flag &= ~VSWAPLIKE; 2166 mutex_exit(&vp->v_lock); 2167 } 2168 } 2169 } 2170 2171 /* 2172 * Free the resources associated with an rnode. 2173 */ 2174 static void 2175 rinactive(rnode_t *rp, cred_t *cr) 2176 { 2177 vnode_t *vp; 2178 cred_t *cred; 2179 char *contents; 2180 int size; 2181 vsecattr_t *vsp; 2182 int error; 2183 nfs3_pathconf_info *info; 2184 2185 /* 2186 * Before freeing anything, wait until all asynchronous 2187 * activity is done on this rnode. This will allow all 2188 * asynchronous read ahead and write behind i/o's to 2189 * finish. 2190 */ 2191 mutex_enter(&rp->r_statelock); 2192 while (rp->r_count > 0) 2193 cv_wait(&rp->r_cv, &rp->r_statelock); 2194 mutex_exit(&rp->r_statelock); 2195 2196 /* 2197 * Flush and invalidate all pages associated with the vnode. 2198 */ 2199 vp = RTOV(rp); 2200 if (vn_has_cached_data(vp)) { 2201 ASSERT(vp->v_type != VCHR); 2202 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2203 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2204 if (error && (error == ENOSPC || error == EDQUOT)) { 2205 mutex_enter(&rp->r_statelock); 2206 if (!rp->r_error) 2207 rp->r_error = error; 2208 mutex_exit(&rp->r_statelock); 2209 } 2210 } 2211 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2212 } 2213 2214 /* 2215 * Free any held credentials and caches which may be associated 2216 * with this rnode. 
2217 */ 2218 mutex_enter(&rp->r_statelock); 2219 cred = rp->r_cred; 2220 rp->r_cred = NULL; 2221 contents = rp->r_symlink.contents; 2222 size = rp->r_symlink.size; 2223 rp->r_symlink.contents = NULL; 2224 vsp = rp->r_secattr; 2225 rp->r_secattr = NULL; 2226 info = rp->r_pathconf; 2227 rp->r_pathconf = NULL; 2228 mutex_exit(&rp->r_statelock); 2229 2230 /* 2231 * Free the held credential. 2232 */ 2233 if (cred != NULL) 2234 crfree(cred); 2235 2236 /* 2237 * Free the access cache entries. 2238 */ 2239 (void) nfs_access_purge_rp(rp); 2240 2241 /* 2242 * Free the readdir cache entries. 2243 */ 2244 if (HAVE_RDDIR_CACHE(rp)) 2245 nfs_purge_rddir_cache(vp); 2246 2247 /* 2248 * Free the symbolic link cache. 2249 */ 2250 if (contents != NULL) { 2251 2252 kmem_free((void *)contents, size); 2253 } 2254 2255 /* 2256 * Free any cached ACL. 2257 */ 2258 if (vsp != NULL) 2259 nfs_acl_free(vsp); 2260 2261 /* 2262 * Free any cached pathconf information. 2263 */ 2264 if (info != NULL) 2265 kmem_free(info, sizeof (*info)); 2266 } 2267 2268 /* 2269 * Return a vnode for the given NFS Version 2 file handle. 2270 * If no rnode exists for this fhandle, create one and put it 2271 * into the hash queues. If the rnode for this fhandle 2272 * already exists, return it. 2273 * 2274 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 
2275 */ 2276 vnode_t * 2277 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2278 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2279 { 2280 int newnode; 2281 int index; 2282 vnode_t *vp; 2283 nfs_fhandle nfh; 2284 vattr_t va; 2285 2286 nfh.fh_len = NFS_FHSIZE; 2287 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2288 2289 index = rtablehash(&nfh); 2290 rw_enter(&rtable[index].r_lock, RW_READER); 2291 2292 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2293 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2294 2295 if (attr != NULL) { 2296 if (!newnode) { 2297 rw_exit(&rtable[index].r_lock); 2298 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2299 } else { 2300 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2301 vp->v_type = VBAD; 2302 else 2303 vp->v_type = n2v_type(attr); 2304 /* 2305 * A translation here seems to be necessary 2306 * because this function can be called 2307 * with `attr' that has come from the wire, 2308 * and been operated on by vattr_to_nattr(). 2309 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2310 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2311 * ->makenfsnode(). 2312 */ 2313 if ((attr->na_rdev & 0xffff0000) == 0) 2314 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2315 else 2316 vp->v_rdev = expldev(n2v_rdev(attr)); 2317 nfs_attrcache(vp, attr, t); 2318 rw_exit(&rtable[index].r_lock); 2319 } 2320 } else { 2321 if (newnode) { 2322 PURGE_ATTRCACHE(vp); 2323 } 2324 rw_exit(&rtable[index].r_lock); 2325 } 2326 2327 return (vp); 2328 } 2329 2330 /* 2331 * Return a vnode for the given NFS Version 3 file handle. 2332 * If no rnode exists for this fhandle, create one and put it 2333 * into the hash queues. If the rnode for this fhandle 2334 * already exists, return it. 2335 * 2336 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 
 */
vnode_t *
makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
    cred_t *cr, char *dnm, char *nm)
{
	int newnode;
	int index;
	vnode_t *vp;

	index = rtablehash((nfs_fhandle *)fh);
	rw_enter(&rtable[index].r_lock, RW_READER);

	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
	    dnm, nm);

	if (vap == NULL) {
		if (newnode) {
			/* No attributes supplied; nothing to trust. */
			PURGE_ATTRCACHE(vp);
		}
		rw_exit(&rtable[index].r_lock);
		return (vp);
	}

	if (!newnode) {
		/*
		 * Existing rnode: drop the bucket lock before
		 * reconciling the cached attributes.
		 */
		rw_exit(&rtable[index].r_lock);
		nfs_attr_cache(vp, vap, t, cr);
	} else {
		rnode_t *rp = VTOR(vp);

		vp->v_type = vap->va_type;
		vp->v_rdev = vap->va_rdev;

		/*
		 * Skip the attribute cache update if something newer
		 * than time `t' has already been cached (r_mtime > t).
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, vap);
		mutex_exit(&rp->r_statelock);
		rw_exit(&rtable[index].r_lock);
	}

	return (vp);
}

/*
 * Like makenfs3node_va(), but takes on-the-wire fattr3 attributes and
 * converts/caches them directly (see nfs3_cache_fattr3()/nfs3_attrcache()).
 */
vnode_t *
makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
    cred_t *cr, char *dnm, char *nm)
{
	int newnode;
	int index;
	vnode_t *vp;
	vattr_t va;

	index = rtablehash((nfs_fhandle *)fh);
	rw_enter(&rtable[index].r_lock, RW_READER);

	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
	    dnm, nm);

	if (attr == NULL) {
		if (newnode) {
			/* No attributes supplied; nothing to trust. */
			PURGE_ATTRCACHE(vp);
		}
		rw_exit(&rtable[index].r_lock);
		return (vp);
	}

	if (!newnode) {
		rw_exit(&rtable[index].r_lock);
		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
	} else {
		/* Reject wire file types outside the NFSv3 range. */
		if (attr->type < NF3REG || attr->type > NF3FIFO)
			vp->v_type = VBAD;
		else
			vp->v_type = nf3_to_vt[attr->type];
		vp->v_rdev = makedevice(attr->rdev.specdata1,
		    attr->rdev.specdata2);
		nfs3_attrcache(vp, attr, t);
		rw_exit(&rtable[index].r_lock);
	}

	return (vp);
}

/*
 * Read this comment before making changes to rtablehash()!
 * This is a hash function in which seemingly obvious and harmless
 * changes can cause escalations costing million dollars!
 * Know what you are doing.
 *
 * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
 * algorithm is currently detailed here:
 *
 *	http://burtleburtle.net/bob/hash/doobs.html
 *
 * Of course, the above link may not be valid by the time you are reading
 * this, but suffice it to say that the one-at-a-time algorithm works well in
 * almost all cases.  If you are changing the algorithm be sure to verify that
 * the hash algorithm still provides even distribution in all cases and with
 * any server returning filehandles in whatever order (sequential or random).
 */
static int
rtablehash(nfs_fhandle *fh)
{
	ulong_t hash, len, i;
	char *key;

	key = fh->fh_buf;
	len = (ulong_t)fh->fh_len;
	for (hash = 0, i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & rtablemask);
}

/*
 * Find or create an rnode for the filehandle `fh' in hash bucket `rhtp'.
 *
 * Entered with rhtp->r_lock held as reader.  On return with *newnode == 0
 * (existing rnode found) the bucket lock is again held as reader; on return
 * with *newnode == 1 the lock has been upgraded and is held as writer.  In
 * both cases the caller is responsible for the eventual rw_exit().
 *
 * When the number of allocated rnodes has reached `nrnode', an rnode is
 * recycled from the head of the freelist instead of allocating a new one.
 */
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	rnode_t *rp;
	rnode_t *trp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI(vfsp);
start:
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(rp);
		nfs_set_vroot(vp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		/*
		 * At the rnode limit: recycle the rnode at the head of
		 * the freelist rather than allocating a new one.
		 */
		rp = rpfreelist;
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		vp = RTOV(rp);

		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/*
				 * Someone else grabbed a reference to
				 * this vnode; it is no longer reusable.
				 * Drop our reference and retry.
				 */
				VN_RELE_LOCKED(vp);
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck after rinactive(): a reference could have
		 * been acquired while no locks were held.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);
		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);
		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		/*
		 * Below the limit (or the freelist is empty):
		 * allocate a fresh rnode and vnode pair.
		 */
		vnode_t *new_vp;

		mutex_exit(&rpfreelist_lock);

		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_inc_ulong((ulong_t *)&rnew);
#ifdef DEBUG
		clstat_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			char *s, *p;
			uint_t len;

			/* "<dnm>/<nm>\0" */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += len;
#endif
			s = rp->r_path;
			for (p = dnm; *p; p++)
				*s++ = *p;
			*s++ = '/';
			for (p = nm; *p; p++)
				*s++ = *p;
			*s = '\0';
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += 2;
#endif
			*rp->r_path = '.';
			*(rp->r_path + 1) = '\0';
		}
	}
	VFS_HOLD(vfsp);
	rp->r_putapage = putapage;
	rp->r_hashq = rhtp;
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	vp->v_type = VNON;
	vp->v_flag |= VMODSORT;
	nfs_set_vroot(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		/*
		 * Lost the race: use the rnode the winner hashed in
		 * and retire the one we just built via rp_addfree().
		 */
		vp = RTOV(trp);
		nfs_set_vroot(vp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	rp_addhash(rp);
	*newnode = 1;
	return (vp);
}

/*
 * Callback function to check if the page should be marked as
 * modified.  In the positive case, p_fsdata is set to C_NOCOMMIT.
 */
int
nfs_setmod_check(page_t *pp)
{
	if (pp->p_fsdata != C_NOCOMMIT) {
		pp->p_fsdata = C_NOCOMMIT;
		return (1);
	}
	return (0);
}

/*
 * Set VROOT on the vnode if its filehandle matches the server's root
 * filehandle (sv_fhandle).  v_flag is only modified under v_lock.
 */
static void
nfs_set_vroot(vnode_t *vp)
{
	rnode_t *rp;
	nfs_fhandle *rootfh;

	rp = VTOR(vp);
	rootfh = &rp->r_server->sv_fhandle;
	if (rootfh->fh_len == rp->r_fh.fh_len &&
	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
		if (!(vp->v_flag & VROOT)) {
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VROOT;
			mutex_exit(&vp->v_lock);
		}
	}
}

/*
 * Free the failover pathname stashed in rp->r_path (if any) and clear
 * the pointer.  Also maintains the DEBUG rpath byte-count statistic.
 */
static void
nfs_free_r_path(rnode_t *rp)
{
	char *path;
	size_t len;

	path = rp->r_path;
	if (path) {
		rp->r_path = NULL;
		len = strlen(path) + 1;
		kmem_free(path, len);
#ifdef DEBUG
		clstat_debug.rpath.value.ui64 -= len;
#endif
	}
}

/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/*
				 * Someone else holds a reference; just
				 * drop ours and let them finish up.
				 */
				VN_RELE_LOCKED(vp);
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rpfreelist = rp;
	} else {
		/* Insert at the tail (just before the current head). */
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			rpfreelist = rp;
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
}

/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rpfreelist_lock and the rnode
 * must be on the freelist.
2796 */ 2797 static void 2798 rp_rmfree(rnode_t *rp) 2799 { 2800 2801 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2802 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2803 2804 if (rp == rpfreelist) { 2805 rpfreelist = rp->r_freef; 2806 if (rp == rpfreelist) 2807 rpfreelist = NULL; 2808 } 2809 2810 rp->r_freeb->r_freef = rp->r_freef; 2811 rp->r_freef->r_freeb = rp->r_freeb; 2812 2813 rp->r_freef = rp->r_freeb = NULL; 2814 } 2815 2816 /* 2817 * Put a rnode in the hash table. 2818 * 2819 * The caller must be holding the exclusive hash queue lock. 2820 */ 2821 static void 2822 rp_addhash(rnode_t *rp) 2823 { 2824 mntinfo_t *mi; 2825 2826 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2827 ASSERT(!(rp->r_flags & RHASHED)); 2828 2829 rp->r_hashf = rp->r_hashq->r_hashf; 2830 rp->r_hashq->r_hashf = rp; 2831 rp->r_hashb = (rnode_t *)rp->r_hashq; 2832 rp->r_hashf->r_hashb = rp; 2833 2834 mutex_enter(&rp->r_statelock); 2835 rp->r_flags |= RHASHED; 2836 mutex_exit(&rp->r_statelock); 2837 2838 mi = VTOMI(RTOV(rp)); 2839 mutex_enter(&mi->mi_rnodes_lock); 2840 list_insert_tail(&mi->mi_rnodes, rp); 2841 mutex_exit(&mi->mi_rnodes_lock); 2842 } 2843 2844 /* 2845 * Remove a rnode from the hash table. 2846 * 2847 * The caller must be holding the hash queue lock. 2848 */ 2849 static void 2850 rp_rmhash_locked(rnode_t *rp) 2851 { 2852 mntinfo_t *mi; 2853 2854 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2855 ASSERT(rp->r_flags & RHASHED); 2856 2857 rp->r_hashb->r_hashf = rp->r_hashf; 2858 rp->r_hashf->r_hashb = rp->r_hashb; 2859 2860 mutex_enter(&rp->r_statelock); 2861 rp->r_flags &= ~RHASHED; 2862 mutex_exit(&rp->r_statelock); 2863 2864 mi = VTOMI(RTOV(rp)); 2865 mutex_enter(&mi->mi_rnodes_lock); 2866 if (list_link_active(&rp->r_mi_link)) 2867 list_remove(&mi->mi_rnodes, rp); 2868 mutex_exit(&mi->mi_rnodes_lock); 2869 } 2870 2871 /* 2872 * Remove a rnode from the hash table. 2873 * 2874 * The caller must not be holding the hash queue lock. 
2875 */ 2876 void 2877 rp_rmhash(rnode_t *rp) 2878 { 2879 2880 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2881 rp_rmhash_locked(rp); 2882 rw_exit(&rp->r_hashq->r_lock); 2883 } 2884 2885 /* 2886 * Lookup a rnode by fhandle. 2887 * 2888 * The caller must be holding the hash queue lock, either shared or exclusive. 2889 */ 2890 static rnode_t * 2891 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2892 { 2893 rnode_t *rp; 2894 vnode_t *vp; 2895 2896 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2897 2898 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2899 vp = RTOV(rp); 2900 if (vp->v_vfsp == vfsp && 2901 rp->r_fh.fh_len == fh->fh_len && 2902 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2903 /* 2904 * remove rnode from free list, if necessary. 2905 */ 2906 if (rp->r_freef != NULL) { 2907 mutex_enter(&rpfreelist_lock); 2908 /* 2909 * If the rnode is on the freelist, 2910 * then remove it and use that reference 2911 * as the new reference. Otherwise, 2912 * need to increment the reference count. 2913 */ 2914 if (rp->r_freef != NULL) { 2915 rp_rmfree(rp); 2916 mutex_exit(&rpfreelist_lock); 2917 } else { 2918 mutex_exit(&rpfreelist_lock); 2919 VN_HOLD(vp); 2920 } 2921 } else 2922 VN_HOLD(vp); 2923 return (rp); 2924 } 2925 } 2926 return (NULL); 2927 } 2928 2929 /* 2930 * Return 1 if there is an active vnode belonging to this vfs in the 2931 * rtable cache. 2932 * 2933 * Several of these checks are done without holding the usual 2934 * locks. This is safe because destroy_rtable(), rp_addfree(), 2935 * etc. will redo the necessary checks before actually destroying 2936 * any rnodes. 
2937 */ 2938 int 2939 check_rtable(struct vfs *vfsp) 2940 { 2941 rnode_t *rp; 2942 vnode_t *vp; 2943 mntinfo_t *mi; 2944 2945 ASSERT(vfsp != NULL); 2946 mi = VFTOMI(vfsp); 2947 2948 mutex_enter(&mi->mi_rnodes_lock); 2949 for (rp = list_head(&mi->mi_rnodes); rp != NULL; 2950 rp = list_next(&mi->mi_rnodes, rp)) { 2951 vp = RTOV(rp); 2952 2953 if (rp->r_freef == NULL || 2954 (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) || 2955 rp->r_count > 0) { 2956 mutex_exit(&mi->mi_rnodes_lock); 2957 return (1); 2958 } 2959 } 2960 mutex_exit(&mi->mi_rnodes_lock); 2961 2962 return (0); 2963 } 2964 2965 /* 2966 * Destroy inactive vnodes from the hash queues which belong to this 2967 * vfs. It is essential that we destroy all inactive vnodes during a 2968 * forced unmount as well as during a normal unmount. 2969 */ 2970 void 2971 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2972 { 2973 rnode_t *rp; 2974 mntinfo_t *mi; 2975 2976 ASSERT(vfsp != NULL); 2977 2978 mi = VFTOMI(vfsp); 2979 2980 mutex_enter(&rpfreelist_lock); 2981 mutex_enter(&mi->mi_rnodes_lock); 2982 while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) { 2983 /* 2984 * If the rnode is no longer on the freelist it is not 2985 * ours and it will be handled by some other thread, so 2986 * skip it. 2987 */ 2988 if (rp->r_freef == NULL) 2989 continue; 2990 mutex_exit(&mi->mi_rnodes_lock); 2991 2992 rp_rmfree(rp); 2993 mutex_exit(&rpfreelist_lock); 2994 2995 rp_rmhash(rp); 2996 2997 /* 2998 * This call to rp_addfree will end up destroying the 2999 * rnode, but in a safe way with the appropriate set 3000 * of checks done. 3001 */ 3002 rp_addfree(rp, cr); 3003 3004 mutex_enter(&rpfreelist_lock); 3005 mutex_enter(&mi->mi_rnodes_lock); 3006 } 3007 mutex_exit(&mi->mi_rnodes_lock); 3008 mutex_exit(&rpfreelist_lock); 3009 } 3010 3011 /* 3012 * This routine destroys all the resources associated with the rnode 3013 * and then the rnode itself. 
 */
static void
destroy_rnode(rnode_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	vp = RTOV(rp);
	vfsp = vp->v_vfsp;

	/* Caller must hold the only reference and have unhashed the rnode. */
	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_lmpl == NULL);
	ASSERT(rp->r_mapcnt == 0);
	ASSERT(!(rp->r_flags & RHASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	atomic_dec_ulong((ulong_t *)&rnew);
#ifdef DEBUG
	clstat_debug.nrnode.value.ui64--;
#endif
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	if (rp->r_flags & RDELMAPLIST)
		list_destroy(&rp->r_indelmap);
	nfs_free_r_path(rp);
	avl_destroy(&rp->r_dir);
	vn_invalid(vp);
	vn_free(vp);
	kmem_cache_free(rnode_cache, rp);
	/* Drop the hold taken on the vfs when the rnode was created. */
	VFS_RELE(vfsp);
}

/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 */
void
rflush(struct vfs *vfsp, cred_t *cr)
{
	int index;
	rnode_t *rp;
	vnode_t *vp, **vplist;
	long num, cnt;

	/*
	 * Check to see whether there is anything to do.
	 * NOTE(review): `rnew' is sampled without a lock here; the
	 * list size may change while we run, which the cnt == num
	 * checks below tolerate by stopping early.
	 */
	num = rnew;
	if (num == 0)
		return;

	/*
	 * Allocate a slot for all currently active rnodes on the
	 * supposition that they all may need flushing.
	 */
	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
	cnt = 0;

	/*
	 * If the vfs is known we can do fast path by iterating all rnodes that
	 * belongs to this vfs.  This is much faster than the traditional way
	 * of iterating rtable (below) in a case there is a lot of rnodes that
	 * does not belong to our vfs.
	 */
	if (vfsp != NULL) {
		mntinfo_t *mi = VFTOMI(vfsp);

		mutex_enter(&mi->mi_rnodes_lock);
		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
		    rp = list_next(&mi->mi_rnodes, rp)) {
			vp = RTOV(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			ASSERT(vp->v_vfsp == vfsp);
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					/*
					 * The vplist is full because there is
					 * too many rnodes.  We are done for
					 * now.
					 */
					break;
				}
			}
		}
		mutex_exit(&mi->mi_rnodes_lock);

		goto done;
	}

	ASSERT(vfsp == NULL);

	/*
	 * Walk the hash queues looking for rnodes with page
	 * lists associated with them.  Make a list of these
	 * files.
	 */
	for (index = 0; index < rtablesize; index++) {
		rw_enter(&rtable[index].r_lock, RW_READER);
		for (rp = rtable[index].r_hashf;
		    rp != (rnode_t *)(&rtable[index]);
		    rp = rp->r_hashf) {
			vp = RTOV(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					rw_exit(&rtable[index].r_lock);
					/*
					 * The vplist is full because there is
					 * too many rnodes.  We are done for
					 * now.
					 */
					goto done;
				}
			}
		}
		rw_exit(&rtable[index].r_lock);
	}

done:

	/*
	 * Flush and release all of the files on the list.
	 */
	while (cnt-- > 0) {
		vp = vplist[cnt];
		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
		VN_RELE(vp);
	}

	/*
	 * Free the space allocated to hold the list.
	 */
	kmem_free(vplist, num * sizeof (*vplist));
}

/*
 * This probably needs to be larger than or equal to
 * log2(sizeof (struct rnode)) due to the way that rnodes are
 * allocated.
 */
#define	ACACHE_SHIFT_BITS	9

/*
 * Hash an (rnode, credential) pair into the access cache table.
 * The rnode's address is shifted to discard allocation-alignment
 * bits before mixing in the caller's uid.
 */
static int
acachehash(rnode_t *rp, cred_t *cr)
{

	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
	    acachemask);
}

#ifdef DEBUG
static long nfs_access_cache_hits = 0;
static long nfs_access_cache_misses = 0;
#endif

/*
 * Look up the cached access bits `acc' for this rnode and credential.
 * Returns NFS_ACCESS_ALLOWED/DENIED on a conclusive cache hit, or
 * NFS_ACCESS_UNKNOWN if the attribute cache is invalid or the cache
 * does not cover all of the requested bits.
 */
nfs_access_type_t
nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
{
	vnode_t *vp;
	acache_t *ap;
	acache_hash_t *hp;
	nfs_access_type_t all;

	vp = RTOV(rp);
	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
		return (NFS_ACCESS_UNKNOWN);

	if (rp->r_acache != NULL) {
		hp = &acache[acachehash(rp, cr)];
		rw_enter(&hp->lock, RW_READER);
		ap = hp->next;
		while (ap != (acache_t *)hp) {
			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
				/*
				 * A hit only counts if every requested
				 * bit is covered by the cached `known'
				 * mask.
				 */
				if ((ap->known & acc) == acc) {
#ifdef DEBUG
					nfs_access_cache_hits++;
#endif
					if ((ap->allowed & acc) == acc)
						all = NFS_ACCESS_ALLOWED;
					else
						all = NFS_ACCESS_DENIED;
				} else {
#ifdef DEBUG
					nfs_access_cache_misses++;
#endif
					all = NFS_ACCESS_UNKNOWN;
				}
				rw_exit(&hp->lock);
				return (all);
			}
			ap = ap->next;
		}
		rw_exit(&hp->lock);
	}

#ifdef DEBUG
	nfs_access_cache_misses++;
#endif
	return (NFS_ACCESS_UNKNOWN);
}

/*
 * Record the result of an access check: the bits in `acc' are now known,
 * and `resacc' is the subset that the server allowed.  Updates an existing
 * (rnode, cred) entry in place, or inserts a new one.
 */
void
nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
{
	acache_t *ap;
	acache_t *nap;
	acache_hash_t *hp;

	hp = &acache[acachehash(rp, cr)];

	/*
	 * Allocate now assuming that mostly an allocation will be
	 * required.  This allows the allocation to happen without
	 * holding the hash bucket locked.
	 */
	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
	if (nap != NULL) {
		nap->known = acc;
		nap->allowed = resacc;
		nap->rnode = rp;
		crhold(cr);
		nap->cred = cr;
		nap->hashq = hp;
	}

	rw_enter(&hp->lock, RW_WRITER);

	if (rp->r_acache != NULL) {
		ap = hp->next;
		while (ap != (acache_t *)hp) {
			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
				/*
				 * Found an existing entry: merge the new
				 * results in place and discard the
				 * preallocated entry.
				 */
				ap->known |= acc;
				ap->allowed &= ~acc;
				ap->allowed |= resacc;
				rw_exit(&hp->lock);
				if (nap != NULL) {
					crfree(nap->cred);
					kmem_cache_free(acache_cache, nap);
				}
				return;
			}
			ap = ap->next;
		}
	}

	if (nap != NULL) {
#ifdef DEBUG
		clstat_debug.access.value.ui64++;
#endif
		/* Link into the hash bucket ... */
		nap->next = hp->next;
		hp->next = nap;
		nap->next->prev = nap;
		nap->prev = (acache_t *)hp;

		/* ... and onto the rnode's own list of entries. */
		mutex_enter(&rp->r_statelock);
		nap->list = rp->r_acache;
		rp->r_acache = nap;
		mutex_exit(&rp->r_statelock);
	}

	rw_exit(&hp->lock);
}

/*
 * Discard all cached access entries for this rnode.
 * Returns 1 if any entries were freed, 0 if there was nothing to do.
 */
int
nfs_access_purge_rp(rnode_t *rp)
{
	acache_t *ap;
	acache_t *tmpap;
	acache_t *rplist;

	/*
	 * If there aren't any cached entries, then there is nothing
	 * to free.
	 */
	if (rp->r_acache == NULL)
		return (0);

	mutex_enter(&rp->r_statelock);
	rplist = rp->r_acache;
	rp->r_acache = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Loop through each entry in the list pointed to in the
	 * rnode.  Remove each of these entries from the hash
	 * queue that it is on and remove it from the list in
	 * the rnode.
	 */
	for (ap = rplist; ap != NULL; ap = tmpap) {
		rw_enter(&ap->hashq->lock, RW_WRITER);
		ap->prev->next = ap->next;
		ap->next->prev = ap->prev;
		rw_exit(&ap->hashq->lock);

		tmpap = ap->list;
		crfree(ap->cred);
		kmem_cache_free(acache_cache, ap);
#ifdef DEBUG
		clstat_debug.access.value.ui64--;
#endif
	}

	return (1);
}

static const char prefix[] = ".nfs";

static kmutex_t newnum_lock;

/*
 * Return a unique number, seeded from the current time on first use.
 */
int
newnum(void)
{
	static uint_t newnum = 0;
	uint_t id;

	mutex_enter(&newnum_lock);
	if (newnum == 0)
		newnum = gethrestime_sec() & 0xffff;
	id = newnum++;
	mutex_exit(&newnum_lock);
	return (id);
}

/*
 * Generate a name of the form ".nfs<hexid>" in a MAXNAMELEN buffer
 * allocated with kmem_alloc().  The caller is responsible for freeing it.
 */
char *
newname(void)
{
	char *news;
	char *s;
	const char *p;
	uint_t id;

	id = newnum();
	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	s = news;
	p = prefix;
	while (*p != '\0')
		*s++ = *p++;
	/* Append the id as hex digits, least significant nibble first. */
	while (id != 0) {
		*s++ = "0123456789ABCDEF"[id & 0x0f];
		id >>= 4;
	}
	*s = '\0';
	return (news);
}

/*
 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
 * framework.
 */
static int
cl_snapshot(kstat_t *ksp, void *buf, int rw)
{
	ksp->ks_snaptime = gethrtime();
	if (rw == KSTAT_WRITE) {
		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
#ifdef DEBUG
		/*
		 * Currently only the global zone can write to kstats, but we
		 * add the check just for paranoia.
		 */
		if (INGLOBALZONE(curproc))
			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
			    sizeof (clstat_debug));
#endif
	} else {
		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
#ifdef DEBUG
		/*
		 * If we're displaying the "global" debug kstat values, we
		 * display them as-is to all zones since in fact they apply to
		 * the system as a whole.
		 */
		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
		    sizeof (clstat_debug));
#endif
	}
	return (0);
}

/*
 * Zone initialization callback: allocate the per-zone nfs_clnt state,
 * create its nfs_client kstat, and put it on the global nfs_clnt_list.
 */
static void *
clinit_zone(zoneid_t zoneid)
{
	kstat_t *nfs_client_kstat;
	struct nfs_clnt *nfscl;
	uint_t ndata;

	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
	nfscl->nfscl_chtable = NULL;
	nfscl->nfscl_zoneid = zoneid;

	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
#ifdef DEBUG
	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
#endif
	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
	    "misc", KSTAT_TYPE_NAMED, ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
		nfs_client_kstat->ks_snapshot = cl_snapshot;
		kstat_install(nfs_client_kstat);
	}
	mutex_enter(&nfs_clnt_list_lock);
	list_insert_head(&nfs_clnt_list, nfscl);
	mutex_exit(&nfs_clnt_list_lock);
	return (nfscl);
}

/*
 * Zone teardown callback: undo clinit_zone() — reclaim client handles,
 * free the chtable entries, delete the kstat, and free the per-zone state.
 */
/*ARGSUSED*/
static void
clfini_zone(zoneid_t zoneid, void *arg)
{
	struct nfs_clnt *nfscl = arg;
	chhead_t *chp, *next;

	if (nfscl == NULL)
		return;
	mutex_enter(&nfs_clnt_list_lock);
	list_remove(&nfs_clnt_list, nfscl);
	mutex_exit(&nfs_clnt_list_lock);
	clreclaim_zone(nfscl, 0);
	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
		ASSERT(chp->ch_list == NULL);
		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
		next = chp->ch_next;
		kmem_free(chp, sizeof (*chp));
	}
	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
	mutex_destroy(&nfscl->nfscl_chtable_lock);
	kmem_free(nfscl, sizeof (*nfscl));
}

/*
 * Called by endpnt_destructor to make sure the client handles are
 * cleaned up before the RPC endpoints.  This becomes a no-op if
 * clfini_zone (above) is called first.  This function is needed
 * (rather than relying on clfini_zone to clean up) because the ZSD
 * callbacks have no ordering mechanism, so we have no way to ensure
 * that clfini_zone is called before endpnt_destructor.
 */
void
clcleanup_zone(zoneid_t zoneid)
{
	struct nfs_clnt *nfscl;

	mutex_enter(&nfs_clnt_list_lock);
	nfscl = list_head(&nfs_clnt_list);
	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
		if (nfscl->nfscl_zoneid == zoneid) {
			clreclaim_zone(nfscl, 0);
			break;
		}
	}
	mutex_exit(&nfs_clnt_list_lock);
}

/*
 * Module initialization: size and allocate the rnode hash table, the
 * access cache, the client handle cache, per-zone state, and the global
 * locks.  Always returns 0.
 */
int
nfs_subrinit(void)
{
	int i;
	ulong_t nrnode_max;

	/*
	 * Allocate and initialize the rnode hash queues
	 */
	if (nrnode <= 0)
		nrnode = ncsize;
	/* Cap nrnode at a quarter of available kernel memory. */
	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "!setting nrnode to max value of %ld", nrnode_max);
		nrnode = nrnode_max;
	}

	rtablesize = 1 << highbit(nrnode / hashlen);
	rtablemask = rtablesize - 1;
	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
	for (i = 0; i < rtablesize; i++) {
		/* An empty bucket points back at itself. */
		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
	}
	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);

	/*
	 * Allocate and initialize the access cache
	 */

	/*
	 * Initial guess is one access cache entry per rnode unless
	 * nacache is set to a non-zero value and then it is used to
	 * indicate a guess at the number of access cache entries.
	 */
	if (nacache > 0)
		acachesize = 1 << highbit(nacache / hashlen);
	else
		acachesize = rtablesize;
	acachemask = acachesize - 1;
	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
	for (i = 0; i < acachesize; i++) {
		acache[i].next = (acache_t *)&acache[i];
		acache[i].prev = (acache_t *)&acache[i];
		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
	}
	acache_cache = kmem_cache_create("nfs_access_cache",
	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	/*
	 * Allocate and initialize the client handle cache
	 */
	chtab_cache = kmem_cache_create("client_handle_cache",
	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
	/*
	 * Initialize the list of per-zone client handles (and associated data).
	 * This needs to be done before we call zone_key_create().
	 */
	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
	    offsetof(struct nfs_clnt, nfscl_node));
	/*
	 * Initialize the zone_key for per-zone client handle lists.
	 */
	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
	/*
	 * Initialize the various mutexes and reader/writer locks
	 */
	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Assign unique major number for all nfs mounts
	 */
	if ((nfs_major = getudev()) == -1) {
		zcmn_err(GLOBAL_ZONEID, CE_WARN,
		    "nfs: init: can't get unique device number");
		nfs_major = 0;
	}
	nfs_minor = 0;

	if (nfs3_jukebox_delay == 0)
		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;

	return (0);
}

/*
 * Module teardown: release everything set up by nfs_subrinit().
 */
void
nfs_subrfini(void)
{
	int i;

	/*
	 * Deallocate the rnode hash queues
	 */
	kmem_cache_destroy(rnode_cache);

	for (i = 0; i < rtablesize; i++)
		rw_destroy(&rtable[i].r_lock);
	kmem_free(rtable, rtablesize * sizeof (*rtable));

	/*
	 * Deallocated the access cache
	 */
	kmem_cache_destroy(acache_cache);

	for (i = 0; i < acachesize; i++)
		rw_destroy(&acache[i].lock);
	kmem_free(acache, acachesize * sizeof (*acache));

	/*
	 * Deallocate the client handle cache
	 */
	kmem_cache_destroy(chtab_cache);

	/*
	 * Destroy the various mutexes and reader/writer locks
	 */
	mutex_destroy(&rpfreelist_lock);
	mutex_destroy(&newnum_lock);
	mutex_destroy(&nfs_minor_lock);
	(void) zone_key_delete(nfsclnt_zone_key);
}

/*
 * Map a local errno to the corresponding NFS protocol error status.
 */
enum nfsstat
puterrno(int error)
{

	switch (error) {
	case EOPNOTSUPP:
		return (NFSERR_OPNOTSUPP);
	case ENAMETOOLONG:
		return (NFSERR_NAMETOOLONG);
	case ENOTEMPTY:
		return (NFSERR_NOTEMPTY);
	case EDQUOT:
		return (NFSERR_DQUOT);
	case ESTALE:
		return (NFSERR_STALE);
	case EREMOTE:
		return (NFSERR_REMOTE);
	case ENOSYS:
		return
(NFSERR_OPNOTSUPP); 3647 case EOVERFLOW: 3648 return (NFSERR_INVAL); 3649 default: 3650 return ((enum nfsstat)error); 3651 } 3652 /* NOTREACHED */ 3653 } 3654 3655 int 3656 geterrno(enum nfsstat status) 3657 { 3658 3659 switch (status) { 3660 case NFSERR_OPNOTSUPP: 3661 return (EOPNOTSUPP); 3662 case NFSERR_NAMETOOLONG: 3663 return (ENAMETOOLONG); 3664 case NFSERR_NOTEMPTY: 3665 return (ENOTEMPTY); 3666 case NFSERR_DQUOT: 3667 return (EDQUOT); 3668 case NFSERR_STALE: 3669 return (ESTALE); 3670 case NFSERR_REMOTE: 3671 return (EREMOTE); 3672 case NFSERR_WFLUSH: 3673 return (EIO); 3674 default: 3675 return ((int)status); 3676 } 3677 /* NOTREACHED */ 3678 } 3679 3680 enum nfsstat3 3681 puterrno3(int error) 3682 { 3683 3684 #ifdef DEBUG 3685 switch (error) { 3686 case 0: 3687 return (NFS3_OK); 3688 case EPERM: 3689 return (NFS3ERR_PERM); 3690 case ENOENT: 3691 return (NFS3ERR_NOENT); 3692 case EIO: 3693 return (NFS3ERR_IO); 3694 case ENXIO: 3695 return (NFS3ERR_NXIO); 3696 case EACCES: 3697 return (NFS3ERR_ACCES); 3698 case EEXIST: 3699 return (NFS3ERR_EXIST); 3700 case EXDEV: 3701 return (NFS3ERR_XDEV); 3702 case ENODEV: 3703 return (NFS3ERR_NODEV); 3704 case ENOTDIR: 3705 return (NFS3ERR_NOTDIR); 3706 case EISDIR: 3707 return (NFS3ERR_ISDIR); 3708 case EINVAL: 3709 return (NFS3ERR_INVAL); 3710 case EFBIG: 3711 return (NFS3ERR_FBIG); 3712 case ENOSPC: 3713 return (NFS3ERR_NOSPC); 3714 case EROFS: 3715 return (NFS3ERR_ROFS); 3716 case EMLINK: 3717 return (NFS3ERR_MLINK); 3718 case ENAMETOOLONG: 3719 return (NFS3ERR_NAMETOOLONG); 3720 case ENOTEMPTY: 3721 return (NFS3ERR_NOTEMPTY); 3722 case EDQUOT: 3723 return (NFS3ERR_DQUOT); 3724 case ESTALE: 3725 return (NFS3ERR_STALE); 3726 case EREMOTE: 3727 return (NFS3ERR_REMOTE); 3728 case ENOSYS: 3729 case EOPNOTSUPP: 3730 return (NFS3ERR_NOTSUPP); 3731 case EOVERFLOW: 3732 return (NFS3ERR_INVAL); 3733 default: 3734 zcmn_err(getzoneid(), CE_WARN, 3735 "puterrno3: got error %d", error); 3736 return ((enum nfsstat3)error); 3737 
} 3738 #else 3739 switch (error) { 3740 case ENAMETOOLONG: 3741 return (NFS3ERR_NAMETOOLONG); 3742 case ENOTEMPTY: 3743 return (NFS3ERR_NOTEMPTY); 3744 case EDQUOT: 3745 return (NFS3ERR_DQUOT); 3746 case ESTALE: 3747 return (NFS3ERR_STALE); 3748 case ENOSYS: 3749 case EOPNOTSUPP: 3750 return (NFS3ERR_NOTSUPP); 3751 case EREMOTE: 3752 return (NFS3ERR_REMOTE); 3753 case EOVERFLOW: 3754 return (NFS3ERR_INVAL); 3755 default: 3756 return ((enum nfsstat3)error); 3757 } 3758 #endif 3759 } 3760 3761 int 3762 geterrno3(enum nfsstat3 status) 3763 { 3764 3765 #ifdef DEBUG 3766 switch (status) { 3767 case NFS3_OK: 3768 return (0); 3769 case NFS3ERR_PERM: 3770 return (EPERM); 3771 case NFS3ERR_NOENT: 3772 return (ENOENT); 3773 case NFS3ERR_IO: 3774 return (EIO); 3775 case NFS3ERR_NXIO: 3776 return (ENXIO); 3777 case NFS3ERR_ACCES: 3778 return (EACCES); 3779 case NFS3ERR_EXIST: 3780 return (EEXIST); 3781 case NFS3ERR_XDEV: 3782 return (EXDEV); 3783 case NFS3ERR_NODEV: 3784 return (ENODEV); 3785 case NFS3ERR_NOTDIR: 3786 return (ENOTDIR); 3787 case NFS3ERR_ISDIR: 3788 return (EISDIR); 3789 case NFS3ERR_INVAL: 3790 return (EINVAL); 3791 case NFS3ERR_FBIG: 3792 return (EFBIG); 3793 case NFS3ERR_NOSPC: 3794 return (ENOSPC); 3795 case NFS3ERR_ROFS: 3796 return (EROFS); 3797 case NFS3ERR_MLINK: 3798 return (EMLINK); 3799 case NFS3ERR_NAMETOOLONG: 3800 return (ENAMETOOLONG); 3801 case NFS3ERR_NOTEMPTY: 3802 return (ENOTEMPTY); 3803 case NFS3ERR_DQUOT: 3804 return (EDQUOT); 3805 case NFS3ERR_STALE: 3806 return (ESTALE); 3807 case NFS3ERR_REMOTE: 3808 return (EREMOTE); 3809 case NFS3ERR_BADHANDLE: 3810 return (ESTALE); 3811 case NFS3ERR_NOT_SYNC: 3812 return (EINVAL); 3813 case NFS3ERR_BAD_COOKIE: 3814 return (ENOENT); 3815 case NFS3ERR_NOTSUPP: 3816 return (EOPNOTSUPP); 3817 case NFS3ERR_TOOSMALL: 3818 return (EINVAL); 3819 case NFS3ERR_SERVERFAULT: 3820 return (EIO); 3821 case NFS3ERR_BADTYPE: 3822 return (EINVAL); 3823 case NFS3ERR_JUKEBOX: 3824 return (ENXIO); 3825 default: 3826 
zcmn_err(getzoneid(), CE_WARN, 3827 "geterrno3: got status %d", status); 3828 return ((int)status); 3829 } 3830 #else 3831 switch (status) { 3832 case NFS3ERR_NAMETOOLONG: 3833 return (ENAMETOOLONG); 3834 case NFS3ERR_NOTEMPTY: 3835 return (ENOTEMPTY); 3836 case NFS3ERR_DQUOT: 3837 return (EDQUOT); 3838 case NFS3ERR_STALE: 3839 case NFS3ERR_BADHANDLE: 3840 return (ESTALE); 3841 case NFS3ERR_NOTSUPP: 3842 return (EOPNOTSUPP); 3843 case NFS3ERR_REMOTE: 3844 return (EREMOTE); 3845 case NFS3ERR_NOT_SYNC: 3846 case NFS3ERR_TOOSMALL: 3847 case NFS3ERR_BADTYPE: 3848 return (EINVAL); 3849 case NFS3ERR_BAD_COOKIE: 3850 return (ENOENT); 3851 case NFS3ERR_SERVERFAULT: 3852 return (EIO); 3853 case NFS3ERR_JUKEBOX: 3854 return (ENXIO); 3855 default: 3856 return ((int)status); 3857 } 3858 #endif 3859 } 3860 3861 rddir_cache * 3862 rddir_cache_alloc(int flags) 3863 { 3864 rddir_cache *rc; 3865 3866 rc = kmem_alloc(sizeof (*rc), flags); 3867 if (rc != NULL) { 3868 rc->entries = NULL; 3869 rc->flags = RDDIR; 3870 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3871 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3872 rc->count = 1; 3873 #ifdef DEBUG 3874 atomic_inc_64(&clstat_debug.dirent.value.ui64); 3875 #endif 3876 } 3877 return (rc); 3878 } 3879 3880 static void 3881 rddir_cache_free(rddir_cache *rc) 3882 { 3883 3884 #ifdef DEBUG 3885 atomic_dec_64(&clstat_debug.dirent.value.ui64); 3886 #endif 3887 if (rc->entries != NULL) { 3888 #ifdef DEBUG 3889 rddir_cache_buf_free(rc->entries, rc->buflen); 3890 #else 3891 kmem_free(rc->entries, rc->buflen); 3892 #endif 3893 } 3894 cv_destroy(&rc->cv); 3895 mutex_destroy(&rc->lock); 3896 kmem_free(rc, sizeof (*rc)); 3897 } 3898 3899 void 3900 rddir_cache_hold(rddir_cache *rc) 3901 { 3902 3903 mutex_enter(&rc->lock); 3904 rc->count++; 3905 mutex_exit(&rc->lock); 3906 } 3907 3908 void 3909 rddir_cache_rele(rddir_cache *rc) 3910 { 3911 3912 mutex_enter(&rc->lock); 3913 ASSERT(rc->count > 0); 3914 if (--rc->count == 0) { 3915 mutex_exit(&rc->lock); 
3916 rddir_cache_free(rc); 3917 } else 3918 mutex_exit(&rc->lock); 3919 } 3920 3921 #ifdef DEBUG 3922 char * 3923 rddir_cache_buf_alloc(size_t size, int flags) 3924 { 3925 char *rc; 3926 3927 rc = kmem_alloc(size, flags); 3928 if (rc != NULL) 3929 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3930 return (rc); 3931 } 3932 3933 void 3934 rddir_cache_buf_free(void *addr, size_t size) 3935 { 3936 3937 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3938 kmem_free(addr, size); 3939 } 3940 #endif 3941 3942 static int 3943 nfs_free_data_reclaim(rnode_t *rp) 3944 { 3945 char *contents; 3946 int size; 3947 vsecattr_t *vsp; 3948 nfs3_pathconf_info *info; 3949 int freed; 3950 cred_t *cred; 3951 3952 /* 3953 * Free any held credentials and caches which 3954 * may be associated with this rnode. 3955 */ 3956 mutex_enter(&rp->r_statelock); 3957 cred = rp->r_cred; 3958 rp->r_cred = NULL; 3959 contents = rp->r_symlink.contents; 3960 size = rp->r_symlink.size; 3961 rp->r_symlink.contents = NULL; 3962 vsp = rp->r_secattr; 3963 rp->r_secattr = NULL; 3964 info = rp->r_pathconf; 3965 rp->r_pathconf = NULL; 3966 mutex_exit(&rp->r_statelock); 3967 3968 if (cred != NULL) 3969 crfree(cred); 3970 3971 /* 3972 * Free the access cache entries. 3973 */ 3974 freed = nfs_access_purge_rp(rp); 3975 3976 if (!HAVE_RDDIR_CACHE(rp) && 3977 contents == NULL && 3978 vsp == NULL && 3979 info == NULL) 3980 return (freed); 3981 3982 /* 3983 * Free the readdir cache entries 3984 */ 3985 if (HAVE_RDDIR_CACHE(rp)) 3986 nfs_purge_rddir_cache(RTOV(rp)); 3987 3988 /* 3989 * Free the symbolic link cache. 3990 */ 3991 if (contents != NULL) { 3992 3993 kmem_free((void *)contents, size); 3994 } 3995 3996 /* 3997 * Free any cached ACL. 3998 */ 3999 if (vsp != NULL) 4000 nfs_acl_free(vsp); 4001 4002 /* 4003 * Free any cached pathconf information. 
4004 */ 4005 if (info != NULL) 4006 kmem_free(info, sizeof (*info)); 4007 4008 return (1); 4009 } 4010 4011 static int 4012 nfs_active_data_reclaim(rnode_t *rp) 4013 { 4014 char *contents; 4015 int size; 4016 vsecattr_t *vsp; 4017 nfs3_pathconf_info *info; 4018 int freed; 4019 4020 /* 4021 * Free any held credentials and caches which 4022 * may be associated with this rnode. 4023 */ 4024 if (!mutex_tryenter(&rp->r_statelock)) 4025 return (0); 4026 contents = rp->r_symlink.contents; 4027 size = rp->r_symlink.size; 4028 rp->r_symlink.contents = NULL; 4029 vsp = rp->r_secattr; 4030 rp->r_secattr = NULL; 4031 info = rp->r_pathconf; 4032 rp->r_pathconf = NULL; 4033 mutex_exit(&rp->r_statelock); 4034 4035 /* 4036 * Free the access cache entries. 4037 */ 4038 freed = nfs_access_purge_rp(rp); 4039 4040 if (!HAVE_RDDIR_CACHE(rp) && 4041 contents == NULL && 4042 vsp == NULL && 4043 info == NULL) 4044 return (freed); 4045 4046 /* 4047 * Free the readdir cache entries 4048 */ 4049 if (HAVE_RDDIR_CACHE(rp)) 4050 nfs_purge_rddir_cache(RTOV(rp)); 4051 4052 /* 4053 * Free the symbolic link cache. 4054 */ 4055 if (contents != NULL) { 4056 4057 kmem_free((void *)contents, size); 4058 } 4059 4060 /* 4061 * Free any cached ACL. 4062 */ 4063 if (vsp != NULL) 4064 nfs_acl_free(vsp); 4065 4066 /* 4067 * Free any cached pathconf information. 
4068 */ 4069 if (info != NULL) 4070 kmem_free(info, sizeof (*info)); 4071 4072 return (1); 4073 } 4074 4075 static int 4076 nfs_free_reclaim(void) 4077 { 4078 int freed; 4079 rnode_t *rp; 4080 4081 #ifdef DEBUG 4082 clstat_debug.f_reclaim.value.ui64++; 4083 #endif 4084 freed = 0; 4085 mutex_enter(&rpfreelist_lock); 4086 rp = rpfreelist; 4087 if (rp != NULL) { 4088 do { 4089 if (nfs_free_data_reclaim(rp)) 4090 freed = 1; 4091 } while ((rp = rp->r_freef) != rpfreelist); 4092 } 4093 mutex_exit(&rpfreelist_lock); 4094 return (freed); 4095 } 4096 4097 static int 4098 nfs_active_reclaim(void) 4099 { 4100 int freed; 4101 int index; 4102 rnode_t *rp; 4103 4104 #ifdef DEBUG 4105 clstat_debug.a_reclaim.value.ui64++; 4106 #endif 4107 freed = 0; 4108 for (index = 0; index < rtablesize; index++) { 4109 rw_enter(&rtable[index].r_lock, RW_READER); 4110 for (rp = rtable[index].r_hashf; 4111 rp != (rnode_t *)(&rtable[index]); 4112 rp = rp->r_hashf) { 4113 if (nfs_active_data_reclaim(rp)) 4114 freed = 1; 4115 } 4116 rw_exit(&rtable[index].r_lock); 4117 } 4118 return (freed); 4119 } 4120 4121 static int 4122 nfs_rnode_reclaim(void) 4123 { 4124 int freed; 4125 rnode_t *rp; 4126 vnode_t *vp; 4127 4128 #ifdef DEBUG 4129 clstat_debug.r_reclaim.value.ui64++; 4130 #endif 4131 freed = 0; 4132 mutex_enter(&rpfreelist_lock); 4133 while ((rp = rpfreelist) != NULL) { 4134 rp_rmfree(rp); 4135 mutex_exit(&rpfreelist_lock); 4136 if (rp->r_flags & RHASHED) { 4137 vp = RTOV(rp); 4138 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4139 mutex_enter(&vp->v_lock); 4140 if (vp->v_count > 1) { 4141 VN_RELE_LOCKED(vp); 4142 mutex_exit(&vp->v_lock); 4143 rw_exit(&rp->r_hashq->r_lock); 4144 mutex_enter(&rpfreelist_lock); 4145 continue; 4146 } 4147 mutex_exit(&vp->v_lock); 4148 rp_rmhash_locked(rp); 4149 rw_exit(&rp->r_hashq->r_lock); 4150 } 4151 /* 4152 * This call to rp_addfree will end up destroying the 4153 * rnode, but in a safe way with the appropriate set 4154 * of checks done. 
4155 */ 4156 rp_addfree(rp, CRED()); 4157 mutex_enter(&rpfreelist_lock); 4158 } 4159 mutex_exit(&rpfreelist_lock); 4160 return (freed); 4161 } 4162 4163 /*ARGSUSED*/ 4164 static void 4165 nfs_reclaim(void *cdrarg) 4166 { 4167 4168 #ifdef DEBUG 4169 clstat_debug.reclaim.value.ui64++; 4170 #endif 4171 if (nfs_free_reclaim()) 4172 return; 4173 4174 if (nfs_active_reclaim()) 4175 return; 4176 4177 (void) nfs_rnode_reclaim(); 4178 } 4179 4180 /* 4181 * NFS client failover support 4182 * 4183 * Routines to copy filehandles 4184 */ 4185 void 4186 nfscopyfh(caddr_t fhp, vnode_t *vp) 4187 { 4188 fhandle_t *dest = (fhandle_t *)fhp; 4189 4190 if (dest != NULL) 4191 *dest = *VTOFH(vp); 4192 } 4193 4194 void 4195 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4196 { 4197 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4198 4199 if (dest != NULL) 4200 *dest = *VTOFH3(vp); 4201 } 4202 4203 /* 4204 * NFS client failover support 4205 * 4206 * failover_safe() will test various conditions to ensure that 4207 * failover is permitted for this vnode. It will be denied 4208 * if: 4209 * 1) the operation in progress does not support failover (NULL fi) 4210 * 2) there are no available replicas (NULL mi_servers->sv_next) 4211 * 3) any locks are outstanding on this file 4212 */ 4213 static int 4214 failover_safe(failinfo_t *fi) 4215 { 4216 4217 /* 4218 * Does this op permit failover? 4219 */ 4220 if (fi == NULL || fi->vp == NULL) 4221 return (0); 4222 4223 /* 4224 * Are there any alternates to failover to? 
4225 */ 4226 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4227 return (0); 4228 4229 /* 4230 * Disable check; we've forced local locking 4231 * 4232 * if (flk_has_remote_locks(fi->vp)) 4233 * return (0); 4234 */ 4235 4236 /* 4237 * If we have no partial path, we can't do anything 4238 */ 4239 if (VTOR(fi->vp)->r_path == NULL) 4240 return (0); 4241 4242 return (1); 4243 } 4244 4245 #include <sys/thread.h> 4246 4247 /* 4248 * NFS client failover support 4249 * 4250 * failover_newserver() will start a search for a new server, 4251 * preferably by starting an async thread to do the work. If 4252 * someone is already doing this (recognizable by MI_BINDINPROG 4253 * being set), it will simply return and the calling thread 4254 * will queue on the mi_failover_cv condition variable. 4255 */ 4256 static void 4257 failover_newserver(mntinfo_t *mi) 4258 { 4259 /* 4260 * Check if someone else is doing this already 4261 */ 4262 mutex_enter(&mi->mi_lock); 4263 if (mi->mi_flags & MI_BINDINPROG) { 4264 mutex_exit(&mi->mi_lock); 4265 return; 4266 } 4267 mi->mi_flags |= MI_BINDINPROG; 4268 4269 /* 4270 * Need to hold the vfs struct so that it can't be released 4271 * while the failover thread is selecting a new server. 4272 */ 4273 VFS_HOLD(mi->mi_vfsp); 4274 4275 /* 4276 * Start a thread to do the real searching. 4277 */ 4278 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4279 4280 mutex_exit(&mi->mi_lock); 4281 } 4282 4283 /* 4284 * NFS client failover support 4285 * 4286 * failover_thread() will find a new server to replace the one 4287 * currently in use, wake up other threads waiting on this mount 4288 * point, and die. It will start at the head of the server list 4289 * and poll servers until it finds one with an NFS server which is 4290 * registered and responds to a NULL procedure ping. 4291 * 4292 * XXX failover_thread is unsafe within the scope of the 4293 * present model defined for cpr to suspend the system. 
 * Specifically, over-the-wire calls made by the thread
 * are unsafe.  The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * Its safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/* Wait for in-flight RPC users of the current server to drain. */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* Two-second timeout for each NULL-procedure ping. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			/* Suppress signals during the ping unless intr mount */
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
					    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
					    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				break;
			}
		}

		if (svp == NULL) {
			/* Warn once, then sleep a second and rescan. */
			if (!oncethru) {
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	/*
	 * Switched servers: purge the DNLC for this mount and rehash the
	 * root rnode under the new server's filehandle.
	 */
	if (svp != mi->mi_curr_serv) {
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
		nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	cv_broadcast(&mi->mi_failover_cv);
	/* CALLB_CPR_EXIT drops mi_lock. */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * NFS client failover support
 *
 * failover_wait() will put the thread to sleep until MI_BINDINPROG
 * is cleared, meaning that failover is complete.  Called with
 * mi_lock mutex held.  Returns 0 when failover completes, or EINTR
 * if the sleep is interrupted by a signal.
 */
static int
failover_wait(mntinfo_t *mi)
{
	k_sigset_t smask;

	/*
	 * If someone else is hunting for a living server,
	 * sleep until it's done.  After our sleep, we may
	 * be bound to the right server and get off cheaply.
	 */
	while (mi->mi_flags & MI_BINDINPROG) {
		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM.  (Preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
			/*
			 * restore original signal mask
			 */
			sigunintr(&smask);
			return (EINTR);
		}
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);
	}
	return (0);
}

/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *    pointed to by the fi->fhp pointer if it is non-NULL.
 *
 * Returns 0 on success, EINVAL for bad arguments or a replica
 * mismatch, or the error from the root/path lookup.
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * Failed and the new server may physically be same
			 * OR may share a same disk subsystem.  In this case
			 * file handle for a particular file path is not going
			 * to change, given the same filehandle lookup will
			 * always locate the same rnode as the existing one.
			 * All we might need to do is to update the r_server
			 * with the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary to hold the
		 * new file handle until that file handle can be
		 * copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in here and could
		 * have done the remap for this particular rnode before
		 * this thread here.  Check for rp->r_server and
		 * mi->mi_curr_serv and return if they are same.
		 */
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match against
		 * that we remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * snarf the filehandle from the new rnode
		 * then release it, again while updating the
		 * hash queues for the rnode.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}

/*
 * NFS client failover support
 *
 * We want a simple pathname lookup routine to parse the pieces
 * of path in rp->r_path.  We know that the path was created
 * as rnodes were made, so we know we have only to deal with
 * paths that look like:
 *	dir1/dir2/dir3/file
 * Any evidence of anything like .., symlinks, and ENOTDIR
 * are hard errors, because they mean something in this filesystem
 * is different from the one we came from, or has changed under
 * us in some way.  If this is true, we want the failure.
 *
 * Extended attributes: if the filesystem is mounted with extended
 * attributes enabled (-o xattr), the attribute directory will be
 * represented in the r_path as the magic name XATTR_RPATH.  So if
 * we see that name in the pathname, it must be because this node
 * is an extended attribute.  Therefore, look it up that way.
 */
static int
failover_lookup(char *path, vnode_t *root,
    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
    vnode_t *, cred_t *, int),
    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
    vnode_t **new)
{
	vnode_t *dvp, *nvp;
	int error = EINVAL;
	char *s, *p, *tmppath;
	size_t len;
	mntinfo_t *mi;
	bool_t xattr;

	/* Make local copy of path (the loop temporarily NULs the '/'s). */
	len = strlen(path) + 1;
	tmppath = kmem_alloc(len, KM_SLEEP);
	(void) strcpy(tmppath, path);
	s = tmppath;

	dvp = root;
	VN_HOLD(dvp);
	mi = VTOMI(root);
	xattr = mi->mi_flags & MI_EXTATTR;

	/* Walk the path one component at a time, soft RPC on each lookup. */
	do {
		p = strchr(s, '/');
		if (p != NULL)
			*p = '\0';
		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
			/* Magic component name: this is an xattr directory. */
			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
			    RFSCALL_SOFT);
		} else {
			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
			    CRED(), RFSCALL_SOFT);
		}
		if (p != NULL)
			*p++ = '/';
		if (error) {
			VN_RELE(dvp);
			kmem_free(tmppath, len);
			return (error);
		}
		s = p;
		/* Step down: drop the parent, continue from the child. */
		VN_RELE(dvp);
		dvp = nvp;
	} while (p != NULL);

	/* Hand the final vnode (still held) back to the caller. */
	if (nvp != NULL && new != NULL)
		*new = nvp;
	kmem_free(tmppath, len);
	return (0);
}

/*
 * NFS client failover support
 *
 * sv_free() frees the malloc'd portion of a "servinfo_t".
 */
void
sv_free(servinfo_t *svp)
{
	servinfo_t *next;
	struct knetconfig *knconf;

	/* Walk and free the entire servinfo chain. */
	while (svp != NULL) {
		next = svp->sv_next;
		if (svp->sv_secdata)
			sec_clnt_freeinfo(svp->sv_secdata);
		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
		knconf = svp->sv_knconf;
		if (knconf != NULL) {
			if (knconf->knc_protofmly != NULL)
				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
			if (knconf->knc_proto != NULL)
				kmem_free(knconf->knc_proto, KNC_STRSIZE);
			kmem_free(knconf, sizeof (*knconf));
		}
		knconf = svp->sv_origknconf;
		if (knconf != NULL) {
			if (knconf->knc_protofmly != NULL)
				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
			if (knconf->knc_proto != NULL)
				kmem_free(knconf->knc_proto, KNC_STRSIZE);
			kmem_free(knconf, sizeof (*knconf));
		}
		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
		mutex_destroy(&svp->sv_lock);
		kmem_free(svp, sizeof (*svp));
		svp = next;
	}
}

/*
 * Acquire an NFS reader/writer lock, optionally interruptibly.
 *
 * Lock state convention: count > 0 means that many active readers,
 * count < 0 means a writer holds it (possibly nested, one per unit),
 * count == 0 means free.  'waiters' counts waiting writers.
 *
 * Only can return non-zero if intr != 0.
 */
int
nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
{

	mutex_enter(&l->lock);

	/*
	 * If this is a nested enter, then allow it.  There
	 * must be as many exits as enters through.
	 */
	if (l->owner == curthread) {
		/* lock is held for writing by current thread */
		ASSERT(rw == RW_READER || rw == RW_WRITER);
		l->count--;
	} else if (rw == RW_READER) {
		/*
		 * While there is a writer active or writers waiting,
		 * then wait for them to finish up and move on.  Then,
		 * increment the count to indicate that a reader is
		 * active.
		 */
		while (l->count < 0 || l->waiters > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				/* lwp_nostop keeps /proc from stopping us */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					mutex_exit(&l->lock);
					return (EINTR);
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&l->cv_rd, &l->lock);
		}
		ASSERT(l->count < INT_MAX);
#ifdef DEBUG
		if ((l->count % 10000) == 9999)
			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
			    "rwlock @ %p\n", l->count, (void *)&l);
#endif
		l->count++;
	} else {
		ASSERT(rw == RW_WRITER);
		/*
		 * While there are readers active or a writer
		 * active, then wait for all of the readers
		 * to finish or for the writer to finish.
		 * Then, set the owner field to curthread and
		 * decrement count to indicate that a writer
		 * is active.
		 */
		while (l->count != 0) {
			l->waiters++;
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&l->cv, &l->lock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					l->waiters--;
					/*
					 * If there are readers active and no
					 * writers waiting then wake up all of
					 * the waiting readers (if any).
					 */
					if (l->count > 0 && l->waiters == 0)
						cv_broadcast(&l->cv_rd);
					mutex_exit(&l->lock);
					return (EINTR);
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&l->cv, &l->lock);
			l->waiters--;
		}
		ASSERT(l->owner == NULL);
		l->owner = curthread;
		l->count--;
	}

	mutex_exit(&l->lock);

	return (0);
}

/*
 * If the lock is available, obtain it and return non-zero.  If there is
 * already a conflicting lock, return 0 immediately.
 */

/*
 * Non-blocking variant of nfs_rw_enter_sig(): take the lock if it is
 * immediately available (or already write-held by this thread) and return
 * non-zero; otherwise return 0 without waiting.
 */
int
nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
{
	mutex_enter(&l->lock);

	/*
	 * If this is a nested enter, then allow it.  There
	 * must be as many exits as enters through.
	 */
	if (l->owner == curthread) {
		/* lock is held for writing by current thread */
		ASSERT(rw == RW_READER || rw == RW_WRITER);
		l->count--;
	} else if (rw == RW_READER) {
		/*
		 * If there is a writer active or writers waiting, deny the
		 * lock.  Otherwise, bump the count of readers.
		 */
		if (l->count < 0 || l->waiters > 0) {
			mutex_exit(&l->lock);
			return (0);
		}
		l->count++;
	} else {
		ASSERT(rw == RW_WRITER);
		/*
		 * If there are readers active or a writer active, deny the
		 * lock.  Otherwise, set the owner field to curthread and
		 * decrement count to indicate that a writer is active.
		 */
		if (l->count != 0) {
			mutex_exit(&l->lock);
			return (0);
		}
		ASSERT(l->owner == NULL);
		l->owner = curthread;
		l->count--;
	}

	mutex_exit(&l->lock);

	return (1);
}

/*
 * Release one level of the reader/writer lock "l", waking waiters as the
 * lock state allows (readers when the last writer leaves and no writers
 * wait; a single writer when the lock becomes completely free).
 */
void
nfs_rw_exit(nfs_rwlock_t *l)
{

	mutex_enter(&l->lock);

	if (l->owner != NULL) {
		ASSERT(l->owner == curthread);

		/*
		 * To release a writer lock increment count to indicate that
		 * there is one less writer active.  If this was the last of
		 * possibly nested writer locks, then clear the owner field as
		 * well to indicate that there is no writer active.
		 */
		ASSERT(l->count < 0);
		l->count++;
		if (l->count == 0) {
			l->owner = NULL;

			/*
			 * If there are no writers waiting then wakeup all of
			 * the waiting readers (if any).
			 */
			if (l->waiters == 0)
				cv_broadcast(&l->cv_rd);
		}
	} else {
		/*
		 * To release a reader lock just decrement count to indicate
		 * that there is one less reader active.
		 */
		ASSERT(l->count > 0);
		l->count--;
	}

	/*
	 * If there are no readers active nor a writer active and there is a
	 * writer waiting we need to wake up it.
	 */
	if (l->count == 0 && l->waiters > 0)
		cv_signal(&l->cv);
	mutex_exit(&l->lock);
}

/*
 * Return non-zero if "l" is currently held in the "rw" mode.
 * NOTE(review): this reads l->count without taking l->lock, so it is
 * advisory only — presumably used in ASSERTs; confirm against callers.
 */
int
nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
{

	if (rw == RW_READER)
		return (l->count > 0);
	ASSERT(rw == RW_WRITER);
	return (l->count < 0);
}

/*
 * Initialize an nfs_rwlock_t: no readers, no writer, no waiters.
 * The "name", "type" and "arg" parameters are accepted for rwlock-style
 * signature compatibility but are unused.
 */
/* ARGSUSED */
void
nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
{

	l->count = 0;
	l->waiters = 0;
	l->owner = NULL;
	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
	cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
}

/*
 * Destroy the mutex and both condition variables of an nfs_rwlock_t.
 */
void
nfs_rw_destroy(nfs_rwlock_t *l)
{

	mutex_destroy(&l->lock);
	cv_destroy(&l->cv);
	cv_destroy(&l->cv_rd);
}

/*
 * AVL comparator for the NFSv3 readdir cache: order by nfs3_cookie,
 * then by buflen.  Returns -1/0/1.
 */
int
nfs3_rddir_compar(const void *x, const void *y)
{
	rddir_cache *a = (rddir_cache *)x;
	rddir_cache *b = (rddir_cache *)y;

	if (a->nfs3_cookie == b->nfs3_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs3_cookie < b->nfs3_cookie)
		return (-1);

	return (1);
}

/*
 * AVL comparator for the NFSv2 readdir cache: order by nfs_cookie,
 * then by buflen.  Returns -1/0/1.
 */
int
nfs_rddir_compar(const void *x, const void *y)
{
	rddir_cache *a = (rddir_cache *)x;
	rddir_cache *b = (rddir_cache *)y;

	if (a->nfs_cookie == b->nfs_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs_cookie < b->nfs_cookie)
		return (-1);

	return (1);
}

/*
 * Build a single comma-separated string of all server hostnames for the
 * mount "mi".  The kmem_alloc'd string is returned and its allocated
 * length is stored in *len; the caller is responsible for freeing it
 * with that length.
 */
static char *
nfs_getsrvnames(mntinfo_t *mi, size_t *len)
{
	servinfo_t *s;
	char *srvnames;
	char *namep;
	size_t length;

	/*
	 * Calculate the length of the string required to hold all
	 * of the server names plus either a comma or a null
	 * character following each individual one.
	 */
	length = 0;
	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
		length += s->sv_hostnamelen;

	srvnames = kmem_alloc(length, KM_SLEEP);

	namep = srvnames;
	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
		(void) strcpy(namep, s->sv_hostname);
		/* sv_hostnamelen includes the NUL; overwrite it with ',' */
		namep += s->sv_hostnamelen - 1;
		*namep++ = ',';
	}
	/* Replace the trailing comma with the terminating NUL. */
	*--namep = '\0';

	*len = length;

	return (srvnames);
}

/*
 * These two functions are temporary and designed for the upgrade-workaround
 * only.  They cannot be used for general zone-crossing NFS client support, and
 * will be removed shortly.
 *
 * When the workaround is enabled, all NFS traffic is forced into the global
 * zone.  These functions are called when the code needs to refer to the state
 * of the underlying network connection.  They're not called when the function
 * needs to refer to the state of the process that invoked the system call.
 * (E.g., when checking whether the zone is shutting down during the mount()
 * call.)
 */

struct zone *
nfs_zone(void)
{
	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
}

zoneid_t
nfs_zoneid(void)
{
	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
}

/*
 * nfs_mount_label_policy:
 *	Determine whether the mount is allowed according to MAC check,
 *	by comparing (where appropriate) label of the remote server
 *	against the label of the zone being mounted into.
 *
 * Returns:
 *	 0 :	access allowed
 *	-1 :	read-only access allowed (i.e., read-down)
 *	>0 :	error code, such as EACCES
 */
int
nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr)
{
	int addr_type;
	void *ipaddr;
	bslabel_t *server_sl, *mntlabel;
	zone_t *mntzone = NULL;
	ts_label_t *zlabel;
	tsol_tpc_t *tp;
	ts_label_t *tsl = NULL;
	int retv;

	/*
	 * Get the zone's label.  Each zone on a labeled system has a label.
	 */
	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
	zlabel = mntzone->zone_slabel;
	ASSERT(zlabel != NULL);
	label_hold(zlabel);

	/* Extract the server's IP address according to protocol family. */
	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
		addr_type = IPV4_VERSION;
		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
		addr_type = IPV6_VERSION;
		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
	} else {
		/* Non-IP transport: no label check applies; allow. */
		retv = 0;
		goto out;
	}

	retv = EACCES;		/* assume the worst */

	/*
	 * Next, get the assigned label of the remote server.
	 */
	tp = find_tpc(ipaddr, addr_type, B_FALSE);
	if (tp == NULL)
		goto out;			/* error getting host entry */

	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
		goto rel_tpc;			/* invalid domain */
	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
	    (tp->tpc_tp.host_type != UNLABELED))
		goto rel_tpc;			/* invalid hosttype */

	if (tp->tpc_tp.host_type == SUN_CIPSO) {
		tsl = getflabel_cipso(vfsp);
		if (tsl == NULL)
			goto rel_tpc;		/* error getting server lbl */

		server_sl = label2bslabel(tsl);
	} else {	/* UNLABELED */
		server_sl = &tp->tpc_tp.tp_def_label;
	}

	mntlabel = label2bslabel(zlabel);

	/*
	 * Now compare labels to complete the MAC check.  If the labels
	 * are equal or if the requestor is in the global zone and has
	 * NET_MAC_AWARE, then allow read-write access.  (Except for
	 * mounts into the global zone itself; restrict these to
	 * read-only.)
	 *
	 * If the requestor is in some other zone, but their label
	 * dominates the server, then allow read-down.
	 *
	 * Otherwise, access is denied.
	 */
	if (blequal(mntlabel, server_sl) ||
	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
	    getpflags(NET_MAC_AWARE, cr) != 0)) {
		if ((mntzone == global_zone) ||
		    !blequal(mntlabel, server_sl))
			retv = -1;	/* read-only */
		else
			retv = 0;	/* access OK */
	} else if (bldominates(mntlabel, server_sl)) {
		retv = -1;	/* read-only */
	} else {
		retv = EACCES;
	}

	if (tsl != NULL)
		label_rele(tsl);

rel_tpc:
	TPC_RELE(tp);
out:
	if (mntzone)
		zone_rele(mntzone);
	label_rele(zlabel);
	return (retv);
}

/*
 * Return B_TRUE if the current process has a controlling terminal
 * (i.e., its session has a vnode attached).
 */
boolean_t
nfs_has_ctty(void)
{
	boolean_t rv;
	mutex_enter(&curproc->p_splock);
	rv = (curproc->p_sessp->s_vp != NULL);
	mutex_exit(&curproc->p_splock);
	return (rv);
}

/*
 * Read the given xattr directory vnode and check whether it contains any
 * generic user attributes, i.e. entries other than ".", ".." and the
 * system views VIEW_READWRITE/VIEW_READONLY.  Sets *valp to 1 if such an
 * entry is found, 0 otherwise.
 *
 * Returns 0 on success (including "no attributes"), or the error from
 * VOP_READDIR().  Only the first 8K of directory entries is examined;
 * that is sufficient to decide existence.
 */
int
do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
{
	struct uio uio;
	struct iovec iov;
	char *dbuf;
	struct dirent64 *dp;
	size_t dlen = 8 * 1024;
	size_t dbuflen;
	int eof = 0;
	int error;

	*valp = 0;
	dbuf = kmem_alloc(dlen, KM_SLEEP);
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_resid = dlen;
	iov.iov_base = dbuf;
	iov.iov_len = dlen;
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

	/* Number of bytes of directory entries actually returned. */
	dbuflen = dlen - uio.uio_resid;

	if (error || dbuflen == 0) {
		kmem_free(dbuf, dlen);
		return (error);
	}

	dp = (dirent64_t *)dbuf;

	/* Scan entries, skipping the directory's own and system entries. */
	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
		if (strcmp(dp->d_name, ".") == 0 ||
		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
		    VIEW_READONLY) == 0) {
			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
			continue;
		}

		/* Found a generic user attribute. */
		*valp = 1;
		break;
	}
	kmem_free(dbuf, dlen);
	return (0);
}