/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>

/* Highest NFSv4 minor version this dispatcher will accept. */
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * This is the duplicate request cache (DRC) for NFSv4.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default size (maximum number of entries) of the
 * duplicate request cache.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.. do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;

/* Sends an NFS4ERR_RESOURCE reply when the DRC cannot supply an entry. */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

/*
 * Initialize a duplicate request cache.
61 */ 62 rfs4_drc_t * 63 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size) 64 { 65 rfs4_drc_t *drc; 66 uint32_t bki; 67 68 ASSERT(drc_size); 69 ASSERT(drc_hash_size); 70 71 drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP); 72 73 drc->max_size = drc_size; 74 drc->in_use = 0; 75 76 mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL); 77 78 drc->dr_hash = drc_hash_size; 79 80 drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP); 81 82 for (bki = 0; bki < drc_hash_size; bki++) { 83 list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t), 84 offsetof(rfs4_dupreq_t, dr_bkt_next)); 85 } 86 87 list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t), 88 offsetof(rfs4_dupreq_t, dr_next)); 89 90 return (drc); 91 } 92 93 /* 94 * Destroy a duplicate request cache. 95 */ 96 void 97 rfs4_fini_drc(rfs4_drc_t *drc) 98 { 99 rfs4_dupreq_t *drp, *drp_next; 100 101 ASSERT(drc); 102 103 /* iterate over the dr_cache and free the enties */ 104 for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) { 105 106 if (drp->dr_state == NFS4_DUP_REPLAY) 107 rfs4_compound_free(&(drp->dr_res)); 108 109 if (drp->dr_addr.buf != NULL) 110 kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen); 111 112 drp_next = list_next(&(drc->dr_cache), drp); 113 114 kmem_free(drp, sizeof (rfs4_dupreq_t)); 115 } 116 117 mutex_destroy(&drc->lock); 118 kmem_free(drc->dr_buckets, 119 sizeof (list_t)*drc->dr_hash); 120 kmem_free(drc, sizeof (rfs4_drc_t)); 121 } 122 123 /* 124 * rfs4_dr_chstate: 125 * 126 * Change the state of a rfs4_dupreq. If it's not in transition 127 * to the FREE state, return. If we are moving to the FREE state 128 * then we need to clean up the compound results and move the entry 129 * to the end of the list. 
 */
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
	rfs4_drc_t *drc;

	ASSERT(drp);
	ASSERT(drp->drc);
	ASSERT(drp->dr_bkt);
	ASSERT(MUTEX_HELD(&drp->drc->lock));

	drp->dr_state = new_state;

	/* Only the transition to FREE needs cleanup work. */
	if (new_state != NFS4_DUP_FREE)
		return;

	drc = drp->drc;

	/*
	 * Remove entry from the bucket and
	 * dr_cache list, free compound results.
	 * The caller re-inserts the entry where it wants it
	 * (e.g. at the tail of dr_cache for reuse).
	 */
	list_remove(drp->dr_bkt, drp);
	list_remove(&(drc->dr_cache), drp);
	rfs4_compound_free(&(drp->dr_res));
}

/*
 * rfs4_alloc_dr:
 *
 * Malloc a new one if we have not reached our maximum cache
 * limit, otherwise pick an entry off the tail -- Use if it
 * is marked as NFS4_DUP_FREE, or is an entry in the
 * NFS4_DUP_REPLAY state.
 *
 * Returns NULL when the cache is full and every entry is in use.
 * Caller must hold drc->lock.
 */
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp_tail, *drp = NULL;

	ASSERT(drc);
	ASSERT(MUTEX_HELD(&drc->lock));

	/*
	 * Have we hit the cache limit yet ?
	 */
	if (drc->in_use < drc->max_size) {
		/*
		 * nope, so let's malloc a new one
		 */
		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
		drp->drc = drc;
		drc->in_use++;
		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
		return (drp);
	}

	/*
	 * Cache is all allocated now traverse the list
	 * backwards to find one we can reuse.
	 * (Tail of dr_cache is the least recently inserted entry.)
	 */
	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

		switch (drp_tail->dr_state) {

		case NFS4_DUP_FREE:
			/* Already off its bucket; just unlink from dr_cache. */
			list_remove(&(drc->dr_cache), drp_tail);
			DTRACE_PROBE1(nfss__i__drc_freeclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */

		case NFS4_DUP_REPLAY:
			/* grab it: FREE transition unlinks and frees results */
			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
			DTRACE_PROBE1(nfss__i__drc_replayclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */
		}
	}
	/* All entries are NEW or INUSE -- nothing reclaimable. */
	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
	return (NULL);
}

/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket. If we find a match, return.
 * Once we have searched the bucket we call rfs4_alloc_dr() to
 * allocate a new entry, or reuse one that is available.
 *
 * Returns one of NFS4_DUP_{REPLAY,PENDING,ERROR,NEW}; *dup is set
 * only for REPLAY and NEW.
 */
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{

	uint32_t	the_xid;
	list_t		*dr_bkt;
	rfs4_dupreq_t	*drp;
	int		bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = (list_t *)
	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);

	DTRACE_PROBE3(nfss__i__drc_bktdex,
	    int, bktdex,
	    uint32_t, the_xid,
	    list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 * Both must match: XIDs can collide across clients.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
	    drp = list_next(dr_bkt, drp)) {

		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match so REPLAY the Reply.
			 * INUSE keeps the entry from being reclaimed
			 * while the caller resends the cached reply.
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
				    rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition (NEW/INUSE),
			 * so return the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use. Upper function
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue. NFSD
	 * threads tied up with native File System, or the cache size
	 * is too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW.
	 */
	drp->dr_state = NFS4_DUP_NEW;

	/*
	 * If needed, resize the address buffer
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		/* KM_NOSLEEP: we hold no lock but must not block dispatch */
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the malloc fails, mark the entry
			 * as free and put on the tail.
			 * NOTE(review): the entry goes back on dr_cache
			 * only, not on a bucket; the FREE reclaim path in
			 * rfs4_alloc_dr() likewise unlinks dr_cache only.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}


	/*
	 * Copy the address.
	 */
	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
	    (caddr_t)drp->dr_addr.buf,
	    drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the drc lists..
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}

/*
 *
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4;
 *
 * Passed into this function are:-
 *
 *	disp	A pointer to our dispatch table entry
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat	reflects the state of the duplicate request that
 *		has been inserted into or retrieved from the cache
 *
 * drp		is the duplicate request entry
 *
 * Returns 0 on success; non-zero when the reply could not be sent
 * (or was deliberately dropped to force a client retransmit).
 */
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
    SVCXPRT *xprt, char *ap)
{

	COMPOUND4res res_buf;
	COMPOUND4res *rbp;
	COMPOUND4args *cap;
	cred_t *cr = NULL;
	int error = 0;
	int dis_flags = 0;
	int dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;
	int rv;

	ASSERT(disp);

	/*
	 * Short circuit the RPC_NULL proc.
	 */
	if (disp->dis_proc == rpc_null) {
		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
		if (!svc_sendreply(xprt, xdr_void, NULL)) {
			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
			svcerr_systemerr(xprt);
			return (1);
		}
		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
		return (0);
	}

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its idempotency.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			/* DRC exhausted: reply NFS4ERR_RESOURCE */
			rfs4_resource_err(req, cap);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
			curthread->t_flag &= ~T_DONTPEND;

			if (rv)	/* short ckt sendreply on error */
				return (rv);

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
		curthread->t_flag &= ~T_DONTPEND;

		if (rv)	/* short ckt sendreply on error */
			return (rv);

		/* dropped reply; client will retransmit */
		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    struct svc_req *, xprt,
		    char *, rbp);
		svcerr_systemerr(xprt);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * or it was replayed from the dup cache; (re)mark it as
	 * available for replay
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, and not the
	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
	 * we do that so that we know not to prematurely reap it while
	 * we resend it to the client.
	 *
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		/* non-cached reply: free the compound results now */
		rfs4_compound_free(rbp);
	}

	return (error);
}

/*
 * Reject a COMPOUND whose minorversion is higher than we support,
 * replying NFS4ERR_MINOR_VERS_MISMATCH ourselves.
 * Returns TRUE if the mismatch was detected and handled here,
 * FALSE if the caller should continue normal dispatch.
 */
bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
	COMPOUND4args *argsp;
	COMPOUND4res res_buf, *resp;

	if (req->rq_vers != 4)
		return (FALSE);

	argsp = (COMPOUND4args *)args;

	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
		return (FALSE);

	resp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 * NOTE(review): a zero-length tag relies on kmem_alloc(0) /
	 * UTF8STRING_FREE handling -- confirm.
	 */
	resp->tag.utf8string_val =
	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
	resp->tag.utf8string_len = argsp->tag.utf8string_len;
	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
	    resp->tag.utf8string_len);
	resp->array_len = 0;
	resp->array = NULL;
	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) {
		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
		    SVCXPRT *, xprt, char *, resp);
		svcerr_systemerr(xprt);
	}
	rfs4_compound_free(resp);
	return (TRUE);
}

/*
 * Reply to a COMPOUND with a one-op result carrying NFS4ERR_RESOURCE
 * (or NFS4ERR_OP_ILLEGAL), used when the DRC cannot supply an entry
 * and we want the client to back off and retransmit.
 */
void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *rbp;
	nfs_resop4 *resop;
	PUTFH4res *resp;

	rbp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	rbp->tag.utf8string_val =
	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
	bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
	    rbp->tag.utf8string_len);

	rbp->array_len = 1;
	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
	    KM_SLEEP);
	resop = &rbp->array[0];
	resop->resop = argsp->array[0].argop;	/* copy first op over */

	/* Any op will do, just need to access status field */
	resp = &resop->nfs_resop4_u.opputfh;

	/*
	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
	 * Note that all op numbers in the compound array were already
	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
	 */
	resp->status = (resop->resop == OP_ILLEGAL ?
	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

	/* compound status is same as first op status */
	rbp->status = resp->status;

	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
		    struct svc_req *, req->rq_xprt, char *, rbp);
		svcerr_systemerr(req->rq_xprt);
	}

	UTF8STRING_FREE(rbp->tag);
	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}