/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>

#define	NFS4_MAX_MINOR_VERSION	2

/*
 * The default size of the duplicate request cache.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the replies into.
 * Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;

static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

/*
 * Initialize a duplicate request cache.
 */
rfs4_drc_t *
rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
{
	rfs4_drc_t *drc;
	uint32_t bki;

	ASSERT(drc_size);
	ASSERT(drc_hash_size);

	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);

	drc->max_size = drc_size;
	drc->in_use = 0;

	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);

	drc->dr_hash = drc_hash_size;

	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);

	for (bki = 0; bki < drc_hash_size; bki++) {
		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
		    offsetof(rfs4_dupreq_t, dr_bkt_next));
	}

	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
	    offsetof(rfs4_dupreq_t, dr_next));

	return (drc);
}

/*
 * Destroy a duplicate request cache.
 */
void
rfs4_fini_drc(void)
{
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *drc = nsrv4->nfs4_drc;
	rfs4_dupreq_t *drp, *drp_next;

	/* iterate over the dr_cache and free the entries */
	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

		if (drp->dr_state == NFS4_DUP_REPLAY)
			rfs4_compound_free(&(drp->dr_res));

		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

		drp_next = list_next(&(drc->dr_cache), drp);

		kmem_free(drp, sizeof (rfs4_dupreq_t));
	}

	mutex_destroy(&drc->lock);
	kmem_free(drc->dr_buckets,
	    sizeof (list_t)*drc->dr_hash);
	kmem_free(drc, sizeof (rfs4_drc_t));
}
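
/*
 * Illustrative sketch (not part of this file's interface): a server
 * startup path would be expected to pair rfs4_init_drc() with
 * rfs4_fini_drc(), sizing the cache from the nfs4_drc_max and
 * nfs4_drc_hash tunables above.  The guard macro and function below
 * are hypothetical and exist only to show the intended wiring.
 */
#ifdef	DRC_USAGE_EXAMPLE
static void
drc_usage_example(void)
{
	nfs4_srv_t *nsrv4 = nfs4_get_srv();

	/* Build the cache once at startup; dr_hash must not change. */
	nsrv4->nfs4_drc = rfs4_init_drc(nfs4_drc_max, nfs4_drc_hash);

	/* ... serve requests ... */

	/* Tear the cache down at shutdown. */
	rfs4_fini_drc();
}
#endif	/* DRC_USAGE_EXAMPLE */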

/*
 * rfs4_dr_chstate:
 *
 * Change the state of an rfs4_dupreq.  If the new state is not
 * NFS4_DUP_FREE, simply record it and return.  If we are moving to
 * the FREE state, we also need to free the compound results and
 * remove the entry from the bucket and dr_cache lists; the caller
 * reinserts it where appropriate.
 */
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
	rfs4_drc_t *drc;

	ASSERT(drp);
	ASSERT(drp->drc);
	ASSERT(drp->dr_bkt);
	ASSERT(MUTEX_HELD(&drp->drc->lock));

	drp->dr_state = new_state;

	if (new_state != NFS4_DUP_FREE)
		return;

	drc = drp->drc;

	/*
	 * Remove entry from the bucket and
	 * dr_cache list, free compound results.
	 */
	list_remove(drp->dr_bkt, drp);
	list_remove(&(drc->dr_cache), drp);
	rfs4_compound_free(&(drp->dr_res));
}

/*
 * rfs4_alloc_dr:
 *
 * Allocate a new entry if we have not yet reached the maximum cache
 * size; otherwise walk the cache list from the tail and reuse the
 * first entry found in the NFS4_DUP_FREE or NFS4_DUP_REPLAY state.
 */
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp_tail, *drp = NULL;

	ASSERT(drc);
	ASSERT(MUTEX_HELD(&drc->lock));

	/*
	 * Have we hit the cache limit yet?
	 */
	if (drc->in_use < drc->max_size) {
		/*
		 * Nope, so let's malloc a new one.
		 */
		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
		drp->drc = drc;
		drc->in_use++;
		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
		return (drp);
	}

	/*
	 * Cache is fully allocated; traverse the list
	 * backwards to find one we can reuse.
	 */
	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

		switch (drp_tail->dr_state) {

		case NFS4_DUP_FREE:
			list_remove(&(drc->dr_cache), drp_tail);
			DTRACE_PROBE1(nfss__i__drc_freeclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */

		case NFS4_DUP_REPLAY:
			/* grab it */
			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
			DTRACE_PROBE1(nfss__i__drc_replayclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */
		}
	}
	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
	return (NULL);
}
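
/*
 * For reference, the entry states used above form a small lifecycle
 * (summarized here from the code in this file):
 *
 *	NFS4_DUP_NEW	just inserted by rfs4_find_dr(); the compound
 *			is being executed
 *	NFS4_DUP_INUSE	a cached reply was found and is being
 *			retransmitted; not eligible for reuse
 *	NFS4_DUP_REPLAY	reply is cached and may be replayed, or the
 *			entry may be reclaimed by rfs4_alloc_dr()
 *	NFS4_DUP_FREE	compound results freed; entry is parked at the
 *			tail of dr_cache awaiting reuse
 */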

/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket.  If we find a match, return.
 * Once we have searched the bucket we call rfs4_alloc_dr() to
 * allocate a new entry, or reuse one that is available.
 */
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{
	uint32_t the_xid;
	list_t *dr_bkt;
	rfs4_dupreq_t *drp;
	int bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = &(drc->dr_buckets[bktdex]);

	DTRACE_PROBE3(nfss__i__drc_bktdex,
	    int, bktdex,
	    uint32_t, the_xid,
	    list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
	    drp = list_next(dr_bkt, drp)) {

		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match, so REPLAY the reply.
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
				    rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition, so return
			 * the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use.  The upper layer
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue: either
	 * NFSD threads are tied up in the underlying file system, or
	 * the cache size is too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW.
	 */
	drp->dr_state = NFS4_DUP_NEW;

	/*
	 * If needed, resize the address buffer.
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the allocation fails, mark the entry
			 * as free and put it on the tail.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}

	/*
	 * Copy the address.
	 */
	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
	    (caddr_t)drp->dr_addr.buf,
	    drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the dr_cache lists.
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}
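
/*
 * Return-value contract of rfs4_find_dr(), as a summary of the code
 * above for the benefit of callers such as rfs40_dispatch() below:
 *
 *	NFS4_DUP_NEW		a fresh entry was inserted; the caller
 *				executes the compound and caches the
 *				reply in (*dup)
 *	NFS4_DUP_REPLAY		a cached reply was found; the caller
 *				resends (*dup)->dr_res and must not
 *				re-execute the operations
 *	NFS4_DUP_PENDING	another thread is still building the
 *				reply; the caller drops this request
 *				and lets the client retransmit
 *	NFS4_DUP_ERROR		the cache is exhausted (or an
 *				allocation failed); the caller answers
 *				with rfs4_resource_err()
 */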

/*
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4.0;
 * the 4.x (x > 0) case is handled in rfs4x_dispatch().
 *
 * Passed into this function are:
 *
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat reflects the state of the duplicate request that
 * has been inserted into or retrieved from the cache.
 *
 * drp is the duplicate request entry.
 */
int
rfs40_dispatch(struct svc_req *req, SVCXPRT *xprt, char *ap)
{
	COMPOUND4res res_buf;
	COMPOUND4res *rbp;
	COMPOUND4args *cap;
	int error = 0;
	int dis_flags = 0;
	int dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;
	int rv;
	struct compound_state cs;
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	rfs4_init_compound_state(&cs);

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its IDEMPOTENCY.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			rfs4_resource_err(req, cap);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, &cs, req, &rv);
			curthread->t_flag &= ~T_DONTPEND;

			rfs4_fini_compound_state(&cs);

			if (rv)	/* short-circuit sendreply on error */
				return (rv);

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate() (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and put it
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, &cs, req, &rv);
		curthread->t_flag &= ~T_DONTPEND;

		rfs4_fini_compound_state(&cs);

		if (rv)	/* short-circuit sendreply on error */
			return (rv);

		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    struct svc_req *, xprt,
		    char *, rbp);
		svcerr_systemerr(xprt);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache,
	 * or it was replayed from the dup cache, (re)mark it as
	 * available for replay.
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ...but notice that we are checking dr_stat, and not the
	 * state of the entry itself.  The entry will be NFS4_DUP_INUSE;
	 * we do that so that we know not to prematurely reap it while
	 * we resend it to the client.
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		rfs4_compound_free(rbp);
	}

	return (error);
}
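
/*
 * The DTRACE_PROBE*() sites in this file are SDT probes, so DRC
 * behavior can be observed from userland.  A sketch (assuming the
 * usual sdt translation of double underscores to dashes in probe
 * names):
 *
 *	dtrace -n 'sdt:::nfss-i-drc_replay { @replays = count(); }' \
 *	       -n 'sdt:::nfss-i-drc_full { @full = count(); }'
 *
 * A steadily growing drc_full count suggests nfs4_drc_max is too
 * small for the server load.
 */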

static int
rfs4_send_minor_mismatch(SVCXPRT *xprt, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *resp;
	int err = 0;

	resp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	resp->tag.utf8string_len = argsp->tag.utf8string_len;
	if (argsp->tag.utf8string_len != 0) {
		resp->tag.utf8string_val =
		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
		bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
		    resp->tag.utf8string_len);
	} else {
		resp->tag.utf8string_val = NULL;
	}
	resp->array_len = 0;
	resp->array = NULL;
	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) {
		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
		    SVCXPRT *, xprt, char *, resp);
		svcerr_systemerr(xprt);
		err = 1;
	}
	rfs4_compound_free(resp);
	return (err);
}

bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
	COMPOUND4args *argsp;

	if (req->rq_vers != 4)
		return (FALSE);

	argsp = (COMPOUND4args *)args;

	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
		return (FALSE);

	(void) rfs4_send_minor_mismatch(xprt, argsp);
	return (TRUE);
}
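
/*
 * Example of the resulting wire behavior: a COMPOUND carrying
 * minorversion 3 (greater than NFS4_MAX_MINOR_VERSION) is answered
 * with status NFS4ERR_MINOR_VERS_MISMATCH and an empty result array,
 * echoing the request tag; none of its operations are dispatched to
 * rfs4_compound().
 */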

void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *rbp;
	nfs_resop4 *resop;
	PUTFH4res *resp;

	rbp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
	if (argsp->tag.utf8string_len != 0) {
		rbp->tag.utf8string_val =
		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
		bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
		    rbp->tag.utf8string_len);
	} else {
		rbp->tag.utf8string_val = NULL;
	}

	rbp->array_len = 1;
	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
	    KM_SLEEP);
	resop = &rbp->array[0];
	resop->resop = argsp->array[0].argop;	/* copy first op over */

	/* Any op will do, just need to access status field */
	resp = &resop->nfs_resop4_u.opputfh;

	/*
	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
	 * Note that all op numbers in the compound array were already
	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
	 */
	resp->status = (resop->resop == OP_ILLEGAL ?
	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

	/* compound status is same as first op status */
	rbp->status = resp->status;

	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
		    struct svc_req *, req->rq_xprt, char *, rbp);
		svcerr_systemerr(req->rq_xprt);
	}

	UTF8STRING_FREE(rbp->tag);
	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}

/* ARGSUSED */
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
    SVCXPRT *xprt, char *ap)
{
	COMPOUND4args *cmp;
	int error = 0;

	/*
	 * Handle the NULL Proc here.
	 */
	if (req->rq_proc == RFS_NULL) {
		return (!svc_sendreply(xprt, xdr_void, NULL));
	}

	cmp = (COMPOUND4args *)ap;
	ASSERT(cmp != NULL);

	switch (cmp->minorversion) {
	case 1:
	case 2:
		error = rfs4x_dispatch(req, xprt, ap);
		break;

	case 0:
		error = rfs40_dispatch(req, xprt, ap);
		break;

	default:
		error = rfs4_send_minor_mismatch(xprt, cmp);
	}
	return (error);
}
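
/*
 * Routing summary for rfs4_dispatch() (restating the switch above):
 * minorversion 0 goes through rfs40_dispatch() and the XID-based DRC
 * in this file, while minorversions 1 and 2 go to rfs4x_dispatch(),
 * where the NFSv4.1 session slot tables provide their own
 * exactly-once reply-cache semantics.  Anything above
 * NFS4_MAX_MINOR_VERSION is refused with a minor-version mismatch.
 */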