1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2018 Nexenta Systems, Inc.
29  * Copyright 2020 RackTop Systems, Inc.
30  */
31 
32 #include <sys/systm.h>
33 #include <sys/sdt.h>
34 #include <rpc/types.h>
35 #include <rpc/auth.h>
36 #include <rpc/auth_unix.h>
37 #include <rpc/auth_des.h>
38 #include <rpc/svc.h>
39 #include <rpc/xdr.h>
40 #include <nfs/nfs4.h>
41 #include <nfs/nfs_dispatch.h>
42 #include <nfs/nfs4_drc.h>
43 
#define	NFS4_MAX_MINOR_VERSION	2

/*
 * The default size of the duplicate request cache
 * (maximum number of cached replies held at once).
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.. do not change this on the fly.
 * A prime count spreads XIDs evenly across buckets.
 */
uint32_t nfs4_drc_hash = 541;

/* Sends an NFS4ERR_RESOURCE reply when the DRC has no free entry. */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
58 
59 /*
60  * Initialize a duplicate request cache.
61  */
62 rfs4_drc_t *
63 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
64 {
65 	rfs4_drc_t *drc;
66 	uint32_t   bki;
67 
68 	ASSERT(drc_size);
69 	ASSERT(drc_hash_size);
70 
71 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
72 
73 	drc->max_size = drc_size;
74 	drc->in_use = 0;
75 
76 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
77 
78 	drc->dr_hash = drc_hash_size;
79 
80 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
81 
82 	for (bki = 0; bki < drc_hash_size; bki++) {
83 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
84 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
85 	}
86 
87 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
88 	    offsetof(rfs4_dupreq_t, dr_next));
89 
90 	return (drc);
91 }
92 
93 /*
94  * Destroy a duplicate request cache.
95  */
96 void
97 rfs4_fini_drc(void)
98 {
99 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
100 	rfs4_drc_t *drc = nsrv4->nfs4_drc;
101 	rfs4_dupreq_t *drp, *drp_next;
102 
103 	/* iterate over the dr_cache and free the enties */
104 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
105 
106 		if (drp->dr_state == NFS4_DUP_REPLAY)
107 			rfs4_compound_free(&(drp->dr_res));
108 
109 		if (drp->dr_addr.buf != NULL)
110 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
111 
112 		drp_next = list_next(&(drc->dr_cache), drp);
113 
114 		kmem_free(drp, sizeof (rfs4_dupreq_t));
115 	}
116 
117 	mutex_destroy(&drc->lock);
118 	kmem_free(drc->dr_buckets,
119 	    sizeof (list_t)*drc->dr_hash);
120 	kmem_free(drc, sizeof (rfs4_drc_t));
121 }
122 
123 /*
124  * rfs4_dr_chstate:
125  *
126  * Change the state of a rfs4_dupreq. If it's not in transition
127  * to the FREE state, return. If we are moving to the FREE state
128  * then we need to clean up the compound results and move the entry
129  * to the end of the list.
130  */
131 void
132 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
133 {
134 	rfs4_drc_t *drc;
135 
136 	ASSERT(drp);
137 	ASSERT(drp->drc);
138 	ASSERT(drp->dr_bkt);
139 	ASSERT(MUTEX_HELD(&drp->drc->lock));
140 
141 	drp->dr_state = new_state;
142 
143 	if (new_state != NFS4_DUP_FREE)
144 		return;
145 
146 	drc = drp->drc;
147 
148 	/*
149 	 * Remove entry from the bucket and
150 	 * dr_cache list, free compound results.
151 	 */
152 	list_remove(drp->dr_bkt, drp);
153 	list_remove(&(drc->dr_cache), drp);
154 	rfs4_compound_free(&(drp->dr_res));
155 }
156 
157 /*
158  * rfs4_alloc_dr:
159  *
160  * Malloc a new one if we have not reached our maximum cache
161  * limit, otherwise pick an entry off the tail -- Use if it
162  * is marked as NFS4_DUP_FREE, or is an entry in the
163  * NFS4_DUP_REPLAY state.
164  */
165 rfs4_dupreq_t *
166 rfs4_alloc_dr(rfs4_drc_t *drc)
167 {
168 	rfs4_dupreq_t *drp_tail, *drp = NULL;
169 
170 	ASSERT(drc);
171 	ASSERT(MUTEX_HELD(&drc->lock));
172 
173 	/*
174 	 * Have we hit the cache limit yet ?
175 	 */
176 	if (drc->in_use < drc->max_size) {
177 		/*
178 		 * nope, so let's malloc a new one
179 		 */
180 		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
181 		drp->drc = drc;
182 		drc->in_use++;
183 		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
184 		return (drp);
185 	}
186 
187 	/*
188 	 * Cache is all allocated now traverse the list
189 	 * backwards to find one we can reuse.
190 	 */
191 	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
192 	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
193 
194 		switch (drp_tail->dr_state) {
195 
196 		case NFS4_DUP_FREE:
197 			list_remove(&(drc->dr_cache), drp_tail);
198 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
199 			    rfs4_dupreq_t *, drp_tail);
200 			return (drp_tail);
201 			/* NOTREACHED */
202 
203 		case NFS4_DUP_REPLAY:
204 			/* grab it. */
205 			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
206 			DTRACE_PROBE1(nfss__i__drc_replayclaim,
207 			    rfs4_dupreq_t *, drp_tail);
208 			return (drp_tail);
209 			/* NOTREACHED */
210 		}
211 	}
212 	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
213 	return (NULL);
214 }
215 
216 /*
217  * rfs4_find_dr:
218  *
219  * Search for an entry in the duplicate request cache by
220  * calculating the hash index based on the XID, and examining
221  * the entries in the hash bucket. If we find a match, return.
222  * Once we have searched the bucket we call rfs4_alloc_dr() to
223  * allocate a new entry, or reuse one that is available.
224  */
225 int
226 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
227 {
228 
229 	uint32_t	the_xid;
230 	list_t		*dr_bkt;
231 	rfs4_dupreq_t	*drp;
232 	int		bktdex;
233 
234 	/*
235 	 * Get the XID, calculate the bucket and search to
236 	 * see if we need to replay from the cache.
237 	 */
238 	the_xid = req->rq_xprt->xp_xid;
239 	bktdex = the_xid % drc->dr_hash;
240 
241 	dr_bkt = (list_t *)
242 	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
243 
244 	DTRACE_PROBE3(nfss__i__drc_bktdex,
245 	    int, bktdex,
246 	    uint32_t, the_xid,
247 	    list_t *, dr_bkt);
248 
249 	*dup = NULL;
250 
251 	mutex_enter(&drc->lock);
252 	/*
253 	 * Search the bucket for a matching xid and address.
254 	 */
255 	for (drp = list_head(dr_bkt); drp != NULL;
256 	    drp = list_next(dr_bkt, drp)) {
257 
258 		if (drp->dr_xid == the_xid &&
259 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
260 		    bcmp((caddr_t)drp->dr_addr.buf,
261 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
262 		    drp->dr_addr.len) == 0) {
263 
264 			/*
265 			 * Found a match so REPLAY the Reply
266 			 */
267 			if (drp->dr_state == NFS4_DUP_REPLAY) {
268 				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
269 				mutex_exit(&drc->lock);
270 				*dup = drp;
271 				DTRACE_PROBE1(nfss__i__drc_replay,
272 				    rfs4_dupreq_t *, drp);
273 				return (NFS4_DUP_REPLAY);
274 			}
275 
276 			/*
277 			 * This entry must be in transition, so return
278 			 * the 'pending' status.
279 			 */
280 			mutex_exit(&drc->lock);
281 			return (NFS4_DUP_PENDING);
282 		}
283 	}
284 
285 	drp = rfs4_alloc_dr(drc);
286 	mutex_exit(&drc->lock);
287 
288 	/*
289 	 * The DRC is full and all entries are in use. Upper function
290 	 * should error out this request and force the client to
291 	 * retransmit -- effectively this is a resource issue. NFSD
292 	 * threads tied up with native File System, or the cache size
293 	 * is too small for the server load.
294 	 */
295 	if (drp == NULL)
296 		return (NFS4_DUP_ERROR);
297 
298 	/*
299 	 * Init the state to NEW.
300 	 */
301 	drp->dr_state = NFS4_DUP_NEW;
302 
303 	/*
304 	 * If needed, resize the address buffer
305 	 */
306 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
307 		if (drp->dr_addr.buf != NULL)
308 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
309 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
310 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
311 		if (drp->dr_addr.buf == NULL) {
312 			/*
313 			 * If the malloc fails, mark the entry
314 			 * as free and put on the tail.
315 			 */
316 			drp->dr_addr.maxlen = 0;
317 			drp->dr_state = NFS4_DUP_FREE;
318 			mutex_enter(&drc->lock);
319 			list_insert_tail(&(drc->dr_cache), drp);
320 			mutex_exit(&drc->lock);
321 			return (NFS4_DUP_ERROR);
322 		}
323 	}
324 
325 
326 	/*
327 	 * Copy the address.
328 	 */
329 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
330 
331 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
332 	    (caddr_t)drp->dr_addr.buf,
333 	    drp->dr_addr.len);
334 
335 	drp->dr_xid = the_xid;
336 	drp->dr_bkt = dr_bkt;
337 
338 	/*
339 	 * Insert at the head of the bucket and
340 	 * the drc lists..
341 	 */
342 	mutex_enter(&drc->lock);
343 	list_insert_head(&drc->dr_cache, drp);
344 	list_insert_head(dr_bkt, drp);
345 	mutex_exit(&drc->lock);
346 
347 	*dup = drp;
348 
349 	return (NFS4_DUP_NEW);
350 }
351 
/*
 * rfs40_dispatch:
 *
 * This function handles the duplicate request cache and
 * COMPOUND procedure calls for NFSv4.0; the 4.x where x > 0
 * case is handled in rfs4x_dispatch, and the NULL procedure
 * is answered by rfs4_dispatch() before we get here.
 *
 * Passed into this function are:-
 *
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat	reflects the state of the duplicate request that
 *		has been inserted into or retrieved from the cache
 *
 * drp		is the duplicate request entry
 *
 * Returns non-zero when no reply was sent (caller drops the
 * request), or when svc_sendreply() failed.
 */
int
rfs40_dispatch(struct svc_req *req, SVCXPRT *xprt, char *ap)
{

	COMPOUND4res	 res_buf;
	COMPOUND4res	*rbp;
	COMPOUND4args	*cap;
	int		 error = 0;
	int		 dis_flags = 0;
	int		 dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t	*drp = NULL;
	int		 rv;
	struct compound_state cs;
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	rfs4_init_compound_state(&cs);

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record it's IDEMPOTENTCY.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			/* DRC exhausted: tell the client to retransmit */
			rfs4_resource_err(req, cap);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, &cs, req, &rv);
			curthread->t_flag &= ~T_DONTPEND;

			rfs4_fini_compound_state(&cs);

			if (rv)		/* short ckt sendreply on error */
				return (rv);

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				/* reply may be incomplete: do not cache it */
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, &cs, req, &rv);
		curthread->t_flag &= ~T_DONTPEND;

		rfs4_fini_compound_state(&cs);

		if (rv)		/* short ckt sendreply on error */
			return (rv);

		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    struct svc_req *, xprt,
		    char *, rbp);
		svcerr_systemerr(xprt);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * or it was replayed from the dup cache; (re)mark it as
	 * available for replay
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, and not the
	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
	 * we do that so that we know not to prematurely reap it whilst
	 * we resent it to the client.
	 *
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		/* reply was never cached, so free it here */
		rfs4_compound_free(rbp);
	}

	return (error);
}
521 
522 static int
523 rfs4_send_minor_mismatch(SVCXPRT *xprt, COMPOUND4args *argsp)
524 {
525 	COMPOUND4res res_buf, *resp;
526 	int err = 0;
527 
528 	resp = &res_buf;
529 
530 	/*
531 	 * Form a reply tag by copying over the request tag.
532 	 */
533 	resp->tag.utf8string_len = argsp->tag.utf8string_len;
534 	if (argsp->tag.utf8string_len != 0) {
535 		resp->tag.utf8string_val =
536 		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
537 		bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
538 		    resp->tag.utf8string_len);
539 	} else {
540 		resp->tag.utf8string_val = NULL;
541 	}
542 	resp->array_len = 0;
543 	resp->array = NULL;
544 	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
545 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
546 		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
547 		    SVCXPRT *, xprt, char *, resp);
548 		svcerr_systemerr(xprt);
549 		err = 1;
550 	}
551 	rfs4_compound_free(resp);
552 	return (err);
553 }
554 
555 bool_t
556 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
557 {
558 	COMPOUND4args *argsp;
559 
560 	if (req->rq_vers != 4)
561 		return (FALSE);
562 
563 	argsp = (COMPOUND4args *)args;
564 
565 	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
566 		return (FALSE);
567 
568 	(void) rfs4_send_minor_mismatch(xprt, argsp);
569 	return (TRUE);
570 }
571 
572 void
573 rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
574 {
575 	COMPOUND4res res_buf, *rbp;
576 	nfs_resop4 *resop;
577 	PUTFH4res *resp;
578 
579 	rbp = &res_buf;
580 
581 	/*
582 	 * Form a reply tag by copying over the request tag.
583 	 */
584 	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
585 	if (argsp->tag.utf8string_len != 0) {
586 		rbp->tag.utf8string_val =
587 		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
588 		bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
589 		    rbp->tag.utf8string_len);
590 	} else {
591 		rbp->tag.utf8string_val = NULL;
592 	}
593 
594 	rbp->array_len = 1;
595 	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
596 	    KM_SLEEP);
597 	resop = &rbp->array[0];
598 	resop->resop = argsp->array[0].argop;	/* copy first op over */
599 
600 	/* Any op will do, just need to access status field */
601 	resp = &resop->nfs_resop4_u.opputfh;
602 
603 	/*
604 	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
605 	 * Note that all op numbers in the compound array were already
606 	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
607 	 */
608 	resp->status = (resop->resop == OP_ILLEGAL ?
609 	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
610 
611 	/* compound status is same as first op status */
612 	rbp->status = resp->status;
613 
614 	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
615 		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
616 		    struct svc_req *, req->rq_xprt, char *, rbp);
617 		svcerr_systemerr(req->rq_xprt);
618 	}
619 
620 	UTF8STRING_FREE(rbp->tag);
621 	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
622 }
623 
624 /* ARGSUSED */
625 int
626 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
627     SVCXPRT *xprt, char *ap)
628 {
629 	COMPOUND4args	*cmp;
630 	int error = 0;
631 
632 	/*
633 	 * Handle the NULL Proc here
634 	 */
635 	if (req->rq_proc == RFS_NULL) {
636 		return (!svc_sendreply(xprt, xdr_void, NULL));
637 	}
638 
639 	cmp = (COMPOUND4args *)ap;
640 	ASSERT(cmp != NULL);
641 
642 	switch (cmp->minorversion) {
643 	case 1:
644 	case 2:
645 		error = rfs4x_dispatch(req, xprt, ap);
646 		break;
647 
648 	case 0:
649 		error = rfs40_dispatch(req, xprt, ap);
650 		break;
651 
652 	default:
653 		error = rfs4_send_minor_mismatch(xprt, cmp);
654 	}
655 	return (error);
656 }
657