1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/systm.h>
30 #include <sys/sdt.h>
31 #include <rpc/types.h>
32 #include <rpc/auth.h>
33 #include <rpc/auth_unix.h>
34 #include <rpc/auth_des.h>
35 #include <rpc/svc.h>
36 #include <rpc/xdr.h>
37 #include <nfs/nfs4.h>
38 #include <nfs/nfs_dispatch.h>
39 #include <nfs/nfs4_drc.h>
40 
/*
 * This is the duplicate request cache for NFSv4.  It is NULL until
 * something outside this file assigns it (presumably the result of
 * rfs4_init_drc() -- confirm against the server start-up code).
 * rfs4_dispatch() passes it to rfs4_find_dr() for every COMPOUND
 * that is not flagged idempotent.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default size of the duplicate request cache.
 * NOTE(review): presumably handed to rfs4_init_drc() as drc_size, in
 * which case it is the maximum number of cached entries -- confirm
 * against the caller.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.  Do not change this on the fly.
 * NOTE(review): presumably handed to rfs4_init_drc() as
 * drc_hash_size -- confirm against the caller.
 */
uint32_t nfs4_drc_hash = 541;
56 
57 /*
58  * Initialize a duplicate request cache.
59  */
60 rfs4_drc_t *
61 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
62 {
63 	rfs4_drc_t *drc;
64 	uint32_t   bki;
65 
66 	ASSERT(drc_size);
67 	ASSERT(drc_hash_size);
68 
69 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
70 
71 	drc->max_size = drc_size;
72 	drc->in_use = 0;
73 
74 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
75 
76 	drc->dr_hash = drc_hash_size;
77 
78 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
79 
80 	for (bki = 0; bki < drc_hash_size; bki++) {
81 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
82 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
83 	}
84 
85 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
86 		    offsetof(rfs4_dupreq_t, dr_next));
87 
88 	return (drc);
89 }
90 
91 /*
92  * Destroy a duplicate request cache.
93  */
94 void
95 rfs4_fini_drc(rfs4_drc_t *drc)
96 {
97 	rfs4_dupreq_t *drp, *drp_next;
98 
99 	ASSERT(drc);
100 
101 	/* iterate over the dr_cache and free the enties */
102 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
103 
104 		if (drp->dr_state == NFS4_DUP_REPLAY)
105 			rfs4_compound_free(&(drp->dr_res));
106 
107 		if (drp->dr_addr.buf != NULL)
108 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
109 
110 		drp_next = list_next(&(drc->dr_cache), drp);
111 
112 		kmem_free(drp, sizeof (rfs4_dupreq_t));
113 	}
114 
115 	mutex_destroy(&drc->lock);
116 	kmem_free(drc->dr_buckets,
117 		sizeof (list_t)*drc->dr_hash);
118 	kmem_free(drc, sizeof (rfs4_drc_t));
119 }
120 
121 /*
122  * rfs4_dr_chstate:
123  *
124  * Change the state of a rfs4_dupreq. If it's not in transition
125  * to the FREE state, update the time used and return. If we
126  * are moving to the FREE state then we need to clean up the
127  * compound results and move the entry to the end of the list.
128  */
129 void
130 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
131 {
132 	rfs4_drc_t *drc;
133 
134 	ASSERT(drp);
135 	ASSERT(drp->drc);
136 	ASSERT(drp->dr_bkt);
137 	ASSERT(MUTEX_HELD(&drp->drc->lock));
138 
139 	drp->dr_state = new_state;
140 
141 	if (new_state != NFS4_DUP_FREE) {
142 		gethrestime(&drp->dr_time_used);
143 		return;
144 	}
145 
146 	drc = drp->drc;
147 
148 	/*
149 	 * Remove entry from the bucket and
150 	 * dr_cache list, free compound results.
151 	 */
152 	list_remove(drp->dr_bkt, drp);
153 	list_remove(&(drc->dr_cache), drp);
154 	rfs4_compound_free(&(drp->dr_res));
155 }
156 
157 /*
158  * rfs4_alloc_dr:
159  *
160  * Malloc a new one if we have not reached our maximum cache
161  * limit, otherwise pick an entry off the tail -- Use if it
162  * is marked as NFS4_DUP_FREE, or is an entry in the
163  * NFS4_DUP_REPLAY state.
164  */
165 rfs4_dupreq_t *
166 rfs4_alloc_dr(rfs4_drc_t *drc)
167 {
168 	rfs4_dupreq_t *drp_tail, *drp = NULL;
169 
170 	ASSERT(drc);
171 	ASSERT(MUTEX_HELD(&drc->lock));
172 
173 	/*
174 	 * Have we hit the cache limit yet ?
175 	 */
176 	if (drc->in_use < drc->max_size) {
177 		/*
178 		 * nope, so let's malloc a new one
179 		 */
180 		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
181 		drp->drc = drc;
182 		drc->in_use++;
183 		gethrestime(&drp->dr_time_created);
184 		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
185 		return (drp);
186 	}
187 
188 	/*
189 	 * Cache is all allocated now traverse the list
190 	 * backwards to find one we can reuse.
191 	 */
192 	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
193 	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
194 
195 		switch (drp_tail->dr_state) {
196 
197 		case NFS4_DUP_FREE:
198 			list_remove(&(drc->dr_cache), drp_tail);
199 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
200 					rfs4_dupreq_t *, drp_tail);
201 			return (drp_tail);
202 			/* NOTREACHED */
203 
204 		case NFS4_DUP_REPLAY:
205 			/* grab it. */
206 			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
207 			DTRACE_PROBE1(nfss__i__drc_replayclaim,
208 					rfs4_dupreq_t *, drp_tail);
209 			return (drp_tail);
210 			/* NOTREACHED */
211 		}
212 	}
213 	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
214 	return (NULL);
215 }
216 
217 /*
218  * rfs4_find_dr:
219  *
220  * Search for an entry in the duplicate request cache by
221  * calculating the hash index based on the XID, and examining
222  * the entries in the hash bucket. If we find a match stamp the
223  * time_used and return. If the entry does not match it could be
224  * ready to be freed. Once we have searched the bucket we call
225  * rfs4_alloc_dr() to allocate a new entry, or reuse one that is
226  * available.
227  */
228 int
229 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
230 {
231 
232 	uint32_t	the_xid;
233 	list_t		*dr_bkt;
234 	rfs4_dupreq_t	*drp;
235 	int		bktdex;
236 
237 	/*
238 	 * Get the XID, calculate the bucket and search to
239 	 * see if we need to replay from the cache.
240 	 */
241 	the_xid = req->rq_xprt->xp_xid;
242 	bktdex = the_xid % drc->dr_hash;
243 
244 	dr_bkt = (list_t *)
245 		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);
246 
247 	DTRACE_PROBE3(nfss__i__drc_bktdex,
248 			int, bktdex,
249 			uint32_t, the_xid,
250 			list_t *, dr_bkt);
251 
252 	*dup = NULL;
253 
254 	mutex_enter(&drc->lock);
255 	/*
256 	 * Search the bucket for a matching xid and address.
257 	 */
258 	for (drp = list_head(dr_bkt); drp != NULL;
259 		drp = list_next(dr_bkt, drp)) {
260 
261 		if (drp->dr_xid == the_xid &&
262 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
263 		    bcmp((caddr_t)drp->dr_addr.buf,
264 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
265 		    drp->dr_addr.len) == 0) {
266 
267 			/*
268 			 * Found a match so REPLAY the Reply
269 			 */
270 			if (drp->dr_state == NFS4_DUP_REPLAY) {
271 				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
272 				mutex_exit(&drc->lock);
273 				*dup = drp;
274 				DTRACE_PROBE1(nfss__i__drc_replay,
275 					rfs4_dupreq_t *, drp);
276 				return (NFS4_DUP_REPLAY);
277 			}
278 
279 			/*
280 			 * This entry must be in transition, so return
281 			 * the 'pending' status.
282 			 */
283 			mutex_exit(&drc->lock);
284 			return (NFS4_DUP_PENDING);
285 		}
286 
287 		/*
288 		 * Not a match, but maybe this entry is okay
289 		 * to be reused.
290 		 */
291 		if (drp->dr_state == NFS4_DUP_REPLAY) {
292 			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
293 			list_insert_tail(&(drp->drc->dr_cache), drp);
294 		}
295 	}
296 
297 	drp = rfs4_alloc_dr(drc);
298 	mutex_exit(&drc->lock);
299 
300 	/*
301 	 * The DRC is full and all entries are in use. Upper function
302 	 * should error out this request and force the client to
303 	 * retransmit -- effectively this is a resource issue. NFSD
304 	 * threads tied up with native File System, or the cache size
305 	 * is too small for the server load.
306 	 */
307 	if (drp == NULL)
308 		return (NFS4_DUP_ERROR);
309 
310 	/*
311 	 * Init the state to NEW and clear the time used field.
312 	 */
313 	drp->dr_state = NFS4_DUP_NEW;
314 	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;
315 
316 	/*
317 	 * If needed, resize the address buffer
318 	 */
319 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
320 		if (drp->dr_addr.buf != NULL)
321 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
322 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
323 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
324 		if (drp->dr_addr.buf == NULL) {
325 			/*
326 			 * If the malloc fails, mark the entry
327 			 * as free and put on the tail.
328 			 */
329 			drp->dr_addr.maxlen = 0;
330 			drp->dr_state = NFS4_DUP_FREE;
331 			mutex_enter(&drc->lock);
332 			list_insert_tail(&(drc->dr_cache), drp);
333 			mutex_exit(&drc->lock);
334 			return (NFS4_DUP_ERROR);
335 		}
336 	}
337 
338 
339 	/*
340 	 * Copy the address.
341 	 */
342 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
343 
344 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
345 		(caddr_t)drp->dr_addr.buf,
346 		drp->dr_addr.len);
347 
348 	drp->dr_xid = the_xid;
349 	drp->dr_bkt = dr_bkt;
350 
351 	/*
352 	 * Insert at the head of the bucket and
353 	 * the drc lists..
354 	 */
355 	mutex_enter(&drc->lock);
356 	list_insert_head(&drc->dr_cache, drp);
357 	list_insert_head(dr_bkt, drp);
358 	mutex_exit(&drc->lock);
359 
360 	*dup = drp;
361 
362 	return (NFS4_DUP_NEW);
363 }
364 
365 /*
366  *
367  * This function handles the duplicate request cache,
368  * NULL_PROC and COMPOUND procedure calls for NFSv4;
369  *
370  * Passed into this function are:-
371  *
372  * 	disp	A pointer to our dispatch table entry
373  * 	req	The request to process
374  * 	xprt	The server transport handle
375  * 	ap	A pointer to the arguments
376  *
377  *
378  * When appropriate this function is responsible for inserting
379  * the reply into the duplicate cache or replaying an existing
380  * cached reply.
381  *
382  * dr_stat 	reflects the state of the duplicate request that
383  * 		has been inserted into or retrieved from the cache
384  *
385  * drp		is the duplicate request entry
386  *
387  */
388 int
389 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
390 		SVCXPRT *xprt, char *ap)
391 {
392 
393 	COMPOUND4res res_buf, *rbp;
394 	COMPOUND4args *cap;
395 
396 	cred_t 	*cr = NULL;
397 	int	error = 0;
398 	int 	dis_flags = 0;
399 	int 	dr_stat = NFS4_NOT_DUP;
400 	rfs4_dupreq_t *drp = NULL;
401 
402 	ASSERT(disp);
403 
404 	/*
405 	 * Short circuit the RPC_NULL proc.
406 	 */
407 	if (disp->dis_proc == rpc_null) {
408 		if (!svc_sendreply(xprt, xdr_void, NULL)) {
409 			return (1);
410 		}
411 		return (0);
412 	}
413 
414 	/* Only NFSv4 Compounds from this point onward */
415 
416 	rbp = &res_buf;
417 	cap = (COMPOUND4args *)ap;
418 
419 	/*
420 	 * Figure out the disposition of the whole COMPOUND
421 	 * and record it's IDEMPOTENTCY.
422 	 */
423 	rfs4_compound_flagproc(cap, &dis_flags);
424 
425 	/*
426 	 * If NON-IDEMPOTENT then we need to figure out if this
427 	 * request can be replied from the duplicate cache.
428 	 *
429 	 * If this is a new request then we need to insert the
430 	 * reply into the duplicate cache.
431 	 */
432 	if (!(dis_flags & RPC_IDEMPOTENT)) {
433 		/* look for a replay from the cache or allocate */
434 		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
435 
436 		switch (dr_stat) {
437 
438 		case NFS4_DUP_ERROR:
439 			svcerr_systemerr(xprt);
440 			return (1);
441 			/* NOTREACHED */
442 
443 		case NFS4_DUP_PENDING:
444 			/*
445 			 * reply has previously been inserted into the
446 			 * duplicate cache, however the reply has
447 			 * not yet been sent via svc_sendreply()
448 			 */
449 			return (1);
450 			/* NOTREACHED */
451 
452 		case NFS4_DUP_NEW:
453 			curthread->t_flag |= T_DONTPEND;
454 			/* NON-IDEMPOTENT proc call */
455 			rfs4_compound(cap, rbp, NULL, req, cr);
456 
457 			curthread->t_flag &= ~T_DONTPEND;
458 
459 			/*
460 			 * dr_res must be initialized before calling
461 			 * rfs4_dr_chstate (it frees the reply).
462 			 */
463 			drp->dr_res = res_buf;
464 			if (curthread->t_flag & T_WOULDBLOCK) {
465 				curthread->t_flag &= ~T_WOULDBLOCK;
466 				/*
467 				 * mark this entry as FREE and plop
468 				 * on the end of the cache list
469 				 */
470 				mutex_enter(&drp->drc->lock);
471 				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
472 				list_insert_tail(&(drp->drc->dr_cache), drp);
473 				mutex_exit(&drp->drc->lock);
474 				return (1);
475 			}
476 			break;
477 
478 		case NFS4_DUP_REPLAY:
479 			/* replay from the cache */
480 			rbp = &(drp->dr_res);
481 			break;
482 		}
483 	} else {
484 		curthread->t_flag |= T_DONTPEND;
485 		/* IDEMPOTENT proc call */
486 		rfs4_compound(cap, rbp, NULL, req, cr);
487 
488 		curthread->t_flag &= ~T_DONTPEND;
489 		if (curthread->t_flag & T_WOULDBLOCK) {
490 			curthread->t_flag &= ~T_WOULDBLOCK;
491 			return (1);
492 		}
493 	}
494 
495 	/*
496 	 * Send out the replayed reply or the 'real' one.
497 	 */
498 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
499 		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
500 			struct svc_req *, xprt,
501 			char *, rbp);
502 		error++;
503 	}
504 
505 	/*
506 	 * If this reply was just inserted into the duplicate cache
507 	 * or it was replayed from the dup cache; (re)mark it as
508 	 * available for replay
509 	 *
510 	 * At first glance, this 'if' statement seems a little strange;
511 	 * testing for NFS4_DUP_REPLAY, and then calling...
512 	 *
513 	 *	rfs4_dr_chatate(NFS4_DUP_REPLAY)
514 	 *
515 	 * ... but notice that we are checking dr_stat, and not the
516 	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
517 	 * we do that so that we know not to prematurely reap it whilst
518 	 * we resent it to the client.
519 	 *
520 	 */
521 	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
522 		mutex_enter(&drp->drc->lock);
523 		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
524 		mutex_exit(&drp->drc->lock);
525 	} else if (dr_stat == NFS4_NOT_DUP) {
526 		rfs4_compound_free(rbp);
527 	}
528 
529 	return (error);
530 }
531