/*	$NetBSD: nfs_nfsdcache.c,v 1.2 2013/11/27 17:24:44 christos Exp $	*/
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/* __FBSDID("FreeBSD: head/sys/fs/nfsserver/nfs_nfsdcache.c 254337 2013-08-14 21:11:26Z rmacklem "); */
__RCSID("$NetBSD: nfs_nfsdcache.c,v 1.2 2013/11/27 17:24:44 christos Exp $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after the reply is sent (timestamp).
 * Reference for the UDP case: Chet Juszczak, "Improving the Performance
 *	and Correctness of an NFS Server", in Proc. Winter 1989 USENIX
 *	Conference, pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
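
/*
 * A condensed sketch of the TCP match test described above, as applied
 * to each candidate entry in nfsrc_gettcp() below (names taken from that
 * function; this is an illustration only, not a separate code path, and
 * it omits the in-progress and same-connection/cachetime checks that the
 * notes above also require):
 *
 *	match = newrp->rc_xid == rp->rc_xid &&
 *	    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
 *	    newrp->rc_proc == rp->rc_proc &&
 *	    newrp->rc_reqlen == rp->rc_reqlen &&
 *	    newrp->rc_cksum == rp->rc_cksum;
 *
 * If more than one candidate survives, or a surviving candidate has a
 * seqid# reference (rc_refcnt > 0), the request is treated as a miss.
 */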
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
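	/*
	 * Keep the flood level a further 20% above the new high water
	 * mark, so that saved replies are not refused as soon as the
	 * mark itself is reached.
	 */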
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic (NFSv3-numbered) to Version 2
 * procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};
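
/*
 * Illustration of how this table combines with nfsv2_repstat[] below
 * (procedure numbers per the NFSv2/v3 RFC numberings): a v3-numbered
 * NFSPROC_REMOVE maps through newnfsv2_procid[] to NFSV2PROC_REMOVE,
 * and nfsv2_repstat[NFSV2PROC_REMOVE] is TRUE, so for a v2 request
 * nfsrvd_updatecache() caches just the status word (RC_REPSTATUS)
 * instead of a copy of the whole reply mbuf chain.
 */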

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100
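
/*
 * For example (purely illustrative xid): 0x12345678 hashes as
 * (0x12345678 + (0x12345678 >> 24)) % NFSRVCACHE_HASHSIZE, i.e.
 * (0x12345678 + 0x12) % NFSRVCACHE_HASHSIZE, folding the high-order
 * byte of the xid back in. A retransmission carries the same xid and
 * therefore always lands in the same hash chain as the original
 * request.
 */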

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static void nfsrc_trimcache(u_int64_t, struct socket *);
static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
    struct socket *);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
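
/*
 * Note the locking split implied above: all UDP entries share the
 * single nfsrc_udpmtx, while TCP entries are locked per hash chain,
 * so requests that hash to different chains do not contend.
 */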

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 * Call nfsrc_trimcache() to clean up the cache before returning.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (ret);
}
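
/*
 * A sketch of how a caller is expected to act on the return value
 * (hypothetical caller; the real dispatch lives in the nfsd socket
 * handling code):
 *
 *	switch (nfsrvd_getcache(nd, so)) {
 *	case RC_DOIT:	service the RPC, then call nfsrvd_updatecache();
 *	case RC_REPLY:	send nd->nd_mreq, rebuilt from the cached reply;
 *	case RC_DROPIT:	silently discard the request.
 *	}
 */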

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				newnfsstats.srvcache_tcppeak)
				newnfsstats.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in-progress cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry for nfsrc_activesocket() and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
	tcp_seq tmp_seq;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
		     so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
			mtx_lock(mutex);
			rp->rc_tcpseq = tmp_seq;
			rp->rc_flag |= RC_TCPSEQ;
			mtx_unlock(mutex);
		}
	}
	nfsrc_unlock(rp);
}
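
/*
 * The TCP sequence number recorded by nfsrvd_sentcache() is what
 * nfsrc_activesocket() later checks via nfsrv_checksockseqnum(): per
 * the design notes at the top of this file, once the client has
 * acknowledged data well past the sent reply, the saved reply is no
 * longer needed and can be trimmed.
 */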

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != NULL)
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

/*
 * The basic rule is to get rid of entries that are expired.
 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i, j, k, time_histo[10];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0;

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		for (i = 0; i < 10; i++)
			time_histo[i] = 0;
		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			if (i == 0)
				tcp_lasttrim = NFSD_MONOSEC;
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= nfsrc_tcptimeout)
						j = nfsrc_tcptimeout - 1;
					if (j < 0)
						j = 0;
					j = (j * 10 / nfsrc_tcptimeout) % 10;
					time_histo[j]++;
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    nfsrc_activesocket(rp, sockref, so))
						nfsrc_freecache(rp);
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		j = nfsrc_tcphighwater / 5;	/* 20% of it */
		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < 8; i++) {
				k += time_histo[i];
				if (k > j)
					break;
			}
			k = nfsrc_tcptimeout * (i + 1) / 10;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 nfsrc_activesocket(rp, sockref,
						    so)))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
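
/*
 * Worked example of the second trimming pass (hypothetical values):
 * with nfsrc_tcphighwater = 1000 and nfsrc_tcptimeout = 300s, j = 200,
 * so the pass only runs while more than 800 replies remain saved. The
 * histogram buckets are 30s wide; if the first two buckets together
 * already hold more than 200 entries, the loop stops at i = 1, giving
 * k = 300 * 2 / 10 = 60, and every unreferenced entry stamped to expire
 * within the next 60 seconds is freed.
 */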

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Check to see if the socket is active.
 * Return 1 if the reply has been received/acknowledged by the client,
 * 0 otherwise.
 * XXX - Uses tcp internals.
 */
static int
nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
    struct socket *cur_so)
{
	int ret = 0;

	if (!(rp->rc_flag & RC_TCPSEQ))
		return (ret);
	/*
	 * If the sockref is the same, it is the same TCP connection.
	 */
	if (cur_sockref == rp->rc_sockref)
		ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
	return (ret);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
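
/*
 * nfsrc_gettcp() keys TCP entries with this, via
 *	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep,
 *	    &newrp->rc_cksum);
 * so two requests can only match when both the total XDR length and the
 * checksum over the first NFSRVCACHE_CHECKLEN (100) bytes agree.
 */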

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
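	/*
	 * Currently a no-op. The call sites in nfsrc_gettcp() mark the
	 * points where a retry over the same TCP connection is detected.
	 */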
}