/*	$NetBSD: nfs_nfsdcache.c,v 1.2 2013/11/27 17:24:44 christos Exp $	*/
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/* __FBSDID("FreeBSD: head/sys/fs/nfsserver/nfs_nfsdcache.c 254337 2013-08-14 21:11:26Z rmacklem "); */
__RCSID("$NetBSD: nfs_nfsdcache.c,v 1.2 2013/11/27 17:24:44 christos Exp $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *	When a request arrives:
 *		For all that match key
 *		- if RPC# != OR request_size !=
 *			- not a match with this one
 *		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 *			- not a match with this one
 *			(V2,3 clients might retry on same TCP socket)
 *		- calculate checksum on first N bytes of NFS XDR
 *		- if checksum !=
 *			- not a match for this one
 *		If any of the remaining ones that match has a
 *			seqid_refcnt > 0
 *			- not a match (go do RPC, using new cache entry)
 *		If one match left
 *			- a hit (reply from cache)
 *		else
 *			- miss (go do RPC, using new cache entry)
 *
 *	During processing of NFSv4 request:
 *		- set a flag when a non-idempotent Op is processed
 *		- when an Op that uses a seqid# (Open,...) is processed
 *			- if same seqid# as referenced entry in cache
 *				- free new cache entry
 *				- reply from referenced cache entry
 *			  else if next seqid# in order
 *				- free referenced cache entry
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *				  new cache entry (aka reference it)
 *			  else if first seqid# in sequence
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *				  new cache entry (aka reference it)
 *
 *	At end of RPC processing:
 *		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 *			cache entry
 *			- save reply in cache entry
 *			- calculate checksum on first N bytes of NFS XDR
 *			  request
 *			- note op and length of XDR request (in bytes)
 *			- timestamp it
 *		  else
 *			- free new cache entry
 *		- Send reply (noting info for socket activity check, below)
 *
 *	For cache entries saved above:
 *		- if saved since seqid_refcnt was > 0
 *			- free when seqid_refcnt decrements to 0
 *			  (when next one in sequence is processed above, or
 *			   when Openowner/Lockowner is discarded)
 *		  else { non-idempotent Op(s) }
 *			- free when
 *				- some further activity observed on same
 *				  socket
 *				  (I'm not yet sure how I'm going to do
 *				   this. Maybe look at the TCP connection
 *				   to see if the send_tcp_sequence# is well
 *				   past sent reply OR K additional RPCs
 *				   replied on same socket OR?)
 *			OR
 *				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *	- if RPC marked In_progress
 *		- discard request (don't send reply)
 *	  else
 *		- reply from cache
 *		- timestamp cache entry
 *   else
 *	- add entry to cache, marked In_progress
 *	- do RPC
 *	- when RPC done
 *		- if RPC# non-idempotent
 *			- mark entry Done (not In_progress)
 *			- save reply
 *			- timestamp cache entry
 *		  else
 *			- free cache entry
 *		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989, for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 * for TCP.  For V3, a reply won't be saved when the flood level is
 * hit.  For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 * that case.  This level should be set high enough that this almost
 * never happens.
 */
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
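
/*
 * Worked example for the handler above (numbers illustrative): writing
 * 10000 to the sysctl while nfsrc_floodlevel <= 10000 raises the flood
 * level to 12000, i.e. the new high water mark plus 20%, so the hard
 * limit on saved TCP replies always stays above the trim threshold.
 */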
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100
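
/*
 * Example: for xid 0x12345678, nfsrc_hash() computes
 * (0x12345678 + 0x12) % NFSRVCACHE_HASHSIZE; folding the top byte of
 * the xid into the sum spreads clients whose xids differ mostly in the
 * high-order bits across the hash buckets.
 */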

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};
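
/*
 * Note that nfsv2_repstat[] is indexed by the *Version 2* procedure
 * number (via newnfsv2_procid[] above), not the generic one.  The TRUE
 * entries are the V2 procedures whose replies carry nothing but the
 * status word (REMOVE, RENAME, LINK, SYMLINK, RMDIR), so only
 * rc_status needs to be cached for them.
 */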

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static void nfsrc_trimcache(u_int64_t, struct socket *);
static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
    struct socket *);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
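
/*
 * Locking note: all UDP entries share the single nfsrc_udpmtx, while
 * each TCP entry is protected by the mutex of its hash bucket, so
 * lookups on different TCP buckets can proceed concurrently.
 */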

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 * Call nfsrc_trimcache() to clean up the cache before returning.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (ret);
}
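
/*
 * A sketch of the expected calling pattern (caller-side control flow
 * only; the switch below is illustrative, not code from this file):
 *
 *	switch (nfsrvd_getcache(nd, so)) {
 *	case RC_DOIT:	service the RPC, then call nfsrvd_updatecache();
 *	case RC_REPLY:	send the cached reply now in nd->nd_mreq;
 *	case RC_DROPIT:	discard the retry without replying;
 *	}
 */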

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
		if (newrp->rc_xid == rp->rc_xid &&
		    newrp->rc_proc == rp->rc_proc &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    newnfsstats.srvcache_tcppeak)
					newnfsstats.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (retrp);
}
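
/*
 * Note on the return value of nfsrvd_updatecache(): for a TCP entry
 * whose reply was just saved and that has no seqid# reference, the
 * entry is returned still locked; the caller is expected to send the
 * reply and then hand the pointer to nfsrvd_sentcache() below, which
 * records the TCP send sequence number and unlocks the entry.
 */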

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry for nfsrc_activesocket() and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
	tcp_seq tmp_seq;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
		    so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
			mtx_lock(mutex);
			rp->rc_tcpseq = tmp_seq;
			rp->rc_flag |= RC_TCPSEQ;
			mtx_unlock(mutex);
		}
	}
	nfsrc_unlock(rp);
}
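
/*
 * The rc_tcpseq recorded above implements the "socket activity" check
 * from the algorithm description: once the client's TCP stack has
 * acknowledged data past this sequence number, the reply is known to
 * have been received, and nfsrc_activesocket() can let the trimmer
 * free the saved entry.
 */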

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != NULL)
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

/*
 * The basic rule is to get rid of entries that are expired.
 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i, j, k, time_histo[10];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0;

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		for (i = 0; i < 10; i++)
			time_histo[i] = 0;
		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			if (i == 0)
				tcp_lasttrim = NFSD_MONOSEC;
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= nfsrc_tcptimeout)
						j = nfsrc_tcptimeout - 1;
					if (j < 0)
						j = 0;
					j = (j * 10 / nfsrc_tcptimeout) % 10;
					time_histo[j]++;
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    nfsrc_activesocket(rp, sockref, so))
						nfsrc_freecache(rp);
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		j = nfsrc_tcphighwater / 5;	/* 20% of it */
		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < 8; i++) {
				k += time_histo[i];
				if (k > j)
					break;
			}
			k = nfsrc_tcptimeout * (i + 1) / 10;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 nfsrc_activesocket(rp, sockref,
						    so)))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
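
/*
 * A worked example of the second-pass trim above (numbers
 * illustrative): with nfsrc_tcphighwater = 10000 and 9500 saved
 * replies, j = 2000 and 9500 + 2000 > 10000, so the second pass runs.
 * If the cumulative histogram counts first exceed 2000 at bucket
 * i = 2, then k = nfsrc_tcptimeout * 3 / 10 and every unreferenced
 * entry whose timestamp falls within that first ~30% of the timeout
 * window is freed, aiming to bring the cache back under 80% of the
 * high water mark.
 */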

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Check to see if the socket is active.
 * Return 1 if the reply has been received/acknowledged by the client,
 * 0 otherwise.
 * XXX - Uses tcp internals.
 */
static int
nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
    struct socket *cur_so)
{
	int ret = 0;

	if (!(rp->rc_flag & RC_TCPSEQ))
		return (ret);
	/*
	 * If the sockref is the same, it is the same TCP connection.
	 */
	if (cur_sockref == rp->rc_sockref)
		ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
	return (ret);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
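
/*
 * This is the "checksum on first N bytes of NFS XDR" from the
 * algorithm description, with N = NFSRVCACHE_CHECKLEN (100).  Two
 * requests only match in nfsrc_gettcp() if both this checksum and the
 * total XDR length agree, which keeps false hits cheap to avoid
 * without checksumming entire large requests (e.g. writes).
 */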

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}