xref: /dragonfly/sys/vfs/nfs/nfs_socket.c (revision 8a0bcd56)
1 /*
2  * Copyright (c) 1989, 1991, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
37  * $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $
38  * $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.45 2007/05/18 17:05:13 dillon Exp $
39  */
40 
41 /*
42  * Socket operations for use by nfs
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/proc.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/mbuf.h>
52 #include <sys/vnode.h>
53 #include <sys/fcntl.h>
54 #include <sys/protosw.h>
55 #include <sys/resourcevar.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/socketops.h>
59 #include <sys/syslog.h>
60 #include <sys/thread.h>
61 #include <sys/tprintf.h>
62 #include <sys/sysctl.h>
63 #include <sys/signalvar.h>
64 
65 #include <sys/signal2.h>
66 #include <sys/mutex2.h>
67 #include <sys/socketvar2.h>
68 
69 #include <netinet/in.h>
70 #include <netinet/tcp.h>
71 #include <sys/thread2.h>
72 
73 #include "rpcv2.h"
74 #include "nfsproto.h"
75 #include "nfs.h"
76 #include "xdr_subs.h"
77 #include "nfsm_subs.h"
78 #include "nfsmount.h"
79 #include "nfsnode.h"
80 #include "nfsrtt.h"
81 
82 #define	TRUE	1
83 #define	FALSE	0
84 
85 /*
86  * RTT calculations are scaled by 256 (8 bits).  A proper fractional
87  * RTT will still be calculated even with a slow NFS timer.
88  */
89 #define	NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum]]
90 #define	NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum]]
91 #define NFS_RTT_SCALE_BITS	8	/* bits */
92 #define NFS_RTT_SCALE		256	/* value */
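/*
 * Illustrative arithmetic (derived from the update in nfs_reply()
 * below, not additional logic): a measured RTT of 3 ticks enters the
 * smoothed estimate as 3 << NFS_RTT_SCALE_BITS = 768, and the EWMA
 * update is srtt = (7 * srtt + (rtt << 8)) >> 3, so nm_srtt[] always
 * holds "ticks * 256" and must be shifted right by NFS_RTT_SCALE_BITS
 * before being compared against r_rtt in nfs_timer_req().
 */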
93 
94 /*
95  * Defines which timer to use for the procnum.
96  * 0 - default
97  * 1 - getattr
98  * 2 - lookup
99  * 3 - read
100  * 4 - write, 5 - commit
101  */
102 static int proct[NFS_NPROCS] = {
103 	0, 1, 0, 2, 1, 3, 3, 4, 0, 0,	/* 00-09	*/
104 	0, 0, 0, 0, 0, 0, 3, 3, 0, 0,	/* 10-19	*/
105 	0, 5, 0, 0, 0, 0,		/* 20-25	*/
106 };
107 
108 static int multt[NFS_NPROCS] = {
109 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 00-09	*/
110 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 10-19	*/
111 	1, 2, 1, 1, 1, 1,		/* 20-25	*/
112 };
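/*
 * Example of reading the two tables above: NFSv3 READ is procedure 6,
 * so proct[6] == 3 places all reads on the shared "read" timer, while
 * COMMIT is procedure 21, so proct[21] == 5 and multt[21] == 2 doubles
 * the timeout computed for it in nfs_timer_req().
 */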
113 
114 static int nfs_backoff[8] = { 2, 3, 5, 8, 13, 21, 34, 55 };
115 static int nfs_realign_test;
116 static int nfs_realign_count;
117 static int nfs_showrtt;
118 static int nfs_showrexmit;
119 int nfs_maxasyncbio = NFS_MAXASYNCBIO;
120 
121 SYSCTL_DECL(_vfs_nfs);
122 
123 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
124 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
125 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrtt, CTLFLAG_RW, &nfs_showrtt, 0, "");
126 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrexmit, CTLFLAG_RW, &nfs_showrexmit, 0, "");
127 SYSCTL_INT(_vfs_nfs, OID_AUTO, maxasyncbio, CTLFLAG_RW, &nfs_maxasyncbio, 0, "");
128 
129 static int nfs_request_setup(nfsm_info_t info);
130 static int nfs_request_auth(struct nfsreq *rep);
131 static int nfs_request_try(struct nfsreq *rep);
132 static int nfs_request_waitreply(struct nfsreq *rep);
133 static int nfs_request_processreply(nfsm_info_t info, int);
134 
135 int nfsrtton = 0;
136 struct nfsrtt nfsrtt;
137 struct callout	nfs_timer_handle;
138 
139 static int	nfs_msg (struct thread *,char *,char *);
140 static int	nfs_rcvlock (struct nfsmount *nmp, struct nfsreq *myreq);
141 static void	nfs_rcvunlock (struct nfsmount *nmp);
142 static void	nfs_realign (struct mbuf **pm, int hsiz);
143 static int	nfs_receive (struct nfsmount *nmp, struct nfsreq *rep,
144 				struct sockaddr **aname, struct mbuf **mp);
145 static void	nfs_softterm (struct nfsreq *rep, int islocked);
146 static void	nfs_hardterm (struct nfsreq *rep, int islocked);
147 static int	nfs_reconnect (struct nfsmount *nmp, struct nfsreq *rep);
148 #ifndef NFS_NOSERVER
149 static int	nfsrv_getstream (struct nfssvc_sock *, int, int *);
150 static void	nfs_timer_req(struct nfsreq *req);
151 static void	nfs_checkpkt(struct mbuf *m, int len);
152 
153 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd,
154 				    struct nfssvc_sock *slp,
155 				    struct thread *td,
156 				    struct mbuf **mreqp) = {
157 	nfsrv_null,
158 	nfsrv_getattr,
159 	nfsrv_setattr,
160 	nfsrv_lookup,
161 	nfsrv3_access,
162 	nfsrv_readlink,
163 	nfsrv_read,
164 	nfsrv_write,
165 	nfsrv_create,
166 	nfsrv_mkdir,
167 	nfsrv_symlink,
168 	nfsrv_mknod,
169 	nfsrv_remove,
170 	nfsrv_rmdir,
171 	nfsrv_rename,
172 	nfsrv_link,
173 	nfsrv_readdir,
174 	nfsrv_readdirplus,
175 	nfsrv_statfs,
176 	nfsrv_fsinfo,
177 	nfsrv_pathconf,
178 	nfsrv_commit,
179 	nfsrv_noop,
180 	nfsrv_noop,
181 	nfsrv_noop,
182 	nfsrv_noop
183 };
184 #endif /* NFS_NOSERVER */
185 
186 /*
187  * Initialize sockets and congestion for a new NFS connection.
188  * We do not free the sockaddr on error.
189  */
190 int
191 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
192 {
193 	struct socket *so;
194 	int error;
195 	struct sockaddr *saddr;
196 	struct sockaddr_in *sin;
197 	struct thread *td = &thread0; /* only used for socreate and sobind */
198 
199 	nmp->nm_so = so = NULL;
200 	if (nmp->nm_flag & NFSMNT_FORCE)
201 		return (EINVAL);
202 	saddr = nmp->nm_nam;
203 	error = socreate(saddr->sa_family, &so, nmp->nm_sotype,
204 		nmp->nm_soproto, td);
205 	if (error)
206 		goto bad;
207 	nmp->nm_soflags = so->so_proto->pr_flags;
208 
209 	/*
210 	 * Some servers require that the client port be a reserved port number.
211 	 */
212 	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
213 		struct sockopt sopt;
214 		int ip;
215 		struct sockaddr_in ssin;
216 
217 		bzero(&sopt, sizeof sopt);
218 		ip = IP_PORTRANGE_LOW;
219 		sopt.sopt_level = IPPROTO_IP;
220 		sopt.sopt_name = IP_PORTRANGE;
221 		sopt.sopt_val = (void *)&ip;
222 		sopt.sopt_valsize = sizeof(ip);
223 		sopt.sopt_td = NULL;
224 		error = sosetopt(so, &sopt);
225 		if (error)
226 			goto bad;
227 		bzero(&ssin, sizeof ssin);
228 		sin = &ssin;
229 		sin->sin_len = sizeof (struct sockaddr_in);
230 		sin->sin_family = AF_INET;
231 		sin->sin_addr.s_addr = INADDR_ANY;
232 		sin->sin_port = htons(0);
233 		error = sobind(so, (struct sockaddr *)sin, td);
234 		if (error)
235 			goto bad;
236 		bzero(&sopt, sizeof sopt);
237 		ip = IP_PORTRANGE_DEFAULT;
238 		sopt.sopt_level = IPPROTO_IP;
239 		sopt.sopt_name = IP_PORTRANGE;
240 		sopt.sopt_val = (void *)&ip;
241 		sopt.sopt_valsize = sizeof(ip);
242 		sopt.sopt_td = NULL;
243 		error = sosetopt(so, &sopt);
244 		if (error)
245 			goto bad;
246 	}
247 
248 	/*
249 	 * Protocols that do not require connections may be optionally left
250 	 * unconnected for servers that reply from a port other than NFS_PORT.
251 	 */
252 	if (nmp->nm_flag & NFSMNT_NOCONN) {
253 		if (nmp->nm_soflags & PR_CONNREQUIRED) {
254 			error = ENOTCONN;
255 			goto bad;
256 		}
257 	} else {
258 		error = soconnect(so, nmp->nm_nam, td);
259 		if (error)
260 			goto bad;
261 
262 		/*
263 		 * Wait for the connection to complete. Cribbed from the
264 		 * connect system call but with the wait timing out so
265 		 * that interruptible mounts don't hang here for a long time.
266 		 */
267 		crit_enter();
268 		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
269 			(void) tsleep((caddr_t)&so->so_timeo, 0,
270 				"nfscon", 2 * hz);
271 			if ((so->so_state & SS_ISCONNECTING) &&
272 			    so->so_error == 0 && rep &&
273 			    (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){
274 				soclrstate(so, SS_ISCONNECTING);
275 				crit_exit();
276 				goto bad;
277 			}
278 		}
279 		if (so->so_error) {
280 			error = so->so_error;
281 			so->so_error = 0;
282 			crit_exit();
283 			goto bad;
284 		}
285 		crit_exit();
286 	}
287 	so->so_rcv.ssb_timeo = (5 * hz);
288 	so->so_snd.ssb_timeo = (5 * hz);
289 
290 	/*
291 	 * Get buffer reservation size from sysctl, but impose reasonable
292 	 * limits.
293 	 */
294 	if (nmp->nm_sotype == SOCK_STREAM) {
295 		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
296 			struct sockopt sopt;
297 			int val;
298 
299 			bzero(&sopt, sizeof sopt);
300 			sopt.sopt_level = SOL_SOCKET;
301 			sopt.sopt_name = SO_KEEPALIVE;
302 			sopt.sopt_val = &val;
303 			sopt.sopt_valsize = sizeof val;
304 			val = 1;
305 			sosetopt(so, &sopt);
306 		}
307 		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
308 			struct sockopt sopt;
309 			int val;
310 
311 			bzero(&sopt, sizeof sopt);
312 			sopt.sopt_level = IPPROTO_TCP;
313 			sopt.sopt_name = TCP_NODELAY;
314 			sopt.sopt_val = &val;
315 			sopt.sopt_valsize = sizeof val;
316 			val = 1;
317 			sosetopt(so, &sopt);
318 
319 			bzero(&sopt, sizeof sopt);
320 			sopt.sopt_level = IPPROTO_TCP;
321 			sopt.sopt_name = TCP_FASTKEEP;
322 			sopt.sopt_val = &val;
323 			sopt.sopt_valsize = sizeof val;
324 			val = 1;
325 			sosetopt(so, &sopt);
326 		}
327 	}
328 	error = soreserve(so, nfs_soreserve, nfs_soreserve, NULL);
329 	if (error)
330 		goto bad;
331 	atomic_set_int(&so->so_rcv.ssb_flags, SSB_NOINTR);
332 	atomic_set_int(&so->so_snd.ssb_flags, SSB_NOINTR);
333 
334 	/* Initialize other non-zero congestion variables */
335 	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
336 		nmp->nm_srtt[3] = (NFS_TIMEO << NFS_RTT_SCALE_BITS);
337 	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
338 		nmp->nm_sdrtt[3] = 0;
339 	nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
340 	nmp->nm_timeouts = 0;
341 
342 	/*
343 	 * Assign nm_so last.  The moment nm_so is assigned the nfs_timer()
344 	 * can mess with the socket.
345 	 */
346 	nmp->nm_so = so;
347 	return (0);
348 
349 bad:
350 	if (so) {
351 		soshutdown(so, SHUT_RDWR);
352 		soclose(so, FNONBLOCK);
353 	}
354 	return (error);
355 }
356 
357 /*
358  * Reconnect routine:
359  * Called when a connection is broken on a reliable protocol.
360  * - clean up the old socket
361  * - nfs_connect() again
362  * - set R_NEEDSXMIT for all outstanding requests on mount point
363  * If this fails the mount point is DEAD!
364  * nb: Must be called with the nfs_sndlock() set on the mount point.
365  */
366 static int
367 nfs_reconnect(struct nfsmount *nmp, struct nfsreq *rep)
368 {
369 	struct nfsreq *req;
370 	int error;
371 
372 	nfs_disconnect(nmp);
373 	if (nmp->nm_rxstate >= NFSSVC_STOPPING)
374 		return (EINTR);
375 	while ((error = nfs_connect(nmp, rep)) != 0) {
376 		if (error == EINTR || error == ERESTART)
377 			return (EINTR);
378 		if (error == EINVAL)
379 			return (error);
380 		if (nmp->nm_rxstate >= NFSSVC_STOPPING)
381 			return (EINTR);
382 		(void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
383 	}
384 
385 	/*
386 	 * Loop through outstanding request list and fix up all requests
387 	 * on old socket.
388 	 */
389 	crit_enter();
390 	TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
391 		KKASSERT(req->r_nmp == nmp);
392 		req->r_flags |= R_NEEDSXMIT;
393 	}
394 	crit_exit();
395 	return (0);
396 }
397 
398 /*
399  * NFS disconnect. Clean up and unlink.
400  */
401 void
402 nfs_disconnect(struct nfsmount *nmp)
403 {
404 	struct socket *so;
405 
406 	if (nmp->nm_so) {
407 		so = nmp->nm_so;
408 		nmp->nm_so = NULL;
409 		soshutdown(so, SHUT_RDWR);
410 		soclose(so, FNONBLOCK);
411 	}
412 }
413 
414 void
415 nfs_safedisconnect(struct nfsmount *nmp)
416 {
417 	nfs_rcvlock(nmp, NULL);
418 	nfs_disconnect(nmp);
419 	nfs_rcvunlock(nmp);
420 }
421 
422 /*
423  * This is the nfs send routine. For connection based socket types, it
424  * must be called with an nfs_sndlock() on the socket.
425  * "rep == NULL" indicates that it has been called from a server.
426  * For the client side:
427  * - return EINTR if the RPC is terminated, 0 otherwise
428  * - set R_NEEDSXMIT if the send fails for any reason
429  * - do any cleanup required by recoverable socket errors (?)
430  * For the server side:
431  * - return EINTR or ERESTART if interrupted by a signal
432  * - return EPIPE if a connection is lost for connection based sockets (TCP...)
433  * - do any cleanup required by recoverable socket errors (?)
434  */
435 int
436 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
437 	 struct nfsreq *rep)
438 {
439 	struct sockaddr *sendnam;
440 	int error, soflags, flags;
441 
442 	if (rep) {
443 		if (rep->r_flags & R_SOFTTERM) {
444 			m_freem(top);
445 			return (EINTR);
446 		}
447 		if ((so = rep->r_nmp->nm_so) == NULL) {
448 			rep->r_flags |= R_NEEDSXMIT;
449 			m_freem(top);
450 			return (0);
451 		}
452 		rep->r_flags &= ~R_NEEDSXMIT;
453 		soflags = rep->r_nmp->nm_soflags;
454 	} else {
455 		soflags = so->so_proto->pr_flags;
456 	}
457 	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
458 		sendnam = NULL;
459 	else
460 		sendnam = nam;
461 	if (so->so_type == SOCK_SEQPACKET)
462 		flags = MSG_EOR;
463 	else
464 		flags = 0;
465 
466 	/*
467 	 * calls pru_sosend -> sosend -> so_pru_send -> netrpc
468 	 */
469 	error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags,
470 			      curthread /*XXX*/);
471 	/*
472 	 * ENOBUFS for dgram sockets is transient and non-fatal.
473 	 * No need to log, and no need to break a soft mount.
474 	 */
475 	if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
476 		error = 0;
477 		/*
478 		 * do backoff retransmit on client
479 		 */
480 		if (rep) {
481 			if ((rep->r_nmp->nm_state & NFSSTA_SENDSPACE) == 0) {
482 				rep->r_nmp->nm_state |= NFSSTA_SENDSPACE;
483 				kprintf("Warning: NFS: Insufficient sendspace "
484 					"(%lu),\n"
485 					"\t You must increase vfs.nfs.soreserve "
486 					"or decrease vfs.nfs.maxasyncbio\n",
487 					so->so_snd.ssb_hiwat);
488 			}
489 			rep->r_flags |= R_NEEDSXMIT;
490 		}
491 	}
492 
493 	if (error) {
494 		if (rep) {
495 			log(LOG_INFO, "nfs send error %d for server %s\n",error,
496 			    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
497 			/*
498 			 * Deal with errors for the client side.
499 			 */
500 			if (rep->r_flags & R_SOFTTERM)
501 				error = EINTR;
502 			else
503 				rep->r_flags |= R_NEEDSXMIT;
504 		} else {
505 			log(LOG_INFO, "nfsd send error %d\n", error);
506 		}
507 
508 		/*
509 		 * Handle any recoverable (soft) socket errors here. (?)
510 		 */
511 		if (error != EINTR && error != ERESTART &&
512 			error != EWOULDBLOCK && error != EPIPE)
513 			error = 0;
514 	}
515 	return (error);
516 }
517 
518 /*
519  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
520  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
521  * Mark and consolidate the data into a new mbuf list.
522  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
523  *     small mbufs.
524  * For SOCK_STREAM we must be very careful to read an entire record once
525  * we have read any of it, even if the system call has been interrupted.
526  */
527 static int
528 nfs_receive(struct nfsmount *nmp, struct nfsreq *rep,
529 	    struct sockaddr **aname, struct mbuf **mp)
530 {
531 	struct socket *so;
532 	struct sockbuf sio;
533 	struct uio auio;
534 	struct iovec aio;
535 	struct mbuf *m;
536 	struct mbuf *control;
537 	u_int32_t len;
538 	struct sockaddr **getnam;
539 	int error, sotype, rcvflg;
540 	struct thread *td = curthread;	/* XXX */
541 
542 	/*
543 	 * Set up arguments for soreceive()
544 	 */
545 	*mp = NULL;
546 	*aname = NULL;
547 	sotype = nmp->nm_sotype;
548 
549 	/*
550 	 * For reliable protocols, lock against other senders/receivers
551 	 * in case a reconnect is necessary.
552 	 * For SOCK_STREAM, first get the Record Mark to find out how much
553 	 * more there is to get.
554 	 * We must lock the socket against other receivers
555 	 * until we have an entire rpc request/reply.
556 	 */
557 	if (sotype != SOCK_DGRAM) {
558 		error = nfs_sndlock(nmp, rep);
559 		if (error)
560 			return (error);
561 tryagain:
562 		/*
563 		 * Check for fatal errors and resending request.
564 		 */
565 		/*
566 		 * Ugh: If a reconnect attempt just happened, nm_so
567 		 * would have changed. NULL indicates a failed
568 		 * attempt that has essentially shut down this
569 		 * mount point.
570 		 */
571 		if (rep && (rep->r_mrep || (rep->r_flags & R_SOFTTERM))) {
572 			nfs_sndunlock(nmp);
573 			return (EINTR);
574 		}
575 		so = nmp->nm_so;
576 		if (so == NULL) {
577 			error = nfs_reconnect(nmp, rep);
578 			if (error) {
579 				nfs_sndunlock(nmp);
580 				return (error);
581 			}
582 			goto tryagain;
583 		}
584 		while (rep && (rep->r_flags & R_NEEDSXMIT)) {
585 			m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
586 			nfsstats.rpcretries++;
587 			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
588 			if (error) {
589 				if (error == EINTR || error == ERESTART ||
590 				    (error = nfs_reconnect(nmp, rep)) != 0) {
591 					nfs_sndunlock(nmp);
592 					return (error);
593 				}
594 				goto tryagain;
595 			}
596 		}
597 		nfs_sndunlock(nmp);
598 		if (sotype == SOCK_STREAM) {
599 			/*
600 			 * Get the length marker from the stream
601 			 */
602 			aio.iov_base = (caddr_t)&len;
603 			aio.iov_len = sizeof(u_int32_t);
604 			auio.uio_iov = &aio;
605 			auio.uio_iovcnt = 1;
606 			auio.uio_segflg = UIO_SYSSPACE;
607 			auio.uio_rw = UIO_READ;
608 			auio.uio_offset = 0;
609 			auio.uio_resid = sizeof(u_int32_t);
610 			auio.uio_td = td;
611 			do {
612 			   rcvflg = MSG_WAITALL;
613 			   error = so_pru_soreceive(so, NULL, &auio, NULL,
614 						    NULL, &rcvflg);
615 			   if (error == EWOULDBLOCK && rep) {
616 				if (rep->r_flags & R_SOFTTERM)
617 					return (EINTR);
618 			   }
619 			} while (error == EWOULDBLOCK);
620 
621 			if (error == 0 && auio.uio_resid > 0) {
622 			    /*
623 			     * Only log short packets if not EOF
624 			     */
625 			    if (auio.uio_resid != sizeof(u_int32_t))
626 				    log(LOG_INFO,
627 					"short receive (%d/%d) from nfs server %s\n",
628 					(int)(sizeof(u_int32_t) - auio.uio_resid),
629 					(int)sizeof(u_int32_t),
630 					nmp->nm_mountp->mnt_stat.f_mntfromname);
631 			    error = EPIPE;
632 			}
633 			if (error)
634 				goto errout;
635 			len = ntohl(len) & ~0x80000000;
636 			/*
637 			 * This is SERIOUS! We are out of sync with the sender
638 			 * and forcing a disconnect/reconnect is all I can do.
639 			 */
640 			if (len > NFS_MAXPACKET) {
641 			    log(LOG_ERR, "%s (%d) from nfs server %s\n",
642 				"impossible packet length",
643 				len,
644 				nmp->nm_mountp->mnt_stat.f_mntfromname);
645 			    error = EFBIG;
646 			    goto errout;
647 			}
648 
649 			/*
650 			 * Get the rest of the packet as an mbuf chain
651 			 */
652 			sbinit(&sio, len);
653 			do {
654 			    rcvflg = MSG_WAITALL;
655 			    error = so_pru_soreceive(so, NULL, NULL, &sio,
656 						     NULL, &rcvflg);
657 			} while (error == EWOULDBLOCK || error == EINTR ||
658 				 error == ERESTART);
659 			if (error == 0 && sio.sb_cc != len) {
660 			    if (sio.sb_cc != 0)
661 				    log(LOG_INFO,
662 					"short receive (%zu/%d) from nfs server %s\n",
663 					(size_t)sio.sb_cc, len,
664 					nmp->nm_mountp->mnt_stat.f_mntfromname);
665 			    error = EPIPE;
666 			}
667 			*mp = sio.sb_mb;
668 		} else {
669 			/*
670 			 * Non-stream, so get the whole packet by not
671 			 * specifying MSG_WAITALL and by specifying a large
672 			 * length.
673 			 *
674 			 * We have no use for control messages, but must grab them
675 			 * and then throw them away so we know what is going
676 			 * on.
677 			 */
678 			sbinit(&sio, 100000000);
679 			do {
680 			    rcvflg = 0;
681 			    error =  so_pru_soreceive(so, NULL, NULL, &sio,
682 						      &control, &rcvflg);
683 			    if (control)
684 				m_freem(control);
685 			    if (error == EWOULDBLOCK && rep) {
686 				if (rep->r_flags & R_SOFTTERM) {
687 					m_freem(sio.sb_mb);
688 					return (EINTR);
689 				}
690 			    }
691 			} while (error == EWOULDBLOCK ||
692 				 (error == 0 && sio.sb_mb == NULL && control));
693 			if ((rcvflg & MSG_EOR) == 0)
694 				kprintf("Egad!!\n");
695 			if (error == 0 && sio.sb_mb == NULL)
696 				error = EPIPE;
697 			len = sio.sb_cc;
698 			*mp = sio.sb_mb;
699 		}
700 errout:
701 		if (error && error != EINTR && error != ERESTART) {
702 			m_freem(*mp);
703 			*mp = NULL;
704 			if (error != EPIPE) {
705 				log(LOG_INFO,
706 				    "receive error %d from nfs server %s\n",
707 				    error,
708 				 nmp->nm_mountp->mnt_stat.f_mntfromname);
709 			}
710 			error = nfs_sndlock(nmp, rep);
711 			if (!error) {
712 				error = nfs_reconnect(nmp, rep);
713 				if (!error)
714 					goto tryagain;
715 				else
716 					nfs_sndunlock(nmp);
717 			}
718 		}
719 	} else {
720 		if ((so = nmp->nm_so) == NULL)
721 			return (EACCES);
722 		if (so->so_state & SS_ISCONNECTED)
723 			getnam = NULL;
724 		else
725 			getnam = aname;
726 		sbinit(&sio, 100000000);
727 		do {
728 			rcvflg = 0;
729 			error =  so_pru_soreceive(so, getnam, NULL, &sio,
730 						  NULL, &rcvflg);
731 			if (error == EWOULDBLOCK && rep &&
732 			    (rep->r_flags & R_SOFTTERM)) {
733 				m_freem(sio.sb_mb);
734 				return (EINTR);
735 			}
736 		} while (error == EWOULDBLOCK);
737 
738 		len = sio.sb_cc;
739 		*mp = sio.sb_mb;
740 
741 		/*
742 		 * A shutdown may result in no error and no mbuf.
743 		 * Convert to EPIPE.
744 		 */
745 		if (*mp == NULL && error == 0)
746 			error = EPIPE;
747 	}
748 	if (error) {
749 		m_freem(*mp);
750 		*mp = NULL;
751 	}
752 
753 	/*
754 	 * Search for any mbufs that are not a multiple of 4 bytes long
755 	 * or with m_data not longword aligned.
756 	 * These could cause pointer alignment problems, so copy them to
757 	 * well aligned mbufs.
758 	 */
759 	nfs_realign(mp, 5 * NFSX_UNSIGNED);
760 	return (error);
761 }
762 
763 /*
764  * Implement receipt of reply on a socket.
765  *
766  * We must search through the list of received datagrams matching them
767  * with outstanding requests using the xid, until ours is found.
768  *
769  * If myrep is NULL we process packets on the socket until
770  * interrupted or until nm_reqrxq is non-empty.
771  */
772 /* ARGSUSED */
773 int
774 nfs_reply(struct nfsmount *nmp, struct nfsreq *myrep)
775 {
776 	struct nfsreq *rep;
777 	struct sockaddr *nam;
778 	u_int32_t rxid;
779 	u_int32_t *tl;
780 	int error;
781 	struct nfsm_info info;
782 
783 	/*
784 	 * Loop around until we get our own reply
785 	 */
786 	for (;;) {
787 		/*
788 		 * Lock against other receivers so that I don't get stuck in
789 		 * sbwait() after someone else has received my reply for me.
790 		 * Also necessary for connection based protocols to avoid
791 		 * race conditions during a reconnect.
792 		 *
793 		 * If nfs_rcvlock() returns EALREADY, that means that
794 		 * the reply has already been received by another
795 		 * process and we can return immediately.  In this
796 		 * case, the lock is not taken to avoid races with
797 		 * other processes.
798 		 */
799 		info.mrep = NULL;
800 
801 		error = nfs_rcvlock(nmp, myrep);
802 		if (error == EALREADY)
803 			return (0);
804 		if (error)
805 			return (error);
806 
807 		/*
808 		 * If myrep is NULL we are the receiver helper thread.
809 		 * Stop waiting for incoming replies if there are
810 		 * messages sitting on reqrxq that we need to process,
811 		 * or if a shutdown request is pending.
812 		 */
813 		if (myrep == NULL && (TAILQ_FIRST(&nmp->nm_reqrxq) ||
814 		    nmp->nm_rxstate > NFSSVC_PENDING)) {
815 			nfs_rcvunlock(nmp);
816 			return(EWOULDBLOCK);
817 		}
818 
819 		/*
820 		 * Get the next Rpc reply off the socket
821 		 *
822 		 * We cannot release the receive lock until we've
823 		 * filled in rep->r_mrep, otherwise a waiting
824 		 * thread may deadlock in soreceive with no incoming
825 		 * packets expected.
826 		 */
827 		error = nfs_receive(nmp, myrep, &nam, &info.mrep);
828 		if (error) {
829 			/*
830 			 * Ignore routing errors on connectionless protocols??
831 			 */
832 			nfs_rcvunlock(nmp);
833 			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
834 				if (nmp->nm_so == NULL)
835 					return (error);
836 				nmp->nm_so->so_error = 0;
837 				continue;
838 			}
839 			return (error);
840 		}
841 		if (nam)
842 			kfree(nam, M_SONAME);
843 
844 		/*
845 		 * Get the xid and check that it is an rpc reply
846 		 */
847 		info.md = info.mrep;
848 		info.dpos = mtod(info.md, caddr_t);
849 		NULLOUT(tl = nfsm_dissect(&info, 2*NFSX_UNSIGNED));
850 		rxid = *tl++;
851 		if (*tl != rpc_reply) {
852 			nfsstats.rpcinvalid++;
853 			m_freem(info.mrep);
854 			info.mrep = NULL;
855 nfsmout:
856 			nfs_rcvunlock(nmp);
857 			continue;
858 		}
859 
860 		/*
861 		 * Loop through the request list to match up the reply.
862 		 * Iff no match, just drop the datagram.  On match, set
863 		 * r_mrep atomically to prevent the timer from messing
864 		 * around with the request after we have exited the critical
865 		 * section.
866 		 */
867 		crit_enter();
868 		TAILQ_FOREACH(rep, &nmp->nm_reqq, r_chain) {
869 			if (rep->r_mrep == NULL && rxid == rep->r_xid)
870 				break;
871 		}
872 
873 		/*
874 		 * Fill in the rest of the reply if we found a match.
875 		 *
876 		 * Deal with duplicate responses if there was no match.
877 		 */
878 		if (rep) {
879 			rep->r_md = info.md;
880 			rep->r_dpos = info.dpos;
881 			if (nfsrtton) {
882 				struct rttl *rt;
883 
884 				rt = &nfsrtt.rttl[nfsrtt.pos];
885 				rt->proc = rep->r_procnum;
886 				rt->rto = 0;
887 				rt->sent = 0;
888 				rt->cwnd = nmp->nm_maxasync_scaled;
889 				rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
890 				rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
891 				rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
892 				getmicrotime(&rt->tstamp);
893 				if (rep->r_flags & R_TIMING)
894 					rt->rtt = rep->r_rtt;
895 				else
896 					rt->rtt = 1000000;
897 				nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
898 			}
899 
900 			/*
901 			 * New congestion control is based only on async
902 			 * requests.
903 			 */
904 			if (nmp->nm_maxasync_scaled < NFS_MAXASYNC_SCALED)
905 				++nmp->nm_maxasync_scaled;
906 			if (rep->r_flags & R_SENT) {
907 				rep->r_flags &= ~R_SENT;
908 			}
909 			/*
910 			 * Update rtt using a gain of 0.125 on the mean
911 			 * and a gain of 0.25 on the deviation.
912 			 *
913 			 * NOTE SRTT/SDRTT are only good if R_TIMING is set.
914 			 */
915 			if ((rep->r_flags & R_TIMING) && rep->r_rexmit == 0) {
916 				/*
917 				 * Since the timer resolution of
918 				 * NFS_HZ is so coarse, it can often
919 				 * result in r_rtt == 0. Since
920 				 * r_rtt == N means that the actual
921 				 * rtt is between N+dt and N+2-dt ticks,
922 				 * add 1.
923 				 */
924 				int n;
925 				int d;
926 
927 #define NFSRSB	NFS_RTT_SCALE_BITS
928 				n = ((NFS_SRTT(rep) * 7) +
929 				     (rep->r_rtt << NFSRSB)) >> 3;
930 				d = n - NFS_SRTT(rep);
931 				NFS_SRTT(rep) = n;
932 
933 				/*
934 				 * Don't let the jitter calculation decay
935 				 * too quickly, but we want a fast rampup.
936 				 */
937 				if (d < 0)
938 					d = -d;
939 				d <<= NFSRSB;
940 				if (d < NFS_SDRTT(rep))
941 					n = ((NFS_SDRTT(rep) * 15) + d) >> 4;
942 				else
943 					n = ((NFS_SDRTT(rep) * 3) + d) >> 2;
944 				NFS_SDRTT(rep) = n;
945 #undef NFSRSB
946 			}
947 			nmp->nm_timeouts = 0;
948 			rep->r_mrep = info.mrep;
949 			nfs_hardterm(rep, 0);
950 		} else {
951 			/*
952 			 * Extract vers, prog, nfsver, procnum.  A duplicate
953 			 * response means we didn't wait long enough so
954 			 * we increase the SRTT to avoid future spurious
955 			 * timeouts.
956 			 */
957 			u_int procnum = nmp->nm_lastreprocnum;
958 			int n;
959 
960 			if (procnum < NFS_NPROCS && proct[procnum]) {
961 				if (nfs_showrexmit)
962 					kprintf("D");
963 				n = nmp->nm_srtt[proct[procnum]];
964 				n += NFS_ASYSCALE * NFS_HZ;
965 				if (n < NFS_ASYSCALE * NFS_HZ * 10)
966 					n = NFS_ASYSCALE * NFS_HZ * 10;
967 				nmp->nm_srtt[proct[procnum]] = n;
968 			}
969 		}
970 		nfs_rcvunlock(nmp);
971 		crit_exit();
972 
973 		/*
974 		 * If not matched to a request, drop it.
975 		 * If it's mine, get out.
976 		 */
977 		if (rep == NULL) {
978 			nfsstats.rpcunexpected++;
979 			m_freem(info.mrep);
980 			info.mrep = NULL;
981 		} else if (rep == myrep) {
982 			if (rep->r_mrep == NULL)
983 				panic("nfsreply nil");
984 			return (0);
985 		}
986 	}
987 }
988 
989 /*
990  * Run the request state machine until the target state is reached
991  * or a fatal error occurs.  The target state is not run.  Specifying
992  * a target of NFSM_STATE_DONE runs the state machine until the rpc
993  * is complete.
994  *
995  * EINPROGRESS is returned for all states other than the DONE state,
996  * indicating that the rpc is still in progress.
997  */
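/*
 * Sketch of the usual progression through the switch below:
 *
 *	SETUP -> AUTH -> TRY -> WAITREPLY -> PROCESSREPLY -> DONE
 *
 * PROCESSREPLY may loop back to TRY (EAGAIN, e.g. NFSERR_TRYLATER) or
 * to AUTH (ENEEDAUTH).  An async caller passes estate ==
 * NFSM_STATE_WAITREPLY and gets EINPROGRESS back; the helper threads
 * later resume the machine from the WAITREPLY state.
 */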
998 int
999 nfs_request(struct nfsm_info *info, nfsm_state_t bstate, nfsm_state_t estate)
1000 {
1001 	struct nfsreq *req;
1002 
1003 	while (info->state >= bstate && info->state < estate) {
1004 		switch(info->state) {
1005 		case NFSM_STATE_SETUP:
1006 			/*
1007 			 * Setup the nfsreq.  Any error which occurs during
1008 			 * this state is fatal.
1009 			 */
1010 			info->error = nfs_request_setup(info);
1011 			if (info->error) {
1012 				info->state = NFSM_STATE_DONE;
1013 				return (info->error);
1014 			} else {
1015 				req = info->req;
1016 				req->r_mrp = &info->mrep;
1017 				req->r_mdp = &info->md;
1018 				req->r_dposp = &info->dpos;
1019 				info->state = NFSM_STATE_AUTH;
1020 			}
1021 			break;
1022 		case NFSM_STATE_AUTH:
1023 			/*
1024 			 * Authenticate the nfsreq.  Any error which occurs
1025 			 * during this state is fatal.
1026 			 */
1027 			info->error = nfs_request_auth(info->req);
1028 			if (info->error) {
1029 				info->state = NFSM_STATE_DONE;
1030 				return (info->error);
1031 			} else {
1032 				info->state = NFSM_STATE_TRY;
1033 			}
1034 			break;
1035 		case NFSM_STATE_TRY:
1036 			/*
1037 			 * Transmit or retransmit attempt.  An error in this
1038 			 * state is ignored and we always move on to the
1039 			 * next state.
1040 			 *
1041 			 * This can trivially race the receiver if the
1042 			 * request is asynchronous.  nfs_request_try()
1043 			 * will thus set the state for us and we
1044 			 * must also return immediately if we are
1045 			 * running an async state machine, because
1046 			 * info can become invalid due to races after
1047 			 * try() returns.
1048 			 */
1049 			if (info->req->r_flags & R_ASYNC) {
1050 				nfs_request_try(info->req);
1051 				if (estate == NFSM_STATE_WAITREPLY)
1052 					return (EINPROGRESS);
1053 			} else {
1054 				nfs_request_try(info->req);
1055 				info->state = NFSM_STATE_WAITREPLY;
1056 			}
1057 			break;
1058 		case NFSM_STATE_WAITREPLY:
1059 			/*
1060 			 * Wait for a reply or timeout and move on to the
1061 			 * next state.  The error returned by this state
1062 			 * is passed to the processing code in the next
1063 			 * state.
1064 			 */
1065 			info->error = nfs_request_waitreply(info->req);
1066 			info->state = NFSM_STATE_PROCESSREPLY;
1067 			break;
1068 		case NFSM_STATE_PROCESSREPLY:
1069 			/*
1070 			 * Process the reply or timeout.  Errors which occur
1071 			 * in this state may cause the state machine to
1072 			 * go back to an earlier state, and are fatal
1073 			 * otherwise.
1074 			 */
1075 			info->error = nfs_request_processreply(info,
1076 							       info->error);
1077 			switch(info->error) {
1078 			case ENEEDAUTH:
1079 				info->state = NFSM_STATE_AUTH;
1080 				break;
1081 			case EAGAIN:
1082 				info->state = NFSM_STATE_TRY;
1083 				break;
1084 			default:
1085 				/*
1086 				 * Operation complete, with or without an
1087 				 * error.  We are done.
1088 				 */
1089 				info->req = NULL;
1090 				info->state = NFSM_STATE_DONE;
1091 				return (info->error);
1092 			}
1093 			break;
1094 		case NFSM_STATE_DONE:
1095 			/*
1096 			 * Shouldn't be reached
1097 			 */
1098 			return (info->error);
1099 			/* NOT REACHED */
1100 		}
1101 	}
1102 
1103 	/*
1104 	 * If we are done return the error code (if any).
1105 	 * Otherwise return EINPROGRESS.
1106 	 */
1107 	if (info->state == NFSM_STATE_DONE)
1108 		return (info->error);
1109 	return (EINPROGRESS);
1110 }
1111 
1112 /*
1113  * nfs_request - goes something like this
1114  *	- fill in request struct
1115  *	- links it into list
1116  *	- calls nfs_send() for first transmit
1117  *	- calls nfs_receive() to get reply
1118  *	- break down rpc header and return with nfs reply pointed to
1119  *	  by mrep or error
1120  * nb: always frees up mreq mbuf list
1121  */
1122 static int
1123 nfs_request_setup(nfsm_info_t info)
1124 {
1125 	struct nfsreq *req;
1126 	struct nfsmount *nmp;
1127 	struct mbuf *m;
1128 	int i;
1129 
1130 	/*
1131 	 * Reject requests while attempting a forced unmount.
1132 	 */
1133 	if (info->vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
1134 		m_freem(info->mreq);
1135 		info->mreq = NULL;
1136 		return (ESTALE);
1137 	}
1138 	nmp = VFSTONFS(info->vp->v_mount);
1139 	req = kmalloc(sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
1140 	req->r_nmp = nmp;
1141 	req->r_vp = info->vp;
1142 	req->r_td = info->td;
1143 	req->r_procnum = info->procnum;
1144 	req->r_mreq = NULL;
1145 	req->r_cred = info->cred;
1146 
1147 	i = 0;
1148 	m = info->mreq;
1149 	while (m) {
1150 		i += m->m_len;
1151 		m = m->m_next;
1152 	}
1153 	req->r_mrest = info->mreq;
1154 	req->r_mrest_len = i;
1155 
1156 	/*
1157 	 * The presence of a non-NULL r_info in req indicates
1158 	 * async completion via our helper threads.  See the receiver
1159 	 * code.
1160 	 */
1161 	if (info->bio) {
1162 		req->r_info = info;
1163 		req->r_flags = R_ASYNC;
1164 	} else {
1165 		req->r_info = NULL;
1166 		req->r_flags = 0;
1167 	}
1168 	info->req = req;
1169 	return(0);
1170 }
1171 
1172 static int
1173 nfs_request_auth(struct nfsreq *rep)
1174 {
1175 	struct nfsmount *nmp = rep->r_nmp;
1176 	struct mbuf *m;
1177 	char nickv[RPCX_NICKVERF];
1178 	int error = 0, auth_len, auth_type;
1179 	int verf_len;
1180 	u_int32_t xid;
1181 	char *auth_str, *verf_str;
1182 	struct ucred *cred;
1183 
1184 	cred = rep->r_cred;
1185 	rep->r_failed_auth = 0;
1186 
1187 	/*
1188 	 * Get the RPC header with authorization.
1189 	 */
1190 	verf_str = auth_str = NULL;
1191 	if (nmp->nm_flag & NFSMNT_KERB) {
1192 		verf_str = nickv;
1193 		verf_len = sizeof (nickv);
1194 		auth_type = RPCAUTH_KERB4;
1195 		bzero((caddr_t)rep->r_key, sizeof(rep->r_key));
1196 		if (rep->r_failed_auth ||
1197 		    nfs_getnickauth(nmp, cred, &auth_str, &auth_len,
1198 				    verf_str, verf_len)) {
1199 			error = nfs_getauth(nmp, rep, cred, &auth_str,
1200 				&auth_len, verf_str, &verf_len, rep->r_key);
1201 			if (error) {
1202 				m_freem(rep->r_mrest);
1203 				rep->r_mrest = NULL;
1204 				kfree((caddr_t)rep, M_NFSREQ);
1205 				return (error);
1206 			}
1207 		}
1208 	} else {
1209 		auth_type = RPCAUTH_UNIX;
1210 		if (cred->cr_ngroups < 1)
1211 			panic("nfsreq nogrps");
1212 		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1213 			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1214 			5 * NFSX_UNSIGNED;
1215 	}
1216 	if (rep->r_mrest)
1217 		nfs_checkpkt(rep->r_mrest, rep->r_mrest_len);
1218 	m = nfsm_rpchead(cred, nmp->nm_flag, rep->r_procnum, auth_type,
1219 			auth_len, auth_str, verf_len, verf_str,
1220 			rep->r_mrest, rep->r_mrest_len, &rep->r_mheadend, &xid);
1221 	rep->r_mrest = NULL;
1222 	if (auth_str)
1223 		kfree(auth_str, M_TEMP);
1224 
1225 	/*
1226 	 * For stream protocols, insert a Sun RPC Record Mark.
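	 * The mark is a single 32-bit big-endian word: the high bit flags
	 * the last fragment of the record and the low 31 bits give the
	 * fragment length, so a 100 byte request is preceded by
	 * htonl(0x80000000 | 100).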
1227 	 */
1228 	if (nmp->nm_sotype == SOCK_STREAM) {
1229 		M_PREPEND(m, NFSX_UNSIGNED, MB_WAIT);
1230 		if (m == NULL) {
1231 			kfree(rep, M_NFSREQ);
1232 			return (ENOBUFS);
1233 		}
1234 		*mtod(m, u_int32_t *) = htonl(0x80000000 |
1235 			 (m->m_pkthdr.len - NFSX_UNSIGNED));
1236 	}
1237 
1238 	nfs_checkpkt(m, m->m_pkthdr.len);
1239 
1240 	rep->r_mreq = m;
1241 	rep->r_xid = xid;
1242 	return (0);
1243 }
1244 
1245 static int
1246 nfs_request_try(struct nfsreq *rep)
1247 {
1248 	struct nfsmount *nmp = rep->r_nmp;
1249 	struct mbuf *m2;
1250 	int error;
1251 
1252 	/*
1253 	 * Request is not on any queue, only the owner has access to it
1254 	 * so it should not be locked by anyone atm.
1255 	 *
1256 	 * Interlock to prevent races.  While locked the only remote
1257 	 * action possible is for r_mrep to be set (once we enqueue it).
1258 	 */
1259 	if (rep->r_flags == 0xdeadc0de) {
1260 		print_backtrace(-1);
1261 		panic("flags nbad\n");
1262 	}
1263 	KKASSERT((rep->r_flags & (R_LOCKED | R_ONREQQ)) == 0);
1264 	if (nmp->nm_flag & NFSMNT_SOFT)
1265 		rep->r_retry = nmp->nm_retry;
1266 	else
1267 		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
1268 	rep->r_rtt = rep->r_rexmit = 0;
1269 	if (proct[rep->r_procnum] > 0)
1270 		rep->r_flags |= R_TIMING | R_LOCKED;
1271 	else
1272 		rep->r_flags |= R_LOCKED;
1273 	rep->r_mrep = NULL;
1274 
1275 	/*
1276 	 * Do the client side RPC.
1277 	 */
1278 	nfsstats.rpcrequests++;
1279 
1280 	if (nmp->nm_flag & NFSMNT_FORCE) {
1281 		rep->r_flags |= R_SOFTTERM;
1282 		rep->r_flags &= ~R_LOCKED;
1283 		return (0);
1284 	}
1285 
1286 	/*
1287 	 * Chain request into list of outstanding requests. Be sure
1288 	 * to put it LAST so timer finds oldest requests first.  Note
1289 	 * that our control of R_LOCKED prevents the request from
1290 	 * getting ripped out from under us or transmitted by the
1291 	 * timer code.
1292 	 *
1293 	 * For requests with info structures we must atomically set the
1294 	 * info's state because the structure could become invalid upon
1295 	 * return due to races (i.e., if async)
1296 	 */
1297 	crit_enter();
1298 	mtx_link_init(&rep->r_link);
1299 	KKASSERT((rep->r_flags & R_ONREQQ) == 0);
1300 	TAILQ_INSERT_TAIL(&nmp->nm_reqq, rep, r_chain);
1301 	rep->r_flags |= R_ONREQQ;
1302 	++nmp->nm_reqqlen;
1303 	if (rep->r_flags & R_ASYNC)
1304 		rep->r_info->state = NFSM_STATE_WAITREPLY;
1305 	crit_exit();
1306 
1307 	error = 0;
1308 
1309 	/*
1310 	 * Send if we can.  Congestion control is not handled here any more
1311 	 * because trying to defer the initial send based on the nfs_timer
1312 	 * requires having a very fast nfs_timer, which is silly.
1313 	 */
1314 	if (nmp->nm_so) {
1315 		if (nmp->nm_soflags & PR_CONNREQUIRED)
1316 			error = nfs_sndlock(nmp, rep);
1317 		if (error == 0) {
1318 			m2 = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
1319 			error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1320 			if (nmp->nm_soflags & PR_CONNREQUIRED)
1321 				nfs_sndunlock(nmp);
1322 			rep->r_flags &= ~R_NEEDSXMIT;
1323 			if ((rep->r_flags & R_SENT) == 0) {
1324 				rep->r_flags |= R_SENT;
1325 			}
1326 		} else {
1327 			rep->r_flags |= R_NEEDSXMIT;
1328 		}
1329 	} else {
1330 		rep->r_flags |= R_NEEDSXMIT;
1331 		rep->r_rtt = -1;
1332 	}
1333 	if (error == EPIPE)
1334 		error = 0;
1335 
1336 	/*
1337 	 * Release the lock.  The only remote action that may have occurred
1338 	 * would have been the setting of rep->r_mrep.  If this occurred
1339 	 * and the request was async we have to move it to the reader
1340 	 * thread's queue for action.
1341 	 *
1342 	 * For async requests also make sure the reader is woken up so
1343 	 * it gets on the socket to read responses.
1344 	 */
1345 	crit_enter();
1346 	if (rep->r_flags & R_ASYNC) {
1347 		if (rep->r_mrep)
1348 			nfs_hardterm(rep, 1);
1349 		rep->r_flags &= ~R_LOCKED;
1350 		nfssvc_iod_reader_wakeup(nmp);
1351 	} else {
1352 		rep->r_flags &= ~R_LOCKED;
1353 	}
1354 	if (rep->r_flags & R_WANTED) {
1355 		rep->r_flags &= ~R_WANTED;
1356 		wakeup(rep);
1357 	}
1358 	crit_exit();
1359 	return (error);
1360 }
1361 
1362 /*
1363  * This code is only called for synchronous requests.  Completed synchronous
1364  * requests are left on reqq and we remove them before moving on to the
1365  * processing state.
1366  */
1367 static int
1368 nfs_request_waitreply(struct nfsreq *rep)
1369 {
1370 	struct nfsmount *nmp = rep->r_nmp;
1371 	int error;
1372 
1373 	KKASSERT((rep->r_flags & R_ASYNC) == 0);
1374 
1375 	/*
1376 	 * Wait until the request is finished.
1377 	 */
1378 	error = nfs_reply(nmp, rep);
1379 
1380 	/*
1381 	 * RPC done, unlink the request, but don't rip it out from under
1382 	 * the callout timer.
1383 	 *
1384 	 * Once unlinked no other receiver or the timer will have
1385 	 * visibility, so we do not have to set R_LOCKED.
1386 	 */
1387 	crit_enter();
1388 	while (rep->r_flags & R_LOCKED) {
1389 		rep->r_flags |= R_WANTED;
1390 		tsleep(rep, 0, "nfstrac", 0);
1391 	}
1392 	KKASSERT(rep->r_flags & R_ONREQQ);
1393 	TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
1394 	rep->r_flags &= ~R_ONREQQ;
1395 	--nmp->nm_reqqlen;
1396 	if (TAILQ_FIRST(&nmp->nm_bioq) &&
1397 	    nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
1398 		nfssvc_iod_writer_wakeup(nmp);
1399 	}
1400 	crit_exit();
1401 
1402 	/*
1403 	 * Decrement the outstanding request count.
1404 	 */
1405 	if (rep->r_flags & R_SENT) {
1406 		rep->r_flags &= ~R_SENT;
1407 	}
1408 	return (error);
1409 }
1410 
1411 /*
1412  * Process reply with error returned from nfs_request_waitreply().
1413  *
1414  * Returns EAGAIN if it wants us to loop up to nfs_request_try() again.
1415  * Returns ENEEDAUTH if it wants us to loop up to nfs_request_auth() again.
1416  */
1417 static int
1418 nfs_request_processreply(nfsm_info_t info, int error)
1419 {
1420 	struct nfsreq *req = info->req;
1421 	struct nfsmount *nmp = req->r_nmp;
1422 	u_int32_t *tl;
1423 	int verf_type;
1424 	int i;
1425 
1426 	/*
1427 	 * If there was a successful reply and a tprintf msg had been
1428 	 * printed, tprintf a response noting the server is alive again.
1429 	 */
1430 	if (error == 0 && (req->r_flags & R_TPRINTFMSG)) {
1431 		nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1432 		    "is alive again");
1433 	}
1434 	info->mrep = req->r_mrep;
1435 	info->md = req->r_md;
1436 	info->dpos = req->r_dpos;
1437 	if (error) {
1438 		m_freem(req->r_mreq);
1439 		req->r_mreq = NULL;
1440 		kfree(req, M_NFSREQ);
1441 		info->req = NULL;
1442 		return (error);
1443 	}
1444 
1445 	/*
1446 	 * break down the rpc header and check if ok
1447 	 */
1448 	NULLOUT(tl = nfsm_dissect(info, 3 * NFSX_UNSIGNED));
1449 	if (*tl++ == rpc_msgdenied) {
1450 		if (*tl == rpc_mismatch) {
1451 			error = EOPNOTSUPP;
1452 		} else if ((nmp->nm_flag & NFSMNT_KERB) &&
1453 			   *tl++ == rpc_autherr) {
1454 			if (req->r_failed_auth == 0) {
1455 				req->r_failed_auth++;
1456 				req->r_mheadend->m_next = NULL;
1457 				m_freem(info->mrep);
1458 				info->mrep = NULL;
1459 				m_freem(req->r_mreq);
1460 				req->r_mreq = NULL;
1461 				return (ENEEDAUTH);
1462 			} else {
1463 				error = EAUTH;
1464 			}
1465 		} else {
1466 			error = EACCES;
1467 		}
1468 		m_freem(info->mrep);
1469 		info->mrep = NULL;
1470 		m_freem(req->r_mreq);
1471 		req->r_mreq = NULL;
1472 		kfree(req, M_NFSREQ);
1473 		info->req = NULL;
1474 		return (error);
1475 	}
1476 
1477 	/*
1478 	 * Grab any Kerberos verifier, otherwise just throw it away.
1479 	 */
1480 	verf_type = fxdr_unsigned(int, *tl++);
1481 	i = fxdr_unsigned(int32_t, *tl);
1482 	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1483 		error = nfs_savenickauth(nmp, req->r_cred, i, req->r_key,
1484 					 &info->md, &info->dpos, info->mrep);
1485 		if (error)
1486 			goto nfsmout;
1487 	} else if (i > 0) {
1488 		ERROROUT(nfsm_adv(info, nfsm_rndup(i)));
1489 	}
1490 	NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED));
1491 	/* 0 == ok */
1492 	if (*tl == 0) {
1493 		NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED));
1494 		if (*tl != 0) {
1495 			error = fxdr_unsigned(int, *tl);
1496 
1497 			/*
1498 			 * Does anyone even implement this?  Just impose
1499 			 * a 1-second delay.
1500 			 */
1501 			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1502 				error == NFSERR_TRYLATER) {
1503 				m_freem(info->mrep);
1504 				info->mrep = NULL;
1505 				error = 0;
1506 
1507 				tsleep((caddr_t)&lbolt, 0, "nqnfstry", 0);
1508 				return (EAGAIN);	/* goto tryagain */
1509 			}
1510 
1511 			/*
1512 			 * If the File Handle was stale, invalidate the
1513 			 * lookup cache, just in case.
1514 			 *
1515 			 * To avoid namecache<->vnode deadlocks we must
1516 			 * release the vnode lock if we hold it.
1517 			 */
1518 			if (error == ESTALE) {
1519 				struct vnode *vp = req->r_vp;
1520 				int ltype;
1521 
1522 				ltype = lockstatus(&vp->v_lock, curthread);
1523 				if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1524 					lockmgr(&vp->v_lock, LK_RELEASE);
1525 				cache_inval_vp(vp, CINV_CHILDREN);
1526 				if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1527 					lockmgr(&vp->v_lock, ltype);
1528 			}
1529 			if (nmp->nm_flag & NFSMNT_NFSV3) {
1530 				KKASSERT(*req->r_mrp == info->mrep);
1531 				KKASSERT(*req->r_mdp == info->md);
1532 				KKASSERT(*req->r_dposp == info->dpos);
1533 				error |= NFSERR_RETERR;
1534 			} else {
1535 				m_freem(info->mrep);
1536 				info->mrep = NULL;
1537 			}
1538 			m_freem(req->r_mreq);
1539 			req->r_mreq = NULL;
1540 			kfree(req, M_NFSREQ);
1541 			info->req = NULL;
1542 			return (error);
1543 		}
1544 
1545 		KKASSERT(*req->r_mrp == info->mrep);
1546 		KKASSERT(*req->r_mdp == info->md);
1547 		KKASSERT(*req->r_dposp == info->dpos);
1548 		m_freem(req->r_mreq);
1549 		req->r_mreq = NULL;
1550 		kfree(req, M_NFSREQ);
1551 		return (0);
1552 	}
1553 	m_freem(info->mrep);
1554 	info->mrep = NULL;
1555 	error = EPROTONOSUPPORT;
1556 nfsmout:
1557 	m_freem(req->r_mreq);
1558 	req->r_mreq = NULL;
1559 	kfree(req, M_NFSREQ);
1560 	info->req = NULL;
1561 	return (error);
1562 }
1563 
1564 #ifndef NFS_NOSERVER
1565 /*
1566  * Generate the rpc reply header
1567  * siz arg. is used to decide if adding a cluster is worthwhile
1568  */
1569 int
1570 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
1571 	    int err, struct mbuf **mrq, struct mbuf **mbp, caddr_t *bposp)
1572 {
1573 	u_int32_t *tl;
1574 	struct nfsm_info info;
1575 
1576 	siz += RPC_REPLYSIZ;
1577 	info.mb = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL);
1578 	info.mreq = info.mb;
1579 	info.mreq->m_pkthdr.len = 0;
1580 	/*
1581 	 * If this is not a cluster, try to leave leading space
1582 	 * for the lower level headers.
1583 	 */
1584 	if ((max_hdr + siz) < MINCLSIZE)
1585 		info.mreq->m_data += max_hdr;
1586 	tl = mtod(info.mreq, u_int32_t *);
1587 	info.mreq->m_len = 6 * NFSX_UNSIGNED;
1588 	info.bpos = ((caddr_t)tl) + info.mreq->m_len;
1589 	*tl++ = txdr_unsigned(nd->nd_retxid);
1590 	*tl++ = rpc_reply;
1591 	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1592 		*tl++ = rpc_msgdenied;
1593 		if (err & NFSERR_AUTHERR) {
1594 			*tl++ = rpc_autherr;
1595 			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1596 			info.mreq->m_len -= NFSX_UNSIGNED;
1597 			info.bpos -= NFSX_UNSIGNED;
1598 		} else {
1599 			*tl++ = rpc_mismatch;
1600 			*tl++ = txdr_unsigned(RPC_VER2);
1601 			*tl = txdr_unsigned(RPC_VER2);
1602 		}
1603 	} else {
1604 		*tl++ = rpc_msgaccepted;
1605 
1606 		/*
1607 		 * For Kerberos authentication, we must send the nickname
1608 		 * verifier back, otherwise just RPCAUTH_NULL.
1609 		 */
1610 		if (nd->nd_flag & ND_KERBFULL) {
1611 		    struct nfsuid *nuidp;
1612 		    struct timeval ktvin, ktvout;
1613 
1614 		    for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first;
1615 			nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1616 			if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid &&
1617 			    (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1618 			     &nuidp->nu_haddr, nd->nd_nam2)))
1619 			    break;
1620 		    }
1621 		    if (nuidp) {
1622 			ktvin.tv_sec =
1623 			    txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1624 			ktvin.tv_usec =
1625 			    txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1626 
1627 			/*
1628 			 * Encrypt the timestamp in ecb mode using the
1629 			 * session key.
1630 			 */
1631 #ifdef NFSKERB
1632 			XXX
1633 #else
1634 			ktvout.tv_sec = 0;
1635 			ktvout.tv_usec = 0;
1636 #endif
1637 
1638 			*tl++ = rpc_auth_kerb;
1639 			*tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1640 			*tl = ktvout.tv_sec;
1641 			tl = nfsm_build(&info, 3 * NFSX_UNSIGNED);
1642 			*tl++ = ktvout.tv_usec;
1643 			*tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
1644 		    } else {
1645 			*tl++ = 0;
1646 			*tl++ = 0;
1647 		    }
1648 		} else {
1649 			*tl++ = 0;
1650 			*tl++ = 0;
1651 		}
1652 		switch (err) {
1653 		case EPROGUNAVAIL:
1654 			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
1655 			break;
1656 		case EPROGMISMATCH:
1657 			*tl = txdr_unsigned(RPC_PROGMISMATCH);
1658 			tl = nfsm_build(&info, 2 * NFSX_UNSIGNED);
1659 			*tl++ = txdr_unsigned(2);
1660 			*tl = txdr_unsigned(3);
1661 			break;
1662 		case EPROCUNAVAIL:
1663 			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
1664 			break;
1665 		case EBADRPC:
1666 			*tl = txdr_unsigned(RPC_GARBAGE);
1667 			break;
1668 		default:
1669 			*tl = 0;
1670 			if (err != NFSERR_RETVOID) {
1671 				tl = nfsm_build(&info, NFSX_UNSIGNED);
1672 				if (err)
1673 				    *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1674 				else
1675 				    *tl = 0;
1676 			}
1677 			break;
1678 		}
1679 	}
1680 
1681 	if (mrq != NULL)
1682 	    *mrq = info.mreq;
1683 	*mbp = info.mb;
1684 	*bposp = info.bpos;
1685 	if (err != 0 && err != NFSERR_RETVOID)
1686 		nfsstats.srvrpc_errs++;
1687 	return (0);
1688 }
1689 
1690 
1691 #endif /* NFS_NOSERVER */
1692 
1693 /*
1694  * Nfs timer routine.
1695  *
1696  * Scan the nfsreq list and retransmit any requests that have timed out.
1697  * To avoid retransmission attempts on STREAM sockets (in the future) make
1698  * sure to set the r_retry field to 0 (implies nm_retry == 0).
1699  *
1700  * Requests with attached responses, terminated requests, and
1701  * locked requests are ignored.  Locked requests will be picked up
1702  * in a later timer call.
1703  */
1704 void
1705 nfs_timer_callout(void *arg /* never used */)
1706 {
1707 	struct nfsmount *nmp;
1708 	struct nfsreq *req;
1709 #ifndef NFS_NOSERVER
1710 	struct nfssvc_sock *slp;
1711 	u_quad_t cur_usec;
1712 #endif /* NFS_NOSERVER */
1713 
1714 	lwkt_gettoken(&nfs_token);
1715 	TAILQ_FOREACH(nmp, &nfs_mountq, nm_entry) {
1716 		lwkt_gettoken(&nmp->nm_token);
1717 		TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
1718 			KKASSERT(nmp == req->r_nmp);
1719 			if (req->r_mrep)
1720 				continue;
1721 			if (req->r_flags & (R_SOFTTERM | R_LOCKED))
1722 				continue;
1723 			req->r_flags |= R_LOCKED;
1724 			if (nfs_sigintr(nmp, req, req->r_td)) {
1725 				nfs_softterm(req, 1);
1726 			} else {
1727 				nfs_timer_req(req);
1728 			}
1729 			req->r_flags &= ~R_LOCKED;
1730 			if (req->r_flags & R_WANTED) {
1731 				req->r_flags &= ~R_WANTED;
1732 				wakeup(req);
1733 			}
1734 		}
1735 		lwkt_reltoken(&nmp->nm_token);
1736 	}
1737 #ifndef NFS_NOSERVER
1738 
1739 	/*
1740 	 * Scan the write gathering queues for writes that need to be
1741 	 * completed now.
1742 	 */
1743 	cur_usec = nfs_curusec();
1744 
1745 	TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1746 		/* XXX race against removal */
1747 		if (lwkt_trytoken(&slp->ns_token)) {
1748 			if (slp->ns_tq.lh_first &&
1749 			    (slp->ns_tq.lh_first->nd_time <= cur_usec)) {
1750 				nfsrv_wakenfsd(slp, 1);
1751 			}
1752 			lwkt_reltoken(&slp->ns_token);
1753 		}
1754 	}
1755 #endif /* NFS_NOSERVER */
1756 
1757 	callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer_callout, NULL);
1758 	lwkt_reltoken(&nfs_token);
1759 }
1760 
1761 static
1762 void
1763 nfs_timer_req(struct nfsreq *req)
1764 {
1765 	struct thread *td = &thread0; /* XXX for creds, will break if sleep */
1766 	struct nfsmount *nmp = req->r_nmp;
1767 	struct mbuf *m;
1768 	struct socket *so;
1769 	int timeo;
1770 	int error;
1771 
1772 	/*
1773 	 * rtt ticks and timeout calculation.  Return if the timeout
1774 	 * has not been reached yet, unless the packet is flagged
1775 	 * for an immediate send.
1776 	 *
1777 	 * The mean rtt doesn't help when we get random I/Os; we have
1778 	 * to multiply by fairly large numbers.
1779 	 */
1780 	if (req->r_rtt >= 0) {
1781 		/*
1782 		 * Calculate the timeout to test against.
1783 		 */
1784 		req->r_rtt++;
1785 		if (nmp->nm_flag & NFSMNT_DUMBTIMR) {
1786 			timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
1787 		} else if (req->r_flags & R_TIMING) {
1788 			timeo = NFS_SRTT(req) + NFS_SDRTT(req);
1789 		} else {
1790 			timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
1791 		}
1792 		timeo *= multt[req->r_procnum];
1793 		/* timeo is still scaled by SCALE_BITS */
1794 
1795 #define NFSFS	(NFS_RTT_SCALE * NFS_HZ)
1796 		if (req->r_flags & R_TIMING) {
1797 			static long last_time;
1798 			if (nfs_showrtt && last_time != time_second) {
1799 				kprintf("rpccmd %d NFS SRTT %d SDRTT %d "
1800 					"timeo %d.%03d\n",
1801 					proct[req->r_procnum],
1802 					NFS_SRTT(req), NFS_SDRTT(req),
1803 					timeo / NFSFS,
1804 					timeo % NFSFS * 1000 / NFSFS);
1805 				last_time = time_second;
1806 			}
1807 		}
1808 #undef NFSFS
1809 
1810 		/*
1811 		 * deal with nfs_timer jitter.
1812 		 */
1813 		timeo = (timeo >> NFS_RTT_SCALE_BITS) + 1;
1814 		if (timeo < 2)
1815 			timeo = 2;
1816 
1817 		if (nmp->nm_timeouts > 0)
1818 			timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1819 		if (timeo > NFS_MAXTIMEO)
1820 			timeo = NFS_MAXTIMEO;
1821 		if (req->r_rtt <= timeo) {
1822 			if ((req->r_flags & R_NEEDSXMIT) == 0)
1823 				return;
1824 		} else if (nmp->nm_timeouts < 8) {
1825 			nmp->nm_timeouts++;
1826 		}
1827 	}
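	/*
	 * Worked example of the math above (illustrative values only):
	 * with srtt + sdrtt == 2 << 8 scaled ticks and multt[] == 1, the
	 * base timeout is (512 >> 8) + 1 = 3 ticks; after four prior
	 * timeouts it becomes 3 * nfs_backoff[3] = 24 ticks, clipped to
	 * NFS_MAXTIMEO.
	 */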
1828 
1829 	/*
1830 	 * Check for server not responding
1831 	 */
1832 	if ((req->r_flags & R_TPRINTFMSG) == 0 &&
1833 	     req->r_rexmit > nmp->nm_deadthresh) {
1834 		nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1835 			"not responding");
1836 		req->r_flags |= R_TPRINTFMSG;
1837 	}
1838 	if (req->r_rexmit >= req->r_retry) {	/* too many */
1839 		nfsstats.rpctimeouts++;
1840 		nfs_softterm(req, 1);
1841 		return;
1842 	}
1843 
1844 	/*
1845 	 * Generally disable retransmission on reliable sockets,
1846 	 * unless the request is flagged for immediate send.
1847 	 */
1848 	if (nmp->nm_sotype != SOCK_DGRAM) {
1849 		if (++req->r_rexmit > NFS_MAXREXMIT)
1850 			req->r_rexmit = NFS_MAXREXMIT;
1851 		if ((req->r_flags & R_NEEDSXMIT) == 0)
1852 			return;
1853 	}
1854 
1855 	/*
1856 	 * Stop here if we do not have a socket!
1857 	 */
1858 	if ((so = nmp->nm_so) == NULL)
1859 		return;
1860 
1861 	/*
1862 	 * If there is enough space and the window allows, resend it.
1863 	 *
1864 	 * r_rtt is left intact in case we get an answer after the
1865 	 * retry that was a reply to the original packet.
1866 	 *
1867 	 * NOTE: so_pru_send()
1868 	 */
1869 	if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len &&
1870 	    (req->r_flags & (R_SENT | R_NEEDSXMIT)) &&
1871 	   (m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
1872 		if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1873 		    error = so_pru_send(so, 0, m, NULL, NULL, td);
1874 		else
1875 		    error = so_pru_send(so, 0, m, nmp->nm_nam, NULL, td);
1876 		if (error) {
1877 			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1878 				so->so_error = 0;
1879 			req->r_flags |= R_NEEDSXMIT;
1880 		} else if (req->r_mrep == NULL) {
1881 			/*
1882 			 * If this is the first send, start timing;
1883 			 * otherwise turn timing off, back off the timer,
1884 			 * and divide the congestion window by 2.
1885 			 *
1886 			 * It is possible for the so_pru_send() to
1887 			 * block and for us to race a reply so we
1888 			 * only do this if the reply field has not
1889 			 * been filled in.  R_LOCKED will prevent
1890 			 * the request from being ripped out from under
1891 			 * us entirely.
1892 			 *
1893 			 * Record the last resent procnum to aid us
1894 			 * in duplicate detection on receive.
1895 			 */
1896 			if ((req->r_flags & R_NEEDSXMIT) == 0) {
1897 				if (nfs_showrexmit)
1898 					kprintf("X");
1899 				if (++req->r_rexmit > NFS_MAXREXMIT)
1900 					req->r_rexmit = NFS_MAXREXMIT;
1901 				nmp->nm_maxasync_scaled >>= 1;
1902 				if (nmp->nm_maxasync_scaled < NFS_MINASYNC_SCALED)
1903 					nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
1904 				nfsstats.rpcretries++;
1905 				nmp->nm_lastreprocnum = req->r_procnum;
1906 			} else {
1907 				req->r_flags |= R_SENT;
1908 				req->r_flags &= ~R_NEEDSXMIT;
1909 			}
1910 		}
1911 	}
1912 }
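
/*
 * Illustrative sketch (not compiled): the scaled retransmit timeout
 * derivation used above for a request with RTT timing enabled.  This
 * only restates the timer-path logic; the helper name and framing are
 * hypothetical.
 */
#if 0
static int
nfs_example_timeo(struct nfsreq *req, struct nfsmount *nmp)
{
	int timeo;

	/* smoothed rtt + deviation, still scaled by NFS_RTT_SCALE (256) */
	timeo = NFS_SRTT(req) + NFS_SDRTT(req);
	timeo *= multt[req->r_procnum];		/* per-RPC multiplier */

	/* descale, absorb nfs_timer jitter, enforce a 2 tick minimum */
	timeo = (timeo >> NFS_RTT_SCALE_BITS) + 1;
	if (timeo < 2)
		timeo = 2;

	/* exponential backoff after prior timeouts, capped */
	if (nmp->nm_timeouts > 0)
		timeo *= nfs_backoff[nmp->nm_timeouts - 1];
	if (timeo > NFS_MAXTIMEO)
		timeo = NFS_MAXTIMEO;
	return (timeo);
}
#endif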
1913 
1914 /*
1915  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1916  * wait for all requests to complete. This is used by forced unmounts
1917  * to terminate any outstanding RPCs.
1918  *
1919  * Locked requests cannot be canceled but will be marked for
1920  * soft-termination.
1921  */
1922 int
1923 nfs_nmcancelreqs(struct nfsmount *nmp)
1924 {
1925 	struct nfsreq *req;
1926 	int i;
1927 
1928 	crit_enter();
1929 	TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
1930 		if (req->r_mrep != NULL || (req->r_flags & R_SOFTTERM))
1931 			continue;
1932 		nfs_softterm(req, 0);
1933 	}
1934 	/* XXX  the other two queues as well */
1935 	crit_exit();
1936 
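	/*
	 * Poll for the request queue to drain, sleeping on the
	 * once-a-second lbolt wakeup; give up with EBUSY after ~30
	 * seconds.
	 */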
1937 	for (i = 0; i < 30; i++) {
1938 		crit_enter();
1939 		TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
1940 			if (nmp == req->r_nmp)
1941 				break;
1942 		}
1943 		crit_exit();
1944 		if (req == NULL)
1945 			return (0);
1946 		tsleep(&lbolt, 0, "nfscancel", 0);
1947 	}
1948 	return (EBUSY);
1949 }
1950 
1951 /*
1952  * Soft-terminate a request, effectively marking it as failed.
1953  *
1954  * Must be called from within a critical section.
1955  */
1956 static void
1957 nfs_softterm(struct nfsreq *rep, int islocked)
1958 {
1959 	rep->r_flags |= R_SOFTTERM;
1960 	nfs_hardterm(rep, islocked);
1961 }
1962 
1963 /*
1964  * Hard-terminate a request, typically after getting a response.
1965  *
1966  * The state machine can still decide to re-issue it later if necessary.
1967  *
1968  * Must be called from within a critical section.
1969  */
1970 static void
1971 nfs_hardterm(struct nfsreq *rep, int islocked)
1972 {
1973 	struct nfsmount *nmp = rep->r_nmp;
1974 
1975 	/*
1976 	 * The nm_send count is decremented now to avoid deadlocks
1977 	 * when the process in soreceive() hasn't yet managed to send
1978 	 * its own request.
1979 	 */
1980 	if (rep->r_flags & R_SENT) {
1981 		rep->r_flags &= ~R_SENT;
1982 	}
1983 
1984 	/*
1985 	 * If we locked the request or nobody else has locked the request,
1986 	 * and the request is async, we can move it to the reader thread's
1987 	 * queue now and fix up the state.
1988 	 *
1989 	 * If we locked the request or nobody else has locked the request,
1990 	 * we can wake up anyone blocked waiting for a response on the
1991 	 * request.
1992 	 */
1993 	if (islocked || (rep->r_flags & R_LOCKED) == 0) {
1994 		if ((rep->r_flags & (R_ONREQQ | R_ASYNC)) ==
1995 		    (R_ONREQQ | R_ASYNC)) {
1996 			rep->r_flags &= ~R_ONREQQ;
1997 			TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
1998 			--nmp->nm_reqqlen;
1999 			TAILQ_INSERT_TAIL(&nmp->nm_reqrxq, rep, r_chain);
2000 			KKASSERT(rep->r_info->state == NFSM_STATE_TRY ||
2001 				 rep->r_info->state == NFSM_STATE_WAITREPLY);
2002 			rep->r_info->state = NFSM_STATE_PROCESSREPLY;
2003 			nfssvc_iod_reader_wakeup(nmp);
2004 			if (TAILQ_FIRST(&nmp->nm_bioq) &&
2005 			    nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
2006 				nfssvc_iod_writer_wakeup(nmp);
2007 			}
2008 		}
2009 		mtx_abort_ex_link(&nmp->nm_rxlock, &rep->r_link);
2010 	}
2011 }
2012 
2013 /*
2014  * Test for a termination condition pending on the process.
2015  * This is used for NFSMNT_INT mounts.
2016  */
2017 int
2018 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
2019 {
2020 	sigset_t tmpset;
2021 	struct proc *p;
2022 	struct lwp *lp;
2023 
2024 	if (rep && (rep->r_flags & R_SOFTTERM))
2025 		return (EINTR);
2026 	/* Terminate all requests while attempting a forced unmount. */
2027 	if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
2028 		return (EINTR);
2029 	if (!(nmp->nm_flag & NFSMNT_INT))
2030 		return (0);
2031 	/* td might be NULL YYY */
2032 	if (td == NULL || (p = td->td_proc) == NULL)
2033 		return (0);
2034 
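	/*
	 * Compute the set of pending signals that are neither blocked
	 * nor ignored; interrupt the RPC only if one of them is in the
	 * NFS interrupt set.
	 */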
2035 	lp = td->td_lwp;
2036 	tmpset = lwp_sigpend(lp);
2037 	SIGSETNAND(tmpset, lp->lwp_sigmask);
2038 	SIGSETNAND(tmpset, p->p_sigignore);
2039 	if (SIGNOTEMPTY(tmpset) && NFSINT_SIGMASK(tmpset))
2040 		return (EINTR);
2041 
2042 	return (0);
2043 }
2044 
2045 /*
2046  * Lock a socket against others.
2047  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
2048  * and also to avoid race conditions between the processes with nfs requests
2049  * in progress when a reconnect is necessary.
2050  */
2051 int
2052 nfs_sndlock(struct nfsmount *nmp, struct nfsreq *rep)
2053 {
2054 	mtx_t mtx = &nmp->nm_txlock;
2055 	struct thread *td;
2056 	int slptimeo;
2057 	int slpflag;
2058 	int error;
2059 
2060 	slpflag = 0;
2061 	slptimeo = 0;
2062 	td = rep ? rep->r_td : NULL;
2063 	if (nmp->nm_flag & NFSMNT_INT)
2064 		slpflag = PCATCH;
2065 
2066 	while ((error = mtx_lock_ex_try(mtx)) != 0) {
2067 		if (nfs_sigintr(nmp, rep, td)) {
2068 			error = EINTR;
2069 			break;
2070 		}
2071 		error = mtx_lock_ex(mtx, "nfsndlck", slpflag, slptimeo);
2072 		if (error == 0)
2073 			break;
2074 		if (slpflag == PCATCH) {
2075 			slpflag = 0;
2076 			slptimeo = 2 * hz;
2077 		}
2078 	}
2079 	/* Always fail if our request has been cancelled. */
2080 	if (rep && (rep->r_flags & R_SOFTTERM)) {
2081 		if (error == 0)
2082 			mtx_unlock(mtx);
2083 		error = EINTR;
2084 	}
2085 	return (error);
2086 }
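
/*
 * Illustrative sketch (not compiled): the expected caller pattern for
 * the send lock.  The transmit step is elided/hypothetical; only the
 * lock/unlock pairing and the EINTR convention come from the functions
 * here.
 */
#if 0
	error = nfs_sndlock(nmp, rep);
	if (error == 0) {
		/* ... transmit the request on nmp->nm_so ... */
		nfs_sndunlock(nmp);
	}
	/* on EINTR the request was interrupted or soft-terminated */
#endif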
2087 
2088 /*
2089  * Unlock the stream socket for others.
2090  */
2091 void
2092 nfs_sndunlock(struct nfsmount *nmp)
2093 {
2094 	mtx_unlock(&nmp->nm_txlock);
2095 }
2096 
2097 /*
2098  * Lock the receiver side of the socket.
2099  *
2100  * rep may be NULL.
2101  */
2102 static int
2103 nfs_rcvlock(struct nfsmount *nmp, struct nfsreq *rep)
2104 {
2105 	mtx_t mtx = &nmp->nm_rxlock;
2106 	int slpflag;
2107 	int slptimeo;
2108 	int error;
2109 
2110 	/*
2111 	 * Unconditionally check for completion in case another nfsiod
2112 	 * got the packet while the caller was blocked, before the caller
2113 	 * called us.  Packet reception is handled by mainline code which
2114 	 * is protected by the BGL at the moment.
2115 	 *
2116 	 * We do not strictly need the second check just before the
2117 	 * tsleep(), but it's good defensive programming.
2118 	 */
2119 	if (rep && rep->r_mrep != NULL)
2120 		return (EALREADY);
2121 
2122 	if (nmp->nm_flag & NFSMNT_INT)
2123 		slpflag = PCATCH;
2124 	else
2125 		slpflag = 0;
2126 	slptimeo = 0;
2127 
2128 	while ((error = mtx_lock_ex_try(mtx)) != 0) {
2129 		if (nfs_sigintr(nmp, rep, (rep ? rep->r_td : NULL))) {
2130 			error = EINTR;
2131 			break;
2132 		}
2133 		if (rep && rep->r_mrep != NULL) {
2134 			error = EALREADY;
2135 			break;
2136 		}
2137 
2138 		/*
2139 		 * NOTE: can return ENOLCK, but in that case rep->r_mrep
2140 		 *       will already be set.
2141 		 */
2142 		if (rep) {
2143 			error = mtx_lock_ex_link(mtx, &rep->r_link,
2144 						 "nfsrcvlk",
2145 						 slpflag, slptimeo);
2146 		} else {
2147 			error = mtx_lock_ex(mtx, "nfsrcvlk", slpflag, slptimeo);
2148 		}
2149 		if (error == 0)
2150 			break;
2151 
2152 		/*
2153 		 * If our reply was received while we were sleeping,
2154 		 * then just return without taking the lock to avoid a
2155 		 * situation where a single iod could 'capture' the
2156 		 * receive lock.
2157 		 */
2158 		if (rep && rep->r_mrep != NULL) {
2159 			error = EALREADY;
2160 			break;
2161 		}
2162 		if (slpflag == PCATCH) {
2163 			slpflag = 0;
2164 			slptimeo = 2 * hz;
2165 		}
2166 	}
2167 	if (error == 0) {
2168 		if (rep && rep->r_mrep != NULL) {
2169 			error = EALREADY;
2170 			mtx_unlock(mtx);
2171 		}
2172 	}
2173 	return (error);
2174 }
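
/*
 * Illustrative note (not from the original comments): callers treat
 * EALREADY as "reply already present in rep->r_mrep"; the receive lock
 * is NOT held in that case, so they must not call nfs_rcvunlock().
 */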
2175 
2176 /*
2177  * Unlock the stream socket for others.
2178  */
2179 static void
2180 nfs_rcvunlock(struct nfsmount *nmp)
2181 {
2182 	mtx_unlock(&nmp->nm_rxlock);
2183 }
2184 
2185 /*
2186  * nfs_realign:
2187  *
2188  * Check for badly aligned mbuf data and realign by copying the unaligned
2189  * portion of the data into a new mbuf chain and freeing the portions
2190  * of the old chain that were replaced.
2191  *
2192  * We cannot simply realign the data within the existing mbuf chain
2193  * because the underlying buffers may contain other rpc commands and
2194  * we cannot afford to overwrite them.
2195  *
2196  * We would prefer to avoid this situation entirely.  The situation does
2197  * not occur with NFS/UDP and is supposed to only occasionally occur
2198  * with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
2199  *
2200  * NOTE!  MB_DONTWAIT cannot be used here.  The mbufs must be acquired
2201  *	  because the rpc request OR reply cannot be thrown away.  TCP NFS
2202  *	  mounts do not retry their RPCs unless the TCP connection itself
2203  *	  is dropped so throwing away an RPC will basically cause the NFS
2204  *	  operation to lockup indefinitely.
2205  */
2206 static void
2207 nfs_realign(struct mbuf **pm, int hsiz)
2208 {
2209 	struct mbuf *m;
2210 	struct mbuf *n = NULL;
2211 
2212 	/*
2213 	 * Check for misalignment
2214 	 */
2215 	++nfs_realign_test;
2216 	while ((m = *pm) != NULL) {
2217 		if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3))
2218 			break;
2219 		pm = &m->m_next;
2220 	}
2221 
2222 	/*
2223 	 * If misalignment found make a completely new copy.
2224 	 */
2225 	if (m) {
2226 		++nfs_realign_count;
2227 		n = m_dup_data(m, MB_WAIT);
2228 		m_freem(*pm);
2229 		*pm = n;
2230 	}
2231 }
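
/*
 * Illustrative sketch (not compiled): the alignment predicate tested by
 * the loop above.  An mbuf is usable for XDR dissection only when both
 * its length and its data pointer are 4-byte aligned; "aligned" is a
 * hypothetical local.
 */
#if 0
	int aligned = (m->m_len & 0x3) == 0 &&
		      (mtod(m, intptr_t) & 0x3) == 0;
#endif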
2232 
2233 #ifndef NFS_NOSERVER
2234 
2235 /*
2236  * Parse an RPC request
2237  * - verify it
2238  * - fill in the cred struct.
2239  */
2240 int
2241 nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
2242 {
2243 	int len, i;
2244 	u_int32_t *tl;
2245 	struct uio uio;
2246 	struct iovec iov;
2247 	caddr_t cp;
2248 	u_int32_t nfsvers, auth_type;
2249 	uid_t nickuid;
2250 	int error = 0, ticklen;
2251 	struct nfsuid *nuidp;
2252 	struct timeval tvin, tvout;
2253 	struct nfsm_info info;
2254 #if 0				/* until encrypted keys are implemented */
2255 	NFSKERBKEYSCHED_T keys;	/* stores key schedule */
2256 #endif
2257 
2258 	info.mrep = nd->nd_mrep;
2259 	info.md = nd->nd_md;
2260 	info.dpos = nd->nd_dpos;
2261 
2262 	if (has_header) {
2263 		NULLOUT(tl = nfsm_dissect(&info, 10 * NFSX_UNSIGNED));
2264 		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
2265 		if (*tl++ != rpc_call) {
2266 			m_freem(info.mrep);
2267 			return (EBADRPC);
2268 		}
2269 	} else {
2270 		NULLOUT(tl = nfsm_dissect(&info, 8 * NFSX_UNSIGNED));
2271 	}
2272 	nd->nd_repstat = 0;
2273 	nd->nd_flag = 0;
2274 	if (*tl++ != rpc_vers) {
2275 		nd->nd_repstat = ERPCMISMATCH;
2276 		nd->nd_procnum = NFSPROC_NOOP;
2277 		return (0);
2278 	}
2279 	if (*tl != nfs_prog) {
2280 		nd->nd_repstat = EPROGUNAVAIL;
2281 		nd->nd_procnum = NFSPROC_NOOP;
2282 		return (0);
2283 	}
2284 	tl++;
2285 	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
2286 	if (nfsvers < NFS_VER2 || nfsvers > NFS_VER3) {
2287 		nd->nd_repstat = EPROGMISMATCH;
2288 		nd->nd_procnum = NFSPROC_NOOP;
2289 		return (0);
2290 	}
2291 	if (nfsvers == NFS_VER3)
2292 		nd->nd_flag = ND_NFSV3;
2293 	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
2294 	if (nd->nd_procnum == NFSPROC_NULL)
2295 		return (0);
2296 	if (nd->nd_procnum >= NFS_NPROCS ||
2297 		(nd->nd_procnum >= NQNFSPROC_GETLEASE) ||
2298 		(!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2299 		nd->nd_repstat = EPROCUNAVAIL;
2300 		nd->nd_procnum = NFSPROC_NOOP;
2301 		return (0);
2302 	}
2303 	if ((nd->nd_flag & ND_NFSV3) == 0)
2304 		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2305 	auth_type = *tl++;
2306 	len = fxdr_unsigned(int, *tl++);
2307 	if (len < 0 || len > RPCAUTH_MAXSIZ) {
2308 		m_freem(info.mrep);
2309 		return (EBADRPC);
2310 	}
2311 
2312 	nd->nd_flag &= ~ND_KERBAUTH;
2313 	/*
2314 	 * Handle auth_unix or auth_kerb.
2315 	 */
2316 	if (auth_type == rpc_auth_unix) {
2317 		len = fxdr_unsigned(int, *++tl);
2318 		if (len < 0 || len > NFS_MAXNAMLEN) {
2319 			m_freem(info.mrep);
2320 			return (EBADRPC);
2321 		}
2322 		ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
2323 		NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
2324 		bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
2325 		nd->nd_cr.cr_ref = 1;
2326 		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
2327 		nd->nd_cr.cr_ruid = nd->nd_cr.cr_svuid = nd->nd_cr.cr_uid;
2328 		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
2329 		nd->nd_cr.cr_rgid = nd->nd_cr.cr_svgid = nd->nd_cr.cr_gid;
2330 		len = fxdr_unsigned(int, *tl);
2331 		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2332 			m_freem(info.mrep);
2333 			return (EBADRPC);
2334 		}
2335 		NULLOUT(tl = nfsm_dissect(&info, (len + 2) * NFSX_UNSIGNED));
2336 		for (i = 1; i <= len; i++)
2337 		    if (i < NGROUPS)
2338 			nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2339 		    else
2340 			tl++;
2341 		nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2342 		if (nd->nd_cr.cr_ngroups > 1)
2343 		    nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
2344 		len = fxdr_unsigned(int, *++tl);
2345 		if (len < 0 || len > RPCAUTH_MAXSIZ) {
2346 			m_freem(info.mrep);
2347 			return (EBADRPC);
2348 		}
2349 		if (len > 0) {
2350 			ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
2351 		}
2352 	} else if (auth_type == rpc_auth_kerb) {
2353 		switch (fxdr_unsigned(int, *tl++)) {
2354 		case RPCAKN_FULLNAME:
2355 			ticklen = fxdr_unsigned(int, *tl);
2356 			*((u_int32_t *)nfsd->nfsd_authstr) = *tl;
2357 			uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
2358 			nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
2359 			if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
2360 				m_freem(info.mrep);
2361 				return (EBADRPC);
2362 			}
2363 			uio.uio_offset = 0;
2364 			uio.uio_iov = &iov;
2365 			uio.uio_iovcnt = 1;
2366 			uio.uio_segflg = UIO_SYSSPACE;
2367 			iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
2368 			iov.iov_len = RPCAUTH_MAXSIZ - 4;
2369 			ERROROUT(nfsm_mtouio(&info, &uio, uio.uio_resid));
2370 			NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
2371 			if (*tl++ != rpc_auth_kerb ||
2372 				fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2373 				kprintf("Bad kerb verifier\n");
2374 				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2375 				nd->nd_procnum = NFSPROC_NOOP;
2376 				return (0);
2377 			}
2378 			NULLOUT(cp = nfsm_dissect(&info, 4 * NFSX_UNSIGNED));
2379 			tl = (u_int32_t *)cp;
2380 			if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2381 				kprintf("Not fullname kerb verifier\n");
2382 				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2383 				nd->nd_procnum = NFSPROC_NOOP;
2384 				return (0);
2385 			}
2386 			cp += NFSX_UNSIGNED;
2387 			bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2388 			nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2389 			nd->nd_flag |= ND_KERBFULL;
2390 			nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2391 			break;
2392 		case RPCAKN_NICKNAME:
2393 			if (len != 2 * NFSX_UNSIGNED) {
2394 				kprintf("Kerb nickname short\n");
2395 				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2396 				nd->nd_procnum = NFSPROC_NOOP;
2397 				return (0);
2398 			}
2399 			nickuid = fxdr_unsigned(uid_t, *tl);
2400 			NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
2401 			if (*tl++ != rpc_auth_kerb ||
2402 				fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2403 				kprintf("Kerb nick verifier bad\n");
2404 				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2405 				nd->nd_procnum = NFSPROC_NOOP;
2406 				return (0);
2407 			}
2408 			NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
2409 			tvin.tv_sec = *tl++;
2410 			tvin.tv_usec = *tl;
2411 
2412 			for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2413 			    nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2414 				if (nuidp->nu_cr.cr_uid == nickuid &&
2415 				    (!nd->nd_nam2 ||
2416 				     netaddr_match(NU_NETFAM(nuidp),
2417 				      &nuidp->nu_haddr, nd->nd_nam2)))
2418 					break;
2419 			}
2420 			if (!nuidp) {
2421 				nd->nd_repstat =
2422 					(NFSERR_AUTHERR|AUTH_REJECTCRED);
2423 				nd->nd_procnum = NFSPROC_NOOP;
2424 				return (0);
2425 			}
2426 
2427 			/*
2428 			 * Now, decrypt the timestamp using the session key
2429 			 * and validate it.
2430 			 */
2431 #ifdef NFSKERB
2432 			XXX
2433 #else
2434 			tvout.tv_sec = 0;
2435 			tvout.tv_usec = 0;
2436 #endif
2437 
2438 			tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2439 			tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2440 			if (nuidp->nu_expire < time_second ||
2441 			    nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2442 			    (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2443 			     nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2444 				nuidp->nu_expire = 0;
2445 				nd->nd_repstat =
2446 				    (NFSERR_AUTHERR|AUTH_REJECTVERF);
2447 				nd->nd_procnum = NFSPROC_NOOP;
2448 				return (0);
2449 			}
2450 			nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr);
2451 			nd->nd_flag |= ND_KERBNICK;
2452 		}
2453 	} else {
2454 		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2455 		nd->nd_procnum = NFSPROC_NOOP;
2456 		return (0);
2457 	}
2458 
2459 	nd->nd_md = info.md;
2460 	nd->nd_dpos = info.dpos;
2461 	return (0);
2462 nfsmout:
2463 	return (error);
2464 }
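
/*
 * For reference (a summary from RFC 1831/5531, not from this file):
 * the ONC RPC call header dissected above, one 32-bit XDR word each:
 *
 *	xid, msg_type (CALL), rpcvers (2), prog, vers, proc,
 *	cred flavor, cred length, cred body..., verf flavor,
 *	verf length, verf body...
 *
 * For AUTH_UNIX the cred body is: stamp, machinename length,
 * machinename, uid, gid, gid count, gids -- matching the uid/gid/
 * groups extraction in the auth_unix branch.
 */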
2465 
2466 #endif
2467 
2468 /*
2469  * Send a message to the originating process's terminal.  The thread and/or
2470  * process may be NULL.  YYY the thread should not be NULL but there may
2471  * still be some uio_td's that are still being passed as NULL through to
2472  * nfsm_request().
2473  */
2474 static int
2475 nfs_msg(struct thread *td, char *server, char *msg)
2476 {
2477 	tpr_t tpr;
2478 
2479 	if (td && td->td_proc)
2480 		tpr = tprintf_open(td->td_proc);
2481 	else
2482 		tpr = NULL;
2483 	tprintf(tpr, "nfs server %s: %s\n", server, msg);
2484 	tprintf_close(tpr);
2485 	return (0);
2486 }
2487 
2488 #ifndef NFS_NOSERVER
2489 
2490 /*
2491  * Socket upcall routine for nfsd sockets.  This runs in the protocol
2492  * thread and passes waitflag == MB_DONTWAIT.
2493  */
2494 void
2495 nfsrv_rcv_upcall(struct socket *so, void *arg, int waitflag)
2496 {
2497 	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2498 
2499 	if (slp->ns_needq_upcall == 0) {
2500 		slp->ns_needq_upcall = 1;	/* ok to race */
2501 		lwkt_gettoken(&nfs_token);
2502 		nfsrv_wakenfsd(slp, 1);
2503 		lwkt_reltoken(&nfs_token);
2504 	}
2505 #if 0
2506 	lwkt_gettoken(&slp->ns_token);
2507 	slp->ns_flag |= SLP_NEEDQ;
2508 	nfsrv_rcv(so, arg, waitflag);
2509 	lwkt_reltoken(&slp->ns_token);
2510 #endif
2511 }
2512 
2513 /*
2514  * Process new data on a receive socket.  Essentially do as much as we can
2515  * non-blocking, else punt and it will be called with MB_WAIT from an nfsd.
2516  *
2517  * slp->ns_token is held on call
2518  */
2519 void
2520 nfsrv_rcv(struct socket *so, void *arg, int waitflag)
2521 {
2522 	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2523 	struct mbuf *m;
2524 	struct sockaddr *nam;
2525 	struct sockbuf sio;
2526 	int flags, error;
2527 	int nparallel_wakeup = 0;
2528 
2529 	ASSERT_LWKT_TOKEN_HELD(&slp->ns_token);
2530 
2531 	if ((slp->ns_flag & SLP_VALID) == 0)
2532 		return;
2533 
2534 	/*
2535 	 * Do not allow an infinite number of completed RPC records to build
2536 	 * up before we stop reading data from the socket.  Otherwise we could
2537 	 * end up holding onto an unreasonable number of mbufs for requests
2538 	 * waiting for service.
2539 	 *
2540 	 * This should give pretty good feedback to the TCP layer and
2541 	 * prevent a memory crunch for other protocols.
2542 	 *
2543 	 * Note that the same service socket can be dispatched to several
2544 	 * nfs servers simultaneously.  The tcp protocol callback calls us
2545 	 * with MB_DONTWAIT.  nfsd calls us with MB_WAIT (typically).
2546 	 */
2547 	if (NFSRV_RECLIMIT(slp))
2548 		return;
2549 
2550 	/*
2551 	 * Handle protocol specifics to parse an RPC request.  We always
2552 	 * pull from the socket using non-blocking I/O.
2553 	 */
2554 	if (so->so_type == SOCK_STREAM) {
2555 		/*
2556 		 * The data has to be read in an orderly fashion from a TCP
2557 		 * stream, unlike a UDP socket.  It is possible for soreceive
2558 		 * and/or nfsrv_getstream() to block, so make sure only one
2559 		 * entity is messing around with the TCP stream at any given
2560 		 * moment.  The receive sockbuf's lock in soreceive is not
2561 		 * sufficient.
2562 		 */
2563 		if (slp->ns_flag & SLP_GETSTREAM)
2564 			return;
2565 		slp->ns_flag |= SLP_GETSTREAM;
2566 
2567 		/*
2568 		 * Do soreceive().  Pull out as much data as possible without
2569 		 * blocking.
2570 		 */
2571 		sbinit(&sio, 1000000000);
2572 		flags = MSG_DONTWAIT;
2573 		error = so_pru_soreceive(so, &nam, NULL, &sio, NULL, &flags);
2574 		if (error || sio.sb_mb == NULL) {
2575 			if (error != EWOULDBLOCK)
2576 				slp->ns_flag |= SLP_DISCONN;
2577 			slp->ns_flag &= ~(SLP_GETSTREAM | SLP_NEEDQ);
2578 			goto done;
2579 		}
2580 		m = sio.sb_mb;
2581 		if (slp->ns_rawend) {
2582 			slp->ns_rawend->m_next = m;
2583 			slp->ns_cc += sio.sb_cc;
2584 		} else {
2585 			slp->ns_raw = m;
2586 			slp->ns_cc = sio.sb_cc;
2587 		}
2588 		while (m->m_next)
2589 			m = m->m_next;
2590 		slp->ns_rawend = m;
2591 
2592 		/*
2593 		 * Now try and parse as many record(s) as we can out of the
2594 		 * raw stream data.  This will set SLP_DOREC.
2595 		 */
2596 		error = nfsrv_getstream(slp, waitflag, &nparallel_wakeup);
2597 		if (error && error != EWOULDBLOCK)
2598 			slp->ns_flag |= SLP_DISCONN;
2599 		slp->ns_flag &= ~SLP_GETSTREAM;
2600 	} else {
2601 		/*
2602 		 * For UDP, soreceive typically pulls just one packet; loop
2603 		 * to get the whole batch.
2604 		 */
2605 		do {
2606 			sbinit(&sio, 1000000000);
2607 			flags = MSG_DONTWAIT;
2608 			error = so_pru_soreceive(so, &nam, NULL, &sio,
2609 						 NULL, &flags);
2610 			if (sio.sb_mb) {
2611 				struct nfsrv_rec *rec;
2612 				int mf = (waitflag & MB_DONTWAIT) ?
2613 					    M_NOWAIT : M_WAITOK;
2614 				rec = kmalloc(sizeof(struct nfsrv_rec),
2615 					     M_NFSRVDESC, mf);
2616 				if (!rec) {
2617 					if (nam)
2618 						FREE(nam, M_SONAME);
2619 					m_freem(sio.sb_mb);
2620 					continue;
2621 				}
2622 				nfs_realign(&sio.sb_mb, 10 * NFSX_UNSIGNED);
2623 				rec->nr_address = nam;
2624 				rec->nr_packet = sio.sb_mb;
2625 				STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
2626 				++slp->ns_numrec;
2627 				slp->ns_flag |= SLP_DOREC;
2628 				++nparallel_wakeup;
2629 			} else {
2630 				slp->ns_flag &= ~SLP_NEEDQ;
2631 			}
2632 			if (error) {
2633 				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
2634 				    && error != EWOULDBLOCK) {
2635 					slp->ns_flag |= SLP_DISCONN;
2636 					break;
2637 				}
2638 			}
2639 			if (NFSRV_RECLIMIT(slp))
2640 				break;
2641 		} while (sio.sb_mb);
2642 	}
2643 
2644 	/*
2645 	 * If we were upcalled from the tcp protocol layer and we have
2646 	 * fully parsed records ready to go, or there is new data pending,
2647  * or something went wrong, try to wake up an nfsd thread to deal
2648 	 * with it.
2649 	 */
2650 done:
2651 	/* XXX this code is currently not executed (nfsrv_rcv_upcall) */
2652 	if (waitflag == MB_DONTWAIT && (slp->ns_flag & SLP_ACTION_MASK)) {
2653 		lwkt_gettoken(&nfs_token);
2654 		nfsrv_wakenfsd(slp, nparallel_wakeup);
2655 		lwkt_reltoken(&nfs_token);
2656 	}
2657 }
2658 
2659 /*
2660  * Try and extract an RPC request from the mbuf data list received on a
2661  * stream socket. The "waitflag" argument indicates whether or not it
2662  * can sleep.
2663  */
2664 static int
2665 nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
2666 {
2667 	struct mbuf *m, **mpp;
2668 	char *cp1, *cp2;
2669 	int len;
2670 	struct mbuf *om, *m2, *recm;
2671 	u_int32_t recmark;
2672 
2673 	for (;;) {
2674 	    if (slp->ns_reclen == 0) {
2675 		if (slp->ns_cc < NFSX_UNSIGNED)
2676 			return (0);
2677 		m = slp->ns_raw;
2678 		if (m->m_len >= NFSX_UNSIGNED) {
2679 			bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
2680 			m->m_data += NFSX_UNSIGNED;
2681 			m->m_len -= NFSX_UNSIGNED;
2682 		} else {
2683 			cp1 = (caddr_t)&recmark;
2684 			cp2 = mtod(m, caddr_t);
2685 			while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2686 				while (m->m_len == 0) {
2687 					m = m->m_next;
2688 					cp2 = mtod(m, caddr_t);
2689 				}
2690 				*cp1++ = *cp2++;
2691 				m->m_data++;
2692 				m->m_len--;
2693 			}
2694 		}
2695 		slp->ns_cc -= NFSX_UNSIGNED;
2696 		recmark = ntohl(recmark);
2697 		slp->ns_reclen = recmark & ~0x80000000;
2698 		if (recmark & 0x80000000)
2699 			slp->ns_flag |= SLP_LASTFRAG;
2700 		else
2701 			slp->ns_flag &= ~SLP_LASTFRAG;
2702 		if (slp->ns_reclen > NFS_MAXPACKET || slp->ns_reclen <= 0) {
2703 			log(LOG_ERR, "%s (%d) from nfs client\n",
2704 			    "impossible packet length",
2705 			    slp->ns_reclen);
2706 			return (EPERM);
2707 		}
2708 	    }
2709 
2710 	    /*
2711 	     * Now get the record part.
2712 	     *
2713 	     * Note that slp->ns_reclen may be 0.  Linux sometimes
2714 	     * generates 0-length RPCs.
2715 	     */
2716 	    recm = NULL;
2717 	    if (slp->ns_cc == slp->ns_reclen) {
2718 		recm = slp->ns_raw;
2719 		slp->ns_raw = slp->ns_rawend = NULL;
2720 		slp->ns_cc = slp->ns_reclen = 0;
2721 	    } else if (slp->ns_cc > slp->ns_reclen) {
2722 		len = 0;
2723 		m = slp->ns_raw;
2724 		om = NULL;
2725 
2726 		while (len < slp->ns_reclen) {
2727 			if ((len + m->m_len) > slp->ns_reclen) {
2728 				m2 = m_copym(m, 0, slp->ns_reclen - len,
2729 					waitflag);
2730 				if (m2) {
2731 					if (om) {
2732 						om->m_next = m2;
2733 						recm = slp->ns_raw;
2734 					} else
2735 						recm = m2;
2736 					m->m_data += slp->ns_reclen - len;
2737 					m->m_len -= slp->ns_reclen - len;
2738 					len = slp->ns_reclen;
2739 				} else {
2740 					return (EWOULDBLOCK);
2741 				}
2742 			} else if ((len + m->m_len) == slp->ns_reclen) {
2743 				om = m;
2744 				len += m->m_len;
2745 				m = m->m_next;
2746 				recm = slp->ns_raw;
2747 				om->m_next = NULL;
2748 			} else {
2749 				om = m;
2750 				len += m->m_len;
2751 				m = m->m_next;
2752 			}
2753 		}
2754 		slp->ns_raw = m;
2755 		slp->ns_cc -= len;
2756 		slp->ns_reclen = 0;
2757 	    } else {
2758 		return (0);
2759 	    }
2760 
2761 	    /*
2762 	     * Accumulate the fragments into a record.
2763 	     */
2764 	    mpp = &slp->ns_frag;
2765 	    while (*mpp)
2766 		mpp = &((*mpp)->m_next);
2767 	    *mpp = recm;
2768 	    if (slp->ns_flag & SLP_LASTFRAG) {
2769 		struct nfsrv_rec *rec;
2770 		int mf = (waitflag & MB_DONTWAIT) ? M_NOWAIT : M_WAITOK;
2771 		rec = kmalloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
2772 		if (!rec) {
2773 		    m_freem(slp->ns_frag);
2774 		} else {
2775 		    nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
2776 		    rec->nr_address = NULL;
2777 		    rec->nr_packet = slp->ns_frag;
2778 		    STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
2779 		    ++slp->ns_numrec;
2780 		    slp->ns_flag |= SLP_DOREC;
2781 		    ++*countp;
2782 		}
2783 		slp->ns_frag = NULL;
2784 	    }
2785 	}
2786 }
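
/*
 * Illustrative sketch (not compiled): decoding the RPC record mark
 * consumed above (RFC 1831 record marking).  The high bit of the
 * 32-bit mark flags the last fragment of a record; the low 31 bits
 * give the fragment length.  "lastfrag" and "fraglen" are hypothetical
 * locals.
 */
#if 0
	recmark = ntohl(recmark);
	lastfrag = (recmark & 0x80000000) != 0;
	fraglen = recmark & ~0x80000000;	/* == recmark & 0x7fffffff */
#endif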
2787 
2788 #ifdef INVARIANTS
2789 
2790 /*
2791  * Sanity check our mbuf chain.
2792  */
2793 static void
2794 nfs_checkpkt(struct mbuf *m, int len)
2795 {
2796 	int xlen = 0;
2797 	while (m) {
2798 		xlen += m->m_len;
2799 		m = m->m_next;
2800 	}
2801 	if (xlen != len) {
2802 		panic("nfs_checkpkt: len mismatch %d/%d mbuf %p\n",
2803 			xlen, len, m);
2804 	}
2805 }
2806 
2807 #else
2808 
2809 static void
2810 nfs_checkpkt(struct mbuf *m __unused, int len __unused)
2811 {
2812 }
2813 
2814 #endif
2815 
2816 /*
2817  * Parse an RPC header.
2818  *
2819  * If the socket is invalid or no records are pending we return ENOBUFS.
2820  * The caller must deal with NEEDQ races.
2821  */
2822 int
2823 nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
2824 	    struct nfsrv_descript **ndp)
2825 {
2826 	struct nfsrv_rec *rec;
2827 	struct mbuf *m;
2828 	struct sockaddr *nam;
2829 	struct nfsrv_descript *nd;
2830 	int error;
2831 
2832 	*ndp = NULL;
2833 	if ((slp->ns_flag & SLP_VALID) == 0 || !STAILQ_FIRST(&slp->ns_rec))
2834 		return (ENOBUFS);
2835 	rec = STAILQ_FIRST(&slp->ns_rec);
2836 	STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link);
2837 	KKASSERT(slp->ns_numrec > 0);
2838 	if (--slp->ns_numrec == 0)
2839 		slp->ns_flag &= ~SLP_DOREC;
2840 	nam = rec->nr_address;
2841 	m = rec->nr_packet;
2842 	kfree(rec, M_NFSRVDESC);
2843 	MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript),
2844 		M_NFSRVDESC, M_WAITOK);
2845 	nd->nd_md = nd->nd_mrep = m;
2846 	nd->nd_nam2 = nam;
2847 	nd->nd_dpos = mtod(m, caddr_t);
2848 	error = nfs_getreq(nd, nfsd, TRUE);
2849 	if (error) {
2850 		if (nam) {
2851 			FREE(nam, M_SONAME);
2852 		}
2853 		kfree((caddr_t)nd, M_NFSRVDESC);
2854 		return (error);
2855 	}
2856 	*ndp = nd;
2857 	nfsd->nfsd_nd = nd;
2858 	return (0);
2859 }
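
/*
 * Illustrative sketch (not compiled): how an nfsd service loop might
 * consume records via nfsrv_dorec().  The loop framing is hypothetical;
 * only the ENOBUFS "no complete records pending" convention comes from
 * the function above.
 */
#if 0
	while (nfsrv_dorec(slp, nfsd, &nd) == 0) {
		/* ... execute and reply to the RPC described by nd ... */
	}
#endif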
2860 
2861 /*
2862  * Try to assign service sockets to nfsd threads based on the number
2863  * of new rpc requests that have been queued on the service socket.
2864  *
2865  * If no nfsd's are available or additional requests are pending, set the
2866  * NFSD_CHECKSLP flag so that one of the running nfsds will go look for
2867  * the work in the nfssvc_sock list when it is finished processing its
2868  * current work.  This flag is only cleared when an nfsd cannot find
2869  * any new work to perform.
2870  */
2871 void
2872 nfsrv_wakenfsd(struct nfssvc_sock *slp, int nparallel)
2873 {
2874 	struct nfsd *nd;
2875 
2876 	if ((slp->ns_flag & SLP_VALID) == 0)
2877 		return;
2878 	if (nparallel <= 1)
2879 		nparallel = 1;
2880 	TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2881 		if (nd->nfsd_flag & NFSD_WAITING) {
2882 			nd->nfsd_flag &= ~NFSD_WAITING;
2883 			if (nd->nfsd_slp)
2884 				panic("nfsd wakeup");
2885 			nfsrv_slpref(slp);
2886 			nd->nfsd_slp = slp;
2887 			wakeup((caddr_t)nd);
2888 			if (--nparallel == 0)
2889 				break;
2890 		}
2891 	}
2892 
2893 	/*
2894 	 * If we couldn't assign slp then the NFSDs are all busy and
2895 	 * we set a flag indicating that there is pending work.
2896 	 */
2897 	if (nparallel)
2898 		nfsd_head_flag |= NFSD_CHECKSLP;
2899 }
2900 #endif /* NFS_NOSERVER */
2901