xref: /freebsd/sys/fs/nfs/nfs_commonkrpc.c (revision 4d846d26)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1991, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Rick Macklem at The University of Guelph.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 /*
40  * Socket operations for use by nfs
41  */
42 
43 #include "opt_kgssapi.h"
44 #include "opt_nfs.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/limits.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/mount.h>
54 #include <sys/mutex.h>
55 #include <sys/proc.h>
56 #include <sys/signalvar.h>
57 #include <sys/syscallsubr.h>
58 #include <sys/sysctl.h>
59 #include <sys/syslog.h>
60 #include <sys/vnode.h>
61 
62 #include <rpc/rpc.h>
63 #include <rpc/krpc.h>
64 
65 #include <kgssapi/krb5/kcrypto.h>
66 
67 #include <fs/nfs/nfsport.h>
68 
69 #ifdef KDTRACE_HOOKS
70 #include <sys/dtrace_bsd.h>
71 
72 dtrace_nfsclient_nfs23_start_probe_func_t
73 		dtrace_nfscl_nfs234_start_probe;
74 
75 dtrace_nfsclient_nfs23_done_probe_func_t
76 		dtrace_nfscl_nfs234_done_probe;
77 
78 /*
79  * Registered probes by RPC type.
80  */
81 uint32_t	nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
82 uint32_t	nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];
83 
84 uint32_t	nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
85 uint32_t	nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];
86 
87 uint32_t	nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
88 uint32_t	nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
89 #endif
90 
91 NFSSTATESPINLOCK;
92 NFSREQSPINLOCK;
93 NFSDLOCKMUTEX;
94 NFSCLSTATEMUTEX;
95 extern struct nfsstatsv1 nfsstatsv1;
96 extern struct nfsreqhead nfsd_reqq;
97 extern int nfscl_ticks;
98 extern void (*ncl_call_invalcaches)(struct vnode *);
99 extern int nfs_numnfscbd;
100 extern int nfscl_debuglevel;
101 extern int nfsrv_lease;
102 
103 SVCPOOL		*nfscbd_pool;
104 int		nfs_bufpackets = 4;
105 static int	nfsrv_gsscallbackson = 0;
106 static int	nfs_reconnects;
107 static int	nfs3_jukebox_delay = 10;
108 static int	nfs_skip_wcc_data_onerr = 1;
109 static int	nfs_dsretries = 2;
110 static struct timespec	nfs_trylater_max = {
111 	.tv_sec		= NFS_TRYLATERDEL,
112 	.tv_nsec	= 0,
113 };
114 
115 SYSCTL_DECL(_vfs_nfs);
116 
117 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
118     "Buffer reservation size 2 < x < 64");
119 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
120     "Number of times the nfs client has had to reconnect");
121 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
122     "Number of seconds to delay a retry after receiving EJUKEBOX");
123 SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
124     "Disable weak cache consistency checking when server returns an error");
125 SYSCTL_INT(_vfs_nfs, OID_AUTO, dsretries, CTLFLAG_RW, &nfs_dsretries, 0,
126     "Number of retries for a DS RPC before failure");
127 
128 static void	nfs_down(struct nfsmount *, struct thread *, const char *,
129     int, int);
130 static void	nfs_up(struct nfsmount *, struct thread *, const char *,
131     int, int);
132 static int	nfs_msg(struct thread *, const char *, const char *, int);
133 
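/*
 * Entry in the cache of RPC authentication handles, keyed by the
 * uid the handle was created for.
 */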
134 struct nfs_cached_auth {
135 	int		ca_refs; /* refcount, including 1 from the cache */
136 	uid_t		ca_uid;	 /* uid that corresponds to this auth */
137 	AUTH		*ca_auth; /* RPC auth handle */
138 };
139 
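/*
 * Map NFSv3 procedure numbers to the NFSv2 procedure that is actually
 * sent on the wire for an NFSv2 mount.  Procedures with no NFSv2
 * equivalent map to NFSV2PROC_NOOP.
 */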
140 static int nfsv2_procid[NFS_V3NPROCS] = {
141 	NFSV2PROC_NULL,
142 	NFSV2PROC_GETATTR,
143 	NFSV2PROC_SETATTR,
144 	NFSV2PROC_LOOKUP,
145 	NFSV2PROC_NOOP,
146 	NFSV2PROC_READLINK,
147 	NFSV2PROC_READ,
148 	NFSV2PROC_WRITE,
149 	NFSV2PROC_CREATE,
150 	NFSV2PROC_MKDIR,
151 	NFSV2PROC_SYMLINK,
152 	NFSV2PROC_CREATE,
153 	NFSV2PROC_REMOVE,
154 	NFSV2PROC_RMDIR,
155 	NFSV2PROC_RENAME,
156 	NFSV2PROC_LINK,
157 	NFSV2PROC_READDIR,
158 	NFSV2PROC_NOOP,
159 	NFSV2PROC_STATFS,
160 	NFSV2PROC_NOOP,
161 	NFSV2PROC_NOOP,
162 	NFSV2PROC_NOOP,
163 };
164 
165 /*
166  * This static array indicates that an NFSv4 RPC should use
167  * RPCSEC_GSS, if the mount indicates that via sec=krb5[ip].
168  * System RPCs that do not use file handles will be false
169  * in this array so that they will use AUTH_SYS when the
170  * "syskrb5" mount option is specified, along with
171  * "sec=krb5[ip]".
172  */
173 static bool nfscl_use_gss[NFSV42_NPROCS] = {
174 	true,
175 	true,
176 	true,
177 	true,
178 	true,
179 	true,
180 	true,
181 	true,
182 	true,
183 	true,
184 	true,
185 	true,
186 	true,
187 	true,
188 	true,
189 	true,
190 	true,
191 	true,
192 	true,
193 	true,
194 	true,
195 	true,
196 	true,
197 	false,		/* SetClientID */
198 	false,		/* SetClientIDConfirm */
199 	true,
200 	true,
201 	true,
202 	true,
203 	true,
204 	true,
205 	true,
206 	false,		/* Renew */
207 	true,
208 	false,		/* ReleaseLockOwn */
209 	true,
210 	true,
211 	true,
212 	true,
213 	true,
214 	true,
215 	false,		/* ExchangeID */
216 	false,		/* CreateSession */
217 	false,		/* DestroySession */
218 	false,		/* DestroyClientID */
219 	false,		/* FreeStateID */
220 	true,
221 	true,
222 	true,
223 	true,
224 	false,		/* ReclaimComplete */
225 	true,
226 	true,
227 	true,
228 	true,
229 	true,
230 	true,
231 	true,
232 	true,
233 	true,
234 	true,
235 	true,
236 	true,
237 	true,
238 	true,
239 	false,		/* BindConnectionToSession */
240 	true,
241 	true,
242 	true,
243 	true,
244 };
245 
246 /*
247  * Initialize sockets and congestion for a new NFS connection.
248  * We do not free the sockaddr on error.
249  * Which arguments are set to NULL indicates what kind of call it is.
250  * cred == NULL --> a call to connect to a pNFS DS
251  * nmp == NULL --> indicates an upcall to userland or an NFSv4.0 callback
252  */
253 int
254 newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
255     struct ucred *cred, NFSPROC_T *p, int callback_retry_mult, bool dotls,
256     struct __rpc_client **clipp)
257 {
258 	int rcvreserve, sndreserve;
259 	int pktscale, pktscalesav;
260 	struct sockaddr *saddr;
261 	struct ucred *origcred;
262 	CLIENT *client;
263 	struct netconfig *nconf;
264 	struct socket *so;
265 	int one = 1, retries, error = 0;
266 	struct thread *td = curthread;
267 	SVCXPRT *xprt;
268 	struct timeval timo;
269 	uint64_t tval;
270 
271 	/*
272 	 * We need to establish the socket using the credentials of
273 	 * the mountpoint.  Some parts of this process (such as
274 	 * sobind() and soconnect()) will use the current thread's
275 	 * credential instead of the socket credential.  To work
276 	 * around this, temporarily change the current thread's
277 	 * credential to that of the mountpoint.
278 	 *
279 	 * XXX: It would be better to explicitly pass the correct
280 	 * credential to sobind() and soconnect().
281 	 */
282 	origcred = td->td_ucred;
283 
284 	/*
285 	 * Use the credential in nr_cred, if not NULL.
286 	 */
287 	if (nrp->nr_cred != NULL)
288 		td->td_ucred = nrp->nr_cred;
289 	else
290 		td->td_ucred = cred;
291 	saddr = nrp->nr_nam;
292 
293 	if (saddr->sa_family == AF_INET)
294 		if (nrp->nr_sotype == SOCK_DGRAM)
295 			nconf = getnetconfigent("udp");
296 		else
297 			nconf = getnetconfigent("tcp");
298 	else
299 		if (nrp->nr_sotype == SOCK_DGRAM)
300 			nconf = getnetconfigent("udp6");
301 		else
302 			nconf = getnetconfigent("tcp6");
303 
304 	pktscale = nfs_bufpackets;
305 	if (pktscale < 2)
306 		pktscale = 2;
307 	if (pktscale > 64)
308 		pktscale = 64;
309 	pktscalesav = pktscale;
310 	/*
311 	 * soreserve() can fail if sb_max is too small, so shrink pktscale
312 	 * and try again if there is an error.
313 	 * Print a log message suggesting increasing sb_max.
314 	 * Creating a socket and doing this is necessary since, if the
315 	 * reservation sizes are too large and will make soreserve() fail,
316 	 * the connection will work until a large send is attempted and
317 	 * then it will loop in the krpc code.
318 	 */
319 	so = NULL;
320 	saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
321 	error = socreate(saddr->sa_family, &so, nrp->nr_sotype,
322 	    nrp->nr_soproto, td->td_ucred, td);
323 	if (error != 0)
324 		goto out;
325 	do {
326 	    if (error != 0 && pktscale > 2) {
327 		if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
328 		    pktscale == pktscalesav) {
329 		    /*
330 		     * Suggest vfs.nfs.bufpackets * maximum RPC message,
331 		     * adjusted for the sb_max->sb_max_adj conversion of
332 		     * MCLBYTES / (MSIZE + MCLBYTES) as the minimum setting
333 		     * for kern.ipc.maxsockbuf.
334 		     */
335 		    tval = (NFS_MAXBSIZE + NFS_MAXXDR) * nfs_bufpackets;
336 		    tval *= MSIZE + MCLBYTES;
337 		    tval += MCLBYTES - 1; /* Round up divide by MCLBYTES. */
338 		    tval /= MCLBYTES;
339 		    printf("Consider increasing kern.ipc.maxsockbuf to a "
340 			"minimum of %ju to support %ubyte NFS I/O\n",
341 			(uintmax_t)tval, NFS_MAXBSIZE);
342 		}
343 		pktscale--;
344 	    }
345 	    if (nrp->nr_sotype == SOCK_DGRAM) {
346 		if (nmp != NULL) {
347 			sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
348 			    pktscale;
349 			rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
350 			    pktscale;
351 		} else {
352 			sndreserve = rcvreserve = 1024 * pktscale;
353 		}
354 	    } else {
355 		if (nrp->nr_sotype != SOCK_STREAM)
356 			panic("nfscon sotype");
357 		if (nmp != NULL) {
358 			sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
359 			    pktscale;
360 			rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
361 			    pktscale;
362 		} else {
363 			sndreserve = rcvreserve = 1024 * pktscale;
364 		}
365 	    }
366 	    error = soreserve(so, sndreserve, rcvreserve);
367 	    if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
368 		pktscale <= 2)
369 		printf("Must increase kern.ipc.maxsockbuf or reduce"
370 		    " rsize, wsize\n");
371 	} while (error != 0 && pktscale > 2);
372 	soclose(so);
373 	if (error != 0)
374 		goto out;
375 
376 	client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
377 	    nrp->nr_vers, sndreserve, rcvreserve);
378 	CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
379 	if (nmp != NULL) {
380 		if ((nmp->nm_flag & NFSMNT_INT))
381 			CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
382 		if ((nmp->nm_flag & NFSMNT_RESVPORT))
383 			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
384 		if (NFSHASTLS(nmp)) {
385 			CLNT_CONTROL(client, CLSET_TLS, &one);
386 			if (nmp->nm_tlscertname != NULL)
387 				CLNT_CONTROL(client, CLSET_TLSCERTNAME,
388 				    nmp->nm_tlscertname);
389 		}
390 		if (NFSHASSOFT(nmp)) {
391 			if (nmp->nm_sotype == SOCK_DGRAM)
392 				/*
393 				 * For UDP, the large timeout for a reconnect
394 				 * will be set to "nm_retry * nm_timeo / 2", so
395 				 * we only want to do 2 reconnect timeout
396 				 * retries.
397 				 */
398 				retries = 2;
399 			else
400 				retries = nmp->nm_retry;
401 		} else
402 			retries = INT_MAX;
403 		if (NFSHASNFSV4N(nmp)) {
404 			if (cred != NULL) {
405 				if (NFSHASSOFT(nmp)) {
406 					/*
407 					 * This should be a DS mount.
408 					 * Use CLSET_TIMEOUT to set the timeout
409 					 * for connections to DSs instead of
410 					 * specifying a timeout on each RPC.
411 					 * This is done so that SO_SNDTIMEO
412 					 * is set on the TCP socket as well
413 					 * as specifying a time limit when
414 					 * waiting for an RPC reply.  Useful
415 					 * if the send queue for the TCP
416 					 * connection has become constipated,
417 					 * due to a failed DS.
418 					 * The choice of lease_duration / 4 is
419 					 * fairly arbitrary, but seems to work
420 					 * ok, with a lower bound of 10sec.
421 					 */
422 					timo.tv_sec = nfsrv_lease / 4;
423 					if (timo.tv_sec < 10)
424 						timo.tv_sec = 10;
425 					timo.tv_usec = 0;
426 					CLNT_CONTROL(client, CLSET_TIMEOUT,
427 					    &timo);
428 				}
429 				/*
430 				 * Make sure the nfscbd_pool doesn't get
431 				 * destroyed while doing this.
432 				 */
433 				NFSD_LOCK();
434 				if (nfs_numnfscbd > 0) {
435 					nfs_numnfscbd++;
436 					NFSD_UNLOCK();
437 					xprt = svc_vc_create_backchannel(
438 					    nfscbd_pool);
439 					CLNT_CONTROL(client, CLSET_BACKCHANNEL,
440 					    xprt);
441 					NFSD_LOCK();
442 					nfs_numnfscbd--;
443 					if (nfs_numnfscbd == 0)
444 						wakeup(&nfs_numnfscbd);
445 				}
446 				NFSD_UNLOCK();
447 			} else {
448 				/*
449 				 * cred == NULL for a DS connect.
450 				 * For connects to a DS, set a retry limit
451 				 * so that failed DSs will be detected.
452 				 * This is ok for NFSv4.1, since a DS does
453 				 * not maintain open/lock state and is the
454 				 * only case where using a "soft" mount is
455 				 * recommended for NFSv4.
456 				 * For mounts from the MDS to DS, this is done
457 				 * via mount options, but that is not the case
458 				 * here.  The retry limit here can be adjusted
459 				 * via the sysctl vfs.nfs.dsretries.
460 				 * See the comment above w.r.t. timeout.
461 				 */
462 				timo.tv_sec = nfsrv_lease / 4;
463 				if (timo.tv_sec < 10)
464 					timo.tv_sec = 10;
465 				timo.tv_usec = 0;
466 				CLNT_CONTROL(client, CLSET_TIMEOUT, &timo);
467 				retries = nfs_dsretries;
468 			}
469 		}
470 	} else {
471 		/*
472 		 * Three cases:
473 		 * - Null RPC callback to client
474 		 * - Non-Null RPC callback to client, wait a little longer
475 		 * - upcalls to nfsuserd and gssd (clp == NULL)
476 		 */
477 		if (callback_retry_mult == 0) {
478 			retries = NFSV4_UPCALLRETRY;
479 			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
480 		} else {
481 			retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
482 		}
483 		if (dotls)
484 			CLNT_CONTROL(client, CLSET_TLS, &one);
485 	}
486 	CLNT_CONTROL(client, CLSET_RETRIES, &retries);
487 
488 	if (nmp != NULL) {
489 		/*
490 		 * For UDP, there are 2 timeouts:
491 		 * - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
492 		 *   that does a retransmit of an RPC request using the same
493 		 *   socket and xid. This is what you normally want to do,
494 		 *   since NFS servers depend on "same xid" for their
495 		 *   Duplicate Request Cache.
496 		 * - timeout specified in CLNT_CALL_MBUF(), which specifies when
497 		 *   retransmits on the same socket should fail and a fresh
498 	 *   socket be created. Each of these timeouts counts as one
499 		 *   CLSET_RETRIES as set above.
500 		 * Set the initial retransmit timeout for UDP. This timeout
501 		 * doesn't exist for TCP and the following call just fails,
502 		 * which is ok.
503 		 */
504 		timo.tv_sec = nmp->nm_timeo / NFS_HZ;
505 		timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
506 		CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
507 	}
508 
509 	/*
510 	 * *clipp is &nrp->nr_client or &nm_aconn[nmp->nm_nextaconn].
511 	 * The latter case is for additional connections specified by the
512 	 * "nconnect" mount option.  nr_mtx etc is used for these additional
513 	 * connections, as well as nr_client in the nfssockreq
514 	 * structure for the mount.
515 	 */
516 	mtx_lock(&nrp->nr_mtx);
517 	if (*clipp != NULL) {
518 		mtx_unlock(&nrp->nr_mtx);
519 		/*
520 		 * Someone else already connected.
521 		 */
522 		CLNT_RELEASE(client);
523 	} else {
524 		*clipp = client;
525 		/*
526 		 * Protocols that do not require connections may be optionally
527 		 * left unconnected for servers that reply from a port other
528 		 * than NFS_PORT.
529 		 */
530 		if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
531 			mtx_unlock(&nrp->nr_mtx);
532 			CLNT_CONTROL(client, CLSET_CONNECT, &one);
533 		} else
534 			mtx_unlock(&nrp->nr_mtx);
535 	}
536 
537 out:
538 	/* Restore current thread's credentials. */
539 	td->td_ucred = origcred;
540 
541 	NFSEXITCODE(error);
542 	return (error);
543 }
544 
545 /*
546  * NFS disconnect. Clean up and unlink.
547  */
548 void
549 newnfs_disconnect(struct nfsmount *nmp, struct nfssockreq *nrp)
550 {
551 	CLIENT *client, *aconn[NFS_MAXNCONN - 1];
552 	int i;
553 
554 	mtx_lock(&nrp->nr_mtx);
555 	if (nrp->nr_client != NULL) {
556 		client = nrp->nr_client;
557 		nrp->nr_client = NULL;
558 		if (nmp != NULL && nmp->nm_aconnect > 0) {
559 			for (i = 0; i < nmp->nm_aconnect; i++) {
560 				aconn[i] = nmp->nm_aconn[i];
561 				nmp->nm_aconn[i] = NULL;
562 			}
563 		}
564 		mtx_unlock(&nrp->nr_mtx);
565 		rpc_gss_secpurge_call(client);
566 		CLNT_CLOSE(client);
567 		CLNT_RELEASE(client);
568 		if (nmp != NULL && nmp->nm_aconnect > 0) {
569 			for (i = 0; i < nmp->nm_aconnect; i++) {
570 				if (aconn[i] != NULL) {
571 					rpc_gss_secpurge_call(aconn[i]);
572 					CLNT_CLOSE(aconn[i]);
573 					CLNT_RELEASE(aconn[i]);
574 				}
575 			}
576 		}
577 	} else {
578 		mtx_unlock(&nrp->nr_mtx);
579 	}
580 }
581 
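/*
 * Acquire an RPC authentication handle for the given security flavour.
 * For the RPCSEC_GSS Kerberos flavours, look up or create a GSS based
 * handle; otherwise, or if the lookup fails, fall back to AUTH_SYS via
 * authunix_create().
 */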
582 static AUTH *
583 nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
584     char *srv_principal, gss_OID mech_oid, struct ucred *cred)
585 {
586 	rpc_gss_service_t svc;
587 	AUTH *auth;
588 
589 	switch (secflavour) {
590 	case RPCSEC_GSS_KRB5:
591 	case RPCSEC_GSS_KRB5I:
592 	case RPCSEC_GSS_KRB5P:
593 		if (!mech_oid) {
594 			if (!rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
595 				return (NULL);
596 		}
597 		if (secflavour == RPCSEC_GSS_KRB5)
598 			svc = rpc_gss_svc_none;
599 		else if (secflavour == RPCSEC_GSS_KRB5I)
600 			svc = rpc_gss_svc_integrity;
601 		else
602 			svc = rpc_gss_svc_privacy;
603 
604 		if (clnt_principal == NULL)
605 			auth = rpc_gss_secfind_call(nrp->nr_client, cred,
606 			    srv_principal, mech_oid, svc);
607 		else {
608 			auth = rpc_gss_seccreate_call(nrp->nr_client, cred,
609 			    clnt_principal, srv_principal, "kerberosv5",
610 			    svc, NULL, NULL, NULL);
611 			return (auth);
612 		}
613 		if (auth != NULL)
614 			return (auth);
615 		/* fallthrough */
616 	case AUTH_SYS:
617 	default:
618 		return (authunix_create(cred));
619 	}
620 }
621 
622 /*
623  * Callback from the RPC code to generate up/down notifications.
624  */
625 
626 struct nfs_feedback_arg {
627 	struct nfsmount *nf_mount;
628 	int		nf_lastmsg;	/* last tprintf */
629 	int		nf_tprintfmsg;
630 	struct thread	*nf_td;
631 };
632 
633 static void
634 nfs_feedback(int type, int proc, void *arg)
635 {
636 	struct nfs_feedback_arg *nf = (struct nfs_feedback_arg *) arg;
637 	struct nfsmount *nmp = nf->nf_mount;
638 	time_t now;
639 
640 	switch (type) {
641 	case FEEDBACK_REXMIT2:
642 	case FEEDBACK_RECONNECT:
643 		now = NFSD_MONOSEC;
644 		if (nf->nf_lastmsg + nmp->nm_tprintf_delay < now) {
645 			nfs_down(nmp, nf->nf_td,
646 			    "not responding", 0, NFSSTA_TIMEO);
647 			nf->nf_tprintfmsg = TRUE;
648 			nf->nf_lastmsg = now;
649 		}
650 		break;
651 
652 	case FEEDBACK_OK:
653 		nfs_up(nf->nf_mount, nf->nf_td,
654 		    "is alive again", NFSSTA_TIMEO, nf->nf_tprintfmsg);
655 		break;
656 	}
657 }
658 
659 /*
660  * newnfs_request - goes something like this
661  *	- does the rpc by calling the krpc layer
662  *	- break down rpc header and return with nfs reply
663  * nb: always frees up nd_mreq mbuf list
664  */
665 int
666 newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
667     struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
668     struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
669     u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep)
670 {
671 	uint32_t retseq, retval, slotseq, *tl;
672 	int i = 0, j = 0, opcnt, set_sigset = 0, slot;
673 	int error = 0, usegssname = 0, secflavour = AUTH_SYS;
674 	int freeslot, maxslot, reterr, slotpos, timeo;
675 	u_int16_t procnum;
676 	u_int nextconn;
677 	struct nfs_feedback_arg nf;
678 	struct timeval timo;
679 	AUTH *auth;
680 	struct rpc_callextra ext;
681 	enum clnt_stat stat;
682 	struct nfsreq *rep = NULL;
683 	char *srv_principal = NULL, *clnt_principal = NULL;
684 	sigset_t oldset;
685 	struct ucred *authcred;
686 	struct nfsclsession *sep;
687 	uint8_t sessionid[NFSX_V4SESSIONID];
688 	bool nextconn_set;
689 	struct timespec trylater_delay, ts, waituntil;
690 
691 	/* Initially 1msec. */
692 	trylater_delay.tv_sec = 0;
693 	trylater_delay.tv_nsec = 1000000;
694 	sep = dssep;
695 	if (xidp != NULL)
696 		*xidp = 0;
697 	/* Reject requests while attempting a forced unmount. */
698 	if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
699 		m_freem(nd->nd_mreq);
700 		return (ESTALE);
701 	}
702 
703 	/*
704 	 * Set authcred, which is used to acquire RPC credentials, to
705 	 * the cred argument by default. The crhold() should not be
706 	 * necessary, but will ensure that some future code change
707 	 * doesn't result in the credential being free'd prematurely.
708 	 */
709 	authcred = crhold(cred);
710 
711 	/* For client side interruptible mounts, mask off the signals. */
712 	if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
713 		newnfs_set_sigmask(td, &oldset);
714 		set_sigset = 1;
715 	}
716 
717 	/*
718 	 * If not already connected call newnfs_connect now.
719 	 */
720 	if (nrp->nr_client == NULL)
721 		newnfs_connect(nmp, nrp, cred, td, 0, false, &nrp->nr_client);
722 
723 	/*
724 	 * If the "nconnect" mount option was specified and this RPC is
725 	 * one that can have a large RPC message and is being done through
726 	 * the NFS/MDS server, use an additional connection. (When the RPC is
727 	 * being done through the server/MDS, nrp == &nmp->nm_sockreq.)
728 	 * The "nconnect" mount option normally has minimal effect when the
729 	 * "pnfs" mount option is specified, since only Readdir RPCs are
730 	 * normally done through the NFS/MDS server.
731 	 */
732 	nextconn_set = false;
733 	if (nmp != NULL && nmp->nm_aconnect > 0 && nrp == &nmp->nm_sockreq &&
734 	    (nd->nd_procnum == NFSPROC_READ ||
735 	     nd->nd_procnum == NFSPROC_READDIR ||
736 	     nd->nd_procnum == NFSPROC_READDIRPLUS ||
737 	     nd->nd_procnum == NFSPROC_WRITE)) {
738 		nextconn = atomic_fetchadd_int(&nmp->nm_nextaconn, 1);
739 		nextconn %= nmp->nm_aconnect;
740 		nextconn_set = true;
741 		if (nmp->nm_aconn[nextconn] == NULL)
742 			newnfs_connect(nmp, nrp, cred, td, 0, false,
743 			    &nmp->nm_aconn[nextconn]);
744 	}
745 
746 	/*
747 	 * For a client side mount, nmp is != NULL and clp == NULL. For
748 	 * server calls (callbacks or upcalls), nmp == NULL.
749 	 */
750 	if (clp != NULL) {
751 		NFSLOCKSTATE();
752 		if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
753 			secflavour = RPCSEC_GSS_KRB5;
754 			if (nd->nd_procnum != NFSPROC_NULL) {
755 				if (clp->lc_flags & LCL_GSSINTEGRITY)
756 					secflavour = RPCSEC_GSS_KRB5I;
757 				else if (clp->lc_flags & LCL_GSSPRIVACY)
758 					secflavour = RPCSEC_GSS_KRB5P;
759 			}
760 		}
761 		NFSUNLOCKSTATE();
762 	} else if (nmp != NULL && NFSHASKERB(nmp) &&
763 	     nd->nd_procnum != NFSPROC_NULL && (!NFSHASSYSKRB5(nmp) ||
764 	     nfscl_use_gss[nd->nd_procnum])) {
765 		if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
766 			nd->nd_flag |= ND_USEGSSNAME;
767 		if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
768 			/*
769 			 * If there is a client side host based credential,
770 			 * use that, otherwise use the system uid, if set.
771 			 * The system uid is in the nmp->nm_sockreq.nr_cred
772 			 * credentials.
773 			 */
774 			if (nmp->nm_krbnamelen > 0) {
775 				usegssname = 1;
776 				clnt_principal = nmp->nm_krbname;
777 			} else if (nmp->nm_uid != (uid_t)-1) {
778 				KASSERT(nmp->nm_sockreq.nr_cred != NULL,
779 				    ("newnfs_request: NULL nr_cred"));
780 				crfree(authcred);
781 				authcred = crhold(nmp->nm_sockreq.nr_cred);
782 			}
783 		} else if (nmp->nm_krbnamelen == 0 &&
784 		    nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
785 			/*
786 			 * If there is no host based principal name and
787 			 * the system uid is set and this is root, use the
788 			 * system uid, since root won't have user
789 			 * credentials in a credentials cache file.
790 			 * The system uid is in the nmp->nm_sockreq.nr_cred
791 			 * credentials.
792 			 */
793 			KASSERT(nmp->nm_sockreq.nr_cred != NULL,
794 			    ("newnfs_request: NULL nr_cred"));
795 			crfree(authcred);
796 			authcred = crhold(nmp->nm_sockreq.nr_cred);
797 		}
798 		if (NFSHASINTEGRITY(nmp))
799 			secflavour = RPCSEC_GSS_KRB5I;
800 		else if (NFSHASPRIVACY(nmp))
801 			secflavour = RPCSEC_GSS_KRB5P;
802 		else
803 			secflavour = RPCSEC_GSS_KRB5;
804 		srv_principal = NFSMNT_SRVKRBNAME(nmp);
805 	} else if (nmp != NULL && (!NFSHASKERB(nmp) || NFSHASSYSKRB5(nmp)) &&
806 	    nd->nd_procnum != NFSPROC_NULL &&
807 	    (nd->nd_flag & ND_USEGSSNAME) != 0) {
808 		/*
809 		 * Use the uid that did the mount when the RPC is doing
810 		 * NFSv4 system operations, as indicated by the
811 		 * ND_USEGSSNAME flag, for the AUTH_SYS case.
812 		 * The credentials in nm_sockreq.nr_cred were used for the
813 		 * mount.
814 		 */
815 		KASSERT(nmp->nm_sockreq.nr_cred != NULL,
816 		    ("newnfs_request: NULL nr_cred"));
817 		crfree(authcred);
818 		authcred = crhold(nmp->nm_sockreq.nr_cred);
819 	}
820 
821 	if (nmp != NULL) {
822 		bzero(&nf, sizeof(struct nfs_feedback_arg));
823 		nf.nf_mount = nmp;
824 		nf.nf_td = td;
825 		nf.nf_lastmsg = NFSD_MONOSEC -
826 		    ((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
827 	}
828 
829 	if (nd->nd_procnum == NFSPROC_NULL)
830 		auth = authnone_create();
831 	else if (usegssname) {
832 		/*
833 		 * For this case, the authenticator is held in the
834 		 * nfssockreq structure, so don't release the reference count
835 		 * held on it. --> Don't AUTH_DESTROY() it in this function.
836 		 */
837 		if (nrp->nr_auth == NULL)
838 			nrp->nr_auth = nfs_getauth(nrp, secflavour,
839 			    clnt_principal, srv_principal, NULL, authcred);
840 		else
841 			rpc_gss_refresh_auth_call(nrp->nr_auth);
842 		auth = nrp->nr_auth;
843 	} else
844 		auth = nfs_getauth(nrp, secflavour, NULL,
845 		    srv_principal, NULL, authcred);
846 	crfree(authcred);
847 	if (auth == NULL) {
848 		m_freem(nd->nd_mreq);
849 		if (set_sigset)
850 			newnfs_restore_sigmask(td, &oldset);
851 		return (EACCES);
852 	}
853 	bzero(&ext, sizeof(ext));
854 	ext.rc_auth = auth;
855 	if (nmp != NULL) {
856 		ext.rc_feedback = nfs_feedback;
857 		ext.rc_feedback_arg = &nf;
858 	}
859 
860 	procnum = nd->nd_procnum;
861 	if ((nd->nd_flag & ND_NFSV4) &&
862 	    nd->nd_procnum != NFSPROC_NULL &&
863 	    nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
864 		procnum = NFSV4PROC_COMPOUND;
865 
866 	if (nmp != NULL) {
867 		NFSINCRGLOBAL(nfsstatsv1.rpcrequests);
868 
869 		/* Map the procnum to the old NFSv2 one, as required. */
870 		if ((nd->nd_flag & ND_NFSV2) != 0) {
871 			if (nd->nd_procnum < NFS_V3NPROCS)
872 				procnum = nfsv2_procid[nd->nd_procnum];
873 			else
874 				procnum = NFSV2PROC_NOOP;
875 		}
876 
877 		/*
878 		 * Now only used for the R_DONTRECOVER case, but until that is
879 		 * supported within the krpc code, I need to keep a queue of
880 		 * outstanding RPCs for nfsv4 client requests.
881 		 */
882 		if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
883 			rep = malloc(sizeof(struct nfsreq),
884 			    M_NFSDREQ, M_WAITOK);
885 #ifdef KDTRACE_HOOKS
886 		if (dtrace_nfscl_nfs234_start_probe != NULL) {
887 			uint32_t probe_id;
888 			int probe_procnum;
889 
890 			if (nd->nd_flag & ND_NFSV4) {
891 				probe_id =
892 				    nfscl_nfs4_start_probes[nd->nd_procnum];
893 				probe_procnum = nd->nd_procnum;
894 			} else if (nd->nd_flag & ND_NFSV3) {
895 				probe_id = nfscl_nfs3_start_probes[procnum];
896 				probe_procnum = procnum;
897 			} else {
898 				probe_id =
899 				    nfscl_nfs2_start_probes[nd->nd_procnum];
900 				probe_procnum = procnum;
901 			}
902 			if (probe_id != 0)
903 				(dtrace_nfscl_nfs234_start_probe)
904 				    (probe_id, vp, nd->nd_mreq, cred,
905 				     probe_procnum);
906 		}
907 #endif
908 	}
909 	freeslot = -1;		/* Set to slot that needs to be free'd */
910 tryagain:
911 	slot = -1;		/* Slot that needs a sequence# increment. */
912 	/*
913 	 * This timeout specifies when a new socket should be created,
914 	 * along with new xid values. For UDP, this should be done
915 	 * infrequently, since retransmits of RPC requests should normally
916 	 * use the same xid.
917 	 */
918 	if (nmp == NULL) {
919 		if (clp == NULL) {
920 			timo.tv_sec = NFSV4_UPCALLTIMEO;
921 			timo.tv_usec = 0;
922 		} else {
923 			timo.tv_sec = NFSV4_CALLBACKTIMEO / 1000;
924 			timo.tv_usec = NFSV4_CALLBACKTIMEO * 1000;
925 		}
926 	} else {
927 		if (nrp->nr_sotype != SOCK_DGRAM) {
928 			timo.tv_usec = 0;
929 			if ((nmp->nm_flag & NFSMNT_NFSV4))
930 				timo.tv_sec = INT_MAX;
931 			else
932 				timo.tv_sec = NFS_TCPTIMEO;
933 		} else {
934 			if (NFSHASSOFT(nmp)) {
935 				/*
936 				 * CLSET_RETRIES is set to 2, so this should be
937 				 * half of the total timeout required.
938 				 */
939 				timeo = nmp->nm_retry * nmp->nm_timeo / 2;
940 				if (timeo < 1)
941 					timeo = 1;
942 				timo.tv_sec = timeo / NFS_HZ;
943 				timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
944 				    NFS_HZ;
945 			} else {
946 				/* For UDP hard mounts, use a large value. */
947 				timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
948 				timo.tv_usec = 0;
949 			}
950 		}
951 
952 		if (rep != NULL) {
953 			rep->r_flags = 0;
954 			rep->r_nmp = nmp;
955 			/*
956 			 * Chain request into list of outstanding requests.
957 			 */
958 			NFSLOCKREQ();
959 			TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
960 			NFSUNLOCKREQ();
961 		}
962 	}
963 
964 	nd->nd_mrep = NULL;
965 	if (clp != NULL && sep != NULL)
966 		stat = clnt_bck_call(nrp->nr_client, &ext, procnum,
967 		    nd->nd_mreq, &nd->nd_mrep, timo, sep->nfsess_xprt);
968 	else if (nextconn_set)
969 		/*
970 		 * When there are multiple TCP connections, send the
971 		 * RPCs with large messages on the alternate TCP
972 		 * connection(s) in a round robin fashion.
973 		 * The small RPC messages are sent on the default
974 		 * TCP connection because they do not require much
975 		 * network bandwidth and separating them from the
976 		 * large RPC messages avoids them getting "log jammed"
977 		 * behind several large RPC messages.
978 		 */
979 		stat = CLNT_CALL_MBUF(nmp->nm_aconn[nextconn],
980 		    &ext, procnum, nd->nd_mreq, &nd->nd_mrep, timo);
981 	else
982 		stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum,
983 		    nd->nd_mreq, &nd->nd_mrep, timo);
984 	NFSCL_DEBUG(2, "clnt call=%d\n", stat);
985 
986 	if (rep != NULL) {
987 		/*
988 		 * RPC done, unlink the request.
989 		 */
990 		NFSLOCKREQ();
991 		TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
992 		NFSUNLOCKREQ();
993 	}
994 
995 	/*
996 	 * If there was a successful reply and a tprintf msg,
997 	 * tprintf a response.
998 	 */
999 	if (stat == RPC_SUCCESS) {
1000 		error = 0;
1001 	} else if (stat == RPC_TIMEDOUT) {
1002 		NFSINCRGLOBAL(nfsstatsv1.rpctimeouts);
1003 		error = ETIMEDOUT;
1004 	} else if (stat == RPC_VERSMISMATCH) {
1005 		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1006 		error = EOPNOTSUPP;
1007 	} else if (stat == RPC_PROGVERSMISMATCH) {
1008 		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1009 		error = EPROTONOSUPPORT;
1010 	} else if (stat == RPC_CANTSEND || stat == RPC_CANTRECV ||
1011 	     stat == RPC_SYSTEMERROR || stat == RPC_INTR) {
1012 		/* Check for a session slot that needs to be free'd. */
1013 		if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1014 		    (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1015 		    nd->nd_procnum != NFSPROC_NULL) {
1016 			/*
1017 			 * This should only occur when either the MDS or
1018 			 * a client has an RPC against a DS fail.
1019 			 * This happens because these cases use "soft"
1020 			 * connections that can time out and fail.
1021 			 * The slot used for this RPC is now in a
1022 			 * non-deterministic state, but if the slot isn't
1023 			 * free'd, threads can get stuck waiting for a slot.
1024 			 */
1025 			if (sep == NULL)
1026 				sep = nfsmnt_mdssession(nmp);
1027 			/*
1028 			 * Bump the sequence# out of range, so that reuse of
1029 			 * this slot will result in an NFSERR_SEQMISORDERED
1030 			 * error and not a bogus cached RPC reply.
1031 			 */
1032 			mtx_lock(&sep->nfsess_mtx);
1033 			sep->nfsess_slotseq[nd->nd_slotid] += 10;
1034 			sep->nfsess_badslots |= (0x1ULL << nd->nd_slotid);
1035 			mtx_unlock(&sep->nfsess_mtx);
1036 			/* And free the slot. */
1037 			nfsv4_freeslot(sep, nd->nd_slotid, false);
1038 		}
1039 		if (stat == RPC_INTR)
1040 			error = EINTR;
1041 		else {
1042 			NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1043 			error = ENXIO;
1044 		}
1045 	} else {
1046 		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1047 		error = EACCES;
1048 	}
1049 	if (error) {
1050 		m_freem(nd->nd_mreq);
1051 		if (usegssname == 0)
1052 			AUTH_DESTROY(auth);
1053 		if (rep != NULL)
1054 			free(rep, M_NFSDREQ);
1055 		if (set_sigset)
1056 			newnfs_restore_sigmask(td, &oldset);
1057 		return (error);
1058 	}
1059 
1060 	KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
1061 
1062 	/*
1063 	 * Search for any mbufs that are not a multiple of 4 bytes long
1064 	 * or with m_data not longword aligned.
1065 	 * These could cause pointer alignment problems, so copy them to
1066 	 * well aligned mbufs.
1067 	 */
1068 	newnfs_realign(&nd->nd_mrep, M_WAITOK);
1069 	nd->nd_md = nd->nd_mrep;
1070 	nd->nd_dpos = mtod(nd->nd_md, caddr_t);
1071 	nd->nd_repstat = 0;
1072 	if (nd->nd_procnum != NFSPROC_NULL &&
1073 	    nd->nd_procnum != NFSV4PROC_CBNULL) {
1074 		/* If sep == NULL, set it to the default in nmp. */
1075 		if (sep == NULL && nmp != NULL)
1076 			sep = nfsmnt_mdssession(nmp);
1077 		/*
1078 		 * and now the actual NFS xdr.
1079 		 */
1080 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1081 		nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
1082 		if (nd->nd_repstat >= 10000)
1083 			NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
1084 			    (int)nd->nd_repstat);
1085 
1086 		/*
1087 		 * Get rid of the tag, return count and SEQUENCE result for
1088 		 * NFSv4.
1089 		 */
1090 		if ((nd->nd_flag & ND_NFSV4) != 0 && nd->nd_repstat !=
1091 		    NFSERR_MINORVERMISMATCH) {
1092 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1093 			i = fxdr_unsigned(int, *tl);
1094 			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
1095 			if (error)
1096 				goto nfsmout;
1097 			NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1098 			opcnt = fxdr_unsigned(int, *tl++);
1099 			i = fxdr_unsigned(int, *tl++);
1100 			j = fxdr_unsigned(int, *tl);
1101 			if (j >= 10000)
1102 				NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
1103 			/*
1104 			 * If the first op is Sequence, free up the slot.
1105 			 */
1106 			if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0) ||
1107 			   (clp != NULL && i == NFSV4OP_CBSEQUENCE && j != 0)) {
1108 				NFSCL_DEBUG(1, "failed seq=%d\n", j);
1109 				if (sep != NULL && i == NFSV4OP_SEQUENCE &&
1110 				    j == NFSERR_SEQMISORDERED) {
1111 					mtx_lock(&sep->nfsess_mtx);
1112 					sep->nfsess_badslots |=
1113 					    (0x1ULL << nd->nd_slotid);
1114 					mtx_unlock(&sep->nfsess_mtx);
1115 				}
1116 			}
1117 			if (((nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) ||
1118 			    (clp != NULL && i == NFSV4OP_CBSEQUENCE &&
1119 			    j == 0)) && sep != NULL) {
1120 				if (i == NFSV4OP_SEQUENCE)
1121 					NFSM_DISSECT(tl, uint32_t *,
1122 					    NFSX_V4SESSIONID +
1123 					    5 * NFSX_UNSIGNED);
1124 				else
1125 					NFSM_DISSECT(tl, uint32_t *,
1126 					    NFSX_V4SESSIONID +
1127 					    4 * NFSX_UNSIGNED);
1128 				mtx_lock(&sep->nfsess_mtx);
1129 				if (bcmp(tl, sep->nfsess_sessionid,
1130 				    NFSX_V4SESSIONID) == 0) {
1131 					tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
1132 					retseq = fxdr_unsigned(uint32_t, *tl++);
1133 					slot = fxdr_unsigned(int, *tl++);
1134 					if ((nd->nd_flag & ND_HASSLOTID) != 0) {
1135 						if (slot >= NFSV4_SLOTS ||
1136 						    (i == NFSV4OP_CBSEQUENCE &&
1137 						     slot >= NFSV4_CBSLOTS)) {
1138 							printf("newnfs_request:"
1139 							    " Bogus slot\n");
1140 							slot = nd->nd_slotid;
1141 						} else if (slot !=
1142 						    nd->nd_slotid) {
1143 						    printf("newnfs_request:"
1144 							" Wrong session "
1145 							"srvslot=%d "
1146 							"slot=%d\n", slot,
1147 							nd->nd_slotid);
1148 						    if (i == NFSV4OP_SEQUENCE) {
1149 							/*
1150 							 * Mark both slots as
1151 							 * bad, because we do
1152 							 * not know if the
1153 							 * server has advanced
1154 							 * the sequence# for
1155 							 * either of them.
1156 							 */
1157 							sep->nfsess_badslots |=
1158 							    (0x1ULL << slot);
1159 							sep->nfsess_badslots |=
1160 							    (0x1ULL <<
1161 							     nd->nd_slotid);
1162 						    }
1163 						    slot = nd->nd_slotid;
1164 						}
1165 						freeslot = slot;
1166 					} else if (slot != 0) {
1167 						printf("newnfs_request: Bad "
1168 						    "session slot=%d\n", slot);
1169 						slot = 0;
1170 					}
1171 					if (retseq != sep->nfsess_slotseq[slot])
1172 						printf("retseq diff 0x%x\n",
1173 						    retseq);
1174 					retval = fxdr_unsigned(uint32_t, *++tl);
1175 					if ((retval + 1) < sep->nfsess_foreslots
1176 					    )
1177 						sep->nfsess_foreslots = (retval
1178 						    + 1);
1179 					else if ((retval + 1) >
1180 					    sep->nfsess_foreslots)
1181 						sep->nfsess_foreslots = (retval
1182 						    < 64) ? (retval + 1) : 64;
1183 				}
1184 				mtx_unlock(&sep->nfsess_mtx);
1185 
1186 				/* Grab the op and status for the next one. */
1187 				if (opcnt > 1) {
1188 					NFSM_DISSECT(tl, uint32_t *,
1189 					    2 * NFSX_UNSIGNED);
1190 					i = fxdr_unsigned(int, *tl++);
1191 					j = fxdr_unsigned(int, *tl);
1192 				}
1193 			}
1194 		}
1195 		if (nd->nd_repstat != 0) {
1196 			if (nd->nd_repstat == NFSERR_BADSESSION &&
1197 			    nmp != NULL && dssep == NULL &&
1198 			    (nd->nd_flag & ND_NFSV41) != 0) {
1199 				/*
1200 				 * If this is a client side MDS RPC, mark
1201 				 * the MDS session defunct and initiate
1202 				 * recovery, as required.
1203 				 * The nfsess_defunct field is protected by
1204 				 * the NFSLOCKMNT()/nm_mtx lock and not the
1205 				 * nfsess_mtx lock to simplify its handling,
1206 				 * for the MDS session. This lock is also
1207 				 * sufficient for nfsess_sessionid, since it
1208 				 * never changes in the structure.
1209 				 */
1210 				NFSCL_DEBUG(1, "Got badsession\n");
1211 				NFSLOCKCLSTATE();
1212 				NFSLOCKMNT(nmp);
1213 				sep = NFSMNT_MDSSESSION(nmp);
1214 				if (bcmp(sep->nfsess_sessionid, nd->nd_sequence,
1215 				    NFSX_V4SESSIONID) == 0) {
1216 					printf("Initiate recovery. If server "
1217 					    "has not rebooted, "
1218 					    "check NFS clients for unique "
1219 					    "/etc/hostid's\n");
1220 					/* Initiate recovery. */
1221 					sep->nfsess_defunct = 1;
1222 					NFSCL_DEBUG(1, "Marked defunct\n");
1223 					if (nmp->nm_clp != NULL) {
1224 						nmp->nm_clp->nfsc_flags |=
1225 						    NFSCLFLAGS_RECOVER;
1226 						wakeup(nmp->nm_clp);
1227 					}
1228 				}
1229 				NFSUNLOCKCLSTATE();
1230 				/*
1231 				 * Sleep for up to 1sec waiting for a new
1232 				 * session.
1233 				 */
1234 				mtx_sleep(&nmp->nm_sess, &nmp->nm_mtx, PZERO,
1235 				    "nfsbadsess", hz);
1236 				/*
1237 				 * Get the session again, in case a new one
1238 				 * has been created during the sleep.
1239 				 */
1240 				sep = NFSMNT_MDSSESSION(nmp);
1241 				NFSUNLOCKMNT(nmp);
1242 				if ((nd->nd_flag & ND_LOOPBADSESS) != 0) {
1243 					reterr = nfsv4_sequencelookup(nmp, sep,
1244 					    &slotpos, &maxslot, &slotseq,
1245 					    sessionid, true);
1246 					if (reterr == 0) {
1247 						/* Fill in new session info. */
1248 						NFSCL_DEBUG(1,
1249 						  "Filling in new sequence\n");
1250 						tl = nd->nd_sequence;
1251 						bcopy(sessionid, tl,
1252 						    NFSX_V4SESSIONID);
1253 						tl += NFSX_V4SESSIONID /
1254 						    NFSX_UNSIGNED;
1255 						*tl++ = txdr_unsigned(slotseq);
1256 						*tl++ = txdr_unsigned(slotpos);
1257 						*tl = txdr_unsigned(maxslot);
1258 						nd->nd_slotid = slotpos;
1259 						nd->nd_flag |= ND_HASSLOTID;
1260 					}
1261 					if (reterr == NFSERR_BADSESSION ||
1262 					    reterr == 0) {
1263 						NFSCL_DEBUG(1,
1264 						    "Badsession looping\n");
1265 						m_freem(nd->nd_mrep);
1266 						nd->nd_mrep = NULL;
1267 						goto tryagain;
1268 					}
1269 					nd->nd_repstat = reterr;
1270 					NFSCL_DEBUG(1, "Got err=%d\n", reterr);
1271 				}
1272 			}
1273 			/*
1274 			 * When clp != NULL, it is a callback and all
1275 			 * callback operations can be retried for NFSERR_DELAY.
1276 			 */
1277 			if (((nd->nd_repstat == NFSERR_DELAY ||
1278 			      nd->nd_repstat == NFSERR_GRACE) &&
1279 			     (nd->nd_flag & ND_NFSV4) && (clp != NULL ||
1280 			     (nd->nd_procnum != NFSPROC_DELEGRETURN &&
1281 			     nd->nd_procnum != NFSPROC_SETATTR &&
1282 			     nd->nd_procnum != NFSPROC_READ &&
1283 			     nd->nd_procnum != NFSPROC_READDS &&
1284 			     nd->nd_procnum != NFSPROC_WRITE &&
1285 			     nd->nd_procnum != NFSPROC_WRITEDS &&
1286 			     nd->nd_procnum != NFSPROC_OPEN &&
1287 			     nd->nd_procnum != NFSPROC_OPENLAYGET &&
1288 			     nd->nd_procnum != NFSPROC_CREATE &&
1289 			     nd->nd_procnum != NFSPROC_CREATELAYGET &&
1290 			     nd->nd_procnum != NFSPROC_OPENCONFIRM &&
1291 			     nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
1292 			     nd->nd_procnum != NFSPROC_CLOSE &&
1293 			     nd->nd_procnum != NFSPROC_LOCK &&
1294 			     nd->nd_procnum != NFSPROC_LOCKU))) ||
1295 			    (nd->nd_repstat == NFSERR_DELAY &&
1296 			     (nd->nd_flag & ND_NFSV4) == 0) ||
1297 			    nd->nd_repstat == NFSERR_RESOURCE) {
1298 				/* Clip at NFS_TRYLATERDEL. */
1299 				if (timespeccmp(&trylater_delay,
1300 				    &nfs_trylater_max, >))
1301 					trylater_delay = nfs_trylater_max;
1302 				getnanouptime(&waituntil);
1303 				timespecadd(&waituntil, &trylater_delay,
1304 				    &waituntil);
1305 				do {
1306 					nfs_catnap(PZERO, 0, "nfstry");
1307 					getnanouptime(&ts);
1308 				} while (timespeccmp(&ts, &waituntil, <));
1309 				timespecadd(&trylater_delay, &trylater_delay,
1310 				    &trylater_delay);	/* Double each time. */
1311 				if (slot != -1) {
1312 					mtx_lock(&sep->nfsess_mtx);
1313 					sep->nfsess_slotseq[slot]++;
1314 					*nd->nd_slotseq = txdr_unsigned(
1315 					    sep->nfsess_slotseq[slot]);
1316 					mtx_unlock(&sep->nfsess_mtx);
1317 				}
1318 				m_freem(nd->nd_mrep);
1319 				nd->nd_mrep = NULL;
1320 				goto tryagain;
1321 			}
1322 
1323 			/*
1324 			 * If the File Handle was stale, invalidate the
1325 			 * lookup cache, just in case.
1326 			 * (vp != NULL implies a client side call)
1327 			 */
1328 			if (nd->nd_repstat == ESTALE && vp != NULL) {
1329 				cache_purge(vp);
1330 				if (ncl_call_invalcaches != NULL)
1331 					(*ncl_call_invalcaches)(vp);
1332 			}
1333 		}
1334 		if ((nd->nd_flag & ND_NFSV4) != 0) {
1335 			/* Free the slot, as required. */
1336 			if (freeslot != -1)
1337 				nfsv4_freeslot(sep, freeslot, false);
1338 			/*
1339 			 * If this op is Putfh, throw its results away.
1340 			 */
1341 			if (j >= 10000)
1342 				NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
1343 			if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
1344 				NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
1345 				i = fxdr_unsigned(int, *tl++);
1346 				j = fxdr_unsigned(int, *tl);
1347 				if (j >= 10000)
1348 					NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
1349 					    j);
1350 				/*
1351 				 * All Compounds that do an Op that must
1352 				 * be in sequence consist of NFSV4OP_PUTFH
1353 				 * followed by one of these. As such, we
1354 				 * can determine if the seqid# should be
1355 				 * incremented, here.
1356 				 */
1357 				if ((i == NFSV4OP_OPEN ||
1358 				     i == NFSV4OP_OPENCONFIRM ||
1359 				     i == NFSV4OP_OPENDOWNGRADE ||
1360 				     i == NFSV4OP_CLOSE ||
1361 				     i == NFSV4OP_LOCK ||
1362 				     i == NFSV4OP_LOCKU) &&
1363 				    (j == 0 ||
1364 				     (j != NFSERR_STALECLIENTID &&
1365 				      j != NFSERR_STALESTATEID &&
1366 				      j != NFSERR_BADSTATEID &&
1367 				      j != NFSERR_BADSEQID &&
1368 				      j != NFSERR_BADXDR &&
1369 				      j != NFSERR_RESOURCE &&
1370 				      j != NFSERR_NOFILEHANDLE)))
1371 					nd->nd_flag |= ND_INCRSEQID;
1372 			}
1373 			/*
1374 			 * If this op's status is non-zero, mark
1375 			 * that there is no more data to process.
1376 			 * The exception is Setattr, which always has xdr
1377 			 * when it has failed.
1378 			 */
1379 			if (j != 0 && i != NFSV4OP_SETATTR)
1380 				nd->nd_flag |= ND_NOMOREDATA;
1381 
1382 			/*
1383 			 * If R_DONTRECOVER is set, replace the stale error
1384 			 * reply, so that recovery isn't initiated.
1385 			 */
1386 			if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
1387 			     nd->nd_repstat == NFSERR_BADSESSION ||
1388 			     nd->nd_repstat == NFSERR_STALESTATEID) &&
1389 			    rep != NULL && (rep->r_flags & R_DONTRECOVER))
1390 				nd->nd_repstat = NFSERR_STALEDONTRECOVER;
1391 		}
1392 	}
1393 
1394 #ifdef KDTRACE_HOOKS
1395 	if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
1396 		uint32_t probe_id;
1397 		int probe_procnum;
1398 
1399 		if (nd->nd_flag & ND_NFSV4) {
1400 			probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
1401 			probe_procnum = nd->nd_procnum;
1402 		} else if (nd->nd_flag & ND_NFSV3) {
1403 			probe_id = nfscl_nfs3_done_probes[procnum];
1404 			probe_procnum = procnum;
1405 		} else {
1406 			probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
1407 			probe_procnum = procnum;
1408 		}
1409 		if (probe_id != 0)
1410 			(dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
1411 			    nd->nd_mreq, cred, probe_procnum, 0);
1412 	}
1413 #endif
1414 
1415 	m_freem(nd->nd_mreq);
1416 	if (usegssname == 0)
1417 		AUTH_DESTROY(auth);
1418 	if (rep != NULL)
1419 		free(rep, M_NFSDREQ);
1420 	if (set_sigset)
1421 		newnfs_restore_sigmask(td, &oldset);
1422 	return (0);
1423 nfsmout:
1424 	m_freem(nd->nd_mrep);
1425 	m_freem(nd->nd_mreq);
1426 	if (usegssname == 0)
1427 		AUTH_DESTROY(auth);
1428 	if (rep != NULL)
1429 		free(rep, M_NFSDREQ);
1430 	if (set_sigset)
1431 		newnfs_restore_sigmask(td, &oldset);
1432 	return (error);
1433 }
1434 
1435 /*
1436  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1437  * wait for all requests to complete. This is used by forced unmounts
1438  * to terminate any outstanding RPCs.
1439  */
1440 int
1441 newnfs_nmcancelreqs(struct nfsmount *nmp)
1442 {
1443 	struct nfsclds *dsp;
1444 	struct __rpc_client *cl;
1445 	int i;
1446 
1447 	if (nmp->nm_sockreq.nr_client != NULL)
1448 		CLNT_CLOSE(nmp->nm_sockreq.nr_client);
1449 	for (i = 0; i < nmp->nm_aconnect; i++)
1450 		if (nmp->nm_aconn[i] != NULL)
1451 			CLNT_CLOSE(nmp->nm_aconn[i]);
1452 lookformore:
1453 	NFSLOCKMNT(nmp);
1454 	TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) {
1455 		NFSLOCKDS(dsp);
1456 		if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
1457 		    (dsp->nfsclds_flags & NFSCLDS_CLOSED) == 0 &&
1458 		    dsp->nfsclds_sockp != NULL &&
1459 		    dsp->nfsclds_sockp->nr_client != NULL) {
1460 			dsp->nfsclds_flags |= NFSCLDS_CLOSED;
1461 			cl = dsp->nfsclds_sockp->nr_client;
1462 			NFSUNLOCKDS(dsp);
1463 			NFSUNLOCKMNT(nmp);
1464 			CLNT_CLOSE(cl);
1465 			goto lookformore;
1466 		}
1467 		NFSUNLOCKDS(dsp);
1468 	}
1469 	NFSUNLOCKMNT(nmp);
1470 	return (0);
1471 }
1472 
1473 /*
1474  * Any signal that can interrupt an NFS operation in an intr mount
1475  * should be added to this set. SIGSTOP and SIGKILL cannot be masked.
1476  */
1477 int newnfs_sig_set[] = {
1478 	SIGINT,
1479 	SIGTERM,
1480 	SIGHUP,
1481 	SIGKILL,
1482 	SIGQUIT
1483 };
1484 
1485 /*
1486  * Check to see if one of the signals in our subset is pending on
1487  * the process (in an intr mount).
1488  */
1489 static int
1490 nfs_sig_pending(sigset_t set)
1491 {
1492 	int i;
1493 
1494 	for (i = 0 ; i < nitems(newnfs_sig_set); i++)
1495 		if (SIGISMEMBER(set, newnfs_sig_set[i]))
1496 			return (1);
1497 	return (0);
1498 }
1499 
1500 /*
1501  * The set/restore sigmask functions are used to (temporarily) overwrite
1502  * the thread td_sigmask during an RPC call (for example). These are also
1503  * used in other places in the NFS client that might tsleep().
1504  */
1505 void
1506 newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
1507 {
1508 	sigset_t newset;
1509 	int i;
1510 	struct proc *p;
1511 
1512 	SIGFILLSET(newset);
1513 	if (td == NULL)
1514 		td = curthread; /* XXX */
1515 	p = td->td_proc;
1516 	/* Remove the NFS set of signals from newset */
1517 	PROC_LOCK(p);
1518 	mtx_lock(&p->p_sigacts->ps_mtx);
1519 	for (i = 0 ; i < nitems(newnfs_sig_set); i++) {
1520 		/*
1521 		 * But make sure we leave the ones already masked
1522 		 * by the process, i.e., remove the signal from the
1523 		 * temporary signalmask only if it wasn't already
1524 		 * in p_sigmask.
1525 		 */
1526 		if (!SIGISMEMBER(td->td_sigmask, newnfs_sig_set[i]) &&
1527 		    !SIGISMEMBER(p->p_sigacts->ps_sigignore, newnfs_sig_set[i]))
1528 			SIGDELSET(newset, newnfs_sig_set[i]);
1529 	}
1530 	mtx_unlock(&p->p_sigacts->ps_mtx);
1531 	kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
1532 	    SIGPROCMASK_PROC_LOCKED);
1533 	PROC_UNLOCK(p);
1534 }
1535 
1536 void
1537 newnfs_restore_sigmask(struct thread *td, sigset_t *set)
1538 {
1539 	if (td == NULL)
1540 		td = curthread; /* XXX */
1541 	kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
1542 }
1543 
1544 /*
1545  * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
1546  * old one after msleep() returns.
1547  */
1548 int
1549 newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
1550 {
1551 	sigset_t oldset;
1552 	int error;
1553 
1554 	if ((priority & PCATCH) == 0)
1555 		return msleep(ident, mtx, priority, wmesg, timo);
1556 	if (td == NULL)
1557 		td = curthread; /* XXX */
1558 	newnfs_set_sigmask(td, &oldset);
1559 	error = msleep(ident, mtx, priority, wmesg, timo);
1560 	newnfs_restore_sigmask(td, &oldset);
1561 	return (error);
1562 }
1563 
1564 /*
1565  * Test for a termination condition pending on the process.
1566  * This is used for NFSMNT_INT mounts.
1567  */
1568 int
1569 newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
1570 {
1571 	struct proc *p;
1572 	sigset_t tmpset;
1573 
1574 	/* Terminate all requests while attempting a forced unmount. */
1575 	if (NFSCL_FORCEDISM(nmp->nm_mountp))
1576 		return (EIO);
1577 	if (!(nmp->nm_flag & NFSMNT_INT))
1578 		return (0);
1579 	if (td == NULL)
1580 		return (0);
1581 	p = td->td_proc;
1582 	PROC_LOCK(p);
1583 	tmpset = p->p_siglist;
1584 	SIGSETOR(tmpset, td->td_siglist);
1585 	SIGSETNAND(tmpset, td->td_sigmask);
1586 	mtx_lock(&p->p_sigacts->ps_mtx);
1587 	SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
1588 	mtx_unlock(&p->p_sigacts->ps_mtx);
1589 	if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
1590 	    && nfs_sig_pending(tmpset)) {
1591 		PROC_UNLOCK(p);
1592 		return (EINTR);
1593 	}
1594 	PROC_UNLOCK(p);
1595 	return (0);
1596 }
1597 
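/*
 * Log a message about the state of an NFS server via tprintf(9),
 * appending the errno value when one is supplied.
 */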
1598 static int
1599 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
1600 {
1601 	struct proc *p;
1602 
1603 	p = td ? td->td_proc : NULL;
1604 	if (error) {
1605 		tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n",
1606 		    server, msg, error);
1607 	} else {
1608 		tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
1609 	}
1610 	return (0);
1611 }
1612 
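/*
 * Mark the mount as not responding: set NFSSTA_TIMEO and/or
 * NFSSTA_LOCKTIMEO, post the matching VQ_NOTRESP/VQ_NOTRESPLOCK
 * vfs event and log the message via nfs_msg().
 */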
1613 static void
1614 nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
1615     int error, int flags)
1616 {
1617 	if (nmp == NULL)
1618 		return;
1619 	mtx_lock(&nmp->nm_mtx);
1620 	if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
1621 		nmp->nm_state |= NFSSTA_TIMEO;
1622 		mtx_unlock(&nmp->nm_mtx);
1623 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1624 		    VQ_NOTRESP, 0);
1625 	} else
1626 		mtx_unlock(&nmp->nm_mtx);
1627 	mtx_lock(&nmp->nm_mtx);
1628 	if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1629 		nmp->nm_state |= NFSSTA_LOCKTIMEO;
1630 		mtx_unlock(&nmp->nm_mtx);
1631 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1632 		    VQ_NOTRESPLOCK, 0);
1633 	} else
1634 		mtx_unlock(&nmp->nm_mtx);
1635 	nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
1636 }
1637 
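/*
 * Undo nfs_down(): optionally log the supplied message, then clear the
 * NFSSTA_TIMEO and/or NFSSTA_LOCKTIMEO state and post the matching
 * VQ_NOTRESP/VQ_NOTRESPLOCK vfs event.
 */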
1638 static void
1639 nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
1640     int flags, int tprintfmsg)
1641 {
1642 	if (nmp == NULL)
1643 		return;
1644 	if (tprintfmsg) {
1645 		nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
1646 	}
1647 
1648 	mtx_lock(&nmp->nm_mtx);
1649 	if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
1650 		nmp->nm_state &= ~NFSSTA_TIMEO;
1651 		mtx_unlock(&nmp->nm_mtx);
1652 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1653 		    VQ_NOTRESP, 1);
1654 	} else
1655 		mtx_unlock(&nmp->nm_mtx);
1656 
1657 	mtx_lock(&nmp->nm_mtx);
1658 	if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1659 		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
1660 		mtx_unlock(&nmp->nm_mtx);
1661 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1662 		    VQ_NOTRESPLOCK, 1);
1663 	} else
1664 		mtx_unlock(&nmp->nm_mtx);
1665 }
1666