1 /* $OpenBSD: nfs_socket.c,v 1.150 2024/04/30 17:05:20 miod Exp $ */
2 /* $NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $ */
3
4 /*
5 * Copyright (c) 1989, 1991, 1993, 1995
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Rick Macklem at The University of Guelph.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
36 */
37
38 /*
39 * Socket operations for use by nfs
40 */
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/proc.h>
45 #include <sys/mount.h>
46 #include <sys/kernel.h>
47 #include <sys/mbuf.h>
48 #include <sys/vnode.h>
49 #include <sys/protosw.h>
50 #include <sys/signalvar.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/syslog.h>
54 #include <sys/tprintf.h>
55 #include <sys/namei.h>
56 #include <sys/pool.h>
57 #include <sys/queue.h>
58
59 #include <netinet/in.h>
60 #include <netinet/tcp.h>
61
62 #include <nfs/rpcv2.h>
63 #include <nfs/nfsproto.h>
64 #include <nfs/nfs.h>
65 #include <nfs/xdr_subs.h>
66 #include <nfs/nfsmount.h>
67 #include <nfs/nfs_var.h>
68 #include <nfs/nfsm_subs.h>
69
70 /* External data, mostly RPC constants in XDR form. */
71 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
72 rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
73 extern u_int32_t nfs_prog;
74 extern struct nfsstats nfsstats;
75 extern const int nfsv3_procid[NFS_NPROCS];
76 extern int nfs_ticks;
77
78 extern struct pool nfsrv_descript_pl;
79
80 /*
81 * There is a congestion window for outstanding rpcs maintained per mount
82 * point. The cwnd size is adjusted in roughly the way that:
83  *	Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
84 * SIGCOMM '88". ACM, August 1988.
85 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
86 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
87 * of rpcs is in progress.
88 * (The sent count and cwnd are scaled for integer arith.)
89 * Variants of "slow start" were tried and were found to be too much of a
90 * performance hit (ave. rtt 3 times larger),
91 * I suspect due to the large rtt that nfs rpcs have.
92 */
93 #define NFS_CWNDSCALE 256
94 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
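/*
 * With this scaling a single outstanding rpc costs NFS_CWNDSCALE units of
 * nm_sent, so NFS_MAXCWND corresponds to 32 rpcs in flight.  The additive
 * increase done in nfs_reply() adds roughly
 * NFS_CWNDSCALE * NFS_CWNDSCALE / nm_cwnd per reply, i.e. about one extra
 * rpc per full window of replies.
 */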
95 static const int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };
96
97 /* RTT estimator */
98 static const enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
99 NFS_DEFAULT_TIMER, /* NULL */
100 NFS_GETATTR_TIMER, /* GETATTR */
101 NFS_DEFAULT_TIMER, /* SETATTR */
102 NFS_LOOKUP_TIMER, /* LOOKUP */
103 NFS_GETATTR_TIMER, /* ACCESS */
104 NFS_READ_TIMER, /* READLINK */
105 NFS_READ_TIMER, /* READ */
106 NFS_WRITE_TIMER, /* WRITE */
107 NFS_DEFAULT_TIMER, /* CREATE */
108 NFS_DEFAULT_TIMER, /* MKDIR */
109 NFS_DEFAULT_TIMER, /* SYMLINK */
110 NFS_DEFAULT_TIMER, /* MKNOD */
111 NFS_DEFAULT_TIMER, /* REMOVE */
112 NFS_DEFAULT_TIMER, /* RMDIR */
113 NFS_DEFAULT_TIMER, /* RENAME */
114 NFS_DEFAULT_TIMER, /* LINK */
115 NFS_READ_TIMER, /* READDIR */
116 NFS_READ_TIMER, /* READDIRPLUS */
117 NFS_DEFAULT_TIMER, /* FSSTAT */
118 NFS_DEFAULT_TIMER, /* FSINFO */
119 NFS_DEFAULT_TIMER, /* PATHCONF */
120 NFS_DEFAULT_TIMER, /* COMMIT */
121 NFS_DEFAULT_TIMER, /* NOOP */
122 };
123
124 void nfs_init_rtt(struct nfsmount *);
125 void nfs_update_rtt(struct nfsreq *);
126 int nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);
127
128 void nfs_realign(struct mbuf **, int);
129 void nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);
130
131 int nfs_rcvlock(struct nfsreq *);
132 int nfs_receive(struct nfsreq *, struct mbuf **, struct mbuf **);
133 int nfs_reconnect(struct nfsreq *);
134 int nfs_reply(struct nfsreq *);
135 void nfs_msg(struct nfsreq *, char *);
136 void nfs_rcvunlock(int *);
137
138 int nfsrv_getstream(struct nfssvc_sock *, int);
139
140 unsigned int nfs_realign_test = 0;
141 unsigned int nfs_realign_count = 0;
142
143 /* Initialize the RTT estimator state for a new mount point. */
144 void
145 nfs_init_rtt(struct nfsmount *nmp)
146 {
147 int i;
148
149 for (i = 0; i < NFS_MAX_TIMER; i++)
150 nmp->nm_srtt[i] = NFS_INITRTT;
151 for (i = 0; i < NFS_MAX_TIMER; i++)
152 nmp->nm_sdrtt[i] = 0;
153 }
154
155 /*
156 * Update a mount point's RTT estimator state using data from the
157 * passed-in request.
158 *
159 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
160 *
161 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
162 * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
163 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
164 * update values.
165 */
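/*
 * In fixed point terms, nm_srtt holds the smoothed rtt scaled by 8 and
 * nm_sdrtt the smoothed mean deviation scaled by 4; the code below is
 * roughly
 *	srtt  += (r_rtt + 1) - srtt / 8
 *	sdrtt += |error|     - sdrtt / 4
 * which corresponds to the 0.125 and 0.25 gains given above.
 */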
166 void
167 nfs_update_rtt(struct nfsreq *rep)
168 {
169 int t1 = rep->r_rtt + 1;
170 int index = nfs_ptimers[rep->r_procnum] - 1;
171 int *srtt = &rep->r_nmp->nm_srtt[index];
172 int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
173
174 t1 -= *srtt >> 3;
175 *srtt += t1;
176 if (t1 < 0)
177 t1 = -t1;
178 t1 -= *sdrtt >> 2;
179 *sdrtt += t1;
180 }
181
182 /*
183 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
184 *
185 * Use the mean and mean deviation of RTT for the appropriate type
186 * of RPC for the frequent RPCs and a default for the others.
187 * The justification for doing "other" this way is that these RPCs
188 * happen so infrequently that timer est. would probably be stale.
189 * Also, since many of these RPCs are non-idempotent, a conservative
190 * timeout is desired.
191 *
192 * getattr, lookup - A+2D
193 * read, write - A+4D
194 * other - nm_timeo
195 */
196 int
197 nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
198 {
199 enum nfs_rto_timers timer = nfs_ptimers[procnum];
200 int index = timer - 1;
201 int rto;
202
203 switch (timer) {
204 case NFS_GETATTR_TIMER:
205 case NFS_LOOKUP_TIMER:
206 rto = ((nmp->nm_srtt[index] + 3) >> 2) +
207 ((nmp->nm_sdrtt[index] + 1) >> 1);
208 break;
209 case NFS_READ_TIMER:
210 case NFS_WRITE_TIMER:
211 rto = ((nmp->nm_srtt[index] + 7) >> 3) +
212 (nmp->nm_sdrtt[index] + 1);
213 break;
214 default:
215 rto = nmp->nm_timeo;
216 return (rto);
217 }
218
219 if (rto < NFS_MINRTO)
220 rto = NFS_MINRTO;
221 else if (rto > NFS_MAXRTO)
222 rto = NFS_MAXRTO;
223
224 return (rto);
225 }
226
227
228
229 /*
230 * Initialize sockets and congestion for a new NFS connection.
231 * We do not free the sockaddr if error.
232 */
233 int
234 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
235 {
236 struct socket *so;
237 int error, rcvreserve, sndreserve;
238 struct sockaddr *saddr;
239 struct sockaddr_in *sin;
240 struct mbuf *nam = NULL, *mopt = NULL;
241
242 if (!(nmp->nm_sotype == SOCK_DGRAM || nmp->nm_sotype == SOCK_STREAM))
243 return (EINVAL);
244
245 nmp->nm_so = NULL;
246 saddr = mtod(nmp->nm_nam, struct sockaddr *);
247 error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
248 nmp->nm_soproto);
249 if (error) {
250 nfs_disconnect(nmp);
251 return (error);
252 }
253
254 /* Allocate mbufs possibly waiting before grabbing the socket lock. */
255 if (nmp->nm_sotype == SOCK_STREAM || saddr->sa_family == AF_INET)
256 MGET(mopt, M_WAIT, MT_SOOPTS);
257 if (saddr->sa_family == AF_INET)
258 MGET(nam, M_WAIT, MT_SONAME);
259
260 so = nmp->nm_so;
261 nmp->nm_soflags = so->so_proto->pr_flags;
262
263 /*
264 * Some servers require that the client port be a reserved port number.
265 * We always allocate a reserved port, as this prevents filehandle
266 * disclosure through UDP port capture.
267 */
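	/*
	 * The sequence below switches the socket to the privileged port
	 * range, binds it to an anonymous local port, and then restores
	 * the default range so later allocations are unaffected.
	 */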
268 if (saddr->sa_family == AF_INET) {
269 int *ip;
270
271 mopt->m_len = sizeof(int);
272 ip = mtod(mopt, int *);
273 *ip = IP_PORTRANGE_LOW;
274 error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
275 if (error)
276 goto bad;
277
278 sin = mtod(nam, struct sockaddr_in *);
279 memset(sin, 0, sizeof(*sin));
280 sin->sin_len = nam->m_len = sizeof(struct sockaddr_in);
281 sin->sin_family = AF_INET;
282 sin->sin_addr.s_addr = INADDR_ANY;
283 sin->sin_port = htons(0);
284 solock(so);
285 error = sobind(so, nam, &proc0);
286 sounlock(so);
287 if (error)
288 goto bad;
289
290 mopt->m_len = sizeof(int);
291 ip = mtod(mopt, int *);
292 *ip = IP_PORTRANGE_DEFAULT;
293 error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
294 if (error)
295 goto bad;
296 }
297
298 /*
299 * Protocols that do not require connections may be optionally left
300 * unconnected for servers that reply from a port other than NFS_PORT.
301 */
302 if (nmp->nm_flag & NFSMNT_NOCONN) {
303 if (nmp->nm_soflags & PR_CONNREQUIRED) {
304 error = ENOTCONN;
305 goto bad;
306 }
307 } else {
308 solock(so);
309 error = soconnect(so, nmp->nm_nam);
310 if (error)
311 goto bad_locked;
312
313 /*
314 * Wait for the connection to complete. Cribbed from the
315 * connect system call but with the wait timing out so
316 * that interruptible mounts don't hang here for a long time.
317 */
318 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
319 sosleep_nsec(so, &so->so_timeo, PSOCK, "nfscon",
320 SEC_TO_NSEC(2));
321 if ((so->so_state & SS_ISCONNECTING) &&
322 so->so_error == 0 && rep &&
323 (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0){
324 so->so_state &= ~SS_ISCONNECTING;
325 goto bad_locked;
326 }
327 }
328 if (so->so_error) {
329 error = so->so_error;
330 so->so_error = 0;
331 goto bad_locked;
332 }
333 sounlock(so);
334 }
335 /*
336 * Always set receive timeout to detect server crash and reconnect.
337 * Otherwise, we can get stuck in soreceive forever.
338 */
339 mtx_enter(&so->so_rcv.sb_mtx);
340 so->so_rcv.sb_timeo_nsecs = SEC_TO_NSEC(5);
341 mtx_leave(&so->so_rcv.sb_mtx);
342 mtx_enter(&so->so_snd.sb_mtx);
343 if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
344 so->so_snd.sb_timeo_nsecs = SEC_TO_NSEC(5);
345 else
346 so->so_snd.sb_timeo_nsecs = INFSLP;
347 mtx_leave(&so->so_snd.sb_mtx);
348 if (nmp->nm_sotype == SOCK_DGRAM) {
349 sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
350 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
351 NFS_MAXPKTHDR) * 2;
352 } else if (nmp->nm_sotype == SOCK_STREAM) {
353 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
354 *mtod(mopt, int32_t *) = 1;
355 mopt->m_len = sizeof(int32_t);
356 sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, mopt);
357 }
358 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
359 *mtod(mopt, int32_t *) = 1;
360 mopt->m_len = sizeof(int32_t);
361 sosetopt(so, IPPROTO_TCP, TCP_NODELAY, mopt);
362 }
363 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
364 sizeof (u_int32_t)) * 2;
365 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
366 sizeof (u_int32_t)) * 2;
367 } else {
368 panic("%s: nm_sotype %d", __func__, nmp->nm_sotype);
369 }
370 solock(so);
371 error = soreserve(so, sndreserve, rcvreserve);
372 if (error)
373 goto bad_locked;
374 mtx_enter(&so->so_rcv.sb_mtx);
375 so->so_rcv.sb_flags |= SB_NOINTR;
376 mtx_leave(&so->so_rcv.sb_mtx);
377 so->so_snd.sb_flags |= SB_NOINTR;
378 sounlock(so);
379
380 m_freem(mopt);
381 m_freem(nam);
382
383 /* Initialize other non-zero congestion variables */
384 nfs_init_rtt(nmp);
385 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
386 nmp->nm_sent = 0;
387 nmp->nm_timeouts = 0;
388 return (0);
389
390 bad_locked:
391 sounlock(so);
392 bad:
393
394 m_freem(mopt);
395 m_freem(nam);
396
397 nfs_disconnect(nmp);
398 return (error);
399 }
400
401 /*
402 * Reconnect routine:
403 * Called when a connection is broken on a reliable protocol.
404 * - clean up the old socket
405 * - nfs_connect() again
406 * - set R_MUSTRESEND for all outstanding requests on mount point
407 * If this fails the mount point is DEAD!
408 * nb: Must be called with the nfs_sndlock() set on the mount point.
409 */
410 int
411 nfs_reconnect(struct nfsreq *rep)
412 {
413 struct nfsreq *rp;
414 struct nfsmount *nmp = rep->r_nmp;
415 int error;
416
417 nfs_disconnect(nmp);
418 while ((error = nfs_connect(nmp, rep)) != 0) {
419 if (error == EINTR || error == ERESTART)
420 return (EINTR);
421 tsleep_nsec(&nowake, PSOCK, "nfsrecon", SEC_TO_NSEC(1));
422 }
423
424 /*
425 * Loop through outstanding request list and fix up all requests
426 * on old socket.
427 */
428 TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
429 rp->r_flags |= R_MUSTRESEND;
430 rp->r_rexmit = 0;
431 }
432 return (0);
433 }
434
435 /*
436 * NFS disconnect. Clean up and unlink.
437 */
438 void
439 nfs_disconnect(struct nfsmount *nmp)
440 {
441 struct socket *so;
442
443 if (nmp->nm_so) {
444 so = nmp->nm_so;
445 nmp->nm_so = NULL;
446 soshutdown(so, SHUT_RDWR);
447 soclose(so, 0);
448 }
449 }
450
451 /*
452 * This is the nfs send routine. For connection based socket types, it
453 * must be called with an nfs_sndlock() on the socket.
454 * "rep == NULL" indicates that it has been called from a server.
455 * For the client side:
456 * - return EINTR if the RPC is terminated, 0 otherwise
457 * - set R_MUSTRESEND if the send fails for any reason
458 * - do any cleanup required by recoverable socket errors (???)
459 * For the server side:
460 * - return EINTR or ERESTART if interrupted by a signal
461 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
462 * - do any cleanup required by recoverable socket errors (???)
463 */
464 int
465 nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top,
466 struct nfsreq *rep)
467 {
468 struct mbuf *sendnam;
469 int error, soflags, flags;
470
471 if (rep) {
472 if (rep->r_flags & R_SOFTTERM) {
473 m_freem(top);
474 return (EINTR);
475 }
476 if ((so = rep->r_nmp->nm_so) == NULL) {
477 rep->r_flags |= R_MUSTRESEND;
478 m_freem(top);
479 return (0);
480 }
481 rep->r_flags &= ~R_MUSTRESEND;
482 soflags = rep->r_nmp->nm_soflags;
483 } else
484 soflags = so->so_proto->pr_flags;
485 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
486 sendnam = NULL;
487 else
488 sendnam = nam;
489 flags = 0;
490
491 error = sosend(so, sendnam, NULL, top, NULL, flags);
492 if (error) {
493 if (rep) {
494 /*
495 * Deal with errors for the client side.
496 */
497 if (rep->r_flags & R_SOFTTERM)
498 error = EINTR;
499 else
500 rep->r_flags |= R_MUSTRESEND;
501 }
502
503 /*
504 * Handle any recoverable (soft) socket errors here. (???)
505 */
506 if (error != EINTR && error != ERESTART &&
507 error != EWOULDBLOCK && error != EPIPE)
508 error = 0;
509 }
510 return (error);
511 }
512
513 #ifdef NFSCLIENT
514 /*
515 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
516 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
517 * Mark and consolidate the data into a new mbuf list.
518 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
519 * small mbufs.
520 * For SOCK_STREAM we must be very careful to read an entire record once
521 * we have read any of it, even if the system call has been interrupted.
522 */
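/*
 * The Record Mark handled below is a 32-bit big-endian word in which the
 * high bit flags the last fragment of a record and the low 31 bits give
 * the fragment length; hence the "ntohl(len) & ~0x80000000" when it is
 * read back.
 */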
523 int
524 nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
525 {
526 struct socket *so;
527 struct uio auio;
528 struct iovec aio;
529 struct mbuf *m;
530 struct mbuf *control;
531 u_int32_t len;
532 struct mbuf **getnam;
533 int error, sotype, rcvflg;
534 struct proc *p = curproc; /* XXX */
535
536 /*
537 * Set up arguments for soreceive()
538 */
539 *mp = NULL;
540 *aname = NULL;
541 sotype = rep->r_nmp->nm_sotype;
542
543 /*
544 * For reliable protocols, lock against other senders/receivers
545 * in case a reconnect is necessary.
546 * For SOCK_STREAM, first get the Record Mark to find out how much
547 * more there is to get.
548 * We must lock the socket against other receivers
549 * until we have an entire rpc request/reply.
550 */
551 if (sotype != SOCK_DGRAM) {
552 error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
553 if (error)
554 return (error);
555 tryagain:
556 /*
557 * Check for fatal errors and resending request.
558 */
559 /*
560 * Ugh: If a reconnect attempt just happened, nm_so
561 * would have changed. NULL indicates a failed
562 * attempt that has essentially shut down this
563 * mount point.
564 */
565 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
566 nfs_sndunlock(&rep->r_nmp->nm_flag);
567 return (EINTR);
568 }
569 so = rep->r_nmp->nm_so;
570 if (!so) {
571 error = nfs_reconnect(rep);
572 if (error) {
573 nfs_sndunlock(&rep->r_nmp->nm_flag);
574 return (error);
575 }
576 goto tryagain;
577 }
578 while (rep->r_flags & R_MUSTRESEND) {
579 m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
580 nfsstats.rpcretries++;
581 rep->r_rtt = 0;
582 rep->r_flags &= ~R_TIMING;
583 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
584 if (error) {
585 if (error == EINTR || error == ERESTART ||
586 (error = nfs_reconnect(rep)) != 0) {
587 nfs_sndunlock(&rep->r_nmp->nm_flag);
588 return (error);
589 }
590 goto tryagain;
591 }
592 }
593 nfs_sndunlock(&rep->r_nmp->nm_flag);
594 if (sotype == SOCK_STREAM) {
595 aio.iov_base = (caddr_t) &len;
596 aio.iov_len = sizeof(u_int32_t);
597 auio.uio_iov = &aio;
598 auio.uio_iovcnt = 1;
599 auio.uio_segflg = UIO_SYSSPACE;
600 auio.uio_rw = UIO_READ;
601 auio.uio_offset = 0;
602 auio.uio_resid = sizeof(u_int32_t);
603 auio.uio_procp = p;
604 do {
605 rcvflg = MSG_WAITALL;
606 error = soreceive(so, NULL, &auio, NULL, NULL,
607 &rcvflg, 0);
608 if (error == EWOULDBLOCK && rep) {
609 if (rep->r_flags & R_SOFTTERM)
610 return (EINTR);
611 /*
612 * looks like the server died after it
613 * received the request, make sure
614 * that we will retransmit and we
615 * don't get stuck here forever.
616 */
617 if (rep->r_rexmit >=
618 rep->r_nmp->nm_retry) {
619 nfsstats.rpctimeouts++;
620 error = EPIPE;
621 }
622 }
623 } while (error == EWOULDBLOCK);
624 if (!error && auio.uio_resid > 0) {
625 log(LOG_INFO,
626 "short receive (%zu/%zu) from nfs server %s\n",
627 sizeof(u_int32_t) - auio.uio_resid,
628 sizeof(u_int32_t),
629 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
630 error = EPIPE;
631 }
632 if (error)
633 goto errout;
634
635 len = ntohl(len) & ~0x80000000;
636 /*
637 * This is SERIOUS! We are out of sync with the sender
638 * and forcing a disconnect/reconnect is all I can do.
639 */
640 if (len > NFS_MAXPACKET) {
641 log(LOG_ERR, "%s (%u) from nfs server %s\n",
642 "impossible packet length",
643 len,
644 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
645 error = EFBIG;
646 goto errout;
647 }
648 auio.uio_resid = len;
649 do {
650 rcvflg = MSG_WAITALL;
651 error = soreceive(so, NULL, &auio, mp, NULL,
652 &rcvflg, 0);
653 } while (error == EWOULDBLOCK || error == EINTR ||
654 error == ERESTART);
655 if (!error && auio.uio_resid > 0) {
656 log(LOG_INFO, "short receive (%zu/%u) from "
657 "nfs server %s\n", len - auio.uio_resid,
658 len, rep->r_nmp->nm_mountp->
659 mnt_stat.f_mntfromname);
660 error = EPIPE;
661 }
662 } else {
663 /*
664 * NB: Since uio_resid is big, MSG_WAITALL is ignored
665 * and soreceive() will return when it has either a
666 * control msg or a data msg.
667 * We have no use for control msg., but must grab them
668 * and then throw them away so we know what is going
669 * on.
670 */
671 auio.uio_resid = len = 100000000; /* Anything Big */
672 auio.uio_procp = p;
673 do {
674 rcvflg = 0;
675 error = soreceive(so, NULL, &auio, mp, &control,
676 &rcvflg, 0);
677 m_freem(control);
678 if (error == EWOULDBLOCK && rep) {
679 if (rep->r_flags & R_SOFTTERM)
680 return (EINTR);
681 }
682 } while (error == EWOULDBLOCK ||
683 (!error && *mp == NULL && control));
684 if ((rcvflg & MSG_EOR) == 0)
685 printf("Egad!!\n");
686 if (!error && *mp == NULL)
687 error = EPIPE;
688 len -= auio.uio_resid;
689 }
690 errout:
691 if (error && error != EINTR && error != ERESTART) {
692 m_freemp(mp);
693 if (error != EPIPE)
694 log(LOG_INFO,
695 "receive error %d from nfs server %s\n",
696 error,
697 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
698 error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
699 if (!error) {
700 error = nfs_reconnect(rep);
701 if (!error)
702 goto tryagain;
703 nfs_sndunlock(&rep->r_nmp->nm_flag);
704 }
705 }
706 } else {
707 if ((so = rep->r_nmp->nm_so) == NULL)
708 return (EACCES);
709 if (so->so_state & SS_ISCONNECTED)
710 getnam = NULL;
711 else
712 getnam = aname;
713 auio.uio_resid = len = 1000000;
714 auio.uio_procp = p;
715 do {
716 rcvflg = 0;
717 error = soreceive(so, getnam, &auio, mp, NULL,
718 &rcvflg, 0);
719 if (error == EWOULDBLOCK &&
720 (rep->r_flags & R_SOFTTERM))
721 return (EINTR);
722 } while (error == EWOULDBLOCK);
723 len -= auio.uio_resid;
724 }
725 if (error)
726 m_freemp(mp);
727 /*
728 * Search for any mbufs that are not a multiple of 4 bytes long
729 * or with m_data not longword aligned.
730 * These could cause pointer alignment problems, so copy them to
731 * well aligned mbufs.
732 */
733 nfs_realign(mp, 5 * NFSX_UNSIGNED);
734 return (error);
735 }
736
737 /*
738 * Implement receipt of reply on a socket.
739 * We must search through the list of received datagrams matching them
740 * with outstanding requests using the xid, until ours is found.
741 */
742 int
743 nfs_reply(struct nfsreq *myrep)
744 {
745 struct nfsreq *rep;
746 struct nfsmount *nmp = myrep->r_nmp;
747 struct nfsm_info info;
748 struct mbuf *nam;
749 u_int32_t rxid, *tl;
750 int error;
751
752 /*
753 * Loop around until we get our own reply
754 */
755 for (;;) {
756 /*
757 * Lock against other receivers so that I don't get stuck in
758 * sbwait() after someone else has received my reply for me.
759 * Also necessary for connection based protocols to avoid
760 * race conditions during a reconnect.
761 */
762 error = nfs_rcvlock(myrep);
763 if (error)
764 return (error == EALREADY ? 0 : error);
765
766 /*
767 * Get the next Rpc reply off the socket
768 */
769 error = nfs_receive(myrep, &nam, &info.nmi_mrep);
770 nfs_rcvunlock(&nmp->nm_flag);
771 if (error) {
772
773 /*
774 * Ignore routing errors on connectionless protocols??
775 */
776 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
777 if (nmp->nm_so)
778 nmp->nm_so->so_error = 0;
779 continue;
780 }
781 return (error);
782 }
783 m_freem(nam);
784
785 /*
786 * Get the xid and check that it is an rpc reply
787 */
788 info.nmi_md = info.nmi_mrep;
789 info.nmi_dpos = mtod(info.nmi_md, caddr_t);
790 info.nmi_errorp = &error;
791 tl = (uint32_t *)nfsm_dissect(&info, 2 * NFSX_UNSIGNED);
792 if (tl == NULL)
793 goto nfsmout;
794 rxid = *tl++;
795 if (*tl != rpc_reply) {
796 nfsstats.rpcinvalid++;
797 m_freem(info.nmi_mrep);
798 nfsmout:
799 continue;
800 }
801
802 /*
803 * Loop through the request list to match up the reply
804 * Iff no match, just drop the datagram
805 */
806 TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
807 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
808 /* Found it.. */
809 rep->r_mrep = info.nmi_mrep;
810 rep->r_md = info.nmi_md;
811 rep->r_dpos = info.nmi_dpos;
812
813 /*
814 * Update congestion window.
815 * Do the additive increase of
816 * one rpc/rtt.
817 */
818 if (nmp->nm_cwnd <= nmp->nm_sent) {
819 nmp->nm_cwnd +=
820 (NFS_CWNDSCALE * NFS_CWNDSCALE +
821 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
822 if (nmp->nm_cwnd > NFS_MAXCWND)
823 nmp->nm_cwnd = NFS_MAXCWND;
824 }
825 rep->r_flags &= ~R_SENT;
826 nmp->nm_sent -= NFS_CWNDSCALE;
827
828 if (rep->r_flags & R_TIMING)
829 nfs_update_rtt(rep);
830
831 nmp->nm_timeouts = 0;
832 break;
833 }
834 }
835 /*
836 * If not matched to a request, drop it.
837 * If it's mine, get out.
838 */
839 		if (rep == NULL) {
840 nfsstats.rpcunexpected++;
841 m_freem(info.nmi_mrep);
842 } else if (rep == myrep) {
843 if (rep->r_mrep == NULL)
844 panic("nfsreply nil");
845 return (0);
846 }
847 }
848 }
849
850 /*
851 * nfs_request - goes something like this
852 * - fill in request struct
853 * - links it into list
854 * - calls nfs_send() for first transmit
855 * - calls nfs_receive() to get reply
856 * - break down rpc header and return with nfs reply pointed to
857 * by mrep or error
858 * nb: always frees up mreq mbuf list
859 */
860 int
861 nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
862 {
863 struct mbuf *m;
864 u_int32_t *tl;
865 struct nfsmount *nmp;
866 int i, error = 0;
867 int trylater_delay;
868 struct nfsreq *rep;
869 struct nfsm_info info;
870
871 rep = pool_get(&nfsreqpl, PR_WAITOK);
872 rep->r_nmp = VFSTONFS(vp->v_mount);
873 rep->r_vp = vp;
874 rep->r_procp = infop->nmi_procp;
875 rep->r_procnum = procnum;
876
877 /* empty mbuf for AUTH_UNIX header */
878 rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
879 rep->r_mreq->m_next = infop->nmi_mreq;
880 rep->r_mreq->m_len = 0;
881 m_calchdrlen(rep->r_mreq);
882
883 trylater_delay = NFS_MINTIMEO;
884
885 nmp = rep->r_nmp;
886
887 /* Get the RPC header with authorization. */
888 nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
889 m = rep->r_mreq;
890
891 /*
892 * For stream protocols, insert a Sun RPC Record Mark.
893 */
894 if (nmp->nm_sotype == SOCK_STREAM) {
895 M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
896 *mtod(m, u_int32_t *) = htonl(0x80000000 |
897 (m->m_pkthdr.len - NFSX_UNSIGNED));
898 }
899
900 tryagain:
901 rep->r_rtt = rep->r_rexmit = 0;
902 if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
903 rep->r_flags = R_TIMING;
904 else
905 rep->r_flags = 0;
906 rep->r_mrep = NULL;
907
908 /*
909 * Do the client side RPC.
910 */
911 nfsstats.rpcrequests++;
912 /*
913 * Chain request into list of outstanding requests. Be sure
914 * to put it LAST so timer finds oldest requests first.
915 */
916 if (TAILQ_EMPTY(&nmp->nm_reqsq))
917 timeout_add(&nmp->nm_rtimeout, nfs_ticks);
918 TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);
919
920 /*
921 * If backing off another request or avoiding congestion, don't
922 * send this one now but let timer do it. If not timing a request,
923 * do it now.
924 */
925 if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
926 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
927 nmp->nm_sent < nmp->nm_cwnd)) {
928 if (nmp->nm_soflags & PR_CONNREQUIRED)
929 error = nfs_sndlock(&nmp->nm_flag, rep);
930 if (!error) {
931 error = nfs_send(nmp->nm_so, nmp->nm_nam,
932 m_copym(m, 0, M_COPYALL, M_WAIT), rep);
933 if (nmp->nm_soflags & PR_CONNREQUIRED)
934 nfs_sndunlock(&nmp->nm_flag);
935 }
936 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
937 nmp->nm_sent += NFS_CWNDSCALE;
938 rep->r_flags |= R_SENT;
939 }
940 } else {
941 rep->r_rtt = -1;
942 }
943
944 /*
945 * Wait for the reply from our send or the timer's.
946 */
947 if (!error || error == EPIPE)
948 error = nfs_reply(rep);
949
950 /*
951 * RPC done, unlink the request.
952 */
953 TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
954 if (TAILQ_EMPTY(&nmp->nm_reqsq))
955 timeout_del(&nmp->nm_rtimeout);
956
957 /*
958 * Decrement the outstanding request count.
959 */
960 if (rep->r_flags & R_SENT) {
961 rep->r_flags &= ~R_SENT; /* paranoia */
962 nmp->nm_sent -= NFS_CWNDSCALE;
963 }
964
965 /*
966 	 * If there was a successful reply and a tprintf message was
967 	 * previously printed, tprintf that the server is alive again.
968 */
969 if (!error && (rep->r_flags & R_TPRINTFMSG))
970 nfs_msg(rep, "is alive again");
971 info.nmi_mrep = rep->r_mrep;
972 info.nmi_md = rep->r_md;
973 info.nmi_dpos = rep->r_dpos;
974 info.nmi_errorp = &error;
975 if (error) {
976 infop->nmi_mrep = NULL;
977 goto nfsmout1;
978 }
979
980 /*
981 * break down the rpc header and check if ok
982 */
983 tl = (uint32_t *)nfsm_dissect(&info, 3 * NFSX_UNSIGNED);
984 if (tl == NULL)
985 goto nfsmout;
986 if (*tl++ == rpc_msgdenied) {
987 if (*tl == rpc_mismatch)
988 error = EOPNOTSUPP;
989 else
990 error = EACCES; /* Should be EAUTH. */
991 infop->nmi_mrep = NULL;
992 goto nfsmout1;
993 }
994
995 /*
996 * Since we only support RPCAUTH_UNIX atm we step over the
997 	 * reply verifier type, and in the (error) case that there really
998 * is any data in it, we advance over it.
999 */
1000 	tl++;			/* Step over verifier type */
1001 i = fxdr_unsigned(int32_t, *tl);
1002 if (i > 0) {
1003 /* Should not happen */
1004 if (nfsm_adv(&info, nfsm_rndup(i)) != 0)
1005 goto nfsmout;
1006 }
1007
1008 tl = (uint32_t *)nfsm_dissect(&info, NFSX_UNSIGNED);
1009 if (tl == NULL)
1010 goto nfsmout;
1011 /* 0 == ok */
1012 if (*tl == 0) {
1013 tl = (uint32_t *)nfsm_dissect(&info, NFSX_UNSIGNED);
1014 if (tl == NULL)
1015 goto nfsmout;
1016 if (*tl != 0) {
1017 error = fxdr_unsigned(int, *tl);
1018 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1019 error == NFSERR_TRYLATER) {
1020 m_freem(info.nmi_mrep);
1021 error = 0;
1022 tsleep_nsec(&nowake, PSOCK, "nfsretry",
1023 SEC_TO_NSEC(trylater_delay));
1024 trylater_delay *= NFS_TIMEOUTMUL;
1025 if (trylater_delay > NFS_MAXTIMEO)
1026 trylater_delay = NFS_MAXTIMEO;
1027
1028 goto tryagain;
1029 }
1030
1031 /*
1032 * If the File Handle was stale, invalidate the
1033 * lookup cache, just in case.
1034 */
1035 if (error == ESTALE)
1036 cache_purge(rep->r_vp);
1037 }
1038 goto nfsmout;
1039 }
1040
1041 error = EPROTONOSUPPORT;
1042
1043 nfsmout:
1044 infop->nmi_mrep = info.nmi_mrep;
1045 infop->nmi_md = info.nmi_md;
1046 infop->nmi_dpos = info.nmi_dpos;
1047 nfsmout1:
1048 m_freem(rep->r_mreq);
1049 pool_put(&nfsreqpl, rep);
1050 return (error);
1051 }
1052 #endif /* NFSCLIENT */
1053
1054 /*
1055 * Generate the rpc reply header
1056 * siz arg. is used to decide if adding a cluster is worthwhile
1057 */
1058 int
1059 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
1060 int err, struct mbuf **mrq, struct mbuf **mbp)
1061 {
1062 u_int32_t *tl;
1063 struct mbuf *mreq;
1064 struct mbuf *mb;
1065
1066 MGETHDR(mreq, M_WAIT, MT_DATA);
1067 mb = mreq;
1068 /*
1069 	 * If this is a big reply, use a cluster; otherwise try to leave
1070 	 * leading space for the lower level headers.
1071 */
1072 siz += RPC_REPLYSIZ;
1073 if (siz >= MHLEN - max_hdr) {
1074 MCLGET(mreq, M_WAIT);
1075 } else
1076 mreq->m_data += max_hdr;
1077 tl = mtod(mreq, u_int32_t *);
1078 mreq->m_len = 6 * NFSX_UNSIGNED;
1079 *tl++ = txdr_unsigned(nd->nd_retxid);
1080 *tl++ = rpc_reply;
1081 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1082 *tl++ = rpc_msgdenied;
1083 if (err & NFSERR_AUTHERR) {
1084 *tl++ = rpc_autherr;
1085 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1086 mreq->m_len -= NFSX_UNSIGNED;
1087 } else {
1088 *tl++ = rpc_mismatch;
1089 *tl++ = txdr_unsigned(RPC_VER2);
1090 *tl = txdr_unsigned(RPC_VER2);
1091 }
1092 } else {
1093 *tl++ = rpc_msgaccepted;
1094
1095 /* AUTH_UNIX requires RPCAUTH_NULL. */
1096 *tl++ = 0;
1097 *tl++ = 0;
1098
1099 switch (err) {
1100 case EPROGUNAVAIL:
1101 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1102 break;
1103 case EPROGMISMATCH:
1104 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1105 tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
1106 *tl++ = txdr_unsigned(NFS_VER2);
1107 *tl = txdr_unsigned(NFS_VER3);
1108 break;
1109 case EPROCUNAVAIL:
1110 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1111 break;
1112 case EBADRPC:
1113 *tl = txdr_unsigned(RPC_GARBAGE);
1114 break;
1115 default:
1116 *tl = 0;
1117 if (err != NFSERR_RETVOID) {
1118 tl = nfsm_build(&mb, NFSX_UNSIGNED);
1119 if (err)
1120 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1121 else
1122 *tl = 0;
1123 }
1124 break;
1125 	}
1126 }
1127
1128 *mrq = mreq;
1129 if (mbp != NULL)
1130 *mbp = mb;
1131 if (err != 0 && err != NFSERR_RETVOID)
1132 nfsstats.srvrpc_errs++;
1133 return (0);
1134 }
1135
1136 /*
1137 * nfs timer routine
1138 * Scan the nfsreq list and retransmit any requests that have timed out.
1139 */
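/*
 * Note that only datagram sockets are actually retransmitted from here;
 * for stream sockets the request is merely marked for a longer backoff and
 * TCP is left to do the retransmission.  Successive timeouts back off
 * through nfs_backoff[], and once r_rexmit reaches nm_retry the request is
 * soft-terminated.
 */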
1140 void
1141 nfs_timer(void *arg)
1142 {
1143 struct nfsmount *nmp = arg;
1144 struct nfsreq *rep;
1145 struct mbuf *m;
1146 struct socket *so;
1147 int timeo, error;
1148
1149 NET_LOCK();
1150 TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
1151 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1152 continue;
1153 if (nfs_sigintr(nmp, rep, rep->r_procp)) {
1154 rep->r_flags |= R_SOFTTERM;
1155 continue;
1156 }
1157 if (rep->r_rtt >= 0) {
1158 rep->r_rtt++;
1159 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1160 timeo = nmp->nm_timeo;
1161 else
1162 timeo = nfs_estimate_rto(nmp, rep->r_procnum);
1163 if (nmp->nm_timeouts > 0)
1164 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1165 if (rep->r_rtt <= timeo)
1166 continue;
1167 if (nmp->nm_timeouts < nitems(nfs_backoff))
1168 nmp->nm_timeouts++;
1169 }
1170
1171 /* Check for server not responding. */
1172 if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
1173 nfs_msg(rep, "not responding");
1174 rep->r_flags |= R_TPRINTFMSG;
1175 }
1176 if (rep->r_rexmit >= nmp->nm_retry) { /* too many */
1177 nfsstats.rpctimeouts++;
1178 rep->r_flags |= R_SOFTTERM;
1179 continue;
1180 }
1181 if (nmp->nm_sotype != SOCK_DGRAM) {
1182 if (++rep->r_rexmit > NFS_MAXREXMIT)
1183 rep->r_rexmit = NFS_MAXREXMIT;
1184 continue;
1185 }
1186
1187 if ((so = nmp->nm_so) == NULL)
1188 continue;
1189
1190 /*
1191 		 * If there is enough space and the window allows, resend it.
1192 		 * Set r_rtt to -1 in case we fail to send it now.
1194 */
1195 rep->r_rtt = -1;
1196 if (sbspace(so, &so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
1197 ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1198 (rep->r_flags & R_SENT) ||
1199 nmp->nm_sent < nmp->nm_cwnd) &&
1200 (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
1201 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1202 error = pru_send(so, m, NULL, NULL);
1203 else
1204 error = pru_send(so, m, nmp->nm_nam, NULL);
1205 if (error) {
1206 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1207 so->so_error = 0;
1208 } else {
1209 /*
1210 				 * If this is the first send, start timing;
1211 				 * otherwise turn timing off, back off the timer
1212 				 * and divide the congestion window by 2.
1213 */
1214 if (rep->r_flags & R_SENT) {
1215 rep->r_flags &= ~R_TIMING;
1216 if (++rep->r_rexmit > NFS_MAXREXMIT)
1217 rep->r_rexmit = NFS_MAXREXMIT;
1218 nmp->nm_cwnd >>= 1;
1219 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1220 nmp->nm_cwnd = NFS_CWNDSCALE;
1221 nfsstats.rpcretries++;
1222 } else {
1223 rep->r_flags |= R_SENT;
1224 nmp->nm_sent += NFS_CWNDSCALE;
1225 }
1226 rep->r_rtt = 0;
1227 }
1228 }
1229 }
1230 NET_UNLOCK();
1231 timeout_add(&nmp->nm_rtimeout, nfs_ticks);
1232 }
1233
1234 /*
1235 * Test for a termination condition pending on the process.
1236 * This is used for NFSMNT_INT mounts.
1237 */
1238 int
1239 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
1240 {
1241
1242 if (rep && (rep->r_flags & R_SOFTTERM))
1243 return (EINTR);
1244 if (!(nmp->nm_flag & NFSMNT_INT))
1245 return (0);
1246 if (p && (SIGPENDING(p) & ~p->p_p->ps_sigacts->ps_sigignore &
1247 NFSINT_SIGMASK))
1248 return (EINTR);
1249 return (0);
1250 }
1251
1252 /*
1253 * Lock a socket against others.
1254 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1255 * and also to avoid race conditions between the processes with nfs requests
1256 * in progress when a reconnect is necessary.
1257 */
1258 int
1259 nfs_sndlock(int *flagp, struct nfsreq *rep)
1260 {
1261 uint64_t slptimeo = INFSLP;
1262 struct proc *p;
1263 int slpflag = 0;
1264
1265 if (rep) {
1266 p = rep->r_procp;
1267 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1268 slpflag = PCATCH;
1269 } else
1270 p = NULL;
1271 while (*flagp & NFSMNT_SNDLOCK) {
1272 if (rep && nfs_sigintr(rep->r_nmp, rep, p))
1273 return (EINTR);
1274 *flagp |= NFSMNT_WANTSND;
1275 tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
1276 if (slpflag == PCATCH) {
1277 slpflag = 0;
1278 slptimeo = SEC_TO_NSEC(2);
1279 }
1280 }
1281 *flagp |= NFSMNT_SNDLOCK;
1282 return (0);
1283 }
1284
1285 /*
1286 * Unlock the stream socket for others.
1287 */
1288 void
1289 nfs_sndunlock(int *flagp)
1290 {
1291
1292 if ((*flagp & NFSMNT_SNDLOCK) == 0)
1293 panic("nfs sndunlock");
1294 *flagp &= ~NFSMNT_SNDLOCK;
1295 if (*flagp & NFSMNT_WANTSND) {
1296 *flagp &= ~NFSMNT_WANTSND;
1297 wakeup((caddr_t)flagp);
1298 }
1299 }
1300
1301 int
1302 nfs_rcvlock(struct nfsreq *rep)
1303 {
1304 uint64_t slptimeo = INFSLP;
1305 int *flagp = &rep->r_nmp->nm_flag;
1306 int slpflag;
1307
1308 if (*flagp & NFSMNT_INT)
1309 slpflag = PCATCH;
1310 else
1311 slpflag = 0;
1312
1313 while (*flagp & NFSMNT_RCVLOCK) {
1314 if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
1315 return (EINTR);
1316 *flagp |= NFSMNT_WANTRCV;
1317 tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
1318 if (rep->r_mrep != NULL) {
1319 /*
1320 * Don't take the lock if our reply has been received
1321 			 * while we were sleeping.
1322 */
1323 return (EALREADY);
1324 }
1325 if (slpflag == PCATCH) {
1326 slpflag = 0;
1327 slptimeo = SEC_TO_NSEC(2);
1328 }
1329 }
1330 *flagp |= NFSMNT_RCVLOCK;
1331 return (0);
1332 }
1333
1334 /*
1335 * Unlock the stream socket for others.
1336 */
1337 void
1338 nfs_rcvunlock(int *flagp)
1339 {
1340
1341 if ((*flagp & NFSMNT_RCVLOCK) == 0)
1342 panic("nfs rcvunlock");
1343 *flagp &= ~NFSMNT_RCVLOCK;
1344 if (*flagp & NFSMNT_WANTRCV) {
1345 *flagp &= ~NFSMNT_WANTRCV;
1346 wakeup(flagp);
1347 }
1348 }
1349
1350 /*
1351 * Auxiliary routine to align the length of mbuf copies made with m_copyback().
1352 */
1353 void
1354 nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
1355 {
1356 size_t padding;
1357
1358 /*
1359 * The maximum number of bytes that m_copyback() places in a mbuf is
1360 * always an aligned quantity, so realign happens at the chain's tail.
1361 */
1362 while (n->m_next != NULL)
1363 n = n->m_next;
1364
1365 /*
1366 * Pad from the next elements in the source chain. Loop until the
1367 * destination chain is aligned, or the end of the source is reached.
1368 */
1369 do {
1370 m = m->m_next;
1371 if (m == NULL)
1372 return;
1373
1374 padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
1375 if (padding > m_trailingspace(n))
1376 panic("nfs_realign_fixup: no memory to pad to");
1377
1378 bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);
1379
1380 n->m_len += padding;
1381 m_adj(m, padding);
1382 *off += padding;
1383
1384 } while (!ALIGNED_POINTER(n->m_len, void *));
1385 }
1386
1387 /*
1388 * The NFS RPC parsing code uses the data address and the length of mbuf
1389 * structures to calculate on-memory addresses. This function makes sure these
1390 * parameters are correctly aligned.
1391 */
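/*
 * It does this by scanning for the first mbuf whose data pointer or length
 * is misaligned and, when one is found, copying it and the rest of the
 * chain into freshly allocated (and therefore aligned) mbufs with
 * m_copyback() before splicing the copy back into the chain.
 */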
1392 void
1393 nfs_realign(struct mbuf **pm, int hsiz)
1394 {
1395 struct mbuf *m;
1396 struct mbuf *n = NULL;
1397 unsigned int off = 0;
1398
1399 ++nfs_realign_test;
1400 while ((m = *pm) != NULL) {
1401 if (!ALIGNED_POINTER(m->m_data, void *) ||
1402 !ALIGNED_POINTER(m->m_len, void *)) {
1403 MGET(n, M_WAIT, MT_DATA);
1404 #define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
1405 if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
1406 MCLGET(n, M_WAIT);
1407 }
1408 n->m_len = 0;
1409 break;
1410 }
1411 pm = &m->m_next;
1412 }
1413 /*
1414 * If n is non-NULL, loop on m copying data, then replace the
1415 * portion of the chain that had to be realigned.
1416 */
1417 if (n != NULL) {
1418 ++nfs_realign_count;
1419 while (m) {
1420 m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);
1421
1422 /*
1423 * If an unaligned amount of memory was copied, fix up
1424 * the last mbuf created by m_copyback().
1425 */
1426 if (!ALIGNED_POINTER(m->m_len, void *))
1427 nfs_realign_fixup(m, n, &off);
1428
1429 off += m->m_len;
1430 m = m->m_next;
1431 }
1432 m_freemp(pm);
1433 *pm = n;
1434 }
1435 }
1436
1437
1438 /*
1439 * Parse an RPC request
1440 * - verify it
1441 * - fill in the cred struct.
1442 */
1443 int
1444 nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
1445 {
1446 int len, i;
1447 u_int32_t *tl;
1448 u_int32_t nfsvers, auth_type;
1449 int error = 0;
1450 struct nfsm_info info;
1451
1452 info.nmi_mrep = nd->nd_mrep;
1453 info.nmi_md = nd->nd_md;
1454 info.nmi_dpos = nd->nd_dpos;
1455 info.nmi_errorp = &error;
1456 if (has_header) {
1457 tl = (uint32_t *)nfsm_dissect(&info, 10 * NFSX_UNSIGNED);
1458 if (tl == NULL)
1459 goto nfsmout;
1460 nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
1461 if (*tl++ != rpc_call) {
1462 m_freem(info.nmi_mrep);
1463 return (EBADRPC);
1464 }
1465 } else {
1466 tl = (uint32_t *)nfsm_dissect(&info, 8 * NFSX_UNSIGNED);
1467 if (tl == NULL)
1468 goto nfsmout;
1469 }
1470 nd->nd_repstat = 0;
1471 nd->nd_flag = 0;
1472 if (*tl++ != rpc_vers) {
1473 nd->nd_repstat = ERPCMISMATCH;
1474 nd->nd_procnum = NFSPROC_NOOP;
1475 return (0);
1476 }
1477 if (*tl != nfs_prog) {
1478 nd->nd_repstat = EPROGUNAVAIL;
1479 nd->nd_procnum = NFSPROC_NOOP;
1480 return (0);
1481 }
1482 tl++;
1483 nfsvers = fxdr_unsigned(u_int32_t, *tl++);
1484 if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
1485 nd->nd_repstat = EPROGMISMATCH;
1486 nd->nd_procnum = NFSPROC_NOOP;
1487 return (0);
1488 }
1489 if (nfsvers == NFS_VER3)
1490 nd->nd_flag = ND_NFSV3;
1491 nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
1492 if (nd->nd_procnum == NFSPROC_NULL)
1493 return (0);
1494 if (nd->nd_procnum >= NFS_NPROCS ||
1495 (nd->nd_procnum > NFSPROC_COMMIT) ||
1496 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
1497 nd->nd_repstat = EPROCUNAVAIL;
1498 nd->nd_procnum = NFSPROC_NOOP;
1499 return (0);
1500 }
1501 if ((nd->nd_flag & ND_NFSV3) == 0)
1502 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
1503 auth_type = *tl++;
1504 len = fxdr_unsigned(int, *tl++);
1505 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1506 m_freem(info.nmi_mrep);
1507 return (EBADRPC);
1508 }
1509
1510 /* Handle auth_unix */
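	/*
	 * The AUTH_UNIX credential body consists of a stamp, the machine
	 * name, uid, gid and a list of supplementary gids; it is followed
	 * by the verifier, which is skipped below.
	 */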
1511 if (auth_type == rpc_auth_unix) {
1512 len = fxdr_unsigned(int, *++tl);
1513 if (len < 0 || len > NFS_MAXNAMLEN) {
1514 m_freem(info.nmi_mrep);
1515 return (EBADRPC);
1516 }
1517 if (nfsm_adv(&info, nfsm_rndup(len)) != 0)
1518 goto nfsmout;
1519 tl = (uint32_t *)nfsm_dissect(&info, 3 * NFSX_UNSIGNED);
1520 if (tl == NULL)
1521 goto nfsmout;
1522 memset(&nd->nd_cr, 0, sizeof (struct ucred));
1523 refcnt_init(&nd->nd_cr.cr_refcnt);
1524 nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
1525 nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
1526 len = fxdr_unsigned(int, *tl);
1527 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
1528 m_freem(info.nmi_mrep);
1529 return (EBADRPC);
1530 }
1531 tl = (uint32_t *)
1532 nfsm_dissect(&info, (len + 2) * NFSX_UNSIGNED);
1533 if (tl == NULL)
1534 goto nfsmout;
1535 for (i = 0; i < len; i++) {
1536 if (i < NGROUPS_MAX)
1537 nd->nd_cr.cr_groups[i] =
1538 fxdr_unsigned(gid_t, *tl++);
1539 else
1540 tl++;
1541 }
1542 nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ? NGROUPS_MAX : len;
1543 len = fxdr_unsigned(int, *++tl);
1544 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1545 m_freem(info.nmi_mrep);
1546 return (EBADRPC);
1547 }
1548 if (len > 0) {
1549 if (nfsm_adv(&info, nfsm_rndup(len)) != 0)
1550 goto nfsmout;
1551 }
1552 } else {
1553 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
1554 nd->nd_procnum = NFSPROC_NOOP;
1555 return (0);
1556 }
1557
1558 nd->nd_md = info.nmi_md;
1559 nd->nd_dpos = info.nmi_dpos;
1560 return (0);
1561 nfsmout:
1562 return (error);
1563 }
1564
1565 void
1566 nfs_msg(struct nfsreq *rep, char *msg)
1567 {
1568 tpr_t tpr;
1569
1570 if (rep->r_procp)
1571 tpr = tprintf_open(rep->r_procp);
1572 else
1573 tpr = NULL;
1574
1575 tprintf(tpr, "nfs server %s: %s\n",
1576 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
1577 tprintf_close(tpr);
1578 }
1579
1580 #ifdef NFSSERVER
1581 /*
1582 * Socket upcall routine for the nfsd sockets.
1583 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
1584 * Essentially do as much as possible non-blocking, else punt and it will
1585 * be called with M_WAIT from an nfsd.
1586 */
1587 void
1588 nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
1589 {
1590 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
1591 struct mbuf *m;
1592 struct mbuf *mp, *nam;
1593 struct uio auio;
1594 int flags, error;
1595
1596 KERNEL_LOCK();
1597
1598 if ((slp->ns_flag & SLP_VALID) == 0)
1599 goto out;
1600
1601 /* Defer soreceive() to an nfsd. */
1602 if (waitflag == M_DONTWAIT) {
1603 slp->ns_flag |= SLP_NEEDQ;
1604 goto dorecs;
1605 }
1606
1607 auio.uio_procp = NULL;
1608 if (so->so_type == SOCK_STREAM) {
1609 /*
1610 * Do soreceive().
1611 */
1612 auio.uio_resid = 1000000000;
1613 flags = MSG_DONTWAIT;
1614 error = soreceive(so, NULL, &auio, &mp, NULL,
1615 &flags, 0);
1616 if (error || mp == NULL) {
1617 if (error == EWOULDBLOCK)
1618 slp->ns_flag |= SLP_NEEDQ;
1619 else
1620 slp->ns_flag |= SLP_DISCONN;
1621 goto dorecs;
1622 }
1623 m = mp;
1624 if (slp->ns_rawend) {
1625 slp->ns_rawend->m_next = m;
1626 slp->ns_cc += 1000000000 - auio.uio_resid;
1627 } else {
1628 slp->ns_raw = m;
1629 slp->ns_cc = 1000000000 - auio.uio_resid;
1630 }
1631 while (m->m_next)
1632 m = m->m_next;
1633 slp->ns_rawend = m;
1634
1635 /*
1636 * Now try and parse record(s) out of the raw stream data.
1637 */
1638 error = nfsrv_getstream(slp, waitflag);
1639 if (error) {
1640 if (error == EPERM)
1641 slp->ns_flag |= SLP_DISCONN;
1642 else
1643 slp->ns_flag |= SLP_NEEDQ;
1644 }
1645 } else {
1646 do {
1647 auio.uio_resid = 1000000000;
1648 flags = MSG_DONTWAIT;
1649 error = soreceive(so, &nam, &auio, &mp,
1650 NULL, &flags, 0);
1651 if (mp) {
1652 m = nam;
1653 m->m_next = mp;
1654 if (slp->ns_recend)
1655 slp->ns_recend->m_nextpkt = m;
1656 else
1657 slp->ns_rec = m;
1658 slp->ns_recend = m;
1659 m->m_nextpkt = NULL;
1660 }
1661 if (error) {
1662 if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
1663 && error != EWOULDBLOCK) {
1664 slp->ns_flag |= SLP_DISCONN;
1665 goto dorecs;
1666 }
1667 }
1668 } while (mp);
1669 }
1670
1671 /*
1672 * Now try and process the request records, non-blocking.
1673 */
1674 dorecs:
1675 if (waitflag == M_DONTWAIT &&
1676 (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
1677 nfsrv_wakenfsd(slp);
1678
1679 out:
1680 KERNEL_UNLOCK();
1681 }
1682
1683 /*
1684 * Try and extract an RPC request from the mbuf data list received on a
1685 * stream socket. The "waitflag" argument indicates whether or not it
1686 * can sleep.
1687 */
1688 int
1689 nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
1690 {
1691 struct mbuf *m, **mpp;
1692 char *cp1, *cp2;
1693 int len;
1694 struct mbuf *om, *m2, *recm;
1695 u_int32_t recmark;
1696
1697 if (slp->ns_flag & SLP_GETSTREAM)
1698 return (0);
1699 slp->ns_flag |= SLP_GETSTREAM;
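	/*
	 * Each pass of the loop below first extracts a 4 byte record mark
	 * (which may straddle mbufs), then splits ns_reclen bytes of data
	 * off the raw chain into the current fragment; once a mark with
	 * the last-fragment bit has been seen, the assembled record is
	 * queued on ns_rec for an nfsd to pick up.
	 */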
1700 for (;;) {
1701 if (slp->ns_reclen == 0) {
1702 if (slp->ns_cc < NFSX_UNSIGNED) {
1703 slp->ns_flag &= ~SLP_GETSTREAM;
1704 return (0);
1705 }
1706 m = slp->ns_raw;
1707 if (m->m_len >= NFSX_UNSIGNED) {
1708 bcopy(mtod(m, caddr_t), &recmark,
1709 NFSX_UNSIGNED);
1710 m->m_data += NFSX_UNSIGNED;
1711 m->m_len -= NFSX_UNSIGNED;
1712 } else {
1713 cp1 = (caddr_t)&recmark;
1714 cp2 = mtod(m, caddr_t);
1715 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
1716 while (m->m_len == 0) {
1717 m = m->m_next;
1718 cp2 = mtod(m, caddr_t);
1719 }
1720 *cp1++ = *cp2++;
1721 m->m_data++;
1722 m->m_len--;
1723 }
1724 }
1725 slp->ns_cc -= NFSX_UNSIGNED;
1726 recmark = ntohl(recmark);
1727 slp->ns_reclen = recmark & ~0x80000000;
1728 if (recmark & 0x80000000)
1729 slp->ns_flag |= SLP_LASTFRAG;
1730 else
1731 slp->ns_flag &= ~SLP_LASTFRAG;
1732 if (slp->ns_reclen > NFS_MAXPACKET) {
1733 slp->ns_flag &= ~SLP_GETSTREAM;
1734 return (EPERM);
1735 }
1736 }
1737
1738 /*
1739 * Now get the record part.
1740 */
1741 recm = NULL;
1742 if (slp->ns_cc == slp->ns_reclen) {
1743 recm = slp->ns_raw;
1744 slp->ns_raw = slp->ns_rawend = NULL;
1745 slp->ns_cc = slp->ns_reclen = 0;
1746 } else if (slp->ns_cc > slp->ns_reclen) {
1747 len = 0;
1748 m = slp->ns_raw;
1749 om = NULL;
1750 while (len < slp->ns_reclen) {
1751 if ((len + m->m_len) > slp->ns_reclen) {
1752 m2 = m_copym(m, 0, slp->ns_reclen - len,
1753 waitflag);
1754 if (m2) {
1755 if (om) {
1756 om->m_next = m2;
1757 recm = slp->ns_raw;
1758 } else
1759 recm = m2;
1760 m->m_data += slp->ns_reclen-len;
1761 m->m_len -= slp->ns_reclen-len;
1762 len = slp->ns_reclen;
1763 } else {
1764 slp->ns_flag &= ~SLP_GETSTREAM;
1765 return (EWOULDBLOCK);
1766 }
1767 } else if ((len + m->m_len) == slp->ns_reclen) {
1768 om = m;
1769 len += m->m_len;
1770 m = m->m_next;
1771 recm = slp->ns_raw;
1772 om->m_next = NULL;
1773 } else {
1774 om = m;
1775 len += m->m_len;
1776 m = m->m_next;
1777 }
1778 }
1779 slp->ns_raw = m;
1780 slp->ns_cc -= len;
1781 slp->ns_reclen = 0;
1782 } else {
1783 slp->ns_flag &= ~SLP_GETSTREAM;
1784 return (0);
1785 }
1786
1787 /*
1788 * Accumulate the fragments into a record.
1789 */
1790 mpp = &slp->ns_frag;
1791 while (*mpp)
1792 mpp = &((*mpp)->m_next);
1793 *mpp = recm;
1794 if (slp->ns_flag & SLP_LASTFRAG) {
1795 if (slp->ns_recend)
1796 slp->ns_recend->m_nextpkt = slp->ns_frag;
1797 else
1798 slp->ns_rec = slp->ns_frag;
1799 slp->ns_recend = slp->ns_frag;
1800 slp->ns_frag = NULL;
1801 }
1802 }
1803 }
1804
1805 /*
1806 * Parse an RPC header.
1807 */
1808 int
1809 nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
1810 struct nfsrv_descript **ndp)
1811 {
1812 struct mbuf *m, *nam;
1813 struct nfsrv_descript *nd;
1814 int error;
1815
1816 *ndp = NULL;
1817 if ((slp->ns_flag & SLP_VALID) == 0 ||
1818 (m = slp->ns_rec) == NULL)
1819 return (ENOBUFS);
1820 slp->ns_rec = m->m_nextpkt;
1821 if (slp->ns_rec)
1822 m->m_nextpkt = NULL;
1823 else
1824 slp->ns_recend = NULL;
1825 if (m->m_type == MT_SONAME) {
1826 nam = m;
1827 m = m->m_next;
1828 nam->m_next = NULL;
1829 } else
1830 nam = NULL;
1831 nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
1832 nfs_realign(&m, 10 * NFSX_UNSIGNED);
1833 nd->nd_md = nd->nd_mrep = m;
1834 nd->nd_nam2 = nam;
1835 nd->nd_dpos = mtod(m, caddr_t);
1836 error = nfs_getreq(nd, nfsd, 1);
1837 if (error) {
1838 m_freem(nam);
1839 pool_put(&nfsrv_descript_pl, nd);
1840 return (error);
1841 }
1842 *ndp = nd;
1843 nfsd->nfsd_nd = nd;
1844 return (0);
1845 }
1846
1847
1848 /*
1849 * Search for a sleeping nfsd and wake it up.
1850 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
1851 * running nfsds will go look for the work in the nfssvc_sock list.
1852 */
1853 void
1854 nfsrv_wakenfsd(struct nfssvc_sock *slp)
1855 {
1856 struct nfsd *nfsd;
1857
1858 if ((slp->ns_flag & SLP_VALID) == 0)
1859 return;
1860
1861 TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
1862 if (nfsd->nfsd_flag & NFSD_WAITING) {
1863 nfsd->nfsd_flag &= ~NFSD_WAITING;
1864 if (nfsd->nfsd_slp)
1865 panic("nfsd wakeup");
1866 slp->ns_sref++;
1867 nfsd->nfsd_slp = slp;
1868 wakeup_one(nfsd);
1869 return;
1870 }
1871 }
1872
1873 slp->ns_flag |= SLP_DOREC;
1874 nfsd_head_flag |= NFSD_CHECKSLP;
1875 }
1876 #endif /* NFSSERVER */
1877