/*	$OpenBSD: nfs_socket.c,v 1.111 2015/08/24 14:00:29 bluhm Exp $	*/
/*	$NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/vnode.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/queue.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfs_var.h>

/* External data, mostly RPC constants in XDR form. */
extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
	rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
extern u_int32_t nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;

extern struct pool nfsrv_descript_pl;

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point.  The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
 * SIGCOMM '88".  ACM, August 1988.
 * describes for TCP.  The cwnd size is chopped in half on a retransmit
 * timeout and incremented by 1/cwnd when each rpc reply is received and
 * a full cwnd of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
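 * In the scaled units used below this additive increase works out to
 * roughly nm_cwnd += NFS_CWNDSCALE * NFS_CWNDSCALE / nm_cwnd per reply
 * (see nfs_reply()), so a full window of replies grows the window by
 * about one rpc, while nfs_timer() halves nm_cwnd on a timeout.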
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger); I suspect this is due to
 * the large rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };

/* RTT estimator */
enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void nfs_init_rtt(struct nfsmount *);
void nfs_update_rtt(struct nfsreq *);
int  nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void nfs_realign(struct mbuf **, int);
void nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);
unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point. */
void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
 * result in r_rtt == 0.  Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
 */
void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	int index = nfs_ptimers[rep->r_procnum] - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	t1 -= *srtt >> 3;
	*srtt += t1;
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;
}

/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that timer est. would probably be stale.
 * Also, since many of these RPCs are non-idempotent, a conservative
 * timeout is desired.
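 *
 * Note on scaling: with the 1/8 and 1/4 gains above, nm_srtt converges
 * on roughly 8 times the smoothed RTT and nm_sdrtt on roughly 4 times
 * the mean deviation, which the shifts in the code below undo.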
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		rto = nmp->nm_timeo;
		return (rto);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int s, error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *m;

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error)
		goto bad;
	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port
	 * number.  We always allocate a reserved port, as this prevents
	 * filehandle disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		struct mbuf *mopt;
		int *ip;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		MGET(m, M_WAIT, MT_SONAME);
		sin = mtod(m, struct sockaddr_in *);
		memset(sin, 0, sizeof(*sin));
		sin->sin_len = m->m_len = sizeof(struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		error = sobind(so, m, &proc0);
		m_freem(m);
		if (error)
			goto bad;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad;

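		/*
		 * Note that soconnect() above only starts the handshake on
		 * a connection-oriented socket; SS_ISCONNECTING stays set
		 * on the socket until it actually completes or fails.
		 */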
		/*
		 * Wait for the connection to complete.  Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long time.
		 */
		s = splsoftnet();
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			(void) tsleep((caddr_t)&so->so_timeo, PSOCK,
			    "nfscon", 2 * hz);
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				splx(s);
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto bad;
		}
		splx(s);
	}
	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	so->so_rcv.sb_timeo = (5 * hz);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo = (5 * hz);
	else
		so->so_snd.sb_timeo = 0;
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else {
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	}
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;

	/* Initialize other non-zero congestion variables */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int s, error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		(void) tsleep((caddr_t)&lbolt, PSOCK, "nfsrecon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	s = splsoftnet();
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	splx(s);
	return (0);
}

/*
 * NFS disconnect.  Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so);
	}
}

/*
 * This is the nfs send routine.  For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top,
    struct nfsreq *rep)
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	if (so->so_type == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	error = sosend(so, sendnam, NULL, top, NULL, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}

#ifdef NFSCLIENT
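/*
 * Client side RPC flow, for orientation: nfs_request() builds the call
 * and queues it, nfs_send() transmits it, nfs_reply()/nfs_receive()
 * match replies to requests by xid, and nfs_timer() drives
 * retransmission and the congestion window.
 */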
/*
 * Receive a Sun RPC Request/Reply.  For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
		if (sotype == SOCK_STREAM) {
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >= rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%zu) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

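			/*
			 * The reply is assumed to arrive as a single record
			 * fragment: the last-fragment bit is simply masked
			 * off below and exactly len bytes are then read for
			 * the record body.
			 */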
			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%u) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%u) from nfs server %s\n",
				    len - auio.uio_resid, len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control messages, but must grab
			 * them and then throw them away so we know what is
			 * going on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp, &control,
				    &rcvflg, 0);
				m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error) {
		m_freem(*mp);
		*mp = NULL;
	}
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}

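/*
 * Note the contract with nfs_rcvlock(): it returns EALREADY when the
 * reply was matched and attached to rep->r_mrep by another process
 * while we slept, which nfs_reply() below maps to success.
 */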
/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(struct nfsreq *myrep)
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	struct nfsm_info info;
	struct mbuf *nam;
	u_int32_t rxid, *tl, t1;
	caddr_t cp2;
	int s, error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next Rpc reply off the socket
		 */
		error = nfs_receive(myrep, &nam, &info.nmi_mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {
			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		info.nmi_md = info.nmi_mrep;
		info.nmi_dpos = mtod(info.nmi_md, caddr_t);
		nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(info.nmi_mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		s = splsoftnet();
		TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = info.nmi_mrep;
				rep->r_md = info.nmi_md;
				rep->r_dpos = info.nmi_dpos;

				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;

				if (rep->r_flags & R_TIMING)
					nfs_update_rtt(rep);

				nmp->nm_timeouts = 0;
				break;
			}
		}
		splx(s);
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == NULL) {
			nfsstats.rpcunexpected++;
			m_freem(info.nmi_mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsmount *nmp;
	struct timeval tv;
	caddr_t cp2;
	int t1, i, s, error = 0;
	int trylater_delay;
	struct nfsreq *rep;
	int mrest_len;
	struct nfsm_info info;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = infop->nmi_procp;
	rep->r_procnum = procnum;

	mrest_len = 0;
	m = infop->nmi_mreq;
	while (m) {
		mrest_len += m->m_len;
		m = m->m_next;
	}

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = infop->nmi_mreq;
	rep->r_mreq->m_pkthdr.len = mrest_len;

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization. */
	nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
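	/*
	 * The mark is one 32-bit word in network byte order: the high bit
	 * flags the final fragment of the record and the low 31 bits give
	 * the fragment length, so a request of, say, 100 bytes sent as a
	 * single fragment carries htonl(0x80000000 | 100).
	 */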
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	rep->r_rtt = rep->r_rexmit = 0;
	if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	s = splsoftnet();
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_add(&nmp->nm_rtimeout, nfs_ticks);
	TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		splx(s);
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT), rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		splx(s);
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	s = splsoftnet();
	TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_del(&nmp->nm_rtimeout);
	splx(s);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If we got a successful reply after a tprintf'd "not responding"
	 * message, tprintf that the server is alive again.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	info.nmi_mrep = rep->r_mrep;
	info.nmi_md = rep->r_md;
	info.nmi_dpos = rep->r_dpos;
	if (error) {
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;	/* Should be EAUTH. */
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Since we only support RPCAUTH_UNIX atm we step over the
	 * reply verifier type, and in the (error) case that there really
	 * is any data in it, we advance over it.
	 */
	tl++;			/* Step over verifier type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* Should not happen */

	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(info.nmi_mrep);
				error = 0;
				tv.tv_sec = trylater_delay;
				tv.tv_usec = 0;
				tsleep(&tv, PSOCK, "nfsretry", tvtohz(&tv));
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);
		}
		goto nfsmout;
	}

	error = EPROTONOSUPPORT;

nfsmout:
	infop->nmi_mrep = info.nmi_mrep;
	infop->nmi_md = info.nmi_md;
	infop->nmi_dpos = info.nmi_dpos;
nfsmout1:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
    int err, struct mbuf **mrq, struct mbuf **mbp)
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= MHLEN - max_hdr) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}

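/*
 * Retransmit backoff: once requests on a mount have timed out, later
 * timeout checks scale the estimated rto by nfs_backoff[nm_timeouts - 1],
 * i.e. by 2, 4, 8, ... up to 256 after eight consecutive timeouts.
 */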
/*
 * nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out.
 */
void
nfs_timer(void *arg)
{
	struct nfsmount *nmp = arg;
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	int timeo, s, error;

	s = splsoftnet();
	TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}

		/* Check for server not responding. */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= nmp->nm_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}

		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows, resend it.
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))) {
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = (*so->so_proto->pr_usrreq)(so, PRU_SEND,
				    m, NULL, NULL, curproc);
			else
				error = (*so->so_proto->pr_usrreq)(so, PRU_SEND,
				    m, nmp->nm_nam, NULL, curproc);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	splx(s);
	timeout_add(&nmp->nm_rtimeout, nfs_ticks);
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
{

	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (p && p->p_siglist &&
	    (((p->p_siglist & ~p->p_sigmask) &
	    ~p->p_p->ps_sigacts->ps_sigignore) & NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

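/*
 * The send and receive sides of a mount are serialized with simple
 * flag locks (NFSMNT_SNDLOCK, NFSMNT_RCVLOCK) in nm_flag, built on
 * tsleep()/wakeup() on the address of the flag word.  On interruptible
 * mounts the first sleep uses PCATCH; after that the lock is polled
 * every 2 * hz ticks so pending signals are still noticed.
 */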
/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(int *flagp, struct nfsreq *rep)
{
	struct proc *p;
	int slpflag = 0, slptimeo = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck",
		    slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_sndunlock(int *flagp)
{

	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

int
nfs_rcvlock(struct nfsreq *rep)
{
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag, slptimeo = 0;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk",
		    slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we were sleeping.
			 */
			return (EALREADY);
		}
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_rcvunlock(int *flagp)
{

	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup((caddr_t)flagp);
	}
}

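/*
 * The XDR parsing code casts mbuf data to u_int32_t pointers, so on
 * machines with strict alignment requirements a misaligned m_data or an
 * odd m_len somewhere in a chain could fault; the two routines below
 * copy such chains into freshly allocated, properly aligned mbufs.
 */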
/*
 * Auxiliary routine to align the length of mbuf copies made with m_copyback().
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in an mbuf
	 * is always an aligned quantity, so realign happens at the chain's
	 * tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain. Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > M_TRAILINGSPACE(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;

	} while (!ALIGNED_POINTER(n->m_len, void *));
}

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses. This function makes sure these
 * parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
#define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
			if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freem(*pm);
		*pm = n;
	}
}

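/*
 * For reference, the call header walked below consists of these XDR
 * words: xid, direction (CALL), rpc version, program, version and
 * procedure, followed by the credential (flavor, length, body) and
 * the verifier (flavor, length, body).
 */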
/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;
	caddr_t cp2;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
	if (auth_type == rpc_auth_unix) {
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		memset(&nd->nd_cr, 0, sizeof (struct ucred));
		nd->nd_cr.cr_ref = 1;
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		for (i = 0; i < len; i++)
			if (i < NGROUPS_MAX)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ?
		    NGROUPS_MAX : len;
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	return (error);
}

void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;
#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ;
		goto dorecs;
	}
#endif
	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == M_DONTWAIT) {
			slp->ns_flag |= SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, NULL,
		    &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
			    NULL, &flags, 0);
			if (mp) {
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		return (0);
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0, slp->ns_reclen - len,
					    waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}

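/*
 * Complete records queued on slp->ns_rec by the routines above are
 * handed to the nfsd threads one at a time by nfsrv_dorec() below; for
 * datagram sockets the sender's address travels at the head of the
 * record as an MT_SONAME mbuf.
 */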
/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */