/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
35 * 36 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95 37 * $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $ 38 * $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.45 2007/05/18 17:05:13 dillon Exp $ 39 */ 40 41 /* 42 * Socket operations for use by nfs 43 */ 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/proc.h> 48 #include <sys/malloc.h> 49 #include <sys/mount.h> 50 #include <sys/kernel.h> 51 #include <sys/mbuf.h> 52 #include <sys/vnode.h> 53 #include <sys/fcntl.h> 54 #include <sys/protosw.h> 55 #include <sys/resourcevar.h> 56 #include <sys/socket.h> 57 #include <sys/socketvar.h> 58 #include <sys/socketops.h> 59 #include <sys/syslog.h> 60 #include <sys/thread.h> 61 #include <sys/tprintf.h> 62 #include <sys/sysctl.h> 63 #include <sys/signalvar.h> 64 65 #include <sys/signal2.h> 66 #include <sys/mutex2.h> 67 #include <sys/socketvar2.h> 68 69 #include <netinet/in.h> 70 #include <netinet/tcp.h> 71 #include <sys/thread2.h> 72 73 #include "rpcv2.h" 74 #include "nfsproto.h" 75 #include "nfs.h" 76 #include "xdr_subs.h" 77 #include "nfsm_subs.h" 78 #include "nfsmount.h" 79 #include "nfsnode.h" 80 #include "nfsrtt.h" 81 82 #define TRUE 1 83 #define FALSE 0 84 85 /* 86 * RTT calculations are scaled by 256 (8 bits). A proper fractional 87 * RTT will still be calculated even with a slow NFS timer. 88 */ 89 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum]] 90 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum]] 91 #define NFS_RTT_SCALE_BITS 8 /* bits */ 92 #define NFS_RTT_SCALE 256 /* value */ 93 94 /* 95 * Defines which timer to use for the procnum. 
96 * 0 - default 97 * 1 - getattr 98 * 2 - lookup 99 * 3 - read 100 * 4 - write 101 */ 102 static int proct[NFS_NPROCS] = { 103 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, /* 00-09 */ 104 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, /* 10-19 */ 105 0, 5, 0, 0, 0, 0, /* 20-29 */ 106 }; 107 108 static int multt[NFS_NPROCS] = { 109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-09 */ 110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-19 */ 111 1, 2, 1, 1, 1, 1, /* 20-29 */ 112 }; 113 114 static int nfs_backoff[8] = { 2, 3, 5, 8, 13, 21, 34, 55 }; 115 static int nfs_realign_test; 116 static int nfs_realign_count; 117 static int nfs_showrtt; 118 static int nfs_showrexmit; 119 int nfs_maxasyncbio = NFS_MAXASYNCBIO; 120 121 SYSCTL_DECL(_vfs_nfs); 122 123 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, ""); 124 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, ""); 125 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrtt, CTLFLAG_RW, &nfs_showrtt, 0, ""); 126 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrexmit, CTLFLAG_RW, &nfs_showrexmit, 0, ""); 127 SYSCTL_INT(_vfs_nfs, OID_AUTO, maxasyncbio, CTLFLAG_RW, &nfs_maxasyncbio, 0, ""); 128 129 static int nfs_request_setup(nfsm_info_t info); 130 static int nfs_request_auth(struct nfsreq *rep); 131 static int nfs_request_try(struct nfsreq *rep); 132 static int nfs_request_waitreply(struct nfsreq *rep); 133 static int nfs_request_processreply(nfsm_info_t info, int); 134 135 int nfsrtton = 0; 136 struct nfsrtt nfsrtt; 137 struct callout nfs_timer_handle; 138 139 static int nfs_msg (struct thread *,char *,char *); 140 static int nfs_rcvlock (struct nfsmount *nmp, struct nfsreq *myreq); 141 static void nfs_rcvunlock (struct nfsmount *nmp); 142 static void nfs_realign (struct mbuf **pm, int hsiz); 143 static int nfs_receive (struct nfsmount *nmp, struct nfsreq *rep, 144 struct sockaddr **aname, struct mbuf **mp); 145 static void nfs_softterm (struct nfsreq *rep, int islocked); 146 static void nfs_hardterm (struct nfsreq *rep, int islocked); 147 
static int nfs_reconnect (struct nfsmount *nmp, struct nfsreq *rep); 148 #ifndef NFS_NOSERVER 149 static int nfsrv_getstream (struct nfssvc_sock *, int, int *); 150 static void nfs_timer_req(struct nfsreq *req); 151 static void nfs_checkpkt(struct mbuf *m, int len); 152 153 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd, 154 struct nfssvc_sock *slp, 155 struct thread *td, 156 struct mbuf **mreqp) = { 157 nfsrv_null, 158 nfsrv_getattr, 159 nfsrv_setattr, 160 nfsrv_lookup, 161 nfsrv3_access, 162 nfsrv_readlink, 163 nfsrv_read, 164 nfsrv_write, 165 nfsrv_create, 166 nfsrv_mkdir, 167 nfsrv_symlink, 168 nfsrv_mknod, 169 nfsrv_remove, 170 nfsrv_rmdir, 171 nfsrv_rename, 172 nfsrv_link, 173 nfsrv_readdir, 174 nfsrv_readdirplus, 175 nfsrv_statfs, 176 nfsrv_fsinfo, 177 nfsrv_pathconf, 178 nfsrv_commit, 179 nfsrv_noop, 180 nfsrv_noop, 181 nfsrv_noop, 182 nfsrv_noop 183 }; 184 #endif /* NFS_NOSERVER */ 185 186 /* 187 * Initialize sockets and congestion for a new NFS connection. 188 * We do not free the sockaddr if error. 189 */ 190 int 191 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) 192 { 193 struct socket *so; 194 int error; 195 struct sockaddr *saddr; 196 struct sockaddr_in *sin; 197 struct thread *td = &thread0; /* only used for socreate and sobind */ 198 199 nmp->nm_so = so = NULL; 200 if (nmp->nm_flag & NFSMNT_FORCE) 201 return (EINVAL); 202 saddr = nmp->nm_nam; 203 error = socreate(saddr->sa_family, &so, nmp->nm_sotype, 204 nmp->nm_soproto, td); 205 if (error) 206 goto bad; 207 nmp->nm_soflags = so->so_proto->pr_flags; 208 209 /* 210 * Some servers require that the client port be a reserved port number. 
211 */ 212 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { 213 struct sockopt sopt; 214 int ip; 215 struct sockaddr_in ssin; 216 217 bzero(&sopt, sizeof sopt); 218 ip = IP_PORTRANGE_LOW; 219 sopt.sopt_level = IPPROTO_IP; 220 sopt.sopt_name = IP_PORTRANGE; 221 sopt.sopt_val = (void *)&ip; 222 sopt.sopt_valsize = sizeof(ip); 223 sopt.sopt_td = NULL; 224 error = sosetopt(so, &sopt); 225 if (error) 226 goto bad; 227 bzero(&ssin, sizeof ssin); 228 sin = &ssin; 229 sin->sin_len = sizeof (struct sockaddr_in); 230 sin->sin_family = AF_INET; 231 sin->sin_addr.s_addr = INADDR_ANY; 232 sin->sin_port = htons(0); 233 error = sobind(so, (struct sockaddr *)sin, td); 234 if (error) 235 goto bad; 236 bzero(&sopt, sizeof sopt); 237 ip = IP_PORTRANGE_DEFAULT; 238 sopt.sopt_level = IPPROTO_IP; 239 sopt.sopt_name = IP_PORTRANGE; 240 sopt.sopt_val = (void *)&ip; 241 sopt.sopt_valsize = sizeof(ip); 242 sopt.sopt_td = NULL; 243 error = sosetopt(so, &sopt); 244 if (error) 245 goto bad; 246 } 247 248 /* 249 * Protocols that do not require connections may be optionally left 250 * unconnected for servers that reply from a port other than NFS_PORT. 251 */ 252 if (nmp->nm_flag & NFSMNT_NOCONN) { 253 if (nmp->nm_soflags & PR_CONNREQUIRED) { 254 error = ENOTCONN; 255 goto bad; 256 } 257 } else { 258 error = soconnect(so, nmp->nm_nam, td); 259 if (error) 260 goto bad; 261 262 /* 263 * Wait for the connection to complete. Cribbed from the 264 * connect system call but with the wait timing out so 265 * that interruptible mounts don't hang here for a long time. 
266 */ 267 crit_enter(); 268 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 269 (void) tsleep((caddr_t)&so->so_timeo, 0, 270 "nfscon", 2 * hz); 271 if ((so->so_state & SS_ISCONNECTING) && 272 so->so_error == 0 && rep && 273 (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){ 274 soclrstate(so, SS_ISCONNECTING); 275 crit_exit(); 276 goto bad; 277 } 278 } 279 if (so->so_error) { 280 error = so->so_error; 281 so->so_error = 0; 282 crit_exit(); 283 goto bad; 284 } 285 crit_exit(); 286 } 287 so->so_rcv.ssb_timeo = (5 * hz); 288 so->so_snd.ssb_timeo = (5 * hz); 289 290 /* 291 * Get buffer reservation size from sysctl, but impose reasonable 292 * limits. 293 */ 294 if (nmp->nm_sotype == SOCK_STREAM) { 295 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 296 struct sockopt sopt; 297 int val; 298 299 bzero(&sopt, sizeof sopt); 300 sopt.sopt_level = SOL_SOCKET; 301 sopt.sopt_name = SO_KEEPALIVE; 302 sopt.sopt_val = &val; 303 sopt.sopt_valsize = sizeof val; 304 val = 1; 305 sosetopt(so, &sopt); 306 } 307 if (so->so_proto->pr_protocol == IPPROTO_TCP) { 308 struct sockopt sopt; 309 int val; 310 311 bzero(&sopt, sizeof sopt); 312 sopt.sopt_level = IPPROTO_TCP; 313 sopt.sopt_name = TCP_NODELAY; 314 sopt.sopt_val = &val; 315 sopt.sopt_valsize = sizeof val; 316 val = 1; 317 sosetopt(so, &sopt); 318 319 bzero(&sopt, sizeof sopt); 320 sopt.sopt_level = IPPROTO_TCP; 321 sopt.sopt_name = TCP_FASTKEEP; 322 sopt.sopt_val = &val; 323 sopt.sopt_valsize = sizeof val; 324 val = 1; 325 sosetopt(so, &sopt); 326 } 327 } 328 error = soreserve(so, nfs_soreserve, nfs_soreserve, NULL); 329 if (error) 330 goto bad; 331 atomic_set_int(&so->so_rcv.ssb_flags, SSB_NOINTR); 332 atomic_set_int(&so->so_snd.ssb_flags, SSB_NOINTR); 333 334 /* Initialize other non-zero congestion variables */ 335 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = 336 nmp->nm_srtt[3] = (NFS_TIMEO << NFS_RTT_SCALE_BITS); 337 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = 338 nmp->nm_sdrtt[3] = 0; 339 
nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED; 340 nmp->nm_timeouts = 0; 341 342 /* 343 * Assign nm_so last. The moment nm_so is assigned the nfs_timer() 344 * can mess with the socket. 345 */ 346 nmp->nm_so = so; 347 return (0); 348 349 bad: 350 if (so) { 351 soshutdown(so, SHUT_RDWR); 352 soclose(so, FNONBLOCK); 353 } 354 return (error); 355 } 356 357 /* 358 * Reconnect routine: 359 * Called when a connection is broken on a reliable protocol. 360 * - clean up the old socket 361 * - nfs_connect() again 362 * - set R_NEEDSXMIT for all outstanding requests on mount point 363 * If this fails the mount point is DEAD! 364 * nb: Must be called with the nfs_sndlock() set on the mount point. 365 */ 366 static int 367 nfs_reconnect(struct nfsmount *nmp, struct nfsreq *rep) 368 { 369 struct nfsreq *req; 370 int error; 371 372 nfs_disconnect(nmp); 373 if (nmp->nm_rxstate >= NFSSVC_STOPPING) 374 return (EINTR); 375 while ((error = nfs_connect(nmp, rep)) != 0) { 376 if (error == EINTR || error == ERESTART) 377 return (EINTR); 378 if (error == EINVAL) 379 return (error); 380 if (nmp->nm_rxstate >= NFSSVC_STOPPING) 381 return (EINTR); 382 (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0); 383 } 384 385 /* 386 * Loop through outstanding request list and fix up all requests 387 * on old socket. 388 */ 389 crit_enter(); 390 TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { 391 KKASSERT(req->r_nmp == nmp); 392 req->r_flags |= R_NEEDSXMIT; 393 } 394 crit_exit(); 395 return (0); 396 } 397 398 /* 399 * NFS disconnect. Clean up and unlink. 400 */ 401 void 402 nfs_disconnect(struct nfsmount *nmp) 403 { 404 struct socket *so; 405 406 if (nmp->nm_so) { 407 so = nmp->nm_so; 408 nmp->nm_so = NULL; 409 soshutdown(so, SHUT_RDWR); 410 soclose(so, FNONBLOCK); 411 } 412 } 413 414 void 415 nfs_safedisconnect(struct nfsmount *nmp) 416 { 417 nfs_rcvlock(nmp, NULL); 418 nfs_disconnect(nmp); 419 nfs_rcvunlock(nmp); 420 } 421 422 /* 423 * This is the nfs send routine. 
For connection based socket types, it 424 * must be called with an nfs_sndlock() on the socket. 425 * "rep == NULL" indicates that it has been called from a server. 426 * For the client side: 427 * - return EINTR if the RPC is terminated, 0 otherwise 428 * - set R_NEEDSXMIT if the send fails for any reason 429 * - do any cleanup required by recoverable socket errors (?) 430 * For the server side: 431 * - return EINTR or ERESTART if interrupted by a signal 432 * - return EPIPE if a connection is lost for connection based sockets (TCP...) 433 * - do any cleanup required by recoverable socket errors (?) 434 */ 435 int 436 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top, 437 struct nfsreq *rep) 438 { 439 struct sockaddr *sendnam; 440 int error, soflags, flags; 441 442 if (rep) { 443 if (rep->r_flags & R_SOFTTERM) { 444 m_freem(top); 445 return (EINTR); 446 } 447 if ((so = rep->r_nmp->nm_so) == NULL) { 448 rep->r_flags |= R_NEEDSXMIT; 449 m_freem(top); 450 return (0); 451 } 452 rep->r_flags &= ~R_NEEDSXMIT; 453 soflags = rep->r_nmp->nm_soflags; 454 } else { 455 soflags = so->so_proto->pr_flags; 456 } 457 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) 458 sendnam = NULL; 459 else 460 sendnam = nam; 461 if (so->so_type == SOCK_SEQPACKET) 462 flags = MSG_EOR; 463 else 464 flags = 0; 465 466 /* 467 * calls pru_sosend -> sosend -> so_pru_send -> netrpc 468 */ 469 error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags, 470 curthread /*XXX*/); 471 /* 472 * ENOBUFS for dgram sockets is transient and non fatal. 473 * No need to log, and no need to break a soft mount. 
474 */ 475 if (error == ENOBUFS && so->so_type == SOCK_DGRAM) { 476 error = 0; 477 /* 478 * do backoff retransmit on client 479 */ 480 if (rep) { 481 if ((rep->r_nmp->nm_state & NFSSTA_SENDSPACE) == 0) { 482 rep->r_nmp->nm_state |= NFSSTA_SENDSPACE; 483 kprintf("Warning: NFS: Insufficient sendspace " 484 "(%lu),\n" 485 "\t You must increase vfs.nfs.soreserve" 486 "or decrease vfs.nfs.maxasyncbio\n", 487 so->so_snd.ssb_hiwat); 488 } 489 rep->r_flags |= R_NEEDSXMIT; 490 } 491 } 492 493 if (error) { 494 if (rep) { 495 log(LOG_INFO, "nfs send error %d for server %s\n",error, 496 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 497 /* 498 * Deal with errors for the client side. 499 */ 500 if (rep->r_flags & R_SOFTTERM) 501 error = EINTR; 502 else 503 rep->r_flags |= R_NEEDSXMIT; 504 } else { 505 log(LOG_INFO, "nfsd send error %d\n", error); 506 } 507 508 /* 509 * Handle any recoverable (soft) socket errors here. (?) 510 */ 511 if (error != EINTR && error != ERESTART && 512 error != EWOULDBLOCK && error != EPIPE) 513 error = 0; 514 } 515 return (error); 516 } 517 518 /* 519 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all 520 * done by soreceive(), but for SOCK_STREAM we must deal with the Record 521 * Mark and consolidate the data into a new mbuf list. 522 * nb: Sometimes TCP passes the data up to soreceive() in long lists of 523 * small mbufs. 524 * For SOCK_STREAM we must be very careful to read an entire record once 525 * we have read any of it, even if the system call has been interrupted. 
526 */ 527 static int 528 nfs_receive(struct nfsmount *nmp, struct nfsreq *rep, 529 struct sockaddr **aname, struct mbuf **mp) 530 { 531 struct socket *so; 532 struct sockbuf sio; 533 struct uio auio; 534 struct iovec aio; 535 struct mbuf *m; 536 struct mbuf *control; 537 u_int32_t len; 538 struct sockaddr **getnam; 539 int error, sotype, rcvflg; 540 struct thread *td = curthread; /* XXX */ 541 542 /* 543 * Set up arguments for soreceive() 544 */ 545 *mp = NULL; 546 *aname = NULL; 547 sotype = nmp->nm_sotype; 548 549 /* 550 * For reliable protocols, lock against other senders/receivers 551 * in case a reconnect is necessary. 552 * For SOCK_STREAM, first get the Record Mark to find out how much 553 * more there is to get. 554 * We must lock the socket against other receivers 555 * until we have an entire rpc request/reply. 556 */ 557 if (sotype != SOCK_DGRAM) { 558 error = nfs_sndlock(nmp, rep); 559 if (error) 560 return (error); 561 tryagain: 562 /* 563 * Check for fatal errors and resending request. 564 */ 565 /* 566 * Ugh: If a reconnect attempt just happened, nm_so 567 * would have changed. NULL indicates a failed 568 * attempt that has essentially shut down this 569 * mount point. 
570 */ 571 if (rep && (rep->r_mrep || (rep->r_flags & R_SOFTTERM))) { 572 nfs_sndunlock(nmp); 573 return (EINTR); 574 } 575 so = nmp->nm_so; 576 if (so == NULL) { 577 error = nfs_reconnect(nmp, rep); 578 if (error) { 579 nfs_sndunlock(nmp); 580 return (error); 581 } 582 goto tryagain; 583 } 584 while (rep && (rep->r_flags & R_NEEDSXMIT)) { 585 m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT); 586 nfsstats.rpcretries++; 587 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep); 588 if (error) { 589 if (error == EINTR || error == ERESTART || 590 (error = nfs_reconnect(nmp, rep)) != 0) { 591 nfs_sndunlock(nmp); 592 return (error); 593 } 594 goto tryagain; 595 } 596 } 597 nfs_sndunlock(nmp); 598 if (sotype == SOCK_STREAM) { 599 /* 600 * Get the length marker from the stream 601 */ 602 aio.iov_base = (caddr_t)&len; 603 aio.iov_len = sizeof(u_int32_t); 604 auio.uio_iov = &aio; 605 auio.uio_iovcnt = 1; 606 auio.uio_segflg = UIO_SYSSPACE; 607 auio.uio_rw = UIO_READ; 608 auio.uio_offset = 0; 609 auio.uio_resid = sizeof(u_int32_t); 610 auio.uio_td = td; 611 do { 612 rcvflg = MSG_WAITALL; 613 error = so_pru_soreceive(so, NULL, &auio, NULL, 614 NULL, &rcvflg); 615 if (error == EWOULDBLOCK && rep) { 616 if (rep->r_flags & R_SOFTTERM) 617 return (EINTR); 618 } 619 } while (error == EWOULDBLOCK); 620 621 if (error == 0 && auio.uio_resid > 0) { 622 /* 623 * Only log short packets if not EOF 624 */ 625 if (auio.uio_resid != sizeof(u_int32_t)) 626 log(LOG_INFO, 627 "short receive (%d/%d) from nfs server %s\n", 628 (int)(sizeof(u_int32_t) - auio.uio_resid), 629 (int)sizeof(u_int32_t), 630 nmp->nm_mountp->mnt_stat.f_mntfromname); 631 error = EPIPE; 632 } 633 if (error) 634 goto errout; 635 len = ntohl(len) & ~0x80000000; 636 /* 637 * This is SERIOUS! We are out of sync with the sender 638 * and forcing a disconnect/reconnect is all I can do. 
639 */ 640 if (len > NFS_MAXPACKET) { 641 log(LOG_ERR, "%s (%d) from nfs server %s\n", 642 "impossible packet length", 643 len, 644 nmp->nm_mountp->mnt_stat.f_mntfromname); 645 error = EFBIG; 646 goto errout; 647 } 648 649 /* 650 * Get the rest of the packet as an mbuf chain 651 */ 652 sbinit(&sio, len); 653 do { 654 rcvflg = MSG_WAITALL; 655 error = so_pru_soreceive(so, NULL, NULL, &sio, 656 NULL, &rcvflg); 657 } while (error == EWOULDBLOCK || error == EINTR || 658 error == ERESTART); 659 if (error == 0 && sio.sb_cc != len) { 660 if (sio.sb_cc != 0) 661 log(LOG_INFO, 662 "short receive (%zu/%d) from nfs server %s\n", 663 (size_t)len - auio.uio_resid, len, 664 nmp->nm_mountp->mnt_stat.f_mntfromname); 665 error = EPIPE; 666 } 667 *mp = sio.sb_mb; 668 } else { 669 /* 670 * Non-stream, so get the whole packet by not 671 * specifying MSG_WAITALL and by specifying a large 672 * length. 673 * 674 * We have no use for control msg., but must grab them 675 * and then throw them away so we know what is going 676 * on. 
677 */ 678 sbinit(&sio, 100000000); 679 do { 680 rcvflg = 0; 681 error = so_pru_soreceive(so, NULL, NULL, &sio, 682 &control, &rcvflg); 683 if (control) 684 m_freem(control); 685 if (error == EWOULDBLOCK && rep) { 686 if (rep->r_flags & R_SOFTTERM) { 687 m_freem(sio.sb_mb); 688 return (EINTR); 689 } 690 } 691 } while (error == EWOULDBLOCK || 692 (error == 0 && sio.sb_mb == NULL && control)); 693 if ((rcvflg & MSG_EOR) == 0) 694 kprintf("Egad!!\n"); 695 if (error == 0 && sio.sb_mb == NULL) 696 error = EPIPE; 697 len = sio.sb_cc; 698 *mp = sio.sb_mb; 699 } 700 errout: 701 if (error && error != EINTR && error != ERESTART) { 702 m_freem(*mp); 703 *mp = NULL; 704 if (error != EPIPE) { 705 log(LOG_INFO, 706 "receive error %d from nfs server %s\n", 707 error, 708 nmp->nm_mountp->mnt_stat.f_mntfromname); 709 } 710 error = nfs_sndlock(nmp, rep); 711 if (!error) { 712 error = nfs_reconnect(nmp, rep); 713 if (!error) 714 goto tryagain; 715 else 716 nfs_sndunlock(nmp); 717 } 718 } 719 } else { 720 if ((so = nmp->nm_so) == NULL) 721 return (EACCES); 722 if (so->so_state & SS_ISCONNECTED) 723 getnam = NULL; 724 else 725 getnam = aname; 726 sbinit(&sio, 100000000); 727 do { 728 rcvflg = 0; 729 error = so_pru_soreceive(so, getnam, NULL, &sio, 730 NULL, &rcvflg); 731 if (error == EWOULDBLOCK && rep && 732 (rep->r_flags & R_SOFTTERM)) { 733 m_freem(sio.sb_mb); 734 return (EINTR); 735 } 736 } while (error == EWOULDBLOCK); 737 738 len = sio.sb_cc; 739 *mp = sio.sb_mb; 740 741 /* 742 * A shutdown may result in no error and no mbuf. 743 * Convert to EPIPE. 744 */ 745 if (*mp == NULL && error == 0) 746 error = EPIPE; 747 } 748 if (error) { 749 m_freem(*mp); 750 *mp = NULL; 751 } 752 753 /* 754 * Search for any mbufs that are not a multiple of 4 bytes long 755 * or with m_data not longword aligned. 756 * These could cause pointer alignment problems, so copy them to 757 * well aligned mbufs. 
758 */ 759 nfs_realign(mp, 5 * NFSX_UNSIGNED); 760 return (error); 761 } 762 763 /* 764 * Implement receipt of reply on a socket. 765 * 766 * We must search through the list of received datagrams matching them 767 * with outstanding requests using the xid, until ours is found. 768 * 769 * If myrep is NULL we process packets on the socket until 770 * interrupted or until nm_reqrxq is non-empty. 771 */ 772 /* ARGSUSED */ 773 int 774 nfs_reply(struct nfsmount *nmp, struct nfsreq *myrep) 775 { 776 struct nfsreq *rep; 777 struct sockaddr *nam; 778 u_int32_t rxid; 779 u_int32_t *tl; 780 int error; 781 struct nfsm_info info; 782 783 /* 784 * Loop around until we get our own reply 785 */ 786 for (;;) { 787 /* 788 * Lock against other receivers so that I don't get stuck in 789 * sbwait() after someone else has received my reply for me. 790 * Also necessary for connection based protocols to avoid 791 * race conditions during a reconnect. 792 * 793 * If nfs_rcvlock() returns EALREADY, that means that 794 * the reply has already been recieved by another 795 * process and we can return immediately. In this 796 * case, the lock is not taken to avoid races with 797 * other processes. 798 */ 799 info.mrep = NULL; 800 801 error = nfs_rcvlock(nmp, myrep); 802 if (error == EALREADY) 803 return (0); 804 if (error) 805 return (error); 806 807 /* 808 * If myrep is NULL we are the receiver helper thread. 809 * Stop waiting for incoming replies if there are 810 * messages sitting on reqrxq that we need to process, 811 * or if a shutdown request is pending. 812 */ 813 if (myrep == NULL && (TAILQ_FIRST(&nmp->nm_reqrxq) || 814 nmp->nm_rxstate > NFSSVC_PENDING)) { 815 nfs_rcvunlock(nmp); 816 return(EWOULDBLOCK); 817 } 818 819 /* 820 * Get the next Rpc reply off the socket 821 * 822 * We cannot release the receive lock until we've 823 * filled in rep->r_mrep, otherwise a waiting 824 * thread may deadlock in soreceive with no incoming 825 * packets expected. 
826 */ 827 error = nfs_receive(nmp, myrep, &nam, &info.mrep); 828 if (error) { 829 /* 830 * Ignore routing errors on connectionless protocols?? 831 */ 832 nfs_rcvunlock(nmp); 833 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { 834 if (nmp->nm_so == NULL) 835 return (error); 836 nmp->nm_so->so_error = 0; 837 continue; 838 } 839 return (error); 840 } 841 if (nam) 842 FREE(nam, M_SONAME); 843 844 /* 845 * Get the xid and check that it is an rpc reply 846 */ 847 info.md = info.mrep; 848 info.dpos = mtod(info.md, caddr_t); 849 NULLOUT(tl = nfsm_dissect(&info, 2*NFSX_UNSIGNED)); 850 rxid = *tl++; 851 if (*tl != rpc_reply) { 852 nfsstats.rpcinvalid++; 853 m_freem(info.mrep); 854 info.mrep = NULL; 855 nfsmout: 856 nfs_rcvunlock(nmp); 857 continue; 858 } 859 860 /* 861 * Loop through the request list to match up the reply 862 * Iff no match, just drop the datagram. On match, set 863 * r_mrep atomically to prevent the timer from messing 864 * around with the request after we have exited the critical 865 * section. 866 */ 867 crit_enter(); 868 TAILQ_FOREACH(rep, &nmp->nm_reqq, r_chain) { 869 if (rep->r_mrep == NULL && rxid == rep->r_xid) 870 break; 871 } 872 873 /* 874 * Fill in the rest of the reply if we found a match. 875 * 876 * Deal with duplicate responses if there was no match. 877 */ 878 if (rep) { 879 rep->r_md = info.md; 880 rep->r_dpos = info.dpos; 881 if (nfsrtton) { 882 struct rttl *rt; 883 884 rt = &nfsrtt.rttl[nfsrtt.pos]; 885 rt->proc = rep->r_procnum; 886 rt->rto = 0; 887 rt->sent = 0; 888 rt->cwnd = nmp->nm_maxasync_scaled; 889 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; 890 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; 891 rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; 892 getmicrotime(&rt->tstamp); 893 if (rep->r_flags & R_TIMING) 894 rt->rtt = rep->r_rtt; 895 else 896 rt->rtt = 1000000; 897 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ; 898 } 899 900 /* 901 * New congestion control is based only on async 902 * requests. 
903 */ 904 if (nmp->nm_maxasync_scaled < NFS_MAXASYNC_SCALED) 905 ++nmp->nm_maxasync_scaled; 906 if (rep->r_flags & R_SENT) { 907 rep->r_flags &= ~R_SENT; 908 } 909 /* 910 * Update rtt using a gain of 0.125 on the mean 911 * and a gain of 0.25 on the deviation. 912 * 913 * NOTE SRTT/SDRTT are only good if R_TIMING is set. 914 */ 915 if ((rep->r_flags & R_TIMING) && rep->r_rexmit == 0) { 916 /* 917 * Since the timer resolution of 918 * NFS_HZ is so course, it can often 919 * result in r_rtt == 0. Since 920 * r_rtt == N means that the actual 921 * rtt is between N+dt and N+2-dt ticks, 922 * add 1. 923 */ 924 int n; 925 int d; 926 927 #define NFSRSB NFS_RTT_SCALE_BITS 928 n = ((NFS_SRTT(rep) * 7) + 929 (rep->r_rtt << NFSRSB)) >> 3; 930 d = n - NFS_SRTT(rep); 931 NFS_SRTT(rep) = n; 932 933 /* 934 * Don't let the jitter calculation decay 935 * too quickly, but we want a fast rampup. 936 */ 937 if (d < 0) 938 d = -d; 939 d <<= NFSRSB; 940 if (d < NFS_SDRTT(rep)) 941 n = ((NFS_SDRTT(rep) * 15) + d) >> 4; 942 else 943 n = ((NFS_SDRTT(rep) * 3) + d) >> 2; 944 NFS_SDRTT(rep) = n; 945 #undef NFSRSB 946 } 947 nmp->nm_timeouts = 0; 948 rep->r_mrep = info.mrep; 949 nfs_hardterm(rep, 0); 950 } else { 951 /* 952 * Extract vers, prog, nfsver, procnum. A duplicate 953 * response means we didn't wait long enough so 954 * we increase the SRTT to avoid future spurious 955 * timeouts. 956 */ 957 u_int procnum = nmp->nm_lastreprocnum; 958 int n; 959 960 if (procnum < NFS_NPROCS && proct[procnum]) { 961 if (nfs_showrexmit) 962 kprintf("D"); 963 n = nmp->nm_srtt[proct[procnum]]; 964 n += NFS_ASYSCALE * NFS_HZ; 965 if (n < NFS_ASYSCALE * NFS_HZ * 10) 966 n = NFS_ASYSCALE * NFS_HZ * 10; 967 nmp->nm_srtt[proct[procnum]] = n; 968 } 969 } 970 nfs_rcvunlock(nmp); 971 crit_exit(); 972 973 /* 974 * If not matched to a request, drop it. 975 * If it's mine, get out. 
976 */ 977 if (rep == NULL) { 978 nfsstats.rpcunexpected++; 979 m_freem(info.mrep); 980 info.mrep = NULL; 981 } else if (rep == myrep) { 982 if (rep->r_mrep == NULL) 983 panic("nfsreply nil"); 984 return (0); 985 } 986 } 987 } 988 989 /* 990 * Run the request state machine until the target state is reached 991 * or a fatal error occurs. The target state is not run. Specifying 992 * a target of NFSM_STATE_DONE runs the state machine until the rpc 993 * is complete. 994 * 995 * EINPROGRESS is returned for all states other then the DONE state, 996 * indicating that the rpc is still in progress. 997 */ 998 int 999 nfs_request(struct nfsm_info *info, nfsm_state_t bstate, nfsm_state_t estate) 1000 { 1001 struct nfsreq *req; 1002 1003 while (info->state >= bstate && info->state < estate) { 1004 switch(info->state) { 1005 case NFSM_STATE_SETUP: 1006 /* 1007 * Setup the nfsreq. Any error which occurs during 1008 * this state is fatal. 1009 */ 1010 info->error = nfs_request_setup(info); 1011 if (info->error) { 1012 info->state = NFSM_STATE_DONE; 1013 return (info->error); 1014 } else { 1015 req = info->req; 1016 req->r_mrp = &info->mrep; 1017 req->r_mdp = &info->md; 1018 req->r_dposp = &info->dpos; 1019 info->state = NFSM_STATE_AUTH; 1020 } 1021 break; 1022 case NFSM_STATE_AUTH: 1023 /* 1024 * Authenticate the nfsreq. Any error which occurs 1025 * during this state is fatal. 1026 */ 1027 info->error = nfs_request_auth(info->req); 1028 if (info->error) { 1029 info->state = NFSM_STATE_DONE; 1030 return (info->error); 1031 } else { 1032 info->state = NFSM_STATE_TRY; 1033 } 1034 break; 1035 case NFSM_STATE_TRY: 1036 /* 1037 * Transmit or retransmit attempt. An error in this 1038 * state is ignored and we always move on to the 1039 * next state. 1040 * 1041 * This can trivially race the receiver if the 1042 * request is asynchronous. 
nfs_request_try() 1043 * will thus set the state for us and we 1044 * must also return immediately if we are 1045 * running an async state machine, because 1046 * info can become invalid due to races after 1047 * try() returns. 1048 */ 1049 if (info->req->r_flags & R_ASYNC) { 1050 nfs_request_try(info->req); 1051 if (estate == NFSM_STATE_WAITREPLY) 1052 return (EINPROGRESS); 1053 } else { 1054 nfs_request_try(info->req); 1055 info->state = NFSM_STATE_WAITREPLY; 1056 } 1057 break; 1058 case NFSM_STATE_WAITREPLY: 1059 /* 1060 * Wait for a reply or timeout and move on to the 1061 * next state. The error returned by this state 1062 * is passed to the processing code in the next 1063 * state. 1064 */ 1065 info->error = nfs_request_waitreply(info->req); 1066 info->state = NFSM_STATE_PROCESSREPLY; 1067 break; 1068 case NFSM_STATE_PROCESSREPLY: 1069 /* 1070 * Process the reply or timeout. Errors which occur 1071 * in this state may cause the state machine to 1072 * go back to an earlier state, and are fatal 1073 * otherwise. 1074 */ 1075 info->error = nfs_request_processreply(info, 1076 info->error); 1077 switch(info->error) { 1078 case ENEEDAUTH: 1079 info->state = NFSM_STATE_AUTH; 1080 break; 1081 case EAGAIN: 1082 info->state = NFSM_STATE_TRY; 1083 break; 1084 default: 1085 /* 1086 * Operation complete, with or without an 1087 * error. We are done. 1088 */ 1089 info->req = NULL; 1090 info->state = NFSM_STATE_DONE; 1091 return (info->error); 1092 } 1093 break; 1094 case NFSM_STATE_DONE: 1095 /* 1096 * Shouldn't be reached 1097 */ 1098 return (info->error); 1099 /* NOT REACHED */ 1100 } 1101 } 1102 1103 /* 1104 * If we are done return the error code (if any). 1105 * Otherwise return EINPROGRESS. 
	 */
	if (info->state == NFSM_STATE_DONE)
		return (info->error);
	return (EINPROGRESS);
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
static int
nfs_request_setup(nfsm_info_t info)
{
	struct nfsreq *req;
	struct nfsmount *nmp;
	struct mbuf *m;
	int i;

	/*
	 * Reject requests while attempting a forced unmount.
	 */
	if (info->vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
		m_freem(info->mreq);
		info->mreq = NULL;
		return (ESTALE);
	}
	nmp = VFSTONFS(info->vp->v_mount);
	req = kmalloc(sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
	req->r_nmp = nmp;
	req->r_vp = info->vp;
	req->r_td = info->td;
	req->r_procnum = info->procnum;
	req->r_mreq = NULL;
	req->r_cred = info->cred;

	/*
	 * Total up the size of the caller-supplied request mbuf chain.
	 */
	i = 0;
	m = info->mreq;
	while (m) {
		i += m->m_len;
		m = m->m_next;
	}
	req->r_mrest = info->mreq;
	req->r_mrest_len = i;

	/*
	 * The presence of a non-NULL r_info in req indicates
	 * async completion via our helper threads.  See the receiver
	 * code.
	 */
	if (info->bio) {
		req->r_info = info;
		req->r_flags = R_ASYNC;
	} else {
		req->r_info = NULL;
		req->r_flags = 0;
	}
	info->req = req;
	return(0);
}

/*
 * Build the RPC header for the request and attach it (rep->r_mreq).
 * Consumes rep->r_mrest.  On error the request structure is freed.
 */
static int
nfs_request_auth(struct nfsreq *rep)
{
	struct nfsmount *nmp = rep->r_nmp;
	struct mbuf *m;
	char nickv[RPCX_NICKVERF];
	int error = 0, auth_len, auth_type;
	int verf_len;
	u_int32_t xid;
	char *auth_str, *verf_str;
	struct ucred *cred;

	cred = rep->r_cred;
	rep->r_failed_auth = 0;

	/*
	 * Get the RPC header with authorization.
	 */
	verf_str = auth_str = NULL;
	if (nmp->nm_flag & NFSMNT_KERB) {
		verf_str = nickv;
		verf_len = sizeof (nickv);
		auth_type = RPCAUTH_KERB4;
		bzero((caddr_t)rep->r_key, sizeof(rep->r_key));
		if (rep->r_failed_auth ||
		    nfs_getnickauth(nmp, cred, &auth_str, &auth_len,
				    verf_str, verf_len)) {
			error = nfs_getauth(nmp, rep, cred, &auth_str,
				&auth_len, verf_str, &verf_len, rep->r_key);
			if (error) {
				m_freem(rep->r_mrest);
				rep->r_mrest = NULL;
				kfree((caddr_t)rep, M_NFSREQ);
				return (error);
			}
		}
	} else {
		auth_type = RPCAUTH_UNIX;
		if (cred->cr_ngroups < 1)
			panic("nfsreq nogrps");
		/* AUTH_UNIX length: uid/gid words plus capped group list */
		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
			5 * NFSX_UNSIGNED;
	}
	if (rep->r_mrest)
		nfs_checkpkt(rep->r_mrest, rep->r_mrest_len);
	m = nfsm_rpchead(cred, nmp->nm_flag, rep->r_procnum, auth_type,
			auth_len, auth_str, verf_len, verf_str,
			rep->r_mrest, rep->r_mrest_len, &rep->r_mheadend, &xid);
	rep->r_mrest = NULL;
	if (auth_str)
		kfree(auth_str, M_TEMP);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, MB_WAIT);
		if (m == NULL) {
			kfree(rep, M_NFSREQ);
			return (ENOBUFS);
		}
		/* high bit marks the final fragment of the record */
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
			(m->m_pkthdr.len - NFSX_UNSIGNED));
	}

	nfs_checkpkt(m, m->m_pkthdr.len);

	rep->r_mreq = m;
	rep->r_xid = xid;
	return (0);
}

/*
 * Enqueue the request and attempt the initial transmit.  For async
 * requests the info structure may become invalid on return (see below),
 * so the caller must not touch it afterwards.
 */
static int
nfs_request_try(struct nfsreq *rep)
{
	struct nfsmount *nmp = rep->r_nmp;
	struct mbuf *m2;
	int error;

	/*
	 * Request is not on any queue, only the owner has access to it
	 * so it should not be locked by anyone atm.
	 *
	 * Interlock to prevent races.  While locked the only remote
	 * action possible is for r_mrep to be set (once we enqueue it).
	 */
	if (rep->r_flags == 0xdeadc0de) {
		print_backtrace(-1);
		panic("flags nbad\n");
	}
	KKASSERT((rep->r_flags & (R_LOCKED | R_ONREQQ)) == 0);
	if (nmp->nm_flag & NFSMNT_SOFT)
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	if (proct[rep->r_procnum] > 0)
		rep->r_flags |= R_TIMING | R_LOCKED;
	else
		rep->r_flags |= R_LOCKED;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;

	if (nmp->nm_flag & NFSMNT_FORCE) {
		rep->r_flags |= R_SOFTTERM;
		rep->r_flags &= ~R_LOCKED;
		return (0);
	}

	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.  Note
	 * that our control of R_LOCKED prevents the request from
	 * getting ripped out from under us or transmitted by the
	 * timer code.
	 *
	 * For requests with info structures we must atomically set the
	 * info's state because the structure could become invalid upon
	 * return due to races (i.e., if async)
	 */
	crit_enter();
	mtx_link_init(&rep->r_link);
	KKASSERT((rep->r_flags & R_ONREQQ) == 0);
	TAILQ_INSERT_TAIL(&nmp->nm_reqq, rep, r_chain);
	rep->r_flags |= R_ONREQQ;
	++nmp->nm_reqqlen;
	if (rep->r_flags & R_ASYNC)
		rep->r_info->state = NFSM_STATE_WAITREPLY;
	crit_exit();

	error = 0;

	/*
	 * Send if we can.  Congestion control is not handled here any more
	 * because trying to defer the initial send based on the nfs_timer
	 * requires having a very fast nfs_timer, which is silly.
	 */
	if (nmp->nm_so) {
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(nmp, rep);
		if (error == 0) {
			m2 = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
			error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(nmp);
			rep->r_flags &= ~R_NEEDSXMIT;
			if ((rep->r_flags & R_SENT) == 0) {
				rep->r_flags |= R_SENT;
			}
		} else {
			rep->r_flags |= R_NEEDSXMIT;
		}
	} else {
		/* no socket yet; let the timer transmit when one exists */
		rep->r_flags |= R_NEEDSXMIT;
		rep->r_rtt = -1;
	}
	if (error == EPIPE)
		error = 0;

	/*
	 * Release the lock.  The only remote action that may have occurred
	 * would have been the setting of rep->r_mrep.  If this occurred
	 * and the request was async we have to move it to the reader
	 * thread's queue for action.
	 *
	 * For async requests also make sure the reader is woken up so
	 * it gets on the socket to read responses.
	 */
	crit_enter();
	if (rep->r_flags & R_ASYNC) {
		if (rep->r_mrep)
			nfs_hardterm(rep, 1);
		rep->r_flags &= ~R_LOCKED;
		nfssvc_iod_reader_wakeup(nmp);
	} else {
		rep->r_flags &= ~R_LOCKED;
	}
	if (rep->r_flags & R_WANTED) {
		rep->r_flags &= ~R_WANTED;
		wakeup(rep);
	}
	crit_exit();
	return (error);
}

/*
 * This code is only called for synchronous requests.  Completed synchronous
 * requests are left on reqq and we remove them before moving on to the
 * processing state.
 */
static int
nfs_request_waitreply(struct nfsreq *rep)
{
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	KKASSERT((rep->r_flags & R_ASYNC) == 0);

	/*
	 * Wait until the request is finished.
	 */
	error = nfs_reply(nmp, rep);

	/*
	 * RPC done, unlink the request, but don't rip it out from under
	 * the callout timer.
	 *
	 * Once unlinked no other receiver or the timer will have
	 * visibility, so we do not have to set R_LOCKED.
	 */
	crit_enter();
	while (rep->r_flags & R_LOCKED) {
		rep->r_flags |= R_WANTED;
		tsleep(rep, 0, "nfstrac", 0);
	}
	KKASSERT(rep->r_flags & R_ONREQQ);
	TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
	rep->r_flags &= ~R_ONREQQ;
	--nmp->nm_reqqlen;
	/* the writer throttles on reqqlen; kick it if work is pending */
	if (TAILQ_FIRST(&nmp->nm_bioq) &&
	    nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
		nfssvc_iod_writer_wakeup(nmp);
	}
	crit_exit();

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;
	}
	return (error);
}

/*
 * Process reply with error returned from nfs_request_waitreply().
 *
 * Returns EAGAIN if it wants us to loop up to nfs_request_try() again.
 * Returns ENEEDAUTH if it wants us to loop up to nfs_request_auth() again.
1416 */ 1417 static int 1418 nfs_request_processreply(nfsm_info_t info, int error) 1419 { 1420 struct nfsreq *req = info->req; 1421 struct nfsmount *nmp = req->r_nmp; 1422 u_int32_t *tl; 1423 int verf_type; 1424 int i; 1425 1426 /* 1427 * If there was a successful reply and a tprintf msg. 1428 * tprintf a response. 1429 */ 1430 if (error == 0 && (req->r_flags & R_TPRINTFMSG)) { 1431 nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname, 1432 "is alive again"); 1433 } 1434 info->mrep = req->r_mrep; 1435 info->md = req->r_md; 1436 info->dpos = req->r_dpos; 1437 if (error) { 1438 m_freem(req->r_mreq); 1439 req->r_mreq = NULL; 1440 kfree(req, M_NFSREQ); 1441 info->req = NULL; 1442 return (error); 1443 } 1444 1445 /* 1446 * break down the rpc header and check if ok 1447 */ 1448 NULLOUT(tl = nfsm_dissect(info, 3 * NFSX_UNSIGNED)); 1449 if (*tl++ == rpc_msgdenied) { 1450 if (*tl == rpc_mismatch) { 1451 error = EOPNOTSUPP; 1452 } else if ((nmp->nm_flag & NFSMNT_KERB) && 1453 *tl++ == rpc_autherr) { 1454 if (req->r_failed_auth == 0) { 1455 req->r_failed_auth++; 1456 req->r_mheadend->m_next = NULL; 1457 m_freem(info->mrep); 1458 info->mrep = NULL; 1459 m_freem(req->r_mreq); 1460 req->r_mreq = NULL; 1461 return (ENEEDAUTH); 1462 } else { 1463 error = EAUTH; 1464 } 1465 } else { 1466 error = EACCES; 1467 } 1468 m_freem(info->mrep); 1469 info->mrep = NULL; 1470 m_freem(req->r_mreq); 1471 req->r_mreq = NULL; 1472 kfree(req, M_NFSREQ); 1473 info->req = NULL; 1474 return (error); 1475 } 1476 1477 /* 1478 * Grab any Kerberos verifier, otherwise just throw it away. 
1479 */ 1480 verf_type = fxdr_unsigned(int, *tl++); 1481 i = fxdr_unsigned(int32_t, *tl); 1482 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) { 1483 error = nfs_savenickauth(nmp, req->r_cred, i, req->r_key, 1484 &info->md, &info->dpos, info->mrep); 1485 if (error) 1486 goto nfsmout; 1487 } else if (i > 0) { 1488 ERROROUT(nfsm_adv(info, nfsm_rndup(i))); 1489 } 1490 NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED)); 1491 /* 0 == ok */ 1492 if (*tl == 0) { 1493 NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED)); 1494 if (*tl != 0) { 1495 error = fxdr_unsigned(int, *tl); 1496 1497 /* 1498 * Does anyone even implement this? Just impose 1499 * a 1-second delay. 1500 */ 1501 if ((nmp->nm_flag & NFSMNT_NFSV3) && 1502 error == NFSERR_TRYLATER) { 1503 m_freem(info->mrep); 1504 info->mrep = NULL; 1505 error = 0; 1506 1507 tsleep((caddr_t)&lbolt, 0, "nqnfstry", 0); 1508 return (EAGAIN); /* goto tryagain */ 1509 } 1510 1511 /* 1512 * If the File Handle was stale, invalidate the 1513 * lookup cache, just in case. 1514 * 1515 * To avoid namecache<->vnode deadlocks we must 1516 * release the vnode lock if we hold it. 
1517 */ 1518 if (error == ESTALE) { 1519 struct vnode *vp = req->r_vp; 1520 int ltype; 1521 1522 ltype = lockstatus(&vp->v_lock, curthread); 1523 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED) 1524 lockmgr(&vp->v_lock, LK_RELEASE); 1525 cache_inval_vp(vp, CINV_CHILDREN); 1526 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED) 1527 lockmgr(&vp->v_lock, ltype); 1528 } 1529 if (nmp->nm_flag & NFSMNT_NFSV3) { 1530 KKASSERT(*req->r_mrp == info->mrep); 1531 KKASSERT(*req->r_mdp == info->md); 1532 KKASSERT(*req->r_dposp == info->dpos); 1533 error |= NFSERR_RETERR; 1534 } else { 1535 m_freem(info->mrep); 1536 info->mrep = NULL; 1537 } 1538 m_freem(req->r_mreq); 1539 req->r_mreq = NULL; 1540 kfree(req, M_NFSREQ); 1541 info->req = NULL; 1542 return (error); 1543 } 1544 1545 KKASSERT(*req->r_mrp == info->mrep); 1546 KKASSERT(*req->r_mdp == info->md); 1547 KKASSERT(*req->r_dposp == info->dpos); 1548 m_freem(req->r_mreq); 1549 req->r_mreq = NULL; 1550 FREE(req, M_NFSREQ); 1551 return (0); 1552 } 1553 m_freem(info->mrep); 1554 info->mrep = NULL; 1555 error = EPROTONOSUPPORT; 1556 nfsmout: 1557 m_freem(req->r_mreq); 1558 req->r_mreq = NULL; 1559 kfree(req, M_NFSREQ); 1560 info->req = NULL; 1561 return (error); 1562 } 1563 1564 #ifndef NFS_NOSERVER 1565 /* 1566 * Generate the rpc reply header 1567 * siz arg. is used to decide if adding a cluster is worthwhile 1568 */ 1569 int 1570 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, 1571 int err, struct mbuf **mrq, struct mbuf **mbp, caddr_t *bposp) 1572 { 1573 u_int32_t *tl; 1574 struct nfsm_info info; 1575 1576 siz += RPC_REPLYSIZ; 1577 info.mb = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL); 1578 info.mreq = info.mb; 1579 info.mreq->m_pkthdr.len = 0; 1580 /* 1581 * If this is not a cluster, try and leave leading space 1582 * for the lower level headers. 
1583 */ 1584 if ((max_hdr + siz) < MINCLSIZE) 1585 info.mreq->m_data += max_hdr; 1586 tl = mtod(info.mreq, u_int32_t *); 1587 info.mreq->m_len = 6 * NFSX_UNSIGNED; 1588 info.bpos = ((caddr_t)tl) + info.mreq->m_len; 1589 *tl++ = txdr_unsigned(nd->nd_retxid); 1590 *tl++ = rpc_reply; 1591 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) { 1592 *tl++ = rpc_msgdenied; 1593 if (err & NFSERR_AUTHERR) { 1594 *tl++ = rpc_autherr; 1595 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR); 1596 info.mreq->m_len -= NFSX_UNSIGNED; 1597 info.bpos -= NFSX_UNSIGNED; 1598 } else { 1599 *tl++ = rpc_mismatch; 1600 *tl++ = txdr_unsigned(RPC_VER2); 1601 *tl = txdr_unsigned(RPC_VER2); 1602 } 1603 } else { 1604 *tl++ = rpc_msgaccepted; 1605 1606 /* 1607 * For Kerberos authentication, we must send the nickname 1608 * verifier back, otherwise just RPCAUTH_NULL. 1609 */ 1610 if (nd->nd_flag & ND_KERBFULL) { 1611 struct nfsuid *nuidp; 1612 struct timeval ktvin, ktvout; 1613 1614 for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first; 1615 nuidp != 0; nuidp = nuidp->nu_hash.le_next) { 1616 if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid && 1617 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp), 1618 &nuidp->nu_haddr, nd->nd_nam2))) 1619 break; 1620 } 1621 if (nuidp) { 1622 ktvin.tv_sec = 1623 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1); 1624 ktvin.tv_usec = 1625 txdr_unsigned(nuidp->nu_timestamp.tv_usec); 1626 1627 /* 1628 * Encrypt the timestamp in ecb mode using the 1629 * session key. 
1630 */ 1631 #ifdef NFSKERB 1632 XXX 1633 #else 1634 ktvout.tv_sec = 0; 1635 ktvout.tv_usec = 0; 1636 #endif 1637 1638 *tl++ = rpc_auth_kerb; 1639 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED); 1640 *tl = ktvout.tv_sec; 1641 tl = nfsm_build(&info, 3 * NFSX_UNSIGNED); 1642 *tl++ = ktvout.tv_usec; 1643 *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid); 1644 } else { 1645 *tl++ = 0; 1646 *tl++ = 0; 1647 } 1648 } else { 1649 *tl++ = 0; 1650 *tl++ = 0; 1651 } 1652 switch (err) { 1653 case EPROGUNAVAIL: 1654 *tl = txdr_unsigned(RPC_PROGUNAVAIL); 1655 break; 1656 case EPROGMISMATCH: 1657 *tl = txdr_unsigned(RPC_PROGMISMATCH); 1658 tl = nfsm_build(&info, 2 * NFSX_UNSIGNED); 1659 *tl++ = txdr_unsigned(2); 1660 *tl = txdr_unsigned(3); 1661 break; 1662 case EPROCUNAVAIL: 1663 *tl = txdr_unsigned(RPC_PROCUNAVAIL); 1664 break; 1665 case EBADRPC: 1666 *tl = txdr_unsigned(RPC_GARBAGE); 1667 break; 1668 default: 1669 *tl = 0; 1670 if (err != NFSERR_RETVOID) { 1671 tl = nfsm_build(&info, NFSX_UNSIGNED); 1672 if (err) 1673 *tl = txdr_unsigned(nfsrv_errmap(nd, err)); 1674 else 1675 *tl = 0; 1676 } 1677 break; 1678 }; 1679 } 1680 1681 if (mrq != NULL) 1682 *mrq = info.mreq; 1683 *mbp = info.mb; 1684 *bposp = info.bpos; 1685 if (err != 0 && err != NFSERR_RETVOID) 1686 nfsstats.srvrpc_errs++; 1687 return (0); 1688 } 1689 1690 1691 #endif /* NFS_NOSERVER */ 1692 1693 /* 1694 * Nfs timer routine. 1695 * 1696 * Scan the nfsreq list and retranmit any requests that have timed out 1697 * To avoid retransmission attempts on STREAM sockets (in the future) make 1698 * sure to set the r_retry field to 0 (implies nm_retry == 0). 1699 * 1700 * Requests with attached responses, terminated requests, and 1701 * locked requests are ignored. Locked requests will be picked up 1702 * in a later timer call. 
 */
void
nfs_timer_callout(void *arg /* never used */)
{
	struct nfsmount *nmp;
	struct nfsreq *req;
#ifndef NFS_NOSERVER
	struct nfssvc_sock *slp;
	u_quad_t cur_usec;
#endif /* NFS_NOSERVER */

	lwkt_gettoken(&nfs_token);
	TAILQ_FOREACH(nmp, &nfs_mountq, nm_entry) {
		lwkt_gettoken(&nmp->nm_token);
		TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
			KKASSERT(nmp == req->r_nmp);
			/* skip completed, terminated, or in-use requests */
			if (req->r_mrep)
				continue;
			if (req->r_flags & (R_SOFTTERM | R_LOCKED))
				continue;
			/* hold R_LOCKED across the per-request work */
			req->r_flags |= R_LOCKED;
			if (nfs_sigintr(nmp, req, req->r_td)) {
				nfs_softterm(req, 1);
			} else {
				nfs_timer_req(req);
			}
			req->r_flags &= ~R_LOCKED;
			if (req->r_flags & R_WANTED) {
				req->r_flags &= ~R_WANTED;
				wakeup(req);
			}
		}
		lwkt_reltoken(&nmp->nm_token);
	}
#ifndef NFS_NOSERVER

	/*
	 * Scan the write gathering queues for writes that need to be
	 * completed now.
	 */
	cur_usec = nfs_curusec();

	TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
		/* XXX race against removal */
		if (lwkt_trytoken(&slp->ns_token)) {
			if (slp->ns_tq.lh_first &&
			    (slp->ns_tq.lh_first->nd_time <= cur_usec)) {
				nfsrv_wakenfsd(slp, 1);
			}
			lwkt_reltoken(&slp->ns_token);
		}
	}
#endif /* NFS_NOSERVER */

	/* re-arm ourselves; this callout runs for the life of the module */
	callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer_callout, NULL);
	lwkt_reltoken(&nfs_token);
}

/*
 * Per-request timer tick: compute the (scaled) retransmit timeout and,
 * if it has expired or the request is flagged R_NEEDSXMIT, attempt a
 * retransmission.  Called with the request R_LOCKED by the timer.
 */
static
void
nfs_timer_req(struct nfsreq *req)
{
	struct thread *td = &thread0; /* XXX for creds, will break if sleep */
	struct nfsmount *nmp = req->r_nmp;
	struct mbuf *m;
	struct socket *so;
	int timeo;
	int error;

	/*
	 * rtt ticks and timeout calculation.  Return if the timeout
	 * has not been reached yet, unless the packet is flagged
	 * for an immediate send.
	 *
	 * The mean rtt doesn't help when we get random I/Os, we have
	 * to multiply by fairly large numbers.
	 */
	if (req->r_rtt >= 0) {
		/*
		 * Calculate the timeout to test against.
		 */
		req->r_rtt++;
		if (nmp->nm_flag & NFSMNT_DUMBTIMR) {
			timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
		} else if (req->r_flags & R_TIMING) {
			timeo = NFS_SRTT(req) + NFS_SDRTT(req);
		} else {
			timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
		}
		timeo *= multt[req->r_procnum];
		/* timeo is still scaled by SCALE_BITS */

#define NFSFS	(NFS_RTT_SCALE * NFS_HZ)
		if (req->r_flags & R_TIMING) {
			static long last_time;
			if (nfs_showrtt && last_time != time_second) {
				kprintf("rpccmd %d NFS SRTT %d SDRTT %d "
					"timeo %d.%03d\n",
					proct[req->r_procnum],
					NFS_SRTT(req), NFS_SDRTT(req),
					timeo / NFSFS,
					timeo % NFSFS * 1000 / NFSFS);
				last_time = time_second;
			}
		}
#undef NFSFS

		/*
		 * deal with nfs_timer jitter.
		 */
		timeo = (timeo >> NFS_RTT_SCALE_BITS) + 1;
		if (timeo < 2)
			timeo = 2;

		if (nmp->nm_timeouts > 0)
			timeo *= nfs_backoff[nmp->nm_timeouts - 1];
		if (timeo > NFS_MAXTIMEO)
			timeo = NFS_MAXTIMEO;
		if (req->r_rtt <= timeo) {
			if ((req->r_flags & R_NEEDSXMIT) == 0)
				return;
		} else if (nmp->nm_timeouts < 8) {
			nmp->nm_timeouts++;
		}
	}

	/*
	 * Check for server not responding
	 */
	if ((req->r_flags & R_TPRINTFMSG) == 0 &&
	    req->r_rexmit > nmp->nm_deadthresh) {
		nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
			"not responding");
		req->r_flags |= R_TPRINTFMSG;
	}
	if (req->r_rexmit >= req->r_retry) {	/* too many */
		nfsstats.rpctimeouts++;
		nfs_softterm(req, 1);
		return;
	}

	/*
	 * Generally disable retransmission on reliable sockets,
	 * unless the request is flagged for immediate send.
	 */
	if (nmp->nm_sotype != SOCK_DGRAM) {
		if (++req->r_rexmit > NFS_MAXREXMIT)
			req->r_rexmit = NFS_MAXREXMIT;
		if ((req->r_flags & R_NEEDSXMIT) == 0)
			return;
	}

	/*
	 * Stop here if we do not have a socket!
	 */
	if ((so = nmp->nm_so) == NULL)
		return;

	/*
	 * If there is enough space and the window allows.. resend it.
	 *
	 * r_rtt is left intact in case we get an answer after the
	 * retry that was a reply to the original packet.
	 *
	 * NOTE: so_pru_send()
	 */
	if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len &&
	    (req->r_flags & (R_SENT | R_NEEDSXMIT)) &&
	    (m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
		if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
			error = so_pru_send(so, 0, m, NULL, NULL, td);
		else
			error = so_pru_send(so, 0, m, nmp->nm_nam, NULL, td);
		if (error) {
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
				so->so_error = 0;
			req->r_flags |= R_NEEDSXMIT;
		} else if (req->r_mrep == NULL) {
			/*
			 * Iff first send, start timing
			 * else turn timing off, backoff timer
			 * and divide congestion window by 2.
			 *
			 * It is possible for the so_pru_send() to
			 * block and for us to race a reply so we
			 * only do this if the reply field has not
			 * been filled in.  R_LOCKED will prevent
			 * the request from being ripped out from under
			 * us entirely.
			 *
			 * Record the last resent procnum to aid us
			 * in duplicate detection on receive.
			 */
			if ((req->r_flags & R_NEEDSXMIT) == 0) {
				if (nfs_showrexmit)
					kprintf("X");
				if (++req->r_rexmit > NFS_MAXREXMIT)
					req->r_rexmit = NFS_MAXREXMIT;
				nmp->nm_maxasync_scaled >>= 1;
				if (nmp->nm_maxasync_scaled < NFS_MINASYNC_SCALED)
					nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
				nfsstats.rpcretries++;
				nmp->nm_lastreprocnum = req->r_procnum;
			} else {
				req->r_flags |= R_SENT;
				req->r_flags &= ~R_NEEDSXMIT;
			}
		}
	}
}

/*
 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
 * wait for all requests to complete. This is used by forced unmounts
 * to terminate any outstanding RPCs.
 *
 * Locked requests cannot be canceled but will be marked for
 * soft-termination.
 */
int
nfs_nmcancelreqs(struct nfsmount *nmp)
{
	struct nfsreq *req;
	int i;

	crit_enter();
	TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
		if (req->r_mrep != NULL || (req->r_flags & R_SOFTTERM))
			continue;
		nfs_softterm(req, 0);
	}
	/* XXX the other two queues as well */
	crit_exit();

	/* poll (up to ~30 lbolt ticks) for the request queue to drain */
	for (i = 0; i < 30; i++) {
		crit_enter();
		TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
			if (nmp == req->r_nmp)
				break;
		}
		crit_exit();
		if (req == NULL)
			return (0);
		tsleep(&lbolt, 0, "nfscancel", 0);
	}
	return (EBUSY);
}

/*
 * Soft-terminate a request, effectively marking it as failed.
 *
 * Must be called from within a critical section.
 */
static void
nfs_softterm(struct nfsreq *rep, int islocked)
{
	rep->r_flags |= R_SOFTTERM;
	nfs_hardterm(rep, islocked);
}

/*
 * Hard-terminate a request, typically after getting a response.
 *
 * The state machine can still decide to re-issue it later if necessary.
 *
 * Must be called from within a critical section.
 */
static void
nfs_hardterm(struct nfsreq *rep, int islocked)
{
	struct nfsmount *nmp = rep->r_nmp;

	/*
	 * The nm_send count is decremented now to avoid deadlocks
	 * when the process in soreceive() hasn't yet managed to send
	 * its own request.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;
	}

	/*
	 * If we locked the request or nobody else has locked the request,
	 * and the request is async, we can move it to the reader thread's
	 * queue now and fix up the state.
	 *
	 * If we locked the request or nobody else has locked the request,
	 * we can wake up anyone blocked waiting for a response on the
	 * request.
	 */
	if (islocked || (rep->r_flags & R_LOCKED) == 0) {
		if ((rep->r_flags & (R_ONREQQ | R_ASYNC)) ==
		    (R_ONREQQ | R_ASYNC)) {
			/* migrate from reqq to the reader's reqrxq */
			rep->r_flags &= ~R_ONREQQ;
			TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
			--nmp->nm_reqqlen;
			TAILQ_INSERT_TAIL(&nmp->nm_reqrxq, rep, r_chain);
			KKASSERT(rep->r_info->state == NFSM_STATE_TRY ||
				 rep->r_info->state == NFSM_STATE_WAITREPLY);
			rep->r_info->state = NFSM_STATE_PROCESSREPLY;
			nfssvc_iod_reader_wakeup(nmp);
			/* unthrottle the async writer if room opened up */
			if (TAILQ_FIRST(&nmp->nm_bioq) &&
			    nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
				nfssvc_iod_writer_wakeup(nmp);
			}
		}
		/* abort any receive-lock wait linked via r_link */
		mtx_abort_ex_link(&nmp->nm_rxlock, &rep->r_link);
	}
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
{
	sigset_t tmpset;
	struct proc *p;
	struct lwp *lp;

	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	/* Terminate all requests while attempting a forced unmount. */
	if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	/* td might be NULL YYY */
	if (td == NULL || (p = td->td_proc) == NULL)
		return (0);

	/* pending signals not masked and not ignored, filtered to the
	 * set NFS considers interrupting */
	lp = td->td_lwp;
	tmpset = lwp_sigpend(lp);
	SIGSETNAND(tmpset, lp->lwp_sigmask);
	SIGSETNAND(tmpset, p->p_sigignore);
	if (SIGNOTEMPTY(tmpset) && NFSINT_SIGMASK(tmpset))
		return (EINTR);

	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(struct nfsmount *nmp, struct nfsreq *rep)
{
	mtx_t mtx = &nmp->nm_txlock;
	struct thread *td;
	int slptimeo;
	int slpflag;
	int error;

	slpflag = 0;
	slptimeo = 0;
	td = rep ? rep->r_td : NULL;
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;

	while ((error = mtx_lock_ex_try(mtx)) != 0) {
		if (nfs_sigintr(nmp, rep, td)) {
			error = EINTR;
			break;
		}
		error = mtx_lock_ex(mtx, "nfsndlck", slpflag, slptimeo);
		if (error == 0)
			break;
		/* after one catchable sleep, fall back to a 2s timeout */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	/* Always fail if our request has been cancelled. */
	if (rep && (rep->r_flags & R_SOFTTERM)) {
		if (error == 0)
			mtx_unlock(mtx);
		error = EINTR;
	}
	return (error);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_sndunlock(struct nfsmount *nmp)
{
	mtx_unlock(&nmp->nm_txlock);
}

/*
 * Lock the receiver side of the socket.
 *
 * rep may be NULL.
 */
static int
nfs_rcvlock(struct nfsmount *nmp, struct nfsreq *rep)
{
	mtx_t mtx = &nmp->nm_rxlock;
	int slpflag;
	int slptimeo;
	int error;

	/*
	 * Unconditionally check for completion in case another nfsiod
	 * get the packet while the caller was blocked, before the caller
	 * called us.  Packet reception is handled by mainline code which
	 * is protected by the BGL at the moment.
	 *
	 * We do not strictly need the second check just before the
	 * tsleep(), but it's good defensive programming.
	 */
	if (rep && rep->r_mrep != NULL)
		return (EALREADY);

	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;
	slptimeo = 0;

	while ((error = mtx_lock_ex_try(mtx)) != 0) {
		if (nfs_sigintr(nmp, rep, (rep ? rep->r_td : NULL))) {
			error = EINTR;
			break;
		}
		if (rep && rep->r_mrep != NULL) {
			error = EALREADY;
			break;
		}

		/*
		 * NOTE: can return ENOLCK, but in that case rep->r_mrep
		 *	 will already be set.
		 */
		if (rep) {
			/* r_link lets nfs_hardterm() abort this wait */
			error = mtx_lock_ex_link(mtx, &rep->r_link,
						 "nfsrcvlk",
						 slpflag, slptimeo);
		} else {
			error = mtx_lock_ex(mtx, "nfsrcvlk", slpflag, slptimeo);
		}
		if (error == 0)
			break;

		/*
		 * If our reply was received while we were sleeping,
		 * then just return without taking the lock to avoid a
		 * situation where a single iod could 'capture' the
		 * receive lock.
		 */
		if (rep && rep->r_mrep != NULL) {
			error = EALREADY;
			break;
		}
		/* after one catchable sleep, fall back to a 2s timeout */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	if (error == 0) {
		/* got the lock, but the reply raced in; give it back */
		if (rep && rep->r_mrep != NULL) {
			error = EALREADY;
			mtx_unlock(mtx);
		}
	}
	return (error);
}

/*
 * Unlock the stream socket for others.
2178 */ 2179 static void 2180 nfs_rcvunlock(struct nfsmount *nmp) 2181 { 2182 mtx_unlock(&nmp->nm_rxlock); 2183 } 2184 2185 /* 2186 * nfs_realign: 2187 * 2188 * Check for badly aligned mbuf data and realign by copying the unaligned 2189 * portion of the data into a new mbuf chain and freeing the portions 2190 * of the old chain that were replaced. 2191 * 2192 * We cannot simply realign the data within the existing mbuf chain 2193 * because the underlying buffers may contain other rpc commands and 2194 * we cannot afford to overwrite them. 2195 * 2196 * We would prefer to avoid this situation entirely. The situation does 2197 * not occur with NFS/UDP and is supposed to only occassionally occur 2198 * with TCP. Use vfs.nfs.realign_count and realign_test to check this. 2199 * 2200 * NOTE! MB_DONTWAIT cannot be used here. The mbufs must be acquired 2201 * because the rpc request OR reply cannot be thrown away. TCP NFS 2202 * mounts do not retry their RPCs unless the TCP connection itself 2203 * is dropped so throwing away a RPC will basically cause the NFS 2204 * operation to lockup indefinitely. 2205 */ 2206 static void 2207 nfs_realign(struct mbuf **pm, int hsiz) 2208 { 2209 struct mbuf *m; 2210 struct mbuf *n = NULL; 2211 2212 /* 2213 * Check for misalignemnt 2214 */ 2215 ++nfs_realign_test; 2216 while ((m = *pm) != NULL) { 2217 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) 2218 break; 2219 pm = &m->m_next; 2220 } 2221 2222 /* 2223 * If misalignment found make a completely new copy. 2224 */ 2225 if (m) { 2226 ++nfs_realign_count; 2227 n = m_dup_data(m, MB_WAIT); 2228 m_freem(*pm); 2229 *pm = n; 2230 } 2231 } 2232 2233 #ifndef NFS_NOSERVER 2234 2235 /* 2236 * Parse an RPC request 2237 * - verify it 2238 * - fill in the cred struct. 
2239 */ 2240 int 2241 nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header) 2242 { 2243 int len, i; 2244 u_int32_t *tl; 2245 struct uio uio; 2246 struct iovec iov; 2247 caddr_t cp; 2248 u_int32_t nfsvers, auth_type; 2249 uid_t nickuid; 2250 int error = 0, ticklen; 2251 struct nfsuid *nuidp; 2252 struct timeval tvin, tvout; 2253 struct nfsm_info info; 2254 #if 0 /* until encrypted keys are implemented */ 2255 NFSKERBKEYSCHED_T keys; /* stores key schedule */ 2256 #endif 2257 2258 info.mrep = nd->nd_mrep; 2259 info.md = nd->nd_md; 2260 info.dpos = nd->nd_dpos; 2261 2262 if (has_header) { 2263 NULLOUT(tl = nfsm_dissect(&info, 10 * NFSX_UNSIGNED)); 2264 nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++); 2265 if (*tl++ != rpc_call) { 2266 m_freem(info.mrep); 2267 return (EBADRPC); 2268 } 2269 } else { 2270 NULLOUT(tl = nfsm_dissect(&info, 8 * NFSX_UNSIGNED)); 2271 } 2272 nd->nd_repstat = 0; 2273 nd->nd_flag = 0; 2274 if (*tl++ != rpc_vers) { 2275 nd->nd_repstat = ERPCMISMATCH; 2276 nd->nd_procnum = NFSPROC_NOOP; 2277 return (0); 2278 } 2279 if (*tl != nfs_prog) { 2280 nd->nd_repstat = EPROGUNAVAIL; 2281 nd->nd_procnum = NFSPROC_NOOP; 2282 return (0); 2283 } 2284 tl++; 2285 nfsvers = fxdr_unsigned(u_int32_t, *tl++); 2286 if (nfsvers < NFS_VER2 || nfsvers > NFS_VER3) { 2287 nd->nd_repstat = EPROGMISMATCH; 2288 nd->nd_procnum = NFSPROC_NOOP; 2289 return (0); 2290 } 2291 if (nfsvers == NFS_VER3) 2292 nd->nd_flag = ND_NFSV3; 2293 nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++); 2294 if (nd->nd_procnum == NFSPROC_NULL) 2295 return (0); 2296 if (nd->nd_procnum >= NFS_NPROCS || 2297 (nd->nd_procnum >= NQNFSPROC_GETLEASE) || 2298 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) { 2299 nd->nd_repstat = EPROCUNAVAIL; 2300 nd->nd_procnum = NFSPROC_NOOP; 2301 return (0); 2302 } 2303 if ((nd->nd_flag & ND_NFSV3) == 0) 2304 nd->nd_procnum = nfsv3_procid[nd->nd_procnum]; 2305 auth_type = *tl++; 2306 len = fxdr_unsigned(int, *tl++); 2307 if (len < 0 || len > 
RPCAUTH_MAXSIZ) { 2308 m_freem(info.mrep); 2309 return (EBADRPC); 2310 } 2311 2312 nd->nd_flag &= ~ND_KERBAUTH; 2313 /* 2314 * Handle auth_unix or auth_kerb. 2315 */ 2316 if (auth_type == rpc_auth_unix) { 2317 len = fxdr_unsigned(int, *++tl); 2318 if (len < 0 || len > NFS_MAXNAMLEN) { 2319 m_freem(info.mrep); 2320 return (EBADRPC); 2321 } 2322 ERROROUT(nfsm_adv(&info, nfsm_rndup(len))); 2323 NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED)); 2324 bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred)); 2325 nd->nd_cr.cr_ref = 1; 2326 nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++); 2327 nd->nd_cr.cr_ruid = nd->nd_cr.cr_svuid = nd->nd_cr.cr_uid; 2328 nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++); 2329 nd->nd_cr.cr_rgid = nd->nd_cr.cr_svgid = nd->nd_cr.cr_gid; 2330 len = fxdr_unsigned(int, *tl); 2331 if (len < 0 || len > RPCAUTH_UNIXGIDS) { 2332 m_freem(info.mrep); 2333 return (EBADRPC); 2334 } 2335 NULLOUT(tl = nfsm_dissect(&info, (len + 2) * NFSX_UNSIGNED)); 2336 for (i = 1; i <= len; i++) 2337 if (i < NGROUPS) 2338 nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++); 2339 else 2340 tl++; 2341 nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? 
NGROUPS : (len + 1); 2342 if (nd->nd_cr.cr_ngroups > 1) 2343 nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups); 2344 len = fxdr_unsigned(int, *++tl); 2345 if (len < 0 || len > RPCAUTH_MAXSIZ) { 2346 m_freem(info.mrep); 2347 return (EBADRPC); 2348 } 2349 if (len > 0) { 2350 ERROROUT(nfsm_adv(&info, nfsm_rndup(len))); 2351 } 2352 } else if (auth_type == rpc_auth_kerb) { 2353 switch (fxdr_unsigned(int, *tl++)) { 2354 case RPCAKN_FULLNAME: 2355 ticklen = fxdr_unsigned(int, *tl); 2356 *((u_int32_t *)nfsd->nfsd_authstr) = *tl; 2357 uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED; 2358 nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED; 2359 if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) { 2360 m_freem(info.mrep); 2361 return (EBADRPC); 2362 } 2363 uio.uio_offset = 0; 2364 uio.uio_iov = &iov; 2365 uio.uio_iovcnt = 1; 2366 uio.uio_segflg = UIO_SYSSPACE; 2367 iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4]; 2368 iov.iov_len = RPCAUTH_MAXSIZ - 4; 2369 ERROROUT(nfsm_mtouio(&info, &uio, uio.uio_resid)); 2370 NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED)); 2371 if (*tl++ != rpc_auth_kerb || 2372 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) { 2373 kprintf("Bad kerb verifier\n"); 2374 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF); 2375 nd->nd_procnum = NFSPROC_NOOP; 2376 return (0); 2377 } 2378 NULLOUT(cp = nfsm_dissect(&info, 4 * NFSX_UNSIGNED)); 2379 tl = (u_int32_t *)cp; 2380 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) { 2381 kprintf("Not fullname kerb verifier\n"); 2382 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF); 2383 nd->nd_procnum = NFSPROC_NOOP; 2384 return (0); 2385 } 2386 cp += NFSX_UNSIGNED; 2387 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED); 2388 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED; 2389 nd->nd_flag |= ND_KERBFULL; 2390 nfsd->nfsd_flag |= NFSD_NEEDAUTH; 2391 break; 2392 case RPCAKN_NICKNAME: 2393 if (len != 2 * NFSX_UNSIGNED) { 2394 kprintf("Kerb nickname short\n"); 2395 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED); 2396 nd->nd_procnum = 
NFSPROC_NOOP; 2397 return (0); 2398 } 2399 nickuid = fxdr_unsigned(uid_t, *tl); 2400 NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED)); 2401 if (*tl++ != rpc_auth_kerb || 2402 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) { 2403 kprintf("Kerb nick verifier bad\n"); 2404 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF); 2405 nd->nd_procnum = NFSPROC_NOOP; 2406 return (0); 2407 } 2408 NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED)); 2409 tvin.tv_sec = *tl++; 2410 tvin.tv_usec = *tl; 2411 2412 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first; 2413 nuidp != 0; nuidp = nuidp->nu_hash.le_next) { 2414 if (nuidp->nu_cr.cr_uid == nickuid && 2415 (!nd->nd_nam2 || 2416 netaddr_match(NU_NETFAM(nuidp), 2417 &nuidp->nu_haddr, nd->nd_nam2))) 2418 break; 2419 } 2420 if (!nuidp) { 2421 nd->nd_repstat = 2422 (NFSERR_AUTHERR|AUTH_REJECTCRED); 2423 nd->nd_procnum = NFSPROC_NOOP; 2424 return (0); 2425 } 2426 2427 /* 2428 * Now, decrypt the timestamp using the session key 2429 * and validate it. 
2430 */ 2431 #ifdef NFSKERB 2432 XXX 2433 #else 2434 tvout.tv_sec = 0; 2435 tvout.tv_usec = 0; 2436 #endif 2437 2438 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec); 2439 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec); 2440 if (nuidp->nu_expire < time_second || 2441 nuidp->nu_timestamp.tv_sec > tvout.tv_sec || 2442 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec && 2443 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) { 2444 nuidp->nu_expire = 0; 2445 nd->nd_repstat = 2446 (NFSERR_AUTHERR|AUTH_REJECTVERF); 2447 nd->nd_procnum = NFSPROC_NOOP; 2448 return (0); 2449 } 2450 nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr); 2451 nd->nd_flag |= ND_KERBNICK; 2452 }; 2453 } else { 2454 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED); 2455 nd->nd_procnum = NFSPROC_NOOP; 2456 return (0); 2457 } 2458 2459 nd->nd_md = info.md; 2460 nd->nd_dpos = info.dpos; 2461 return (0); 2462 nfsmout: 2463 return (error); 2464 } 2465 2466 #endif 2467 2468 /* 2469 * Send a message to the originating process's terminal. The thread and/or 2470 * process may be NULL. YYY the thread should not be NULL but there may 2471 * still be some uio_td's that are still being passed as NULL through to 2472 * nfsm_request(). 2473 */ 2474 static int 2475 nfs_msg(struct thread *td, char *server, char *msg) 2476 { 2477 tpr_t tpr; 2478 2479 if (td && td->td_proc) 2480 tpr = tprintf_open(td->td_proc); 2481 else 2482 tpr = NULL; 2483 tprintf(tpr, "nfs server %s: %s\n", server, msg); 2484 tprintf_close(tpr); 2485 return (0); 2486 } 2487 2488 #ifndef NFS_NOSERVER 2489 2490 /* 2491 * Socket upcall routine for nfsd sockets. This runs in the protocol 2492 * thread and passes waitflag == MB_DONTWAIT. 
 */
void
nfsrv_rcv_upcall(struct socket *so, void *arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;

	/*
	 * Flag the socket as needing queue service and wake an nfsd.
	 * The test-and-set is racy but harmless per the inline note;
	 * an extra wakeup is benign.
	 */
	if (slp->ns_needq_upcall == 0) {
		slp->ns_needq_upcall = 1;	/* ok to race */
		lwkt_gettoken(&nfs_token);
		nfsrv_wakenfsd(slp, 1);
		lwkt_reltoken(&nfs_token);
	}
#if 0
	lwkt_gettoken(&slp->ns_token);
	slp->ns_flag |= SLP_NEEDQ;
	nfsrv_rcv(so, arg, waitflag);
	lwkt_reltoken(&slp->ns_token);
#endif
}

/*
 * Process new data on a receive socket.  Essentially do as much as we can
 * non-blocking, else punt and it will be called with MB_WAIT from an nfsd.
 *
 * slp->ns_token is held on call
 */
void
nfsrv_rcv(struct socket *so, void *arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct sockaddr *nam;
	struct sockbuf sio;
	int flags, error;
	int nparallel_wakeup = 0;	/* # of complete records queued */

	ASSERT_LWKT_TOKEN_HELD(&slp->ns_token);

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	/*
	 * Do not allow an infinite number of completed RPC records to build
	 * up before we stop reading data from the socket.  Otherwise we could
	 * end up holding onto an unreasonable number of mbufs for requests
	 * waiting for service.
	 *
	 * This should give pretty good feedback to the TCP layer and
	 * prevents a memory crunch for other protocols.
	 *
	 * Note that the same service socket can be dispatched to several
	 * nfs servers simultaneously.  The tcp protocol callback calls us
	 * with MB_DONTWAIT.  nfsd calls us with MB_WAIT (typically).
	 */
	if (NFSRV_RECLIMIT(slp))
		return;

	/*
	 * Handle protocol specifics to parse an RPC request.  We always
	 * pull from the socket using non-blocking I/O.
	 */
	if (so->so_type == SOCK_STREAM) {
		/*
		 * The data has to be read in an orderly fashion from a TCP
		 * stream, unlike a UDP socket.  It is possible for soreceive
		 * and/or nfsrv_getstream() to block, so make sure only one
		 * entity is messing around with the TCP stream at any given
		 * moment.  The receive sockbuf's lock in soreceive is not
		 * sufficient.
		 */
		if (slp->ns_flag & SLP_GETSTREAM)
			return;
		slp->ns_flag |= SLP_GETSTREAM;

		/*
		 * Do soreceive().  Pull out as much data as possible without
		 * blocking.
		 */
		sbinit(&sio, 1000000000);
		flags = MSG_DONTWAIT;
		error = so_pru_soreceive(so, &nam, NULL, &sio, NULL, &flags);
		if (error || sio.sb_mb == NULL) {
			if (error != EWOULDBLOCK)
				slp->ns_flag |= SLP_DISCONN;
			slp->ns_flag &= ~(SLP_GETSTREAM | SLP_NEEDQ);
			goto done;
		}
		/* Append the received data to the raw-stream chain. */
		m = sio.sb_mb;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += sio.sb_cc;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = sio.sb_cc;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse as many record(s) as we can out of the
		 * raw stream data.  This will set SLP_DOREC.
		 */
		error = nfsrv_getstream(slp, waitflag, &nparallel_wakeup);
		if (error && error != EWOULDBLOCK)
			slp->ns_flag |= SLP_DISCONN;
		slp->ns_flag &= ~SLP_GETSTREAM;
	} else {
		/*
		 * For UDP soreceive typically pulls just one packet, loop
		 * to get the whole batch.
		 */
		do {
			sbinit(&sio, 1000000000);
			flags = MSG_DONTWAIT;
			error = so_pru_soreceive(so, &nam, NULL, &sio,
						 NULL, &flags);
			if (sio.sb_mb) {
				struct nfsrv_rec *rec;
				int mf = (waitflag & MB_DONTWAIT) ?
					 M_NOWAIT : M_WAITOK;
				rec = kmalloc(sizeof(struct nfsrv_rec),
					      M_NFSRVDESC, mf);
				if (!rec) {
					/* Allocation failed; drop packet. */
					if (nam)
						FREE(nam, M_SONAME);
					m_freem(sio.sb_mb);
					continue;
				}
				nfs_realign(&sio.sb_mb, 10 * NFSX_UNSIGNED);
				rec->nr_address = nam;
				rec->nr_packet = sio.sb_mb;
				STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
				++slp->ns_numrec;
				slp->ns_flag |= SLP_DOREC;
				++nparallel_wakeup;
			} else {
				slp->ns_flag &= ~SLP_NEEDQ;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					break;
				}
			}
			if (NFSRV_RECLIMIT(slp))
				break;
		} while (sio.sb_mb);
	}

	/*
	 * If we were upcalled from the tcp protocol layer and we have
	 * fully parsed records ready to go, or there is new data pending,
	 * or something went wrong, try to wake up a nfsd thread to deal
	 * with it.
	 */
done:
	/* XXX this code is currently not executed (nfsrv_rcv_upcall) */
	if (waitflag == MB_DONTWAIT && (slp->ns_flag & SLP_ACTION_MASK)) {
		lwkt_gettoken(&nfs_token);
		nfsrv_wakenfsd(slp, nparallel_wakeup);
		lwkt_reltoken(&nfs_token);
	}
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket.  The "waitflag" argument indicates whether or not it
 * can sleep.
 */
static int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	for (;;) {
		if (slp->ns_reclen == 0) {
			/*
			 * Start of a new record: read the 4-byte RPC
			 * record mark off the front of the raw chain.
			 */
			if (slp->ns_cc < NFSX_UNSIGNED)
				return (0);	/* need more data */
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				/* Mark is contiguous in the first mbuf. */
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				/* Mark straddles mbufs; copy byte by byte. */
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			/*
			 * High bit of the record mark flags the last
			 * fragment of a record; low 31 bits are the
			 * fragment length.
			 */
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET || slp->ns_reclen <= 0) {
				log(LOG_ERR, "%s (%d) from nfs client\n",
				    "impossible packet length",
				    slp->ns_reclen);
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 *
		 * Note that slp->ns_reclen may be 0.  Linux sometimes
		 * generates 0-length RPCs
		 *
		 * NOTE(review): the sanity check above rejects
		 * ns_reclen <= 0 with EPERM, which appears to contradict
		 * this note -- confirm which behavior is intended.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			/* Whole raw chain is exactly one fragment. */
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			/*
			 * Fragment ends inside the chain; split it off,
			 * copying the mbuf the boundary falls in.
			 */
			len = 0;
			m = slp->ns_raw;
			om = NULL;

			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					/* Boundary mid-mbuf: copy prefix. */
					m2 = m_copym(m, 0, slp->ns_reclen - len,
						     waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						/* Copy failed; retry later. */
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					/* Boundary on an mbuf edge: unlink. */
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			return (0);	/* fragment incomplete; need more */
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			/* Record complete: queue it for an nfsd. */
			struct nfsrv_rec *rec;
			int mf = (waitflag & MB_DONTWAIT) ? M_NOWAIT : M_WAITOK;
			rec = kmalloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
			if (!rec) {
				m_freem(slp->ns_frag);
			} else {
				nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
				rec->nr_address = NULL;
				rec->nr_packet = slp->ns_frag;
				STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
				++slp->ns_numrec;
				slp->ns_flag |= SLP_DOREC;
				++*countp;
			}
			slp->ns_frag = NULL;
		}
	}
}

#ifdef INVARIANTS

/*
 * Sanity check our mbuf chain.
2792 */ 2793 static void 2794 nfs_checkpkt(struct mbuf *m, int len) 2795 { 2796 int xlen = 0; 2797 while (m) { 2798 xlen += m->m_len; 2799 m = m->m_next; 2800 } 2801 if (xlen != len) { 2802 panic("nfs_checkpkt: len mismatch %d/%d mbuf %p\n", 2803 xlen, len, m); 2804 } 2805 } 2806 2807 #else 2808 2809 static void 2810 nfs_checkpkt(struct mbuf *m __unused, int len __unused) 2811 { 2812 } 2813 2814 #endif 2815 2816 /* 2817 * Parse an RPC header. 2818 * 2819 * If the socket is invalid or no records are pending we return ENOBUFS. 2820 * The caller must deal with NEEDQ races. 2821 */ 2822 int 2823 nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd, 2824 struct nfsrv_descript **ndp) 2825 { 2826 struct nfsrv_rec *rec; 2827 struct mbuf *m; 2828 struct sockaddr *nam; 2829 struct nfsrv_descript *nd; 2830 int error; 2831 2832 *ndp = NULL; 2833 if ((slp->ns_flag & SLP_VALID) == 0 || !STAILQ_FIRST(&slp->ns_rec)) 2834 return (ENOBUFS); 2835 rec = STAILQ_FIRST(&slp->ns_rec); 2836 STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link); 2837 KKASSERT(slp->ns_numrec > 0); 2838 if (--slp->ns_numrec == 0) 2839 slp->ns_flag &= ~SLP_DOREC; 2840 nam = rec->nr_address; 2841 m = rec->nr_packet; 2842 kfree(rec, M_NFSRVDESC); 2843 MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript), 2844 M_NFSRVDESC, M_WAITOK); 2845 nd->nd_md = nd->nd_mrep = m; 2846 nd->nd_nam2 = nam; 2847 nd->nd_dpos = mtod(m, caddr_t); 2848 error = nfs_getreq(nd, nfsd, TRUE); 2849 if (error) { 2850 if (nam) { 2851 FREE(nam, M_SONAME); 2852 } 2853 kfree((caddr_t)nd, M_NFSRVDESC); 2854 return (error); 2855 } 2856 *ndp = nd; 2857 nfsd->nfsd_nd = nd; 2858 return (0); 2859 } 2860 2861 /* 2862 * Try to assign service sockets to nfsd threads based on the number 2863 * of new rpc requests that have been queued on the service socket. 
2864 * 2865 * If no nfsd's are available or additonal requests are pending, set the 2866 * NFSD_CHECKSLP flag so that one of the running nfsds will go look for 2867 * the work in the nfssvc_sock list when it is finished processing its 2868 * current work. This flag is only cleared when an nfsd can not find 2869 * any new work to perform. 2870 */ 2871 void 2872 nfsrv_wakenfsd(struct nfssvc_sock *slp, int nparallel) 2873 { 2874 struct nfsd *nd; 2875 2876 if ((slp->ns_flag & SLP_VALID) == 0) 2877 return; 2878 if (nparallel <= 1) 2879 nparallel = 1; 2880 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) { 2881 if (nd->nfsd_flag & NFSD_WAITING) { 2882 nd->nfsd_flag &= ~NFSD_WAITING; 2883 if (nd->nfsd_slp) 2884 panic("nfsd wakeup"); 2885 nfsrv_slpref(slp); 2886 nd->nfsd_slp = slp; 2887 wakeup((caddr_t)nd); 2888 if (--nparallel == 0) 2889 break; 2890 } 2891 } 2892 2893 /* 2894 * If we couldn't assign slp then the NFSDs are all busy and 2895 * we set a flag indicating that there is pending work. 2896 */ 2897 if (nparallel) 2898 nfsd_head_flag |= NFSD_CHECKSLP; 2899 } 2900 #endif /* NFS_NOSERVER */ 2901