/*
 * This file implements the upper socket layer of VFS: the BSD socket system
 * calls, and any associated file descriptor, file pointer, vnode, and file
 * system processing.  In most cases, this layer will call into the lower
 * socket layer in order to send the request to a socket driver.  Generic file
 * calls (e.g., read, write, ioctl, and select) are not implemented here, and
 * will directly call into the lower socket layer as well.
 *
 * The following table shows the system call numbers implemented in this file,
 * along with their request and reply message types.  Each request layout
 * message type is prefixed with "m_lc_vfs_".  Each reply layout message type
 * is prefixed with "m_vfs_lc_".  For requests without a specific reply layout,
 * only the "m_type" message field is used in the reply message.
 *
 * Type                 Request layout          Reply layout
 * ----                 --------------          ------------
 * VFS_SOCKET           socket
 * VFS_SOCKETPAIR       socket                  fdpair
 * VFS_BIND             sockaddr
 * VFS_CONNECT          sockaddr
 * VFS_LISTEN           listen
 * VFS_ACCEPT           sockaddr                socklen
 * VFS_SENDTO           sendrecv
 * VFS_RECVFROM         sendrecv                socklen
 * VFS_SENDMSG          sockmsg
 * VFS_RECVMSG          sockmsg
 * VFS_SETSOCKOPT       sockopt
 * VFS_GETSOCKOPT       sockopt                 socklen
 * VFS_GETSOCKNAME      sockaddr                socklen
 * VFS_GETPEERNAME      sockaddr                socklen
 * VFS_SHUTDOWN         shutdown
 */

#include "fs.h"
#include "vnode.h"
#include "file.h"

#include <sys/socket.h>

/*
 * Convert any SOCK_xx open flags to O_xx open flags.  The 'type' parameter is
 * the socket type value from the request, including any SOCK_xx flag bits.
 * Return the bitwise OR of O_CLOEXEC, O_NONBLOCK, and O_NOSIGPIPE for each of
 * the corresponding SOCK_xx flags set in 'type'; all other bits are ignored.
 */
static int
get_sock_flags(int type)
{
	int flags;

	flags = 0;
	if (type & SOCK_CLOEXEC)
		flags |= O_CLOEXEC;
	if (type & SOCK_NONBLOCK)
		flags |= O_NONBLOCK;
	if (type & SOCK_NOSIGPIPE)
		flags |= O_NOSIGPIPE;

	return flags;
}

/*
 * Perform cheap pre-call checks to ensure that the given number of socket FDs
 * can be created for the current process.  Return the result of check_fds()
 * for the current process ('fp') and 'nfds' descriptors.
 */
static int
check_sock_fds(int nfds)
{

	/*
	 * For now, we simply check if there are enough file descriptor slots
	 * free in the process.  Since the process is blocked on a socket call,
	 * this aspect will not change.  Availability of file pointers, vnodes,
	 * and PFS nodes may vary, and is therefore less interesting to check
	 * here - it will have to be checked again upon completion anyway.
	 */
	return check_fds(fp, nfds);
}

/*
 * Create a new file descriptor, including supporting objects, for the open
 * socket identified by 'dev', in the current process, using the O_xx open
 * flags 'flags'.  On success, return the file descriptor number.  The results
 * of a successful call can be undone with close_fd(), which will also close
 * the socket itself.  On failure, return a negative error code.  In this case,
 * the socket will be left open.
 *
 * Only O_CLOEXEC, O_NONBLOCK, and O_NOSIGPIPE may be set in 'flags'; this is
 * enforced with an assertion.
 */
static int
make_sock_fd(dev_t dev, int flags)
{
	struct vmnt *vmp;
	struct vnode *vp;
	struct filp *filp;
	struct node_details res;
	int r, fd;

	assert((flags & ~(O_CLOEXEC | O_NONBLOCK | O_NOSIGPIPE)) == 0);

#if !NDEBUG
	/*
	 * Check whether there is a socket object for the new device already.
	 * This is an expensive check, but if the socket driver sends us a new
	 * socket ID that is already in use, this is a sure sign of driver
	 * misbehavior.  So far it does seem like nothing would go wrong within
	 * VFS in this case though, which is why this is a debug-only check.
	 */
	if (find_filp_by_sock_dev(dev) != NULL) {
		printf("VFS: socket driver %d generated in-use socket ID!\n",
		    get_smap_by_dev(dev, NULL)->smap_endpt);
		return EIO;
	}
#endif /* !NDEBUG */

	/*
	 * Get a lock on PFS.  TODO: it is not clear whether locking PFS is
	 * needed at all, let alone which lock: map_vnode() uses a write lock,
	 * create_pipe() uses a read lock, and cdev_clone() uses no lock at
	 * all.  As is, the README prescribes VMNT_READ, so that's what we use
	 * here.  The code below largely copies the create_pipe() code anyway.
	 */
	if ((vmp = find_vmnt(PFS_PROC_NR)) == NULL)
		panic("PFS gone");
	if ((r = lock_vmnt(vmp, VMNT_READ)) != OK)
		return r;

	/* Obtain a free vnode. */
	if ((vp = get_free_vnode()) == NULL) {
		unlock_vmnt(vmp);
		return err_code;
	}
	lock_vnode(vp, VNODE_OPCL);

	/* Acquire a file descriptor. */
	if ((r = get_fd(fp, 0, R_BIT | W_BIT, &fd, &filp)) != OK) {
		unlock_vnode(vp);
		unlock_vmnt(vmp);
		return r;
	}

	/* Create a PFS node for the socket. */
	if ((r = req_newnode(PFS_PROC_NR, fp->fp_effuid, fp->fp_effgid,
	    S_IFSOCK | ACCESSPERMS, dev, &res)) != OK) {
		unlock_filp(filp);
		unlock_vnode(vp);
		unlock_vmnt(vmp);
		return r;
	}

	/* Fill in the objects, and link them together. */
	vp->v_fs_e = res.fs_e;
	vp->v_inode_nr = res.inode_nr;
	vp->v_mode = res.fmode;
	vp->v_sdev = dev;
	vp->v_fs_count = 1;
	vp->v_ref_count = 1;
	vp->v_vmnt = NULL;
	vp->v_dev = NO_DEV;
	vp->v_size = 0;

	filp->filp_vno = vp;
	filp->filp_flags = O_RDWR | flags;
	filp->filp_count = 1;

	fp->fp_filp[fd] = filp;
	if (flags & O_CLOEXEC)
		FD_SET(fd, &fp->fp_cloexec_set);

	/* Release locks, and return the new file descriptor. */
	unlock_filp(filp); /* this also unlocks the vnode now! */
	unlock_vmnt(vmp);

	return fd;
}

/*
 * Create a socket.  The domain, type, and protocol are taken from the request
 * message.  On success, return the new file descriptor number.  On failure,
 * return an error code; if the socket was already created by the driver at
 * that point, it is closed again.
 */
int
do_socket(void)
{
	int domain, type, sock_type, protocol;
	dev_t dev;
	int r, flags;

	domain = job_m_in.m_lc_vfs_socket.domain;
	type = job_m_in.m_lc_vfs_socket.type;
	protocol = job_m_in.m_lc_vfs_socket.protocol;

	/* Is there a socket driver for this domain at all? */
	if (get_smap_by_domain(domain) == NULL)
		return EAFNOSUPPORT;

	/*
	 * Ensure that it is at least likely that after creating a socket, we
	 * will be able to create a file descriptor for it, along with all the
	 * necessary supporting objects.  While it would be slightly neater to
	 * allocate these objects before trying to create the socket, this is
	 * offset by the fact that that approach results in a downright mess in
	 * do_socketpair() below, and with the current approach we can reuse
	 * the same code for accepting sockets as well.  For newly created
	 * sockets, it is no big deal to close them right after creation; for
	 * newly accepted sockets, we have no choice but to do that anyway.
	 * Moreover, object creation failures should be rare and our approach
	 * does not cause significantly more overhead anyway, so the entire
	 * issue is largely philosophical anyway.  For now, this will do.
	 */
	if ((r = check_sock_fds(1)) != OK)
		return r;

	/* Split the type argument into the actual type and the open flags. */
	sock_type = type & ~SOCK_FLAGS_MASK;
	flags = get_sock_flags(type);

	if ((r = sdev_socket(domain, sock_type, protocol, &dev,
	    FALSE /*pair*/)) != OK)
		return r;

	/* On failure, make_sock_fd() leaves the socket open: close it here. */
	if ((r = make_sock_fd(dev, flags)) < 0)
		(void)sdev_close(dev, FALSE /*may_suspend*/);

	return r;
}

/*
 * Create a pair of connected sockets.  On success, store the two new file
 * descriptor numbers in the reply message and return OK.  On failure, return
 * an error code, with both sockets closed again if they were created.
 */
int
do_socketpair(void)
{
	int domain, type, sock_type, protocol;
	dev_t dev[2];
	int r, fd0, fd1, flags;

	domain = job_m_in.m_lc_vfs_socket.domain;
	type = job_m_in.m_lc_vfs_socket.type;
	protocol = job_m_in.m_lc_vfs_socket.protocol;

	/* Is there a socket driver for this domain at all? */
	if (get_smap_by_domain(domain) == NULL)
		return EAFNOSUPPORT;

	/*
	 * See the lengthy comment in do_socket().  This time we need two of
	 * everything, though.
	 */
	if ((r = check_sock_fds(2)) != OK)
		return r;

	sock_type = type & ~SOCK_FLAGS_MASK;
	flags = get_sock_flags(type);

	if ((r = sdev_socket(domain, sock_type, protocol, dev,
	    TRUE /*pair*/)) != OK)
		return r;

	/* No descriptor exists yet, so close both sockets directly. */
	if ((fd0 = make_sock_fd(dev[0], flags)) < 0) {
		(void)sdev_close(dev[0], FALSE /*may_suspend*/);
		(void)sdev_close(dev[1], FALSE /*may_suspend*/);
		return fd0;
	}

	/*
	 * The first socket now has a descriptor, so closing that descriptor
	 * closes the socket as well; the second socket is closed directly.
	 */
	if ((fd1 = make_sock_fd(dev[1], flags)) < 0) {
		close_fd(fp, fd0, FALSE /*may_suspend*/);
		(void)sdev_close(dev[1], FALSE /*may_suspend*/);
		return fd1;
	}

	job_m_out.m_vfs_lc_fdpair.fd0 = fd0;
	job_m_out.m_vfs_lc_fdpair.fd1 = fd1;
	return OK;
}

/*
 * Check whether the given file descriptor identifies an open socket in the
 * current process.  If so, return OK, with the socket device number stored in
 * 'dev' and its file pointer flags stored in 'flags' (if not NULL).  If not,
 * return an appropriate error code.
*/ static int get_sock(int fd, dev_t * dev, int * flags) { struct filp *filp; if ((filp = get_filp(fd, VNODE_READ)) == NULL) return err_code; if (!S_ISSOCK(filp->filp_vno->v_mode)) { unlock_filp(filp); return ENOTSOCK; } *dev = filp->filp_vno->v_sdev; if (flags != NULL) *flags = filp->filp_flags; /* * It is safe to leave the file pointer object unlocked during the * actual call. Since the current process is blocked for the duration * of the socket call, we know the socket's file descriptor, and thus * its file pointer, can not possibly be freed. In addition, we will * not be accessing the file pointer anymore later, with the exception * of accept calls, which reacquire the lock when the reply comes in. */ unlock_filp(filp); return OK; } /* * Bind a socket to a local address. */ int do_bind(void) { dev_t dev; int r, fd, flags; fd = job_m_in.m_lc_vfs_sockaddr.fd; if ((r = get_sock(fd, &dev, &flags)) != OK) return r; return sdev_bind(dev, job_m_in.m_lc_vfs_sockaddr.addr, job_m_in.m_lc_vfs_sockaddr.addr_len, flags); } /* * Connect a socket to a remote address. */ int do_connect(void) { dev_t dev; int r, fd, flags; fd = job_m_in.m_lc_vfs_sockaddr.fd; if ((r = get_sock(fd, &dev, &flags)) != OK) return r; return sdev_connect(dev, job_m_in.m_lc_vfs_sockaddr.addr, job_m_in.m_lc_vfs_sockaddr.addr_len, flags); } /* * Put a socket in listening mode. */ int do_listen(void) { dev_t dev; int r, fd, backlog; fd = job_m_in.m_lc_vfs_listen.fd; backlog = job_m_in.m_lc_vfs_listen.backlog; if ((r = get_sock(fd, &dev, NULL)) != OK) return r; if (backlog < 0) backlog = 0; return sdev_listen(dev, backlog); } /* * Accept a connection on a listening socket, creating a new socket. 
*/ int do_accept(void) { dev_t dev; int r, fd, flags; fd = job_m_in.m_lc_vfs_sockaddr.fd; if ((r = get_sock(fd, &dev, &flags)) != OK) return r; if ((r = check_sock_fds(1)) != OK) return r; return sdev_accept(dev, job_m_in.m_lc_vfs_sockaddr.addr, job_m_in.m_lc_vfs_sockaddr.addr_len, flags, fd); } /* * Resume a previously suspended accept(2) system call. This routine must * cover three distinct cases, depending on the 'status' and 'dev' values: * * #1. If the 'status' parameter is set to OK, the accept call succeeded. In * that case, the function is guaranteed to be called from a worker thread, * with 'fp' set to the user process that made the system call. In that * case, this function may block its calling thread. The 'dev' parameter * will contain the device number of the newly accepted socket. * #2. If the 'status' parameter contains a negative error code, but 'dev' is * *not* set to NO_DEV, then the same as above applies, except that the new * socket must be closed immediately. * #3. If 'status' is a negative error code and 'dev' is set to NO_DEV, then * the accept call has failed and no new socket was ever created. In this * case, the function MUST NOT block its calling thread. */ void resume_accept(struct fproc * rfp, int status, dev_t dev, unsigned int addr_len, int listen_fd) { message m; dev_t ldev; int r, flags; /* * If the call did not succeed and no socket was created (case #3), we * cannot and should not do more than send the error to the user * process. */ if (status != OK && dev == NO_DEV) { replycode(rfp->fp_endpoint, status); return; } /* * The call succeeded. The lower socket layer (sdev.c) ensures that in * that case, we are called from a worker thread which is associated * with the original user process. Thus, we can block the current * thread. Start by verifying that the listening socket is still * around. 
If it is not, it must have been invalidated as a result of * a socket driver death, in which case we must report an error but * need not close the new socket. As a side effect, obtain the * listening socket's flags, which on BSD systems are inherited by the * accepted socket. */ assert(fp == rfp); /* needed for get_sock() and make_sock_fd() */ if (get_sock(listen_fd, &ldev, &flags) != OK) { replycode(rfp->fp_endpoint, EIO); return; } /* The same socket driver must host both sockets, obviously. */ assert(get_smap_by_dev(ldev, NULL) == get_smap_by_dev(dev, NULL)); /* * If an error status was returned (case #2), we must now close the * newly accepted socket. Effectively, this allows socket drivers to * handle address copy failures in the cleanest possible way. */ if (status != OK) { (void)sdev_close(dev, FALSE /*may_suspend*/); replycode(rfp->fp_endpoint, status); return; } /* * A new socket has been successfully accepted (case #1). Try to * create a file descriptor for the new socket. If this fails, we have * to close the new socket after all. That is not great, but we have * no way to prevent this except by preallocating all objects for the * duration of the accept call, which is not exactly great either. */ flags &= O_CLOEXEC | O_NONBLOCK | O_NOSIGPIPE; if ((r = make_sock_fd(dev, flags)) < 0) { (void)sdev_close(dev, FALSE /*may_suspend*/); replycode(rfp->fp_endpoint, r); return; } /* * The accept call has succeeded. Send a reply message with the new * file descriptor and an address length (which may be zero). */ memset(&m, 0, sizeof(m)); m.m_vfs_lc_socklen.len = addr_len; reply(&m, rfp->fp_endpoint, r); } /* * Send a message on a socket. 
*/ int do_sendto(void) { dev_t dev; int r, fd, flags; fd = job_m_in.m_lc_vfs_sendrecv.fd; if ((r = get_sock(fd, &dev, &flags)) != OK) return r; return sdev_readwrite(dev, job_m_in.m_lc_vfs_sendrecv.buf, job_m_in.m_lc_vfs_sendrecv.len, 0, 0, job_m_in.m_lc_vfs_sendrecv.addr, job_m_in.m_lc_vfs_sendrecv.addr_len, job_m_in.m_lc_vfs_sendrecv.flags, WRITING, flags, 0); } /* * Receive a message from a socket. */ int do_recvfrom(void) { dev_t dev; int r, fd, flags; fd = job_m_in.m_lc_vfs_sendrecv.fd; if ((r = get_sock(fd, &dev, &flags)) != OK) return r; return sdev_readwrite(dev, job_m_in.m_lc_vfs_sendrecv.buf, job_m_in.m_lc_vfs_sendrecv.len, 0, 0, job_m_in.m_lc_vfs_sendrecv.addr, job_m_in.m_lc_vfs_sendrecv.addr_len, job_m_in.m_lc_vfs_sendrecv.flags, READING, flags, 0); } /* * Resume a previously suspended recvfrom(2) system call. This function MUST * NOT block its calling thread. */ void resume_recvfrom(struct fproc * rfp, int status, unsigned int addr_len) { message m; if (status >= 0) { memset(&m, 0, sizeof(m)); m.m_vfs_lc_socklen.len = addr_len; reply(&m, rfp->fp_endpoint, status); } else replycode(rfp->fp_endpoint, status); } /* * Send or receive a message on a socket using a message structure. */ int do_sockmsg(void) { struct msghdr msg; struct iovec iov; vir_bytes msg_buf, data_buf; size_t data_len; dev_t dev; int r, fd, flags; assert(job_call_nr == VFS_SENDMSG || job_call_nr == VFS_RECVMSG); fd = job_m_in.m_lc_vfs_sockmsg.fd; msg_buf = job_m_in.m_lc_vfs_sockmsg.msgbuf; if ((r = get_sock(fd, &dev, &flags)) != OK) return r; if ((r = sys_datacopy_wrapper(who_e, msg_buf, SELF, (vir_bytes)&msg, sizeof(msg))) != OK) return r; data_buf = 0; data_len = 0; if (msg.msg_iovlen > 0) { /* * We do not yet support vectors with more than one element; * for this reason, libc is currently expected to consolidate * the entire vector into a single element. Once we do add * proper vector support, the ABI itself need not be changed. 
*/ if (msg.msg_iovlen > 1) return EMSGSIZE; if ((r = sys_datacopy_wrapper(who_e, (vir_bytes)msg.msg_iov, SELF, (vir_bytes)&iov, sizeof(iov))) != OK) return r; if (iov.iov_len > SSIZE_MAX) return EINVAL; if (iov.iov_len > 0) { data_buf = (vir_bytes)iov.iov_base; data_len = iov.iov_len; } } return sdev_readwrite(dev, data_buf, data_len, (vir_bytes)msg.msg_control, msg.msg_controllen, (vir_bytes)msg.msg_name, msg.msg_namelen, job_m_in.m_lc_vfs_sockmsg.flags, (job_call_nr == VFS_RECVMSG) ? READING : WRITING, flags, (job_call_nr == VFS_RECVMSG) ? msg_buf : 0); } /* * Resume a previously suspended recvmsg(2) system call. The 'status' * parameter contains either the number of data bytes received or a negative * error code. The 'msg_buf' parameter contains the user address of the msghdr * structure. If a failure occurs in this function, the received data * (including, in the worst case, references to received file descriptors) will * be lost - while seriously ugly, this is always the calling process's fault, * extremely hard to deal with, and on par with current behavior in other * operating systems. This function MUST NOT block its calling thread. */ void resume_recvmsg(struct fproc * rfp, int status, unsigned int ctl_len, unsigned int addr_len, int flags, vir_bytes msg_buf) { struct msghdr msg; int r; if (status < 0) { replycode(rfp->fp_endpoint, status); return; } /* * Unfortunately, we now need to update a subset of the fields of the * msghdr structure. We can 1) copy in the entire structure for the * second time, modify some fields, and copy it out in its entirety * again, 2) copy out individual fields that have been changed, 3) save * a copy of the original structure somewhere. The third option is the * most efficient, but would increase the fproc structure size by quite * a bit. The main difference between the first and second options is * the number of kernel calls; we choose to use the first option. 
*/ if ((r = sys_datacopy_wrapper(rfp->fp_endpoint, msg_buf, SELF, (vir_bytes)&msg, sizeof(msg))) != OK) { /* We copied it in before, how could it fail now? */ printf("VFS: resume_recvmsg cannot copy in msghdr? (%d)\n", r); replycode(rfp->fp_endpoint, r); return; } /* Modify and copy out the structure, and wake up the caller. */ msg.msg_controllen = ctl_len; msg.msg_flags = flags; if (addr_len > 0) msg.msg_namelen = addr_len; if ((r = sys_datacopy_wrapper(SELF, (vir_bytes)&msg, rfp->fp_endpoint, msg_buf, sizeof(msg))) != OK) status = r; replycode(rfp->fp_endpoint, status); } /* * Set socket options. */ int do_setsockopt(void) { dev_t dev; int r, fd; fd = job_m_in.m_lc_vfs_sockopt.fd; if ((r = get_sock(fd, &dev, NULL)) != OK) return r; return sdev_setsockopt(dev, job_m_in.m_lc_vfs_sockopt.level, job_m_in.m_lc_vfs_sockopt.name, job_m_in.m_lc_vfs_sockopt.buf, job_m_in.m_lc_vfs_sockopt.len); } /* * Get socket options. */ int do_getsockopt(void) { unsigned int len; dev_t dev; int r, fd; fd = job_m_in.m_lc_vfs_sockopt.fd; len = job_m_in.m_lc_vfs_sockopt.len; if ((r = get_sock(fd, &dev, NULL)) != OK) return r; r = sdev_getsockopt(dev, job_m_in.m_lc_vfs_sockopt.level, job_m_in.m_lc_vfs_sockopt.name, job_m_in.m_lc_vfs_sockopt.buf, &len); if (r == OK) job_m_out.m_vfs_lc_socklen.len = len; return r; } /* * Get the local address of a socket. */ int do_getsockname(void) { unsigned int len; dev_t dev; int r, fd; fd = job_m_in.m_lc_vfs_sockaddr.fd; len = job_m_in.m_lc_vfs_sockaddr.addr_len; if ((r = get_sock(fd, &dev, NULL)) != OK) return r; r = sdev_getsockname(dev, job_m_in.m_lc_vfs_sockaddr.addr, &len); if (r == OK) job_m_out.m_vfs_lc_socklen.len = len; return r; } /* * Get the remote address of a socket. 
*/
int
do_getpeername(void)
{
	unsigned int name_len;
	dev_t sock_dev;
	int status, sock_fd;

	sock_fd = job_m_in.m_lc_vfs_sockaddr.fd;
	name_len = job_m_in.m_lc_vfs_sockaddr.addr_len;

	status = get_sock(sock_fd, &sock_dev, NULL);
	if (status != OK)
		return status;

	status = sdev_getpeername(sock_dev, job_m_in.m_lc_vfs_sockaddr.addr,
	    &name_len);
	if (status != OK)
		return status;

	/* Report the resulting address length in the reply message. */
	job_m_out.m_vfs_lc_socklen.len = name_len;
	return status;
}

/*
 * Shut down socket send and receive operations.  Any 'how' value other than
 * SHUT_RD, SHUT_WR, or SHUT_RDWR is rejected with EINVAL.
 */
int
do_shutdown(void)
{
	dev_t sock_dev;
	int status, sock_fd, how;

	sock_fd = job_m_in.m_lc_vfs_shutdown.fd;
	how = job_m_in.m_lc_vfs_shutdown.how;

	/* The descriptor is validated before the 'how' argument is. */
	status = get_sock(sock_fd, &sock_dev, NULL);
	if (status != OK)
		return status;

	switch (how) {
	case SHUT_RD:
	case SHUT_WR:
	case SHUT_RDWR:
		break;
	default:
		return EINVAL;
	}

	return sdev_shutdown(sock_dev, how);
}