1 /* $NetBSD: sys_pipe.c,v 1.25 2002/03/17 19:41:07 atatat Exp $ */ 2 3 /* 4 * Copyright (c) 1996 John S. Dyson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice immediately at the beginning of the file, without modification, 12 * this list of conditions, and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Absolutely no warranty of function or purpose is made by the author 17 * John S. Dyson. 18 * 4. Modifications may be freely made to this file if the above conditions 19 * are met. 20 * 21 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.95 2002/03/09 22:06:31 alfred Exp $ 22 */ 23 24 /* 25 * This file contains a high-performance replacement for the socket-based 26 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support 27 * all features of sockets, but does do everything that pipes normally 28 * do. 29 * 30 * Adaption for NetBSD UVM, including uvm_loan() based direct write, was 31 * written by Jaromir Dolecek. 32 */ 33 34 /* 35 * This code has two modes of operation, a small write mode and a large 36 * write mode. The small write mode acts like conventional pipes with 37 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the 38 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT 39 * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD, 40 * those pages are also wired), and the receiving process can copy it directly 41 * from the pages in the sending process. 42 * 43 * If the sending process receives a signal, it is possible that it will 44 * go away, and certainly its address space can change, because control 45 * is returned back to the user-mode side. In that case, the pipe code 46 * arranges to copy the buffer supplied by the user process on FreeBSD, to 47 * a pageable kernel buffer, and the receiving process will grab the data 48 * from the pageable kernel buffer. Since signals don't happen all that often, 49 * the copy operation is normally eliminated. 50 * For NetBSD, the pages are mapped read-only, COW for kernel by uvm_loan(), 51 * so no explicit handling need to be done, all is handled by standard VM 52 * facilities. 53 * 54 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will 55 * happen for small transfers so that the system will not spend all of 56 * its time context switching. PIPE_SIZE is constrained by the 57 * amount of kernel virtual memory. 58 */ 59 60 #include <sys/cdefs.h> 61 __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.25 2002/03/17 19:41:07 atatat Exp $"); 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/proc.h> 66 #include <sys/fcntl.h> 67 #include <sys/file.h> 68 #include <sys/filedesc.h> 69 #include <sys/filio.h> 70 #include <sys/kernel.h> 71 #include <sys/lock.h> 72 #include <sys/ttycom.h> 73 #include <sys/stat.h> 74 #include <sys/malloc.h> 75 #include <sys/poll.h> 76 #include <sys/signalvar.h> 77 #include <sys/vnode.h> 78 #include <sys/uio.h> 79 #include <sys/lock.h> 80 #ifdef __FreeBSD__ 81 #include <sys/mutex.h> 82 #endif 83 #ifdef __NetBSD__ 84 #include <sys/select.h> 85 #include <sys/mount.h> 86 #include <sys/syscallargs.h> 87 #include <uvm/uvm.h> 88 #include <sys/sysctl.h> 89 #include <sys/kernel.h> 90 #endif /* NetBSD, FreeBSD */ 91 92 #include <sys/pipe.h> 93 94 #ifdef __NetBSD__ 95 /* 96 * Avoid microtime(9), it's slow. We don't guard the read from time(9) 97 * with splclock(9) since we don't actually need to be THAT sure the access 98 * is atomic. 99 */ 100 #define vfs_timestamp(tv) (*(tv) = time) 101 #endif 102 103 /* 104 * Use this define if you want to disable *fancy* VM things. Expect an 105 * approx 30% decrease in transfer rate. This could be useful for 106 * OpenBSD. 107 */ 108 /* #define PIPE_NODIRECT */ 109 110 /* 111 * interfaces to the outside world 112 */ 113 #ifdef __FreeBSD__ 114 static int pipe_read(struct file *fp, struct uio *uio, 115 struct ucred *cred, int flags, struct thread *td); 116 static int pipe_write(struct file *fp, struct uio *uio, 117 struct ucred *cred, int flags, struct thread *td); 118 static int pipe_close(struct file *fp, struct thread *td); 119 static int pipe_poll(struct file *fp, int events, struct ucred *cred, 120 struct thread *td); 121 static int pipe_kqfilter(struct file *fp, struct knote *kn); 122 static int pipe_stat(struct file *fp, struct stat *sb, struct thread *td); 123 static int pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct thread *td); 124 125 static struct fileops pipeops = { 126 pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter, 127 pipe_stat, pipe_close 128 }; 129 130 static void filt_pipedetach(struct knote *kn); 131 static int filt_piperead(struct knote *kn, long hint); 132 static int filt_pipewrite(struct knote *kn, long hint); 133 134 static struct filterops pipe_rfiltops = 135 { 1, NULL, filt_pipedetach, filt_piperead }; 136 static struct filterops pipe_wfiltops = 137 { 1, NULL, filt_pipedetach, filt_pipewrite }; 138 139 #define PIPE_GET_GIANT(pipe) \ 140 do { \ 141 PIPE_UNLOCK(wpipe); \ 142 mtx_lock(&Giant); \ 143 } while (0) 144 145 #define PIPE_DROP_GIANT(pipe) \ 146 do { \ 147 mtx_unlock(&Giant); \ 148 PIPE_LOCK(wpipe); \ 149 } while (0) 150 151 #endif /* FreeBSD */ 152 153 #ifdef __NetBSD__ 154 static int pipe_read(struct file *fp, off_t *offset, struct uio *uio, 155 struct ucred *cred, int flags); 156 static int pipe_write(struct file *fp, off_t *offset, struct uio *uio, 157 struct ucred *cred, int flags); 158 static int pipe_close(struct file *fp, struct proc *p); 159 static int pipe_poll(struct file *fp, int events, struct proc *p); 160 static int pipe_fcntl(struct file *fp, u_int com, caddr_t data, 161 struct proc *p); 162 static int pipe_stat(struct file *fp, struct stat *sb, struct proc *p); 163 static int pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p); 164 165 static struct fileops pipeops = 166 { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll, 167 pipe_stat, pipe_close }; 168 169 /* XXXSMP perhaps use spinlocks & KERNEL_PROC_(UN)LOCK() ? just clear now */ 170 #define PIPE_GET_GIANT(pipe) 171 #define PIPE_DROP_GIANT(pipe) 172 #define GIANT_REQUIRED 173 174 #endif /* NetBSD */ 175 176 /* 177 * Default pipe buffer size(s), this can be kind-of large now because pipe 178 * space is pageable. The pipe code will try to maintain locality of 179 * reference for performance reasons, so small amounts of outstanding I/O 180 * will not wipe the cache. 181 */ 182 #define MINPIPESIZE (PIPE_SIZE/3) 183 #define MAXPIPESIZE (2*PIPE_SIZE/3) 184 185 /* 186 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but 187 * is there so that on large systems, we don't exhaust it. 188 */ 189 #define MAXPIPEKVA (8*1024*1024) 190 static int maxpipekva = MAXPIPEKVA; 191 192 /* 193 * Limit for direct transfers, we cannot, of course limit 194 * the amount of kva for pipes in general though. 195 */ 196 #define LIMITPIPEKVA (16*1024*1024) 197 static int limitpipekva = LIMITPIPEKVA; 198 199 /* 200 * Limit the number of "big" pipes 201 */ 202 #define LIMITBIGPIPES 32 203 static int maxbigpipes = LIMITBIGPIPES; 204 static int nbigpipe = 0; 205 206 /* 207 * Amount of KVA consumed by pipe buffers. 208 */ 209 static int amountpipekva = 0; 210 211 static void pipeclose(struct pipe *cpipe); 212 static void pipe_free_kmem(struct pipe *cpipe); 213 static int pipe_create(struct pipe **cpipep, int allockva); 214 static __inline int pipelock(struct pipe *cpipe, int catch); 215 static __inline void pipeunlock(struct pipe *cpipe); 216 static __inline void pipeselwakeup(struct pipe *cpipe, struct pipe *sigp); 217 #ifndef PIPE_NODIRECT 218 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); 219 #endif 220 static int pipespace(struct pipe *cpipe, int size); 221 222 #ifdef __NetBSD__ 223 #ifndef PIPE_NODIRECT 224 static int pipe_loan_alloc(struct pipe *, int); 225 static void pipe_loan_free(struct pipe *); 226 #endif /* PIPE_NODIRECT */ 227 228 static struct pool pipe_pool; 229 #endif /* NetBSD */ 230 231 #ifdef __FreeBSD__ 232 static vm_zone_t pipe_zone; 233 234 static void pipeinit(void *dummy __unused); 235 #ifndef PIPE_NODIRECT 236 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); 237 static void pipe_destroy_write_buffer(struct pipe *wpipe); 238 static void pipe_clone_write_buffer(struct pipe *wpipe); 239 #endif 240 241 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); 242 243 static void 244 pipeinit(void *dummy __unused) 245 { 246 247 pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4); 248 } 249 #endif /* FreeBSD */ 250 251 /* 252 * The pipe system call for the DTYPE_PIPE type of pipes 253 */ 254 255 /* ARGSUSED */ 256 #ifdef __FreeBSD__ 257 int 258 pipe(td, uap) 259 struct thread *td; 260 struct pipe_args /* { 261 int dummy; 262 } */ *uap; 263 #elif defined(__NetBSD__) 264 int 265 sys_pipe(p, v, retval) 266 struct proc *p; 267 void *v; 268 register_t *retval; 269 #endif 270 { 271 struct file *rf, *wf; 272 struct pipe *rpipe, *wpipe; 273 int fd, error; 274 #ifdef __FreeBSD__ 275 struct mtx *pmtx; 276 277 KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); 278 279 pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO); 280 281 rpipe = wpipe = NULL; 282 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 1)) { 283 pipeclose(rpipe); 284 pipeclose(wpipe); 285 free(pmtx, M_TEMP); 286 return (ENFILE); 287 } 288 289 error = falloc(td, &rf, &fd); 290 if (error) { 291 pipeclose(rpipe); 292 pipeclose(wpipe); 293 free(pmtx, M_TEMP); 294 return (error); 295 } 296 fhold(rf); 297 td->td_retval[0] = fd; 298 299 /* 300 * Warning: once we've gotten past allocation of the fd for the 301 * read-side, we can only drop the read side via fdrop() in order 302 * to avoid races against processes which manage to dup() the read 303 * side while we are blocked trying to allocate the write side. 304 */ 305 FILE_LOCK(rf); 306 rf->f_flag = FREAD | FWRITE; 307 rf->f_type = DTYPE_PIPE; 308 rf->f_data = (caddr_t)rpipe; 309 rf->f_ops = &pipeops; 310 FILE_UNLOCK(rf); 311 error = falloc(td, &wf, &fd); 312 if (error) { 313 struct filedesc *fdp = td->td_proc->p_fd; 314 FILEDESC_LOCK(fdp); 315 if (fdp->fd_ofiles[td->td_retval[0]] == rf) { 316 fdp->fd_ofiles[td->td_retval[0]] = NULL; 317 FILEDESC_UNLOCK(fdp); 318 fdrop(rf, td); 319 } else 320 FILEDESC_UNLOCK(fdp); 321 fdrop(rf, td); 322 /* rpipe has been closed by fdrop(). */ 323 pipeclose(wpipe); 324 free(pmtx, M_TEMP); 325 return (error); 326 } 327 FILE_LOCK(wf); 328 wf->f_flag = FREAD | FWRITE; 329 wf->f_type = DTYPE_PIPE; 330 wf->f_data = (caddr_t)wpipe; 331 wf->f_ops = &pipeops; 332 p->p_retval[1] = fd; 333 rpipe->pipe_peer = wpipe; 334 wpipe->pipe_peer = rpipe; 335 mtx_init(pmtx, "pipe mutex", MTX_DEF); 336 rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx; 337 fdrop(rf, td); 338 #endif /* FreeBSD */ 339 340 #ifdef __NetBSD__ 341 rpipe = wpipe = NULL; 342 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) { 343 pipeclose(rpipe); 344 pipeclose(wpipe); 345 return (ENFILE); 346 } 347 348 /* 349 * Note: the file structure returned from falloc() is marked 350 * as 'larval' initially. Unless we mark it as 'mature' by 351 * FILE_SET_MATURE(), any attempt to do anything with it would 352 * return EBADF, including e.g. dup(2) or close(2). This avoids 353 * file descriptor races if we block in the second falloc(). 354 */ 355 356 error = falloc(p, &rf, &fd); 357 if (error) 358 goto free2; 359 retval[0] = fd; 360 rf->f_flag = FREAD; 361 rf->f_type = DTYPE_PIPE; 362 rf->f_data = (caddr_t)rpipe; 363 rf->f_ops = &pipeops; 364 365 error = falloc(p, &wf, &fd); 366 if (error) 367 goto free3; 368 retval[1] = fd; 369 wf->f_flag = FWRITE; 370 wf->f_type = DTYPE_PIPE; 371 wf->f_data = (caddr_t)wpipe; 372 wf->f_ops = &pipeops; 373 374 rpipe->pipe_peer = wpipe; 375 wpipe->pipe_peer = rpipe; 376 377 FILE_SET_MATURE(rf); 378 FILE_SET_MATURE(wf); 379 FILE_UNUSE(rf, p); 380 FILE_UNUSE(wf, p); 381 return (0); 382 free3: 383 FILE_UNUSE(rf, p); 384 ffree(rf); 385 fdremove(p->p_fd, retval[0]); 386 free2: 387 pipeclose(wpipe); 388 pipeclose(rpipe); 389 #endif /* NetBSD */ 390 391 return (error); 392 } 393 394 /* 395 * Allocate kva for pipe circular buffer, the space is pageable 396 * This routine will 'realloc' the size of a pipe safely, if it fails 397 * it will retain the old buffer. 398 * If it fails it will return ENOMEM. 399 */ 400 static int 401 pipespace(cpipe, size) 402 struct pipe *cpipe; 403 int size; 404 { 405 caddr_t buffer; 406 #ifdef __FreeBSD__ 407 struct vm_object *object; 408 int npages, error; 409 410 GIANT_REQUIRED; 411 KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)), 412 ("pipespace: pipe mutex locked")); 413 414 npages = round_page(size)/PAGE_SIZE; 415 /* 416 * Create an object, I don't like the idea of paging to/from 417 * kernel_object. 418 */ 419 object = vm_object_allocate(OBJT_DEFAULT, npages); 420 buffer = (caddr_t) vm_map_min(kernel_map); 421 422 /* 423 * Insert the object into the kernel map, and allocate kva for it. 424 * The map entry is, by default, pageable. 425 */ 426 error = vm_map_find(kernel_map, object, 0, 427 (vm_offset_t *) &buffer, size, 1, 428 VM_PROT_ALL, VM_PROT_ALL, 0); 429 430 if (error != KERN_SUCCESS) { 431 vm_object_deallocate(object); 432 return (ENOMEM); 433 } 434 #endif /* FreeBSD */ 435 436 #ifdef __NetBSD__ 437 /* 438 * Allocate pageable virtual address space. Physical memory is allocated 439 * on demand. 440 */ 441 buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size)); 442 if (buffer == NULL) 443 return (ENOMEM); 444 #endif /* NetBSD */ 445 446 /* free old resources if we're resizing */ 447 pipe_free_kmem(cpipe); 448 #ifdef __FreeBSD__ 449 cpipe->pipe_buffer.object = object; 450 #endif 451 cpipe->pipe_buffer.buffer = buffer; 452 cpipe->pipe_buffer.size = size; 453 cpipe->pipe_buffer.in = 0; 454 cpipe->pipe_buffer.out = 0; 455 cpipe->pipe_buffer.cnt = 0; 456 amountpipekva += cpipe->pipe_buffer.size; 457 return (0); 458 } 459 460 /* 461 * initialize and allocate VM and memory for pipe 462 */ 463 static int 464 pipe_create(cpipep, allockva) 465 struct pipe **cpipep; 466 int allockva; 467 { 468 struct pipe *cpipe; 469 int error; 470 471 #ifdef __FreeBSD__ 472 *cpipep = zalloc(pipe_zone); 473 #endif 474 #ifdef __NetBSD__ 475 *cpipep = pool_get(&pipe_pool, M_WAITOK); 476 #endif 477 if (*cpipep == NULL) 478 return (ENOMEM); 479 480 cpipe = *cpipep; 481 482 /* Initialize */ 483 memset(cpipe, 0, sizeof(*cpipe)); 484 cpipe->pipe_state = PIPE_SIGNALR; 485 486 #ifdef __FreeBSD__ 487 cpipe->pipe_mtxp = NULL; /* avoid pipespace assertion */ 488 #endif 489 if (allockva && (error = pipespace(cpipe, PIPE_SIZE))) 490 return (error); 491 492 vfs_timestamp(&cpipe->pipe_ctime); 493 cpipe->pipe_atime = cpipe->pipe_ctime; 494 cpipe->pipe_mtime = cpipe->pipe_ctime; 495 #ifdef __NetBSD__ 496 cpipe->pipe_pgid = NO_PID; 497 lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0); 498 #endif 499 500 return (0); 501 } 502 503 504 /* 505 * lock a pipe for I/O, blocking other access 506 */ 507 static __inline int 508 pipelock(cpipe, catch) 509 struct pipe *cpipe; 510 int catch; 511 { 512 int error; 513 514 #ifdef __FreeBSD__ 515 PIPE_LOCK_ASSERT(cpipe, MA_OWNED); 516 while (cpipe->pipe_state & PIPE_LOCKFL) { 517 cpipe->pipe_state |= PIPE_LWANT; 518 error = msleep(cpipe, PIPE_MTX(cpipe), 519 catch ? (PRIBIO | PCATCH) : PRIBIO, 520 "pipelk", 0); 521 if (error != 0) 522 return (error); 523 } 524 cpipe->pipe_state |= PIPE_LOCKFL; 525 return (0); 526 #endif 527 528 #ifdef __NetBSD__ 529 do { 530 error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL); 531 } while (!catch && (error == EINTR || error == ERESTART)); 532 return (error); 533 #endif 534 } 535 536 /* 537 * unlock a pipe I/O lock 538 */ 539 static __inline void 540 pipeunlock(cpipe) 541 struct pipe *cpipe; 542 { 543 544 #ifdef __FreeBSD__ 545 PIPE_LOCK_ASSERT(cpipe, MA_OWNED); 546 cpipe->pipe_state &= ~PIPE_LOCKFL; 547 if (cpipe->pipe_state & PIPE_LWANT) { 548 cpipe->pipe_state &= ~PIPE_LWANT; 549 wakeup(cpipe); 550 } 551 #endif 552 553 #ifdef __NetBSD__ 554 lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL); 555 #endif 556 } 557 558 /* 559 * Select/poll wakup. This also sends SIGIO to peer connected to 560 * 'sigpipe' side of pipe. 561 */ 562 static __inline void 563 pipeselwakeup(selp, sigp) 564 struct pipe *selp, *sigp; 565 { 566 if (selp->pipe_state & PIPE_SEL) { 567 selp->pipe_state &= ~PIPE_SEL; 568 selwakeup(&selp->pipe_sel); 569 } 570 #ifdef __FreeBSD__ 571 if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio) 572 pgsigio(sigp->pipe_sigio, SIGIO, 0); 573 KNOTE(&selp->pipe_sel.si_note, 0); 574 #endif 575 576 #ifdef __NetBSD__ 577 if (sigp && (sigp->pipe_state & PIPE_ASYNC) 578 && sigp->pipe_pgid != NO_PID){ 579 struct proc *p; 580 581 if (sigp->pipe_pgid < 0) 582 gsignal(-sigp->pipe_pgid, SIGIO); 583 else if (sigp->pipe_pgid > 0 && (p = pfind(sigp->pipe_pgid)) != 0) 584 psignal(p, SIGIO); 585 } 586 #endif /* NetBSD */ 587 } 588 589 /* ARGSUSED */ 590 #ifdef __FreeBSD__ 591 static int 592 pipe_read(fp, uio, cred, flags, td) 593 struct file *fp; 594 struct uio *uio; 595 struct ucred *cred; 596 struct thread *td; 597 int flags; 598 struct proc *p; 599 #elif defined(__NetBSD__) 600 static int 601 pipe_read(fp, offset, uio, cred, flags) 602 struct file *fp; 603 off_t *offset; 604 struct uio *uio; 605 struct ucred *cred; 606 int flags; 607 #endif 608 { 609 struct pipe *rpipe = (struct pipe *) fp->f_data; 610 int error; 611 size_t nread = 0; 612 size_t size; 613 size_t ocnt; 614 615 PIPE_LOCK(rpipe); 616 ++rpipe->pipe_busy; 617 error = pipelock(rpipe, 1); 618 if (error) 619 goto unlocked_error; 620 621 ocnt = rpipe->pipe_buffer.cnt; 622 623 while (uio->uio_resid) { 624 /* 625 * normal pipe buffer receive 626 */ 627 if (rpipe->pipe_buffer.cnt > 0) { 628 size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; 629 if (size > rpipe->pipe_buffer.cnt) 630 size = rpipe->pipe_buffer.cnt; 631 if (size > uio->uio_resid) 632 size = uio->uio_resid; 633 634 PIPE_UNLOCK(rpipe); 635 error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], 636 size, uio); 637 PIPE_LOCK(rpipe); 638 if (error) 639 break; 640 641 rpipe->pipe_buffer.out += size; 642 if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) 643 rpipe->pipe_buffer.out = 0; 644 645 rpipe->pipe_buffer.cnt -= size; 646 647 /* 648 * If there is no more to read in the pipe, reset 649 * its pointers to the beginning. This improves 650 * cache hit stats. 651 */ 652 if (rpipe->pipe_buffer.cnt == 0) { 653 rpipe->pipe_buffer.in = 0; 654 rpipe->pipe_buffer.out = 0; 655 } 656 nread += size; 657 #ifndef PIPE_NODIRECT 658 /* 659 * Direct copy, bypassing a kernel buffer. 660 */ 661 } else if ((size = rpipe->pipe_map.cnt) && 662 (rpipe->pipe_state & PIPE_DIRECTW)) { 663 caddr_t va; 664 if (size > uio->uio_resid) 665 size = uio->uio_resid; 666 667 va = (caddr_t) rpipe->pipe_map.kva + 668 rpipe->pipe_map.pos; 669 PIPE_UNLOCK(rpipe); 670 error = uiomove(va, size, uio); 671 PIPE_LOCK(rpipe); 672 if (error) 673 break; 674 nread += size; 675 rpipe->pipe_map.pos += size; 676 rpipe->pipe_map.cnt -= size; 677 if (rpipe->pipe_map.cnt == 0) { 678 rpipe->pipe_state &= ~PIPE_DIRECTW; 679 wakeup(rpipe); 680 } 681 #endif 682 } else { 683 /* 684 * detect EOF condition 685 * read returns 0 on EOF, no need to set error 686 */ 687 if (rpipe->pipe_state & PIPE_EOF) 688 break; 689 690 /* 691 * If the "write-side" has been blocked, wake it up now. 692 */ 693 if (rpipe->pipe_state & PIPE_WANTW) { 694 rpipe->pipe_state &= ~PIPE_WANTW; 695 wakeup(rpipe); 696 } 697 698 /* 699 * Break if some data was read. 700 */ 701 if (nread > 0) 702 break; 703 704 /* 705 * don't block on non-blocking I/O 706 */ 707 if (fp->f_flag & FNONBLOCK) { 708 error = EAGAIN; 709 break; 710 } 711 712 /* 713 * Unlock the pipe buffer for our remaining processing. 714 * We will either break out with an error or we will 715 * sleep and relock to loop. 716 */ 717 pipeunlock(rpipe); 718 719 /* 720 * We want to read more, wake up select/poll. 721 */ 722 pipeselwakeup(rpipe, rpipe->pipe_peer); 723 724 rpipe->pipe_state |= PIPE_WANTR; 725 #ifdef __FreeBSD__ 726 error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, 727 "piperd", 0); 728 #else 729 error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0); 730 #endif 731 if (error != 0 || (error = pipelock(rpipe, 1))) 732 goto unlocked_error; 733 } 734 } 735 pipeunlock(rpipe); 736 737 /* XXX: should probably do this before getting any locks. */ 738 if (error == 0) 739 vfs_timestamp(&rpipe->pipe_atime); 740 unlocked_error: 741 --rpipe->pipe_busy; 742 743 /* 744 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0. 745 */ 746 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) { 747 rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW); 748 wakeup(rpipe); 749 } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { 750 /* 751 * Handle write blocking hysteresis. 752 */ 753 if (rpipe->pipe_state & PIPE_WANTW) { 754 rpipe->pipe_state &= ~PIPE_WANTW; 755 wakeup(rpipe); 756 } 757 } 758 759 /* 760 * If anything was read off the buffer, signal to the writer it's 761 * possible to write more data. Also send signal if we are here for the 762 * first time after last write. 763 */ 764 if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF 765 && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) { 766 pipeselwakeup(rpipe, rpipe->pipe_peer); 767 rpipe->pipe_state &= ~PIPE_SIGNALR; 768 } 769 770 PIPE_UNLOCK(rpipe); 771 return (error); 772 } 773 774 #ifdef __FreeBSD__ 775 #ifndef PIPE_NODIRECT 776 /* 777 * Map the sending processes' buffer into kernel space and wire it. 778 * This is similar to a physical write operation. 779 */ 780 static int 781 pipe_build_write_buffer(wpipe, uio) 782 struct pipe *wpipe; 783 struct uio *uio; 784 { 785 size_t size; 786 int i; 787 vm_offset_t addr, endaddr, paddr; 788 789 GIANT_REQUIRED; 790 PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); 791 792 size = uio->uio_iov->iov_len; 793 if (size > wpipe->pipe_buffer.size) 794 size = wpipe->pipe_buffer.size; 795 796 endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); 797 addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); 798 for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { 799 vm_page_t m; 800 801 if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 || 802 (paddr = pmap_kextract(addr)) == 0) { 803 int j; 804 805 for (j = 0; j < i; j++) 806 vm_page_unwire(wpipe->pipe_map.ms[j], 1); 807 return (EFAULT); 808 } 809 810 m = PHYS_TO_VM_PAGE(paddr); 811 vm_page_wire(m); 812 wpipe->pipe_map.ms[i] = m; 813 } 814 815 /* 816 * set up the control block 817 */ 818 wpipe->pipe_map.npages = i; 819 wpipe->pipe_map.pos = 820 ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; 821 wpipe->pipe_map.cnt = size; 822 823 /* 824 * and map the buffer 825 */ 826 if (wpipe->pipe_map.kva == 0) { 827 /* 828 * We need to allocate space for an extra page because the 829 * address range might (will) span pages at times. 830 */ 831 wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, 832 wpipe->pipe_buffer.size + PAGE_SIZE); 833 amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; 834 } 835 pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, 836 wpipe->pipe_map.npages); 837 838 /* 839 * and update the uio data 840 */ 841 842 uio->uio_iov->iov_len -= size; 843 uio->uio_iov->iov_base += size; 844 if (uio->uio_iov->iov_len == 0) 845 uio->uio_iov++; 846 uio->uio_resid -= size; 847 uio->uio_offset += size; 848 return (0); 849 } 850 851 /* 852 * unmap and unwire the process buffer 853 */ 854 static void 855 pipe_destroy_write_buffer(wpipe) 856 struct pipe *wpipe; 857 { 858 int i; 859 860 GIANT_REQUIRED; 861 PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); 862 863 if (wpipe->pipe_map.kva) { 864 pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); 865 866 if (amountpipekva > maxpipekva) { 867 vm_offset_t kva = wpipe->pipe_map.kva; 868 wpipe->pipe_map.kva = 0; 869 kmem_free(kernel_map, kva, 870 wpipe->pipe_buffer.size + PAGE_SIZE); 871 amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; 872 } 873 } 874 for (i = 0; i < wpipe->pipe_map.npages; i++) 875 vm_page_unwire(wpipe->pipe_map.ms[i], 1); 876 wpipe->pipe_map.npages = 0; 877 } 878 879 /* 880 * In the case of a signal, the writing process might go away. This 881 * code copies the data into the circular buffer so that the source 882 * pages can be freed without loss of data. 883 */ 884 static void 885 pipe_clone_write_buffer(wpipe) 886 struct pipe *wpipe; 887 { 888 int size; 889 int pos; 890 891 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 892 size = wpipe->pipe_map.cnt; 893 pos = wpipe->pipe_map.pos; 894 memcpy((caddr_t) wpipe->pipe_buffer.buffer, 895 (caddr_t) wpipe->pipe_map.kva + pos, size); 896 897 wpipe->pipe_buffer.in = size; 898 wpipe->pipe_buffer.out = 0; 899 wpipe->pipe_buffer.cnt = size; 900 wpipe->pipe_state &= ~PIPE_DIRECTW; 901 902 PIPE_GET_GIANT(wpipe); 903 pipe_destroy_write_buffer(wpipe); 904 PIPE_DROP_GIANT(wpipe); 905 } 906 907 /* 908 * This implements the pipe buffer write mechanism. Note that only 909 * a direct write OR a normal pipe write can be pending at any given time. 910 * If there are any characters in the pipe buffer, the direct write will 911 * be deferred until the receiving process grabs all of the bytes from 912 * the pipe buffer. Then the direct mapping write is set-up. 913 */ 914 static int 915 pipe_direct_write(wpipe, uio) 916 struct pipe *wpipe; 917 struct uio *uio; 918 { 919 int error; 920 921 retry: 922 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 923 while (wpipe->pipe_state & PIPE_DIRECTW) { 924 if (wpipe->pipe_state & PIPE_WANTR) { 925 wpipe->pipe_state &= ~PIPE_WANTR; 926 wakeup(wpipe); 927 } 928 wpipe->pipe_state |= PIPE_WANTW; 929 error = msleep(wpipe, PIPE_MTX(wpipe), 930 PRIBIO | PCATCH, "pipdww", 0); 931 if (error) 932 goto error1; 933 if (wpipe->pipe_state & PIPE_EOF) { 934 error = EPIPE; 935 goto error1; 936 } 937 } 938 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ 939 if (wpipe->pipe_buffer.cnt > 0) { 940 if (wpipe->pipe_state & PIPE_WANTR) { 941 wpipe->pipe_state &= ~PIPE_WANTR; 942 wakeup(wpipe); 943 } 944 945 wpipe->pipe_state |= PIPE_WANTW; 946 error = msleep(wpipe, PIPE_MTX(wpipe), 947 PRIBIO | PCATCH, "pipdwc", 0); 948 if (error) 949 goto error1; 950 if (wpipe->pipe_state & PIPE_EOF) { 951 error = EPIPE; 952 goto error1; 953 } 954 goto retry; 955 } 956 957 wpipe->pipe_state |= PIPE_DIRECTW; 958 959 PIPE_GET_GIANT(wpipe); 960 error = pipe_build_write_buffer(wpipe, uio); 961 PIPE_DROP_GIANT(wpipe); 962 if (error) { 963 wpipe->pipe_state &= ~PIPE_DIRECTW; 964 goto error1; 965 } 966 967 error = 0; 968 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { 969 if (wpipe->pipe_state & PIPE_EOF) { 970 pipelock(wpipe, 0); 971 PIPE_GET_GIANT(wpipe); 972 pipe_destroy_write_buffer(wpipe); 973 PIPE_DROP_GIANT(wpipe); 974 pipeunlock(wpipe); 975 pipeselwakeup(wpipe, wpipe); 976 error = EPIPE; 977 goto error1; 978 } 979 if (wpipe->pipe_state & PIPE_WANTR) { 980 wpipe->pipe_state &= ~PIPE_WANTR; 981 wakeup(wpipe); 982 } 983 pipeselwakeup(wpipe, wpipe); 984 error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, 985 "pipdwt", 0); 986 } 987 988 pipelock(wpipe,0); 989 if (wpipe->pipe_state & PIPE_DIRECTW) { 990 /* 991 * this bit of trickery substitutes a kernel buffer for 992 * the process that might be going away. 993 */ 994 pipe_clone_write_buffer(wpipe); 995 } else { 996 PIPE_GET_GIANT(wpipe); 997 pipe_destroy_write_buffer(wpipe); 998 PIPE_DROP_GIANT(wpipe); 999 } 1000 pipeunlock(wpipe); 1001 return (error); 1002 1003 error1: 1004 wakeup(wpipe); 1005 return (error); 1006 } 1007 #endif /* !PIPE_NODIRECT */ 1008 #endif /* FreeBSD */ 1009 1010 #ifdef __NetBSD__ 1011 #ifndef PIPE_NODIRECT 1012 /* 1013 * Allocate structure for loan transfer. 1014 */ 1015 static int 1016 pipe_loan_alloc(wpipe, npages) 1017 struct pipe *wpipe; 1018 int npages; 1019 { 1020 vsize_t len; 1021 1022 len = (vsize_t)npages << PAGE_SHIFT; 1023 wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, len); 1024 if (wpipe->pipe_map.kva == 0) 1025 return (ENOMEM); 1026 1027 amountpipekva += len; 1028 wpipe->pipe_map.npages = npages; 1029 wpipe->pipe_map.pgs = malloc(npages * sizeof(struct vm_page *), M_PIPE, 1030 M_WAITOK); 1031 return (0); 1032 } 1033 1034 /* 1035 * Free resources allocated for loan transfer. 1036 */ 1037 static void 1038 pipe_loan_free(wpipe) 1039 struct pipe *wpipe; 1040 { 1041 vsize_t len; 1042 1043 len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT; 1044 uvm_km_free(kernel_map, wpipe->pipe_map.kva, len); 1045 wpipe->pipe_map.kva = 0; 1046 amountpipekva -= len; 1047 free(wpipe->pipe_map.pgs, M_PIPE); 1048 wpipe->pipe_map.pgs = NULL; 1049 } 1050 1051 /* 1052 * NetBSD direct write, using uvm_loan() mechanism. 1053 * This implements the pipe buffer write mechanism. Note that only 1054 * a direct write OR a normal pipe write can be pending at any given time. 1055 * If there are any characters in the pipe buffer, the direct write will 1056 * be deferred until the receiving process grabs all of the bytes from 1057 * the pipe buffer. Then the direct mapping write is set-up. 1058 */ 1059 static int 1060 pipe_direct_write(wpipe, uio) 1061 struct pipe *wpipe; 1062 struct uio *uio; 1063 { 1064 int error, npages, j; 1065 struct vm_page **pgs; 1066 vaddr_t bbase, kva, base, bend; 1067 vsize_t blen, bcnt; 1068 voff_t bpos; 1069 1070 retry: 1071 while (wpipe->pipe_state & PIPE_DIRECTW) { 1072 if (wpipe->pipe_state & PIPE_WANTR) { 1073 wpipe->pipe_state &= ~PIPE_WANTR; 1074 wakeup(wpipe); 1075 } 1076 wpipe->pipe_state |= PIPE_WANTW; 1077 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0); 1078 if (error) 1079 goto error; 1080 if (wpipe->pipe_state & PIPE_EOF) { 1081 error = EPIPE; 1082 goto error; 1083 } 1084 } 1085 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ 1086 if (wpipe->pipe_buffer.cnt > 0) { 1087 if (wpipe->pipe_state & PIPE_WANTR) { 1088 wpipe->pipe_state &= ~PIPE_WANTR; 1089 wakeup(wpipe); 1090 } 1091 1092 wpipe->pipe_state |= PIPE_WANTW; 1093 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0); 1094 if (error) 1095 goto error; 1096 if (wpipe->pipe_state & PIPE_EOF) { 1097 error = EPIPE; 1098 goto error; 1099 } 1100 goto retry; 1101 } 1102 1103 /* 1104 * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers 1105 * not aligned to PAGE_SIZE. 1106 */ 1107 bbase = (vaddr_t)uio->uio_iov->iov_base; 1108 base = trunc_page(bbase); 1109 bend = round_page(bbase + uio->uio_iov->iov_len); 1110 blen = bend - base; 1111 bpos = bbase - base; 1112 1113 if (blen > PIPE_DIRECT_CHUNK) { 1114 blen = PIPE_DIRECT_CHUNK; 1115 bend = base + blen; 1116 bcnt = PIPE_DIRECT_CHUNK - bpos; 1117 } else { 1118 bcnt = uio->uio_iov->iov_len; 1119 } 1120 npages = blen >> PAGE_SHIFT; 1121 1122 wpipe->pipe_map.pos = bpos; 1123 wpipe->pipe_map.cnt = bcnt; 1124 1125 /* 1126 * Free the old kva if we need more pages than we have 1127 * allocated. 1128 */ 1129 if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages) 1130 pipe_loan_free(wpipe); 1131 1132 /* Allocate new kva. */ 1133 if (wpipe->pipe_map.kva == 0) { 1134 error = pipe_loan_alloc(wpipe, npages); 1135 if (error) { 1136 goto error; 1137 } 1138 } 1139 1140 /* Loan the write buffer memory from writer process */ 1141 pgs = wpipe->pipe_map.pgs; 1142 error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen, 1143 pgs, UVM_LOAN_TOPAGE); 1144 if (error) { 1145 pgs = NULL; 1146 goto cleanup; 1147 } 1148 1149 /* Enter the loaned pages to kva */ 1150 kva = wpipe->pipe_map.kva; 1151 for (j = 0; j < npages; j++, kva += PAGE_SIZE) { 1152 pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ); 1153 } 1154 pmap_update(pmap_kernel()); 1155 1156 wpipe->pipe_state |= PIPE_DIRECTW; 1157 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { 1158 if (wpipe->pipe_state & PIPE_EOF) { 1159 error = EPIPE; 1160 break; 1161 } 1162 if (wpipe->pipe_state & PIPE_WANTR) { 1163 wpipe->pipe_state &= ~PIPE_WANTR; 1164 wakeup(wpipe); 1165 } 1166 pipeselwakeup(wpipe, wpipe); 1167 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0); 1168 } 1169 1170 if (error) 1171 wpipe->pipe_state &= ~PIPE_DIRECTW; 1172 1173 cleanup: 1174 pipelock(wpipe, 0); 1175 if (pgs != NULL) { 1176 pmap_kremove(wpipe->pipe_map.kva, blen); 1177 uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE); 1178 } 1179 if (error || amountpipekva > maxpipekva) 1180 pipe_loan_free(wpipe); 1181 pipeunlock(wpipe); 1182 1183 if (error) { 1184 pipeselwakeup(wpipe, wpipe); 1185 1186 /* 1187 * If nothing was read from what we offered, return error 1188 * straight on. Otherwise update uio resid first. Caller 1189 * will deal with the error condition, returning short 1190 * write, error, or restarting the write(2) as appropriate. 1191 */ 1192 if (wpipe->pipe_map.cnt == bcnt) { 1193 error: 1194 wakeup(wpipe); 1195 return (error); 1196 } 1197 1198 bcnt -= wpipe->pipe_map.cnt; 1199 } 1200 1201 uio->uio_resid -= bcnt; 1202 /* uio_offset not updated, not set/used for write(2) */ 1203 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt; 1204 uio->uio_iov->iov_len -= bcnt; 1205 if (uio->uio_iov->iov_len == 0) { 1206 uio->uio_iov++; 1207 uio->uio_iovcnt--; 1208 } 1209 1210 return (error); 1211 } 1212 #endif /* !PIPE_NODIRECT */ 1213 #endif /* NetBSD */ 1214 1215 #ifdef __FreeBSD__ 1216 static int 1217 pipe_write(fp, uio, cred, flags, td) 1218 struct file *fp; 1219 off_t *offset; 1220 struct uio *uio; 1221 struct ucred *cred; 1222 int flags; 1223 struct thread *td; 1224 #elif defined(__NetBSD__) 1225 static int 1226 pipe_write(fp, offset, uio, cred, flags) 1227 struct file *fp; 1228 off_t *offset; 1229 struct uio *uio; 1230 struct ucred *cred; 1231 int flags; 1232 #endif 1233 { 1234 int error = 0; 1235 struct pipe *wpipe, *rpipe; 1236 1237 rpipe = (struct pipe *) fp->f_data; 1238 wpipe = rpipe->pipe_peer; 1239 1240 PIPE_LOCK(rpipe); 1241 /* 1242 * detect loss of pipe read side, issue SIGPIPE if lost. 1243 */ 1244 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { 1245 PIPE_UNLOCK(rpipe); 1246 return (EPIPE); 1247 } 1248 1249 ++wpipe->pipe_busy; 1250 1251 /* 1252 * If it is advantageous to resize the pipe buffer, do 1253 * so. 1254 */ 1255 if ((uio->uio_resid > PIPE_SIZE) && 1256 (nbigpipe < maxbigpipes) && 1257 #ifndef PIPE_NODIRECT 1258 (wpipe->pipe_state & PIPE_DIRECTW) == 0 && 1259 #endif 1260 (wpipe->pipe_buffer.size <= PIPE_SIZE) && 1261 (wpipe->pipe_buffer.cnt == 0)) { 1262 1263 if ((error = pipelock(wpipe,1)) == 0) { 1264 PIPE_GET_GIANT(rpipe); 1265 if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) 1266 nbigpipe++; 1267 PIPE_DROP_GIANT(rpipe); 1268 pipeunlock(wpipe); 1269 } else { 1270 /* 1271 * If an error occurred, unbusy and return, waking up 1272 * any waiting readers. 1273 */ 1274 --wpipe->pipe_busy; 1275 if (wpipe->pipe_busy == 0 1276 && (wpipe->pipe_state & PIPE_WANTCLOSE)) { 1277 wpipe->pipe_state &= 1278 ~(PIPE_WANTCLOSE | PIPE_WANTR); 1279 wakeup(wpipe); 1280 } 1281 1282 return (error); 1283 } 1284 } 1285 1286 #ifdef __FreeBSD__ 1287 /* 1288 * If an early error occured unbusy and return, waking up any pending 1289 * readers. 1290 */ 1291 if (error) { 1292 --wpipe->pipe_busy; 1293 if ((wpipe->pipe_busy == 0) && 1294 (wpipe->pipe_state & PIPE_WANT)) { 1295 wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); 1296 wakeup(wpipe); 1297 } 1298 PIPE_UNLOCK(rpipe); 1299 return(error); 1300 } 1301 1302 KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone")); 1303 #endif 1304 1305 while (uio->uio_resid) { 1306 int space; 1307 1308 #ifndef PIPE_NODIRECT 1309 /* 1310 * If the transfer is large, we can gain performance if 1311 * we do process-to-process copies directly. 1312 * If the write is non-blocking, we don't use the 1313 * direct write mechanism. 1314 * 1315 * The direct write mechanism will detect the reader going 1316 * away on us. 1317 */ 1318 if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && 1319 (fp->f_flag & FNONBLOCK) == 0 && 1320 (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) { 1321 error = pipe_direct_write(wpipe, uio); 1322 1323 /* 1324 * Break out if error occured, unless it's ENOMEM. 1325 * ENOMEM means we failed to allocate some resources 1326 * for direct write, so we just fallback to ordinary 1327 * write. If the direct write was successful, 1328 * process rest of data via ordinary write. 1329 */ 1330 if (!error) 1331 continue; 1332 1333 if (error != ENOMEM) 1334 break; 1335 } 1336 #endif /* PIPE_NODIRECT */ 1337 1338 /* 1339 * Pipe buffered writes cannot be coincidental with 1340 * direct writes. We wait until the currently executing 1341 * direct write is completed before we start filling the 1342 * pipe buffer. We break out if a signal occurs or the 1343 * reader goes away. 1344 */ 1345 retrywrite: 1346 while (wpipe->pipe_state & PIPE_DIRECTW) { 1347 if (wpipe->pipe_state & PIPE_WANTR) { 1348 wpipe->pipe_state &= ~PIPE_WANTR; 1349 wakeup(wpipe); 1350 } 1351 #ifdef __FreeBSD__ 1352 error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, 1353 "pipbww", 0); 1354 #else 1355 error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0); 1356 #endif 1357 if (wpipe->pipe_state & PIPE_EOF) 1358 break; 1359 if (error) 1360 break; 1361 } 1362 if (wpipe->pipe_state & PIPE_EOF) { 1363 error = EPIPE; 1364 break; 1365 } 1366 1367 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 1368 1369 /* Writes of size <= PIPE_BUF must be atomic. */ 1370 if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF)) 1371 space = 0; 1372 1373 if (space > 0) { 1374 int size; /* Transfer size */ 1375 int segsize; /* first segment to transfer */ 1376 1377 if ((error = pipelock(wpipe,1)) != 0) 1378 break; 1379 1380 /* 1381 * It is possible for a direct write to 1382 * slip in on us... handle it here... 1383 */ 1384 if (wpipe->pipe_state & PIPE_DIRECTW) { 1385 pipeunlock(wpipe); 1386 goto retrywrite; 1387 } 1388 /* 1389 * If a process blocked in uiomove, our 1390 * value for space might be bad. 1391 * 1392 * XXX will we be ok if the reader has gone 1393 * away here? 1394 */ 1395 if (space > wpipe->pipe_buffer.size - 1396 wpipe->pipe_buffer.cnt) { 1397 pipeunlock(wpipe); 1398 goto retrywrite; 1399 } 1400 1401 /* 1402 * Transfer size is minimum of uio transfer 1403 * and free space in pipe buffer. 1404 */ 1405 if (space > uio->uio_resid) 1406 size = uio->uio_resid; 1407 else 1408 size = space; 1409 /* 1410 * First segment to transfer is minimum of 1411 * transfer size and contiguous space in 1412 * pipe buffer. If first segment to transfer 1413 * is less than the transfer size, we've got 1414 * a wraparound in the buffer. 1415 */ 1416 segsize = wpipe->pipe_buffer.size - 1417 wpipe->pipe_buffer.in; 1418 if (segsize > size) 1419 segsize = size; 1420 1421 /* Transfer first segment */ 1422 1423 PIPE_UNLOCK(rpipe); 1424 error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], 1425 segsize, uio); 1426 PIPE_LOCK(rpipe); 1427 1428 if (error == 0 && segsize < size) { 1429 /* 1430 * Transfer remaining part now, to 1431 * support atomic writes. Wraparound 1432 * happened. 1433 */ 1434 #ifdef DEBUG 1435 if (wpipe->pipe_buffer.in + segsize != 1436 wpipe->pipe_buffer.size) 1437 panic("Expected pipe buffer wraparound disappeared"); 1438 #endif 1439 1440 PIPE_UNLOCK(rpipe); 1441 error = uiomove(&wpipe->pipe_buffer.buffer[0], 1442 size - segsize, uio); 1443 PIPE_LOCK(rpipe); 1444 } 1445 if (error == 0) { 1446 wpipe->pipe_buffer.in += size; 1447 if (wpipe->pipe_buffer.in >= 1448 wpipe->pipe_buffer.size) { 1449 #ifdef DEBUG 1450 if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size) 1451 panic("Expected wraparound bad"); 1452 #endif 1453 wpipe->pipe_buffer.in = size - segsize; 1454 } 1455 1456 wpipe->pipe_buffer.cnt += size; 1457 #ifdef DEBUG 1458 if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size) 1459 panic("Pipe buffer overflow"); 1460 #endif 1461 } 1462 pipeunlock(wpipe); 1463 if (error) 1464 break; 1465 } else { 1466 /* 1467 * If the "read-side" has been blocked, wake it up now. 1468 */ 1469 if (wpipe->pipe_state & PIPE_WANTR) { 1470 wpipe->pipe_state &= ~PIPE_WANTR; 1471 wakeup(wpipe); 1472 } 1473 1474 /* 1475 * don't block on non-blocking I/O 1476 */ 1477 if (fp->f_flag & FNONBLOCK) { 1478 error = EAGAIN; 1479 break; 1480 } 1481 1482 /* 1483 * We have no more space and have something to offer, 1484 * wake up select/poll. 1485 */ 1486 pipeselwakeup(wpipe, wpipe); 1487 1488 wpipe->pipe_state |= PIPE_WANTW; 1489 #ifdef __FreeBSD__ 1490 error = msleep(wpipe, PIPE_MTX(rpipe), 1491 PRIBIO | PCATCH, "pipewr", 0); 1492 #else 1493 error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0); 1494 #endif 1495 if (error != 0) 1496 break; 1497 /* 1498 * If read side wants to go away, we just issue a signal 1499 * to ourselves. 1500 */ 1501 if (wpipe->pipe_state & PIPE_EOF) { 1502 error = EPIPE; 1503 break; 1504 } 1505 } 1506 } 1507 1508 --wpipe->pipe_busy; 1509 if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) { 1510 wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR); 1511 wakeup(wpipe); 1512 } else if (wpipe->pipe_buffer.cnt > 0) { 1513 /* 1514 * If we have put any characters in the buffer, we wake up 1515 * the reader. 1516 */ 1517 if (wpipe->pipe_state & PIPE_WANTR) { 1518 wpipe->pipe_state &= ~PIPE_WANTR; 1519 wakeup(wpipe); 1520 } 1521 } 1522 1523 /* 1524 * Don't return EPIPE if I/O was successful 1525 */ 1526 if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0) 1527 && (uio->uio_resid == 0)) 1528 error = 0; 1529 1530 if (error == 0) 1531 vfs_timestamp(&wpipe->pipe_mtime); 1532 1533 /* 1534 * We have something to offer, wake up select/poll. 1535 * wpipe->pipe_map.cnt is always 0 in this point (direct write 1536 * is only done synchronously), so check only wpipe->pipe_buffer.cnt 1537 */ 1538 if (wpipe->pipe_buffer.cnt) 1539 pipeselwakeup(wpipe, wpipe); 1540 1541 /* 1542 * Arrange for next read(2) to do a signal. 1543 */ 1544 wpipe->pipe_state |= PIPE_SIGNALR; 1545 1546 PIPE_UNLOCK(rpipe); 1547 return (error); 1548 } 1549 1550 /* 1551 * we implement a very minimal set of ioctls for compatibility with sockets. 1552 */ 1553 int 1554 #ifdef __FreeBSD__ 1555 pipe_ioctl(fp, cmd, data, td) 1556 struct file *fp; 1557 u_long cmd; 1558 caddr_t data; 1559 struct thread *td; 1560 #else 1561 pipe_ioctl(fp, cmd, data, p) 1562 struct file *fp; 1563 u_long cmd; 1564 caddr_t data; 1565 struct proc *p; 1566 #endif 1567 { 1568 struct pipe *mpipe = (struct pipe *)fp->f_data; 1569 1570 switch (cmd) { 1571 1572 case FIONBIO: 1573 return (0); 1574 1575 case FIOASYNC: 1576 PIPE_LOCK(mpipe); 1577 if (*(int *)data) { 1578 mpipe->pipe_state |= PIPE_ASYNC; 1579 } else { 1580 mpipe->pipe_state &= ~PIPE_ASYNC; 1581 } 1582 PIPE_UNLOCK(mpipe); 1583 return (0); 1584 1585 case FIONREAD: 1586 PIPE_LOCK(mpipe); 1587 #ifndef PIPE_NODIRECT 1588 if (mpipe->pipe_state & PIPE_DIRECTW) 1589 *(int *)data = mpipe->pipe_map.cnt; 1590 else 1591 #endif 1592 *(int *)data = mpipe->pipe_buffer.cnt; 1593 PIPE_UNLOCK(mpipe); 1594 return (0); 1595 1596 #ifdef __FreeBSD__ 1597 case FIOSETOWN: 1598 return (fsetown(*(int *)data, &mpipe->pipe_sigio)); 1599 1600 case FIOGETOWN: 1601 *(int *)data = fgetown(mpipe->pipe_sigio); 1602 return (0); 1603 1604 /* This is deprecated, FIOSETOWN should be used instead. */ 1605 case TIOCSPGRP: 1606 return (fsetown(-(*(int *)data), &mpipe->pipe_sigio)); 1607 1608 /* This is deprecated, FIOGETOWN should be used instead. */ 1609 case TIOCGPGRP: 1610 *(int *)data = -fgetown(mpipe->pipe_sigio); 1611 return (0); 1612 #endif /* FreeBSD */ 1613 #ifdef __NetBSD__ 1614 case TIOCSPGRP: 1615 mpipe->pipe_pgid = *(int *)data; 1616 return (0); 1617 1618 case TIOCGPGRP: 1619 *(int *)data = mpipe->pipe_pgid; 1620 return (0); 1621 #endif /* NetBSD */ 1622 1623 } 1624 return (EPASSTHROUGH); 1625 } 1626 1627 int 1628 #ifdef __FreeBSD__ 1629 pipe_poll(fp, events, cred, td) 1630 struct file *fp; 1631 int events; 1632 struct ucred *cred; 1633 struct thread *td; 1634 #elif defined(__NetBSD__) 1635 pipe_poll(fp, events, td) 1636 struct file *fp; 1637 int events; 1638 struct proc *td; 1639 #endif 1640 { 1641 struct pipe *rpipe = (struct pipe *)fp->f_data; 1642 struct pipe *wpipe; 1643 int revents = 0; 1644 1645 wpipe = rpipe->pipe_peer; 1646 PIPE_LOCK(rpipe); 1647 if (events & (POLLIN | POLLRDNORM)) 1648 if ((rpipe->pipe_buffer.cnt > 0) || 1649 #ifndef PIPE_NODIRECT 1650 (rpipe->pipe_state & PIPE_DIRECTW) || 1651 #endif 1652 (rpipe->pipe_state & PIPE_EOF)) 1653 revents |= events & (POLLIN | POLLRDNORM); 1654 1655 if (events & (POLLOUT | POLLWRNORM)) 1656 if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) 1657 || ( 1658 #ifndef PIPE_NODIRECT 1659 ((wpipe->pipe_state & PIPE_DIRECTW) == 0) && 1660 #endif 1661 (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) 1662 revents |= events & (POLLOUT | POLLWRNORM); 1663 1664 if ((rpipe->pipe_state & PIPE_EOF) || 1665 (wpipe == NULL) || 1666 (wpipe->pipe_state & PIPE_EOF)) 1667 revents |= POLLHUP; 1668 1669 if (revents == 0) { 1670 if (events & (POLLIN | POLLRDNORM)) { 1671 selrecord(td, &rpipe->pipe_sel); 1672 rpipe->pipe_state |= PIPE_SEL; 1673 } 1674 1675 if (events & (POLLOUT | POLLWRNORM)) { 1676 selrecord(td, &wpipe->pipe_sel); 1677 wpipe->pipe_state |= PIPE_SEL; 1678 } 1679 } 1680 PIPE_UNLOCK(rpipe); 1681 1682 return (revents); 1683 } 1684 1685 static int 1686 #ifdef __FreeBSD__ 1687 pipe_stat(fp, ub, td) 1688 struct file *fp; 1689 struct stat *ub; 1690 struct thread *td; 1691 #else 1692 pipe_stat(fp, ub, td) 1693 struct file *fp; 1694 struct stat *ub; 1695 struct proc *td; 1696 #endif 1697 { 1698 struct pipe *pipe = (struct pipe *)fp->f_data; 1699 1700 memset((caddr_t)ub, 0, sizeof(*ub)); 1701 ub->st_mode = S_IFIFO; 1702 ub->st_blksize = pipe->pipe_buffer.size; 1703 ub->st_size = pipe->pipe_buffer.cnt; 1704 ub->st_blocks = (ub->st_size) ? 1 : 0; 1705 #ifdef __FreeBSD__ 1706 ub->st_atimespec = pipe->pipe_atime; 1707 ub->st_mtimespec = pipe->pipe_mtime; 1708 ub->st_ctimespec = pipe->pipe_ctime; 1709 #endif /* FreeBSD */ 1710 #ifdef __NetBSD__ 1711 TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec) 1712 TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec); 1713 TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec); 1714 #endif /* NetBSD */ 1715 ub->st_uid = fp->f_cred->cr_uid; 1716 ub->st_gid = fp->f_cred->cr_gid; 1717 /* 1718 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. 1719 * XXX (st_dev, st_ino) should be unique. 1720 */ 1721 return (0); 1722 } 1723 1724 /* ARGSUSED */ 1725 static int 1726 #ifdef __FreeBSD__ 1727 pipe_close(fp, td) 1728 struct file *fp; 1729 struct thread *td; 1730 #else 1731 pipe_close(fp, td) 1732 struct file *fp; 1733 struct proc *td; 1734 #endif 1735 { 1736 struct pipe *cpipe = (struct pipe *)fp->f_data; 1737 1738 #ifdef __FreeBSD__ 1739 fp->f_ops = &badfileops; 1740 funsetown(cpipe->pipe_sigio); 1741 #endif 1742 fp->f_data = NULL; 1743 pipeclose(cpipe); 1744 return (0); 1745 } 1746 1747 static void 1748 pipe_free_kmem(cpipe) 1749 struct pipe *cpipe; 1750 { 1751 1752 #ifdef __FreeBSD__ 1753 1754 GIANT_REQUIRED; 1755 KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)), 1756 ("pipespace: pipe mutex locked")); 1757 #endif 1758 1759 if (cpipe->pipe_buffer.buffer != NULL) { 1760 if (cpipe->pipe_buffer.size > PIPE_SIZE) 1761 --nbigpipe; 1762 amountpipekva -= cpipe->pipe_buffer.size; 1763 #ifdef __FreeBSD__ 1764 kmem_free(kernel_map, 1765 (vm_offset_t)cpipe->pipe_buffer.buffer, 1766 cpipe->pipe_buffer.size); 1767 #elif defined(__NetBSD__) 1768 uvm_km_free(kernel_map, 1769 (vaddr_t)cpipe->pipe_buffer.buffer, 1770 cpipe->pipe_buffer.size); 1771 #endif /* NetBSD */ 1772 cpipe->pipe_buffer.buffer = NULL; 1773 } 1774 #ifndef PIPE_NODIRECT 1775 if (cpipe->pipe_map.kva != 0) { 1776 #ifdef __FreeBSD__ 1777 amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; 1778 kmem_free(kernel_map, 1779 cpipe->pipe_map.kva, 1780 cpipe->pipe_buffer.size + PAGE_SIZE); 1781 #elif defined(__NetBSD__) 1782 pipe_loan_free(cpipe); 1783 #endif /* NetBSD */ 1784 cpipe->pipe_map.cnt = 0; 1785 cpipe->pipe_map.kva = 0; 1786 cpipe->pipe_map.pos = 0; 1787 cpipe->pipe_map.npages = 0; 1788 } 1789 #endif /* !PIPE_NODIRECT */ 1790 } 1791 1792 /* 1793 * shutdown the pipe 1794 */ 1795 static void 1796 pipeclose(cpipe) 1797 struct pipe *cpipe; 1798 { 1799 struct pipe *ppipe; 1800 #ifdef __FreeBSD__ 1801 int hadpeer = 0; 1802 #endif 1803 1804 if (cpipe == NULL) 1805 return; 1806 1807 /* partially created pipes won't have a valid mutex. */ 1808 if (PIPE_MTX(cpipe) != NULL) 1809 PIPE_LOCK(cpipe); 1810 1811 pipeselwakeup(cpipe, cpipe); 1812 1813 /* 1814 * If the other side is blocked, wake it up saying that 1815 * we want to close it down. 1816 */ 1817 while (cpipe->pipe_busy) { 1818 wakeup(cpipe); 1819 cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF; 1820 #ifdef __FreeBSD__ 1821 msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); 1822 #else 1823 tsleep(cpipe, PRIBIO, "pipecl", 0); 1824 #endif 1825 } 1826 1827 /* 1828 * Disconnect from peer 1829 */ 1830 if ((ppipe = cpipe->pipe_peer) != NULL) { 1831 #ifdef __FreeBSD__ 1832 hadpeer++; 1833 #endif 1834 pipeselwakeup(ppipe, ppipe); 1835 1836 ppipe->pipe_state |= PIPE_EOF; 1837 wakeup(ppipe); 1838 #ifdef __FreeBSD__ 1839 KNOTE(&ppipe->pipe_sel.si_note, 0); 1840 #endif 1841 ppipe->pipe_peer = NULL; 1842 } 1843 /* 1844 * free resources 1845 */ 1846 #ifdef __FreeBSD__ 1847 if (PIPE_MTX(cpipe) != NULL) { 1848 PIPE_UNLOCK(cpipe); 1849 if (!hadpeer) { 1850 mtx_destroy(PIPE_MTX(cpipe)); 1851 free(PIPE_MTX(cpipe), M_TEMP); 1852 } 1853 } 1854 mtx_lock(&Giant); 1855 pipe_free_kmem(cpipe); 1856 zfree(pipe_zone, cpipe); 1857 mtx_unlock(&Giant); 1858 #endif 1859 1860 #ifdef __NetBSD__ 1861 if (PIPE_MTX(cpipe) != NULL) 1862 PIPE_UNLOCK(cpipe); 1863 1864 pipe_free_kmem(cpipe); 1865 (void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL); 1866 pool_put(&pipe_pool, cpipe); 1867 #endif 1868 } 1869 1870 #ifdef __FreeBSD__ 1871 /*ARGSUSED*/ 1872 static int 1873 pipe_kqfilter(struct file *fp, struct knote *kn) 1874 { 1875 struct pipe *cpipe; 1876 1877 cpipe = (struct pipe *)kn->kn_fp->f_data; 1878 switch (kn->kn_filter) { 1879 case EVFILT_READ: 1880 kn->kn_fop = &pipe_rfiltops; 1881 break; 1882 case EVFILT_WRITE: 1883 kn->kn_fop = &pipe_wfiltops; 1884 cpipe = cpipe->pipe_peer; 1885 break; 1886 default: 1887 return (1); 1888 } 1889 kn->kn_hook = (caddr_t)cpipe; 1890 1891 PIPE_LOCK(cpipe); 1892 SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext); 1893 PIPE_UNLOCK(cpipe); 1894 return (0); 1895 } 1896 1897 static void 1898 filt_pipedetach(struct knote *kn) 1899 { 1900 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; 1901 1902 PIPE_LOCK(cpipe); 1903 SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext); 1904 PIPE_UNLOCK(cpipe); 1905 } 1906 1907 /*ARGSUSED*/ 1908 static int 1909 filt_piperead(struct knote *kn, long hint) 1910 { 1911 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; 1912 struct pipe *wpipe = rpipe->pipe_peer; 1913 1914 PIPE_LOCK(rpipe); 1915 kn->kn_data = rpipe->pipe_buffer.cnt; 1916 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) 1917 kn->kn_data = rpipe->pipe_map.cnt; 1918 1919 if ((rpipe->pipe_state & PIPE_EOF) || 1920 (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { 1921 kn->kn_flags |= EV_EOF; 1922 PIPE_UNLOCK(rpipe); 1923 return (1); 1924 } 1925 PIPE_UNLOCK(rpipe); 1926 return (kn->kn_data > 0); 1927 } 1928 1929 /*ARGSUSED*/ 1930 static int 1931 filt_pipewrite(struct knote *kn, long hint) 1932 { 1933 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; 1934 struct pipe *wpipe = rpipe->pipe_peer; 1935 1936 PIPE_LOCK(rpipe); 1937 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { 1938 kn->kn_data = 0; 1939 kn->kn_flags |= EV_EOF; 1940 PIPE_UNLOCK(rpipe); 1941 return (1); 1942 } 1943 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 1944 if (wpipe->pipe_state & PIPE_DIRECTW) 1945 kn->kn_data = 0; 1946 1947 PIPE_UNLOCK(rpipe); 1948 return (kn->kn_data >= PIPE_BUF); 1949 } 1950 #endif /* FreeBSD */ 1951 1952 #ifdef __NetBSD__ 1953 static int 1954 pipe_fcntl(fp, cmd, data, p) 1955 struct file *fp; 1956 u_int cmd; 1957 caddr_t data; 1958 struct proc *p; 1959 { 1960 if (cmd == F_SETFL) 1961 return (0); 1962 else 1963 return (EOPNOTSUPP); 1964 } 1965 1966 /* 1967 * Handle pipe sysctls. 1968 */ 1969 int 1970 sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen) 1971 int *name; 1972 u_int namelen; 1973 void *oldp; 1974 size_t *oldlenp; 1975 void *newp; 1976 size_t newlen; 1977 { 1978 /* All sysctl names at this level are terminal. */ 1979 if (namelen != 1) 1980 return (ENOTDIR); /* overloaded */ 1981 1982 switch (name[0]) { 1983 case KERN_PIPE_MAXKVASZ: 1984 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva)); 1985 case KERN_PIPE_LIMITKVA: 1986 return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva)); 1987 case KERN_PIPE_MAXBIGPIPES: 1988 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes)); 1989 case KERN_PIPE_NBIGPIPES: 1990 return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe)); 1991 case KERN_PIPE_KVASIZE: 1992 return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva)); 1993 default: 1994 return (EOPNOTSUPP); 1995 } 1996 /* NOTREACHED */ 1997 } 1998 1999 /* 2000 * Initialize pipe structs. 2001 */ 2002 void 2003 pipe_init(void) 2004 { 2005 pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl", NULL); 2006 } 2007 2008 #endif /* __NetBSD __ */ 2009