1 /* 2 * Copyright (c) 1996 John S. Dyson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice immediately at the beginning of the file, without modification, 10 * this list of conditions, and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. Absolutely no warranty of function or purpose is made by the author 15 * John S. Dyson. 16 * 4. Modifications may be freely made to this file if the above conditions 17 * are met. 18 * 19 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.60.2.13 2002/08/05 15:05:15 des Exp $ 20 * $DragonFly: src/sys/kern/sys_pipe.c,v 1.13 2003/11/03 17:11:21 dillon Exp $ 21 */ 22 23 /* 24 * This file contains a high-performance replacement for the socket-based 25 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support 26 * all features of sockets, but does do everything that pipes normally 27 * do. 28 */ 29 30 /* 31 * This code has two modes of operation, a small write mode and a large 32 * write mode. The small write mode acts like conventional pipes with 33 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the 34 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT 35 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and 36 * the receiving process can copy it directly from the pages in the sending 37 * process. 38 * 39 * If the sending process receives a signal, it is possible that it will 40 * go away, and certainly its address space can change, because control 41 * is returned back to the user-mode side. In that case, the pipe code 42 * arranges to copy the buffer supplied by the user process, to a pageable 43 * kernel buffer, and the receiving process will grab the data from the 44 * pageable kernel buffer. Since signals don't happen all that often, 45 * the copy operation is normally eliminated. 46 * 47 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will 48 * happen for small transfers so that the system will not spend all of 49 * its time context switching. PIPE_SIZE is constrained by the 50 * amount of kernel virtual memory. 51 */ 52 53 #include <sys/param.h> 54 #include <sys/systm.h> 55 #include <sys/proc.h> 56 #include <sys/fcntl.h> 57 #include <sys/file.h> 58 #include <sys/filedesc.h> 59 #include <sys/filio.h> 60 #include <sys/ttycom.h> 61 #include <sys/stat.h> 62 #include <sys/poll.h> 63 #include <sys/select.h> 64 #include <sys/signalvar.h> 65 #include <sys/sysproto.h> 66 #include <sys/pipe.h> 67 #include <sys/vnode.h> 68 #include <sys/uio.h> 69 #include <sys/event.h> 70 71 #include <vm/vm.h> 72 #include <vm/vm_param.h> 73 #include <sys/lock.h> 74 #include <vm/vm_object.h> 75 #include <vm/vm_kern.h> 76 #include <vm/vm_extern.h> 77 #include <vm/pmap.h> 78 #include <vm/vm_map.h> 79 #include <vm/vm_page.h> 80 #include <vm/vm_zone.h> 81 82 #include <sys/file2.h> 83 84 /* 85 * Use this define if you want to disable *fancy* VM things. Expect an 86 * approx 30% decrease in transfer rate. This could be useful for 87 * NetBSD or OpenBSD. 88 */ 89 /* #define PIPE_NODIRECT */ 90 91 /* 92 * interfaces to the outside world 93 */ 94 static int pipe_read (struct file *fp, struct uio *uio, 95 struct ucred *cred, int flags, struct thread *td); 96 static int pipe_write (struct file *fp, struct uio *uio, 97 struct ucred *cred, int flags, struct thread *td); 98 static int pipe_close (struct file *fp, struct thread *td); 99 static int pipe_poll (struct file *fp, int events, struct ucred *cred, 100 struct thread *td); 101 static int pipe_kqfilter (struct file *fp, struct knote *kn); 102 static int pipe_stat (struct file *fp, struct stat *sb, struct thread *td); 103 static int pipe_ioctl (struct file *fp, u_long cmd, caddr_t data, struct thread *td); 104 105 static struct fileops pipeops = { 106 NULL, /* port */ 107 0, /* autoq */ 108 pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter, 109 pipe_stat, pipe_close 110 }; 111 112 static void filt_pipedetach(struct knote *kn); 113 static int filt_piperead(struct knote *kn, long hint); 114 static int filt_pipewrite(struct knote *kn, long hint); 115 116 static struct filterops pipe_rfiltops = 117 { 1, NULL, filt_pipedetach, filt_piperead }; 118 static struct filterops pipe_wfiltops = 119 { 1, NULL, filt_pipedetach, filt_pipewrite }; 120 121 122 /* 123 * Default pipe buffer size(s), this can be kind-of large now because pipe 124 * space is pageable. The pipe code will try to maintain locality of 125 * reference for performance reasons, so small amounts of outstanding I/O 126 * will not wipe the cache. 127 */ 128 #define MINPIPESIZE (PIPE_SIZE/3) 129 #define MAXPIPESIZE (2*PIPE_SIZE/3) 130 131 /* 132 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but 133 * is there so that on large systems, we don't exhaust it. 134 */ 135 #define MAXPIPEKVA (8*1024*1024) 136 137 /* 138 * Limit for direct transfers, we cannot, of course limit 139 * the amount of kva for pipes in general though. 140 */ 141 #define LIMITPIPEKVA (16*1024*1024) 142 143 /* 144 * Limit the number of "big" pipes 145 */ 146 #define LIMITBIGPIPES 32 147 static int nbigpipe; 148 149 static int amountpipekva; 150 151 static void pipeclose (struct pipe *cpipe); 152 static void pipe_free_kmem (struct pipe *cpipe); 153 static int pipe_create (struct pipe **cpipep); 154 static __inline int pipelock (struct pipe *cpipe, int catch); 155 static __inline void pipeunlock (struct pipe *cpipe); 156 static __inline void pipeselwakeup (struct pipe *cpipe); 157 #ifndef PIPE_NODIRECT 158 static int pipe_build_write_buffer (struct pipe *wpipe, struct uio *uio); 159 static void pipe_destroy_write_buffer (struct pipe *wpipe); 160 static int pipe_direct_write (struct pipe *wpipe, struct uio *uio); 161 static void pipe_clone_write_buffer (struct pipe *wpipe); 162 #endif 163 static int pipespace (struct pipe *cpipe, int size); 164 165 static vm_zone_t pipe_zone; 166 167 /* 168 * The pipe system call for the DTYPE_PIPE type of pipes 169 * 170 * pipe_ARgs(int dummy) 171 */ 172 173 /* ARGSUSED */ 174 int 175 pipe(struct pipe_args *uap) 176 { 177 struct thread *td = curthread; 178 struct proc *p = td->td_proc; 179 struct filedesc *fdp; 180 struct file *rf, *wf; 181 struct pipe *rpipe, *wpipe; 182 int fd1, fd2, error; 183 184 KKASSERT(p); 185 fdp = p->p_fd; 186 187 if (pipe_zone == NULL) 188 pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4); 189 190 rpipe = wpipe = NULL; 191 if (pipe_create(&rpipe) || pipe_create(&wpipe)) { 192 pipeclose(rpipe); 193 pipeclose(wpipe); 194 return (ENFILE); 195 } 196 197 rpipe->pipe_state |= PIPE_DIRECTOK; 198 wpipe->pipe_state |= PIPE_DIRECTOK; 199 200 error = falloc(p, &rf, &fd1); 201 if (error) { 202 pipeclose(rpipe); 203 pipeclose(wpipe); 204 return (error); 205 } 206 fhold(rf); 207 uap->sysmsg_fds[0] = fd1; 208 209 /* 210 * Warning: once we've gotten past allocation of the fd for the 211 * read-side, we can only drop the read side via fdrop() in order 212 * to avoid races against processes which manage to dup() the read 213 * side while we are blocked trying to allocate the write side. 214 */ 215 rf->f_flag = FREAD | FWRITE; 216 rf->f_type = DTYPE_PIPE; 217 rf->f_data = (caddr_t)rpipe; 218 rf->f_ops = &pipeops; 219 error = falloc(p, &wf, &fd2); 220 if (error) { 221 if (fdp->fd_ofiles[fd1] == rf) { 222 fdp->fd_ofiles[fd1] = NULL; 223 fdrop(rf, td); 224 } 225 fdrop(rf, td); 226 /* rpipe has been closed by fdrop(). */ 227 pipeclose(wpipe); 228 return (error); 229 } 230 wf->f_flag = FREAD | FWRITE; 231 wf->f_type = DTYPE_PIPE; 232 wf->f_data = (caddr_t)wpipe; 233 wf->f_ops = &pipeops; 234 uap->sysmsg_fds[1] = fd2; 235 236 rpipe->pipe_peer = wpipe; 237 wpipe->pipe_peer = rpipe; 238 fdrop(rf, td); 239 240 return (0); 241 } 242 243 /* 244 * Allocate kva for pipe circular buffer, the space is pageable 245 * This routine will 'realloc' the size of a pipe safely, if it fails 246 * it will retain the old buffer. 247 * If it fails it will return ENOMEM. 248 */ 249 static int 250 pipespace(cpipe, size) 251 struct pipe *cpipe; 252 int size; 253 { 254 struct vm_object *object; 255 caddr_t buffer; 256 int npages, error; 257 258 npages = round_page(size)/PAGE_SIZE; 259 /* 260 * Create an object, I don't like the idea of paging to/from 261 * kernel_object. 262 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 263 */ 264 object = vm_object_allocate(OBJT_DEFAULT, npages); 265 buffer = (caddr_t) vm_map_min(kernel_map); 266 267 /* 268 * Insert the object into the kernel map, and allocate kva for it. 269 * The map entry is, by default, pageable. 270 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 271 */ 272 error = vm_map_find(kernel_map, object, 0, 273 (vm_offset_t *) &buffer, size, 1, 274 VM_PROT_ALL, VM_PROT_ALL, 0); 275 276 if (error != KERN_SUCCESS) { 277 vm_object_deallocate(object); 278 return (ENOMEM); 279 } 280 281 /* free old resources if we're resizing */ 282 pipe_free_kmem(cpipe); 283 cpipe->pipe_buffer.object = object; 284 cpipe->pipe_buffer.buffer = buffer; 285 cpipe->pipe_buffer.size = size; 286 cpipe->pipe_buffer.in = 0; 287 cpipe->pipe_buffer.out = 0; 288 cpipe->pipe_buffer.cnt = 0; 289 amountpipekva += cpipe->pipe_buffer.size; 290 return (0); 291 } 292 293 /* 294 * initialize and allocate VM and memory for pipe 295 */ 296 static int 297 pipe_create(cpipep) 298 struct pipe **cpipep; 299 { 300 struct pipe *cpipe; 301 int error; 302 303 *cpipep = zalloc(pipe_zone); 304 if (*cpipep == NULL) 305 return (ENOMEM); 306 307 cpipe = *cpipep; 308 309 /* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */ 310 cpipe->pipe_buffer.object = NULL; 311 #ifndef PIPE_NODIRECT 312 cpipe->pipe_map.kva = NULL; 313 #endif 314 /* 315 * protect so pipeclose() doesn't follow a junk pointer 316 * if pipespace() fails. 317 */ 318 bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel)); 319 cpipe->pipe_state = 0; 320 cpipe->pipe_peer = NULL; 321 cpipe->pipe_busy = 0; 322 323 #ifndef PIPE_NODIRECT 324 /* 325 * pipe data structure initializations to support direct pipe I/O 326 */ 327 cpipe->pipe_map.cnt = 0; 328 cpipe->pipe_map.kva = 0; 329 cpipe->pipe_map.pos = 0; 330 cpipe->pipe_map.npages = 0; 331 /* cpipe->pipe_map.ms[] = invalid */ 332 #endif 333 334 error = pipespace(cpipe, PIPE_SIZE); 335 if (error) 336 return (error); 337 338 vfs_timestamp(&cpipe->pipe_ctime); 339 cpipe->pipe_atime = cpipe->pipe_ctime; 340 cpipe->pipe_mtime = cpipe->pipe_ctime; 341 342 return (0); 343 } 344 345 346 /* 347 * lock a pipe for I/O, blocking other access 348 */ 349 static __inline int 350 pipelock(cpipe, catch) 351 struct pipe *cpipe; 352 int catch; 353 { 354 int error; 355 356 while (cpipe->pipe_state & PIPE_LOCK) { 357 cpipe->pipe_state |= PIPE_LWANT; 358 error = tsleep(cpipe, (catch ? PCATCH : 0), "pipelk", 0); 359 if (error != 0) 360 return (error); 361 } 362 cpipe->pipe_state |= PIPE_LOCK; 363 return (0); 364 } 365 366 /* 367 * unlock a pipe I/O lock 368 */ 369 static __inline void 370 pipeunlock(cpipe) 371 struct pipe *cpipe; 372 { 373 374 cpipe->pipe_state &= ~PIPE_LOCK; 375 if (cpipe->pipe_state & PIPE_LWANT) { 376 cpipe->pipe_state &= ~PIPE_LWANT; 377 wakeup(cpipe); 378 } 379 } 380 381 static __inline void 382 pipeselwakeup(cpipe) 383 struct pipe *cpipe; 384 { 385 386 if (cpipe->pipe_state & PIPE_SEL) { 387 cpipe->pipe_state &= ~PIPE_SEL; 388 selwakeup(&cpipe->pipe_sel); 389 } 390 if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) 391 pgsigio(cpipe->pipe_sigio, SIGIO, 0); 392 KNOTE(&cpipe->pipe_sel.si_note, 0); 393 } 394 395 /* ARGSUSED */ 396 static int 397 pipe_read(struct file *fp, struct uio *uio, struct ucred *cred, 398 int flags, struct thread *td) 399 { 400 struct pipe *rpipe = (struct pipe *) fp->f_data; 401 int error; 402 int nread = 0; 403 u_int size; 404 405 ++rpipe->pipe_busy; 406 error = pipelock(rpipe, 1); 407 if (error) 408 goto unlocked_error; 409 410 while (uio->uio_resid) { 411 /* 412 * normal pipe buffer receive 413 */ 414 if (rpipe->pipe_buffer.cnt > 0) { 415 size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; 416 if (size > rpipe->pipe_buffer.cnt) 417 size = rpipe->pipe_buffer.cnt; 418 if (size > (u_int) uio->uio_resid) 419 size = (u_int) uio->uio_resid; 420 421 error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], 422 size, uio); 423 if (error) 424 break; 425 426 rpipe->pipe_buffer.out += size; 427 if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) 428 rpipe->pipe_buffer.out = 0; 429 430 rpipe->pipe_buffer.cnt -= size; 431 432 /* 433 * If there is no more to read in the pipe, reset 434 * its pointers to the beginning. This improves 435 * cache hit stats. 436 */ 437 if (rpipe->pipe_buffer.cnt == 0) { 438 rpipe->pipe_buffer.in = 0; 439 rpipe->pipe_buffer.out = 0; 440 } 441 nread += size; 442 #ifndef PIPE_NODIRECT 443 /* 444 * Direct copy, bypassing a kernel buffer. 445 */ 446 } else if ((size = rpipe->pipe_map.cnt) && 447 (rpipe->pipe_state & PIPE_DIRECTW)) { 448 caddr_t va; 449 if (size > (u_int) uio->uio_resid) 450 size = (u_int) uio->uio_resid; 451 452 va = (caddr_t) rpipe->pipe_map.kva + 453 rpipe->pipe_map.pos; 454 error = uiomove(va, size, uio); 455 if (error) 456 break; 457 nread += size; 458 rpipe->pipe_map.pos += size; 459 rpipe->pipe_map.cnt -= size; 460 if (rpipe->pipe_map.cnt == 0) { 461 rpipe->pipe_state &= ~PIPE_DIRECTW; 462 wakeup(rpipe); 463 } 464 #endif 465 } else { 466 /* 467 * detect EOF condition 468 * read returns 0 on EOF, no need to set error 469 */ 470 if (rpipe->pipe_state & PIPE_EOF) 471 break; 472 473 /* 474 * If the "write-side" has been blocked, wake it up now. 475 */ 476 if (rpipe->pipe_state & PIPE_WANTW) { 477 rpipe->pipe_state &= ~PIPE_WANTW; 478 wakeup(rpipe); 479 } 480 481 /* 482 * Break if some data was read. 483 */ 484 if (nread > 0) 485 break; 486 487 /* 488 * Unlock the pipe buffer for our remaining processing. We 489 * will either break out with an error or we will sleep and 490 * relock to loop. 491 */ 492 pipeunlock(rpipe); 493 494 /* 495 * Handle non-blocking mode operation or 496 * wait for more data. 497 */ 498 if (fp->f_flag & FNONBLOCK) { 499 error = EAGAIN; 500 } else { 501 rpipe->pipe_state |= PIPE_WANTR; 502 if ((error = tsleep(rpipe, PCATCH, 503 "piperd", 0)) == 0) { 504 error = pipelock(rpipe, 1); 505 } 506 } 507 if (error) 508 goto unlocked_error; 509 } 510 } 511 pipeunlock(rpipe); 512 513 if (error == 0) 514 vfs_timestamp(&rpipe->pipe_atime); 515 unlocked_error: 516 --rpipe->pipe_busy; 517 518 /* 519 * PIPE_WANT processing only makes sense if pipe_busy is 0. 520 */ 521 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { 522 rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); 523 wakeup(rpipe); 524 } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { 525 /* 526 * Handle write blocking hysteresis. 527 */ 528 if (rpipe->pipe_state & PIPE_WANTW) { 529 rpipe->pipe_state &= ~PIPE_WANTW; 530 wakeup(rpipe); 531 } 532 } 533 534 if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) 535 pipeselwakeup(rpipe); 536 537 return (error); 538 } 539 540 #ifndef PIPE_NODIRECT 541 /* 542 * Map the sending processes' buffer into kernel space and wire it. 543 * This is similar to a physical write operation. 544 */ 545 static int 546 pipe_build_write_buffer(wpipe, uio) 547 struct pipe *wpipe; 548 struct uio *uio; 549 { 550 u_int size; 551 int i; 552 vm_offset_t addr, endaddr; 553 vm_paddr_t paddr; 554 555 size = (u_int) uio->uio_iov->iov_len; 556 if (size > wpipe->pipe_buffer.size) 557 size = wpipe->pipe_buffer.size; 558 559 endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); 560 addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); 561 for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { 562 vm_page_t m; 563 564 if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 || 565 (paddr = pmap_kextract(addr)) == 0) { 566 int j; 567 568 for (j = 0; j < i; j++) 569 vm_page_unhold(wpipe->pipe_map.ms[j]); 570 return (EFAULT); 571 } 572 573 m = PHYS_TO_VM_PAGE(paddr); 574 vm_page_hold(m); 575 wpipe->pipe_map.ms[i] = m; 576 } 577 578 /* 579 * set up the control block 580 */ 581 wpipe->pipe_map.npages = i; 582 wpipe->pipe_map.pos = 583 ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; 584 wpipe->pipe_map.cnt = size; 585 586 /* 587 * and map the buffer 588 */ 589 if (wpipe->pipe_map.kva == 0) { 590 /* 591 * We need to allocate space for an extra page because the 592 * address range might (will) span pages at times. 593 */ 594 wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, 595 wpipe->pipe_buffer.size + PAGE_SIZE); 596 amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; 597 } 598 pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, 599 wpipe->pipe_map.npages); 600 601 /* 602 * and update the uio data 603 */ 604 605 uio->uio_iov->iov_len -= size; 606 uio->uio_iov->iov_base += size; 607 if (uio->uio_iov->iov_len == 0) 608 uio->uio_iov++; 609 uio->uio_resid -= size; 610 uio->uio_offset += size; 611 return (0); 612 } 613 614 /* 615 * unmap and unwire the process buffer 616 */ 617 static void 618 pipe_destroy_write_buffer(wpipe) 619 struct pipe *wpipe; 620 { 621 int i; 622 623 if (wpipe->pipe_map.kva) { 624 pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); 625 626 if (amountpipekva > MAXPIPEKVA) { 627 vm_offset_t kva = wpipe->pipe_map.kva; 628 wpipe->pipe_map.kva = 0; 629 kmem_free(kernel_map, kva, 630 wpipe->pipe_buffer.size + PAGE_SIZE); 631 amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; 632 } 633 } 634 for (i = 0; i < wpipe->pipe_map.npages; i++) 635 vm_page_unhold(wpipe->pipe_map.ms[i]); 636 wpipe->pipe_map.npages = 0; 637 } 638 639 /* 640 * In the case of a signal, the writing process might go away. This 641 * code copies the data into the circular buffer so that the source 642 * pages can be freed without loss of data. 643 */ 644 static void 645 pipe_clone_write_buffer(wpipe) 646 struct pipe *wpipe; 647 { 648 int size; 649 int pos; 650 651 size = wpipe->pipe_map.cnt; 652 pos = wpipe->pipe_map.pos; 653 bcopy((caddr_t) wpipe->pipe_map.kva + pos, 654 (caddr_t) wpipe->pipe_buffer.buffer, size); 655 656 wpipe->pipe_buffer.in = size; 657 wpipe->pipe_buffer.out = 0; 658 wpipe->pipe_buffer.cnt = size; 659 wpipe->pipe_state &= ~PIPE_DIRECTW; 660 661 pipe_destroy_write_buffer(wpipe); 662 } 663 664 /* 665 * This implements the pipe buffer write mechanism. Note that only 666 * a direct write OR a normal pipe write can be pending at any given time. 667 * If there are any characters in the pipe buffer, the direct write will 668 * be deferred until the receiving process grabs all of the bytes from 669 * the pipe buffer. Then the direct mapping write is set-up. 670 */ 671 static int 672 pipe_direct_write(wpipe, uio) 673 struct pipe *wpipe; 674 struct uio *uio; 675 { 676 int error; 677 678 retry: 679 while (wpipe->pipe_state & PIPE_DIRECTW) { 680 if (wpipe->pipe_state & PIPE_WANTR) { 681 wpipe->pipe_state &= ~PIPE_WANTR; 682 wakeup(wpipe); 683 } 684 wpipe->pipe_state |= PIPE_WANTW; 685 error = tsleep(wpipe, PCATCH, "pipdww", 0); 686 if (error) 687 goto error1; 688 if (wpipe->pipe_state & PIPE_EOF) { 689 error = EPIPE; 690 goto error1; 691 } 692 } 693 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ 694 if (wpipe->pipe_buffer.cnt > 0) { 695 if (wpipe->pipe_state & PIPE_WANTR) { 696 wpipe->pipe_state &= ~PIPE_WANTR; 697 wakeup(wpipe); 698 } 699 700 wpipe->pipe_state |= PIPE_WANTW; 701 error = tsleep(wpipe, PCATCH, "pipdwc", 0); 702 if (error) 703 goto error1; 704 if (wpipe->pipe_state & PIPE_EOF) { 705 error = EPIPE; 706 goto error1; 707 } 708 goto retry; 709 } 710 711 wpipe->pipe_state |= PIPE_DIRECTW; 712 713 error = pipe_build_write_buffer(wpipe, uio); 714 if (error) { 715 wpipe->pipe_state &= ~PIPE_DIRECTW; 716 goto error1; 717 } 718 719 error = 0; 720 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { 721 if (wpipe->pipe_state & PIPE_EOF) { 722 pipelock(wpipe, 0); 723 pipe_destroy_write_buffer(wpipe); 724 pipeunlock(wpipe); 725 pipeselwakeup(wpipe); 726 error = EPIPE; 727 goto error1; 728 } 729 if (wpipe->pipe_state & PIPE_WANTR) { 730 wpipe->pipe_state &= ~PIPE_WANTR; 731 wakeup(wpipe); 732 } 733 pipeselwakeup(wpipe); 734 error = tsleep(wpipe, PCATCH, "pipdwt", 0); 735 } 736 737 pipelock(wpipe,0); 738 if (wpipe->pipe_state & PIPE_DIRECTW) { 739 /* 740 * this bit of trickery substitutes a kernel buffer for 741 * the process that might be going away. 742 */ 743 pipe_clone_write_buffer(wpipe); 744 } else { 745 pipe_destroy_write_buffer(wpipe); 746 } 747 pipeunlock(wpipe); 748 return (error); 749 750 error1: 751 wakeup(wpipe); 752 return (error); 753 } 754 #endif 755 756 static int 757 pipe_write(struct file *fp, struct uio *uio, struct ucred *cred, 758 int flags, struct thread *td) 759 { 760 int error = 0; 761 int orig_resid; 762 struct pipe *wpipe, *rpipe; 763 764 rpipe = (struct pipe *) fp->f_data; 765 wpipe = rpipe->pipe_peer; 766 767 /* 768 * detect loss of pipe read side, issue SIGPIPE if lost. 769 */ 770 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { 771 return (EPIPE); 772 } 773 ++wpipe->pipe_busy; 774 775 /* 776 * If it is advantageous to resize the pipe buffer, do 777 * so. 778 */ 779 if ((uio->uio_resid > PIPE_SIZE) && 780 (nbigpipe < LIMITBIGPIPES) && 781 (wpipe->pipe_state & PIPE_DIRECTW) == 0 && 782 (wpipe->pipe_buffer.size <= PIPE_SIZE) && 783 (wpipe->pipe_buffer.cnt == 0)) { 784 785 if ((error = pipelock(wpipe,1)) == 0) { 786 if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) 787 nbigpipe++; 788 pipeunlock(wpipe); 789 } 790 } 791 792 /* 793 * If an early error occured unbusy and return, waking up any pending 794 * readers. 795 */ 796 if (error) { 797 --wpipe->pipe_busy; 798 if ((wpipe->pipe_busy == 0) && 799 (wpipe->pipe_state & PIPE_WANT)) { 800 wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); 801 wakeup(wpipe); 802 } 803 return(error); 804 } 805 806 KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone")); 807 808 orig_resid = uio->uio_resid; 809 810 while (uio->uio_resid) { 811 int space; 812 813 #ifndef PIPE_NODIRECT 814 /* 815 * If the transfer is large, we can gain performance if 816 * we do process-to-process copies directly. 817 * If the write is non-blocking, we don't use the 818 * direct write mechanism. 819 * 820 * The direct write mechanism will detect the reader going 821 * away on us. 822 */ 823 if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && 824 (fp->f_flag & FNONBLOCK) == 0 && 825 (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && 826 (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { 827 error = pipe_direct_write( wpipe, uio); 828 if (error) 829 break; 830 continue; 831 } 832 #endif 833 834 /* 835 * Pipe buffered writes cannot be coincidental with 836 * direct writes. We wait until the currently executing 837 * direct write is completed before we start filling the 838 * pipe buffer. We break out if a signal occurs or the 839 * reader goes away. 840 */ 841 retrywrite: 842 while (wpipe->pipe_state & PIPE_DIRECTW) { 843 if (wpipe->pipe_state & PIPE_WANTR) { 844 wpipe->pipe_state &= ~PIPE_WANTR; 845 wakeup(wpipe); 846 } 847 error = tsleep(wpipe, PCATCH, "pipbww", 0); 848 if (wpipe->pipe_state & PIPE_EOF) 849 break; 850 if (error) 851 break; 852 } 853 if (wpipe->pipe_state & PIPE_EOF) { 854 error = EPIPE; 855 break; 856 } 857 858 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 859 860 /* Writes of size <= PIPE_BUF must be atomic. */ 861 if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) 862 space = 0; 863 864 if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { 865 if ((error = pipelock(wpipe,1)) == 0) { 866 int size; /* Transfer size */ 867 int segsize; /* first segment to transfer */ 868 869 /* 870 * It is possible for a direct write to 871 * slip in on us... handle it here... 872 */ 873 if (wpipe->pipe_state & PIPE_DIRECTW) { 874 pipeunlock(wpipe); 875 goto retrywrite; 876 } 877 /* 878 * If a process blocked in uiomove, our 879 * value for space might be bad. 880 * 881 * XXX will we be ok if the reader has gone 882 * away here? 883 */ 884 if (space > wpipe->pipe_buffer.size - 885 wpipe->pipe_buffer.cnt) { 886 pipeunlock(wpipe); 887 goto retrywrite; 888 } 889 890 /* 891 * Transfer size is minimum of uio transfer 892 * and free space in pipe buffer. 893 */ 894 if (space > uio->uio_resid) 895 size = uio->uio_resid; 896 else 897 size = space; 898 /* 899 * First segment to transfer is minimum of 900 * transfer size and contiguous space in 901 * pipe buffer. If first segment to transfer 902 * is less than the transfer size, we've got 903 * a wraparound in the buffer. 904 */ 905 segsize = wpipe->pipe_buffer.size - 906 wpipe->pipe_buffer.in; 907 if (segsize > size) 908 segsize = size; 909 910 /* Transfer first segment */ 911 912 error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], 913 segsize, uio); 914 915 if (error == 0 && segsize < size) { 916 /* 917 * Transfer remaining part now, to 918 * support atomic writes. Wraparound 919 * happened. 920 */ 921 if (wpipe->pipe_buffer.in + segsize != 922 wpipe->pipe_buffer.size) 923 panic("Expected pipe buffer wraparound disappeared"); 924 925 error = uiomove(&wpipe->pipe_buffer.buffer[0], 926 size - segsize, uio); 927 } 928 if (error == 0) { 929 wpipe->pipe_buffer.in += size; 930 if (wpipe->pipe_buffer.in >= 931 wpipe->pipe_buffer.size) { 932 if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size) 933 panic("Expected wraparound bad"); 934 wpipe->pipe_buffer.in = size - segsize; 935 } 936 937 wpipe->pipe_buffer.cnt += size; 938 if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size) 939 panic("Pipe buffer overflow"); 940 941 } 942 pipeunlock(wpipe); 943 } 944 if (error) 945 break; 946 947 } else { 948 /* 949 * If the "read-side" has been blocked, wake it up now. 950 */ 951 if (wpipe->pipe_state & PIPE_WANTR) { 952 wpipe->pipe_state &= ~PIPE_WANTR; 953 wakeup(wpipe); 954 } 955 956 /* 957 * don't block on non-blocking I/O 958 */ 959 if (fp->f_flag & FNONBLOCK) { 960 error = EAGAIN; 961 break; 962 } 963 964 /* 965 * We have no more space and have something to offer, 966 * wake up select/poll. 967 */ 968 pipeselwakeup(wpipe); 969 970 wpipe->pipe_state |= PIPE_WANTW; 971 error = tsleep(wpipe, PCATCH, "pipewr", 0); 972 if (error != 0) 973 break; 974 /* 975 * If read side wants to go away, we just issue a signal 976 * to ourselves. 977 */ 978 if (wpipe->pipe_state & PIPE_EOF) { 979 error = EPIPE; 980 break; 981 } 982 } 983 } 984 985 --wpipe->pipe_busy; 986 987 if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { 988 wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); 989 wakeup(wpipe); 990 } else if (wpipe->pipe_buffer.cnt > 0) { 991 /* 992 * If we have put any characters in the buffer, we wake up 993 * the reader. 994 */ 995 if (wpipe->pipe_state & PIPE_WANTR) { 996 wpipe->pipe_state &= ~PIPE_WANTR; 997 wakeup(wpipe); 998 } 999 } 1000 1001 /* 1002 * Don't return EPIPE if I/O was successful 1003 */ 1004 if ((wpipe->pipe_buffer.cnt == 0) && 1005 (uio->uio_resid == 0) && 1006 (error == EPIPE)) { 1007 error = 0; 1008 } 1009 1010 if (error == 0) 1011 vfs_timestamp(&wpipe->pipe_mtime); 1012 1013 /* 1014 * We have something to offer, 1015 * wake up select/poll. 1016 */ 1017 if (wpipe->pipe_buffer.cnt) 1018 pipeselwakeup(wpipe); 1019 1020 return (error); 1021 } 1022 1023 /* 1024 * we implement a very minimal set of ioctls for compatibility with sockets. 1025 */ 1026 int 1027 pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct thread *td) 1028 { 1029 struct pipe *mpipe = (struct pipe *)fp->f_data; 1030 1031 switch (cmd) { 1032 1033 case FIONBIO: 1034 return (0); 1035 1036 case FIOASYNC: 1037 if (*(int *)data) { 1038 mpipe->pipe_state |= PIPE_ASYNC; 1039 } else { 1040 mpipe->pipe_state &= ~PIPE_ASYNC; 1041 } 1042 return (0); 1043 1044 case FIONREAD: 1045 if (mpipe->pipe_state & PIPE_DIRECTW) 1046 *(int *)data = mpipe->pipe_map.cnt; 1047 else 1048 *(int *)data = mpipe->pipe_buffer.cnt; 1049 return (0); 1050 1051 case FIOSETOWN: 1052 return (fsetown(*(int *)data, &mpipe->pipe_sigio)); 1053 1054 case FIOGETOWN: 1055 *(int *)data = fgetown(mpipe->pipe_sigio); 1056 return (0); 1057 1058 /* This is deprecated, FIOSETOWN should be used instead. */ 1059 case TIOCSPGRP: 1060 return (fsetown(-(*(int *)data), &mpipe->pipe_sigio)); 1061 1062 /* This is deprecated, FIOGETOWN should be used instead. */ 1063 case TIOCGPGRP: 1064 *(int *)data = -fgetown(mpipe->pipe_sigio); 1065 return (0); 1066 1067 } 1068 return (ENOTTY); 1069 } 1070 1071 int 1072 pipe_poll(struct file *fp, int events, struct ucred *cred, struct thread *td) 1073 { 1074 struct pipe *rpipe = (struct pipe *)fp->f_data; 1075 struct pipe *wpipe; 1076 int revents = 0; 1077 1078 wpipe = rpipe->pipe_peer; 1079 if (events & (POLLIN | POLLRDNORM)) 1080 if ((rpipe->pipe_state & PIPE_DIRECTW) || 1081 (rpipe->pipe_buffer.cnt > 0) || 1082 (rpipe->pipe_state & PIPE_EOF)) 1083 revents |= events & (POLLIN | POLLRDNORM); 1084 1085 if (events & (POLLOUT | POLLWRNORM)) 1086 if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) || 1087 (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && 1088 (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) 1089 revents |= events & (POLLOUT | POLLWRNORM); 1090 1091 if ((rpipe->pipe_state & PIPE_EOF) || 1092 (wpipe == NULL) || 1093 (wpipe->pipe_state & PIPE_EOF)) 1094 revents |= POLLHUP; 1095 1096 if (revents == 0) { 1097 if (events & (POLLIN | POLLRDNORM)) { 1098 selrecord(td, &rpipe->pipe_sel); 1099 rpipe->pipe_state |= PIPE_SEL; 1100 } 1101 1102 if (events & (POLLOUT | POLLWRNORM)) { 1103 selrecord(td, &wpipe->pipe_sel); 1104 wpipe->pipe_state |= PIPE_SEL; 1105 } 1106 } 1107 1108 return (revents); 1109 } 1110 1111 static int 1112 pipe_stat(struct file *fp, struct stat *ub, struct thread *td) 1113 { 1114 struct pipe *pipe = (struct pipe *)fp->f_data; 1115 1116 bzero((caddr_t)ub, sizeof(*ub)); 1117 ub->st_mode = S_IFIFO; 1118 ub->st_blksize = pipe->pipe_buffer.size; 1119 ub->st_size = pipe->pipe_buffer.cnt; 1120 ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; 1121 ub->st_atimespec = pipe->pipe_atime; 1122 ub->st_mtimespec = pipe->pipe_mtime; 1123 ub->st_ctimespec = pipe->pipe_ctime; 1124 /* 1125 * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev, 1126 * st_flags, st_gen. 1127 * XXX (st_dev, st_ino) should be unique. 1128 */ 1129 return (0); 1130 } 1131 1132 /* ARGSUSED */ 1133 static int 1134 pipe_close(struct file *fp, struct thread *td) 1135 { 1136 struct pipe *cpipe = (struct pipe *)fp->f_data; 1137 1138 fp->f_ops = &badfileops; 1139 fp->f_data = NULL; 1140 funsetown(cpipe->pipe_sigio); 1141 pipeclose(cpipe); 1142 return (0); 1143 } 1144 1145 static void 1146 pipe_free_kmem(struct pipe *cpipe) 1147 { 1148 1149 if (cpipe->pipe_buffer.buffer != NULL) { 1150 if (cpipe->pipe_buffer.size > PIPE_SIZE) 1151 --nbigpipe; 1152 amountpipekva -= cpipe->pipe_buffer.size; 1153 kmem_free(kernel_map, 1154 (vm_offset_t)cpipe->pipe_buffer.buffer, 1155 cpipe->pipe_buffer.size); 1156 cpipe->pipe_buffer.buffer = NULL; 1157 } 1158 #ifndef PIPE_NODIRECT 1159 if (cpipe->pipe_map.kva != NULL) { 1160 amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; 1161 kmem_free(kernel_map, 1162 cpipe->pipe_map.kva, 1163 cpipe->pipe_buffer.size + PAGE_SIZE); 1164 cpipe->pipe_map.cnt = 0; 1165 cpipe->pipe_map.kva = 0; 1166 cpipe->pipe_map.pos = 0; 1167 cpipe->pipe_map.npages = 0; 1168 } 1169 #endif 1170 } 1171 1172 /* 1173 * shutdown the pipe 1174 */ 1175 static void 1176 pipeclose(struct pipe *cpipe) 1177 { 1178 struct pipe *ppipe; 1179 1180 if (cpipe) { 1181 1182 pipeselwakeup(cpipe); 1183 1184 /* 1185 * If the other side is blocked, wake it up saying that 1186 * we want to close it down. 1187 */ 1188 while (cpipe->pipe_busy) { 1189 wakeup(cpipe); 1190 cpipe->pipe_state |= PIPE_WANT | PIPE_EOF; 1191 tsleep(cpipe, 0, "pipecl", 0); 1192 } 1193 1194 /* 1195 * Disconnect from peer 1196 */ 1197 if ((ppipe = cpipe->pipe_peer) != NULL) { 1198 pipeselwakeup(ppipe); 1199 1200 ppipe->pipe_state |= PIPE_EOF; 1201 wakeup(ppipe); 1202 KNOTE(&ppipe->pipe_sel.si_note, 0); 1203 ppipe->pipe_peer = NULL; 1204 } 1205 /* 1206 * free resources 1207 */ 1208 pipe_free_kmem(cpipe); 1209 zfree(pipe_zone, cpipe); 1210 } 1211 } 1212 1213 /*ARGSUSED*/ 1214 static int 1215 pipe_kqfilter(struct file *fp, struct knote *kn) 1216 { 1217 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; 1218 1219 switch (kn->kn_filter) { 1220 case EVFILT_READ: 1221 kn->kn_fop = &pipe_rfiltops; 1222 break; 1223 case EVFILT_WRITE: 1224 kn->kn_fop = &pipe_wfiltops; 1225 cpipe = cpipe->pipe_peer; 1226 if (cpipe == NULL) 1227 /* other end of pipe has been closed */ 1228 return (EPIPE); 1229 break; 1230 default: 1231 return (1); 1232 } 1233 kn->kn_hook = (caddr_t)cpipe; 1234 1235 SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext); 1236 return (0); 1237 } 1238 1239 static void 1240 filt_pipedetach(struct knote *kn) 1241 { 1242 struct pipe *cpipe = (struct pipe *)kn->kn_hook; 1243 1244 SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext); 1245 } 1246 1247 /*ARGSUSED*/ 1248 static int 1249 filt_piperead(struct knote *kn, long hint) 1250 { 1251 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; 1252 struct pipe *wpipe = rpipe->pipe_peer; 1253 1254 kn->kn_data = rpipe->pipe_buffer.cnt; 1255 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) 1256 kn->kn_data = rpipe->pipe_map.cnt; 1257 1258 if ((rpipe->pipe_state & PIPE_EOF) || 1259 (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { 1260 kn->kn_flags |= EV_EOF; 1261 return (1); 1262 } 1263 return (kn->kn_data > 0); 1264 } 1265 1266 /*ARGSUSED*/ 1267 static int 1268 filt_pipewrite(struct knote *kn, long hint) 1269 { 1270 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; 1271 struct pipe *wpipe = rpipe->pipe_peer; 1272 1273 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { 1274 kn->kn_data = 0; 1275 kn->kn_flags |= EV_EOF; 1276 return (1); 1277 } 1278 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 1279 if (wpipe->pipe_state & PIPE_DIRECTW) 1280 kn->kn_data = 0; 1281 1282 return (kn->kn_data >= PIPE_BUF); 1283 } 1284