/*	$OpenBSD: sys_pipe.c,v 1.146 2023/05/09 14:22:17 visa Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */
/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does everything that pipes normally do.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/pool.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/event.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

#include <sys/pipe.h>

struct pipe_pair {
	struct pipe pp_wpipe;
	struct pipe pp_rpipe;
	struct rwlock pp_lock;
};

/*
 * interfaces to the outside world
 */
int	pipe_read(struct file *, struct uio *, int);
int	pipe_write(struct file *, struct uio *, int);
int	pipe_close(struct file *, struct proc *);
int	pipe_kqfilter(struct file *fp, struct knote *kn);
int	pipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
int	pipe_stat(struct file *fp, struct stat *ub, struct proc *p);

static const struct fileops pipeops = {
	.fo_read	= pipe_read,
	.fo_write	= pipe_write,
	.fo_ioctl	= pipe_ioctl,
	.fo_kqfilter	= pipe_kqfilter,
	.fo_stat	= pipe_stat,
	.fo_close	= pipe_close
};

void	filt_pipedetach(struct knote *kn);
int	filt_piperead(struct knote *kn, long hint);
int	filt_pipewrite(struct knote *kn, long hint);
int	filt_pipeexcept(struct knote *kn, long hint);
int	filt_pipemodify(struct kevent *kev, struct knote *kn);
int	filt_pipeprocess(struct knote *kn, struct kevent *kev);

const struct filterops pipe_rfiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_piperead,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

const struct filterops pipe_wfiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipewrite,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

const struct filterops pipe_efiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipeexcept,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

/*
 * Default pipe buffer size(s); this can be fairly large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
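/*
 * Worked example (assuming the usual PIPE_SIZE of 16384 from <sys/pipe.h>):
 * MINPIPESIZE is 16384/3 = 5461, so once a reader drains the buffer below
 * 5461 bytes, pipe_read() wakes any writer blocked waiting for space.
 */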

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
unsigned int nbigpipe;
static unsigned int amountpipekva;

struct pool pipe_pair_pool;

int	dopipe(struct proc *, int *, int);
void	pipe_wakeup(struct pipe *);

int	pipe_create(struct pipe *);
void	pipe_destroy(struct pipe *);
int	pipe_rundown(struct pipe *);
struct pipe *pipe_peer(struct pipe *);
int	pipe_buffer_realloc(struct pipe *, u_int);
void	pipe_buffer_free(struct pipe *);

int	pipe_iolock(struct pipe *);
void	pipe_iounlock(struct pipe *);
int	pipe_iosleep(struct pipe *, const char *);

struct pipe_pair *pipe_pair_create(void);
void	pipe_pair_destroy(struct pipe_pair *);

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

int
sys_pipe(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe_args /* {
		syscallarg(int *) fdp;
	} */ *uap = v;

	return (dopipe(p, SCARG(uap, fdp), 0));
}

int
sys_pipe2(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe2_args /* {
		syscallarg(int *) fdp;
		syscallarg(int) flags;
	} */ *uap = v;

	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
		return (EINVAL);

	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
}
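/*
 * Example (sketch, not part of this file): how these syscalls are reached
 * from userland through the libc stubs.  fds[0] is the read end, fds[1]
 * the write end; with O_NONBLOCK (== FNONBLOCK) set, I/O on the pipe
 * returns EAGAIN instead of sleeping in pipe_read()/pipe_write() below.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2];
 *		char buf[4];
 *
 *		if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *			err(1, "pipe2");
 *		if (write(fds[1], "data", 4) != 4)
 *			err(1, "write");
 *		if (read(fds[0], buf, sizeof(buf)) != 4)
 *			err(1, "read");
 *		return (0);
 *	}
 */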

int
dopipe(struct proc *p, int *ufds, int flags)
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe_pair *pp;
	struct pipe *rpipe, *wpipe = NULL;
	int fds[2], cloexec, error;

	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;

	pp = pipe_pair_create();
	if (pp == NULL)
		return (ENOMEM);
	wpipe = &pp->pp_wpipe;
	rpipe = &pp->pp_rpipe;

	fdplock(fdp);

	error = falloc(p, &rf, &fds[0]);
	if (error != 0)
		goto free2;
	rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fds[1]);
	if (error != 0)
		goto free3;
	wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;

	fdinsert(fdp, fds[0], cloexec, rf);
	fdinsert(fdp, fds[1], cloexec, wf);

	error = copyout(fds, ufds, sizeof(fds));
	if (error == 0) {
		fdpunlock(fdp);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrfds(p, fds, 2);
#endif
	} else {
		/* fdrelease() unlocks fdp. */
		fdrelease(p, fds[0]);
		fdplock(fdp);
		fdrelease(p, fds[1]);
	}

	FRELE(rf, p);
	FRELE(wf, p);
	return (error);

free3:
	fdremove(fdp, fds[0]);
	closef(rf, p);
	rpipe = NULL;
free2:
	fdpunlock(fdp);
	pipe_destroy(wpipe);
	pipe_destroy(rpipe);
	return (error);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if allocation
 * fails, the old buffer is retained and ENOMEM is returned.
 */
int
pipe_buffer_realloc(struct pipe *cpipe, u_int size)
{
	caddr_t buffer;

	/* buffer uninitialized or pipe locked */
	KASSERT((cpipe->pipe_buffer.buffer == NULL) ||
	    (cpipe->pipe_state & PIPE_LOCK));

	/* buffer should be empty */
	KASSERT(cpipe->pipe_buffer.cnt == 0);

	KERNEL_LOCK();
	buffer = km_alloc(size, &kv_any, &kp_pageable, &kd_waitok);
	KERNEL_UNLOCK();
	if (buffer == NULL)
		return (ENOMEM);

	/* free old resources if we are resizing */
	pipe_buffer_free(cpipe);

	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;

	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);

	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
int
pipe_create(struct pipe *cpipe)
{
	int error;

	error = pipe_buffer_realloc(cpipe, PIPE_SIZE);
	if (error != 0)
		return (error);

	sigio_init(&cpipe->pipe_sigio);

	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}

struct pipe *
pipe_peer(struct pipe *cpipe)
{
	struct pipe *peer;

	rw_assert_anylock(cpipe->pipe_lock);

	peer = cpipe->pipe_peer;
	if (peer == NULL || (peer->pipe_state & PIPE_EOF))
		return (NULL);
	return (peer);
}

/*
 * Lock a pipe for exclusive I/O access.
 */
int
pipe_iolock(struct pipe *cpipe)
{
	int error;

	rw_assert_wrlock(cpipe->pipe_lock);

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH,
		    "pipeiolk", INFSLP);
		if (error)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * Unlock a pipe I/O lock.
 */
void
pipe_iounlock(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);
	KASSERT(cpipe->pipe_state & PIPE_LOCK);

	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

/*
 * Unlock the pipe I/O lock and go to sleep.  Returns 0 on success, with the
 * I/O lock relocked.  Otherwise, if a signal was caught, returns non-zero
 * with the I/O lock left unlocked.
 *
 * Any caller must obtain a reference to the pipe by incrementing `pipe_busy'
 * before calling this function, in order to ensure that the pipe is not
 * destroyed while sleeping.
 */
int
pipe_iosleep(struct pipe *cpipe, const char *wmesg)
{
	int error;

	pipe_iounlock(cpipe);
	error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH, wmesg,
	    INFSLP);
	if (error)
		return (error);
	return (pipe_iolock(cpipe));
}
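/*
 * Sketch of the caller pattern for the I/O lock (this is what pipe_read()
 * and pipe_write() below do; shown only to illustrate the `pipe_busy'
 * contract described above):
 *
 *	rw_enter_write(cpipe->pipe_lock);
 *	++cpipe->pipe_busy;			take a reference first
 *	error = pipe_iolock(cpipe);
 *	...
 *	error = pipe_iosleep(cpipe, "wmesg");	may drop and retake the lock
 *	...
 *	pipe_iounlock(cpipe);
 *	--cpipe->pipe_busy;
 *	pipe_rundown(cpipe);			wake pipe_destroy() if waiting
 *	rw_exit_write(cpipe->pipe_lock);
 */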

void
pipe_wakeup(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	knote_locked(&cpipe->pipe_klist, 0);

	if (cpipe->pipe_state & PIPE_ASYNC)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
}

int
pipe_read(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data;
	size_t nread = 0, size;
	int error;

	rw_enter_write(rpipe->pipe_lock);
	++rpipe->pipe_busy;
	error = pipe_iolock(rpipe);
	if (error) {
		--rpipe->pipe_busy;
		pipe_rundown(rpipe);
		rw_exit_write(rpipe->pipe_lock);
		return (error);
	}

	while (uio->uio_resid) {
		/* Normal pipe buffer receive. */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			rw_exit_write(rpipe->pipe_lock);
			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			rw_enter_write(rpipe->pipe_lock);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/* If the "write-side" has been blocked, wake it up. */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/* Break if some data was read. */
			if (nread > 0)
				break;

			/* Handle non-blocking mode operation. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/* Wait for more data. */
			rpipe->pipe_state |= PIPE_WANTR;
			error = pipe_iosleep(rpipe, "piperd");
			if (error)
				goto unlocked_error;
		}
	}
	pipe_iounlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	if (pipe_rundown(rpipe) == 0 && rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/* Handle write blocking hysteresis. */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if (rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipe_wakeup(rpipe);

	rw_exit_write(rpipe->pipe_lock);
	return (error);
}

int
pipe_write(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	size_t orig_resid;
	int error;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	/*
	 * Detect loss of the pipe read side; return EPIPE if lost
	 * (SIGPIPE is issued by the caller of pipe_write()).
	 */
	if (wpipe == NULL) {
		rw_exit_write(lock);
		return (EPIPE);
	}

	++wpipe->pipe_busy;
	error = pipe_iolock(wpipe);
	if (error) {
		--wpipe->pipe_busy;
		pipe_rundown(wpipe);
		rw_exit_write(lock);
		return (error);
	}

	/* If it is advantageous to resize the pipe buffer, do so. */
	if (uio->uio_resid > PIPE_SIZE &&
	    wpipe->pipe_buffer.size <= PIPE_SIZE &&
	    wpipe->pipe_buffer.cnt == 0) {
		unsigned int npipe;

		npipe = atomic_inc_int_nv(&nbigpipe);
		if (npipe > LIMITBIGPIPES ||
		    pipe_buffer_realloc(wpipe, BIG_PIPE_SIZE) != 0)
			atomic_dec_int(&nbigpipe);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		size_t space;

		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if (space < uio->uio_resid && orig_resid <= PIPE_BUF)
			space = 0;

		if (space > 0) {
			size_t size;	/* Transfer size */
			size_t segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			rw_exit_write(lock);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			rw_enter_write(lock);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				rw_exit_write(lock);
				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						size - segsize, uio);
				rw_enter_write(lock);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif
			}
			if (error)
				break;
		} else {
			/* If the "read-side" has been blocked, wake it up. */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/* Don't block on non-blocking I/O. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipe_wakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = pipe_iosleep(wpipe, "pipewr");
			if (error)
				goto unlocked_error;

			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}
	pipe_iounlock(wpipe);

unlocked_error:
	--wpipe->pipe_busy;

	if (pipe_rundown(wpipe) == 0 && wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/* Don't return EPIPE if I/O was successful. */
	if (wpipe->pipe_buffer.cnt == 0 &&
	    uio->uio_resid == 0 &&
	    error == EPIPE) {
		error = 0;
	}

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);
	/* We have something to offer, wake up select/poll. */
	if (wpipe->pipe_buffer.cnt)
		pipe_wakeup(wpipe);

	rw_exit_write(lock);
	return (error);
}
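/*
 * Example of the PIPE_BUF atomicity enforced above: because the loop sets
 * space to 0 (and sleeps) rather than doing a partial copy whenever a
 * request of at most PIPE_BUF bytes does not fit, concurrent small writes
 * are never interleaved.  Sketch:
 *
 *	writer A:  write(fd, abuf, PIPE_BUF);
 *	writer B:  write(fd, bbuf, PIPE_BUF);
 *
 * A reader sees each buffer contiguously, in some order.  Writes larger
 * than PIPE_BUF may be split and interleaved with other writers' data.
 */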

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
	struct pipe *mpipe = fp->f_data;
	int error = 0;

	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		rw_enter_write(mpipe->pipe_lock);
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		rw_exit_write(mpipe->pipe_lock);
		break;

	case FIONREAD:
		rw_enter_read(mpipe->pipe_lock);
		*(int *)data = mpipe->pipe_buffer.cnt;
		rw_exit_read(mpipe->pipe_lock);
		break;

	case FIOSETOWN:
	case SIOCSPGRP:
	case TIOCSPGRP:
		error = sigio_setown(&mpipe->pipe_sigio, cmd, data);
		break;

	case FIOGETOWN:
	case SIOCGPGRP:
	case TIOCGPGRP:
		sigio_getown(&mpipe->pipe_sigio, cmd, data);
		break;

	default:
		error = ENOTTY;
	}

	return (error);
}
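/*
 * Example (sketch, assumes <sys/ioctl.h>, <stdio.h> and <err.h>): userland
 * use of the FIONREAD case above to query how many bytes are buffered
 * without consuming them:
 *
 *	int nbytes;
 *
 *	if (ioctl(fds[0], FIONREAD, &nbytes) == -1)
 *		err(1, "ioctl");
 *	printf("%d bytes pending\n", nbytes);
 *
 * The value is a snapshot of pipe_buffer.cnt taken under the pipe lock;
 * it may change as soon as the lock is released.
 */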

int
pipe_stat(struct file *fp, struct stat *ub, struct proc *p)
{
	struct pipe *pipe = fp->f_data;

	memset(ub, 0, sizeof(*ub));

	rw_enter_read(pipe->pipe_lock);
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim.tv_sec  = pipe->pipe_atime.tv_sec;
	ub->st_atim.tv_nsec = pipe->pipe_atime.tv_nsec;
	ub->st_mtim.tv_sec  = pipe->pipe_mtime.tv_sec;
	ub->st_mtim.tv_nsec = pipe->pipe_mtime.tv_nsec;
	ub->st_ctim.tv_sec  = pipe->pipe_ctime.tv_sec;
	ub->st_ctim.tv_nsec = pipe->pipe_ctime.tv_nsec;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	rw_exit_read(pipe->pipe_lock);
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

int
pipe_close(struct file *fp, struct proc *p)
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = NULL;
	fp->f_data = NULL;
	pipe_destroy(cpipe);
	return (0);
}

/*
 * Free kva for the pipe circular buffer.
 * No pipe lock check, as this is only called from pipe_buffer_realloc()
 * and pipe_destroy().
 */
void
pipe_buffer_free(struct pipe *cpipe)
{
	u_int size;

	if (cpipe->pipe_buffer.buffer == NULL)
		return;

	size = cpipe->pipe_buffer.size;

	KERNEL_LOCK();
	km_free(cpipe->pipe_buffer.buffer, size, &kv_any, &kp_pageable);
	KERNEL_UNLOCK();

	cpipe->pipe_buffer.buffer = NULL;

	atomic_sub_int(&amountpipekva, size);
	if (size > PIPE_SIZE)
		atomic_dec_int(&nbigpipe);
}

/*
 * Shut down the pipe and free its resources.
 */
void
pipe_destroy(struct pipe *cpipe)
{
	struct pipe *ppipe;

	if (cpipe == NULL)
		return;

	rw_enter_write(cpipe->pipe_lock);

	pipe_wakeup(cpipe);
	sigio_free(&cpipe->pipe_sigio);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTD;
		rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO, "pipecl", INFSLP);
	}

	/* Disconnect from peer. */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipe_wakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	pipe_buffer_free(cpipe);

	rw_exit_write(cpipe->pipe_lock);

	if (ppipe == NULL)
		pipe_pair_destroy(cpipe->pipe_pair);
}

/*
 * Returns non-zero if a rundown is currently ongoing.
 */
int
pipe_rundown(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_busy > 0 || (cpipe->pipe_state & PIPE_WANTD) == 0)
		return (0);

	/* Only wakeup pipe_destroy() once the pipe is no longer busy. */
	cpipe->pipe_state &= ~(PIPE_WANTD | PIPE_WANTR | PIPE_WANTW);
	wakeup(cpipe);
	return (1);
}

int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int error = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_klist, kn);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL) {
			/*
			 * The other end of the pipe has been closed.
			 * Since the filter now always indicates a pending
			 * event, attach the knote to the current side
			 * to proceed with the registration.
			 */
			wpipe = rpipe;
		}
		kn->kn_fop = &pipe_wfiltops;
		kn->kn_hook = wpipe;
		klist_insert_locked(&wpipe->pipe_klist, kn);
		break;
	case EVFILT_EXCEPT:
		if (kn->kn_flags & __EV_SELECT) {
			/* Prevent triggering exceptfds. */
			error = EPERM;
			break;
		}
		if ((kn->kn_flags & __EV_POLL) == 0) {
			/* Disallow usage through kevent(2). */
			error = EINVAL;
			break;
		}
		kn->kn_fop = &pipe_efiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_klist, kn);
		break;
	default:
		error = EINVAL;
	}

	rw_exit_write(lock);

	return (error);
}
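/*
 * Example (sketch, assumes <sys/event.h> and <err.h>): registering a read
 * filter on a pipe with kevent(2), which arrives here with kn_filter ==
 * EVFILT_READ:
 *
 *	struct kevent kev;
 *	int kq;
 *
 *	if ((kq = kqueue()) == -1)
 *		err(1, "kqueue");
 *	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 * An event retrieved later carries the byte count from filt_piperead()
 * in its data field; EV_EOF is set once the write side has gone away.
 */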

void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = kn->kn_hook;

	klist_remove(&cpipe->pipe_klist, kn);
}

int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}

	return (kn->kn_data > 0);
}

int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	if (wpipe == NULL) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

	return (kn->kn_data >= PIPE_BUF);
}

int
filt_pipeexcept(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	int active = 0;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	if (kn->kn_flags & __EV_POLL) {
		if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
			kn->kn_flags |= __EV_HUP;
			active = 1;
		}
	}

	return (active);
}

int
filt_pipemodify(struct kevent *kev, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	active = knote_modify(kev, kn);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

int
filt_pipeprocess(struct knote *kn, struct kevent *kev)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	active = knote_process(kn, kev);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

void
pipe_init(void)
{
	pool_init(&pipe_pair_pool, sizeof(struct pipe_pair), 0, IPL_MPFLOOR,
	    PR_WAITOK, "pipepl", NULL);
}

struct pipe_pair *
pipe_pair_create(void)
{
	struct pipe_pair *pp;

	pp = pool_get(&pipe_pair_pool, PR_WAITOK | PR_ZERO);
	pp->pp_wpipe.pipe_pair = pp;
	pp->pp_rpipe.pipe_pair = pp;
	pp->pp_wpipe.pipe_peer = &pp->pp_rpipe;
	pp->pp_rpipe.pipe_peer = &pp->pp_wpipe;
	/*
	 * A single lock is shared by both ends of the pair, giving
	 * exclusive access to the pipe pair as a whole.
	 */
	rw_init(&pp->pp_lock, "pipelk");
	pp->pp_wpipe.pipe_lock = &pp->pp_lock;
	pp->pp_rpipe.pipe_lock = &pp->pp_lock;

	klist_init_rwlock(&pp->pp_wpipe.pipe_klist, &pp->pp_lock);
	klist_init_rwlock(&pp->pp_rpipe.pipe_klist, &pp->pp_lock);

	if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
		goto err;
	return (pp);
err:
	pipe_destroy(&pp->pp_wpipe);
	pipe_destroy(&pp->pp_rpipe);
	return (NULL);
}

void
pipe_pair_destroy(struct pipe_pair *pp)
{
	klist_free(&pp->pp_wpipe.pipe_klist);
	klist_free(&pp->pp_rpipe.pipe_klist);
	pool_put(&pipe_pair_pool, pp);
}