/*	$OpenBSD: sys_pipe.c,v 1.148 2024/12/30 02:46:00 guenther Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/pool.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/event.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

#include <sys/pipe.h>

struct pipe_pair {
	struct pipe pp_wpipe;
	struct pipe pp_rpipe;
	struct rwlock pp_lock;
};

/*
 * interfaces to the outside world
 */
int	pipe_read(struct file *, struct uio *, int);
int	pipe_write(struct file *, struct uio *, int);
int	pipe_close(struct file *, struct proc *);
int	pipe_kqfilter(struct file *fp, struct knote *kn);
int	pipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
int	pipe_stat(struct file *fp, struct stat *ub, struct proc *p);

static const struct fileops pipeops = {
	.fo_read	= pipe_read,
	.fo_write	= pipe_write,
	.fo_ioctl	= pipe_ioctl,
	.fo_kqfilter	= pipe_kqfilter,
	.fo_stat	= pipe_stat,
	.fo_close	= pipe_close
};

void	filt_pipedetach(struct knote *kn);
int	filt_piperead(struct knote *kn, long hint);
int	filt_pipewrite(struct knote *kn, long hint);
int	filt_pipeexcept(struct knote *kn, long hint);
int	filt_pipemodify(struct kevent *kev, struct knote *kn);
int	filt_pipeprocess(struct knote *kn, struct kevent *kev);

const struct filterops pipe_rfiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_piperead,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

const struct filterops pipe_wfiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipewrite,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

const struct filterops pipe_efiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipeexcept,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

/*
 * Default pipe buffer size(s); these can be fairly large now because pipe
 * space is pageable.  The pipe code tries to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
unsigned int nbigpipe;
static unsigned int amountpipekva;

struct pool pipe_pair_pool;

int	dopipe(struct proc *, int *, int);
void	pipe_wakeup(struct pipe *);

int	pipe_create(struct pipe *);
void	pipe_destroy(struct pipe *);
int	pipe_rundown(struct pipe *);
struct pipe *pipe_peer(struct pipe *);
int	pipe_buffer_realloc(struct pipe *, u_int);
void	pipe_buffer_free(struct pipe *);

int	pipe_iolock(struct pipe *);
void	pipe_iounlock(struct pipe *);
int	pipe_iosleep(struct pipe *, const char *);

struct pipe_pair *pipe_pair_create(void);
void	pipe_pair_destroy(struct pipe_pair *);

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

int
sys_pipe(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe_args /* {
		syscallarg(int *) fdp;
	} */ *uap = v;

	return (dopipe(p, SCARG(uap, fdp), 0));
}

int
sys_pipe2(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe2_args /* {
		syscallarg(int *) fdp;
		syscallarg(int) flags;
	} */ *uap = v;

	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
		return (EINVAL);

	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
}
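
/*
 * Illustrative userland usage (not kernel code): pipe2(2) returns two
 * descriptors and applies the requested flags atomically, e.g.
 *
 *	int fds[2];
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 *
 * Data written to fds[1] is read from fds[0].  Only O_CLOEXEC and
 * O_NONBLOCK are accepted by the check above; anything else yields EINVAL.
 */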

int
dopipe(struct proc *p, int *ufds, int flags)
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe_pair *pp;
	struct pipe *rpipe, *wpipe = NULL;
	int fds[2], cloexec, error;

	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;

	pp = pipe_pair_create();
	if (pp == NULL)
		return (ENOMEM);
	wpipe = &pp->pp_wpipe;
	rpipe = &pp->pp_rpipe;

	fdplock(fdp);

	error = falloc(p, &rf, &fds[0]);
	if (error != 0)
		goto free2;
	rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fds[1]);
	if (error != 0)
		goto free3;
	wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;

	fdinsert(fdp, fds[0], cloexec, rf);
	fdinsert(fdp, fds[1], cloexec, wf);

	error = copyout(fds, ufds, sizeof(fds));
	if (error == 0) {
		fdpunlock(fdp);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrfds(p, fds, 2);
#endif
	} else {
		/* fdrelease() unlocks fdp. */
		fdrelease(p, fds[0]);
		fdplock(fdp);
		fdrelease(p, fds[1]);
	}

	FRELE(rf, p);
	FRELE(wf, p);
	return (error);

free3:
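	/*
	 * Unwind in reverse order.  closef() on the read-side file runs
	 * pipe_close() and tears down rpipe with it, so the local pointer
	 * is cleared below to keep the shared error path from destroying
	 * it a second time; wpipe has not been attached to a file yet and
	 * is destroyed explicitly.
	 */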
	fdremove(fdp, fds[0]);
	closef(rf, p);
	rpipe = NULL;
free2:
	fdpunlock(fdp);
	pipe_destroy(wpipe);
	pipe_destroy(rpipe);
	return (error);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if the
 * allocation fails, it retains the old buffer and returns ENOMEM.
 */
int
pipe_buffer_realloc(struct pipe *cpipe, u_int size)
{
	caddr_t buffer;

	/* buffer uninitialized or pipe locked */
	KASSERT((cpipe->pipe_buffer.buffer == NULL) ||
	    (cpipe->pipe_state & PIPE_LOCK));

	/* buffer should be empty */
	KASSERT(cpipe->pipe_buffer.cnt == 0);

	buffer = km_alloc(size, &kv_any, &kp_pageable, &kd_waitok);
	if (buffer == NULL)
		return (ENOMEM);

	/* free old resources if we are resizing */
	pipe_buffer_free(cpipe);

	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;

	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);

	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
int
pipe_create(struct pipe *cpipe)
{
	int error;

	error = pipe_buffer_realloc(cpipe, PIPE_SIZE);
	if (error != 0)
		return (error);

	sigio_init(&cpipe->pipe_sigio);

	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}

struct pipe *
pipe_peer(struct pipe *cpipe)
{
	struct pipe *peer;

	rw_assert_anylock(cpipe->pipe_lock);

	peer = cpipe->pipe_peer;
	if (peer == NULL || (peer->pipe_state & PIPE_EOF))
		return (NULL);
	return (peer);
}

/*
 * Lock a pipe for exclusive I/O access.
 */
int
pipe_iolock(struct pipe *cpipe)
{
	int error;

	rw_assert_wrlock(cpipe->pipe_lock);

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH,
		    "pipeiolk", INFSLP);
		if (error)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * Unlock a pipe I/O lock.
 */
void
pipe_iounlock(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);
	KASSERT(cpipe->pipe_state & PIPE_LOCK);

	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

/*
 * Unlock the pipe I/O lock and go to sleep. Returns 0 on success, with the
 * I/O lock relocked. Otherwise, if a signal was caught, returns non-zero
 * with the I/O lock not held.
 *
 * Any caller must obtain a reference to the pipe by incrementing `pipe_busy'
 * before calling this function, in order to ensure that the pipe is not
 * destroyed while sleeping.
 */
int
pipe_iosleep(struct pipe *cpipe, const char *wmesg)
{
	int error;

	pipe_iounlock(cpipe);
	error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH, wmesg,
	    INFSLP);
	if (error)
		return (error);
	return (pipe_iolock(cpipe));
}

void
pipe_wakeup(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	knote_locked(&cpipe->pipe_klist, 0);

	if (cpipe->pipe_state & PIPE_ASYNC)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
}

int
pipe_read(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data;
	size_t nread = 0, size;
	int error;

	rw_enter_write(rpipe->pipe_lock);
	++rpipe->pipe_busy;
	error = pipe_iolock(rpipe);
	if (error) {
		--rpipe->pipe_busy;
		pipe_rundown(rpipe);
		rw_exit_write(rpipe->pipe_lock);
		return (error);
	}

	while (uio->uio_resid) {
		/* Normal pipe buffer receive. */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
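			/*
			 * Drop the pipe lock across uiomove(): the buffer
			 * is pageable, so copying to userspace may fault
			 * and sleep.  The PIPE_LOCK I/O lock taken above
			 * keeps other readers and writers out meanwhile.
			 */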
			rw_exit_write(rpipe->pipe_lock);
			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			rw_enter_write(rpipe->pipe_lock);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/* If the "write-side" has been blocked, wake it up. */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/* Break if some data was read. */
			if (nread > 0)
				break;

			/* Handle non-blocking mode operation. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/* Wait for more data. */
			rpipe->pipe_state |= PIPE_WANTR;
			error = pipe_iosleep(rpipe, "piperd");
			if (error)
				goto unlocked_error;
		}
	}
	pipe_iounlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

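	/*
	 * Write-wakeup hysteresis: a writer waiting for space is only
	 * woken once the buffer has drained below MINPIPESIZE, which
	 * avoids waking it for every small read.
	 */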
	if (pipe_rundown(rpipe) == 0 && rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/* Handle write blocking hysteresis. */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if (rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipe_wakeup(rpipe);

	rw_exit_write(rpipe->pipe_lock);
	return (error);
}

int
pipe_write(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	size_t orig_resid;
	int error;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	/* Detect loss of pipe read side, issue SIGPIPE if lost. */
	if (wpipe == NULL) {
		rw_exit_write(lock);
		return (EPIPE);
	}

	++wpipe->pipe_busy;
	error = pipe_iolock(wpipe);
	if (error) {
		--wpipe->pipe_busy;
		pipe_rundown(wpipe);
		rw_exit_write(lock);
		return (error);
	}

	/* If it is advantageous to resize the pipe buffer, do so. */
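	/*
	 * Grow to BIG_PIPE_SIZE only for large writes into an empty,
	 * still-default-sized buffer.  nbigpipe is bumped optimistically
	 * and backed out if the limit is exceeded or the reallocation
	 * fails.
	 */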
	if (uio->uio_resid > PIPE_SIZE &&
	    wpipe->pipe_buffer.size <= PIPE_SIZE &&
	    wpipe->pipe_buffer.cnt == 0) {
		unsigned int npipe;

		npipe = atomic_inc_int_nv(&nbigpipe);
		if (npipe > LIMITBIGPIPES ||
		    pipe_buffer_realloc(wpipe, BIG_PIPE_SIZE) != 0)
			atomic_dec_int(&nbigpipe);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		size_t space;

		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if (space < uio->uio_resid && orig_resid <= PIPE_BUF)
			space = 0;

		if (space > 0) {
			size_t size;	/* Transfer size */
			size_t segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

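			/*
			 * As in pipe_read(), the pipe lock is dropped
			 * across uiomove() because copying from userspace
			 * into the pageable buffer may fault and sleep;
			 * the PIPE_LOCK I/O lock still excludes other I/O.
			 */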
			rw_exit_write(lock);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			rw_enter_write(lock);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				rw_exit_write(lock);
				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						size - segsize, uio);
				rw_enter_write(lock);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif
			}
			if (error)
				break;
		} else {
			/* If the "read-side" has been blocked, wake it up. */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/* Don't block on non-blocking I/O. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipe_wakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = pipe_iosleep(wpipe, "pipewr");
			if (error)
				goto unlocked_error;

			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}
	pipe_iounlock(wpipe);

unlocked_error:
	--wpipe->pipe_busy;

	if (pipe_rundown(wpipe) == 0 && wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/* Don't return EPIPE if I/O was successful. */
	if (wpipe->pipe_buffer.cnt == 0 &&
	    uio->uio_resid == 0 &&
	    error == EPIPE) {
		error = 0;
	}

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);
	/* We have something to offer, wake up select/poll. */
	if (wpipe->pipe_buffer.cnt)
		pipe_wakeup(wpipe);

	rw_exit_write(lock);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
	struct pipe *mpipe = fp->f_data;
	int error = 0;

	switch (cmd) {

	case FIOASYNC:
		rw_enter_write(mpipe->pipe_lock);
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		rw_exit_write(mpipe->pipe_lock);
		break;

	case FIONREAD:
		rw_enter_read(mpipe->pipe_lock);
		*(int *)data = mpipe->pipe_buffer.cnt;
		rw_exit_read(mpipe->pipe_lock);
		break;

	case FIOSETOWN:
	case SIOCSPGRP:
	case TIOCSPGRP:
		error = sigio_setown(&mpipe->pipe_sigio, cmd, data);
		break;

	case FIOGETOWN:
	case SIOCGPGRP:
	case TIOCGPGRP:
		sigio_getown(&mpipe->pipe_sigio, cmd, data);
		break;

	default:
		error = ENOTTY;
	}

	return (error);
}

int
pipe_stat(struct file *fp, struct stat *ub, struct proc *p)
{
	struct pipe *pipe = fp->f_data;

	memset(ub, 0, sizeof(*ub));

	rw_enter_read(pipe->pipe_lock);
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim.tv_sec  = pipe->pipe_atime.tv_sec;
	ub->st_atim.tv_nsec = pipe->pipe_atime.tv_nsec;
	ub->st_mtim.tv_sec  = pipe->pipe_mtime.tv_sec;
	ub->st_mtim.tv_nsec = pipe->pipe_mtime.tv_nsec;
	ub->st_ctim.tv_sec  = pipe->pipe_ctime.tv_sec;
	ub->st_ctim.tv_nsec = pipe->pipe_ctime.tv_nsec;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	rw_exit_read(pipe->pipe_lock);
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

int
pipe_close(struct file *fp, struct proc *p)
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = NULL;
	fp->f_data = NULL;
	pipe_destroy(cpipe);
	return (0);
}

/*
 * Free kva for pipe circular buffer.
 * No pipe lock check, as this is only called from pipe_buffer_realloc()
 * and pipe_destroy().
 */
void
pipe_buffer_free(struct pipe *cpipe)
{
	u_int size;

	if (cpipe->pipe_buffer.buffer == NULL)
		return;

	size = cpipe->pipe_buffer.size;

	km_free(cpipe->pipe_buffer.buffer, size, &kv_any, &kp_pageable);

	cpipe->pipe_buffer.buffer = NULL;

	atomic_sub_int(&amountpipekva, size);
	if (size > PIPE_SIZE)
		atomic_dec_int(&nbigpipe);
}

/*
 * shutdown the pipe, and free resources.
 */
void
pipe_destroy(struct pipe *cpipe)
{
	struct pipe *ppipe;

	if (cpipe == NULL)
		return;

	rw_enter_write(cpipe->pipe_lock);

	pipe_wakeup(cpipe);
	sigio_free(&cpipe->pipe_sigio);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTD;
		rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO, "pipecl", INFSLP);
	}

	/* Disconnect from peer. */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipe_wakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	pipe_buffer_free(cpipe);

	rw_exit_write(cpipe->pipe_lock);

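	/*
	 * The pipe pair is shared by both sides; it is freed by whichever
	 * side closes last, i.e. once the peer pointer has already been
	 * cleared by the other side's pipe_destroy().
	 */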
	if (ppipe == NULL)
		pipe_pair_destroy(cpipe->pipe_pair);
}

/*
 * Returns non-zero if a rundown is currently ongoing.
 */
int
pipe_rundown(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_busy > 0 || (cpipe->pipe_state & PIPE_WANTD) == 0)
		return (0);

	/* Only wakeup pipe_destroy() once the pipe is no longer busy. */
	cpipe->pipe_state &= ~(PIPE_WANTD | PIPE_WANTR | PIPE_WANTW);
	wakeup(cpipe);
	return (1);
}

int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int error = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_klist, kn);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL) {
			/*
			 * The other end of the pipe has been closed.
			 * Since the filter now always indicates a pending
			 * event, attach the knote to the current side
			 * to proceed with the registration.
			 */
			wpipe = rpipe;
		}
		kn->kn_fop = &pipe_wfiltops;
		kn->kn_hook = wpipe;
		klist_insert_locked(&wpipe->pipe_klist, kn);
		break;
	case EVFILT_EXCEPT:
		if (kn->kn_flags & __EV_SELECT) {
			/* Prevent triggering exceptfds. */
			error = EPERM;
			break;
		}
		if ((kn->kn_flags & __EV_POLL) == 0) {
			/* Disallow usage through kevent(2). */
			error = EINVAL;
			break;
		}
		kn->kn_fop = &pipe_efiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_klist, kn);
		break;
	default:
		error = EINVAL;
	}

	rw_exit_write(lock);

	return (error);
}

void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = kn->kn_hook;

	klist_remove(&cpipe->pipe_klist, kn);
}

int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}

	return (kn->kn_data > 0);
}

int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	if (wpipe == NULL) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

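	/*
	 * Report the pipe as writable only when at least PIPE_BUF bytes
	 * fit, matching the atomic-write guarantee for small writes.
	 */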
	return (kn->kn_data >= PIPE_BUF);
}

int
filt_pipeexcept(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	int active = 0;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	if (kn->kn_flags & __EV_POLL) {
		if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
			kn->kn_flags |= __EV_HUP;
			active = 1;
		}
	}

	return (active);
}

int
filt_pipemodify(struct kevent *kev, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	active = knote_modify(kev, kn);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

int
filt_pipeprocess(struct knote *kn, struct kevent *kev)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	active = knote_process(kn, kev);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

void
pipe_init(void)
{
	pool_init(&pipe_pair_pool, sizeof(struct pipe_pair), 0, IPL_MPFLOOR,
	    PR_WAITOK, "pipepl", NULL);
}

struct pipe_pair *
pipe_pair_create(void)
{
	struct pipe_pair *pp;

	pp = pool_get(&pipe_pair_pool, PR_WAITOK | PR_ZERO);
	pp->pp_wpipe.pipe_pair = pp;
	pp->pp_rpipe.pipe_pair = pp;
	pp->pp_wpipe.pipe_peer = &pp->pp_rpipe;
	pp->pp_rpipe.pipe_peer = &pp->pp_wpipe;
	/*
	 * One lock is used per pipe pair in order to obtain exclusive access to
	 * the pipe pair.
	 */
	rw_init(&pp->pp_lock, "pipelk");
	pp->pp_wpipe.pipe_lock = &pp->pp_lock;
	pp->pp_rpipe.pipe_lock = &pp->pp_lock;

	klist_init_rwlock(&pp->pp_wpipe.pipe_klist, &pp->pp_lock);
	klist_init_rwlock(&pp->pp_rpipe.pipe_klist, &pp->pp_lock);

	if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
		goto err;
	return (pp);
err:
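	/*
	 * Destroy both sides: the first call clears the peer link, so the
	 * second call sees no peer and releases the shared pair via
	 * pipe_pair_destroy().
	 */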
	pipe_destroy(&pp->pp_wpipe);
	pipe_destroy(&pp->pp_rpipe);
	return (NULL);
}

void
pipe_pair_destroy(struct pipe_pair *pp)
{
	klist_free(&pp->pp_wpipe.pipe_klist);
	klist_free(&pp->pp_rpipe.pipe_klist);
	pool_put(&pipe_pair_pool, pp);
}
1029