xref: /minix/minix/net/lwip/bpfdev.c (revision 9f81acbc)
1 /* LWIP service - bpfdev.c - Berkeley Packet Filter (/dev/bpf) interface */
2 /*
3  * BPF is a cloning device: opening /dev/bpf returns a new BPF device which is
4  * independent from any other opened BPF devices.  We assume that each BPF
5  * device is used by one single user process, and this implementation therefore
6  * does not support multiple concurrent device calls on the same BPF device.
7  *
8  * Packet buffering basically follows the BSD model: each BPF device that is
9  * configured (that is, it has been attached to an interface) has two buffers,
10  * each of the configured size: a store buffer, where new packets are stored,
11  * and a hold buffer, which is typically full and awaiting retrieval through a
12  * read call from userland.  The buffers are swapped ("rotated") when the store
13  * buffer is filled up and the hold buffer is empty - if the hold buffer is not
14  * empty, additional packets are dropped.
15  *
16  * These buffers are allocated when the BPF device is attached to an interface.
17  * The interface may later disappear, in which case the BPF device is detached
18  * from it, allowing any final packets to be read before read requests start
19  * returning I/O errors.  The buffers are freed only when the device is closed.
20  */
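
/*
 * Illustrative sketch only (not part of this module): the minimal userland
 * sequence that this driver serves.  All request names below are handled in
 * this file; the interface name is a made-up example and error handling is
 * omitted.
 *
 *	struct ifreq ifr;
 *	unsigned int bufsize;
 *
 *	int fd = open("/dev/bpf", O_RDWR);	// clones a new BPF device
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);		// attach; allocates both buffers
 *
 *	ioctl(fd, BIOCGBLEN, &bufsize);		// reads must use this exact size
 *	char *buf = malloc(bufsize);
 *	ssize_t len = read(fd, buf, bufsize);	// one hold buffer's worth
 */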
21 
22 #include "lwip.h"
23 #include "bpfdev.h"
24 
25 #include <minix/chardriver.h>
26 #include <net/if.h>
27 #include <net/bpfdesc.h>
28 #include <minix/bpf.h>
29 #include <sys/mman.h>
30 
31 /*
32  * Make sure that our implementation matches the BPF version in the NetBSD
33  * headers.  If they change the version number, we may have to make changes
34  * here accordingly.
35  */
36 #if BPF_MAJOR_VERSION != 1 || BPF_MINOR_VERSION != 1
37 #error "NetBSD BPF version has changed"
38 #endif
39 
40 /* The number of BPF devices. */
41 #define NR_BPFDEV		16
42 
43 /* BPF receive buffer size: allowed range and default. */
44 #define BPF_BUF_MIN		BPF_WORDALIGN(sizeof(struct bpf_hdr))
45 #define BPF_BUF_DEF		32768
46 #define BPF_BUF_MAX		262144
47 
48 /*
49  * By opening /dev/bpf, one will obtain a cloned device with a different minor
50  * number, which maps to one of the BPF devices.
51  */
52 #define BPFDEV_MINOR		0	/* minor number of /dev/bpf */
53 #define BPFDEV_BASE_MINOR	1	/* base minor number for BPF devices */
54 
55 static struct bpfdev {
56 	struct bpfdev_link bpf_link;	/* structure link, MUST be first */
57 	TAILQ_ENTRY(bpfdev) bpf_next;	/* next on free or interface list */
58 	struct ifdev *bpf_ifdev;	/* associated interface, or NULL */
59 	unsigned int bpf_flags;		/* flags (BPFF_) */
60 	size_t bpf_size;		/* size of packet buffers */
61 	char *bpf_sbuf;			/* store buffer (mmap'd, or NULL) */
62 	char *bpf_hbuf;			/* hold buffer (mmap'd, or NULL) */
63 	size_t bpf_slen;		/* used part of store buffer */
64 	size_t bpf_hlen;		/* used part of hold buffer */
65 	struct bpf_insn *bpf_filter;	/* verified BPF filter, or NULL */
66 	size_t bpf_filterlen;		/* length of filter, for munmap */
67 	pid_t bpf_pid;			/* process ID of last using process */
68 	clock_t bpf_timeout;		/* timeout for read calls (0 = none) */
69 	struct {			/* state for pending read request */
70 		endpoint_t br_endpt;	/* reading endpoint, or NONE */
71 		cp_grant_id_t br_grant;	/* grant for reader's buffer */
72 		cdev_id_t br_id;	/* read request identifier */
73 		minix_timer_t br_timer;	/* timer for read timeout */
74 	} bpf_read;
75 	struct {			/* state for pending select request */
76 		endpoint_t bs_endpt;	/* selecting endpoint, or NONE */
77 		unsigned int bs_selops;	/* pending select operations */
78 	} bpf_select;
79 	struct {			/* packet capture statistics */
80 		uint64_t bs_recv;	/* # of packets run through filter */
81 		uint64_t bs_drop;	/* # of packets dropped: buffer full */
82 		uint64_t bs_capt;	/* # of packets accepted by filter */
83 	} bpf_stat;
84 } bpf_array[NR_BPFDEV];
85 
86 #define BPFF_IN_USE	0x01		/* this BPF device object is in use */
87 #define BPFF_PROMISC	0x02		/* promiscuous mode enabled */
88 #define BPFF_IMMEDIATE	0x04		/* immediate mode is enabled */
89 #define BPFF_SEESENT	0x08		/* also process host-sent packets */
90 #define BPFF_HDRCMPLT	0x10		/* do not fill in link-layer source */
91 #define BPFF_FEEDBACK	0x20		/* feed back written packet as input */
92 
93 static TAILQ_HEAD(, bpfdev_link) bpfl_freelist;	/* list of free BPF devices */
94 
95 static struct bpf_stat bpf_stat;
96 
97 static ssize_t bpfdev_peers(struct rmib_call *, struct rmib_node *,
98 	struct rmib_oldp *, struct rmib_newp *);
99 
100 /* The CTL_NET NET_BPF subtree.  All nodes are dynamically numbered. */
101 static struct rmib_node net_bpf_table[] = {
102 	RMIB_INT(RMIB_RO, BPF_BUF_MAX, "maxbufsize",
103 	    "Maximum size for data capture buffer"), /* TODO: read-write */
104 	RMIB_STRUCT(RMIB_RO, sizeof(bpf_stat), &bpf_stat, "stats",
105 	    "BPF stats"),
106 	RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, bpfdev_peers, "peers",
107 	    "BPF peers"),
108 };
109 
110 static struct rmib_node net_bpf_node =
111     RMIB_NODE(RMIB_RO, net_bpf_table, "bpf", "BPF options");
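
/*
 * Illustrative sketch only: once the subtree is registered, the nodes above
 * are reachable through the regular sysctl(3) interface, for example:
 *
 *	struct bpf_stat bs;
 *	size_t len = sizeof(bs);
 *
 *	sysctlbyname("net.bpf.stats", &bs, &len, NULL, 0);
 */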
112 
113 /*
114  * Initialize the BPF module.
115  */
116 void
117 bpfdev_init(void)
118 {
119 	const int mib[] = { CTL_NET, NET_BPF };
120 	unsigned int slot;
121 	int r;
122 
123 	/* Initialize data structures. */
124 	TAILQ_INIT(&bpfl_freelist);
125 
126 	for (slot = 0; slot < __arraycount(bpf_array); slot++) {
127 		bpf_array[slot].bpf_flags = 0;
128 
129 		TAILQ_INSERT_TAIL(&bpfl_freelist, &bpf_array[slot].bpf_link,
130 		    bpfl_next);
131 	}
132 
133 	memset(&bpf_stat, 0, sizeof(bpf_stat));
134 
135 	/* Register the "net.bpf" subtree with the MIB service. */
136 	if ((r = rmib_register(mib, __arraycount(mib), &net_bpf_node)) != OK)
137 		panic("unable to register net.bpf RMIB tree: %d", r);
138 }
139 
140 /*
141  * Given a BPF device object, return the corresponding minor number.
142  */
143 static devminor_t
144 bpfdev_get_minor(struct bpfdev * bpfdev)
145 {
146 
147 	assert(bpfdev != NULL);
148 
149 	return BPFDEV_BASE_MINOR + (devminor_t)(bpfdev - bpf_array);
150 }
151 
152 /*
153  * Given a minor number, return the corresponding BPF device object, or NULL if
154  * the minor number does not identify a BPF device.
155  */
156 static struct bpfdev *
157 bpfdev_get_by_minor(devminor_t minor)
158 {
159 
160 	if (minor < BPFDEV_BASE_MINOR ||
161 	    (unsigned int)minor >= BPFDEV_BASE_MINOR + __arraycount(bpf_array))
162 		return NULL;
163 
164 	return &bpf_array[minor - BPFDEV_BASE_MINOR];
165 }
166 
167 /*
168  * Open a BPF device, returning a cloned device instance.
169  */
170 static int
171 bpfdev_open(devminor_t minor, int access __unused, endpoint_t user_endpt)
172 {
173 	struct bpfdev_link *bpfl;
174 	struct bpfdev *bpf;
175 
176 	/* Disallow opening cloned devices through device nodes. */
177 	if (minor != BPFDEV_MINOR)
178 		return ENXIO;
179 
180 	if (TAILQ_EMPTY(&bpfl_freelist))
181 		return ENOBUFS;
182 
183 	bpfl = TAILQ_FIRST(&bpfl_freelist);
184 	TAILQ_REMOVE(&bpfl_freelist, bpfl, bpfl_next);
185 
186 	bpf = (struct bpfdev *)bpfl;
187 
188 	memset(bpf, 0, sizeof(*bpf));
189 
190 	bpf->bpf_flags = BPFF_IN_USE | BPFF_SEESENT;
191 	bpf->bpf_size = BPF_BUF_DEF;
192 	bpf->bpf_pid = getnpid(user_endpt);
193 	bpf->bpf_read.br_endpt = NONE;
194 	bpf->bpf_select.bs_endpt = NONE;
195 
196 	return CDEV_CLONED | bpfdev_get_minor(bpf);
197 }
198 
199 /*
200  * Close a BPF device.
201  */
202 static int
203 bpfdev_close(devminor_t minor)
204 {
205 	struct bpfdev *bpf;
206 
207 	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
208 		return EINVAL;
209 
210 	/*
211 	 * There cannot possibly be a pending read request, so we never need to
212 	 * cancel the read timer from here either.
213 	 */
214 	assert(bpf->bpf_read.br_endpt == NONE);
215 
216 	if (bpf->bpf_sbuf != NULL) {
217 		assert(bpf->bpf_hbuf != NULL);
218 
219 		if (munmap(bpf->bpf_sbuf, bpf->bpf_size) != 0)
220 			panic("munmap failed: %d", -errno);
221 		if (munmap(bpf->bpf_hbuf, bpf->bpf_size) != 0)
222 			panic("munmap failed: %d", -errno);
223 
224 		bpf->bpf_sbuf = NULL;
225 		bpf->bpf_hbuf = NULL;
226 	} else
227 		assert(bpf->bpf_hbuf == NULL);
228 
229 	if (bpf->bpf_filter != NULL) {
230 		assert(bpf->bpf_filterlen > 0);
231 
232 		if (munmap(bpf->bpf_filter, bpf->bpf_filterlen) != 0)
233 			panic("munmap failed: %d", -errno);
234 
235 		bpf->bpf_filter = NULL;
236 	}
237 
238 	/*
239 	 * If the BPF device was attached to an interface, and that interface
240 	 * has not disappeared in the meantime, detach from it now.
241 	 */
242 	if (bpf->bpf_ifdev != NULL) {
243 		if (bpf->bpf_flags & BPFF_PROMISC)
244 			ifdev_clear_promisc(bpf->bpf_ifdev);
245 
246 		ifdev_detach_bpf(bpf->bpf_ifdev, &bpf->bpf_link);
247 
248 		bpf->bpf_ifdev = NULL;
249 	}
250 
251 	bpf->bpf_flags = 0;		/* mark as no longer in use */
252 
253 	TAILQ_INSERT_HEAD(&bpfl_freelist, &bpf->bpf_link, bpfl_next);
254 
255 	return OK;
256 }
257 
258 /*
259  * Rotate buffers for the BPF device, by swapping the store buffer and the hold
260  * buffer.
261  */
262 static void
263 bpfdev_rotate(struct bpfdev * bpf)
264 {
265 	char *buf;
266 	size_t len;
267 
268 	/*
269 	 * When rotating, the store buffer may or may not be empty, but the
270 	 * hold buffer must always be empty.
271 	 */
272 	assert(bpf->bpf_hlen == 0);
273 
274 	buf = bpf->bpf_sbuf;
275 	len = bpf->bpf_slen;
276 	bpf->bpf_sbuf = bpf->bpf_hbuf;
277 	bpf->bpf_slen = bpf->bpf_hlen;
278 	bpf->bpf_hbuf = buf;
279 	bpf->bpf_hlen = len;
280 }
281 
282 /*
283  * Test whether any of the given select operations are ready on the BPF device,
284  * and return the set of ready operations.
285  */
286 static unsigned int
287 bpfdev_test_select(struct bpfdev * bpf, unsigned int ops)
288 {
289 	unsigned int ready_ops;
290 
291 	ready_ops = 0;
292 
293 	/*
294 	 * The BPF device is ready for reading if the hold buffer is not empty
295 	 * (i.e.: the store buffer has been filled up completely and was
296 	 * therefore rotated) or if immediate mode is set and the store buffer
297 	 * is not empty (i.e.: any packet is available at all).  In the latter
298 	 * case, the buffers will be rotated during the read.  We do not
299 	 * support applying the read timeout to selects and maintaining state
300 	 * between the select and the following read, because even though
301 	 * libpcap claims that this is the right behavior, it is just insane.
302 	 */
303 	if (ops & CDEV_OP_RD) {
304 		if (bpf->bpf_ifdev == NULL)
305 			ready_ops |= CDEV_OP_RD;
306 		else if (bpf->bpf_hlen > 0)
307 			ready_ops |= CDEV_OP_RD;
308 		else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
309 		    bpf->bpf_slen > 0)
310 			ready_ops |= CDEV_OP_RD;
311 	}
312 
313 	if (ops & CDEV_OP_WR)
314 		ready_ops |= CDEV_OP_WR;
315 
316 	return ready_ops;
317 }
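
/*
 * Illustrative sketch only: from userland, the readiness rules above map onto
 * an ordinary poll(2) loop, with POLLIN corresponding to CDEV_OP_RD.  The
 * descriptor and buffer are assumed to have been set up as described at the
 * top of this file.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		len = read(fd, buf, bufsize);	// should not block here
 */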
318 
319 /*
320  * There has been a state change on the BPF device.  If now possible, resume a
321  * pending select query, if any.
322  */
323 static void
324 bpfdev_resume_select(struct bpfdev * bpf)
325 {
326 	unsigned int ops, ready_ops;
327 	endpoint_t endpt;
328 
329 	/* First see if there is a pending select request at all. */
330 	if ((endpt = bpf->bpf_select.bs_endpt) == NONE)
331 		return;
332 	ops = bpf->bpf_select.bs_selops;
333 
334 	assert(ops != 0);
335 
336 	/* Then see if any of the pending operations are now ready. */
337 	if ((ready_ops = bpfdev_test_select(bpf, ops)) == 0)
338 		return;
339 
340 	/* If so, notify VFS about the ready operations. */
341 	chardriver_reply_select(bpf->bpf_select.bs_endpt,
342 	    bpfdev_get_minor(bpf), ready_ops);
343 
344 	/*
345 	 * Forget about the ready operations.  If that leaves no pending
346 	 * operations, forget about the select request altogether.
347 	 */
348 	if ((bpf->bpf_select.bs_selops &= ~ready_ops) == 0)
349 		bpf->bpf_select.bs_endpt = NONE;
350 }
351 
352 /*
353  * There has been a state change on the BPF device.  If now possible, resume a
354  * pending read request, if any.  If the call is a result of a timeout,
355  * 'is_timeout' is set.  In that case, the read request must be resumed with an
356  * EAGAIN error if no packets are available, and the running timer must be
357  * canceled.  Otherwise, the resumption is due to a full buffer or a
358  * disappeared interface, and 'is_timeout' is not set.  In this case, the read
359  * request must be resumed with an I/O error if no packets are available.
360  */
361 static void
362 bpfdev_resume_read(struct bpfdev * bpf, int is_timeout)
363 {
364 	ssize_t r;
365 
366 	assert(bpf->bpf_read.br_endpt != NONE);
367 
368 	/*
369 	 * If the hold buffer is still empty, see if the store buffer has
370 	 * any packets to copy out.
371 	 */
372 	if (bpf->bpf_hlen == 0)
373 		bpfdev_rotate(bpf);
374 
375 	/* Return any available packets, or otherwise an error. */
376 	if (bpf->bpf_hlen > 0) {
377 		assert(bpf->bpf_hlen <= bpf->bpf_size);
378 
379 		r = sys_safecopyto(bpf->bpf_read.br_endpt,
380 		    bpf->bpf_read.br_grant, 0, (vir_bytes)bpf->bpf_hbuf,
381 		    bpf->bpf_hlen);
382 
383 		if (r == OK) {
384 			r = (ssize_t)bpf->bpf_hlen;
385 
386 			bpf->bpf_hlen = 0;
387 
388 			assert(bpf->bpf_slen != bpf->bpf_size);
389 
390 			/*
391 			 * Allow readers to get the last packets after the
392 			 * interface has disappeared, before getting errors.
393 			 */
394 			if (bpf->bpf_ifdev == NULL)
395 				bpfdev_rotate(bpf);
396 		}
397 	} else
398 		r = (is_timeout) ? EAGAIN : EIO;
399 
400 	chardriver_reply_task(bpf->bpf_read.br_endpt, bpf->bpf_read.br_id, r);
401 
402 	bpf->bpf_read.br_endpt = NONE;
403 
404 	/* Was there still a timer running?  Then cancel it now. */
405 	if (bpf->bpf_timeout > 0 && !is_timeout)
406 		cancel_timer(&bpf->bpf_read.br_timer);
407 }
408 
409 /*
410  * A read timeout has triggered for the BPF device.  Wake up the pending read
411  * request.
412  */
413 static void
414 bpfdev_timeout(int arg)
415 {
416 	struct bpfdev *bpf;
417 
418 	assert(arg >= 0 && (unsigned int)arg < __arraycount(bpf_array));
419 
420 	bpf = &bpf_array[arg];
421 
422 	assert(bpf->bpf_read.br_endpt != NONE);
423 
424 	bpfdev_resume_read(bpf, TRUE /*is_timeout*/);
425 }
426 
427 /*
428  * Read from a BPF device.
429  */
430 static ssize_t
431 bpfdev_read(devminor_t minor, uint64_t position, endpoint_t endpt,
432 	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
433 {
434 	struct bpfdev *bpf;
435 	ssize_t r;
436 	int suspend;
437 
438 	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
439 		return EINVAL;
440 
441 	/* Allow only one read call at a time. */
442 	if (bpf->bpf_read.br_endpt != NONE)
443 		return EIO;
444 
445 	/* Has this BPF device been configured at all yet? */
446 	if (bpf->bpf_sbuf == NULL)
447 		return EINVAL;
448 
449 	/*
450 	 * Does the read call size match the entire buffer size?  This is a
451 	 * ridiculous requirement but it makes our job quite a bit easier..
452 	 */
453 	if (size != bpf->bpf_size)
454 		return EINVAL;
455 
456 	/*
457 	 * Following standard receive semantics, if the interface is gone,
458 	 * return all the packets that were pending before returning an error.
459 	 * This requires extra buffer rotations after read completion, too.
460 	 */
461 	if (bpf->bpf_ifdev == NULL && bpf->bpf_hlen == 0)
462 		return EIO;
463 
464 	/*
465 	 * If immediate mode is not enabled, we should always suspend the read
466 	 * call if the hold buffer is empty.  If immediate mode is enabled, we
467 	 * should only suspend the read call if both buffers are empty, and
468 	 * return data from the hold buffer or otherwise the store buffer,
469 	 * whichever is not empty.  A non-blocking call behaves as though
470 	 * immediate mode is enabled, except it will return EAGAIN instead of
471 	 * suspending the read call if both buffers are empty.  Thus, we may
472 	 * have to rotate buffers for both immediate mode and non-blocking
473 	 * calls.  The latter is necessary for libpcap to behave correctly.
474 	 */
475 	if ((flags & CDEV_NONBLOCK) || (bpf->bpf_flags & BPFF_IMMEDIATE))
476 		suspend = (bpf->bpf_hlen == 0 && bpf->bpf_slen == 0);
477 	else
478 		suspend = (bpf->bpf_hlen == 0);
479 
480 	if (suspend) {
481 		if (flags & CDEV_NONBLOCK)
482 			return EAGAIN;
483 
484 		/* Suspend the read call for later. */
485 		bpf->bpf_read.br_endpt = endpt;
486 		bpf->bpf_read.br_grant = grant;
487 		bpf->bpf_read.br_id = id;
488 
489 		/* Set a timer if requested. */
490 		if (bpf->bpf_timeout > 0)
491 			set_timer(&bpf->bpf_read.br_timer, bpf->bpf_timeout,
492 			    bpfdev_timeout, (int)(bpf - bpf_array));
493 
494 		return EDONTREPLY;
495 	}
496 
497 	/* If we get here, either buffer has data; rotate buffers if needed. */
498 	if (bpf->bpf_hlen == 0)
499 		bpfdev_rotate(bpf);
500 	assert(bpf->bpf_hlen > 0);
501 
502 	if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)bpf->bpf_hbuf,
503 	    bpf->bpf_hlen)) != OK)
504 		return r;
505 
506 	r = (ssize_t)bpf->bpf_hlen;
507 
508 	bpf->bpf_hlen = 0;
509 
510 	/*
511 	 * If the store buffer is exactly full, rotate it now.  Also, if the
512 	 * interface has disappeared, the store buffer will never fill up.
513 	 * Rotate it so that the application will get any remaining data before
514 	 * getting errors about the interface being gone.
515 	 */
516 	if (bpf->bpf_slen == bpf->bpf_size || bpf->bpf_ifdev == NULL)
517 		bpfdev_rotate(bpf);
518 
519 	return r;
520 }
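
/*
 * Illustrative sketch only: how a userland reader would typically walk the
 * buffer returned by the read call above.  Each record starts with a struct
 * bpf_hdr and is padded to a BPF_WORDALIGN boundary, matching the layout
 * produced by bpfdev_input() below; process_packet() is a hypothetical
 * handler.
 *
 *	char *p = buf;
 *
 *	while (p < buf + len) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
 *
 *		process_packet((u_char *)p + bh->bh_hdrlen, bh->bh_caplen);
 *
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */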
521 
522 /*
523  * Write to a BPF device.
524  */
525 static ssize_t
526 bpfdev_write(devminor_t minor, uint64_t position, endpoint_t endpt,
527 	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
528 {
529 	struct bpfdev *bpf;
530 	struct pbuf *pbuf, *pptr, *pcopy;
531 	size_t off;
532 	err_t err;
533 	int r;
534 
535 	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
536 		return EINVAL;
537 
538 	if (bpf->bpf_ifdev == NULL)
539 		return EINVAL;
540 
541 	/* VFS skips zero-sized I/O calls right now, but that may change. */
542 	if (size == 0)
543 		return 0;	/* nothing to do */
544 
545 	if (size > ifdev_get_hdrlen(bpf->bpf_ifdev) +
546 	    ifdev_get_mtu(bpf->bpf_ifdev))
547 		return EMSGSIZE;
548 
549 	if ((pbuf = pchain_alloc(PBUF_LINK, size)) == NULL)
550 		return ENOMEM;
551 
552 	/* TODO: turn this into a series of vector copies. */
553 	off = 0;
554 	for (pptr = pbuf; pptr != NULL; pptr = pptr->next) {
555 		if ((r = sys_safecopyfrom(endpt, grant, off,
556 		    (vir_bytes)pptr->payload, pptr->len)) != OK) {
557 			pbuf_free(pbuf);
558 
559 			return r;
560 		}
561 		off += pptr->len;
562 	}
563 	assert(off == size);
564 
565 	/*
566 	 * In feedback mode, we cannot use the same packet buffers for both
567 	 * output and input, so make a copy.  We do this before calling the
568 	 * output function, which may change part of the buffers, because the
569 	 * BSDs take this approach as well.
570 	 */
571 	if (bpf->bpf_flags & BPFF_FEEDBACK) {
572 		if ((pcopy = pchain_alloc(PBUF_LINK, size)) == NULL) {
573 			pbuf_free(pbuf);
574 
575 			return ENOMEM;
576 		}
577 
578 		if (pbuf_copy(pcopy, pbuf) != ERR_OK)
579 			panic("unexpected pbuf copy failure");
580 	} else
581 		pcopy = NULL;
582 
583 	/* Pass in the packet as output, and free it again. */
584 	err = ifdev_output(bpf->bpf_ifdev, pbuf, NULL /*netif*/,
585 	    TRUE /*to_bpf*/, !!(bpf->bpf_flags & BPFF_HDRCMPLT));
586 
587 	pbuf_free(pbuf);
588 
589 	/* In feedback mode, pass in the copy as input, if output succeeded. */
590 	if (err == ERR_OK && (bpf->bpf_flags & BPFF_FEEDBACK))
591 		ifdev_input(bpf->bpf_ifdev, pcopy, NULL /*netif*/,
592 		    FALSE /*to_bpf*/);
593 	else if (pcopy != NULL)
594 		pbuf_free(pcopy);
595 
596 	return (err == ERR_OK) ? (ssize_t)size : util_convert_err(err);
597 }
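
/*
 * Illustrative sketch only: injecting a packet from userland through the
 * write call above.  The frame contents are hypothetical; the total size must
 * not exceed the link header length plus the interface MTU.
 *
 *	unsigned int on = 1;
 *	uint8_t frame[64];			// a complete link-layer frame
 *
 *	build_test_frame(frame, sizeof(frame));	// hypothetical helper
 *
 *	ioctl(fd, BIOCSHDRCMPLT, &on);	// keep our own link-layer source address
 *	write(fd, frame, sizeof(frame));
 */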
598 
599 /*
600  * Attach a BPF device to a network interface, using the interface name given
601  * in an ifreq structure.  As side effect, allocate hold and store buffers for
602  * the device.  These buffers will stay allocated until the device is closed,
603  * even though the interface may disappear before that.  Return OK if the BPF
604  * device was successfully attached to the interface, or a negative error code
605  * otherwise.
606  */
607 static int
608 bpfdev_attach(struct bpfdev * bpf, struct ifreq * ifr)
609 {
610 	struct ifdev *ifdev;
611 	void *sbuf, *hbuf;
612 
613 	/* Find the interface with the given name. */
614 	ifr->ifr_name[sizeof(ifr->ifr_name) - 1] = '\0';
615 	if ((ifdev = ifdev_find_by_name(ifr->ifr_name)) == NULL)
616 		return ENXIO;
617 
618 	/*
619 	 * Allocate a store buffer and a hold buffer.  Preallocate the memory,
620 	 * or we might get killed later during low-memory conditions.
621 	 */
622 	if ((sbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
623 	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED)
624 		return ENOMEM;
625 
626 	if ((hbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
627 	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) {
628 		(void)munmap(sbuf, bpf->bpf_size);
629 
630 		return ENOMEM;
631 	}
632 
633 	bpf->bpf_ifdev = ifdev;
634 	bpf->bpf_sbuf = sbuf;
635 	bpf->bpf_hbuf = hbuf;
636 	assert(bpf->bpf_slen == 0);
637 	assert(bpf->bpf_hlen == 0);
638 
639 	ifdev_attach_bpf(ifdev, &bpf->bpf_link);
640 
641 	return OK;
642 }
643 
644 /*
645  * Detach the BPF device from its interface, which is about to disappear.
646  */
647 void
648 bpfdev_detach(struct bpfdev_link * bpfl)
649 {
650 	struct bpfdev *bpf = (struct bpfdev *)bpfl;
651 
652 	assert(bpf->bpf_flags & BPFF_IN_USE);
653 	assert(bpf->bpf_ifdev != NULL);
654 
655 	/*
656 	 * We deliberately leave the buffers allocated here, for two reasons:
657 	 *
658 	 * 1) it allows applications to read any last packets in the buffers;
659 	 * 2) it prevents reattaching the BPF device to another interface.
660 	 */
661 	bpf->bpf_ifdev = NULL;
662 
663 	/*
664 	 * Resume pending read and select requests, returning any data left,
665 	 * or an error if none.
666 	 */
667 	if (bpf->bpf_hlen == 0)
668 		bpfdev_rotate(bpf);
669 
670 	if (bpf->bpf_read.br_endpt != NONE)
671 		bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
672 
673 	bpfdev_resume_select(bpf);
674 }
675 
676 /*
677  * Flush the given BPF device, resetting its buffer contents and statistics
678  * counters.
679  */
680 static void
681 bpfdev_flush(struct bpfdev * bpf)
682 {
683 
684 	bpf->bpf_slen = 0;
685 	bpf->bpf_hlen = 0;
686 
687 	bpf->bpf_stat.bs_recv = 0;
688 	bpf->bpf_stat.bs_drop = 0;
689 	bpf->bpf_stat.bs_capt = 0;
690 }
691 
692 /*
693  * Install a filter program on the BPF device.  A new filter replaces any old
694  * one.  A zero-sized filter simply clears a previous filter.  On success,
695  * perform a flush and return OK.  On failure, return a negative error code
696  * without making any modifications to the current filter.
697  */
698 static int
699 bpfdev_setfilter(struct bpfdev * bpf, endpoint_t endpt, cp_grant_id_t grant)
700 {
701 	struct bpf_insn *filter;
702 	unsigned int count;
703 	size_t len;
704 	int r;
705 
706 	if ((r = sys_safecopyfrom(endpt, grant,
707 	    offsetof(struct minix_bpf_program, mbf_len), (vir_bytes)&count,
708 	    sizeof(count))) != OK)
709 		return r;
710 
711 	if (count > BPF_MAXINSNS)
712 		return EINVAL;
713 	len = count * sizeof(struct bpf_insn);
714 
715 	if (len > 0) {
716 		if ((filter = (struct bpf_insn *)mmap(NULL, len,
717 		    PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0)) ==
718 		    MAP_FAILED)
719 			return ENOMEM;
720 
721 		if ((r = sys_safecopyfrom(endpt, grant,
722 		    offsetof(struct minix_bpf_program, mbf_insns),
723 		    (vir_bytes)filter, len)) != OK) {
724 			(void)munmap(filter, len);
725 
726 			return r;
727 		}
728 
729 		if (!bpf_validate(filter, count)) {
730 			(void)munmap(filter, len);
731 
732 			return EINVAL;
733 		}
734 	} else
735 		filter = NULL;
736 
737 	if (bpf->bpf_filter != NULL)
738 		(void)munmap(bpf->bpf_filter, bpf->bpf_filterlen);
739 
740 	bpf->bpf_filter = filter;
741 	bpf->bpf_filterlen = len;
742 
743 	bpfdev_flush(bpf);
744 
745 	return OK;
746 }
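
/*
 * Illustrative sketch only: the kind of program installed by this function.
 * Userland builds standard BPF instructions and issues BIOCSETF; on MINIX the
 * instructions are assumed to arrive here as the flattened MINIX_BIOCSETF
 * request handled in bpfdev_ioctl() below.  This trivial filter accepts every
 * packet in full; a return value of zero would reject a packet instead.
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, (u_int)-1),	// capture whole packet
 *	};
 *	struct bpf_program prog = { sizeof(insns) / sizeof(insns[0]), insns };
 *
 *	ioctl(fd, BIOCSETF, &prog);
 */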
747 
748 /*
749  * Process an I/O control request on the BPF device.
750  */
751 static int
752 bpfdev_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
753 	cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
754 {
755 	struct bpfdev *bpf;
756 	struct bpf_stat bs;
757 	struct bpf_version bv;
758 	struct bpf_dltlist bfl;
759 	struct timeval tv;
760 	struct ifreq ifr;
761 	unsigned int uval;
762 	int r, val;
763 
764 	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
765 		return EINVAL;
766 
767 	/*
768 	 * We do not support multiple concurrent requests in this module.  That
769 	 * not only means that we forbid a read(2) call on a BPF device object
770 	 * while another read(2) is already pending: we also disallow IOCTL
771 	 * calls while such a read(2) call is in progress.  This restriction
772 	 * should never be a problem for user programs, and allows us to rely
773 	 * on the fact that no settings can change between the
774 	 * start and end of any read call.  As a side note, pending select(2)
775 	 * queries may be similarly affected, and will also not be fully
776 	 * accurate if any options are changed while pending.
777 	 */
778 	if (bpf->bpf_read.br_endpt != NONE)
779 		return EIO;
780 
781 	bpf->bpf_pid = getnpid(user_endpt);
782 
783 	/* These are in order of the NetBSD BIOC.. IOCTL numbers. */
784 	switch (request) {
785 	case BIOCGBLEN:
786 		uval = bpf->bpf_size;
787 
788 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
789 		    sizeof(uval));
790 
791 	case BIOCSBLEN:
792 		if (bpf->bpf_sbuf != NULL)
793 			return EINVAL;
794 
795 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
796 		    sizeof(uval))) != OK)
797 			return r;
798 
799 		if (uval < BPF_BUF_MIN)
800 			uval = BPF_BUF_MIN;
801 		else if (uval > BPF_BUF_MAX)
802 			uval = BPF_BUF_MAX;
803 
804 		/* Is this the right thing to do?  It doesn't matter for us. */
805 		uval = BPF_WORDALIGN(uval);
806 
807 		if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
808 		    sizeof(uval))) != OK)
809 			return r;
810 
811 		bpf->bpf_size = uval;
812 
813 		return OK;
814 
815 	case MINIX_BIOCSETF:
816 		return bpfdev_setfilter(bpf, endpt, grant);
817 
818 	case BIOCPROMISC:
819 		if (bpf->bpf_ifdev == NULL)
820 			return EINVAL;
821 
822 		if (!(bpf->bpf_flags & BPFF_PROMISC)) {
823 			if (!ifdev_set_promisc(bpf->bpf_ifdev))
824 				return EINVAL;
825 
826 			bpf->bpf_flags |= BPFF_PROMISC;
827 		}
828 
829 		return OK;
830 
831 	case BIOCFLUSH:
832 		bpfdev_flush(bpf);
833 
834 		return OK;
835 
836 	case BIOCGDLT:
837 		if (bpf->bpf_ifdev == NULL)
838 			return EINVAL;
839 
840 		/* TODO: support for type configuration per BPF device. */
841 		uval = ifdev_get_dlt(bpf->bpf_ifdev);
842 
843 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
844 		    sizeof(uval));
845 
846 	case BIOCGETIF:
847 		if (bpf->bpf_ifdev == NULL)
848 			return EINVAL;
849 
850 		memset(&ifr, 0, sizeof(ifr));
851 		strlcpy(ifr.ifr_name, ifdev_get_name(bpf->bpf_ifdev),
852 		    sizeof(ifr.ifr_name));
853 
854 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&ifr,
855 		    sizeof(ifr));
856 
857 	case BIOCSETIF:
858 		/*
859 		 * Test for the presence of a buffer rather than an interface: the
860 		 * interface pointer is reset to NULL if the interface disappears,
861 		 * in which case we do not want to allow rebinding to another one.
862 		 */
863 		if (bpf->bpf_sbuf != NULL)
864 			return EINVAL;
865 
866 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&ifr,
867 		    sizeof(ifr))) != OK)
868 			return r;
869 
870 		return bpfdev_attach(bpf, &ifr);
871 
872 	case BIOCGSTATS:
873 		/*
874 		 * Why do we not embed a bpf_stat structure directly in the
875 		 * BPF device structure?  Well, bpf_stat has massive padding..
876 		 */
877 		memset(&bs, 0, sizeof(bs));
878 		bs.bs_recv = bpf->bpf_stat.bs_recv;
879 		bs.bs_drop = bpf->bpf_stat.bs_drop;
880 		bs.bs_capt = bpf->bpf_stat.bs_capt;
881 
882 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bs,
883 		    sizeof(bs));
884 
885 	case BIOCIMMEDIATE:
886 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
887 		    sizeof(uval))) != OK)
888 			return r;
889 
890 		if (uval)
891 			bpf->bpf_flags |= BPFF_IMMEDIATE;
892 		else
893 			bpf->bpf_flags &= ~BPFF_IMMEDIATE;
894 
895 		return OK;
896 
897 	case BIOCVERSION:
898 		memset(&bv, 0, sizeof(bv));
899 		bv.bv_major = BPF_MAJOR_VERSION;
900 		bv.bv_minor = BPF_MINOR_VERSION;
901 
902 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bv,
903 		    sizeof(bv));
904 
905 	case BIOCGHDRCMPLT:
906 		uval = !!(bpf->bpf_flags & BPFF_HDRCMPLT);
907 
908 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
909 		    sizeof(uval));
910 
911 	case BIOCSHDRCMPLT:
912 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
913 		    sizeof(uval))) != OK)
914 			return r;
915 
916 		if (uval)
917 			bpf->bpf_flags |= BPFF_HDRCMPLT;
918 		else
919 			bpf->bpf_flags &= ~BPFF_HDRCMPLT;
920 
921 		return OK;
922 
923 	case BIOCSDLT:
924 		if (bpf->bpf_ifdev == NULL)
925 			return EINVAL;
926 
927 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
928 		    sizeof(uval))) != OK)
929 			return r;
930 
931 		/* TODO: support for type configuration per BPF device. */
932 		if (uval != ifdev_get_dlt(bpf->bpf_ifdev))
933 			return EINVAL;
934 
935 		return OK;
936 
937 	case MINIX_BIOCGDLTLIST:
938 		if (bpf->bpf_ifdev == NULL)
939 			return EINVAL;
940 
941 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&bfl,
942 		    sizeof(bfl))) != OK)
943 			return r;
944 
945 		if (bfl.bfl_list != NULL) {
946 			if (bfl.bfl_len < 1)
947 				return ENOMEM;
948 
949 			/*
950 			 * Copy out the 'list', which consists of one entry.
951 			 * If we were to produce multiple entries, we would
952 			 * have to check against the MINIX_BPF_MAXDLT limit.
953 			 */
954 			uval = ifdev_get_dlt(bpf->bpf_ifdev);
955 
956 			if ((r = sys_safecopyto(endpt, grant,
957 			    offsetof(struct minix_bpf_dltlist, mbfl_list),
958 			    (vir_bytes)&uval, sizeof(uval))) != OK)
959 				return r;
960 		}
961 		bfl.bfl_len = 1;
962 
963 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bfl,
964 		    sizeof(bfl));
965 
966 	case BIOCGSEESENT:
967 		uval = !!(bpf->bpf_flags & BPFF_SEESENT);
968 
969 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
970 		    sizeof(uval));
971 
972 	case BIOCSSEESENT:
973 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
974 		    sizeof(uval))) != OK)
975 			return r;
976 
977 		if (uval)
978 			bpf->bpf_flags |= BPFF_SEESENT;
979 		else
980 			bpf->bpf_flags &= ~BPFF_SEESENT;
981 
982 		return OK;
983 
984 	case BIOCSRTIMEOUT:
985 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&tv,
986 		    sizeof(tv))) != OK)
987 			return r;
988 
989 		if ((r = util_timeval_to_ticks(&tv, &bpf->bpf_timeout)) != OK)
990 			return r;
991 
992 		return OK;
993 
994 	case BIOCGRTIMEOUT:
995 		util_ticks_to_timeval(bpf->bpf_timeout, &tv);
996 
997 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&tv,
998 		    sizeof(tv));
999 
1000 	case BIOCGFEEDBACK:
1001 		uval = !!(bpf->bpf_flags & BPFF_FEEDBACK);
1002 
1003 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
1004 		    sizeof(uval));
1005 
1006 	case BIOCSFEEDBACK:
1007 		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
1008 		    sizeof(uval))) != OK)
1009 			return r;
1010 
1011 		if (uval)
1012 			bpf->bpf_flags |= BPFF_FEEDBACK;
1013 		else
1014 			bpf->bpf_flags &= ~BPFF_FEEDBACK;
1015 
1016 		return OK;
1017 
1018 	case FIONREAD:
1019 		val = 0;
1020 		if (bpf->bpf_hlen > 0)
1021 			val = bpf->bpf_hlen;
1022 		else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
1023 		    bpf->bpf_slen > 0)
1024 			val = bpf->bpf_slen;
1025 		else
1026 			val = 0;
1027 
1028 		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&val,
1029 		    sizeof(val));
1030 
1031 	default:
1032 		return ENOTTY;
1033 	}
1034 }
1035 
1036 /*
1037  * Cancel a previously suspended request on a BPF device.  Since only read
1038  * requests may be suspended (select is handled differently), the cancel
1039  * request must be for a read request.  Note that character devices currently
1040  * (still) behave slightly differently from socket devices here: while socket
1041  * drivers are supposed to respond to the original request, character drivers
1042  * must respond to the original request from the cancel callback.
1043  */
1044 static int
1045 bpfdev_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
1046 {
1047 	struct bpfdev *bpf;
1048 
1049 	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
1050 		return EDONTREPLY;
1051 
1052 	/* Is this a cancel request for the currently pending read request? */
1053 	if (bpf->bpf_read.br_endpt != endpt || bpf->bpf_read.br_id != id)
1054 		return EDONTREPLY;
1055 
1056 	/* If so, cancel the read request. */
1057 	if (bpf->bpf_timeout > 0)
1058 		cancel_timer(&bpf->bpf_read.br_timer);
1059 
1060 	bpf->bpf_read.br_endpt = NONE;
1061 
1062 	return EINTR; /* the return value for the canceled read request */
1063 }
1064 
1065 /*
1066  * Perform a select query on a BPF device.
1067  */
1068 static int
1069 bpfdev_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
1070 {
1071 	struct bpfdev *bpf;
1072 	unsigned int r, notify;
1073 
1074 	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
1075 		return EINVAL;
1076 
1077 	notify = (ops & CDEV_NOTIFY);
1078 	ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);
1079 
1080 	r = bpfdev_test_select(bpf, ops);
1081 
1082 	/*
1083 	 * For the operations that were not immediately ready, if requested,
1084 	 * save the select request for later.
1085 	 */
1086 	ops &= ~r;
1087 
1088 	if (ops != 0 && notify) {
1089 		if (bpf->bpf_select.bs_endpt != NONE) {
1090 			/* Merge in the operations with any earlier request. */
1091 			if (bpf->bpf_select.bs_endpt != endpt)
1092 				return EIO;
1093 			bpf->bpf_select.bs_selops |= ops;
1094 		} else {
1095 			bpf->bpf_select.bs_endpt = endpt;
1096 			bpf->bpf_select.bs_selops = ops;
1097 		}
1098 	}
1099 
1100 	return r;
1101 }
1102 
1103 /*
1104  * Process an incoming packet on the interface to which the given BPF device is
1105  * attached.  If the packet passes the filter (if any), store as much as
1106  * requested of it in the store buffer, rotating buffers if needed and resuming
1107  * suspended read and select requests as appropriate.  This function is also
1108  * called through bpfdev_output() below.
1109  */
1110 void
1111 bpfdev_input(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
1112 {
1113 	struct bpfdev *bpf = (struct bpfdev *)bpfl;
1114 	struct timespec ts;
1115 	struct bpf_hdr bh;
1116 	const struct pbuf *pptr;
1117 	size_t caplen, hdrlen, totlen, off, chunk;
1118 	int hfull;
1119 
1120 	/*
1121 	 * Apparently bs_recv is the counter of packets that were run through
1122 	 * the filter, not the number of packets that were or could be received
1123 	 * by the user (which is what I got from the manual page.. oh well).
1124 	 */
1125 	bpf->bpf_stat.bs_recv++;
1126 	bpf_stat.bs_recv++;
1127 
1128 	/*
1129 	 * Run the packet through the BPF device's filter to see whether the
1130 	 * packet should be stored and if so, how much of it.  If no filter is
1131 	 * set, all packets will be stored in their entirety.
1132 	 */
1133 	caplen = bpf_filter_ext(bpf->bpf_filter, pbuf, (u_char *)pbuf->payload,
1134 	    pbuf->tot_len, pbuf->len);
1135 
1136 	if (caplen == 0)
1137 		return;		/* no match; ignore packet */
1138 
1139 	if (caplen > pbuf->tot_len)
1140 		caplen = pbuf->tot_len;
1141 
1142 	/* Truncate packet entries to the full size of the buffers. */
1143 	hdrlen = BPF_WORDALIGN(sizeof(bh));
1144 	totlen = BPF_WORDALIGN(hdrlen + caplen);
1145 
1146 	if (totlen > bpf->bpf_size) {
1147 		totlen = bpf->bpf_size;
1148 		caplen = totlen - hdrlen;
1149 	}
1150 	assert(totlen >= hdrlen);
1151 
1152 	bpf->bpf_stat.bs_capt++;
1153 	bpf_stat.bs_capt++;
1154 
1155 	assert(bpf->bpf_sbuf != NULL);
1156 	if (totlen > bpf->bpf_size - bpf->bpf_slen) {
1157 		/*
1158 		 * If the store buffer is full and the hold buffer is not
1159 		 * empty, we cannot swap the two buffers, and so we must drop
1160 		 * the current packet.
1161 		 */
1162 		if (bpf->bpf_hlen > 0) {
1163 			bpf->bpf_stat.bs_drop++;
1164 			bpf_stat.bs_drop++;
1165 
1166 			return;
1167 		}
1168 
1169 		/*
1170 		 * Rotate the buffers: the hold buffer will now be "full" and
1171 		 * ready to be read - it may not actually be entirely full, but
1172 		 * we could not fit this packet and we are not going to deliver
1173 		 * packets out of order..
1174 		 */
1175 		bpfdev_rotate(bpf);
1176 
1177 		hfull = TRUE;
1178 	} else
1179 		hfull = FALSE;
1180 
1181 	/*
1182 	 * Retrieve the capture time for the packet.  Ideally this would be
1183 	 * done only once per accepted packet, but we do not expect many BPF
1184 	 * devices to be receiving the same packets often enough to make that
1185 	 * worth it.
1186 	 */
1187 	clock_time(&ts);
1188 
1189 	/*
1190 	 * Copy the packet into the store buffer, including a newly generated
1191 	 * header.  Zero any padding areas, even if strictly not necessary.
1192 	 */
1193 	memset(&bh, 0, sizeof(bh));
1194 	bh.bh_tstamp.tv_sec = ts.tv_sec;
1195 	bh.bh_tstamp.tv_usec = ts.tv_nsec / 1000;
1196 	bh.bh_caplen = caplen;
1197 	bh.bh_datalen = pbuf->tot_len;
1198 	bh.bh_hdrlen = hdrlen;
1199 
1200 	assert(bpf->bpf_sbuf != NULL);
1201 	off = bpf->bpf_slen;
1202 
1203 	memcpy(&bpf->bpf_sbuf[off], &bh, sizeof(bh));
1204 	if (hdrlen > sizeof(bh))
1205 		memset(&bpf->bpf_sbuf[off + sizeof(bh)], 0,
1206 		    hdrlen - sizeof(bh));
1207 	off += hdrlen;
1208 
1209 	for (pptr = pbuf; pptr != NULL && caplen > 0; pptr = pptr->next) {
1210 		chunk = pptr->len;
1211 		if (chunk > caplen)
1212 			chunk = caplen;
1213 
1214 		memcpy(&bpf->bpf_sbuf[off], pptr->payload, chunk);
1215 
1216 		off += chunk;
1217 		caplen -= chunk;
1218 	}
1219 
1220 	assert(off <= bpf->bpf_slen + totlen);
1221 	if (bpf->bpf_slen + totlen > off)
1222 		memset(&bpf->bpf_sbuf[off], 0, bpf->bpf_slen + totlen - off);
1223 
1224 	bpf->bpf_slen += totlen;
1225 
1226 	/*
1227 	 * Edge case: if the hold buffer is empty and the store buffer is now
1228 	 * exactly full, rotate buffers so that the packets can be read
1229 	 * immediately, without waiting for the next packet to cause rotation.
1230 	 */
1231 	if (bpf->bpf_hlen == 0 && bpf->bpf_slen == bpf->bpf_size) {
1232 		bpfdev_rotate(bpf);
1233 
1234 		hfull = TRUE;
1235 	}
1236 
1237 	/*
1238 	 * If the hold buffer is now full, or if immediate mode is enabled,
1239 	 * then we now have data to deliver to userland.  See if we can wake up
1240 	 * any read or select call (either but not both here).
1241 	 */
1242 	if (hfull || (bpf->bpf_flags & BPFF_IMMEDIATE)) {
1243 		if (bpf->bpf_read.br_endpt != NONE)
1244 			bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
1245 		else
1246 			bpfdev_resume_select(bpf);
1247 	}
1248 }
1249 
1250 /*
1251  * Process an outgoing packet on the interface to which the given BPF device is
1252  * attached.  If the BPF device is configured to capture outgoing packets as
1253  * well, attempt to capture the packet as per bpfdev_input().
1254  */
1255 void
1256 bpfdev_output(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
1257 {
1258 	struct bpfdev *bpf = (struct bpfdev *)bpfl;
1259 
1260 	if (bpf->bpf_flags & BPFF_SEESENT)
1261 		bpfdev_input(bpfl, pbuf);
1262 }
1263 
1264 /*
1265  * Fill the given 'bde' structure with information about BPF device 'bpf'.
1266  */
1267 static void
1268 bpfdev_get_info(struct bpf_d_ext * bde, const struct bpfdev * bpf)
1269 {
1270 
1271 	bde->bde_bufsize = bpf->bpf_size;
1272 	bde->bde_promisc = !!(bpf->bpf_flags & BPFF_PROMISC);
1273 	bde->bde_state = BPF_IDLE;
1274 	bde->bde_immediate = !!(bpf->bpf_flags & BPFF_IMMEDIATE);
1275 	bde->bde_hdrcmplt = !!(bpf->bpf_flags & BPFF_HDRCMPLT);
1276 	bde->bde_seesent = !!(bpf->bpf_flags & BPFF_SEESENT);
1277 	/*
1278 	 * NetBSD updates the process ID upon device open, close, ioctl, and
1279 	 * poll.  From those, only open and ioctl make sense for us.  Sadly
1280 	 * there is no way to indicate "no known PID" to netstat(1), so we
1281 	 * cannot even save just the endpoint and look up the corresponding PID
1282 	 * later, since the user process may be gone by then.
1283 	 */
1284 	bde->bde_pid = bpf->bpf_pid;
1285 	bde->bde_rcount = bpf->bpf_stat.bs_recv;
1286 	bde->bde_dcount = bpf->bpf_stat.bs_drop;
1287 	bde->bde_ccount = bpf->bpf_stat.bs_capt;
1288 	if (bpf->bpf_ifdev != NULL)
1289 		strlcpy(bde->bde_ifname, ifdev_get_name(bpf->bpf_ifdev),
1290 		    sizeof(bde->bde_ifname));
1291 }
1292 
1293 /*
1294  * Obtain statistics about open BPF devices ("peers").  This node may be
1295  * accessed by the superuser only.  Used by netstat(1).
1296  */
1297 static ssize_t
1298 bpfdev_peers(struct rmib_call * call, struct rmib_node * node __unused,
1299 	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
1300 {
1301 	struct bpfdev *bpf;
1302 	struct bpf_d_ext bde;
1303 	unsigned int slot;
1304 	ssize_t off;
1305 	int r, size, max;
1306 
1307 	if (!(call->call_flags & RMIB_FLAG_AUTH))
1308 		return EPERM;
1309 
1310 	if (call->call_namelen != 2)
1311 		return EINVAL;
1312 
1313 	size = call->call_name[0];
1314 	if (size < 0 || (size_t)size > sizeof(bde))
1315 		return EINVAL;
1316 	if (size == 0)
1317 		size = sizeof(bde);
1318 	max = call->call_name[1];
1319 
1320 	off = 0;
1321 
1322 	for (slot = 0; slot < __arraycount(bpf_array); slot++) {
1323 		bpf = &bpf_array[slot];
1324 
1325 		if (!(bpf->bpf_flags & BPFF_IN_USE))
1326 			continue;
1327 
1328 		if (rmib_inrange(oldp, off)) {
1329 			memset(&bde, 0, sizeof(bde));
1330 
1331 			bpfdev_get_info(&bde, bpf);
1332 
1333 			if ((r = rmib_copyout(oldp, off, &bde, size)) < 0)
1334 				return r;
1335 		}
1336 
1337 		off += sizeof(bde);
1338 		if (max > 0 && --max == 0)
1339 			break;
1340 	}
1341 
1342 	/* No slack needed: netstat(1) resizes its buffer as needed. */
1343 	return off;
1344 }
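
/*
 * Illustrative sketch only: how a tool such as netstat(1) would query the
 * node above.  The last two name components are interpreted here as the
 * caller's per-entry size and maximum entry count; sysctlnametomib(3) is
 * assumed to resolve the dynamically numbered "net.bpf.peers" node.
 *
 *	int mib[7];
 *	size_t namelen = __arraycount(mib);
 *	size_t len;
 *
 *	sysctlnametomib("net.bpf.peers", mib, &namelen);
 *	mib[namelen++] = sizeof(struct bpf_d_ext);	// per-entry size
 *	mib[namelen++] = 0;				// 0 = no entry limit
 *
 *	sysctl(mib, namelen, NULL, &len, NULL, 0);	// query the size first
 *	// ...allocate len bytes and call sysctl() again to fetch the entries
 */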
1345 
1346 static const struct chardriver bpfdev_tab = {
1347 	.cdr_open		= bpfdev_open,
1348 	.cdr_close		= bpfdev_close,
1349 	.cdr_read		= bpfdev_read,
1350 	.cdr_write		= bpfdev_write,
1351 	.cdr_ioctl		= bpfdev_ioctl,
1352 	.cdr_cancel		= bpfdev_cancel,
1353 	.cdr_select		= bpfdev_select
1354 };
1355 
1356 /*
1357  * Process a character driver request.  Since the LWIP service offers character
1358  * devices for BPF only, it must be a request for a BPF device.
1359  */
1360 void
1361 bpfdev_process(message * m_ptr, int ipc_status)
1362 {
1363 
1364 	chardriver_process(&bpfdev_tab, m_ptr, ipc_status);
1365 }
1366