/*	$OpenBSD: bpf.c,v 1.207 2021/11/10 04:45:15 dlg Exp $	*/
/*	$NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $	*/

/*
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2010, 2014 Henning Brauer <henning@openbsd.org>
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)bpf.c	8.2 (Berkeley) 3/28/94
 */

#include "bpfilter.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/ioctl.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/poll.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/rwlock.h>
#include <sys/atomic.h>
#include <sys/smr.h>
#include <sys/specdev.h>
#include <sys/selinfo.h>
#include <sys/sigio.h>
#include <sys/task.h>
#include <sys/time.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpfdesc.h>

#include <netinet/in.h>
#include <netinet/if_ether.h>

#include "vlan.h"
#if NVLAN > 0
#include <net/if_vlan_var.h>
#endif

#define BPF_BUFSIZE 32768

#define PRINET  26			/* interruptible */

/*
 * The default read buffer size is patchable.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = BPF_MAXBUFSIZE;

/*
 *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
 *  bpf_d_list is the list of descriptors
 */
struct bpf_if	*bpf_iflist;
LIST_HEAD(, bpf_d) bpf_d_list;

int	bpf_allocbufs(struct bpf_d *);
void	bpf_ifname(struct bpf_if*, struct ifreq *);
void	bpf_mcopy(const void *, void *, size_t);
int	bpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
	    struct sockaddr *);
int	bpf_setif(struct bpf_d *, struct ifreq *);
int	bpfpoll(dev_t, int, struct proc *);
int	bpfkqfilter(dev_t, struct knote *);
void	bpf_wakeup(struct bpf_d *);
void	bpf_wakeup_cb(void *);
int	_bpf_mtap(caddr_t, const struct mbuf *, const struct mbuf *, u_int);
void	bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
	    const struct bpf_hdr *);
int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
int	bpf_setdlt(struct bpf_d *, u_int);

void	filt_bpfrdetach(struct knote *);
int	filt_bpfread(struct knote *, long);

int	bpf_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t);

struct bpf_d *bpfilter_lookup(int);

/*
 * Called holding ``bd_mtx''.
 */
void	bpf_attachd(struct bpf_d *, struct bpf_if *);
void	bpf_detachd(struct bpf_d *);
void	bpf_resetd(struct bpf_d *);

void	bpf_prog_smr(void *);
void	bpf_d_smr(void *);

/*
 * Reference count access to descriptor buffers
 */
void	bpf_get(struct bpf_d *);
void	bpf_put(struct bpf_d *);


struct rwlock bpf_sysctl_lk = RWLOCK_INITIALIZER("bpfsz");

int
bpf_movein(struct uio *uio, struct bpf_d *d, struct mbuf **mp,
    struct sockaddr *sockp)
{
	struct bpf_program_smr *bps;
	struct bpf_insn *fcode = NULL;
	struct mbuf *m;
	struct m_tag *mtag;
	int error;
	u_int hlen;
	u_int len;
	u_int linktype;
	u_int slen;

	/*
	 * Build a sockaddr based on the data link layer type.
	 * We do this at this level because the ethernet header
	 * is copied directly into the data field of the sockaddr.
	 * In the case of SLIP, there is no header and the packet
	 * is forwarded as is.
	 * Also, we are careful to leave room at the front of the mbuf
	 * for the link level header.
	 */
	linktype = d->bd_bif->bif_dlt;
	switch (linktype) {

	case DLT_SLIP:
		sockp->sa_family = AF_INET;
		hlen = 0;
		break;

	case DLT_PPP:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_EN10MB:
		sockp->sa_family = AF_UNSPEC;
		/* XXX Would MAXLINKHDR be better? */
		hlen = ETHER_HDR_LEN;
		break;

	case DLT_IEEE802_11:
	case DLT_IEEE802_11_RADIO:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_RAW:
	case DLT_NULL:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_LOOP:
		sockp->sa_family = AF_UNSPEC;
		hlen = sizeof(u_int32_t);
		break;

	default:
		return (EIO);
	}

	if (uio->uio_resid > MAXMCLBYTES)
		return (EIO);
	len = uio->uio_resid;

	MGETHDR(m, M_WAIT, MT_DATA);
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.len = len - hlen;

	if (len > MHLEN) {
		MCLGETL(m, M_WAIT, len);
		if ((m->m_flags & M_EXT) == 0) {
			error = ENOBUFS;
			goto bad;
		}
	}
	m->m_len = len;
	*mp = m;

	error = uiomove(mtod(m, caddr_t), len, uio);
	if (error)
		goto bad;

	smr_read_enter();
	bps = SMR_PTR_GET(&d->bd_wfilter);
	if (bps != NULL)
		fcode = bps->bps_bf.bf_insns;
	slen = bpf_filter(fcode, mtod(m, u_char *), len, len);
	smr_read_leave();

	if (slen < len) {
		error = EPERM;
		goto bad;
	}

	if (m->m_len < hlen) {
		error = EPERM;
		goto bad;
	}
	/*
	 * Make room for link header, and copy it to sockaddr
	 */
	if (hlen != 0) {
		if (linktype == DLT_LOOP) {
			u_int32_t af;

			/* the link header indicates the address family */
			KASSERT(hlen == sizeof(u_int32_t));
			memcpy(&af, m->m_data, hlen);
			sockp->sa_family = ntohl(af);
		} else
			memcpy(sockp->sa_data, m->m_data, hlen);
		m->m_len -= hlen;
		m->m_data += hlen; /* XXX */
	}

	/*
	 * Prepend the data link type as a mbuf tag
	 */
	mtag = m_tag_get(PACKET_TAG_DLT, sizeof(u_int), M_WAIT);
	*(u_int *)(mtag + 1) = linktype;
	m_tag_prepend(m, mtag);

	return (0);
 bad:
	m_freem(m);
	return (error);
}

/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */

	d->bd_bif = bp;

	KERNEL_ASSERT_LOCKED();
	SMR_SLIST_INSERT_HEAD_LOCKED(&bp->bif_dlist, d, bd_next);

	*bp->bif_driverp = bp;
}

/*
 * Detach a file from its interface.
 */
void
bpf_detachd(struct bpf_d *d)
{
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	bp = d->bd_bif;
	/* Not attached. */
	if (bp == NULL)
		return;

	/* Remove ``d'' from the interface's descriptor list. */
	KERNEL_ASSERT_LOCKED();
	SMR_SLIST_REMOVE_LOCKED(&bp->bif_dlist, d, bpf_d, bd_next);

	if (SMR_SLIST_EMPTY_LOCKED(&bp->bif_dlist)) {
		/*
		 * Let the driver know that there are no more listeners.
		 */
		*bp->bif_driverp = NULL;
	}

	d->bd_bif = NULL;

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
	if (d->bd_promisc) {
		int error;

		KASSERT(bp->bif_ifp != NULL);

		d->bd_promisc = 0;

		bpf_get(d);
		mtx_leave(&d->bd_mtx);
		NET_LOCK();
		error = ifpromisc(bp->bif_ifp, 0);
		NET_UNLOCK();
		mtx_enter(&d->bd_mtx);
		bpf_put(d);

		if (error && !(error == EINVAL || error == ENODEV ||
		    error == ENXIO))
			/*
			 * Something is really wrong if we were able to put
			 * the driver into promiscuous mode, but can't
			 * take it out.
			 */
			panic("bpf: ifpromisc failed");
	}
}

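/*
 * Called once at attach time to initialize the global list of
 * descriptors; the devices themselves are created on demand in
 * bpfopen().
 */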
void
bpfilterattach(int n)
{
	LIST_INIT(&bpf_d_list);
}

/*
 * Open the bpf device.  Returns ENXIO for an illegal minor device
 * number, EBUSY if a descriptor cannot be allocated.
 */
int
bpfopen(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *bd;
	int unit = minor(dev);

	if (unit & ((1 << CLONE_SHIFT) - 1))
		return (ENXIO);

	KASSERT(bpfilter_lookup(unit) == NULL);

	/* create on demand */
	if ((bd = malloc(sizeof(*bd), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (EBUSY);

	/* Mark "free" and do most initialization. */
	bd->bd_unit = unit;
	bd->bd_bufsize = bpf_bufsize;
	bd->bd_sig = SIGIO;
	mtx_init(&bd->bd_mtx, IPL_NET);
	task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
	smr_init(&bd->bd_smr);
	sigio_init(&bd->bd_sigio);

	bd->bd_rtout = 0;	/* no timeout by default */

	bpf_get(bd);
	LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);

	return (0);
}

/*
 * Close the descriptor by detaching it from its interface and removing
 * it from the list of descriptors; the buffers are freed once the last
 * reference is dropped.
 */
int
bpfclose(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *d;

	d = bpfilter_lookup(minor(dev));
	mtx_enter(&d->bd_mtx);
	bpf_detachd(d);
	bpf_wakeup(d);
	LIST_REMOVE(d, bd_list);
	mtx_leave(&d->bd_mtx);
	bpf_put(d);

	return (0);
}

/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define ROTATE_BUFFERS(d) \
	KASSERT(d->bd_in_uiomove == 0); \
	MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
	(d)->bd_hbuf = (d)->bd_sbuf; \
	(d)->bd_hlen = (d)->bd_slen; \
	(d)->bd_sbuf = (d)->bd_fbuf; \
	(d)->bd_slen = 0; \
	(d)->bd_fbuf = NULL;

/*
 *  bpfread - read next chunk of packets from buffers
 */
int
bpfread(dev_t dev, struct uio *uio, int ioflag)
{
	uint64_t end, now;
	struct bpf_d *d;
	caddr_t hbuf;
	int error, hlen;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	bpf_get(d);
	mtx_enter(&d->bd_mtx);

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize) {
		error = EINVAL;
		goto out;
	}

	/*
	 * If there's a timeout, mark when the read should end.
	 */
	if (d->bd_rtout != 0) {
		now = nsecuptime();
		end = now + d->bd_rtout;
		if (end < now)
			end = UINT64_MAX;
	}

	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == NULL) {
		if (d->bd_bif == NULL) {
			/* interface is gone */
			if (d->bd_slen == 0) {
				error = EIO;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
		if (d->bd_immediate && d->bd_slen != 0) {
			/*
			 * A packet(s) either arrived since the previous
			 * read or arrived while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		if (ISSET(ioflag, IO_NDELAY)) {
			/* User requested non-blocking I/O */
			error = EWOULDBLOCK;
		} else if (d->bd_rtout == 0) {
			/* No read timeout set. */
			d->bd_nreaders++;
			error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
			    "bpf", INFSLP);
			d->bd_nreaders--;
		} else if ((now = nsecuptime()) < end) {
			/* Read timeout has not expired yet. */
			d->bd_nreaders++;
			error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
			    "bpf", end - now);
			d->bd_nreaders--;
		} else {
			/* Read timeout has expired. */
			error = EWOULDBLOCK;
		}
		if (error == EINTR || error == ERESTART)
			goto out;
		if (error == EWOULDBLOCK) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf != NULL)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				error = 0;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	hbuf = d->bd_hbuf;
	hlen = d->bd_hlen;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
	d->bd_fbuf = NULL;
	d->bd_in_uiomove = 1;

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	mtx_leave(&d->bd_mtx);
	error = uiomove(hbuf, hlen, uio);
	mtx_enter(&d->bd_mtx);

	/* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
	KASSERT(d->bd_fbuf == NULL);
	KASSERT(d->bd_hbuf == NULL);
	d->bd_fbuf = hbuf;
	d->bd_in_uiomove = 0;
out:
	mtx_leave(&d->bd_mtx);
	bpf_put(d);

	return (error);
}

/*
 * If there are processes sleeping on this descriptor, wake them up.
 */
void
bpf_wakeup(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	if (d->bd_nreaders)
		wakeup(d);

	/*
	 * As long as pgsigio() and selwakeup() need to be protected
	 * by the KERNEL_LOCK() we have to delay the wakeup to
	 * another context to keep the hot path KERNEL_LOCK()-free.
	 */
	if ((d->bd_async && d->bd_sig) ||
	    (!klist_empty(&d->bd_sel.si_note) || d->bd_sel.si_seltid != 0)) {
		bpf_get(d);
		if (!task_add(systq, &d->bd_wake_task))
			bpf_put(d);
	}
}

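/*
 * Task callback for the deferred part of bpf_wakeup(): deliver the
 * signal and select/kqueue notifications under the KERNEL_LOCK() and
 * release the reference taken when the task was queued.
 */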
void
bpf_wakeup_cb(void *xd)
{
	struct bpf_d *d = xd;

	if (d->bd_async && d->bd_sig)
		pgsigio(&d->bd_sigio, d->bd_sig, 0);

	selwakeup(&d->bd_sel);
	bpf_put(d);
}

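/*
 * bpfwrite - inject a packet.  The user-supplied data is turned into
 * an mbuf chain by bpf_movein(), checked against the interface MTU,
 * and handed to the interface's if_output() routine.
 */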
int
bpfwrite(dev_t dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d;
	struct ifnet *ifp;
	struct mbuf *m;
	int error;
	struct sockaddr_storage dst;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	bpf_get(d);
	ifp = d->bd_bif->bif_ifp;

	if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
		error = ENETDOWN;
		goto out;
	}

	if (uio->uio_resid == 0) {
		error = 0;
		goto out;
	}

	error = bpf_movein(uio, d, &m, sstosa(&dst));
	if (error)
		goto out;

	if (m->m_pkthdr.len > ifp->if_mtu) {
		m_freem(m);
		error = EMSGSIZE;
		goto out;
	}

	m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
	m->m_pkthdr.pf.prio = ifp->if_llprio;

	if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
		dst.ss_family = pseudo_AF_HDRCMPLT;

	NET_LOCK();
	error = ifp->if_output(ifp, m, sstosa(&dst), NULL);
	NET_UNLOCK();

out:
	bpf_put(d);
	return (error);
}

/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.
 */
void
bpf_resetd(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	KASSERT(d->bd_in_uiomove == 0);

	if (d->bd_hbuf != NULL) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = NULL;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
}

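/*
 * A minimal userland sketch of driving this ioctl interface (an
 * illustration only, not kernel code; error handling omitted and the
 * interface name is assumed):
 *
 *	int fd = open("/dev/bpf0", O_RDONLY);
 *	struct ifreq ifr;
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);	(attach to an interface)
 *	u_int on = 1;
 *	ioctl(fd, BIOCIMMEDIATE, &on);	(wake up reads per packet)
 *	u_int blen;
 *	ioctl(fd, BIOCGBLEN, &blen);	(read() must use this size)
 *	char *buf = malloc(blen);
 *	ssize_t n = read(fd, buf, blen);
 *
 * Each captured packet in the buffer is preceded by a struct bpf_hdr;
 * advance by BPF_WORDALIGN(bh_hdrlen + bh_caplen) to reach the next.
 */
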
/*
 *  FIONREAD		Check for read packet available.
 *  BIOCGBLEN		Get buffer len [for read()].
 *  BIOCSBLEN		Set buffer length.
 *  BIOCSETF		Set read filter.
 *  BIOCSETWF		Set write filter.
 *  BIOCFLUSH		Flush read packet buffer.
 *  BIOCPROMISC		Put interface into promiscuous mode.
 *  BIOCGDLTLIST	Get supported link layer types.
 *  BIOCGDLT		Get link layer type.
 *  BIOCSDLT		Set link layer type.
 *  BIOCGETIF		Get interface name.
 *  BIOCSETIF		Set interface.
 *  BIOCSRTIMEOUT	Set read timeout.
 *  BIOCGRTIMEOUT	Get read timeout.
 *  BIOCGSTATS		Get packet stats.
 *  BIOCIMMEDIATE	Set immediate mode.
 *  BIOCVERSION		Get filter language version.
 *  BIOCGHDRCMPLT	Get "header already complete" flag
 *  BIOCSHDRCMPLT	Set "header already complete" flag
 *  BIOCLOCK		Lock the descriptor against most changes.
 *  BIOCGFILDROP	Get "filter-drop" mode.
 *  BIOCSFILDROP	Set "filter-drop" mode.
 *  BIOCGDIRFILT	Get direction filter.
 *  BIOCSDIRFILT	Set direction filter.
 *  BIOCGRSIG		Get receive signal.
 *  BIOCSRSIG		Set receive signal.
 */
int
bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
	struct bpf_d *d;
	int error = 0;

	d = bpfilter_lookup(minor(dev));
	if (d->bd_locked && suser(p) != 0) {
		/* list of allowed ioctls when locked and not root */
		switch (cmd) {
		case BIOCGBLEN:
		case BIOCFLUSH:
		case BIOCGDLT:
		case BIOCGDLTLIST:
		case BIOCGETIF:
		case BIOCGRTIMEOUT:
		case BIOCGSTATS:
		case BIOCVERSION:
		case BIOCGRSIG:
		case BIOCGHDRCMPLT:
		case FIONREAD:
		case BIOCLOCK:
		case BIOCSRTIMEOUT:
		case BIOCIMMEDIATE:
		case TIOCGPGRP:
		case BIOCGDIRFILT:
			break;
		default:
			return (EPERM);
		}
	}

	bpf_get(d);

	switch (cmd) {
	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
		{
			int n;

			mtx_enter(&d->bd_mtx);
			n = d->bd_slen;
			if (d->bd_hbuf != NULL)
				n += d->bd_hlen;
			mtx_leave(&d->bd_mtx);

			*(int *)addr = n;
			break;
		}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		*(u_int *)addr = d->bd_bufsize;
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		if (d->bd_bif != NULL)
			error = EINVAL;
		else {
			u_int size = *(u_int *)addr;

			if (size > bpf_maxbufsize)
				*(u_int *)addr = size = bpf_maxbufsize;
			else if (size < BPF_MINBUFSIZE)
				*(u_int *)addr = size = BPF_MINBUFSIZE;
			mtx_enter(&d->bd_mtx);
			d->bd_bufsize = size;
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
		error = bpf_setf(d, (struct bpf_program *)addr, 0);
		break;

	/*
	 * Set link layer write filter.
	 */
	case BIOCSETWF:
		error = bpf_setf(d, (struct bpf_program *)addr, 1);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		mtx_enter(&d->bd_mtx);
		bpf_resetd(d);
		mtx_leave(&d->bd_mtx);
		break;

	/*
	 * Put interface into promiscuous mode.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == NULL) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
		} else if (d->bd_bif->bif_ifp != NULL) {
			if (d->bd_promisc == 0) {
				MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
				NET_LOCK();
				error = ifpromisc(d->bd_bif->bif_ifp, 1);
				NET_UNLOCK();
				if (error == 0)
					d->bd_promisc = 1;
			}
		}
		break;

	/*
	 * Get a list of supported data link types.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
		break;

	/*
	 * Get data link type.
	 */
	case BIOCGDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			*(u_int *)addr = d->bd_bif->bif_dlt;
		break;

	/*
	 * Set data link type.
	 */
	case BIOCSDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else {
			mtx_enter(&d->bd_mtx);
			error = bpf_setdlt(d, *(u_int *)addr);
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			bpf_ifname(d->bd_bif, (struct ifreq *)addr);
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		error = bpf_setif(d, (struct ifreq *)addr);
		break;

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;
			uint64_t rtout;

			if (tv->tv_sec < 0 || !timerisvalid(tv)) {
				error = EINVAL;
				break;
			}
			rtout = TIMEVAL_TO_NSEC(tv);
			if (rtout > MAXTSLP) {
				error = EOVERFLOW;
				break;
			}
			mtx_enter(&d->bd_mtx);
			d->bd_rtout = rtout;
			mtx_leave(&d->bd_mtx);
			break;
		}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;

			memset(tv, 0, sizeof(*tv));
			mtx_enter(&d->bd_mtx);
			NSEC_TO_TIMEVAL(d->bd_rtout, tv);
			mtx_leave(&d->bd_mtx);
			break;
		}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
		{
			struct bpf_stat *bs = (struct bpf_stat *)addr;

			bs->bs_recv = d->bd_rcount;
			bs->bs_drop = d->bd_dcount;
			break;
		}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		d->bd_immediate = *(u_int *)addr;
		break;

	case BIOCVERSION:
		{
			struct bpf_version *bv = (struct bpf_version *)addr;

			bv->bv_major = BPF_MAJOR_VERSION;
			bv->bv_minor = BPF_MINOR_VERSION;
			break;
		}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		*(u_int *)addr = d->bd_hdrcmplt;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
		break;

	case BIOCLOCK:		/* set "locked" flag (no reset) */
		d->bd_locked = 1;
		break;

	case BIOCGFILDROP:	/* get "filter-drop" flag */
		*(u_int *)addr = d->bd_fildrop;
		break;

	case BIOCSFILDROP: {	/* set "filter-drop" flag */
		unsigned int fildrop = *(u_int *)addr;
		switch (fildrop) {
		case BPF_FILDROP_PASS:
		case BPF_FILDROP_CAPTURE:
		case BPF_FILDROP_DROP:
			d->bd_fildrop = fildrop;
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	}

	case BIOCGDIRFILT:	/* get direction filter */
		*(u_int *)addr = d->bd_dirfilt;
		break;

	case BIOCSDIRFILT:	/* set direction filter */
		d->bd_dirfilt = (*(u_int *)addr) &
		    (BPF_DIRECTION_IN|BPF_DIRECTION_OUT);
		break;

	case FIONBIO:		/* Non-blocking I/O */
		/* let vfs keep track of this */
		break;

	case FIOASYNC:		/* Send signal on receive packets */
		d->bd_async = *(int *)addr;
		break;

	case FIOSETOWN:		/* Process or group to send signals to */
	case TIOCSPGRP:
		error = sigio_setown(&d->bd_sigio, cmd, addr);
		break;

	case FIOGETOWN:
	case TIOCGPGRP:
		sigio_getown(&d->bd_sigio, cmd, addr);
		break;

	case BIOCSRSIG:		/* Set receive signal */
		{
			u_int sig;

			sig = *(u_int *)addr;

			if (sig >= NSIG)
				error = EINVAL;
			else
				d->bd_sig = sig;
			break;
		}
	case BIOCGRSIG:
		*(u_int *)addr = d->bd_sig;
		break;
	}

	bpf_put(d);
	return (error);
}

/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, int wf)
{
	struct bpf_program_smr *bps, *old_bps;
	struct bpf_insn *fcode;
	u_int flen, size;

	KERNEL_ASSERT_LOCKED();

	if (fp->bf_insns == 0) {
		if (fp->bf_len != 0)
			return (EINVAL);
		bps = NULL;
	} else {
		flen = fp->bf_len;
		if (flen > BPF_MAXINSNS)
			return (EINVAL);

		fcode = mallocarray(flen, sizeof(*fp->bf_insns), M_DEVBUF,
		    M_WAITOK | M_CANFAIL);
		if (fcode == NULL)
			return (ENOMEM);

		size = flen * sizeof(*fp->bf_insns);
		if (copyin(fp->bf_insns, fcode, size) != 0 ||
		    bpf_validate(fcode, (int)flen) == 0) {
			free(fcode, M_DEVBUF, size);
			return (EINVAL);
		}

		bps = malloc(sizeof(*bps), M_DEVBUF, M_WAITOK);
		smr_init(&bps->bps_smr);
		bps->bps_bf.bf_len = flen;
		bps->bps_bf.bf_insns = fcode;
	}

	if (wf == 0) {
		old_bps = SMR_PTR_GET_LOCKED(&d->bd_rfilter);
		SMR_PTR_SET_LOCKED(&d->bd_rfilter, bps);
	} else {
		old_bps = SMR_PTR_GET_LOCKED(&d->bd_wfilter);
		SMR_PTR_SET_LOCKED(&d->bd_wfilter, bps);
	}

	mtx_enter(&d->bd_mtx);
	bpf_resetd(d);
	mtx_leave(&d->bd_mtx);
	if (old_bps != NULL)
		smr_call(&old_bps->bps_smr, bpf_prog_smr, old_bps);

	return (0);
}

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
	struct bpf_if *bp, *candidate = NULL;
	int error = 0;

	/*
	 * Look through attached interfaces for the named one.
	 */
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(bp->bif_name, ifr->ifr_name) != 0)
			continue;

		if (candidate == NULL || candidate->bif_dlt > bp->bif_dlt)
			candidate = bp;
	}

	/* Not found. */
	if (candidate == NULL)
		return (ENXIO);

	/*
	 * Allocate the packet buffers if we need to.
	 * If we're already attached to requested interface,
	 * just flush the buffer.
	 */
	mtx_enter(&d->bd_mtx);
	if (d->bd_sbuf == NULL) {
		if ((error = bpf_allocbufs(d)))
			goto out;
	}
	if (candidate != d->bd_bif) {
		/*
		 * Detach if attached to something else.
		 */
		bpf_detachd(d);
		bpf_attachd(d, candidate);
	}
	bpf_resetd(d);
out:
	mtx_leave(&d->bd_mtx);
	return (error);
}

/*
 * Copy the interface name to the ifreq.
 */
void
bpf_ifname(struct bpf_if *bif, struct ifreq *ifr)
{
	bcopy(bif->bif_name, ifr->ifr_name, sizeof(ifr->ifr_name));
}

/*
 * Support for poll() system call
 */
int
bpfpoll(dev_t dev, int events, struct proc *p)
{
	struct bpf_d *d;
	int revents;

	KERNEL_ASSERT_LOCKED();

	/*
	 * An imitation of the FIONREAD ioctl code.
	 */
	d = bpfilter_lookup(minor(dev));

	/*
	 * XXX The USB stack manages to trigger a race condition
	 * which causes bpfilter_lookup to return NULL when a USB device
	 * gets detached while it is up and has an open bpf handler (e.g.
	 * dhclient).  We should still check whether we can fix the root
	 * cause of this issue.
	 */
	if (d == NULL)
		return (POLLERR);

	/* Always ready to write data */
	revents = events & (POLLOUT | POLLWRNORM);

	if (events & (POLLIN | POLLRDNORM)) {
		mtx_enter(&d->bd_mtx);
		if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0))
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(p, &d->bd_sel);
		mtx_leave(&d->bd_mtx);
	}
	return (revents);
}

const struct filterops bpfread_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_bpfrdetach,
	.f_event	= filt_bpfread,
};

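/*
 * Support for kqueue(2): hook the read filter up to the descriptor's
 * note list.
 */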
int
bpfkqfilter(dev_t dev, struct knote *kn)
{
	struct bpf_d *d;
	struct klist *klist;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));

	switch (kn->kn_filter) {
	case EVFILT_READ:
		klist = &d->bd_sel.si_note;
		kn->kn_fop = &bpfread_filtops;
		break;
	default:
		return (EINVAL);
	}

	bpf_get(d);
	kn->kn_hook = d;
	klist_insert_locked(klist, kn);

	return (0);
}

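/*
 * Remove the knote from the descriptor's note list and drop the
 * reference taken in bpfkqfilter().
 */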
void
filt_bpfrdetach(struct knote *kn)
{
	struct bpf_d *d = kn->kn_hook;

	KERNEL_ASSERT_LOCKED();

	klist_remove_locked(&d->bd_sel.si_note, kn);
	bpf_put(d);
}

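/*
 * Report the amount of captured data that is ready for reading.
 */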
int
filt_bpfread(struct knote *kn, long hint)
{
	struct bpf_d *d = kn->kn_hook;

	KERNEL_ASSERT_LOCKED();

	mtx_enter(&d->bd_mtx);
	kn->kn_data = d->bd_hlen;
	if (d->bd_immediate)
		kn->kn_data += d->bd_slen;
	mtx_leave(&d->bd_mtx);

	return (kn->kn_data > 0);
}

/*
 * Copy data from an mbuf chain into a buffer.  This code is derived
 * from m_copydata in sys/uipc_mbuf.c.
 */
void
bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
{
	const struct mbuf *m;
	u_int count;
	u_char *dst;

	m = src_arg;
	dst = dst_arg;
	while (len > 0) {
		if (m == NULL)
			panic("bpf_mcopy");
		count = min(m->m_len, len);
		bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
		m = m->m_next;
		dst += count;
		len -= count;
	}
}

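/*
 * Incoming linkage from device drivers: feed a packet, represented as
 * an mbuf chain, to every listener on the interface.  Returns nonzero
 * if a matching filter asks for the packet to be dropped.
 */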
int
bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
{
	return _bpf_mtap(arg, m, m, direction);
}

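/*
 * Common tap body.  ``mp'' is the mbuf that carries the packet header
 * metadata (timestamp, flowid, priority) while ``m'' is the chain the
 * filters actually run over; the two differ when a dummy header has
 * been prepended on the stack.
 */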
int
_bpf_mtap(caddr_t arg, const struct mbuf *mp, const struct mbuf *m,
    u_int direction)
{
	struct bpf_if *bp = (struct bpf_if *)arg;
	struct bpf_d *d;
	size_t pktlen, slen;
	const struct mbuf *m0;
	struct bpf_hdr tbh;
	int gothdr = 0;
	int drop = 0;

	if (m == NULL)
		return (0);

	if (bp == NULL)
		return (0);

	pktlen = 0;
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		pktlen += m0->m_len;

	smr_read_enter();
	SMR_SLIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		struct bpf_program_smr *bps;
		struct bpf_insn *fcode = NULL;

		atomic_inc_long(&d->bd_rcount);

		if (ISSET(d->bd_dirfilt, direction))
			continue;

		bps = SMR_PTR_GET(&d->bd_rfilter);
		if (bps != NULL)
			fcode = bps->bps_bf.bf_insns;
		slen = bpf_mfilter(fcode, m, pktlen);

		if (slen == 0)
			continue;
		if (d->bd_fildrop != BPF_FILDROP_PASS)
			drop = 1;
		if (d->bd_fildrop != BPF_FILDROP_DROP) {
			if (!gothdr) {
				struct timeval tv;
				memset(&tbh, 0, sizeof(tbh));

				if (ISSET(mp->m_flags, M_PKTHDR)) {
					tbh.bh_ifidx = mp->m_pkthdr.ph_ifidx;
					tbh.bh_flowid = mp->m_pkthdr.ph_flowid;
					tbh.bh_flags = mp->m_pkthdr.pf.prio;
					if (ISSET(mp->m_pkthdr.csum_flags,
					    M_FLOWID))
						SET(tbh.bh_flags, BPF_F_FLOWID);

					m_microtime(mp, &tv);
				} else
					microtime(&tv);

				tbh.bh_tstamp.tv_sec = tv.tv_sec;
				tbh.bh_tstamp.tv_usec = tv.tv_usec;
				SET(tbh.bh_flags, direction << BPF_F_DIR_SHIFT);

				gothdr = 1;
			}

			mtx_enter(&d->bd_mtx);
			bpf_catchpacket(d, (u_char *)m, pktlen, slen, &tbh);
			mtx_leave(&d->bd_mtx);
		}
	}
	smr_read_leave();

	return (drop);
}

/*
 * Incoming linkage from device drivers, where a data buffer should be
 * prepended by an arbitrary header. In this situation we already have a
 * way of representing a chain of memory buffers, ie, mbufs, so reuse
 * the existing functionality by attaching the buffers to mbufs.
 *
 * Con up a minimal mbuf chain to pacify bpf by allocating (only) a
 * struct m_hdr each for the header and data on the stack.
 */
int
bpf_tap_hdr(caddr_t arg, const void *hdr, unsigned int hdrlen,
    const void *buf, unsigned int buflen, u_int direction)
{
	struct m_hdr mh, md;
	struct mbuf *m0 = NULL;
	struct mbuf **mp = &m0;

	if (hdr != NULL) {
		mh.mh_flags = 0;
		mh.mh_next = NULL;
		mh.mh_len = hdrlen;
		mh.mh_data = (void *)hdr;

		*mp = (struct mbuf *)&mh;
		mp = &mh.mh_next;
	}

	if (buf != NULL) {
		md.mh_flags = 0;
		md.mh_next = NULL;
		md.mh_len = buflen;
		md.mh_data = (void *)buf;

		*mp = (struct mbuf *)&md;
	}

	return bpf_mtap(arg, m0, direction);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend some arbitrary header from a linear buffer.
 *
 * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_hdr(caddr_t arg, const void *data, u_int dlen, const struct mbuf *m,
    u_int direction)
{
	struct m_hdr mh;
	const struct mbuf *m0;

	if (dlen > 0) {
		mh.mh_flags = 0;
		mh.mh_next = (struct mbuf *)m;
		mh.mh_len = dlen;
		mh.mh_data = (void *)data;
		m0 = (struct mbuf *)&mh;
	} else
		m0 = m;

	return _bpf_mtap(arg, m, m0, direction);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend the address family.
 *
 * Con up a minimal dummy header to pacify bpf.  We allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_af(caddr_t arg, u_int32_t af, const struct mbuf *m, u_int direction)
{
	u_int32_t    afh;

	afh = htonl(af);

	return bpf_mtap_hdr(arg, &afh, sizeof(afh), m, direction);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend a VLAN encapsulation header.
 *
 * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_ether(caddr_t arg, const struct mbuf *m, u_int direction)
{
#if NVLAN > 0
	struct ether_vlan_header evh;
	struct m_hdr mh, md;

	if ((m->m_flags & M_VLANTAG) == 0)
#endif
	{
		return _bpf_mtap(arg, m, m, direction);
	}

#if NVLAN > 0
	KASSERT(m->m_len >= ETHER_HDR_LEN);

	memcpy(&evh, mtod(m, char *), ETHER_HDR_LEN);
	evh.evl_proto = evh.evl_encap_proto;
	evh.evl_encap_proto = htons(ETHERTYPE_VLAN);
	evh.evl_tag = htons(m->m_pkthdr.ether_vtag);

	mh.mh_flags = 0;
	mh.mh_data = (caddr_t)&evh;
	mh.mh_len = sizeof(evh);
	mh.mh_next = (struct mbuf *)&md;

	md.mh_flags = 0;
	md.mh_data = m->m_data + ETHER_HDR_LEN;
	md.mh_len = m->m_len - ETHER_HDR_LEN;
	md.mh_next = m->m_next;

	return _bpf_mtap(arg, m, (struct mbuf *)&mh, direction);
#endif
}

/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer.  Wake up listeners if needed.
 * ``pkt'' is really an mbuf chain here; bpf_mcopy() is used to do
 * the actual data transfer.
 */
void
bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
    const struct bpf_hdr *tbh)
{
	struct bpf_hdr *bh;
	int totlen, curlen;
	int hdrlen, do_wakeup = 0;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (d->bd_bif == NULL)
		return;

	hdrlen = d->bd_bif->bif_hdrlen;

	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == NULL) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	}

	/*
	 * Append the bpf header.
	 */
	bh = (struct bpf_hdr *)(d->bd_sbuf + curlen);
	*bh = *tbh;
	bh->bh_datalen = pktlen;
	bh->bh_hdrlen = hdrlen;
	bh->bh_caplen = totlen - hdrlen;

	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	bpf_mcopy(pkt, (u_char *)bh + hdrlen, bh->bh_caplen);
	d->bd_slen = curlen + totlen;

	if (d->bd_immediate) {
		/*
		 * Immediate mode is set.  A packet arrived so any
		 * reads should be woken up.
		 */
		do_wakeup = 1;
	}

	if (do_wakeup)
		bpf_wakeup(d);
}

/*
 * Allocate the packet buffers for a descriptor.
 */
int
bpf_allocbufs(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (d->bd_fbuf == NULL)
		return (ENOMEM);

	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (d->bd_sbuf == NULL) {
		free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
		d->bd_fbuf = NULL;
		return (ENOMEM);
	}

	d->bd_slen = 0;
	d->bd_hlen = 0;

	return (0);
}

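/*
 * SMR callback: free a filter program once no tap can be running it.
 */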
void
bpf_prog_smr(void *bps_arg)
{
	struct bpf_program_smr *bps = bps_arg;

	free(bps->bps_bf.bf_insns, M_DEVBUF,
	    bps->bps_bf.bf_len * sizeof(struct bpf_insn));
	free(bps, M_DEVBUF, sizeof(struct bpf_program_smr));
}

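/*
 * SMR callback: free a descriptor and everything hanging off it once
 * the last reference is gone and no tap can still see it.
 */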
void
bpf_d_smr(void *smr)
{
	struct bpf_d	*bd = smr;

	sigio_free(&bd->bd_sigio);
	free(bd->bd_sbuf, M_DEVBUF, bd->bd_bufsize);
	free(bd->bd_hbuf, M_DEVBUF, bd->bd_bufsize);
	free(bd->bd_fbuf, M_DEVBUF, bd->bd_bufsize);

	if (bd->bd_rfilter != NULL)
		bpf_prog_smr(bd->bd_rfilter);
	if (bd->bd_wfilter != NULL)
		bpf_prog_smr(bd->bd_wfilter);

	free(bd, M_DEVBUF, sizeof(*bd));
}

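/*
 * Take a reference on a descriptor.
 */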
void
bpf_get(struct bpf_d *bd)
{
	atomic_inc_int(&bd->bd_ref);
}

/*
 * Free buffers currently in use by a descriptor
 * when the reference count drops to zero.
 */
void
bpf_put(struct bpf_d *bd)
{
	if (atomic_dec_int_nv(&bd->bd_ref) > 0)
		return;

	smr_call(&bd->bd_smr, bpf_d_smr, bd);
}

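/*
 * Attach bpf to a named tap point, which need not be backed by an
 * ifnet.  Returns the opaque cookie the driver later passes back into
 * the tap functions.
 */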
void *
bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	if ((bp = malloc(sizeof(*bp), M_DEVBUF, M_NOWAIT)) == NULL)
		panic("bpfattach");
	SMR_SLIST_INIT(&bp->bif_dlist);
	bp->bif_driverp = (struct bpf_if **)bpfp;
	bp->bif_name = name;
	bp->bif_ifp = NULL;
	bp->bif_dlt = dlt;

	bp->bif_next = bpf_iflist;
	bpf_iflist = bp;

	*bp->bif_driverp = NULL;

	/*
	 * Compute the length of the bpf header.  This is not necessarily
	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
	 * that the network layer header begins on a longword boundary (for
	 * performance reasons and to alleviate alignment restrictions).
	 */
	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;

	return (bp);
}

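/*
 * Attach an interface to bpf.  ``driverp'' points to the interface's
 * bpf_if handle (e.g. &ifp->if_bpf).
 */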
void
bpfattach(caddr_t *driverp, struct ifnet *ifp, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	bp = bpfsattach(driverp, ifp->if_xname, dlt, hdrlen);
	bp->bif_ifp = ifp;
}

/* Detach an interface from its attached bpf device.  */
void
bpfdetach(struct ifnet *ifp)
{
	struct bpf_if *bp, *nbp;

	KERNEL_ASSERT_LOCKED();

	for (bp = bpf_iflist; bp; bp = nbp) {
		nbp = bp->bif_next;
		if (bp->bif_ifp == ifp)
			bpfsdetach(bp);
	}
	ifp->if_bpf = NULL;
}

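/*
 * Detach a single tap point: revoke the devices of all descriptors
 * still listening on it and unlink it from bpf_iflist.
 */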
void
bpfsdetach(void *p)
{
	struct bpf_if *bp = p, *tbp;
	struct bpf_d *bd;
	int maj;

	KERNEL_ASSERT_LOCKED();

	/* Locate the major number. */
	for (maj = 0; maj < nchrdev; maj++)
		if (cdevsw[maj].d_open == bpfopen)
			break;

	while ((bd = SMR_SLIST_FIRST_LOCKED(&bp->bif_dlist))) {
		vdevgone(maj, bd->bd_unit, bd->bd_unit, VCHR);
		klist_invalidate(&bd->bd_sel.si_note);
	}

	for (tbp = bpf_iflist; tbp; tbp = tbp->bif_next) {
		if (tbp->bif_next == bp) {
			tbp->bif_next = bp->bif_next;
			break;
		}
	}

	if (bpf_iflist == bp)
		bpf_iflist = bp->bif_next;

	free(bp, M_DEVBUF, sizeof(*bp));
}

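/*
 * Handle the net.bpf sysctl leaves; called with bpf_sysctl_lk held.
 */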
int
bpf_sysctl_locked(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	switch (name[0]) {
	case NET_BPF_BUFSIZE:
		return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &bpf_bufsize, BPF_MINBUFSIZE, bpf_maxbufsize);
	case NET_BPF_MAXBUFSIZE:
		return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &bpf_maxbufsize, BPF_MINBUFSIZE, INT_MAX);
	default:
		return (EOPNOTSUPP);
	}
}

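/*
 * sysctl(2) entry point: serialize access with bpf_sysctl_lk and pass
 * the request on to bpf_sysctl_locked().
 */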
int
bpf_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int flags = RW_INTR;
	int error;

	if (namelen != 1)
		return (ENOTDIR);

	flags |= (newp == NULL) ? RW_READ : RW_WRITE;

	error = rw_enter(&bpf_sysctl_lk, flags);
	if (error != 0)
		return (error);

	error = bpf_sysctl_locked(name, namelen, oldp, oldlenp, newp, newlen);

	rw_exit(&bpf_sysctl_lk);

	return (error);
}

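/*
 * Find the descriptor for a minor device number.
 */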
struct bpf_d *
bpfilter_lookup(int unit)
{
	struct bpf_d *bd;

	KERNEL_ASSERT_LOCKED();

	LIST_FOREACH(bd, &bpf_d_list, bd_list)
		if (bd->bd_unit == unit)
			return (bd);
	return (NULL);
}

/*
 * Get the list of data link types available for the interface.
 */
int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
{
	int n, error;
	struct bpf_if *bp;
	const char *name;

	name = d->bd_bif->bif_name;
	n = 0;
	error = 0;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(name, bp->bif_name) != 0)
			continue;
		if (bfl->bfl_list != NULL) {
			if (n >= bfl->bfl_len)
				return (ENOMEM);
			error = copyout(&bp->bif_dlt,
			    bfl->bfl_list + n, sizeof(u_int));
			if (error)
				break;
		}
		n++;
	}

	bfl->bfl_len = n;
	return (error);
}

/*
 * Set the data link type of a BPF instance.
 */
int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
	const char *name;
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (d->bd_bif->bif_dlt == dlt)
		return (0);
	name = d->bd_bif->bif_name;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(name, bp->bif_name) != 0)
			continue;
		if (bp->bif_dlt == dlt)
			break;
	}
	if (bp == NULL)
		return (EINVAL);
	bpf_detachd(d);
	bpf_attachd(d, bp);
	bpf_resetd(d);
	return (0);
}

u_int32_t	bpf_mbuf_ldw(const void *, u_int32_t, int *);
u_int32_t	bpf_mbuf_ldh(const void *, u_int32_t, int *);
u_int32_t	bpf_mbuf_ldb(const void *, u_int32_t, int *);

int		bpf_mbuf_copy(const struct mbuf *, u_int32_t,
		    void *, u_int32_t);

const struct bpf_ops bpf_mbuf_ops = {
	bpf_mbuf_ldw,
	bpf_mbuf_ldh,
	bpf_mbuf_ldb,
};

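/*
 * Copy ``len'' bytes starting at offset ``off'' out of an mbuf chain.
 * Returns 0 on success, -1 if the chain is too short.
 */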
int
bpf_mbuf_copy(const struct mbuf *m, u_int32_t off, void *buf, u_int32_t len)
{
	u_int8_t *cp = buf;
	u_int32_t count;

	while (off >= m->m_len) {
		off -= m->m_len;

		m = m->m_next;
		if (m == NULL)
			return (-1);
	}

	for (;;) {
		count = min(m->m_len - off, len);

		memcpy(cp, m->m_data + off, count);
		len -= count;

		if (len == 0)
			return (0);

		m = m->m_next;
		if (m == NULL)
			break;

		cp += count;
		off = 0;
	}

	return (-1);
}

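/*
 * Word, halfword, and byte loads for running the filter machine over
 * an mbuf chain.  On a short chain *err is set and 0 is returned.
 */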
u_int32_t
bpf_mbuf_ldw(const void *m0, u_int32_t k, int *err)
{
	u_int32_t v;

	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
		*err = 1;
		return (0);
	}

	*err = 0;
	return ntohl(v);
}

u_int32_t
bpf_mbuf_ldh(const void *m0, u_int32_t k, int *err)
{
	u_int16_t v;

	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
		*err = 1;
		return (0);
	}

	*err = 0;
	return ntohs(v);
}

u_int32_t
bpf_mbuf_ldb(const void *m0, u_int32_t k, int *err)
{
	const struct mbuf *m = m0;
	u_int8_t v;

	while (k >= m->m_len) {
		k -= m->m_len;

		m = m->m_next;
		if (m == NULL) {
			*err = 1;
			return (0);
		}
	}
	v = m->m_data[k];

	*err = 0;
	return v;
}

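/*
 * Run a filter program over a packet stored as an mbuf chain.
 */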
u_int
bpf_mfilter(const struct bpf_insn *pc, const struct mbuf *m, u_int wirelen)
{
	return _bpf_filter(pc, &bpf_mbuf_ops, m, wirelen);
}