xref: /freebsd/sys/net/bpf.c (revision 8bd1f0cf)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1990, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from the Stanford/CMU enet packet filter,
8  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
9  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
10  * Berkeley Laboratory.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include "opt_bpf.h"
43 #include "opt_ddb.h"
44 #include "opt_netgraph.h"
45 
46 #include <sys/types.h>
47 #include <sys/param.h>
48 #include <sys/lock.h>
49 #include <sys/rwlock.h>
50 #include <sys/systm.h>
51 #include <sys/conf.h>
52 #include <sys/fcntl.h>
53 #include <sys/jail.h>
54 #include <sys/malloc.h>
55 #include <sys/mbuf.h>
56 #include <sys/time.h>
57 #include <sys/priv.h>
58 #include <sys/proc.h>
59 #include <sys/signalvar.h>
60 #include <sys/filio.h>
61 #include <sys/sockio.h>
62 #include <sys/ttycom.h>
63 #include <sys/uio.h>
64 #include <sys/sysent.h>
65 
66 #include <sys/event.h>
67 #include <sys/file.h>
68 #include <sys/poll.h>
69 #include <sys/proc.h>
70 
71 #include <sys/socket.h>
72 
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif
76 
77 #include <net/if.h>
78 #include <net/if_var.h>
79 #include <net/if_dl.h>
80 #include <net/bpf.h>
81 #include <net/bpf_buffer.h>
82 #ifdef BPF_JITTER
83 #include <net/bpf_jitter.h>
84 #endif
85 #include <net/bpf_zerocopy.h>
86 #include <net/bpfdesc.h>
87 #include <net/route.h>
88 #include <net/vnet.h>
89 
90 #include <netinet/in.h>
91 #include <netinet/if_ether.h>
92 #include <sys/kernel.h>
93 #include <sys/sysctl.h>
94 
95 #include <net80211/ieee80211_freebsd.h>
96 
97 #include <security/mac/mac_framework.h>
98 
99 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
100 
101 static struct bpf_if_ext dead_bpf_if = {
102 	.bif_dlist = LIST_HEAD_INITIALIZER()
103 };
104 
105 struct bpf_if {
106 #define	bif_next	bif_ext.bif_next
107 #define	bif_dlist	bif_ext.bif_dlist
108 	struct bpf_if_ext bif_ext;	/* public members */
109 	u_int		bif_dlt;	/* link layer type */
110 	u_int		bif_hdrlen;	/* length of link header */
111 	struct ifnet	*bif_ifp;	/* corresponding interface */
112 	struct rwlock	bif_lock;	/* interface lock */
113 	LIST_HEAD(, bpf_d) bif_wlist;	/* writer-only list */
114 	int		bif_flags;	/* Interface flags */
115 	struct bpf_if	**bif_bpf;	/* Pointer to pointer to us */
116 };
117 
118 CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
119 
120 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
121 
/* Sleep priority passed to the sleep/wakeup calls in this file (interruptible). */
#define	PRINET	26

/*
 * Size of a BPF packet header of the given struct type, measured from the
 * start of the struct up to and including its bh_hdrlen member.
 */
#define	SIZEOF_BPF_HDR(type)						\
	(offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
126 
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>

/* Word alignment used when laying out data for 32-bit consumers. */
#define	BPF_ALIGNMENT32		sizeof(int32_t)
#define	BPF_WORDALIGN32(x)	roundup2(x, BPF_ALIGNMENT32)

#ifndef BURN_BRIDGES
/*
 * 32-bit version of the structure prepended to each packet.  It is used
 * instead of the standard header for 32-bit streams.  A stream is marked
 * as 32-bit the first time a 32-bit compat ioctl request is seen on it.
 */
struct bpf_hdr32 {
	/* Time stamp. */
	struct timeval32 bh_tstamp;

	/* Length of captured portion. */
	uint32_t	bh_caplen;

	/* Original length of packet. */
	uint32_t	bh_datalen;

	/* Length of bpf header (this struct plus alignment padding). */
	uint16_t	bh_hdrlen;
};
#endif /* !BURN_BRIDGES */

/*
 * 32-bit layouts of the ioctl argument structures.
 * NOTE(review): bf_insns and bfl_list are presumably 32-bit user-space
 * pointers; confirm against the ioctl handler.
 */
struct bpf_program32 {
	u_int bf_len;
	uint32_t bf_insns;
};

struct bpf_dltlist32 {
	u_int	bfl_len;
	u_int	bfl_list;
};

/* ioctl requests defined in terms of the 32-bit argument layouts above. */
#define	BIOCSETF32	_IOW('B', 103, struct bpf_program32)
#define	BIOCSRTIMEOUT32	_IOW('B', 109, struct timeval32)
#define	BIOCGRTIMEOUT32	_IOR('B', 110, struct timeval32)
#define	BIOCGDLTLIST32	_IOWR('B', 121, struct bpf_dltlist32)
#define	BIOCSETWF32	_IOW('B', 123, struct bpf_program32)
#define	BIOCSETFNR32	_IOW('B', 130, struct bpf_program32)
#endif /* COMPAT_FREEBSD32 */
165 
166 #define BPF_LOCK()	   sx_xlock(&bpf_sx)
167 #define BPF_UNLOCK()		sx_xunlock(&bpf_sx)
168 #define BPF_LOCK_ASSERT()	sx_assert(&bpf_sx, SA_XLOCKED)
169 /*
170  * bpf_iflist is a list of BPF interface structures, each corresponding to a
171  * specific DLT.  The same network interface might have several BPF interface
172  * structures registered by different layers in the stack (i.e., 802.11
173  * frames, ethernet frames, etc).
174  */
175 static LIST_HEAD(, bpf_if)	bpf_iflist, bpf_freelist;
176 static struct sx	bpf_sx;		/* bpf global lock */
177 static int		bpf_bpfd_cnt;
178 
179 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
180 static void	bpf_detachd(struct bpf_d *);
181 static void	bpf_detachd_locked(struct bpf_d *);
182 static void	bpf_freed(struct bpf_d *);
183 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
184 		    struct sockaddr *, int *, struct bpf_d *);
185 static int	bpf_setif(struct bpf_d *, struct ifreq *);
186 static void	bpf_timed_out(void *);
187 static __inline void
188 		bpf_wakeup(struct bpf_d *);
189 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
190 		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
191 		    struct bintime *);
192 static void	reset_d(struct bpf_d *);
193 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
194 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
195 static int	bpf_setdlt(struct bpf_d *, u_int);
196 static void	filt_bpfdetach(struct knote *);
197 static int	filt_bpfread(struct knote *, long);
198 static void	bpf_drvinit(void *);
199 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
200 
201 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
202 int bpf_maxinsns = BPF_MAXINSNS;
203 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
204     &bpf_maxinsns, 0, "Maximum bpf program instructions");
205 static int bpf_zerocopy_enable = 0;
206 SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
207     &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
208 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
209     bpf_stats_sysctl, "bpf statistics portal");
210 
211 static VNET_DEFINE(int, bpf_optimize_writers) = 0;
212 #define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
213 SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW,
214     &VNET_NAME(bpf_optimize_writers), 0,
215     "Do not send packets until BPF program is set");
216 
217 static	d_open_t	bpfopen;
218 static	d_read_t	bpfread;
219 static	d_write_t	bpfwrite;
220 static	d_ioctl_t	bpfioctl;
221 static	d_poll_t	bpfpoll;
222 static	d_kqfilter_t	bpfkqfilter;
223 
224 static struct cdevsw bpf_cdevsw = {
225 	.d_version =	D_VERSION,
226 	.d_open =	bpfopen,
227 	.d_read =	bpfread,
228 	.d_write =	bpfwrite,
229 	.d_ioctl =	bpfioctl,
230 	.d_poll =	bpfpoll,
231 	.d_name =	"bpf",
232 	.d_kqfilter =	bpfkqfilter,
233 };
234 
235 static struct filterops bpfread_filtops = {
236 	.f_isfd = 1,
237 	.f_detach = filt_bpfdetach,
238 	.f_event = filt_bpfread,
239 };
240 
241 eventhandler_tag	bpf_ifdetach_cookie = NULL;
242 
243 /*
244  * LOCKING MODEL USED BY BPF:
245  * Locks:
 * 1) global lock (BPF_LOCK). Exclusive sx(9) lock, used to protect interface
 * addition/removal, some global counters and every bpf_if reference.
248  * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters.
249  * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields
250  *   used by bpf_mtap code.
251  *
252  * Lock order:
253  *
254  * Global lock, interface lock, descriptor lock
255  *
256  * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2]
257  * working model. In many places (like bpf_detachd) we start with BPF descriptor
258  * (and we need to at least rlock it to get reliable interface pointer). This
259  * gives us potential LOR. As a result, we use global lock to protect from bpf_if
260  * change in every such place.
261  *
262  * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and
263  * 3) descriptor main wlock.
264  * Reading bd_bif can be protected by any of these locks, typically global lock.
265  *
266  * Changing read/write BPF filter is protected by the same three locks,
267  * the same applies for reading.
268  *
269  * Sleeping in global lock is not allowed due to bpfdetach() using it.
270  */
271 
272 /*
273  * Wrapper functions for various buffering methods.  If the set of buffer
274  * modes expands, we will probably want to introduce a switch data structure
 * similar to protosw, etc.
276  */
277 static void
278 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
279     u_int len)
280 {
281 
282 	BPFD_LOCK_ASSERT(d);
283 
284 	switch (d->bd_bufmode) {
285 	case BPF_BUFMODE_BUFFER:
286 		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
287 
288 	case BPF_BUFMODE_ZBUF:
289 		counter_u64_add(d->bd_zcopy, 1);
290 		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
291 
292 	default:
293 		panic("bpf_buf_append_bytes");
294 	}
295 }
296 
297 static void
298 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
299     u_int len)
300 {
301 
302 	BPFD_LOCK_ASSERT(d);
303 
304 	switch (d->bd_bufmode) {
305 	case BPF_BUFMODE_BUFFER:
306 		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
307 
308 	case BPF_BUFMODE_ZBUF:
309 		counter_u64_add(d->bd_zcopy, 1);
310 		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
311 
312 	default:
313 		panic("bpf_buf_append_mbuf");
314 	}
315 }
316 
317 /*
318  * This function gets called when the free buffer is re-assigned.
319  */
320 static void
321 bpf_buf_reclaimed(struct bpf_d *d)
322 {
323 
324 	BPFD_LOCK_ASSERT(d);
325 
326 	switch (d->bd_bufmode) {
327 	case BPF_BUFMODE_BUFFER:
328 		return;
329 
330 	case BPF_BUFMODE_ZBUF:
331 		bpf_zerocopy_buf_reclaimed(d);
332 		return;
333 
334 	default:
335 		panic("bpf_buf_reclaimed");
336 	}
337 }
338 
339 /*
340  * If the buffer mechanism has a way to decide that a held buffer can be made
341  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
342  * returned if the buffer can be discarded, (0) is returned if it cannot.
343  */
344 static int
345 bpf_canfreebuf(struct bpf_d *d)
346 {
347 
348 	BPFD_LOCK_ASSERT(d);
349 
350 	switch (d->bd_bufmode) {
351 	case BPF_BUFMODE_ZBUF:
352 		return (bpf_zerocopy_canfreebuf(d));
353 	}
354 	return (0);
355 }
356 
357 /*
358  * Allow the buffer model to indicate that the current store buffer is
359  * immutable, regardless of the appearance of space.  Return (1) if the
360  * buffer is writable, and (0) if not.
361  */
362 static int
363 bpf_canwritebuf(struct bpf_d *d)
364 {
365 	BPFD_LOCK_ASSERT(d);
366 
367 	switch (d->bd_bufmode) {
368 	case BPF_BUFMODE_ZBUF:
369 		return (bpf_zerocopy_canwritebuf(d));
370 	}
371 	return (1);
372 }
373 
374 /*
375  * Notify buffer model that an attempt to write to the store buffer has
376  * resulted in a dropped packet, in which case the buffer may be considered
377  * full.
378  */
379 static void
380 bpf_buffull(struct bpf_d *d)
381 {
382 
383 	BPFD_LOCK_ASSERT(d);
384 
385 	switch (d->bd_bufmode) {
386 	case BPF_BUFMODE_ZBUF:
387 		bpf_zerocopy_buffull(d);
388 		break;
389 	}
390 }
391 
392 /*
393  * Notify the buffer model that a buffer has moved into the hold position.
394  */
395 void
396 bpf_bufheld(struct bpf_d *d)
397 {
398 
399 	BPFD_LOCK_ASSERT(d);
400 
401 	switch (d->bd_bufmode) {
402 	case BPF_BUFMODE_ZBUF:
403 		bpf_zerocopy_bufheld(d);
404 		break;
405 	}
406 }
407 
408 static void
409 bpf_free(struct bpf_d *d)
410 {
411 
412 	switch (d->bd_bufmode) {
413 	case BPF_BUFMODE_BUFFER:
414 		return (bpf_buffer_free(d));
415 
416 	case BPF_BUFMODE_ZBUF:
417 		return (bpf_zerocopy_free(d));
418 
419 	default:
420 		panic("bpf_buf_free");
421 	}
422 }
423 
424 static int
425 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
426 {
427 
428 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
429 		return (EOPNOTSUPP);
430 	return (bpf_buffer_uiomove(d, buf, len, uio));
431 }
432 
433 static int
434 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
435 {
436 
437 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
438 		return (EOPNOTSUPP);
439 	return (bpf_buffer_ioctl_sblen(d, i));
440 }
441 
442 static int
443 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
444 {
445 
446 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
447 		return (EOPNOTSUPP);
448 	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
449 }
450 
451 static int
452 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
453 {
454 
455 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
456 		return (EOPNOTSUPP);
457 	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
458 }
459 
460 static int
461 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
462 {
463 
464 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
465 		return (EOPNOTSUPP);
466 	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
467 }
468 
469 /*
470  * General BPF functions.
471  */
472 static int
473 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
474     struct sockaddr *sockp, int *hdrlen, struct bpf_d *d)
475 {
476 	const struct ieee80211_bpf_params *p;
477 	struct ether_header *eh;
478 	struct mbuf *m;
479 	int error;
480 	int len;
481 	int hlen;
482 	int slen;
483 
484 	/*
485 	 * Build a sockaddr based on the data link layer type.
486 	 * We do this at this level because the ethernet header
487 	 * is copied directly into the data field of the sockaddr.
488 	 * In the case of SLIP, there is no header and the packet
489 	 * is forwarded as is.
490 	 * Also, we are careful to leave room at the front of the mbuf
491 	 * for the link level header.
492 	 */
493 	switch (linktype) {
494 
495 	case DLT_SLIP:
496 		sockp->sa_family = AF_INET;
497 		hlen = 0;
498 		break;
499 
500 	case DLT_EN10MB:
501 		sockp->sa_family = AF_UNSPEC;
502 		/* XXX Would MAXLINKHDR be better? */
503 		hlen = ETHER_HDR_LEN;
504 		break;
505 
506 	case DLT_FDDI:
507 		sockp->sa_family = AF_IMPLINK;
508 		hlen = 0;
509 		break;
510 
511 	case DLT_RAW:
512 		sockp->sa_family = AF_UNSPEC;
513 		hlen = 0;
514 		break;
515 
516 	case DLT_NULL:
517 		/*
518 		 * null interface types require a 4 byte pseudo header which
519 		 * corresponds to the address family of the packet.
520 		 */
521 		sockp->sa_family = AF_UNSPEC;
522 		hlen = 4;
523 		break;
524 
525 	case DLT_ATM_RFC1483:
526 		/*
527 		 * en atm driver requires 4-byte atm pseudo header.
528 		 * though it isn't standard, vpi:vci needs to be
529 		 * specified anyway.
530 		 */
531 		sockp->sa_family = AF_UNSPEC;
532 		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
533 		break;
534 
535 	case DLT_PPP:
536 		sockp->sa_family = AF_UNSPEC;
537 		hlen = 4;	/* This should match PPP_HDRLEN */
538 		break;
539 
540 	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
541 		sockp->sa_family = AF_IEEE80211;
542 		hlen = 0;
543 		break;
544 
545 	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
546 		sockp->sa_family = AF_IEEE80211;
547 		sockp->sa_len = 12;	/* XXX != 0 */
548 		hlen = sizeof(struct ieee80211_bpf_params);
549 		break;
550 
551 	default:
552 		return (EIO);
553 	}
554 
555 	len = uio->uio_resid;
556 	if (len < hlen || len - hlen > ifp->if_mtu)
557 		return (EMSGSIZE);
558 
559 	m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR);
560 	if (m == NULL)
561 		return (EIO);
562 	m->m_pkthdr.len = m->m_len = len;
563 	*mp = m;
564 
565 	error = uiomove(mtod(m, u_char *), len, uio);
566 	if (error)
567 		goto bad;
568 
569 	slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len);
570 	if (slen == 0) {
571 		error = EPERM;
572 		goto bad;
573 	}
574 
575 	/* Check for multicast destination */
576 	switch (linktype) {
577 	case DLT_EN10MB:
578 		eh = mtod(m, struct ether_header *);
579 		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
580 			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
581 			    ETHER_ADDR_LEN) == 0)
582 				m->m_flags |= M_BCAST;
583 			else
584 				m->m_flags |= M_MCAST;
585 		}
586 		if (d->bd_hdrcmplt == 0) {
587 			memcpy(eh->ether_shost, IF_LLADDR(ifp),
588 			    sizeof(eh->ether_shost));
589 		}
590 		break;
591 	}
592 
593 	/*
594 	 * Make room for link header, and copy it to sockaddr
595 	 */
596 	if (hlen != 0) {
597 		if (sockp->sa_family == AF_IEEE80211) {
598 			/*
599 			 * Collect true length from the parameter header
600 			 * NB: sockp is known to be zero'd so if we do a
601 			 *     short copy unspecified parameters will be
602 			 *     zero.
603 			 * NB: packet may not be aligned after stripping
604 			 *     bpf params
605 			 * XXX check ibp_vers
606 			 */
607 			p = mtod(m, const struct ieee80211_bpf_params *);
608 			hlen = p->ibp_len;
609 			if (hlen > sizeof(sockp->sa_data)) {
610 				error = EINVAL;
611 				goto bad;
612 			}
613 		}
614 		bcopy(mtod(m, const void *), sockp->sa_data, hlen);
615 	}
616 	*hdrlen = hlen;
617 
618 	return (0);
619 bad:
620 	m_freem(m);
621 	return (error);
622 }
623 
624 /*
625  * Attach file to the bpf interface, i.e. make d listen on bp.
626  */
627 static void
628 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
629 {
630 	int op_w;
631 
632 	BPF_LOCK_ASSERT();
633 
634 	/*
635 	 * Save sysctl value to protect from sysctl change
636 	 * between reads
637 	 */
638 	op_w = V_bpf_optimize_writers || d->bd_writer;
639 
640 	if (d->bd_bif != NULL)
641 		bpf_detachd_locked(d);
642 	/*
643 	 * Point d at bp, and add d to the interface's list.
644 	 * Since there are many applications using BPF for
645 	 * sending raw packets only (dhcpd, cdpd are good examples)
646 	 * we can delay adding d to the list of active listeners until
647 	 * some filter is configured.
648 	 */
649 
650 	BPFIF_WLOCK(bp);
651 	BPFD_LOCK(d);
652 
653 	d->bd_bif = bp;
654 
655 	if (op_w != 0) {
656 		/* Add to writers-only list */
657 		LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
658 		/*
659 		 * We decrement bd_writer on every filter set operation.
660 		 * First BIOCSETF is done by pcap_open_live() to set up
661 		 * snap length. After that appliation usually sets its own filter
662 		 */
663 		d->bd_writer = 2;
664 	} else
665 		LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
666 
667 	BPFD_UNLOCK(d);
668 	BPFIF_WUNLOCK(bp);
669 
670 	bpf_bpfd_cnt++;
671 
672 	CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
673 	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
674 
675 	if (op_w == 0)
676 		EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
677 }
678 
679 /*
680  * Check if we need to upgrade our descriptor @d from write-only mode.
681  */
682 static int
683 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen)
684 {
685 	int is_snap, need_upgrade;
686 
687 	/*
688 	 * Check if we've already upgraded or new filter is empty.
689 	 */
690 	if (d->bd_writer == 0 || fcode == NULL)
691 		return (0);
692 
693 	need_upgrade = 0;
694 
695 	/*
696 	 * Check if cmd looks like snaplen setting from
697 	 * pcap_bpf.c:pcap_open_live().
698 	 * Note we're not checking .k value here:
699 	 * while pcap_open_live() definitely sets to non-zero value,
700 	 * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
701 	 * do not consider upgrading immediately
702 	 */
703 	if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K))
704 		is_snap = 1;
705 	else
706 		is_snap = 0;
707 
708 	if (is_snap == 0) {
709 		/*
710 		 * We're setting first filter and it doesn't look like
711 		 * setting snaplen.  We're probably using bpf directly.
712 		 * Upgrade immediately.
713 		 */
714 		need_upgrade = 1;
715 	} else {
716 		/*
717 		 * Do not require upgrade by first BIOCSETF
718 		 * (used to set snaplen) by pcap_open_live().
719 		 */
720 
721 		if (--d->bd_writer == 0) {
722 			/*
723 			 * First snaplen filter has already
724 			 * been set. This is probably catch-all
725 			 * filter
726 			 */
727 			need_upgrade = 1;
728 		}
729 	}
730 
731 	CTR5(KTR_NET,
732 	    "%s: filter function set by pid %d, "
733 	    "bd_writer counter %d, snap %d upgrade %d",
734 	    __func__, d->bd_pid, d->bd_writer,
735 	    is_snap, need_upgrade);
736 
737 	return (need_upgrade);
738 }
739 
740 /*
741  * Add d to the list of active bp filters.
742  * Requires bpf_attachd() to be called before.
743  */
744 static void
745 bpf_upgraded(struct bpf_d *d)
746 {
747 	struct bpf_if *bp;
748 
749 	BPF_LOCK_ASSERT();
750 
751 	bp = d->bd_bif;
752 
753 	/*
754 	 * Filter can be set several times without specifying interface.
755 	 * Mark d as reader and exit.
756 	 */
757 	if (bp == NULL) {
758 		BPFD_LOCK(d);
759 		d->bd_writer = 0;
760 		BPFD_UNLOCK(d);
761 		return;
762 	}
763 
764 	BPFIF_WLOCK(bp);
765 	BPFD_LOCK(d);
766 
767 	/* Remove from writers-only list */
768 	LIST_REMOVE(d, bd_next);
769 	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
770 	/* Mark d as reader */
771 	d->bd_writer = 0;
772 
773 	BPFD_UNLOCK(d);
774 	BPFIF_WUNLOCK(bp);
775 
776 	CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
777 
778 	EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
779 }
780 
/*
 * Detach a file from its interface.  Takes the global BPF lock around
 * bpf_detachd_locked(), so it is for callers that do not hold that lock.
 */
static void
bpf_detachd(struct bpf_d *d)
{

	BPF_LOCK();
	bpf_detachd_locked(d);
	BPF_UNLOCK();
}
791 
792 static void
793 bpf_detachd_locked(struct bpf_d *d)
794 {
795 	int error;
796 	struct bpf_if *bp;
797 	struct ifnet *ifp;
798 
799 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
800 
801 	BPF_LOCK_ASSERT();
802 
803 	/* Check if descriptor is attached */
804 	if ((bp = d->bd_bif) == NULL)
805 		return;
806 
807 	BPFIF_WLOCK(bp);
808 	BPFD_LOCK(d);
809 
810 	/* Save bd_writer value */
811 	error = d->bd_writer;
812 
813 	/*
814 	 * Remove d from the interface's descriptor list.
815 	 */
816 	LIST_REMOVE(d, bd_next);
817 
818 	ifp = bp->bif_ifp;
819 	d->bd_bif = NULL;
820 	BPFD_UNLOCK(d);
821 	BPFIF_WUNLOCK(bp);
822 
823 	bpf_bpfd_cnt--;
824 
825 	/* Call event handler iff d is attached */
826 	if (error == 0)
827 		EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
828 
829 	/*
830 	 * Check if this descriptor had requested promiscuous mode.
831 	 * If so, turn it off.
832 	 */
833 	if (d->bd_promisc) {
834 		d->bd_promisc = 0;
835 		CURVNET_SET(ifp->if_vnet);
836 		error = ifpromisc(ifp, 0);
837 		CURVNET_RESTORE();
838 		if (error != 0 && error != ENXIO) {
839 			/*
840 			 * ENXIO can happen if a pccard is unplugged
841 			 * Something is really wrong if we were able to put
842 			 * the driver into promiscuous mode, but can't
843 			 * take it out.
844 			 */
845 			if_printf(bp->bif_ifp,
846 				"bpf_detach: ifpromisc failed (%d)\n", error);
847 		}
848 	}
849 }
850 
851 /*
852  * Close the descriptor by detaching it from its interface,
853  * deallocating its buffers, and marking it free.
854  */
855 static void
856 bpf_dtor(void *data)
857 {
858 	struct bpf_d *d = data;
859 
860 	BPFD_LOCK(d);
861 	if (d->bd_state == BPF_WAITING)
862 		callout_stop(&d->bd_callout);
863 	d->bd_state = BPF_IDLE;
864 	BPFD_UNLOCK(d);
865 	funsetown(&d->bd_sigio);
866 	bpf_detachd(d);
867 #ifdef MAC
868 	mac_bpfdesc_destroy(d);
869 #endif /* MAC */
870 	seldrain(&d->bd_sel);
871 	knlist_destroy(&d->bd_sel.si_note);
872 	callout_drain(&d->bd_callout);
873 	bpf_freed(d);
874 	free(d, M_BPF);
875 }
876 
877 /*
878  * Open ethernet device.  Returns ENXIO for illegal minor device number,
879  * EBUSY if file is open by another process.
880  */
881 /* ARGSUSED */
882 static	int
883 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
884 {
885 	struct bpf_d *d;
886 	int error;
887 
888 	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
889 	error = devfs_set_cdevpriv(d, bpf_dtor);
890 	if (error != 0) {
891 		free(d, M_BPF);
892 		return (error);
893 	}
894 
895 	/* Setup counters */
896 	d->bd_rcount = counter_u64_alloc(M_WAITOK);
897 	d->bd_dcount = counter_u64_alloc(M_WAITOK);
898 	d->bd_fcount = counter_u64_alloc(M_WAITOK);
899 	d->bd_wcount = counter_u64_alloc(M_WAITOK);
900 	d->bd_wfcount = counter_u64_alloc(M_WAITOK);
901 	d->bd_wdcount = counter_u64_alloc(M_WAITOK);
902 	d->bd_zcopy = counter_u64_alloc(M_WAITOK);
903 
904 	/*
905 	 * For historical reasons, perform a one-time initialization call to
906 	 * the buffer routines, even though we're not yet committed to a
907 	 * particular buffer method.
908 	 */
909 	bpf_buffer_init(d);
910 	if ((flags & FREAD) == 0)
911 		d->bd_writer = 2;
912 	d->bd_hbuf_in_use = 0;
913 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
914 	d->bd_sig = SIGIO;
915 	d->bd_direction = BPF_D_INOUT;
916 	BPF_PID_REFRESH(d, td);
917 #ifdef MAC
918 	mac_bpfdesc_init(d);
919 	mac_bpfdesc_create(td->td_ucred, d);
920 #endif
921 	mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
922 	callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
923 	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
924 
925 	return (0);
926 }
927 
928 /*
929  *  bpfread - read next chunk of packets from buffers
930  */
931 static	int
932 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
933 {
934 	struct bpf_d *d;
935 	int error;
936 	int non_block;
937 	int timed_out;
938 
939 	error = devfs_get_cdevpriv((void **)&d);
940 	if (error != 0)
941 		return (error);
942 
943 	/*
944 	 * Restrict application to use a buffer the same size as
945 	 * as kernel buffers.
946 	 */
947 	if (uio->uio_resid != d->bd_bufsize)
948 		return (EINVAL);
949 
950 	non_block = ((ioflag & O_NONBLOCK) != 0);
951 
952 	BPFD_LOCK(d);
953 	BPF_PID_REFRESH_CUR(d);
954 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
955 		BPFD_UNLOCK(d);
956 		return (EOPNOTSUPP);
957 	}
958 	if (d->bd_state == BPF_WAITING)
959 		callout_stop(&d->bd_callout);
960 	timed_out = (d->bd_state == BPF_TIMED_OUT);
961 	d->bd_state = BPF_IDLE;
962 	while (d->bd_hbuf_in_use) {
963 		error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
964 		    PRINET|PCATCH, "bd_hbuf", 0);
965 		if (error != 0) {
966 			BPFD_UNLOCK(d);
967 			return (error);
968 		}
969 	}
970 	/*
971 	 * If the hold buffer is empty, then do a timed sleep, which
972 	 * ends when the timeout expires or when enough packets
973 	 * have arrived to fill the store buffer.
974 	 */
975 	while (d->bd_hbuf == NULL) {
976 		if (d->bd_slen != 0) {
977 			/*
978 			 * A packet(s) either arrived since the previous
979 			 * read or arrived while we were asleep.
980 			 */
981 			if (d->bd_immediate || non_block || timed_out) {
982 				/*
983 				 * Rotate the buffers and return what's here
984 				 * if we are in immediate mode, non-blocking
985 				 * flag is set, or this descriptor timed out.
986 				 */
987 				ROTATE_BUFFERS(d);
988 				break;
989 			}
990 		}
991 
992 		/*
993 		 * No data is available, check to see if the bpf device
994 		 * is still pointed at a real interface.  If not, return
995 		 * ENXIO so that the userland process knows to rebind
996 		 * it before using it again.
997 		 */
998 		if (d->bd_bif == NULL) {
999 			BPFD_UNLOCK(d);
1000 			return (ENXIO);
1001 		}
1002 
1003 		if (non_block) {
1004 			BPFD_UNLOCK(d);
1005 			return (EWOULDBLOCK);
1006 		}
1007 		error = msleep(d, &d->bd_lock, PRINET|PCATCH,
1008 		     "bpf", d->bd_rtout);
1009 		if (error == EINTR || error == ERESTART) {
1010 			BPFD_UNLOCK(d);
1011 			return (error);
1012 		}
1013 		if (error == EWOULDBLOCK) {
1014 			/*
1015 			 * On a timeout, return what's in the buffer,
1016 			 * which may be nothing.  If there is something
1017 			 * in the store buffer, we can rotate the buffers.
1018 			 */
1019 			if (d->bd_hbuf)
1020 				/*
1021 				 * We filled up the buffer in between
1022 				 * getting the timeout and arriving
1023 				 * here, so we don't need to rotate.
1024 				 */
1025 				break;
1026 
1027 			if (d->bd_slen == 0) {
1028 				BPFD_UNLOCK(d);
1029 				return (0);
1030 			}
1031 			ROTATE_BUFFERS(d);
1032 			break;
1033 		}
1034 	}
1035 	/*
1036 	 * At this point, we know we have something in the hold slot.
1037 	 */
1038 	d->bd_hbuf_in_use = 1;
1039 	BPFD_UNLOCK(d);
1040 
1041 	/*
1042 	 * Move data from hold buffer into user space.
1043 	 * We know the entire buffer is transferred since
1044 	 * we checked above that the read buffer is bpf_bufsize bytes.
1045   	 *
1046 	 * We do not have to worry about simultaneous reads because
1047 	 * we waited for sole access to the hold buffer above.
1048 	 */
1049 	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
1050 
1051 	BPFD_LOCK(d);
1052 	KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
1053 	d->bd_fbuf = d->bd_hbuf;
1054 	d->bd_hbuf = NULL;
1055 	d->bd_hlen = 0;
1056 	bpf_buf_reclaimed(d);
1057 	d->bd_hbuf_in_use = 0;
1058 	wakeup(&d->bd_hbuf_in_use);
1059 	BPFD_UNLOCK(d);
1060 
1061 	return (error);
1062 }
1063 
1064 /*
1065  * If there are processes sleeping on this descriptor, wake them up.
1066  */
1067 static __inline void
1068 bpf_wakeup(struct bpf_d *d)
1069 {
1070 
1071 	BPFD_LOCK_ASSERT(d);
1072 	if (d->bd_state == BPF_WAITING) {
1073 		callout_stop(&d->bd_callout);
1074 		d->bd_state = BPF_IDLE;
1075 	}
1076 	wakeup(d);
1077 	if (d->bd_async && d->bd_sig && d->bd_sigio)
1078 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
1079 
1080 	selwakeuppri(&d->bd_sel, PRINET);
1081 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
1082 }
1083 
1084 static void
1085 bpf_timed_out(void *arg)
1086 {
1087 	struct bpf_d *d = (struct bpf_d *)arg;
1088 
1089 	BPFD_LOCK_ASSERT(d);
1090 
1091 	if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout))
1092 		return;
1093 	if (d->bd_state == BPF_WAITING) {
1094 		d->bd_state = BPF_TIMED_OUT;
1095 		if (d->bd_slen != 0)
1096 			bpf_wakeup(d);
1097 	}
1098 }
1099 
1100 static int
1101 bpf_ready(struct bpf_d *d)
1102 {
1103 
1104 	BPFD_LOCK_ASSERT(d);
1105 
1106 	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
1107 		return (1);
1108 	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1109 	    d->bd_slen != 0)
1110 		return (1);
1111 	return (0);
1112 }
1113 
/*
 * write(2) handler: transmit one packet supplied by userspace on the
 * interface the descriptor is attached to.
 *
 * Returns 0 on success and for a zero-length write, ENXIO when no
 * interface is attached, ENETDOWN when the interface is not up, or the
 * error reported by devfs_get_cdevpriv(), bpf_movein() or the interface's
 * if_output routine.
 *
 * Statistics: bd_wcount is bumped for every call that finds its
 * descriptor, bd_wfcount once bpf_movein() has succeeded, and bd_wdcount
 * on every early return as well as when if_output fails.
 */
static int
bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d;
	struct ifnet *ifp;
	struct mbuf *m, *mc;
	struct sockaddr dst;
	struct route ro;
	int error, hlen;

	error = devfs_get_cdevpriv((void **)&d);
	if (error != 0)
		return (error);

	BPF_PID_REFRESH_CUR(d);
	counter_u64_add(d->bd_wcount, 1);
	/* XXX: locking required */
	if (d->bd_bif == NULL) {
		counter_u64_add(d->bd_wdcount, 1);
		return (ENXIO);
	}

	ifp = d->bd_bif->bif_ifp;

	if ((ifp->if_flags & IFF_UP) == 0) {
		counter_u64_add(d->bd_wdcount, 1);
		return (ENETDOWN);
	}

	/* An empty write is counted as a drop but is not an error. */
	if (uio->uio_resid == 0) {
		counter_u64_add(d->bd_wdcount, 1);
		return (0);
	}

	bzero(&dst, sizeof(dst));
	m = NULL;
	hlen = 0;
	/*
	 * NOTE(review): bpf_movein() is defined outside this section; from
	 * its use here it is expected to return the packet in m and to fill
	 * in dst and hlen -- confirm against its definition.
	 */
	/* XXX: bpf_movein() can sleep */
	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
	    &m, &dst, &hlen, d);
	if (error) {
		counter_u64_add(d->bd_wdcount, 1);
		return (error);
	}
	counter_u64_add(d->bd_wfcount, 1);
	if (d->bd_hdrcmplt)
		dst.sa_family = pseudo_AF_HDRCMPLT;

	/*
	 * In feedback mode a copy of the packet is handed to if_input after
	 * a successful transmit.  A failed m_dup() leaves mc NULL, which
	 * simply skips the feedback for this packet.
	 */
	if (d->bd_feedback) {
		mc = m_dup(m, M_NOWAIT);
		if (mc != NULL)
			mc->m_pkthdr.rcvif = ifp;
		/* Set M_PROMISC for outgoing packets to be discarded. */
		if (d->bd_direction == BPF_D_INOUT)
			m->m_flags |= M_PROMISC;
	} else
		mc = NULL;

	/* Advance the mbuf past the first hlen bytes. */
	m->m_pkthdr.len -= hlen;
	m->m_len -= hlen;
	m->m_data += hlen;	/* XXX */

	CURVNET_SET(ifp->if_vnet);
#ifdef MAC
	BPFD_LOCK(d);
	mac_bpfdesc_create_mbuf(d, m);
	if (mc != NULL)
		mac_bpfdesc_create_mbuf(d, mc);
	BPFD_UNLOCK(d);
#endif

	/*
	 * When a header length was reported, pass the hlen bytes found in
	 * dst.sa_data to the output routine as a pre-built header.
	 */
	bzero(&ro, sizeof(ro));
	if (hlen != 0) {
		ro.ro_prepend = (u_char *)&dst.sa_data;
		ro.ro_plen = hlen;
		ro.ro_flags = RT_HAS_HEADER;
	}

	error = (*ifp->if_output)(ifp, m, &dst, &ro);
	if (error)
		counter_u64_add(d->bd_wdcount, 1);

	/* Feed the copy back only if the original went out; else drop it. */
	if (mc != NULL) {
		if (error == 0)
			(*ifp->if_input)(ifp, mc);
		else
			m_freem(mc);
	}
	CURVNET_RESTORE();

	return (error);
}
1206 
1207 /*
1208  * Reset a descriptor by flushing its packet buffer and clearing the receive
1209  * and drop counts.  This is doable for kernel-only buffers, but with
1210  * zero-copy buffers, we can't write to (or rotate) buffers that are
1211  * currently owned by userspace.  It would be nice if we could encapsulate
1212  * this logic in the buffer code rather than here.
1213  */
1214 static void
1215 reset_d(struct bpf_d *d)
1216 {
1217 
1218 	BPFD_LOCK_ASSERT(d);
1219 
1220 	while (d->bd_hbuf_in_use)
1221 		mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET,
1222 		    "bd_hbuf", 0);
1223 	if ((d->bd_hbuf != NULL) &&
1224 	    (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
1225 		/* Free the hold buffer. */
1226 		d->bd_fbuf = d->bd_hbuf;
1227 		d->bd_hbuf = NULL;
1228 		d->bd_hlen = 0;
1229 		bpf_buf_reclaimed(d);
1230 	}
1231 	if (bpf_canwritebuf(d))
1232 		d->bd_slen = 0;
1233 	counter_u64_zero(d->bd_rcount);
1234 	counter_u64_zero(d->bd_dcount);
1235 	counter_u64_zero(d->bd_fcount);
1236 	counter_u64_zero(d->bd_wcount);
1237 	counter_u64_zero(d->bd_wfcount);
1238 	counter_u64_zero(d->bd_wdcount);
1239 	counter_u64_zero(d->bd_zcopy);
1240 }
1241 
1242 /*
1243  *  FIONREAD		Check for read packet available.
1244  *  BIOCGBLEN		Get buffer len [for read()].
1245  *  BIOCSETF		Set read filter.
1246  *  BIOCSETFNR		Set read filter without resetting descriptor.
1247  *  BIOCSETWF		Set write filter.
1248  *  BIOCFLUSH		Flush read packet buffer.
1249  *  BIOCPROMISC		Put interface into promiscuous mode.
1250  *  BIOCGDLT		Get link layer type.
1251  *  BIOCGETIF		Get interface name.
1252  *  BIOCSETIF		Set interface.
1253  *  BIOCSRTIMEOUT	Set read timeout.
1254  *  BIOCGRTIMEOUT	Get read timeout.
1255  *  BIOCGSTATS		Get packet stats.
1256  *  BIOCIMMEDIATE	Set immediate mode.
1257  *  BIOCVERSION		Get filter language version.
1258  *  BIOCGHDRCMPLT	Get "header already complete" flag
1259  *  BIOCSHDRCMPLT	Set "header already complete" flag
1260  *  BIOCGDIRECTION	Get packet direction flag
1261  *  BIOCSDIRECTION	Set packet direction flag
1262  *  BIOCGTSTAMP		Get time stamp format and resolution.
1263  *  BIOCSTSTAMP		Set time stamp format and resolution.
1264  *  BIOCLOCK		Set "locked" flag
1265  *  BIOCFEEDBACK	Set packet feedback mode.
1266  *  BIOCSETZBUF		Set current zero-copy buffer locations.
1267  *  BIOCGETZMAX		Get maximum zero-copy buffer size.
1268  *  BIOCROTZBUF		Force rotation of zero-copy buffer
1269  *  BIOCSETBUFMODE	Set buffer mode.
1270  *  BIOCGETBUFMODE	Get current buffer mode.
1271  */
1272 /* ARGSUSED */
1273 static	int
1274 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
1275     struct thread *td)
1276 {
1277 	struct bpf_d *d;
1278 	int error;
1279 
1280 	error = devfs_get_cdevpriv((void **)&d);
1281 	if (error != 0)
1282 		return (error);
1283 
1284 	/*
1285 	 * Refresh PID associated with this descriptor.
1286 	 */
1287 	BPFD_LOCK(d);
1288 	BPF_PID_REFRESH(d, td);
1289 	if (d->bd_state == BPF_WAITING)
1290 		callout_stop(&d->bd_callout);
1291 	d->bd_state = BPF_IDLE;
1292 	BPFD_UNLOCK(d);
1293 
1294 	if (d->bd_locked == 1) {
1295 		switch (cmd) {
1296 		case BIOCGBLEN:
1297 		case BIOCFLUSH:
1298 		case BIOCGDLT:
1299 		case BIOCGDLTLIST:
1300 #ifdef COMPAT_FREEBSD32
1301 		case BIOCGDLTLIST32:
1302 #endif
1303 		case BIOCGETIF:
1304 		case BIOCGRTIMEOUT:
1305 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1306 		case BIOCGRTIMEOUT32:
1307 #endif
1308 		case BIOCGSTATS:
1309 		case BIOCVERSION:
1310 		case BIOCGRSIG:
1311 		case BIOCGHDRCMPLT:
1312 		case BIOCSTSTAMP:
1313 		case BIOCFEEDBACK:
1314 		case FIONREAD:
1315 		case BIOCLOCK:
1316 		case BIOCSRTIMEOUT:
1317 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1318 		case BIOCSRTIMEOUT32:
1319 #endif
1320 		case BIOCIMMEDIATE:
1321 		case TIOCGPGRP:
1322 		case BIOCROTZBUF:
1323 			break;
1324 		default:
1325 			return (EPERM);
1326 		}
1327 	}
1328 #ifdef COMPAT_FREEBSD32
1329 	/*
1330 	 * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
1331 	 * that it will get 32-bit packet headers.
1332 	 */
1333 	switch (cmd) {
1334 	case BIOCSETF32:
1335 	case BIOCSETFNR32:
1336 	case BIOCSETWF32:
1337 	case BIOCGDLTLIST32:
1338 	case BIOCGRTIMEOUT32:
1339 	case BIOCSRTIMEOUT32:
1340 		if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1341 			BPFD_LOCK(d);
1342 			d->bd_compat32 = 1;
1343 			BPFD_UNLOCK(d);
1344 		}
1345 	}
1346 #endif
1347 
1348 	CURVNET_SET(TD_TO_VNET(td));
1349 	switch (cmd) {
1350 
1351 	default:
1352 		error = EINVAL;
1353 		break;
1354 
1355 	/*
1356 	 * Check for read packet available.
1357 	 */
1358 	case FIONREAD:
1359 		{
1360 			int n;
1361 
1362 			BPFD_LOCK(d);
1363 			n = d->bd_slen;
1364 			while (d->bd_hbuf_in_use)
1365 				mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
1366 				    PRINET, "bd_hbuf", 0);
1367 			if (d->bd_hbuf)
1368 				n += d->bd_hlen;
1369 			BPFD_UNLOCK(d);
1370 
1371 			*(int *)addr = n;
1372 			break;
1373 		}
1374 
1375 	/*
1376 	 * Get buffer len [for read()].
1377 	 */
1378 	case BIOCGBLEN:
1379 		BPFD_LOCK(d);
1380 		*(u_int *)addr = d->bd_bufsize;
1381 		BPFD_UNLOCK(d);
1382 		break;
1383 
1384 	/*
1385 	 * Set buffer length.
1386 	 */
1387 	case BIOCSBLEN:
1388 		error = bpf_ioctl_sblen(d, (u_int *)addr);
1389 		break;
1390 
1391 	/*
1392 	 * Set link layer read filter.
1393 	 */
1394 	case BIOCSETF:
1395 	case BIOCSETFNR:
1396 	case BIOCSETWF:
1397 #ifdef COMPAT_FREEBSD32
1398 	case BIOCSETF32:
1399 	case BIOCSETFNR32:
1400 	case BIOCSETWF32:
1401 #endif
1402 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1403 		break;
1404 
1405 	/*
1406 	 * Flush read packet buffer.
1407 	 */
1408 	case BIOCFLUSH:
1409 		BPFD_LOCK(d);
1410 		reset_d(d);
1411 		BPFD_UNLOCK(d);
1412 		break;
1413 
1414 	/*
1415 	 * Put interface into promiscuous mode.
1416 	 */
1417 	case BIOCPROMISC:
1418 		if (d->bd_bif == NULL) {
1419 			/*
1420 			 * No interface attached yet.
1421 			 */
1422 			error = EINVAL;
1423 			break;
1424 		}
1425 		if (d->bd_promisc == 0) {
1426 			error = ifpromisc(d->bd_bif->bif_ifp, 1);
1427 			if (error == 0)
1428 				d->bd_promisc = 1;
1429 		}
1430 		break;
1431 
1432 	/*
1433 	 * Get current data link type.
1434 	 */
1435 	case BIOCGDLT:
1436 		BPF_LOCK();
1437 		if (d->bd_bif == NULL)
1438 			error = EINVAL;
1439 		else
1440 			*(u_int *)addr = d->bd_bif->bif_dlt;
1441 		BPF_UNLOCK();
1442 		break;
1443 
1444 	/*
1445 	 * Get a list of supported data link types.
1446 	 */
1447 #ifdef COMPAT_FREEBSD32
1448 	case BIOCGDLTLIST32:
1449 		{
1450 			struct bpf_dltlist32 *list32;
1451 			struct bpf_dltlist dltlist;
1452 
1453 			list32 = (struct bpf_dltlist32 *)addr;
1454 			dltlist.bfl_len = list32->bfl_len;
1455 			dltlist.bfl_list = PTRIN(list32->bfl_list);
1456 			BPF_LOCK();
1457 			if (d->bd_bif == NULL)
1458 				error = EINVAL;
1459 			else {
1460 				error = bpf_getdltlist(d, &dltlist);
1461 				if (error == 0)
1462 					list32->bfl_len = dltlist.bfl_len;
1463 			}
1464 			BPF_UNLOCK();
1465 			break;
1466 		}
1467 #endif
1468 
1469 	case BIOCGDLTLIST:
1470 		BPF_LOCK();
1471 		if (d->bd_bif == NULL)
1472 			error = EINVAL;
1473 		else
1474 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1475 		BPF_UNLOCK();
1476 		break;
1477 
1478 	/*
1479 	 * Set data link type.
1480 	 */
1481 	case BIOCSDLT:
1482 		BPF_LOCK();
1483 		if (d->bd_bif == NULL)
1484 			error = EINVAL;
1485 		else
1486 			error = bpf_setdlt(d, *(u_int *)addr);
1487 		BPF_UNLOCK();
1488 		break;
1489 
1490 	/*
1491 	 * Get interface name.
1492 	 */
1493 	case BIOCGETIF:
1494 		BPF_LOCK();
1495 		if (d->bd_bif == NULL)
1496 			error = EINVAL;
1497 		else {
1498 			struct ifnet *const ifp = d->bd_bif->bif_ifp;
1499 			struct ifreq *const ifr = (struct ifreq *)addr;
1500 
1501 			strlcpy(ifr->ifr_name, ifp->if_xname,
1502 			    sizeof(ifr->ifr_name));
1503 		}
1504 		BPF_UNLOCK();
1505 		break;
1506 
1507 	/*
1508 	 * Set interface.
1509 	 */
1510 	case BIOCSETIF:
1511 		{
1512 			int alloc_buf, size;
1513 
1514 			/*
1515 			 * Behavior here depends on the buffering model.  If
1516 			 * we're using kernel memory buffers, then we can
1517 			 * allocate them here.  If we're using zero-copy,
1518 			 * then the user process must have registered buffers
1519 			 * by the time we get here.
1520 			 */
1521 			alloc_buf = 0;
1522 			BPFD_LOCK(d);
1523 			if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
1524 			    d->bd_sbuf == NULL)
1525 				alloc_buf = 1;
1526 			BPFD_UNLOCK(d);
1527 			if (alloc_buf) {
1528 				size = d->bd_bufsize;
1529 				error = bpf_buffer_ioctl_sblen(d, &size);
1530 				if (error != 0)
1531 					break;
1532 			}
1533 			BPF_LOCK();
1534 			error = bpf_setif(d, (struct ifreq *)addr);
1535 			BPF_UNLOCK();
1536 			break;
1537 		}
1538 
1539 	/*
1540 	 * Set read timeout.
1541 	 */
1542 	case BIOCSRTIMEOUT:
1543 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1544 	case BIOCSRTIMEOUT32:
1545 #endif
1546 		{
1547 			struct timeval *tv = (struct timeval *)addr;
1548 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1549 			struct timeval32 *tv32;
1550 			struct timeval tv64;
1551 
1552 			if (cmd == BIOCSRTIMEOUT32) {
1553 				tv32 = (struct timeval32 *)addr;
1554 				tv = &tv64;
1555 				tv->tv_sec = tv32->tv_sec;
1556 				tv->tv_usec = tv32->tv_usec;
1557 			} else
1558 #endif
1559 				tv = (struct timeval *)addr;
1560 
1561 			/*
1562 			 * Subtract 1 tick from tvtohz() since this isn't
1563 			 * a one-shot timer.
1564 			 */
1565 			if ((error = itimerfix(tv)) == 0)
1566 				d->bd_rtout = tvtohz(tv) - 1;
1567 			break;
1568 		}
1569 
1570 	/*
1571 	 * Get read timeout.
1572 	 */
1573 	case BIOCGRTIMEOUT:
1574 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1575 	case BIOCGRTIMEOUT32:
1576 #endif
1577 		{
1578 			struct timeval *tv;
1579 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1580 			struct timeval32 *tv32;
1581 			struct timeval tv64;
1582 
1583 			if (cmd == BIOCGRTIMEOUT32)
1584 				tv = &tv64;
1585 			else
1586 #endif
1587 				tv = (struct timeval *)addr;
1588 
1589 			tv->tv_sec = d->bd_rtout / hz;
1590 			tv->tv_usec = (d->bd_rtout % hz) * tick;
1591 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1592 			if (cmd == BIOCGRTIMEOUT32) {
1593 				tv32 = (struct timeval32 *)addr;
1594 				tv32->tv_sec = tv->tv_sec;
1595 				tv32->tv_usec = tv->tv_usec;
1596 			}
1597 #endif
1598 
1599 			break;
1600 		}
1601 
1602 	/*
1603 	 * Get packet stats.
1604 	 */
1605 	case BIOCGSTATS:
1606 		{
1607 			struct bpf_stat *bs = (struct bpf_stat *)addr;
1608 
1609 			/* XXXCSJP overflow */
1610 			bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount);
1611 			bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount);
1612 			break;
1613 		}
1614 
1615 	/*
1616 	 * Set immediate mode.
1617 	 */
1618 	case BIOCIMMEDIATE:
1619 		BPFD_LOCK(d);
1620 		d->bd_immediate = *(u_int *)addr;
1621 		BPFD_UNLOCK(d);
1622 		break;
1623 
1624 	case BIOCVERSION:
1625 		{
1626 			struct bpf_version *bv = (struct bpf_version *)addr;
1627 
1628 			bv->bv_major = BPF_MAJOR_VERSION;
1629 			bv->bv_minor = BPF_MINOR_VERSION;
1630 			break;
1631 		}
1632 
1633 	/*
1634 	 * Get "header already complete" flag
1635 	 */
1636 	case BIOCGHDRCMPLT:
1637 		BPFD_LOCK(d);
1638 		*(u_int *)addr = d->bd_hdrcmplt;
1639 		BPFD_UNLOCK(d);
1640 		break;
1641 
1642 	/*
1643 	 * Set "header already complete" flag
1644 	 */
1645 	case BIOCSHDRCMPLT:
1646 		BPFD_LOCK(d);
1647 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
1648 		BPFD_UNLOCK(d);
1649 		break;
1650 
1651 	/*
1652 	 * Get packet direction flag
1653 	 */
1654 	case BIOCGDIRECTION:
1655 		BPFD_LOCK(d);
1656 		*(u_int *)addr = d->bd_direction;
1657 		BPFD_UNLOCK(d);
1658 		break;
1659 
1660 	/*
1661 	 * Set packet direction flag
1662 	 */
1663 	case BIOCSDIRECTION:
1664 		{
1665 			u_int	direction;
1666 
1667 			direction = *(u_int *)addr;
1668 			switch (direction) {
1669 			case BPF_D_IN:
1670 			case BPF_D_INOUT:
1671 			case BPF_D_OUT:
1672 				BPFD_LOCK(d);
1673 				d->bd_direction = direction;
1674 				BPFD_UNLOCK(d);
1675 				break;
1676 			default:
1677 				error = EINVAL;
1678 			}
1679 		}
1680 		break;
1681 
1682 	/*
1683 	 * Get packet timestamp format and resolution.
1684 	 */
1685 	case BIOCGTSTAMP:
1686 		BPFD_LOCK(d);
1687 		*(u_int *)addr = d->bd_tstamp;
1688 		BPFD_UNLOCK(d);
1689 		break;
1690 
1691 	/*
1692 	 * Set packet timestamp format and resolution.
1693 	 */
1694 	case BIOCSTSTAMP:
1695 		{
1696 			u_int	func;
1697 
1698 			func = *(u_int *)addr;
1699 			if (BPF_T_VALID(func))
1700 				d->bd_tstamp = func;
1701 			else
1702 				error = EINVAL;
1703 		}
1704 		break;
1705 
1706 	case BIOCFEEDBACK:
1707 		BPFD_LOCK(d);
1708 		d->bd_feedback = *(u_int *)addr;
1709 		BPFD_UNLOCK(d);
1710 		break;
1711 
1712 	case BIOCLOCK:
1713 		BPFD_LOCK(d);
1714 		d->bd_locked = 1;
1715 		BPFD_UNLOCK(d);
1716 		break;
1717 
1718 	case FIONBIO:		/* Non-blocking I/O */
1719 		break;
1720 
1721 	case FIOASYNC:		/* Send signal on receive packets */
1722 		BPFD_LOCK(d);
1723 		d->bd_async = *(int *)addr;
1724 		BPFD_UNLOCK(d);
1725 		break;
1726 
1727 	case FIOSETOWN:
1728 		/*
1729 		 * XXX: Add some sort of locking here?
1730 		 * fsetown() can sleep.
1731 		 */
1732 		error = fsetown(*(int *)addr, &d->bd_sigio);
1733 		break;
1734 
1735 	case FIOGETOWN:
1736 		BPFD_LOCK(d);
1737 		*(int *)addr = fgetown(&d->bd_sigio);
1738 		BPFD_UNLOCK(d);
1739 		break;
1740 
1741 	/* This is deprecated, FIOSETOWN should be used instead. */
1742 	case TIOCSPGRP:
1743 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
1744 		break;
1745 
1746 	/* This is deprecated, FIOGETOWN should be used instead. */
1747 	case TIOCGPGRP:
1748 		*(int *)addr = -fgetown(&d->bd_sigio);
1749 		break;
1750 
1751 	case BIOCSRSIG:		/* Set receive signal */
1752 		{
1753 			u_int sig;
1754 
1755 			sig = *(u_int *)addr;
1756 
1757 			if (sig >= NSIG)
1758 				error = EINVAL;
1759 			else {
1760 				BPFD_LOCK(d);
1761 				d->bd_sig = sig;
1762 				BPFD_UNLOCK(d);
1763 			}
1764 			break;
1765 		}
1766 	case BIOCGRSIG:
1767 		BPFD_LOCK(d);
1768 		*(u_int *)addr = d->bd_sig;
1769 		BPFD_UNLOCK(d);
1770 		break;
1771 
1772 	case BIOCGETBUFMODE:
1773 		BPFD_LOCK(d);
1774 		*(u_int *)addr = d->bd_bufmode;
1775 		BPFD_UNLOCK(d);
1776 		break;
1777 
1778 	case BIOCSETBUFMODE:
1779 		/*
1780 		 * Allow the buffering mode to be changed as long as we
1781 		 * haven't yet committed to a particular mode.  Our
1782 		 * definition of commitment, for now, is whether or not a
1783 		 * buffer has been allocated or an interface attached, since
1784 		 * that's the point where things get tricky.
1785 		 */
1786 		switch (*(u_int *)addr) {
1787 		case BPF_BUFMODE_BUFFER:
1788 			break;
1789 
1790 		case BPF_BUFMODE_ZBUF:
1791 			if (bpf_zerocopy_enable)
1792 				break;
1793 			/* FALLSTHROUGH */
1794 
1795 		default:
1796 			CURVNET_RESTORE();
1797 			return (EINVAL);
1798 		}
1799 
1800 		BPFD_LOCK(d);
1801 		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1802 		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
1803 			BPFD_UNLOCK(d);
1804 			CURVNET_RESTORE();
1805 			return (EBUSY);
1806 		}
1807 		d->bd_bufmode = *(u_int *)addr;
1808 		BPFD_UNLOCK(d);
1809 		break;
1810 
1811 	case BIOCGETZMAX:
1812 		error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
1813 		break;
1814 
1815 	case BIOCSETZBUF:
1816 		error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
1817 		break;
1818 
1819 	case BIOCROTZBUF:
1820 		error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
1821 		break;
1822 	}
1823 	CURVNET_RESTORE();
1824 	return (error);
1825 }
1826 
/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests and 0 on
 * success.
 *
 * cmd selects which filter is replaced: BIOCSETWF replaces the write
 * filter, every other value replaces the read filter, and BIOCSETF
 * additionally resets the descriptor.  A program of length zero removes the
 * filter.
 *
 * Note we need the global lock here to serialize bpf_setf() and bpf_setif()
 * calls, since reading d->bd_bif can't be protected by the descriptor or
 * interface lock due to lock order.
 *
 * Additionally, we have to acquire the interface write lock because
 * bpf_mtap() uses the interface read lock to read all filters.
 */
static int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
{
#ifdef COMPAT_FREEBSD32
	struct bpf_program fp_swab;
	struct bpf_program32 *fp32;
#endif
	struct bpf_insn *fcode, *old;
#ifdef BPF_JITTER
	bpf_jit_filter *jfunc, *ofunc;
#endif
	size_t size;
	u_int flen;
	int need_upgrade;

#ifdef COMPAT_FREEBSD32
	/*
	 * Widen a 32-bit program descriptor into the native layout.  Only
	 * BIOCSETF32 and BIOCSETWF32 are mapped to their native command;
	 * BIOCSETFNR32 is left as is and therefore takes the read-filter,
	 * no-reset path below.
	 */
	switch (cmd) {
	case BIOCSETF32:
	case BIOCSETWF32:
	case BIOCSETFNR32:
		fp32 = (struct bpf_program32 *)fp;
		fp_swab.bf_len = fp32->bf_len;
		fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
		fp = &fp_swab;
		switch (cmd) {
		case BIOCSETF32:
			cmd = BIOCSETF;
			break;
		case BIOCSETWF32:
			cmd = BIOCSETWF;
			break;
		}
		break;
	}
#endif

	fcode = NULL;
#ifdef BPF_JITTER
	jfunc = ofunc = NULL;
#endif
	need_upgrade = 0;

	/*
	 * Check the new filter's validity before acquiring any locks.
	 * Allocate memory for the new filter, if needed.
	 */
	flen = fp->bf_len;
	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
		return (EINVAL);
	size = flen * sizeof(*fp->bf_insns);
	if (size > 0) {
		/* We're setting up new filter.  Copy and check actual data. */
		fcode = malloc(size, M_BPF, M_WAITOK);
		if (copyin(fp->bf_insns, fcode, size) != 0 ||
		    !bpf_validate(fcode, flen)) {
			free(fcode, M_BPF);
			return (EINVAL);
		}
#ifdef BPF_JITTER
		/* Filter is copied inside fcode and is perfectly valid. */
		jfunc = bpf_jitter(fcode, flen);
#endif
	}

	BPF_LOCK();

	/*
	 * Set up new filter.
	 * Protect filter change by interface lock.
	 * Additionally, we are protected by global lock here.
	 */
	if (d->bd_bif != NULL)
		BPFIF_WLOCK(d->bd_bif);
	BPFD_LOCK(d);
	if (cmd == BIOCSETWF) {
		old = d->bd_wfilter;
		d->bd_wfilter = fcode;
	} else {
		old = d->bd_rfilter;
		d->bd_rfilter = fcode;
#ifdef BPF_JITTER
		ofunc = d->bd_bfilter;
		d->bd_bfilter = jfunc;
#endif
		if (cmd == BIOCSETF)
			reset_d(d);

		need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen);
	}
	BPFD_UNLOCK(d);
	if (d->bd_bif != NULL)
		BPFIF_WUNLOCK(d->bd_bif);
	/* The previous program is released only after it was unhooked. */
	if (old != NULL)
		free(old, M_BPF);
#ifdef BPF_JITTER
	if (ofunc != NULL)
		bpf_destroy_jit_filter(ofunc);
#endif

	/* Move d to active readers list. */
	if (need_upgrade != 0)
		bpf_upgraded(d);

	BPF_UNLOCK();
	return (0);
}
1945 
1946 /*
1947  * Detach a file from its current interface (if attached at all) and attach
1948  * to the interface indicated by the name stored in ifr.
1949  * Return an errno or 0.
1950  */
1951 static int
1952 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
1953 {
1954 	struct bpf_if *bp;
1955 	struct ifnet *theywant;
1956 
1957 	BPF_LOCK_ASSERT();
1958 
1959 	theywant = ifunit(ifr->ifr_name);
1960 	if (theywant == NULL || theywant->if_bpf == NULL)
1961 		return (ENXIO);
1962 
1963 	bp = theywant->if_bpf;
1964 
1965 	/* Check if interface is not being detached from BPF */
1966 	BPFIF_RLOCK(bp);
1967 	if (bp->bif_flags & BPFIF_FLAG_DYING) {
1968 		BPFIF_RUNLOCK(bp);
1969 		return (ENXIO);
1970 	}
1971 	BPFIF_RUNLOCK(bp);
1972 
1973 	/*
1974 	 * At this point, we expect the buffer is already allocated.  If not,
1975 	 * return an error.
1976 	 */
1977 	switch (d->bd_bufmode) {
1978 	case BPF_BUFMODE_BUFFER:
1979 	case BPF_BUFMODE_ZBUF:
1980 		if (d->bd_sbuf == NULL)
1981 			return (EINVAL);
1982 		break;
1983 
1984 	default:
1985 		panic("bpf_setif: bufmode %d", d->bd_bufmode);
1986 	}
1987 	if (bp != d->bd_bif)
1988 		bpf_attachd(d, bp);
1989 	BPFD_LOCK(d);
1990 	reset_d(d);
1991 	BPFD_UNLOCK(d);
1992 	return (0);
1993 }
1994 
1995 /*
1996  * Support for select() and poll() system calls
1997  *
1998  * Return true iff the specific operation will not block indefinitely.
1999  * Otherwise, return false but make a note that a selwakeup() must be done.
2000  */
2001 static int
2002 bpfpoll(struct cdev *dev, int events, struct thread *td)
2003 {
2004 	struct bpf_d *d;
2005 	int revents;
2006 
2007 	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
2008 		return (events &
2009 		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
2010 
2011 	/*
2012 	 * Refresh PID associated with this descriptor.
2013 	 */
2014 	revents = events & (POLLOUT | POLLWRNORM);
2015 	BPFD_LOCK(d);
2016 	BPF_PID_REFRESH(d, td);
2017 	if (events & (POLLIN | POLLRDNORM)) {
2018 		if (bpf_ready(d))
2019 			revents |= events & (POLLIN | POLLRDNORM);
2020 		else {
2021 			selrecord(td, &d->bd_sel);
2022 			/* Start the read timeout if necessary. */
2023 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2024 				callout_reset(&d->bd_callout, d->bd_rtout,
2025 				    bpf_timed_out, d);
2026 				d->bd_state = BPF_WAITING;
2027 			}
2028 		}
2029 	}
2030 	BPFD_UNLOCK(d);
2031 	return (revents);
2032 }
2033 
2034 /*
2035  * Support for kevent() system call.  Register EVFILT_READ filters and
2036  * reject all others.
2037  */
2038 int
2039 bpfkqfilter(struct cdev *dev, struct knote *kn)
2040 {
2041 	struct bpf_d *d;
2042 
2043 	if (devfs_get_cdevpriv((void **)&d) != 0 ||
2044 	    kn->kn_filter != EVFILT_READ)
2045 		return (1);
2046 
2047 	/*
2048 	 * Refresh PID associated with this descriptor.
2049 	 */
2050 	BPFD_LOCK(d);
2051 	BPF_PID_REFRESH_CUR(d);
2052 	kn->kn_fop = &bpfread_filtops;
2053 	kn->kn_hook = d;
2054 	knlist_add(&d->bd_sel.si_note, kn, 1);
2055 	BPFD_UNLOCK(d);
2056 
2057 	return (0);
2058 }
2059 
2060 static void
2061 filt_bpfdetach(struct knote *kn)
2062 {
2063 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2064 
2065 	knlist_remove(&d->bd_sel.si_note, kn, 0);
2066 }
2067 
2068 static int
2069 filt_bpfread(struct knote *kn, long hint)
2070 {
2071 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2072 	int ready;
2073 
2074 	BPFD_LOCK_ASSERT(d);
2075 	ready = bpf_ready(d);
2076 	if (ready) {
2077 		kn->kn_data = d->bd_slen;
2078 		/*
2079 		 * Ignore the hold buffer if it is being copied to user space.
2080 		 */
2081 		if (!d->bd_hbuf_in_use && d->bd_hbuf)
2082 			kn->kn_data += d->bd_hlen;
2083 	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2084 		callout_reset(&d->bd_callout, d->bd_rtout,
2085 		    bpf_timed_out, d);
2086 		d->bd_state = BPF_WAITING;
2087 	}
2088 
2089 	return (ready);
2090 }
2091 
2092 #define	BPF_TSTAMP_NONE		0
2093 #define	BPF_TSTAMP_FAST		1
2094 #define	BPF_TSTAMP_NORMAL	2
2095 #define	BPF_TSTAMP_EXTERN	3
2096 
2097 static int
2098 bpf_ts_quality(int tstype)
2099 {
2100 
2101 	if (tstype == BPF_T_NONE)
2102 		return (BPF_TSTAMP_NONE);
2103 	if ((tstype & BPF_T_FAST) != 0)
2104 		return (BPF_TSTAMP_FAST);
2105 
2106 	return (BPF_TSTAMP_NORMAL);
2107 }
2108 
2109 static int
2110 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
2111 {
2112 	struct m_tag *tag;
2113 	int quality;
2114 
2115 	quality = bpf_ts_quality(tstype);
2116 	if (quality == BPF_TSTAMP_NONE)
2117 		return (quality);
2118 
2119 	if (m != NULL) {
2120 		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
2121 		if (tag != NULL) {
2122 			*bt = *(struct bintime *)(tag + 1);
2123 			return (BPF_TSTAMP_EXTERN);
2124 		}
2125 	}
2126 	if (quality == BPF_TSTAMP_NORMAL)
2127 		binuptime(bt);
2128 	else
2129 		getbinuptime(bt);
2130 
2131 	return (quality);
2132 }
2133 
/*
 * Incoming linkage from device drivers.  Process the packet pkt, of length
 * pktlen, which is stored in a contiguous buffer.  The packet is parsed
 * by each process' filter, and if accepted, stashed into the corresponding
 * buffer.
 *
 * A timestamp is taken at most once per required quality level and shared
 * between the descriptors that accept the packet.
 */
void
bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
{
	struct bintime bt;
	struct bpf_d *d;
#ifdef BPF_JITTER
	bpf_jit_filter *bf;
#endif
	u_int slen;
	int gottime;

	gottime = BPF_TSTAMP_NONE;

	BPFIF_RLOCK(bp);

	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		/*
		 * We are not using any locks for d here because:
		 * 1) any filter change is protected by interface
		 * write lock
		 * 2) destroying/detaching d is protected by interface
		 * write lock, too
		 */

		counter_u64_add(d->bd_rcount, 1);
		/*
		 * NB: We don't call BPF_CHECK_DIRECTION() here since there is
		 * no way for the caller to indicate to us whether this packet
		 * is inbound or outbound.  In the bpf_mtap() routines, we use
		 * the interface pointers on the mbuf to figure it out.
		 */
#ifdef BPF_JITTER
		/* Use the JIT-compiled filter when enabled and available. */
		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
		if (bf != NULL)
			slen = (*(bf->func))(pkt, pktlen, pktlen);
		else
#endif
		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
		if (slen != 0) {
			/*
			 * Filter matches.  Acquire the descriptor lock.
			 */
			BPFD_LOCK(d);

			counter_u64_add(d->bd_fcount, 1);
			/* Only fetch a new timestamp if a better one is needed. */
			if (gottime < bpf_ts_quality(d->bd_tstamp))
				gottime = bpf_gettime(&bt, d->bd_tstamp, NULL);
#ifdef MAC
			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
				catchpacket(d, pkt, pktlen, slen,
				    bpf_append_bytes, &bt);
			BPFD_UNLOCK(d);
		}
	}
	BPFIF_RUNLOCK(bp);
}
2197 
/*
 * True when descriptor d must skip a packet because of its direction
 * setting.  r is the packet's receive interface and i is the tapped
 * interface: BPF_D_IN skips packets for which the two differ, BPF_D_OUT
 * skips packets for which they are the same.
 */
#define	BPF_CHECK_DIRECTION(d, r, i)				\
	    (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||	\
	    ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
2201 
/*
 * Incoming linkage from device drivers, when packet is in an mbuf chain.
 * Locking model is explained in bpf_tap().
 *
 * Packets flagged M_PROMISC that carry no receive interface are skipped,
 * and the flag is cleared on them.
 */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{
	struct bintime bt;
	struct bpf_d *d;
#ifdef BPF_JITTER
	bpf_jit_filter *bf;
#endif
	u_int pktlen, slen;
	int gottime;

	/* Skip outgoing duplicate packets. */
	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
		m->m_flags &= ~M_PROMISC;
		return;
	}

	pktlen = m_length(m, NULL);
	gottime = BPF_TSTAMP_NONE;

	BPFIF_RLOCK(bp);

	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		/* Honour the descriptor's direction setting. */
		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
			continue;
		counter_u64_add(d->bd_rcount, 1);
#ifdef BPF_JITTER
		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
		/* XXX We cannot handle multiple mbufs. */
		if (bf != NULL && m->m_next == NULL)
			slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen);
		else
#endif
		/* A buffer length of 0 tells bpf_filter() that m is an mbuf. */
		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
		if (slen != 0) {
			BPFD_LOCK(d);

			counter_u64_add(d->bd_fcount, 1);
			/* Only fetch a new timestamp if a better one is needed. */
			if (gottime < bpf_ts_quality(d->bd_tstamp))
				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
#ifdef MAC
			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
				catchpacket(d, (u_char *)m, pktlen, slen,
				    bpf_append_mbuf, &bt);
			BPFD_UNLOCK(d);
		}
	}
	BPFIF_RUNLOCK(bp);
}
2256 
/*
 * Incoming linkage from device drivers, when packet is in
 * an mbuf chain and to be prepended by a contiguous header.
 *
 * data/dlen describe the header; it is presented to the filter and to
 * catchpacket() as an extra mbuf placed in front of m.  Unlike bpf_mtap(),
 * this routine always uses bpf_filter() and never the JIT-compiled filter.
 */
void
bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
{
	struct bintime bt;
	struct mbuf mb;
	struct bpf_d *d;
	u_int pktlen, slen;
	int gottime;

	/* Skip outgoing duplicate packets. */
	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
		m->m_flags &= ~M_PROMISC;
		return;
	}

	pktlen = m_length(m, NULL);
	/*
	 * Craft on-stack mbuf suitable for passing to bpf_filter.
	 * Note that we cut corners here; we only setup what's
	 * absolutely needed--this mbuf should never go anywhere else.
	 */
	mb.m_next = m;
	mb.m_data = data;
	mb.m_len = dlen;
	pktlen += dlen;

	gottime = BPF_TSTAMP_NONE;

	BPFIF_RLOCK(bp);

	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		/* The direction check and timestamp tag lookup use the real m. */
		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
			continue;
		counter_u64_add(d->bd_rcount, 1);
		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
		if (slen != 0) {
			BPFD_LOCK(d);

			counter_u64_add(d->bd_fcount, 1);
			/* Only fetch a new timestamp if a better one is needed. */
			if (gottime < bpf_ts_quality(d->bd_tstamp))
				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
#ifdef MAC
			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
#endif
				catchpacket(d, (u_char *)&mb, pktlen, slen,
				    bpf_append_mbuf, &bt);
			BPFD_UNLOCK(d);
		}
	}
	BPFIF_RUNLOCK(bp);
}
2312 
/* The direction-check helper is not used past this point. */
#undef	BPF_CHECK_DIRECTION

/* Neither are the timestamp quality levels. */
#undef	BPF_TSTAMP_NONE
#undef	BPF_TSTAMP_FAST
#undef	BPF_TSTAMP_NORMAL
#undef	BPF_TSTAMP_EXTERN
2319 
/*
 * Compute the length of the BPF header for descriptor d, including the
 * padding needed so that BPF header plus link-layer header (bif_hdrlen)
 * together end on a word boundary.  The link-layer header length itself is
 * not part of the returned value.
 *
 * Which header structure is measured depends on the descriptor's timestamp
 * setting and, under COMPAT_FREEBSD32, on whether it is marked 32-bit.
 * The descriptor must be attached to an interface (d->bd_bif is
 * dereferenced unconditionally).
 */
static int
bpf_hdrlen(struct bpf_d *d)
{
	int hdrlen;

	hdrlen = d->bd_bif->bif_hdrlen;
#ifndef BURN_BRIDGES
	/* No timestamp or microtime format: use the older header layout. */
	if (d->bd_tstamp == BPF_T_NONE ||
	    BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
#ifdef COMPAT_FREEBSD32
		if (d->bd_compat32)
			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
		else
#endif
			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
	else
#endif
		hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
	/* Round the combined length up to the applicable word alignment. */
#ifdef COMPAT_FREEBSD32
	if (d->bd_compat32)
		hdrlen = BPF_WORDALIGN32(hdrlen);
	else
#endif
		hdrlen = BPF_WORDALIGN(hdrlen);

	return (hdrlen - d->bd_bif->bif_hdrlen);
}
2347 
2348 static void
2349 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
2350 {
2351 	struct bintime bt2, boottimebin;
2352 	struct timeval tsm;
2353 	struct timespec tsn;
2354 
2355 	if ((tstype & BPF_T_MONOTONIC) == 0) {
2356 		bt2 = *bt;
2357 		getboottimebin(&boottimebin);
2358 		bintime_add(&bt2, &boottimebin);
2359 		bt = &bt2;
2360 	}
2361 	switch (BPF_T_FORMAT(tstype)) {
2362 	case BPF_T_MICROTIME:
2363 		bintime2timeval(bt, &tsm);
2364 		ts->bt_sec = tsm.tv_sec;
2365 		ts->bt_frac = tsm.tv_usec;
2366 		break;
2367 	case BPF_T_NANOTIME:
2368 		bintime2timespec(bt, &tsn);
2369 		ts->bt_sec = tsn.tv_sec;
2370 		ts->bt_frac = tsn.tv_nsec;
2371 		break;
2372 	case BPF_T_BINTIME:
2373 		ts->bt_sec = bt->sec;
2374 		ts->bt_frac = bt->frac;
2375 		break;
2376 	}
2377 }
2378 
2379 /*
2380  * Move the packet data from interface memory (pkt) into the
2381  * store buffer.  "cpfn" is the routine called to do the actual data
2382  * transfer.  bcopy is passed in to copy contiguous chunks, while
2383  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
2384  * pkt is really an mbuf.
2385  */
2386 static void
2387 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
2388     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
2389     struct bintime *bt)
2390 {
2391 	struct bpf_xhdr hdr;
2392 #ifndef BURN_BRIDGES
2393 	struct bpf_hdr hdr_old;
2394 #ifdef COMPAT_FREEBSD32
2395 	struct bpf_hdr32 hdr32_old;
2396 #endif
2397 #endif
2398 	int caplen, curlen, hdrlen, totlen;
2399 	int do_wakeup = 0;
2400 	int do_timestamp;
2401 	int tstype;
2402 
2403 	BPFD_LOCK_ASSERT(d);
2404 
2405 	/*
2406 	 * Detect whether user space has released a buffer back to us, and if
2407 	 * so, move it from being a hold buffer to a free buffer.  This may
2408 	 * not be the best place to do it (for example, we might only want to
2409 	 * run this check if we need the space), but for now it's a reliable
2410 	 * spot to do it.
2411 	 */
2412 	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
2413 		d->bd_fbuf = d->bd_hbuf;
2414 		d->bd_hbuf = NULL;
2415 		d->bd_hlen = 0;
2416 		bpf_buf_reclaimed(d);
2417 	}
2418 
2419 	/*
2420 	 * Figure out how many bytes to move.  If the packet is
2421 	 * greater or equal to the snapshot length, transfer that
2422 	 * much.  Otherwise, transfer the whole packet (unless
2423 	 * we hit the buffer size limit).
2424 	 */
2425 	hdrlen = bpf_hdrlen(d);
2426 	totlen = hdrlen + min(snaplen, pktlen);
2427 	if (totlen > d->bd_bufsize)
2428 		totlen = d->bd_bufsize;
2429 
2430 	/*
2431 	 * Round up the end of the previous packet to the next longword.
2432 	 *
2433 	 * Drop the packet if there's no room and no hope of room
2434 	 * If the packet would overflow the storage buffer or the storage
2435 	 * buffer is considered immutable by the buffer model, try to rotate
2436 	 * the buffer and wakeup pending processes.
2437 	 */
2438 #ifdef COMPAT_FREEBSD32
2439 	if (d->bd_compat32)
2440 		curlen = BPF_WORDALIGN32(d->bd_slen);
2441 	else
2442 #endif
2443 		curlen = BPF_WORDALIGN(d->bd_slen);
2444 	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
2445 		if (d->bd_fbuf == NULL) {
2446 			/*
2447 			 * There's no room in the store buffer, and no
2448 			 * prospect of room, so drop the packet.  Notify the
2449 			 * buffer model.
2450 			 */
2451 			bpf_buffull(d);
2452 			counter_u64_add(d->bd_dcount, 1);
2453 			return;
2454 		}
2455 		KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use"));
2456 		ROTATE_BUFFERS(d);
2457 		do_wakeup = 1;
2458 		curlen = 0;
2459 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
2460 		/*
2461 		 * Immediate mode is set, or the read timeout has already
2462 		 * expired during a select call.  A packet arrived, so the
2463 		 * reader should be woken up.
2464 		 */
2465 		do_wakeup = 1;
2466 	caplen = totlen - hdrlen;
2467 	tstype = d->bd_tstamp;
2468 	do_timestamp = tstype != BPF_T_NONE;
2469 #ifndef BURN_BRIDGES
2470 	if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
2471 		struct bpf_ts ts;
2472 		if (do_timestamp)
2473 			bpf_bintime2ts(bt, &ts, tstype);
2474 #ifdef COMPAT_FREEBSD32
2475 		if (d->bd_compat32) {
2476 			bzero(&hdr32_old, sizeof(hdr32_old));
2477 			if (do_timestamp) {
2478 				hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
2479 				hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
2480 			}
2481 			hdr32_old.bh_datalen = pktlen;
2482 			hdr32_old.bh_hdrlen = hdrlen;
2483 			hdr32_old.bh_caplen = caplen;
2484 			bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
2485 			    sizeof(hdr32_old));
2486 			goto copy;
2487 		}
2488 #endif
2489 		bzero(&hdr_old, sizeof(hdr_old));
2490 		if (do_timestamp) {
2491 			hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
2492 			hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
2493 		}
2494 		hdr_old.bh_datalen = pktlen;
2495 		hdr_old.bh_hdrlen = hdrlen;
2496 		hdr_old.bh_caplen = caplen;
2497 		bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
2498 		    sizeof(hdr_old));
2499 		goto copy;
2500 	}
2501 #endif
2502 
2503 	/*
2504 	 * Append the bpf header.  Note we append the actual header size, but
2505 	 * move forward the length of the header plus padding.
2506 	 */
2507 	bzero(&hdr, sizeof(hdr));
2508 	if (do_timestamp)
2509 		bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
2510 	hdr.bh_datalen = pktlen;
2511 	hdr.bh_hdrlen = hdrlen;
2512 	hdr.bh_caplen = caplen;
2513 	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
2514 
2515 	/*
2516 	 * Copy the packet data into the store buffer and update its length.
2517 	 */
2518 #ifndef BURN_BRIDGES
2519 copy:
2520 #endif
2521 	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
2522 	d->bd_slen = curlen + totlen;
2523 
2524 	if (do_wakeup)
2525 		bpf_wakeup(d);
2526 }
2527 
2528 /*
2529  * Free buffers currently in use by a descriptor.
2530  * Called on close.
2531  */
2532 static void
2533 bpf_freed(struct bpf_d *d)
2534 {
2535 
2536 	/*
2537 	 * We don't need to lock out interrupts since this descriptor has
2538 	 * been detached from its interface and it yet hasn't been marked
2539 	 * free.
2540 	 */
2541 	bpf_free(d);
2542 	if (d->bd_rfilter != NULL) {
2543 		free((caddr_t)d->bd_rfilter, M_BPF);
2544 #ifdef BPF_JITTER
2545 		if (d->bd_bfilter != NULL)
2546 			bpf_destroy_jit_filter(d->bd_bfilter);
2547 #endif
2548 	}
2549 	if (d->bd_wfilter != NULL)
2550 		free((caddr_t)d->bd_wfilter, M_BPF);
2551 	mtx_destroy(&d->bd_lock);
2552 
2553 	counter_u64_free(d->bd_rcount);
2554 	counter_u64_free(d->bd_dcount);
2555 	counter_u64_free(d->bd_fcount);
2556 	counter_u64_free(d->bd_wcount);
2557 	counter_u64_free(d->bd_wfcount);
2558 	counter_u64_free(d->bd_wdcount);
2559 	counter_u64_free(d->bd_zcopy);
2560 
2561 }
2562 
2563 /*
2564  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
2565  * fixed size of the link header (variable length headers not yet supported).
2566  */
2567 void
2568 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2569 {
2570 
2571 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2572 }
2573 
2574 /*
2575  * Attach an interface to bpf.  ifp is a pointer to the structure
2576  * defining the interface to be attached, dlt is the link layer type,
2577  * and hdrlen is the fixed size of the link header (variable length
2578  * headers are not yet supporrted).
2579  */
2580 void
2581 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
2582 {
2583 	struct bpf_if *bp;
2584 
2585 	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
2586 	if (bp == NULL)
2587 		panic("bpfattach");
2588 
2589 	LIST_INIT(&bp->bif_dlist);
2590 	LIST_INIT(&bp->bif_wlist);
2591 	bp->bif_ifp = ifp;
2592 	bp->bif_dlt = dlt;
2593 	rw_init(&bp->bif_lock, "bpf interface lock");
2594 	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
2595 	bp->bif_bpf = driverp;
2596 	*driverp = bp;
2597 
2598 	BPF_LOCK();
2599 	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
2600 	BPF_UNLOCK();
2601 
2602 	bp->bif_hdrlen = hdrlen;
2603 
2604 	if (bootverbose && IS_DEFAULT_VNET(curvnet))
2605 		if_printf(ifp, "bpf attached\n");
2606 }
2607 
#ifdef VIMAGE
/*
 * When moving interfaces between vnet instances we need a way to
 * query the dlt and hdrlen before detach so we can re-attach the if_bpf
 * after the vmove.  We unfortunately have no device driver infrastructure
 * to query the interface for these values after creation/attach, thus
 * add this as a workaround.
 *
 * Either output pointer may be NULL, in which case that value is simply
 * not reported.  Returns ENXIO when bp is NULL and 0 otherwise.
 */
int
bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen)
{

	if (bp == NULL)
		return (ENXIO);

	if (bif_dlt != NULL)
		*bif_dlt = bp->bif_dlt;
	if (bif_hdrlen != NULL)
		*bif_hdrlen = bp->bif_hdrlen;
	return (0);
}
#endif
2633 
2634 /*
2635  * Detach bpf from an interface. This involves detaching each descriptor
2636  * associated with the interface. Notify each descriptor as it's detached
2637  * so that any sleepers wake up and get ENXIO.
2638  */
2639 void
2640 bpfdetach(struct ifnet *ifp)
2641 {
2642 	struct bpf_if	*bp, *bp_temp;
2643 	struct bpf_d	*d;
2644 	int ndetached;
2645 
2646 	ndetached = 0;
2647 
2648 	BPF_LOCK();
2649 	/* Find all bpf_if struct's which reference ifp and detach them. */
2650 	LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) {
2651 		if (ifp != bp->bif_ifp)
2652 			continue;
2653 
2654 		LIST_REMOVE(bp, bif_next);
2655 		/* Add to to-be-freed list */
2656 		LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next);
2657 
2658 		ndetached++;
2659 		/*
2660 		 * Delay freeing bp till interface is detached
2661 		 * and all routes through this interface are removed.
2662 		 * Mark bp as detached to restrict new consumers.
2663 		 */
2664 		BPFIF_WLOCK(bp);
2665 		bp->bif_flags |= BPFIF_FLAG_DYING;
2666 		*bp->bif_bpf = (struct bpf_if *)&dead_bpf_if;
2667 		BPFIF_WUNLOCK(bp);
2668 
2669 		CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p",
2670 		    __func__, bp->bif_dlt, bp, ifp);
2671 
2672 		/* Free common descriptors */
2673 		while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) {
2674 			bpf_detachd_locked(d);
2675 			BPFD_LOCK(d);
2676 			bpf_wakeup(d);
2677 			BPFD_UNLOCK(d);
2678 		}
2679 
2680 		/* Free writer-only descriptors */
2681 		while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) {
2682 			bpf_detachd_locked(d);
2683 			BPFD_LOCK(d);
2684 			bpf_wakeup(d);
2685 			BPFD_UNLOCK(d);
2686 		}
2687 	}
2688 	BPF_UNLOCK();
2689 
2690 #ifdef INVARIANTS
2691 	if (ndetached == 0)
2692 		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
2693 #endif
2694 }
2695 
2696 /*
2697  * Interface departure handler.
2698  * Note departure event does not guarantee interface is going down.
2699  * Interface renaming is currently done via departure/arrival event set.
2700  *
2701  * Departure handled is called after all routes pointing to
2702  * given interface are removed and interface is in down state
2703  * restricting any packets to be sent/received. We assume it is now safe
2704  * to free data allocated by BPF.
2705  */
2706 static void
2707 bpf_ifdetach(void *arg __unused, struct ifnet *ifp)
2708 {
2709 	struct bpf_if *bp, *bp_temp;
2710 	int nmatched = 0;
2711 
2712 	/* Ignore ifnet renaming. */
2713 	if (ifp->if_flags & IFF_RENAMING)
2714 		return;
2715 
2716 	BPF_LOCK();
2717 	/*
2718 	 * Find matching entries in free list.
2719 	 * Nothing should be found if bpfdetach() was not called.
2720 	 */
2721 	LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) {
2722 		if (ifp != bp->bif_ifp)
2723 			continue;
2724 
2725 		CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p",
2726 		    __func__, bp, ifp);
2727 
2728 		LIST_REMOVE(bp, bif_next);
2729 
2730 		rw_destroy(&bp->bif_lock);
2731 		free(bp, M_BPF);
2732 
2733 		nmatched++;
2734 	}
2735 	BPF_UNLOCK();
2736 }
2737 
2738 /*
2739  * Get a list of available data link type of the interface.
2740  */
2741 static int
2742 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
2743 {
2744 	struct ifnet *ifp;
2745 	struct bpf_if *bp;
2746 	u_int *lst;
2747 	int error, n, n1;
2748 
2749 	BPF_LOCK_ASSERT();
2750 
2751 	ifp = d->bd_bif->bif_ifp;
2752 again:
2753 	n1 = 0;
2754 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2755 		if (bp->bif_ifp == ifp)
2756 			n1++;
2757 	}
2758 	if (bfl->bfl_list == NULL) {
2759 		bfl->bfl_len = n1;
2760 		return (0);
2761 	}
2762 	if (n1 > bfl->bfl_len)
2763 		return (ENOMEM);
2764 	BPF_UNLOCK();
2765 	lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK);
2766 	n = 0;
2767 	BPF_LOCK();
2768 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2769 		if (bp->bif_ifp != ifp)
2770 			continue;
2771 		if (n >= n1) {
2772 			free(lst, M_TEMP);
2773 			goto again;
2774 		}
2775 		lst[n] = bp->bif_dlt;
2776 		n++;
2777 	}
2778 	BPF_UNLOCK();
2779 	error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n);
2780 	free(lst, M_TEMP);
2781 	BPF_LOCK();
2782 	bfl->bfl_len = n;
2783 	return (error);
2784 }
2785 
2786 /*
2787  * Set the data link type of a BPF instance.
2788  */
2789 static int
2790 bpf_setdlt(struct bpf_d *d, u_int dlt)
2791 {
2792 	int error, opromisc;
2793 	struct ifnet *ifp;
2794 	struct bpf_if *bp;
2795 
2796 	BPF_LOCK_ASSERT();
2797 
2798 	if (d->bd_bif->bif_dlt == dlt)
2799 		return (0);
2800 	ifp = d->bd_bif->bif_ifp;
2801 
2802 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2803 		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
2804 			break;
2805 	}
2806 
2807 	if (bp != NULL) {
2808 		opromisc = d->bd_promisc;
2809 		bpf_attachd(d, bp);
2810 		BPFD_LOCK(d);
2811 		reset_d(d);
2812 		BPFD_UNLOCK(d);
2813 		if (opromisc) {
2814 			error = ifpromisc(bp->bif_ifp, 1);
2815 			if (error)
2816 				if_printf(bp->bif_ifp,
2817 					"bpf_setdlt: ifpromisc failed (%d)\n",
2818 					error);
2819 			else
2820 				d->bd_promisc = 1;
2821 		}
2822 	}
2823 	return (bp == NULL ? EINVAL : 0);
2824 }
2825 
2826 static void
2827 bpf_drvinit(void *unused)
2828 {
2829 	struct cdev *dev;
2830 
2831 	sx_init(&bpf_sx, "bpf global lock");
2832 	LIST_INIT(&bpf_iflist);
2833 	LIST_INIT(&bpf_freelist);
2834 
2835 	dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
2836 	/* For compatibility */
2837 	make_dev_alias(dev, "bpf0");
2838 
2839 	/* Register interface departure handler */
2840 	bpf_ifdetach_cookie = EVENTHANDLER_REGISTER(
2841 		    ifnet_departure_event, bpf_ifdetach, NULL,
2842 		    EVENTHANDLER_PRI_ANY);
2843 }
2844 
2845 /*
2846  * Zero out the various packet counters associated with all of the bpf
2847  * descriptors.  At some point, we will probably want to get a bit more
2848  * granular and allow the user to specify descriptors to be zeroed.
2849  */
2850 static void
2851 bpf_zero_counters(void)
2852 {
2853 	struct bpf_if *bp;
2854 	struct bpf_d *bd;
2855 
2856 	BPF_LOCK();
2857 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2858 		BPFIF_RLOCK(bp);
2859 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2860 			BPFD_LOCK(bd);
2861 			counter_u64_zero(bd->bd_rcount);
2862 			counter_u64_zero(bd->bd_dcount);
2863 			counter_u64_zero(bd->bd_fcount);
2864 			counter_u64_zero(bd->bd_wcount);
2865 			counter_u64_zero(bd->bd_wfcount);
2866 			counter_u64_zero(bd->bd_zcopy);
2867 			BPFD_UNLOCK(bd);
2868 		}
2869 		BPFIF_RUNLOCK(bp);
2870 	}
2871 	BPF_UNLOCK();
2872 }
2873 
2874 /*
2875  * Fill filter statistics
2876  */
2877 static void
2878 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
2879 {
2880 
2881 	bzero(d, sizeof(*d));
2882 	BPFD_LOCK_ASSERT(bd);
2883 	d->bd_structsize = sizeof(*d);
2884 	/* XXX: reading should be protected by global lock */
2885 	d->bd_immediate = bd->bd_immediate;
2886 	d->bd_promisc = bd->bd_promisc;
2887 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
2888 	d->bd_direction = bd->bd_direction;
2889 	d->bd_feedback = bd->bd_feedback;
2890 	d->bd_async = bd->bd_async;
2891 	d->bd_rcount = counter_u64_fetch(bd->bd_rcount);
2892 	d->bd_dcount = counter_u64_fetch(bd->bd_dcount);
2893 	d->bd_fcount = counter_u64_fetch(bd->bd_fcount);
2894 	d->bd_sig = bd->bd_sig;
2895 	d->bd_slen = bd->bd_slen;
2896 	d->bd_hlen = bd->bd_hlen;
2897 	d->bd_bufsize = bd->bd_bufsize;
2898 	d->bd_pid = bd->bd_pid;
2899 	strlcpy(d->bd_ifname,
2900 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
2901 	d->bd_locked = bd->bd_locked;
2902 	d->bd_wcount = counter_u64_fetch(bd->bd_wcount);
2903 	d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount);
2904 	d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount);
2905 	d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy);
2906 	d->bd_bufmode = bd->bd_bufmode;
2907 }
2908 
2909 /*
2910  * Handle `netstat -B' stats request
2911  */
2912 static int
2913 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
2914 {
2915 	static const struct xbpf_d zerostats;
2916 	struct xbpf_d *xbdbuf, *xbd, tempstats;
2917 	int index, error;
2918 	struct bpf_if *bp;
2919 	struct bpf_d *bd;
2920 
2921 	/*
2922 	 * XXX This is not technically correct. It is possible for non
2923 	 * privileged users to open bpf devices. It would make sense
2924 	 * if the users who opened the devices were able to retrieve
2925 	 * the statistics for them, too.
2926 	 */
2927 	error = priv_check(req->td, PRIV_NET_BPF);
2928 	if (error)
2929 		return (error);
2930 	/*
2931 	 * Check to see if the user is requesting that the counters be
2932 	 * zeroed out.  Explicitly check that the supplied data is zeroed,
2933 	 * as we aren't allowing the user to set the counters currently.
2934 	 */
2935 	if (req->newptr != NULL) {
2936 		if (req->newlen != sizeof(tempstats))
2937 			return (EINVAL);
2938 		memset(&tempstats, 0, sizeof(tempstats));
2939 		error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
2940 		if (error)
2941 			return (error);
2942 		if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
2943 			return (EINVAL);
2944 		bpf_zero_counters();
2945 		return (0);
2946 	}
2947 	if (req->oldptr == NULL)
2948 		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
2949 	if (bpf_bpfd_cnt == 0)
2950 		return (SYSCTL_OUT(req, 0, 0));
2951 	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
2952 	BPF_LOCK();
2953 	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
2954 		BPF_UNLOCK();
2955 		free(xbdbuf, M_BPF);
2956 		return (ENOMEM);
2957 	}
2958 	index = 0;
2959 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2960 		BPFIF_RLOCK(bp);
2961 		/* Send writers-only first */
2962 		LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
2963 			xbd = &xbdbuf[index++];
2964 			BPFD_LOCK(bd);
2965 			bpfstats_fill_xbpf(xbd, bd);
2966 			BPFD_UNLOCK(bd);
2967 		}
2968 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2969 			xbd = &xbdbuf[index++];
2970 			BPFD_LOCK(bd);
2971 			bpfstats_fill_xbpf(xbd, bd);
2972 			BPFD_UNLOCK(bd);
2973 		}
2974 		BPFIF_RUNLOCK(bp);
2975 	}
2976 	BPF_UNLOCK();
2977 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
2978 	free(xbdbuf, M_BPF);
2979 	return (error);
2980 }
2981 
2982 SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL);
2983 
2984 #else /* !DEV_BPF && !NETGRAPH_BPF */
2985 
2986 /*
2987  * NOP stubs to allow bpf-using drivers to load and function.
2988  *
2989  * A 'better' implementation would allow the core bpf functionality
2990  * to be loaded at runtime.
2991  */
2992 
2993 void
2994 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2995 {
2996 }
2997 
/* Stub used when bpf is not compiled in: the mbuf chain is ignored. */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{

	/* Nothing to do. */
}
3002 
3003 void
3004 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
3005 {
3006 }
3007 
3008 void
3009 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
3010 {
3011 
3012 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
3013 }
3014 
3015 void
3016 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
3017 {
3018 
3019 	*driverp = (struct bpf_if *)&dead_bpf_if;
3020 }
3021 
/* Stub used when bpf is not compiled in: there is nothing to detach. */
void
bpfdetach(struct ifnet *ifp)
{

	/* Nothing to do. */
}
3026 
3027 u_int
3028 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
3029 {
3030 	return -1;	/* "no filter" behaviour */
3031 }
3032 
/*
 * Stub used when bpf is not compiled in: every filter program is
 * reported as invalid (0, i.e. false).
 */
int
bpf_validate(const struct bpf_insn *f, int len)
{

	return (0);
}
3038 
3039 #endif /* !DEV_BPF && !NETGRAPH_BPF */
3040 
3041 #ifdef DDB
3042 static void
3043 bpf_show_bpf_if(struct bpf_if *bpf_if)
3044 {
3045 
3046 	if (bpf_if == NULL)
3047 		return;
3048 	db_printf("%p:\n", bpf_if);
3049 #define	BPF_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, bpf_if->e);
3050 	/* bif_ext.bif_next */
3051 	/* bif_ext.bif_dlist */
3052 	BPF_DB_PRINTF("%#x", bif_dlt);
3053 	BPF_DB_PRINTF("%u", bif_hdrlen);
3054 	BPF_DB_PRINTF("%p", bif_ifp);
3055 	/* bif_lock */
3056 	/* bif_wlist */
3057 	BPF_DB_PRINTF("%#x", bif_flags);
3058 }
3059 
3060 DB_SHOW_COMMAND(bpf_if, db_show_bpf_if)
3061 {
3062 
3063 	if (!have_addr) {
3064 		db_printf("usage: show bpf_if <struct bpf_if *>\n");
3065 		return;
3066 	}
3067 
3068 	bpf_show_bpf_if((struct bpf_if *)addr);
3069 }
3070 #endif
3071