/*
 * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Sepherosa Ziehau <sepherosa@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/serialize.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>

#include <machine/md_var.h>
#include <machine/cothread.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/bpf.h>
#include <net/if_arp.h>
#include <net/ifq_var.h>

#include <netinet/in_var.h>

#include <sys/stat.h>
#include <net/tap/if_tap.h>
#include <err.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#define VKE_DEVNAME		"vke"

#define VKE_CHUNK	8 /* number of mbufs to queue before interrupting */

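/*
 * The fifo indices increase monotonically; NETFIFOINDEX() masks an
 * index down to a ring slot.  This relies on sc_ringsize always being
 * a power of 2: with a 256-entry ring, for example, index 258 maps to
 * slot 2.
 */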
#define NETFIFOINDEX(u, sc) ((u) & ((sc)->sc_ringsize - 1))

#define VKE_COTD_RUN	0
#define VKE_COTD_EXIT	1
#define VKE_COTD_DEAD	2

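/*
 * Lock-free single-producer/single-consumer mbuf ring shared between
 * the kernel and a cothread.  Only the producer advances windex and
 * only the consumer advances rindex; one slot is always left unused
 * so that a full ring can be distinguished from an empty one.
 */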
struct vke_fifo {
	struct mbuf	**array;
	int		rindex;
	int		windex;
};
typedef struct vke_fifo *fifo_t;

/* Default ring size; must be a power of 2 */
#define VKE_DEFAULT_RINGSIZE	256
static int vke_max_ringsize = 0;
TUNABLE_INT("hw.vke.max_ringsize", &vke_max_ringsize);

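/*
 * Round n down to the nearest power of 2 using fls(), e.g.
 * LOW_POW_2(100) == 64 and LOW_POW_2(256) == 256.
 */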
#define LOW_POW_2(n)	(1 << (fls(n) - 1))

struct vke_softc {
	struct arpcom		arpcom;
	int			sc_fd;
	int			sc_unit;

	cothread_t		cotd_tx;
	cothread_t		cotd_rx;

	int			cotd_tx_exit;
	int			cotd_rx_exit;

	void			*sc_txbuf;
	int			sc_txbuf_len;

	fifo_t			sc_txfifo;
	fifo_t			sc_txfifo_done;
	fifo_t			sc_rxfifo;

	int			sc_ringsize;

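	/*
	 * Statistics accumulated by the cothreads; folded into the
	 * ifnet counters by the interrupt handlers, since a cothread
	 * has no per-cpu globaldata and cannot update them directly.
	 */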
	long			cotd_ipackets;
	long			cotd_oerrors;
	long			cotd_opackets;

	struct sysctl_ctx_list	sc_sysctl_ctx;
	struct sysctl_oid	*sc_sysctl_tree;

	int			sc_tap_unit;	/* unit of backend tap(4) */
	in_addr_t		sc_addr;	/* address */
	in_addr_t		sc_mask;	/* netmask */
};

static void	vke_start(struct ifnet *, struct ifaltq_subque *);
static void	vke_init(void *);
static int	vke_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *);

static int	vke_attach(const struct vknetif_info *, int);
static int	vke_stop(struct vke_softc *);
static int	vke_init_addr(struct ifnet *, in_addr_t, in_addr_t);
static void	vke_tx_intr(cothread_t cotd);
static void	vke_tx_thread(cothread_t cotd);
static void	vke_rx_intr(cothread_t cotd);
static void	vke_rx_thread(cothread_t cotd);

static int vke_txfifo_enqueue(struct vke_softc *sc, struct mbuf *m);
static struct mbuf *vke_txfifo_dequeue(struct vke_softc *sc);

static int vke_txfifo_done_enqueue(struct vke_softc *sc, struct mbuf *m);
static struct mbuf *vke_txfifo_done_dequeue(struct vke_softc *sc, struct mbuf *nm);

static struct mbuf *vke_rxfifo_dequeue(struct vke_softc *sc, struct mbuf *nm);
static struct mbuf *vke_rxfifo_sniff(struct vke_softc *sc);

static void
vke_sysinit(void *arg __unused)
{
	int i, unit;

	KASSERT(NetifNum <= VKNETIF_MAX, ("too many netifs: %d", NetifNum));

	unit = 0;
	for (i = 0; i < NetifNum; ++i) {
		if (vke_attach(&NetifInfo[i], unit) == 0)
			++unit;
	}
}
SYSINIT(vke, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, vke_sysinit, NULL);

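/*
 * Memory-ordering protocol shared by all the fifo routines below: the
 * producer stores the mbuf pointer into the ring, issues cpu_sfence(),
 * and only then advances windex; the consumer reads the slot, issues
 * cpu_lfence(), and then advances rindex.  The paired fences ensure a
 * consumer never observes the new windex before the pointer store it
 * publishes.
 */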
/*
 * vke_txfifo_done_enqueue() - Add an mbuf to the transmit done fifo.
 * Since the cothread cannot free transmit mbufs after processing, we
 * put them on the done fifo so the kernel can free them.
 */
static int
vke_txfifo_done_enqueue(struct vke_softc *sc, struct mbuf *m)
{
	fifo_t fifo = sc->sc_txfifo_done;

	while (NETFIFOINDEX(fifo->windex + 1, sc) == NETFIFOINDEX(fifo->rindex, sc)) {
		usleep(20000);
	}

	fifo->array[NETFIFOINDEX(fifo->windex, sc)] = m;
	cpu_sfence();
	++fifo->windex;
	return (0);
}

/*
 * vke_txfifo_done_dequeue() - Remove an mbuf from the transmit done fifo.
 */
static struct mbuf *
vke_txfifo_done_dequeue(struct vke_softc *sc, struct mbuf *nm)
{
	fifo_t fifo = sc->sc_txfifo_done;
	struct mbuf *m;

	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
		return (NULL);

	m = fifo->array[NETFIFOINDEX(fifo->rindex, sc)];
	fifo->array[NETFIFOINDEX(fifo->rindex, sc)] = nm;
	cpu_lfence();
	++fifo->rindex;
	return (m);
}

/*
 * vke_txfifo_enqueue() - Add an mbuf to the transmit fifo.
 */
static int
vke_txfifo_enqueue(struct vke_softc *sc, struct mbuf *m)
{
	fifo_t fifo = sc->sc_txfifo;

	if (NETFIFOINDEX(fifo->windex + 1, sc) == NETFIFOINDEX(fifo->rindex, sc))
		return (-1);

	fifo->array[NETFIFOINDEX(fifo->windex, sc)] = m;
	cpu_sfence();
	++fifo->windex;

	return (0);
}

/*
 * vke_txfifo_dequeue() - Return next mbuf on the transmit fifo if one
 * exists.
 */
static struct mbuf *
vke_txfifo_dequeue(struct vke_softc *sc)
{
	fifo_t fifo = sc->sc_txfifo;
	struct mbuf *m;

	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
		return (NULL);

	m = fifo->array[NETFIFOINDEX(fifo->rindex, sc)];
	fifo->array[NETFIFOINDEX(fifo->rindex, sc)] = NULL;

	cpu_lfence();
	++fifo->rindex;
	return (m);
}

static int
vke_txfifo_empty(struct vke_softc *sc)
{
	fifo_t fifo = sc->sc_txfifo;

	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
		return (1);
	return (0);
}

/*
 * vke_rxfifo_dequeue() - Return the next mbuf on the receive fifo if one
 * exists, replacing it with newm, which should point to a newly allocated
 * mbuf.
 */
static struct mbuf *
vke_rxfifo_dequeue(struct vke_softc *sc, struct mbuf *newm)
{
	fifo_t fifo = sc->sc_rxfifo;
	struct mbuf *m;

	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
		return (NULL);

	m = fifo->array[NETFIFOINDEX(fifo->rindex, sc)];
	fifo->array[NETFIFOINDEX(fifo->rindex, sc)] = newm;
	cpu_lfence();
	++fifo->rindex;
	return (m);
}

/*
 * Return the next mbuf if available but do NOT remove it from the FIFO.
 */
static struct mbuf *
vke_rxfifo_sniff(struct vke_softc *sc)
{
	fifo_t fifo = sc->sc_rxfifo;
	struct mbuf *m;

	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
		return (NULL);

	m = fifo->array[NETFIFOINDEX(fifo->rindex, sc)];
	cpu_lfence();
	return (m);
}

static void
vke_init(void *xsc)
{
	struct vke_softc *sc = xsc;
	struct ifnet *ifp = &sc->arpcom.ac_if;
	size_t ringsize = sc->sc_ringsize * sizeof(struct mbuf *);
	int i;

	ASSERT_SERIALIZED(ifp->if_serializer);

	vke_stop(sc);

	ifp->if_flags |= IFF_RUNNING;
	ifsq_clr_oactive(ifq_get_subq_default(&ifp->if_snd));

	/*
	 * Allocate memory for FIFO structures and mbufs.
	 */
	sc->sc_txfifo = kmalloc(sizeof(*sc->sc_txfifo),
	    M_DEVBUF, M_WAITOK | M_ZERO);
	sc->sc_txfifo_done = kmalloc(sizeof(*sc->sc_txfifo_done),
	    M_DEVBUF, M_WAITOK | M_ZERO);
	sc->sc_rxfifo = kmalloc(sizeof(*sc->sc_rxfifo),
	    M_DEVBUF, M_WAITOK | M_ZERO);
	sc->sc_txfifo->array = kmalloc(ringsize, M_DEVBUF, M_WAITOK | M_ZERO);
	sc->sc_txfifo_done->array = kmalloc(ringsize, M_DEVBUF, M_WAITOK | M_ZERO);
	sc->sc_rxfifo->array = kmalloc(ringsize, M_DEVBUF, M_WAITOK | M_ZERO);

	for (i = 0; i < sc->sc_ringsize; i++) {
		sc->sc_rxfifo->array[i] = m_getcl(MB_WAIT, MT_DATA, M_PKTHDR);
		sc->sc_txfifo->array[i] = NULL;
		sc->sc_txfifo_done->array[i] = NULL;
	}
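
	/*
	 * The RX ring starts fully populated with empty clusters so the
	 * receive cothread can read() into them immediately; the RX
	 * interrupt path later swaps a fresh mbuf into each slot it
	 * hands to the stack.
	 */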

	sc->cotd_tx_exit = sc->cotd_rx_exit = VKE_COTD_RUN;
	sc->cotd_tx = cothread_create(vke_tx_thread, vke_tx_intr, sc, "vke_tx");
	sc->cotd_rx = cothread_create(vke_rx_thread, vke_rx_intr, sc, "vke_rx");

	if (sc->sc_addr != 0) {
		in_addr_t addr, mask;

		addr = sc->sc_addr;
		mask = sc->sc_mask;

		/*
		 * Make sure the vkernel-assigned address will not be
		 * added again.
		 */
		sc->sc_addr = 0;
		sc->sc_mask = 0;

		vke_init_addr(ifp, addr, mask);
	}
}

/*
 * Called from kernel.
 *
 * NOTE: We can't make any kernel callbacks while holding cothread lock
 *	 because the cothread lock is not governed by the kernel scheduler
 *	 (so mplock, tokens, etc will not be released).
 */
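/*
 * Packets are moved onto the tx fifo and the transmit cothread is
 * signalled once per VKE_CHUNK packets to amortize the cothread
 * lock/signal overhead; a final signal flushes any partial batch.
 */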
static void
vke_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
{
	struct vke_softc *sc = ifp->if_softc;
	struct mbuf *m;
	cothread_t cotd = sc->cotd_tx;
	int count;

	ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
	ASSERT_SERIALIZED(ifp->if_serializer);

	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
		return;

	count = 0;
	while ((m = ifsq_dequeue(ifsq)) != NULL) {
		if (vke_txfifo_enqueue(sc, m) != -1) {
			if (count++ == VKE_CHUNK) {
				cothread_lock(cotd, 0);
				cothread_signal(cotd);
				cothread_unlock(cotd, 0);
				count = 0;
			}
		} else {
			m_freem(m);
		}
	}
	if (count) {
		cothread_lock(cotd, 0);
		cothread_signal(cotd);
		cothread_unlock(cotd, 0);
	}
}

static int
vke_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data, struct ucred *cr)
{
	struct vke_softc *sc = ifp->if_softc;
	int error = 0;

	ASSERT_SERIALIZED(ifp->if_serializer);

	switch (cmd) {
	case SIOCSIFFLAGS:
		if (ifp->if_flags & IFF_UP) {
			if ((ifp->if_flags & IFF_RUNNING) == 0)
				vke_init(sc);
		} else {
			if (ifp->if_flags & IFF_RUNNING)
				vke_stop(sc);
		}
		break;
	case SIOCGIFMEDIA:
	case SIOCSIFMEDIA:
		error = EOPNOTSUPP;
		/* TODO */
		break;
	case SIOCGIFSTATUS: {
		struct ifstat *ifs = (struct ifstat *)data;
		int len;

		len = strlen(ifs->ascii);
		if (len < sizeof(ifs->ascii)) {
			if (sc->sc_tap_unit >= 0)
				ksnprintf(ifs->ascii + len, sizeof(ifs->ascii) - len,
				    "\tBacked by tap%d\n", sc->sc_tap_unit);
		}
		break;
	}
	case SIOCSIFADDR:
		if (((struct ifaddr *)data)->ifa_addr->sa_family == AF_INET) {
			/*
			 * If we are explicitly requested to change the
			 * address, we should invalidate the address/netmask
			 * passed in from the vkernel command line.
			 */
			sc->sc_addr = 0;
			sc->sc_mask = 0;
		}
		/* FALL THROUGH */
	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}
	return error;
}

static int
vke_stop(struct vke_softc *sc)
{
	struct ifnet *ifp = &sc->arpcom.ac_if;
	int i;

	ASSERT_SERIALIZED(ifp->if_serializer);

	ifp->if_flags &= ~IFF_RUNNING;
	ifsq_clr_oactive(ifq_get_subq_default(&ifp->if_snd));

	if (sc) {
		if (sc->cotd_tx) {
			cothread_lock(sc->cotd_tx, 0);
			if (sc->cotd_tx_exit == VKE_COTD_RUN)
				sc->cotd_tx_exit = VKE_COTD_EXIT;
			cothread_signal(sc->cotd_tx);
			cothread_unlock(sc->cotd_tx, 0);
			cothread_delete(&sc->cotd_tx);
		}
		if (sc->cotd_rx) {
			cothread_lock(sc->cotd_rx, 0);
			if (sc->cotd_rx_exit == VKE_COTD_RUN)
				sc->cotd_rx_exit = VKE_COTD_EXIT;
			cothread_signal(sc->cotd_rx);
			cothread_unlock(sc->cotd_rx, 0);
			cothread_delete(&sc->cotd_rx);
		}

		for (i = 0; i < sc->sc_ringsize; i++) {
			if (sc->sc_rxfifo && sc->sc_rxfifo->array[i]) {
				m_freem(sc->sc_rxfifo->array[i]);
				sc->sc_rxfifo->array[i] = NULL;
			}
			if (sc->sc_txfifo && sc->sc_txfifo->array[i]) {
				m_freem(sc->sc_txfifo->array[i]);
				sc->sc_txfifo->array[i] = NULL;
			}
			if (sc->sc_txfifo_done && sc->sc_txfifo_done->array[i]) {
				m_freem(sc->sc_txfifo_done->array[i]);
				sc->sc_txfifo_done->array[i] = NULL;
			}
		}

		if (sc->sc_txfifo) {
			if (sc->sc_txfifo->array)
				kfree(sc->sc_txfifo->array, M_DEVBUF);
			kfree(sc->sc_txfifo, M_DEVBUF);
			sc->sc_txfifo = NULL;
		}

		if (sc->sc_txfifo_done) {
			if (sc->sc_txfifo_done->array)
				kfree(sc->sc_txfifo_done->array, M_DEVBUF);
			kfree(sc->sc_txfifo_done, M_DEVBUF);
			sc->sc_txfifo_done = NULL;
		}

		if (sc->sc_rxfifo) {
			if (sc->sc_rxfifo->array)
				kfree(sc->sc_rxfifo->array, M_DEVBUF);
			kfree(sc->sc_rxfifo, M_DEVBUF);
			sc->sc_rxfifo = NULL;
		}
	}

	return 0;
}

/*
 * vke_rx_intr() is the interrupt function for the receive cothread.
 */
static void
vke_rx_intr(cothread_t cotd)
{
	struct mbuf *m;
	struct mbuf *nm;
	struct vke_softc *sc = cotd->arg;
	struct ifnet *ifp = &sc->arpcom.ac_if;
	static int count = 0;

	ifnet_serialize_all(ifp);
	cothread_lock(cotd, 0);

	if (sc->cotd_rx_exit != VKE_COTD_RUN) {
		cothread_unlock(cotd, 0);
		ifnet_deserialize_all(ifp);
		return;
	}
	if (sc->cotd_ipackets) {
		IFNET_STAT_INC(ifp, ipackets, 1);
		sc->cotd_ipackets = 0;
	}
	cothread_unlock(cotd, 0);

	while ((m = vke_rxfifo_sniff(sc)) != NULL) {
		nm = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
		if (nm) {
			vke_rxfifo_dequeue(sc, nm);
			ifp->if_input(ifp, m, NULL, -1);
			if (count++ == VKE_CHUNK) {
				cothread_lock(cotd, 0);
				cothread_signal(cotd);
				cothread_unlock(cotd, 0);
				count = 0;
			}
		} else {
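			/*
			 * Cluster allocation failed: recycle the same
			 * mbuf back into the ring and drop this packet.
			 */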
			vke_rxfifo_dequeue(sc, m);
		}
	}

	if (count) {
		cothread_lock(cotd, 0);
		cothread_signal(cotd);
		cothread_unlock(cotd, 0);
	}
	ifnet_deserialize_all(ifp);
}

/*
 * vke_tx_intr() is the interrupt function for the transmit cothread.
 * It calls vke_start() (via if_devstart()) to process further
 * transmit mbufs.
 */
static void
vke_tx_intr(cothread_t cotd)
{
	struct vke_softc *sc = cotd->arg;
	struct ifnet *ifp = &sc->arpcom.ac_if;
	struct mbuf *m;

	ifnet_serialize_all(ifp);
	cothread_lock(cotd, 0);
	if (sc->cotd_tx_exit != VKE_COTD_RUN) {
		cothread_unlock(cotd, 0);
		ifnet_deserialize_all(ifp);
		return;
	}
	if (sc->cotd_opackets) {
		IFNET_STAT_INC(ifp, opackets, 1);
		sc->cotd_opackets = 0;
	}
	if (sc->cotd_oerrors) {
		IFNET_STAT_INC(ifp, oerrors, 1);
		sc->cotd_oerrors = 0;
	}
	cothread_unlock(cotd, 0);

	/*
	 * Free TX mbufs that have already been processed before starting
	 * new ones, to be pipeline friendly.
	 */
	while ((m = vke_txfifo_done_dequeue(sc, NULL)) != NULL) {
		m_freem(m);
	}

	if ((ifp->if_flags & IFF_RUNNING) == 0)
		if_devstart(ifp);

	ifnet_deserialize_all(ifp);
}

/*
 * vke_rx_thread() is the body of the receive cothread.
 *
 * WARNING!  THIS IS A COTHREAD WHICH HAS NO PER-CPU GLOBALDATA!!!!!
 */
static void
vke_rx_thread(cothread_t cotd)
{
	struct mbuf *m;
	struct vke_softc *sc = cotd->arg;
	struct ifnet *ifp = &sc->arpcom.ac_if;
	fifo_t fifo = sc->sc_rxfifo;
	fd_set fdset;
	struct timeval tv;
	int count;
	int n;

	/*
	 * The select timeout cannot be infinite since we need to check
	 * for the exit flag sc->cotd_rx_exit.
	 */
	tv.tv_sec = 0;
	tv.tv_usec = 500000;

	FD_ZERO(&fdset);
	count = 0;

	while (sc->cotd_rx_exit == VKE_COTD_RUN) {
		/*
		 * Wait for the RX FIFO to be loaded with
		 * empty mbufs.
		 */
		if (NETFIFOINDEX(fifo->windex + 1, sc) ==
		    NETFIFOINDEX(fifo->rindex, sc)) {
			usleep(20000);
			continue;
		}

		/*
		 * Load data into the rx fifo
		 */
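		/*
		 * Each read() from the tap returns at most one frame,
		 * placed into the empty cluster pre-loaded at windex.
		 */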
		m = fifo->array[NETFIFOINDEX(fifo->windex, sc)];
		if (m == NULL)
			continue;
		n = read(sc->sc_fd, mtod(m, void *), MCLBYTES);
		if (n > 0) {
			/* no mycpu in cothread */
			/*IFNET_STAT_INC(ifp, ipackets, 1);*/
			++sc->cotd_ipackets;
			m->m_pkthdr.rcvif = ifp;
			m->m_pkthdr.len = m->m_len = n;
			cpu_sfence();
			++fifo->windex;
			if (count++ == VKE_CHUNK) {
				cothread_intr(cotd);
				count = 0;
			}
		} else {
			if (count) {
				cothread_intr(cotd);
				count = 0;
			}
			FD_SET(sc->sc_fd, &fdset);

			if (select(sc->sc_fd + 1, &fdset, NULL, NULL, &tv) == -1) {
				fprintf(stderr,
					VKE_DEVNAME "%d: select failed for "
					"TAP device\n", sc->sc_unit);
				usleep(1000000);
			}
		}
	}
	cpu_sfence();
	sc->cotd_rx_exit = VKE_COTD_DEAD;
}

/*
 * vke_tx_thread() is the body of the transmit cothread.
 *
 * WARNING!  THIS IS A COTHREAD WHICH HAS NO PER-CPU GLOBALDATA!!!!!
 */
static void
vke_tx_thread(cothread_t cotd)
{
	struct mbuf *m;
	struct vke_softc *sc = cotd->arg;
	/*struct ifnet *ifp = &sc->arpcom.ac_if;*/
	int count = 0;

	while (sc->cotd_tx_exit == VKE_COTD_RUN) {
		/*
		 * Write outgoing packets to the TAP interface
		 */
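		/*
		 * Frames are copied into the private sc_txbuf because a
		 * cothread must not free or otherwise manipulate mbufs;
		 * the mbuf itself is returned to the kernel through the
		 * done fifo.  Frames larger than MCLBYTES are silently
		 * dropped, but still recycled through the done fifo.
		 */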
		m = vke_txfifo_dequeue(sc);
		if (m) {
			if (m->m_pkthdr.len <= MCLBYTES) {
				m_copydata(m, 0, m->m_pkthdr.len, sc->sc_txbuf);
				sc->sc_txbuf_len = m->m_pkthdr.len;

				if (write(sc->sc_fd, sc->sc_txbuf,
					  sc->sc_txbuf_len) < 0) {
					/* no mycpu in cothread */
					/*IFNET_STAT_INC(ifp, oerrors, 1);*/
					++sc->cotd_oerrors;
				} else {
					/* no mycpu in cothread */
					/*IFNET_STAT_INC(ifp, opackets, 1);*/
					++sc->cotd_opackets;
				}
			}
			if (count++ == VKE_CHUNK) {
				cothread_intr(cotd);
				count = 0;
			}
			vke_txfifo_done_enqueue(sc, m);
		} else {
			if (count) {
				cothread_intr(cotd);
				count = 0;
			}
			cothread_lock(cotd, 1);
			if (vke_txfifo_empty(sc))
				cothread_wait(cotd);
			cothread_unlock(cotd, 1);
		}
	}
	cpu_sfence();
	sc->cotd_tx_exit = VKE_COTD_DEAD;
}

static int
vke_attach(const struct vknetif_info *info, int unit)
{
	struct vke_softc *sc;
	struct ifnet *ifp;
	struct tapinfo tapinfo;
	uint8_t enaddr[ETHER_ADDR_LEN];
	int nmbufs;
	int fd;

	KKASSERT(info->tap_fd >= 0);
	fd = info->tap_fd;

	/*
	 * Pre-set a sane MTU/baudrate; tapinfo is otherwise left
	 * uninitialized when a MAC is supplied or there is no tap(4)
	 * backend, yet it is copied into the ifnet below.
	 */
	bzero(&tapinfo, sizeof(tapinfo));
	tapinfo.mtu = ETHERMTU;

	if (info->enaddr) {
		/*
		 * enaddr is supplied
		 */
		bcopy(info->enaddr, enaddr, ETHER_ADDR_LEN);
	} else {
		/*
		 * This is only a TAP device if tap_unit is non-negative.
		 * If connecting to a virtual socket we generate a unique
		 * MAC.
		 *
		 * WARNING: enaddr[0] bit 0 is the multicast bit; when
		 *	    randomizing enaddr[] just leave the first
		 *	    two bytes 00 00 for now.
		 */
		bzero(enaddr, sizeof(enaddr));
		if (info->tap_unit >= 0) {
			if (ioctl(fd, TAPGIFINFO, &tapinfo) < 0) {
				kprintf(VKE_DEVNAME "%d: ioctl(TAPGIFINFO) "
					"failed: %s\n", unit, strerror(errno));
				return ENXIO;
			}

			if (ioctl(fd, SIOCGIFADDR, enaddr) < 0) {
				kprintf(VKE_DEVNAME "%d: ioctl(SIOCGIFADDR) "
					"failed: %s\n", unit, strerror(errno));
				return ENXIO;
			}
		} else {
			int fd = open("/dev/urandom", O_RDONLY);
			if (fd >= 0) {
				read(fd, enaddr + 2, 4);
				close(fd);
			}
			enaddr[4] = (int)getpid() >> 8;
			enaddr[5] = (int)getpid() & 255;
		}
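		/*
		 * Bump the second byte so the MAC differs from the
		 * backing tap(4)'s own address; enaddr[0] stays 0,
		 * keeping the multicast bit clear.
		 */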
		enaddr[1] += 1;
	}
	if (ETHER_IS_MULTICAST(enaddr)) {
		kprintf(VKE_DEVNAME "%d: illegal MULTICAST ether mac!\n", unit);
		return ENXIO;
	}

	sc = kmalloc(sizeof(*sc), M_DEVBUF, M_WAITOK | M_ZERO);

	sc->sc_txbuf = kmalloc(MCLBYTES, M_DEVBUF, M_WAITOK);
	sc->sc_fd = fd;
	sc->sc_unit = unit;
	sc->sc_tap_unit = info->tap_unit;
	sc->sc_addr = info->netif_addr;
	sc->sc_mask = info->netif_mask;

	if (vke_max_ringsize == 0) {
		nmbufs = nmbclusters / (NetifNum * 2);
		sc->sc_ringsize = LOW_POW_2(nmbufs);
		if (sc->sc_ringsize > VKE_DEFAULT_RINGSIZE)
			sc->sc_ringsize = VKE_DEFAULT_RINGSIZE;
	} else if (vke_max_ringsize >= VKE_CHUNK) {	/* Tunable specified */
		sc->sc_ringsize = LOW_POW_2(vke_max_ringsize);
	} else {
		sc->sc_ringsize = LOW_POW_2(VKE_CHUNK);
	}
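
	/*
	 * Example of the sizing above: with nmbclusters = 4096 and two
	 * interfaces, 4096 / (2 * 2) = 1024 mbufs, rounded down by
	 * LOW_POW_2() and capped at VKE_DEFAULT_RINGSIZE (256).  The
	 * hw.vke.max_ringsize tunable overrides this, but is clamped
	 * to at least VKE_CHUNK and rounded down to a power of 2 so
	 * that NETFIFOINDEX() masking stays valid.
	 */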

	ifp = &sc->arpcom.ac_if;
	if_initname(ifp, VKE_DEVNAME, sc->sc_unit);

	/* NB: after if_initname() */
	sysctl_ctx_init(&sc->sc_sysctl_ctx);
	sc->sc_sysctl_tree = SYSCTL_ADD_NODE(&sc->sc_sysctl_ctx,
					     SYSCTL_STATIC_CHILDREN(_hw),
					     OID_AUTO, ifp->if_xname,
					     CTLFLAG_RD, 0, "");
	if (sc->sc_sysctl_tree == NULL) {
		kprintf(VKE_DEVNAME "%d: can't add sysctl node\n", unit);
	} else {
		SYSCTL_ADD_INT(&sc->sc_sysctl_ctx,
			       SYSCTL_CHILDREN(sc->sc_sysctl_tree),
			       OID_AUTO, "tap_unit",
			       CTLFLAG_RD, &sc->sc_tap_unit, 0,
			       "Backend tap(4) unit");
	}

	ifp->if_softc = sc;
	ifp->if_ioctl = vke_ioctl;
	ifp->if_start = vke_start;
	ifp->if_init = vke_init;
	ifp->if_mtu = tapinfo.mtu;
	ifp->if_baudrate = tapinfo.baudrate;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifq_set_maxlen(&ifp->if_snd, IFQ_MAXLEN);
	ifq_set_ready(&ifp->if_snd);

	/* TODO: if_media */

	ether_ifattach(ifp, enaddr, NULL);

	if (bootverbose && sc->sc_addr != 0) {
		if_printf(ifp, "pre-configured "
		    "address 0x%08x, netmask 0x%08x, %d mbuf clusters\n",
		    ntohl(sc->sc_addr), ntohl(sc->sc_mask), sc->sc_ringsize);
	}

	return 0;
}

static int
vke_init_addr(struct ifnet *ifp, in_addr_t addr, in_addr_t mask)
{
	struct ifaliasreq ifra;
	struct sockaddr_in *sin;
	int ret;

	ASSERT_SERIALIZED(ifp->if_serializer);

	if (bootverbose) {
		if_printf(ifp, "add pre-configured "
			  "address 0x%08x, netmask 0x%08x\n",
			  ntohl(addr), ntohl(mask));
	}

	bzero(&ifra, sizeof(ifra));

	/* NB: no need to set ifaliasreq.ifra_name */

	sin = (struct sockaddr_in *)&ifra.ifra_addr;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr.s_addr = addr;

	if (mask != 0) {
		sin = (struct sockaddr_in *)&ifra.ifra_mask;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr.s_addr = mask;
	}

	/*
	 * Temporarily release the serializer; in_control() will hold
	 * it again before calling ifnet.if_ioctl().
	 */
	ifnet_deserialize_all(ifp);
	ret = in_control(NULL, SIOCAIFADDR, (caddr_t)&ifra, ifp, NULL);
	ifnet_serialize_all(ifp);

	return ret;
}