xref: /dragonfly/sys/dev/virtual/vkernel/net/if_vke.c (revision 78478697)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Sepherosa Ziehau <sepherosa@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/param.h>
36 #include <sys/endian.h>
37 #include <sys/kernel.h>
38 #include <sys/malloc.h>
39 #include <sys/proc.h>
40 #include <sys/serialize.h>
41 #include <sys/socket.h>
42 #include <sys/sockio.h>
43 #include <sys/sysctl.h>
44 
45 #include <machine/md_var.h>
46 #include <machine/cothread.h>
47 
48 #include <net/ethernet.h>
49 #include <net/if.h>
50 #include <net/bpf.h>
51 #include <net/if_arp.h>
52 #include <net/if_media.h>
53 #include <net/ifq_var.h>
54 #include <net/vlan/if_vlan_ether.h>
55 
56 #include <netinet/in_var.h>
57 
58 #include <sys/stat.h>
59 #include <net/tap/if_tap.h>
60 #include <err.h>
61 #include <errno.h>
62 #include <stdio.h>
63 #include <string.h>
64 #include <unistd.h>
65 #include <fcntl.h>
66 
67 #define VKE_DEVNAME		"vke"
68 
69 #define VKE_CHUNK	8 /* number of mbufs to queue before interrupting */
70 
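/*
 * Convert a monotonically increasing FIFO index into an array slot.
 * This relies on sc_ringsize being a power of 2 so the (size - 1) mask
 * works; e.g. with a 256-entry ring, index 256 wraps back to slot 0.
 * vke_attach() forces that invariant with LOW_POW_2().
 */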
71 #define NETFIFOINDEX(u, sc) ((u) & ((sc)->sc_ringsize - 1))
72 
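/*
 * Cothread shutdown handshake: vke_stop() moves cotd_*_exit from RUN
 * to EXIT and signals the cothread; the cothread marks itself DEAD on
 * the way out, and the intr callbacks ignore anything not in RUN.
 */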
73 #define VKE_COTD_RUN	0
74 #define VKE_COTD_EXIT	1
75 #define VKE_COTD_DEAD	2
76 
77 struct vke_fifo {
78 	struct mbuf	**array;
79 	int		rindex;
80 	int		windex;
81 };
82 typedef struct vke_fifo *fifo_t;
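/*
 * Each vke_fifo is a lock-free single-producer/single-consumer ring:
 * only the producer advances windex and only the consumer advances
 * rindex.  The producer issues cpu_sfence() after filling a slot so
 * the consumer never sees a stale mbuf pointer, and one slot is left
 * unused so a full ring can be told apart from an empty one.
 */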
83 
84 /* Long-standing default ring size */
85 #define VKE_DEFAULT_RINGSIZE	256
86 static int vke_max_ringsize = 0;
87 TUNABLE_INT("hw.vke.max_ringsize", &vke_max_ringsize);
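/*
 * The ring size can be capped from the kernel environment.  An
 * illustrative (not recommended) setting:
 *
 *	hw.vke.max_ringsize=128
 *
 * Values below VKE_CHUNK are raised to VKE_CHUNK; see vke_attach().
 */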
88 
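/*
 * Round n down to the largest power of 2 <= n; fls() returns the
 * index of the highest set bit.
 */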
89 #define LOW_POW_2(n)	(1 << (fls(n) - 1))
90 
91 struct vke_softc {
92 	struct arpcom		arpcom;
93 	int			sc_fd;
94 	int			sc_unit;
95 
96 	cothread_t		cotd_tx;
97 	cothread_t		cotd_rx;
98 
99 	int			cotd_tx_exit;
100 	int			cotd_rx_exit;
101 
102 	void			*sc_txbuf;
103 	int			sc_txbuf_len;
104 
105 	fifo_t			sc_txfifo;
106 	fifo_t			sc_txfifo_done;
107 	fifo_t			sc_rxfifo;
108 
109 	int			sc_ringsize;
110 
111 	long			cotd_ipackets;
112 	long			cotd_oerrors;
113 	long			cotd_opackets;
114 
115 	struct sysctl_ctx_list	sc_sysctl_ctx;
116 	struct sysctl_oid	*sc_sysctl_tree;
117 
118 	int			sc_tap_unit;	/* unit of backend tap(4) */
119 	in_addr_t		sc_addr;	/* address */
120 	in_addr_t		sc_mask;	/* netmask */
121 
122 	struct ifmedia		sc_media;
123 };
124 
125 static void	vke_start(struct ifnet *, struct ifaltq_subque *);
126 static void	vke_init(void *);
127 static int	vke_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
128 
129 static int	vke_media_change(struct ifnet *);
130 static void	vke_media_status(struct ifnet *, struct ifmediareq *);
131 
132 static int	vke_attach(const struct vknetif_info *, int);
133 static int	vke_stop(struct vke_softc *);
134 static int	vke_init_addr(struct ifnet *, in_addr_t, in_addr_t);
135 static void	vke_tx_intr(cothread_t cotd);
136 static void	vke_tx_thread(cothread_t cotd);
137 static void	vke_rx_intr(cothread_t cotd);
138 static void	vke_rx_thread(cothread_t cotd);
139 
140 static int vke_txfifo_enqueue(struct vke_softc *sc, struct mbuf *m);
141 static struct mbuf *vke_txfifo_dequeue(struct vke_softc *sc);
142 
143 static int vke_txfifo_done_enqueue(struct vke_softc *sc, struct mbuf *m);
144 static struct mbuf * vke_txfifo_done_dequeue(struct vke_softc *sc, struct mbuf *nm);
145 
146 static struct mbuf *vke_rxfifo_dequeue(struct vke_softc *sc, struct mbuf *nm);
147 static struct mbuf *vke_rxfifo_sniff(struct vke_softc *sc);
148 
149 static void
150 vke_sysinit(void *arg __unused)
151 {
152 	int i, unit;
153 
154 	KASSERT(NetifNum <= VKNETIF_MAX, ("too many netifs: %d", NetifNum));
155 
156 	unit = 0;
157 	for (i = 0; i < NetifNum; ++i) {
158 		if (vke_attach(&NetifInfo[i], unit) == 0)
159 			++unit;
160 	}
161 }
162 SYSINIT(vke, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, vke_sysinit, NULL);
163 
164 /*
165  * vke_txfifo_done_enqueue() - Add an mbuf to the transmit done fifo.  Since
166  * the cothread cannot free transmit mbufs after processing we put them on
167  * the done fifo so the kernel can free them.
168  */
169 static int
170 vke_txfifo_done_enqueue(struct vke_softc *sc, struct mbuf *m)
171 {
172 	fifo_t fifo = sc->sc_txfifo_done;
173 
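	/*
	 * If the done fifo is full, spin politely until the kernel side
	 * drains it in vke_tx_intr().
	 */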
174 	while (NETFIFOINDEX(fifo->windex + 1, sc) == NETFIFOINDEX(fifo->rindex, sc)) {
175 		usleep(20000);
176 	}
177 
178 	fifo->array[NETFIFOINDEX(fifo->windex, sc)] = m;
179 	cpu_sfence();
180 	++fifo->windex;
181 	return (0);
182 }
183 
184 /*
185  * vke_txfifo_done_dequeue() - Remove an mbuf from the transmit done fifo.
186  */
187 static struct mbuf *
188 vke_txfifo_done_dequeue(struct vke_softc *sc, struct mbuf *nm)
189 {
190 	fifo_t fifo = sc->sc_txfifo_done;
191 	struct mbuf *m;
192 
193 	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
194 		return (NULL);
195 
196 	m = fifo->array[NETFIFOINDEX(fifo->rindex, sc)];
197 	fifo->array[NETFIFOINDEX(fifo->rindex, sc)] = nm;
198 	cpu_lfence();
199 	++fifo->rindex;
200 	return (m);
201 }
202 
203 /*
204  * vke_txfifo_enqueue() - Add an mbuf to the transmit fifo.
205  */
206 static int
207 vke_txfifo_enqueue(struct vke_softc *sc, struct mbuf *m)
208 {
209 	fifo_t fifo = sc->sc_txfifo;
210 
211 	if (NETFIFOINDEX(fifo->windex + 1, sc) == NETFIFOINDEX(fifo->rindex, sc))
212 		return (-1);
213 
214 	fifo->array[NETFIFOINDEX(fifo->windex, sc)] = m;
215 	cpu_sfence();
216 	++fifo->windex;
217 
218 	return (0);
219 }
220 
221 /*
222  * vke_txfifo_dequeue() - Return next mbuf on the transmit fifo if one
223  * exists.
224  */
225 static struct mbuf *
226 vke_txfifo_dequeue(struct vke_softc *sc)
227 {
228 	fifo_t fifo = sc->sc_txfifo;
229 	struct mbuf *m;
230 
231 	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
232 		return (NULL);
233 
234 	m = fifo->array[NETFIFOINDEX(fifo->rindex, sc)];
235 	fifo->array[NETFIFOINDEX(fifo->rindex, sc)] = NULL;
236 
237 	cpu_lfence();
238 	++fifo->rindex;
239 	return (m);
240 }
241 
242 static int
243 vke_txfifo_empty(struct vke_softc *sc)
244 {
245 	fifo_t fifo = sc->sc_txfifo;
246 
247 	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
248 		return (1);
249 	return(0);
250 }
251 
252 /*
253  * vke_rxfifo_dequeue() - Return the next mbuf on the receive fifo if
254  * one exists, replacing it with newm, which should point to a newly
255  * allocated mbuf.
256  */
257 static struct mbuf *
258 vke_rxfifo_dequeue(struct vke_softc *sc, struct mbuf *newm)
259 {
260 	fifo_t fifo = sc->sc_rxfifo;
261 	struct mbuf *m;
262 
263 	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
264 		return (NULL);
265 
266 	m = fifo->array[NETFIFOINDEX(fifo->rindex, sc)];
267 	fifo->array[NETFIFOINDEX(fifo->rindex, sc)] = newm;
268 	cpu_lfence();
269 	++fifo->rindex;
270 	return (m);
271 }
272 
273 /*
274  * Return the next mbuf if available but do NOT remove it from the FIFO.
275  */
276 static struct mbuf *
277 vke_rxfifo_sniff(struct vke_softc *sc)
278 {
279 	fifo_t fifo = sc->sc_rxfifo;
280 	struct mbuf *m;
281 
282 	if (NETFIFOINDEX(fifo->rindex, sc) == NETFIFOINDEX(fifo->windex, sc))
283 		return (NULL);
284 
285 	m = fifo->array[NETFIFOINDEX(fifo->rindex, sc)];
286 	cpu_lfence();
287 	return (m);
288 }
289 
290 static void
291 vke_init(void *xsc)
292 {
293 	struct vke_softc *sc = xsc;
294 	struct ifnet *ifp = &sc->arpcom.ac_if;
295 	size_t ringsize = sc->sc_ringsize * sizeof(struct mbuf *);
296 	int i;
297 
298 	ASSERT_SERIALIZED(ifp->if_serializer);
299 
300 	vke_stop(sc);
301 
302 	ifp->if_flags |= IFF_RUNNING;
303 	ifsq_clr_oactive(ifq_get_subq_default(&ifp->if_snd));
304 
305 	/*
306 	 * Allocate memory for FIFO structures and mbufs.
307 	 */
308 	sc->sc_txfifo = kmalloc(sizeof(*sc->sc_txfifo),
309 	    M_DEVBUF, M_WAITOK | M_ZERO);
310 	sc->sc_txfifo_done = kmalloc(sizeof(*sc->sc_txfifo_done),
311 	    M_DEVBUF, M_WAITOK | M_ZERO);
312 	sc->sc_rxfifo = kmalloc(sizeof(*sc->sc_rxfifo),
313 	    M_DEVBUF, M_WAITOK | M_ZERO);
314 	sc->sc_txfifo->array = kmalloc(ringsize, M_DEVBUF, M_WAITOK | M_ZERO);
315 	sc->sc_txfifo_done->array = kmalloc(ringsize, M_DEVBUF, M_WAITOK | M_ZERO);
316 	sc->sc_rxfifo->array = kmalloc(ringsize, M_DEVBUF, M_WAITOK | M_ZERO);
317 
318 	for (i = 0; i < sc->sc_ringsize; i++) {
319 		sc->sc_rxfifo->array[i] = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
320 		sc->sc_txfifo->array[i] = NULL;
321 		sc->sc_txfifo_done->array[i] = NULL;
322 	}
323 
324 	sc->cotd_tx_exit = sc->cotd_rx_exit = VKE_COTD_RUN;
325 	sc->cotd_tx = cothread_create(vke_tx_thread, vke_tx_intr, sc, "vke_tx");
326 	sc->cotd_rx = cothread_create(vke_rx_thread, vke_rx_intr, sc, "vke_rx");
327 
328 	if (sc->sc_addr != 0) {
329 		in_addr_t addr, mask;
330 
331 		addr = sc->sc_addr;
332 		mask = sc->sc_mask;
333 
334 		/*
335 		 * Make sure the vkernel-assigned address will not be
336 		 * added again the next time the interface is
337 		 * initialized.
338 		 */
339 		sc->sc_addr = 0;
340 		sc->sc_mask = 0;
341 
342 		vke_init_addr(ifp, addr, mask);
343 	}
344 
345 }
346 
347 /*
348  * Called from kernel.
349  *
350  * NOTE: We can't make any kernel callbacks while holding cothread lock
351  *	 because the cothread lock is not governed by the kernel scheduler
352  *	 (so mplock, tokens, etc will not be released).
353  */
354 static void
355 vke_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
356 {
357 	struct vke_softc *sc = ifp->if_softc;
358 	struct mbuf *m;
359 	cothread_t cotd = sc->cotd_tx;
360 	int count;
361 
362 	ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
363 	ASSERT_SERIALIZED(ifp->if_serializer);
364 
365 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
366 		return;
367 
368 	count = 0;
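	/*
	 * Feed packets to the TX fifo, waking the transmit cothread once
	 * every VKE_CHUNK packets so the wakeup cost is amortized over a
	 * burst instead of paid per packet.
	 */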
369 	while ((m = ifsq_dequeue(ifsq)) != NULL) {
370 		if (vke_txfifo_enqueue(sc, m) != -1) {
371 			ETHER_BPF_MTAP(ifp, m);
372 			if (count++ == VKE_CHUNK) {
373 				cothread_lock(cotd, 0);
374 				cothread_signal(cotd);
375 				cothread_unlock(cotd, 0);
376 				count = 0;
377 			}
378 		} else {
379 			m_freem(m);
380 		}
381 	}
382 	if (count) {
383 		cothread_lock(cotd, 0);
384 		cothread_signal(cotd);
385 		cothread_unlock(cotd, 0);
386 	}
387 }
388 
389 static int
390 vke_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data, struct ucred *cr)
391 {
392 	struct vke_softc *sc = ifp->if_softc;
393 	struct ifreq *ifr = (struct ifreq *)data;
394 	int error = 0;
395 
396 	ASSERT_SERIALIZED(ifp->if_serializer);
397 
398 	switch (cmd) {
399 	case SIOCSIFFLAGS:
400 		if (ifp->if_flags & IFF_UP) {
401 			if ((ifp->if_flags & IFF_RUNNING) == 0)
402 				vke_init(sc);
403 		} else {
404 			if (ifp->if_flags & IFF_RUNNING)
405 				vke_stop(sc);
406 		}
407 		break;
408 	case SIOCGIFMEDIA:
409 	case SIOCSIFMEDIA:
410 		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
411 		break;
412 	case SIOCGIFSTATUS: {
413 		struct ifstat *ifs = (struct ifstat *)data;
414 		int len;
415 
416 		len = strlen(ifs->ascii);
417 		if (len < sizeof(ifs->ascii)) {
418 			if (sc->sc_tap_unit >= 0)
419 				ksnprintf(ifs->ascii + len, sizeof(ifs->ascii) - len,
420 				    "\tBacked by tap%d\n", sc->sc_tap_unit);
421 		}
422 		break;
423 	}
424 	case SIOCSIFADDR:
425 		if (((struct ifaddr *)data)->ifa_addr->sa_family == AF_INET) {
426 			/*
427 			 * If we are explicitly requested to change address,
428 			 * we should invalidate address/netmask passed in
429 			 * from vkernel command line.
430 			 */
431 			sc->sc_addr = 0;
432 			sc->sc_mask = 0;
433 		}
434 		/* FALL THROUGH */
435 	default:
436 		error = ether_ioctl(ifp, cmd, data);
437 		break;
438 	}
439 	return error;
440 }
441 
442 static int
443 vke_stop(struct vke_softc *sc)
444 {
445 	struct ifnet *ifp = &sc->arpcom.ac_if;
446 	int i;
447 
448 	ASSERT_SERIALIZED(ifp->if_serializer);
449 
450 	ifp->if_flags &= ~IFF_RUNNING;
451 	ifsq_clr_oactive(ifq_get_subq_default(&ifp->if_snd));
452 
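	/*
	 * Tear down the cothreads before touching the fifos; once both
	 * threads have been deleted nothing else references the rings,
	 * so the mbufs and arrays below can be freed safely.
	 */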
453 	if (sc) {
454 		if (sc->cotd_tx) {
455 			cothread_lock(sc->cotd_tx, 0);
456 			if (sc->cotd_tx_exit == VKE_COTD_RUN)
457 				sc->cotd_tx_exit = VKE_COTD_EXIT;
458 			cothread_signal(sc->cotd_tx);
459 			cothread_unlock(sc->cotd_tx, 0);
460 			cothread_delete(&sc->cotd_tx);
461 		}
462 		if (sc->cotd_rx) {
463 			cothread_lock(sc->cotd_rx, 0);
464 			if (sc->cotd_rx_exit == VKE_COTD_RUN)
465 				sc->cotd_rx_exit = VKE_COTD_EXIT;
466 			cothread_signal(sc->cotd_rx);
467 			cothread_unlock(sc->cotd_rx, 0);
468 			cothread_delete(&sc->cotd_rx);
469 		}
470 
471 		for (i = 0; i < sc->sc_ringsize; i++) {
472 			if (sc->sc_rxfifo && sc->sc_rxfifo->array[i]) {
473 				m_freem(sc->sc_rxfifo->array[i]);
474 				sc->sc_rxfifo->array[i] = NULL;
475 			}
476 			if (sc->sc_txfifo && sc->sc_txfifo->array[i]) {
477 				m_freem(sc->sc_txfifo->array[i]);
478 				sc->sc_txfifo->array[i] = NULL;
479 			}
480 			if (sc->sc_txfifo_done && sc->sc_txfifo_done->array[i]) {
481 				m_freem(sc->sc_txfifo_done->array[i]);
482 				sc->sc_txfifo_done->array[i] = NULL;
483 			}
484 		}
485 
486 		if (sc->sc_txfifo) {
487 			if (sc->sc_txfifo->array)
488 				kfree(sc->sc_txfifo->array, M_DEVBUF);
489 			kfree(sc->sc_txfifo, M_DEVBUF);
490 			sc->sc_txfifo = NULL;
491 		}
492 
493 		if (sc->sc_txfifo_done) {
494 			if (sc->sc_txfifo_done->array)
495 				kfree(sc->sc_txfifo_done->array, M_DEVBUF);
496 			kfree(sc->sc_txfifo_done, M_DEVBUF);
497 			sc->sc_txfifo_done = NULL;
498 		}
499 
500 		if (sc->sc_rxfifo) {
501 			if (sc->sc_rxfifo->array)
502 				kfree(sc->sc_rxfifo->array, M_DEVBUF);
503 			kfree(sc->sc_rxfifo, M_DEVBUF);
504 			sc->sc_rxfifo = NULL;
505 		}
506 	}
507 
509 	return 0;
510 }
511 
512 /*
513  * vke_rx_intr() is the interrupt function for the receive cothread.
514  */
515 static void
516 vke_rx_intr(cothread_t cotd)
517 {
518 	struct mbuf *m;
519 	struct mbuf *nm;
520 	struct vke_softc *sc = cotd->arg;
521 	struct ifnet *ifp = &sc->arpcom.ac_if;
522 	static int count = 0;
523 
524 	ifnet_serialize_all(ifp);
525 	cothread_lock(cotd, 0);
526 
527 	if (sc->cotd_rx_exit != VKE_COTD_RUN) {
528 		cothread_unlock(cotd, 0);
529 		ifnet_deserialize_all(ifp);
530 		return;
531 	}
532 	if (sc->cotd_ipackets) {
533 		IFNET_STAT_INC(ifp, ipackets, sc->cotd_ipackets);
534 		sc->cotd_ipackets = 0;
535 	}
536 	cothread_unlock(cotd, 0);
537 
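	/*
	 * Drain completed packets, handing each slot a freshly allocated
	 * cluster in exchange.  If the allocation fails, the packet is
	 * dropped by recycling its own mbuf back into the slot.
	 */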
538 	while ((m = vke_rxfifo_sniff(sc)) != NULL) {
539 		nm = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
540 		if (nm) {
541 			vke_rxfifo_dequeue(sc, nm);
542 			ifp->if_input(ifp, m, NULL, -1);
543 			if (count++ == VKE_CHUNK) {
544 				cothread_lock(cotd, 0);
545 				cothread_signal(cotd);
546 				cothread_unlock(cotd, 0);
547 				count = 0;
548 			}
549 		} else {
550 			vke_rxfifo_dequeue(sc, m);
551 		}
552 	}
553 
554 	if (count) {
555 		cothread_lock(cotd, 0);
556 		cothread_signal(cotd);
557 		cothread_unlock(cotd, 0);
558 	}
559 	ifnet_deserialize_all(ifp);
560 }
561 
562 /*
563  * vke_tx_intr() is the interrupt function for the transmit cothread.
564  * Calls vke_start() to handle processing transmit mbufs.
565  */
566 static void
567 vke_tx_intr(cothread_t cotd)
568 {
569 	struct vke_softc *sc = cotd->arg;
570 	struct ifnet *ifp = &sc->arpcom.ac_if;
571 	struct mbuf *m;
572 
573 	ifnet_serialize_all(ifp);
574 	cothread_lock(cotd, 0);
575 	if (sc->cotd_tx_exit != VKE_COTD_RUN) {
576 		cothread_unlock(cotd, 0);
577 		ifnet_deserialize_all(ifp);
578 		return;
579 	}
580 	if (sc->cotd_opackets) {
581 		IFNET_STAT_INC(ifp, opackets, sc->cotd_opackets);
582 		sc->cotd_opackets = 0;
583 	}
584 	if (sc->cotd_oerrors) {
585 		IFNET_STAT_INC(ifp, oerrors, sc->cotd_oerrors);
586 		sc->cotd_oerrors = 0;
587 	}
588 	cothread_unlock(cotd, 0);
589 
590 	/*
591 	 * Free TX mbufs that have already been processed before starting
592 	 * new ones, to keep the transmit pipeline friendly.
593 	 */
594 	while ((m = vke_txfifo_done_dequeue(sc, NULL)) != NULL) {
595 		m_freem(m);
596 	}
597 
598 	if (ifp->if_flags & IFF_RUNNING)
599 		if_devstart(ifp);
600 
601 	ifnet_deserialize_all(ifp);
602 }
603 
604 /*
605  * vke_rx_thread() is the body of the receive cothread.
606  *
607  * WARNING!  THIS IS A COTHREAD WHICH HAS NO PER-CPU GLOBALDATA!!!!!
608  */
609 static void
610 vke_rx_thread(cothread_t cotd)
611 {
612 	struct mbuf *m;
613 	struct vke_softc *sc = cotd->arg;
614 	struct ifnet *ifp = &sc->arpcom.ac_if;
615 	fifo_t fifo = sc->sc_rxfifo;
616 	fd_set fdset;
617 	struct timeval tv;
618 	int count;
619 	int n;
620 
621 	/*
622 	 * Select timeout must be finite so we can poll sc->cotd_rx_exit.
623 	 */
624 	tv.tv_sec = 0;
625 	tv.tv_usec = 500000;
626 
627 	FD_ZERO(&fdset);
628 	count = 0;
629 
630 	while (sc->cotd_rx_exit == VKE_COTD_RUN) {
631 		/*
632 		 * Wait for the RX FIFO to be loaded with
633 		 * empty mbufs.
634 		 */
635 		if (NETFIFOINDEX(fifo->windex + 1, sc) ==
636 		    NETFIFOINDEX(fifo->rindex, sc)) {
637 			usleep(20000);
638 			continue;
639 		}
640 
641 		/*
642 		 * Load data into the rx fifo
643 		 */
644 		m = fifo->array[NETFIFOINDEX(fifo->windex, sc)];
645 		if (m == NULL)
646 			continue;
647 		n = read(sc->sc_fd, mtod(m, void *), MCLBYTES);
648 		if (n > 0) {
649 			/* no mycpu in cothread */
650 			/*IFNET_STAT_INC(ifp, ipackets, 1);*/
651 			++sc->cotd_ipackets;
652 			m->m_pkthdr.rcvif = ifp;
653 			m->m_pkthdr.len = m->m_len = n;
654 			cpu_sfence();
655 			++fifo->windex;
656 			if (count++ == VKE_CHUNK) {
657 				cothread_intr(cotd);
658 				count = 0;
659 			}
660 		} else {
661 			if (count) {
662 				cothread_intr(cotd);
663 				count = 0;
664 			}
665 			FD_SET(sc->sc_fd, &fdset);
666 
667 			if (select(sc->sc_fd + 1, &fdset, NULL, NULL, &tv) == -1) {
668 				fprintf(stderr,
669 					VKE_DEVNAME "%d: select failed for "
670 					"TAP device\n", sc->sc_unit);
671 				usleep(1000000);
672 			}
673 		}
674 	}
675 	cpu_sfence();
676 	sc->cotd_rx_exit = VKE_COTD_DEAD;
677 }
678 
679 /*
680  * vke_tx_thread() is the body of the transmit cothread.
681  *
682  * WARNING!  THIS IS A COTHREAD WHICH HAS NO PER-CPU GLOBALDATA!!!!!
683  */
684 static void
685 vke_tx_thread(cothread_t cotd)
686 {
687 	struct mbuf *m;
688 	struct vke_softc *sc = cotd->arg;
689 	/*struct ifnet *ifp = &sc->arpcom.ac_if;*/
690 	int count = 0;
691 
692 	while (sc->cotd_tx_exit == VKE_COTD_RUN) {
693 		/*
694 		 * Write outgoing packets to the TAP interface
695 		 */
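		/*
		 * The chain is flattened into sc_txbuf so the frame can
		 * go out in a single write(2).  Oversized frames are
		 * dropped, but still pass through the done fifo so the
		 * kernel frees them.
		 */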
696 		m = vke_txfifo_dequeue(sc);
697 		if (m) {
698 			if (m->m_pkthdr.len <= MCLBYTES) {
699 				m_copydata(m, 0, m->m_pkthdr.len, sc->sc_txbuf);
700 				sc->sc_txbuf_len = m->m_pkthdr.len;
701 
702 				if (write(sc->sc_fd, sc->sc_txbuf,
703 					  sc->sc_txbuf_len) < 0) {
704 					/* no mycpu in cothread */
705 					/*IFNET_STAT_INC(ifp, oerrors, 1);*/
706 					++sc->cotd_oerrors;
707 				} else {
708 					/* no mycpu in cothread */
709 					/*IFNET_STAT_INC(ifp, opackets, 1);*/
710 					++sc->cotd_opackets;
711 				}
712 			}
713 			if (count++ == VKE_CHUNK) {
714 				cothread_intr(cotd);
715 				count = 0;
716 			}
717 			vke_txfifo_done_enqueue(sc, m);
718 		} else {
719 			if (count) {
720 				cothread_intr(cotd);
721 				count = 0;
722 			}
723 			cothread_lock(cotd, 1);
724 			if (vke_txfifo_empty(sc))
725 				cothread_wait(cotd);
726 			cothread_unlock(cotd, 1);
727 		}
728 	}
729 	cpu_sfence();
730 	sc->cotd_tx_exit = VKE_COTD_DEAD;
731 }
732 
733 static int
734 vke_attach(const struct vknetif_info *info, int unit)
735 {
736 	struct vke_softc *sc;
737 	struct ifnet *ifp;
738 	struct tapinfo tapinfo;
739 	uint8_t enaddr[ETHER_ADDR_LEN];
740 	int nmbufs;
741 	int fd;
742 
743 	KKASSERT(info->tap_fd >= 0);
744 	fd = info->tap_fd;
745 
746 	if (info->enaddr) {
747 		/*
748 		 * enaddr is supplied
749 		 */
750 		bcopy(info->enaddr, enaddr, ETHER_ADDR_LEN);
751 	} else {
752 		/*
753 		 * This is only a TAP device if tap_unit is non-negative.
754 		 * If connecting to a virtual socket we generate a unique MAC.
755 		 *
756 		 * WARNING: enaddr[0] bit 0 is the multicast bit; when
757 		 *          randomizing enaddr[], just leave the first
758 		 *          two bytes 00 00 for now.
759 		 */
760 		bzero(enaddr, sizeof(enaddr));
761 		if (info->tap_unit >= 0) {
762 			if (ioctl(fd, TAPGIFINFO, &tapinfo) < 0) {
763 				kprintf(VKE_DEVNAME "%d: ioctl(TAPGIFINFO) "
764 					"failed: %s\n", unit, strerror(errno));
765 				return ENXIO;
766 			}
767 
768 			if (ioctl(fd, SIOCGIFADDR, enaddr) < 0) {
769 				kprintf(VKE_DEVNAME "%d: ioctl(SIOCGIFADDR) "
770 					"failed: %s\n", unit, strerror(errno));
771 				return ENXIO;
772 			}
773 		} else {
774 			int fd = open("/dev/urandom", O_RDONLY);
775 			if (fd >= 0) {
776 				read(fd, enaddr + 2, 4);
777 				close(fd);
778 			}
779 			enaddr[4] = (int)getpid() >> 8;
780 			enaddr[5] = (int)getpid() & 255;
781 
782 		}
783 		enaddr[1] += 1;
784 	}
785 	if (ETHER_IS_MULTICAST(enaddr)) {
786 		kprintf(VKE_DEVNAME "%d: illegal MULTICAST ether mac!\n", unit);
787 		return ENXIO;
788 	}
789 
790 	sc = kmalloc(sizeof(*sc), M_DEVBUF, M_WAITOK | M_ZERO);
791 
792 	sc->sc_txbuf = kmalloc(MCLBYTES, M_DEVBUF, M_WAITOK);
793 	sc->sc_fd = fd;
794 	sc->sc_unit = unit;
795 	sc->sc_tap_unit = info->tap_unit;
796 	sc->sc_addr = info->netif_addr;
797 	sc->sc_mask = info->netif_mask;
798 
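	/*
	 * Size the mbuf ring.  By default take this interface's share of
	 * the mbuf clusters, capped at VKE_DEFAULT_RINGSIZE; a non-zero
	 * hw.vke.max_ringsize tunable overrides the cap.  Either way the
	 * result is rounded down to a power of 2, which NETFIFOINDEX()
	 * relies on.
	 */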
799 	if (vke_max_ringsize == 0) {
800 		nmbufs = nmbclusters / (NetifNum * 2);
801 		sc->sc_ringsize = LOW_POW_2(nmbufs);
802 		if (sc->sc_ringsize > VKE_DEFAULT_RINGSIZE)
803 			sc->sc_ringsize = VKE_DEFAULT_RINGSIZE;
804 	} else if (vke_max_ringsize >= VKE_CHUNK) {	/* Tunable specified */
805 		sc->sc_ringsize = LOW_POW_2(vke_max_ringsize);
806 	} else {
807 		sc->sc_ringsize = LOW_POW_2(VKE_CHUNK);
808 	}
809 
810 	ifp = &sc->arpcom.ac_if;
811 	if_initname(ifp, VKE_DEVNAME, sc->sc_unit);
812 
813 	/* NB: after if_initname() */
814 	sysctl_ctx_init(&sc->sc_sysctl_ctx);
815 	sc->sc_sysctl_tree = SYSCTL_ADD_NODE(&sc->sc_sysctl_ctx,
816 					     SYSCTL_STATIC_CHILDREN(_hw),
817 					     OID_AUTO, ifp->if_xname,
818 					     CTLFLAG_RD, 0, "");
819 	if (sc->sc_sysctl_tree == NULL) {
820 		kprintf(VKE_DEVNAME "%d: can't add sysctl node\n", unit);
821 	} else {
822 		SYSCTL_ADD_INT(&sc->sc_sysctl_ctx,
823 			       SYSCTL_CHILDREN(sc->sc_sysctl_tree),
824 			       OID_AUTO, "tap_unit",
825 			       CTLFLAG_RD, &sc->sc_tap_unit, 0,
826 			       "Backend tap(4) unit");
827 	}
828 
829 	ifp->if_softc = sc;
830 	ifp->if_ioctl = vke_ioctl;
831 	ifp->if_start = vke_start;
832 	ifp->if_init = vke_init;
833 	ifp->if_mtu = tapinfo.mtu;
834 	ifp->if_baudrate = tapinfo.baudrate;
835 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
836 	ifq_set_maxlen(&ifp->if_snd, IFQ_MAXLEN);
837 	ifq_set_ready(&ifp->if_snd);
838 
839 	ifmedia_init(&sc->sc_media, 0, vke_media_change, vke_media_status);
840 	/* We support as many media types as we please for
841 	 * debugging purposes. */
842 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10_T, 0, NULL);
843 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10_T | IFM_FDX, 0, NULL);
844 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10_2, 0, NULL);
845 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10_5, 0, NULL);
846 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_100_TX, 0, NULL);
847 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_100_TX | IFM_FDX, 0, NULL);
848 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_100_FX, 0, NULL);
849 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_100_T4, 0, NULL);
850 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_100_VG, 0, NULL);
851 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_100_T2, 0, NULL);
852 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_1000_FX, 0, NULL);
853 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10_STP, 0, NULL);
854 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10_FL, 0, NULL);
855 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_1000_SX, 0, NULL);
856 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_1000_LX, 0, NULL);
857 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_1000_CX, 0, NULL);
858 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_1000_T, 0, NULL);
859 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
860 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_HPNA_1, 0, NULL);
861 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_LR, 0, NULL);
862 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_SR, 0, NULL);
863 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_CX4, 0, NULL);
864 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_2500_SX, 0, NULL);
865 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_TWINAX, 0, NULL);
866 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_TWINAX_LONG, 0, NULL);
867 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_LRM, 0, NULL);
868 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_T, 0, NULL);
869 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_40G_CR4, 0, NULL);
870 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_40G_SR4, 0, NULL);
871 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_40G_LR4, 0, NULL);
872 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
873 
874 	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
875 
876 	ifp->if_link_state = LINK_STATE_UP;
877 
878 	ether_ifattach(ifp, enaddr, NULL);
879 
880 	if (bootverbose && sc->sc_addr != 0) {
881 		if_printf(ifp, "pre-configured "
882 		    "address 0x%08x, netmask 0x%08x, %d mbuf clusters\n",
883 		    ntohl(sc->sc_addr), ntohl(sc->sc_mask), sc->sc_ringsize);
884 	}
885 
886 	return 0;
887 }
888 
889 static int
890 vke_init_addr(struct ifnet *ifp, in_addr_t addr, in_addr_t mask)
891 {
892 	struct ifaliasreq ifra;
893 	struct sockaddr_in *sin;
894 	int ret;
895 
896 	ASSERT_SERIALIZED(ifp->if_serializer);
897 
898 	if (bootverbose) {
899 		if_printf(ifp, "add pre-configured "
900 			  "address 0x%08x, netmask 0x%08x\n",
901 			  ntohl(addr), ntohl(mask));
902 	}
903 
904 	bzero(&ifra, sizeof(ifra));
905 
906 	/* NB: no need to set ifaliasreq.ifra_name */
907 
908 	sin = (struct sockaddr_in *)&ifra.ifra_addr;
909 	sin->sin_family = AF_INET;
910 	sin->sin_len = sizeof(*sin);
911 	sin->sin_addr.s_addr = addr;
912 
913 	if (mask != 0) {
914 		sin = (struct sockaddr_in *)&ifra.ifra_mask;
915 		sin->sin_len = sizeof(*sin);
916 		sin->sin_addr.s_addr = mask;
917 	}
918 
919 	/*
920 	 * Temporarily release serializer, in_control() will hold
921 	 * it again before calling ifnet.if_ioctl().
922 	 */
923 	ifnet_deserialize_all(ifp);
924 	ret = in_control(SIOCAIFADDR, (caddr_t)&ifra, ifp, NULL);
925 	ifnet_serialize_all(ifp);
926 
927 	return ret;
928 }
929 
930 static int vke_media_change(struct ifnet *ifp)
931 {
932 	/* ignored */
933 	return(0);
934 }
935 
936 static void vke_media_status(struct ifnet *ifp, struct ifmediareq *imr)
937 {
938 	struct vke_softc *sc = (struct vke_softc *)ifp->if_softc;
939 
940 	imr->ifm_status = IFM_AVALID;
941 	imr->ifm_status |= IFM_ACTIVE;
942 
943 	if (sc->sc_media.ifm_cur) {
944 		if (sc->sc_media.ifm_cur->ifm_media == IFM_ETHER) {
945 			imr->ifm_active = IFM_ETHER | IFM_1000_T | IFM_FDX;
946 		} else {
947 			imr->ifm_active = sc->sc_media.ifm_cur->ifm_media;
948 		}
949 	} else {
950 		imr->ifm_active = IFM_ETHER | IFM_1000_T | IFM_FDX;
951 	}
952 }
953