xref: /netbsd/sys/arch/xen/xen/xennetback_xenbus.c (revision 6550d01e)
1 /*      $NetBSD: xennetback_xenbus.c,v 1.38 2011/01/18 21:34:31 jym Exp $      */
2 
3 /*
4  * Copyright (c) 2006 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 #include "opt_xen.h"
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/malloc.h>
34 #include <sys/queue.h>
35 #include <sys/kernel.h>
36 #include <sys/mbuf.h>
37 #include <sys/protosw.h>
38 #include <sys/socket.h>
39 #include <sys/ioctl.h>
40 #include <sys/errno.h>
41 #include <sys/device.h>
42 #include <sys/intr.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <net/netisr.h>
49 #include <net/bpf.h>
50 #include <net/bpfdesc.h>
51 
52 #include <net/if_ether.h>
53 
54 
55 #include <xen/xen.h>
56 #include <xen/xen_shm.h>
57 #include <xen/evtchn.h>
58 #include <xen/xenbus.h>
59 #include <xen/xennet_checksum.h>
60 
61 #include <uvm/uvm.h>
62 
63 #ifdef XENDEBUG_NET
64 #define XENPRINTF(x) printf x
65 #else
66 #define XENPRINTF(x)
67 #endif
68 
69 #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
70 #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
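/*
 * Both shared rings occupy a single page each; __RING_SIZE() yields the
 * number of request/response slots that fit in that page.
 */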
71 
72 /* linux wants at least 16 bytes free in front of the packet */
73 #define LINUX_REQUESTED_OFFSET 16
74 
75 /* hash list for TX requests */
76 /* descriptor of a packet being handled by the kernel */
77 struct xni_pkt {
78 	int pkt_id; /* packet's ID */
79 	grant_handle_t pkt_handle;
80 	struct xnetback_instance *pkt_xneti; /* pointer back to our softc */
81 };
82 
83 static inline void xni_pkt_unmap(struct xni_pkt *, vaddr_t);
84 
85 
86 /* pools for xni_pkt */
87 struct pool xni_pkt_pool;
88 /* ratecheck(9) for pool allocation failures */
89 struct timeval xni_pool_errintvl = { 30, 0 };  /* 30s, each */
90 /*
91  * Backend network device driver for Xen
92  */
93 
94 /* state of a xnetback instance */
95 typedef enum {CONNECTED, DISCONNECTING, DISCONNECTED} xnetback_state_t;
96 
97 /* we keep the xnetback instances in a linked list */
98 struct xnetback_instance {
99 	SLIST_ENTRY(xnetback_instance) next;
100 	struct xenbus_device *xni_xbusd; /* our xenstore entry */
101 	domid_t xni_domid;		/* attached to this domain */
102 	uint32_t xni_handle;	/* domain-specific handle */
103 	xnetback_state_t xni_status;
104 	void *xni_softintr;
105 
106 	/* network interface stuff */
107 	struct ethercom xni_ec;
108 	struct callout xni_restart;
109 	uint8_t xni_enaddr[ETHER_ADDR_LEN];
110 
111 	/* remote domain communication stuff */
112 	unsigned int xni_evtchn; /* our event channel */
113 	netif_tx_back_ring_t xni_txring;
114 	netif_rx_back_ring_t xni_rxring;
115 	grant_handle_t xni_tx_ring_handle; /* to unmap the ring */
116 	grant_handle_t xni_rx_ring_handle;
117 	vaddr_t xni_tx_ring_va; /* to unmap the ring */
118 	vaddr_t xni_rx_ring_va;
119 };
120 #define xni_if    xni_ec.ec_if
121 #define xni_bpf   xni_if.if_bpf
122 
123        void xvifattach(int);
124 static int  xennetback_ifioctl(struct ifnet *, u_long, void *);
125 static void xennetback_ifstart(struct ifnet *);
126 static void xennetback_ifsoftstart_transfer(void *);
127 static void xennetback_ifsoftstart_copy(void *);
128 static void xennetback_ifwatchdog(struct ifnet *);
129 static int  xennetback_ifinit(struct ifnet *);
130 static void xennetback_ifstop(struct ifnet *, int);
131 
132 static int  xennetback_xenbus_create(struct xenbus_device *);
133 static int  xennetback_xenbus_destroy(void *);
134 static void xennetback_frontend_changed(void *, XenbusState);
135 
136 static inline void xennetback_tx_response(struct xnetback_instance *,
137     int, int);
138 static void xennetback_tx_free(struct mbuf *, void *, size_t, void *);
139 
140 SLIST_HEAD(, xnetback_instance) xnetback_instances;
141 
142 static struct xnetback_instance *xnetif_lookup(domid_t, uint32_t);
143 static int  xennetback_evthandler(void *);
144 
145 static struct xenbus_backend_driver xvif_backend_driver = {
146 	.xbakd_create = xennetback_xenbus_create,
147 	.xbakd_type = "vif"
148 };
149 
150 /*
151  * Number of packets to transmit in one hypercall (= number of pages to
152  * transmit at once).
153  */
154 #define NB_XMIT_PAGES_BATCH 64
155 /*
156  * We will transfer a mapped page to the remote domain, and remap another
157  * page in place immediately. For this we keep a list of pages available.
158  * When the list is empty, we ask the hypervisor to give us
159  * NB_XMIT_PAGES_BATCH pages back.
160  */
161 static unsigned long mcl_pages[NB_XMIT_PAGES_BATCH]; /* our physical pages */
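/* entries 0..mcl_pages_alloc are free; -1 means the batch is exhausted */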
162 int mcl_pages_alloc; /* current index in mcl_pages */
163 static int  xennetback_get_mcl_page(paddr_t *);
164 static void xennetback_get_new_mcl_pages(void);
165 /*
166  * If we can't transfer the mbuf directly, we have to copy it to a page which
167  * will be transferred to the remote domain. We use a pool_cache
168  * for this, or the mbuf cluster pool cache if MCLBYTES == PAGE_SIZE.
169  */
170 #if MCLBYTES != PAGE_SIZE
171 pool_cache_t xmit_pages_cache;
172 #endif
173 pool_cache_t xmit_pages_cachep;
174 
175 /* arrays used in xennetback_ifsoftstart_*(), too large to allocate on stack */
176 static mmu_update_t xstart_mmu[NB_XMIT_PAGES_BATCH];
177 static multicall_entry_t xstart_mcl[NB_XMIT_PAGES_BATCH + 1];
178 static gnttab_transfer_t xstart_gop_transfer[NB_XMIT_PAGES_BATCH];
179 static gnttab_copy_t     xstart_gop_copy[NB_XMIT_PAGES_BATCH];
180 struct mbuf *mbufs_sent[NB_XMIT_PAGES_BATCH];
181 struct _pages_pool_free {
182 	vaddr_t va;
183 	paddr_t pa;
184 } pages_pool_free[NB_XMIT_PAGES_BATCH];
185 
186 
187 static inline void
188 xni_pkt_unmap(struct xni_pkt *pkt, vaddr_t pkt_va)
189 {
190 	xen_shm_unmap(pkt_va, 1, &pkt->pkt_handle);
191 	pool_put(&xni_pkt_pool, pkt);
192 }
193 
194 void
195 xvifattach(int n)
196 {
197 	int i;
198 	struct pglist mlist;
199 	struct vm_page *pg;
200 
201 	XENPRINTF(("xennetback_init\n"));
202 
203 	/*
204 	 * steal some non-managed pages from the VM system, to replace
205 	 * mbuf cluster or xmit_pages_pool pages given to foreign domains.
206 	 */
207 	if (uvm_pglistalloc(PAGE_SIZE * NB_XMIT_PAGES_BATCH, 0, 0xffffffff,
208 	    0, 0, &mlist, NB_XMIT_PAGES_BATCH, 0) != 0)
209 		panic("xennetback_init: uvm_pglistalloc");
210 	for (i = 0, pg = mlist.tqh_first; pg != NULL;
211 	    pg = pg->pageq.queue.tqe_next, i++)
212 		mcl_pages[i] = xpmap_ptom(VM_PAGE_TO_PHYS(pg)) >> PAGE_SHIFT;
213 	if (i != NB_XMIT_PAGES_BATCH)
214 		panic("xennetback_init: %d mcl pages", i);
215 	mcl_pages_alloc = NB_XMIT_PAGES_BATCH - 1;
216 
217 	/* initialise pools */
218 	pool_init(&xni_pkt_pool, sizeof(struct xni_pkt), 0, 0, 0,
219 	    "xnbpkt", NULL, IPL_VM);
220 #if MCLBYTES != PAGE_SIZE
221 	xmit_pages_cache = pool_cache_init(PAGE_SIZE, 0, 0, 0, "xnbxm", NULL,
222 	    IPL_VM, NULL, NULL, NULL);
223 	xmit_pages_cachep = xmit_pages_cache;
224 #else
225 	xmit_pages_cachep = mcl_cache;
226 #endif
227 
228 	SLIST_INIT(&xnetback_instances);
229 	xenbus_backend_register(&xvif_backend_driver);
230 }
231 
232 static int
233 xennetback_xenbus_create(struct xenbus_device *xbusd)
234 {
235 	struct xnetback_instance *xneti;
236 	long domid, handle;
237 	struct ifnet *ifp;
238 	extern int ifqmaxlen; /* XXX */
239 	char *val, *e, *p;
240 	int i, err;
241 	struct xenbus_transaction *xbt;
242 
243 	if ((err = xenbus_read_ul(NULL, xbusd->xbusd_path,
244 	    "frontend-id", &domid, 10)) != 0) {
245 		aprint_error("xvif: can't read %s/frontend-id: %d\n",
246 		    xbusd->xbusd_path, err);
247 		return err;
248 	}
249 	if ((err = xenbus_read_ul(NULL, xbusd->xbusd_path,
250 	    "handle", &handle, 10)) != 0) {
251 		aprint_error("xvif: can't read %s/handle: %d\n",
252 		    xbusd->xbusd_path, err);
253 		return err;
254 	}
255 
256 	if (xnetif_lookup(domid, handle) != NULL) {
257 		return EEXIST;
258 	}
259 	xneti = malloc(sizeof(struct xnetback_instance), M_DEVBUF,
260 	    M_NOWAIT | M_ZERO);
261 	if (xneti == NULL) {
262 		return ENOMEM;
263 	}
264 	xneti->xni_domid = domid;
265 	xneti->xni_handle = handle;
266 	xneti->xni_status = DISCONNECTED;
267 
268 	xbusd->xbusd_u.b.b_cookie = xneti;
269 	xbusd->xbusd_u.b.b_detach = xennetback_xenbus_destroy;
270 	xneti->xni_xbusd = xbusd;
271 
272 	ifp = &xneti->xni_if;
273 	ifp->if_softc = xneti;
274 	snprintf(ifp->if_xname, IFNAMSIZ, "xvif%d.%d",
275 	    (int)domid, (int)handle);
276 
277 	/* read mac address */
278 	if ((err = xenbus_read(NULL, xbusd->xbusd_path, "mac", NULL, &val))) {
279 		aprint_error_ifnet(ifp, "can't read %s/mac: %d\n",
280 		    xbusd->xbusd_path, err);
281 		goto fail;
282 	}
283 	for (i = 0, p = val; i < 6; i++) {
284 		xneti->xni_enaddr[i] = strtoul(p, &e, 16);
285 		if (i != 5 && e[0] != ':') {
286 			aprint_error_ifnet(ifp,
287 			    "%s is not a valid mac address\n", val);
288 			err = EINVAL;
289 			goto fail;
290 		}
291 		p = &e[1];
292 	}
293 	free(val, M_DEVBUF);
294 
295 	/* we can't use the same MAC addr as our guest */
296 	xneti->xni_enaddr[3]++;
297 	/* create pseudo-interface */
298 	aprint_verbose_ifnet(ifp, "Ethernet address %s\n",
299 	    ether_sprintf(xneti->xni_enaddr));
300 	ifp->if_flags =
301 	    IFF_BROADCAST|IFF_SIMPLEX|IFF_NOTRAILERS|IFF_MULTICAST;
302 	ifp->if_snd.ifq_maxlen =
303 	    max(ifqmaxlen, NET_TX_RING_SIZE * 2);
304 	ifp->if_capabilities = IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_UDPv4_Tx;
305 	ifp->if_ioctl = xennetback_ifioctl;
306 	ifp->if_start = xennetback_ifstart;
307 	ifp->if_watchdog = xennetback_ifwatchdog;
308 	ifp->if_init = xennetback_ifinit;
309 	ifp->if_stop = xennetback_ifstop;
310 	ifp->if_timer = 0;
311 	IFQ_SET_READY(&ifp->if_snd);
312 	if_attach(ifp);
313 	ether_ifattach(&xneti->xni_if, xneti->xni_enaddr);
314 
315 	SLIST_INSERT_HEAD(&xnetback_instances, xneti, next);
316 
317 	xbusd->xbusd_otherend_changed = xennetback_frontend_changed;
318 
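	/*
	 * Advertise our capabilities in the xenstore: the interface name
	 * plus support for both receive modes (grant copy and page flip).
	 * The transaction is retried as long as it ends with EAGAIN.
	 */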
319 	do {
320 		xbt = xenbus_transaction_start();
321 		if (xbt == NULL) {
322 			aprint_error_ifnet(ifp,
323 			    "%s: can't start transaction\n",
324 			    xbusd->xbusd_path);
325 			goto fail;
326 		}
327 		err = xenbus_printf(xbt, xbusd->xbusd_path,
328 		    "vifname", ifp->if_xname);
329 		if (err) {
330 			aprint_error_ifnet(ifp,
331 			    "failed to write %s/vifname: %d\n",
332 			    xbusd->xbusd_path, err);
333 			goto abort_xbt;
334 		}
335 		err = xenbus_printf(xbt, xbusd->xbusd_path,
336 		    "feature-rx-copy", "%d", 1);
337 		if (err) {
338 			aprint_error_ifnet(ifp,
339 			    "failed to write %s/feature-rx-copy: %d\n",
340 			    xbusd->xbusd_path, err);
341 			goto abort_xbt;
342 		}
343 		err = xenbus_printf(xbt, xbusd->xbusd_path,
344 		    "feature-rx-flip", "%d", 1);
345 		if (err) {
346 			aprint_error_ifnet(ifp,
347 			    "failed to write %s/feature-rx-flip: %d\n",
348 			    xbusd->xbusd_path, err);
349 			goto abort_xbt;
350 		}
351 	} while ((err = xenbus_transaction_end(xbt, 0)) == EAGAIN);
352 	if (err) {
353 		aprint_error_ifnet(ifp,
354 		    "%s: can't end transaction: %d\n",
355 		    xbusd->xbusd_path, err);
356 	}
357 
358 	err = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
359 	if (err) {
360 		aprint_error_ifnet(ifp,
361 		    "failed to switch state on %s: %d\n",
362 		    xbusd->xbusd_path, err);
363 		goto fail;
364 	}
365 	return 0;
366 abort_xbt:
367 	xenbus_transaction_end(xbt, 1);
368 fail:
369 	free(xneti, M_DEVBUF);
370 	return err;
371 }
372 
373 int
374 xennetback_xenbus_destroy(void *arg)
375 {
376 	struct xnetback_instance *xneti = arg;
377 	struct gnttab_unmap_grant_ref op;
378 	int err;
379 
380 #if 0
381 	if (xneti->xni_status == CONNECTED) {
382 		return EBUSY;
383 	}
384 #endif
385 	aprint_verbose_ifnet(&xneti->xni_if, "disconnecting\n");
386 	hypervisor_mask_event(xneti->xni_evtchn);
387 	event_remove_handler(xneti->xni_evtchn, xennetback_evthandler, xneti);
388 	if (xneti->xni_softintr) {
389 		softint_disestablish(xneti->xni_softintr);
390 		xneti->xni_softintr = NULL;
391 	}
392 
393 	SLIST_REMOVE(&xnetback_instances,
394 	    xneti, xnetback_instance, next);
395 
396 	ether_ifdetach(&xneti->xni_if);
397 	if_detach(&xneti->xni_if);
398 
399 	if (xneti->xni_txring.sring) {
400 		op.host_addr = xneti->xni_tx_ring_va;
401 		op.handle = xneti->xni_tx_ring_handle;
402 		op.dev_bus_addr = 0;
403 		err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
404 		    &op, 1);
405 		if (err)
406 			aprint_error_ifnet(&xneti->xni_if,
407 					"unmap_grant_ref failed: %d\n", err);
408 	}
409 	if (xneti->xni_rxring.sring) {
410 		op.host_addr = xneti->xni_rx_ring_va;
411 		op.handle = xneti->xni_rx_ring_handle;
412 		op.dev_bus_addr = 0;
413 		err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
414 		    &op, 1);
415 		if (err)
416 			aprint_error_ifnet(&xneti->xni_if,
417 					"unmap_grant_ref failed: %d\n", err);
418 	}
419 	uvm_km_free(kernel_map, xneti->xni_tx_ring_va,
420 	    PAGE_SIZE, UVM_KMF_VAONLY);
421 	uvm_km_free(kernel_map, xneti->xni_rx_ring_va,
422 	    PAGE_SIZE, UVM_KMF_VAONLY);
423 	free(xneti, M_DEVBUF);
424 	return 0;
425 }
426 
427 static void
428 xennetback_frontend_changed(void *arg, XenbusState new_state)
429 {
430 	struct xnetback_instance *xneti = arg;
431 	struct xenbus_device *xbusd = xneti->xni_xbusd;
432 	int err;
433 	netif_tx_sring_t *tx_ring;
434 	netif_rx_sring_t *rx_ring;
435 	struct gnttab_map_grant_ref op;
436 	evtchn_op_t evop;
437 	u_long tx_ring_ref, rx_ring_ref;
438 	u_long revtchn, rx_copy;
439 
440 	XENPRINTF(("%s: new state %d\n", xneti->xni_if.if_xname, new_state));
441 	switch(new_state) {
442 	case XenbusStateInitialising:
443 	case XenbusStateInitialised:
444 		break;
445 
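		/*
		 * The frontend is ready: fetch the grant references for its
		 * TX/RX shared rings and its event channel port from the
		 * xenstore, map the rings, bind the interdomain event
		 * channel, and only then announce XenbusStateConnected
		 * ourselves.
		 */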
446 	case XenbusStateConnected:
447 		/* read communication information */
448 		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
449 		    "tx-ring-ref", &tx_ring_ref, 10);
450 		if (err) {
451 			xenbus_dev_fatal(xbusd, err, "reading %s/tx-ring-ref",
452 			    xbusd->xbusd_otherend);
453 			break;
454 		}
455 		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
456 		    "rx-ring-ref", &rx_ring_ref, 10);
457 		if (err) {
458 			xenbus_dev_fatal(xbusd, err, "reading %s/rx-ring-ref",
459 			    xbusd->xbusd_otherend);
460 			break;
461 		}
462 		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
463 		    "event-channel", &revtchn, 10);
464 		if (err) {
465 			xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
466 			    xbusd->xbusd_otherend);
467 			break;
468 		}
469 		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
470 		    "request-rx-copy", &rx_copy, 10);
471 		if (err == ENOENT)
472 			rx_copy = 0;
473 		else if (err) {
474 			xenbus_dev_fatal(xbusd, err, "reading %s/request-rx-copy",
475 			    xbusd->xbusd_otherend);
476 			break;
477 		}
478 
479 		if (rx_copy)
480 			xneti->xni_softintr = softint_establish(SOFTINT_NET,
481 			    xennetback_ifsoftstart_copy, xneti);
482 		else
483 			xneti->xni_softintr = softint_establish(SOFTINT_NET,
484 			    xennetback_ifsoftstart_transfer, xneti);
485 		if (xneti->xni_softintr == NULL) {
486 			err = ENOMEM;
487 			xenbus_dev_fatal(xbusd, ENOMEM,
488 			    "%s: can't allocate softint", xbusd->xbusd_otherend);
489 			break;
490 		}
491 
492 		/* allocate VA space and map rings */
493 		xneti->xni_tx_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
494 		    UVM_KMF_VAONLY);
495 		if (xneti->xni_tx_ring_va == 0) {
496 			xenbus_dev_fatal(xbusd, ENOMEM,
497 			    "%s: can't get VA for tx ring", xbusd->xbusd_otherend);
498 			break;
499 		}
500 		tx_ring = (void *)xneti->xni_tx_ring_va;
501 		xneti->xni_rx_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
502 		    UVM_KMF_VAONLY);
503 		if (xneti->xni_rx_ring_va == 0) {
504 			xenbus_dev_fatal(xbusd, ENOMEM,
505 			    "%s: can't get VA for rx ring", xbusd->xbusd_otherend);
506 			goto err1;
507 		}
508 		rx_ring = (void *)xneti->xni_rx_ring_va;
509 		op.host_addr = xneti->xni_tx_ring_va;
510 		op.flags = GNTMAP_host_map;
511 		op.ref = tx_ring_ref;
512 		op.dom = xneti->xni_domid;
513 		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
514 		if (err || op.status) {
515 			printf("%s: can't map TX grant ref: %d/%d\n",
516 			    xneti->xni_if.if_xname, err, op.status);
517 			goto err2;
518 		}
519 		xneti->xni_tx_ring_handle = op.handle;
520 
521 		op.host_addr = xneti->xni_rx_ring_va;
522 		op.flags = GNTMAP_host_map;
523 		op.ref = rx_ring_ref;
524 		op.dom = xneti->xni_domid;
525 		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
526 		if (err || op.status) {
527 			printf("%s: can't map RX grant ref: %d/%d\n",
528 			    xneti->xni_if.if_xname, err, op.status);
529 			goto err2;
530 		}
531 		xneti->xni_rx_ring_handle = op.handle;
532 		BACK_RING_INIT(&xneti->xni_txring, tx_ring, PAGE_SIZE);
533 		BACK_RING_INIT(&xneti->xni_rxring, rx_ring, PAGE_SIZE);
534 		evop.cmd = EVTCHNOP_bind_interdomain;
535 		evop.u.bind_interdomain.remote_dom = xneti->xni_domid;
536 		evop.u.bind_interdomain.remote_port = revtchn;
537 		err = HYPERVISOR_event_channel_op(&evop);
538 		if (err) {
539 			printf("%s: can't get event channel: %d\n",
540 			    xneti->xni_if.if_xname, err);
541 			goto err2;
542 		}
543 		xneti->xni_evtchn = evop.u.bind_interdomain.local_port;
544 		xen_wmb();
545 		xneti->xni_status = CONNECTED;
546 		xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
547 		xen_wmb();
548 		event_set_handler(xneti->xni_evtchn, xennetback_evthandler,
549 		    xneti, IPL_NET, xneti->xni_if.if_xname);
550 		xennetback_ifinit(&xneti->xni_if);
551 		hypervisor_enable_event(xneti->xni_evtchn);
552 		hypervisor_notify_via_evtchn(xneti->xni_evtchn);
553 		break;
554 
555 	case XenbusStateClosing:
556 		xneti->xni_status = DISCONNECTING;
557 		xneti->xni_if.if_flags &= ~(IFF_RUNNING | IFF_OACTIVE);
558 		xneti->xni_if.if_timer = 0;
559 		xenbus_switch_state(xbusd, NULL, XenbusStateClosing);
560 		break;
561 
562 	case XenbusStateClosed:
563 		/* otherend_changed() should handle it for us */
564 		panic("xennetback_frontend_changed: closed\n");
565 	case XenbusStateUnknown:
566 	case XenbusStateInitWait:
567 	default:
568 		aprint_error("%s: invalid frontend state %d\n",
569 		    xneti->xni_if.if_xname, new_state);
570 		break;
571 	}
572 	return;
573 err2:
574 	uvm_km_free(kernel_map, xneti->xni_rx_ring_va,
575 	    PAGE_SIZE, UVM_KMF_VAONLY);
576 err1:
577 	uvm_km_free(kernel_map, xneti->xni_tx_ring_va,
578 	    PAGE_SIZE, UVM_KMF_VAONLY);
579 }
580 
581 /* lookup a xneti based on domain id and interface handle */
582 static struct xnetback_instance *
583 xnetif_lookup(domid_t dom , uint32_t handle)
584 {
585 	struct xnetback_instance *xneti;
586 
587 	SLIST_FOREACH(xneti, &xnetback_instances, next) {
588 		if (xneti->xni_domid == dom && xneti->xni_handle == handle)
589 			return xneti;
590 	}
591 	return NULL;
592 }
593 
594 
595 /* get a page to replace an mbuf cluster page given to a domain */
596 static int
597 xennetback_get_mcl_page(paddr_t *map)
598 {
599 	if (mcl_pages_alloc < 0)
600 		/*
601 		 * we exhausted our allocation. We can't allocate new ones yet
602 		 * because the current pages may not have been loaned to
603 		 * the remote domain yet. We have to let the caller do this.
604 		 */
605 		return -1;
606 
607 	*map = ((paddr_t)mcl_pages[mcl_pages_alloc]) << PAGE_SHIFT;
608 	mcl_pages_alloc--;
609 	return 0;
610 
611 }
612 
613 static void
614 xennetback_get_new_mcl_pages(void)
615 {
616 	int nb_pages;
617 	struct xen_memory_reservation res;
618 
619 	/* get some new pages. */
620 	xenguest_handle(res.extent_start) = mcl_pages;
621 	res.nr_extents = NB_XMIT_PAGES_BATCH;
622 	res.extent_order = 0;
623 	res.address_bits = 0;
624 	res.domid = DOMID_SELF;
625 
626 	nb_pages = HYPERVISOR_memory_op(XENMEM_increase_reservation, &res);
627 	if (nb_pages <= 0) {
628 		printf("xennetback: can't get new mcl pages (%d)\n", nb_pages);
629 		return;
630 	}
631 	if (nb_pages != NB_XMIT_PAGES_BATCH)
632 		printf("xennetback: got only %d new mcl pages\n", nb_pages);
633 
634 	mcl_pages_alloc = nb_pages - 1;
635 }
636 
637 static inline void
638 xennetback_tx_response(struct xnetback_instance *xneti, int id, int status)
639 {
640 	RING_IDX resp_prod;
641 	netif_tx_response_t *txresp;
642 	int do_event;
643 
644 	resp_prod = xneti->xni_txring.rsp_prod_pvt;
645 	txresp = RING_GET_RESPONSE(&xneti->xni_txring, resp_prod);
646 
647 	txresp->id = id;
648 	txresp->status = status;
649 	xneti->xni_txring.rsp_prod_pvt++;
650 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xneti->xni_txring, do_event);
651 	if (do_event) {
652 		XENPRINTF(("%s send event\n", xneti->xni_if.if_xname));
653 		hypervisor_notify_via_evtchn(xneti->xni_evtchn);
654 	}
655 }
656 
657 static int
658 xennetback_evthandler(void *arg)
659 {
660 	struct xnetback_instance *xneti = arg;
661 	struct ifnet *ifp = &xneti->xni_if;
662 	netif_tx_request_t *txreq;
663 	struct xni_pkt *pkt;
664 	vaddr_t pkt_va;
665 	struct mbuf *m;
666 	int receive_pending, err;
667 	RING_IDX req_cons;
668 
669 	XENPRINTF(("xennetback_evthandler "));
670 	req_cons = xneti->xni_txring.req_cons;
671 	xen_rmb();
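	/*
	 * Consume transmit requests posted by the frontend: validate each
	 * one, map the granted page read-only, copy the frame into a local
	 * mbuf, hand it to the network stack and post a response on the
	 * TX ring.
	 */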
672 	while (1) {
673 		xen_rmb(); /* be sure to read the request before updating */
674 		xneti->xni_txring.req_cons = req_cons;
675 		xen_wmb();
676 		RING_FINAL_CHECK_FOR_REQUESTS(&xneti->xni_txring,
677 		    receive_pending);
678 		if (receive_pending == 0)
679 			break;
680 		txreq = RING_GET_REQUEST(&xneti->xni_txring, req_cons);
681 		xen_rmb();
682 		XENPRINTF(("%s pkt size %d\n", xneti->xni_if.if_xname,
683 		    txreq->size));
684 		req_cons++;
685 		if (__predict_false((ifp->if_flags & (IFF_UP | IFF_RUNNING)) !=
686 		    (IFF_UP | IFF_RUNNING))) {
687 			/* interface not up, drop */
688 			xennetback_tx_response(xneti, txreq->id,
689 			    NETIF_RSP_DROPPED);
690 			continue;
691 		}
692 		/*
693 		 * Do some sanity checks, and map the packet's page.
694 		 */
695 		if (__predict_false(txreq->size < ETHER_HDR_LEN ||
696 		   txreq->size > (ETHER_MAX_LEN - ETHER_CRC_LEN))) {
697 			printf("%s: bad packet size %d\n",
698 			    ifp->if_xname, txreq->size);
699 			xennetback_tx_response(xneti, txreq->id,
700 			    NETIF_RSP_ERROR);
701 			ifp->if_ierrors++;
702 			continue;
703 		}
704 		/* don't cross page boundaries */
705 		if (__predict_false(
706 		    txreq->offset + txreq->size > PAGE_SIZE)) {
707 			printf("%s: packet crosses page boundary\n",
708 			    ifp->if_xname);
709 			xennetback_tx_response(xneti, txreq->id,
710 			    NETIF_RSP_ERROR);
711 			ifp->if_ierrors++;
712 			continue;
713 		}
714 		/* get a mbuf for this packet */
715 		MGETHDR(m, M_DONTWAIT, MT_DATA);
716 		if (__predict_false(m == NULL)) {
717 			static struct timeval lasttime;
718 			if (ratecheck(&lasttime, &xni_pool_errintvl))
719 				printf("%s: mbuf alloc failed\n",
720 				    ifp->if_xname);
721 			xennetback_tx_response(xneti, txreq->id,
722 			    NETIF_RSP_DROPPED);
723 			ifp->if_ierrors++;
724 			continue;
725 		}
726 
727 		XENPRINTF(("%s pkt offset %d size %d id %d req_cons %d\n",
728 		    xneti->xni_if.if_xname, txreq->offset,
729 		    txreq->size, txreq->id, MASK_NETIF_TX_IDX(req_cons)));
730 
731 		pkt = pool_get(&xni_pkt_pool, PR_NOWAIT);
732 		if (__predict_false(pkt == NULL)) {
733 			static struct timeval lasttime;
734 			if (ratecheck(&lasttime, &xni_pool_errintvl))
735 				printf("%s: xnbpkt alloc failed\n",
736 				    ifp->if_xname);
737 			xennetback_tx_response(xneti, txreq->id,
738 			    NETIF_RSP_DROPPED);
739 			ifp->if_ierrors++;
740 			m_freem(m);
741 			continue;
742 		}
743 		err = xen_shm_map(1, xneti->xni_domid, &txreq->gref, &pkt_va,
744 		    &pkt->pkt_handle, XSHM_RO);
745 		if (__predict_false(err == ENOMEM)) {
746 			xennetback_tx_response(xneti, txreq->id,
747 			    NETIF_RSP_DROPPED);
748 			ifp->if_ierrors++;
749 			pool_put(&xni_pkt_pool, pkt);
750 			m_freem(m);
751 			continue;
752 		}
753 
754 		if (__predict_false(err)) {
755 			printf("%s: mapping foreign page failed: %d\n",
756 			    xneti->xni_if.if_xname, err);
757 			xennetback_tx_response(xneti, txreq->id,
758 			    NETIF_RSP_ERROR);
759 			ifp->if_ierrors++;
760 			pool_put(&xni_pkt_pool, pkt);
761 			m_freem(m);
762 			continue;
763 		}
764 
765 		if ((ifp->if_flags & IFF_PROMISC) == 0) {
766 			struct ether_header *eh =
767 			    (void*)(pkt_va + txreq->offset);
768 			if (ETHER_IS_MULTICAST(eh->ether_dhost) == 0 &&
769 			    memcmp(CLLADDR(ifp->if_sadl), eh->ether_dhost,
770 			    ETHER_ADDR_LEN) != 0) {
771 				xni_pkt_unmap(pkt, pkt_va);
772 				m_freem(m);
773 				xennetback_tx_response(xneti, txreq->id,
774 				    NETIF_RSP_OKAY);
775 				continue; /* packet is not for us */
776 			}
777 		}
778 #ifdef notyet
779 		/* a lot of work is needed in the tcp stack to handle
780 		 * read-only ext storage, so always copy for now */
781 		if (((req_cons + 1) & (NET_TX_RING_SIZE - 1)) ==
782 		    (xneti->xni_txring.rsp_prod_pvt & (NET_TX_RING_SIZE - 1)))
783 #else
784 		if (1)
785 #endif /* notyet */
786 		{
787 			/*
788 			 * This is the last TX buffer. Copy the data and
789 			 * ack it. Delaying it until the mbuf is
790 			 * freed will stall transmit.
791 			 */
792 			m->m_len = min(MHLEN, txreq->size);
793 			m->m_pkthdr.len = 0;
794 			m_copyback(m, 0, txreq->size,
795 			    (void *)(pkt_va + txreq->offset));
796 			xni_pkt_unmap(pkt, pkt_va);
797 			if (m->m_pkthdr.len < txreq->size) {
798 				ifp->if_ierrors++;
799 				m_freem(m);
800 				xennetback_tx_response(xneti, txreq->id,
801 				    NETIF_RSP_DROPPED);
802 				continue;
803 			}
804 			xennetback_tx_response(xneti, txreq->id,
805 			    NETIF_RSP_OKAY);
806 		} else {
807 
808 			pkt->pkt_id = txreq->id;
809 			pkt->pkt_xneti = xneti;
810 
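			/*
			 * Zero-copy path (unused while the forced copy above
			 * is in place): attach the mapped page as read-only
			 * external storage; the grant is unmapped and the TX
			 * request acknowledged in xennetback_tx_free() once
			 * the stack releases the mbuf.
			 */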
811 			MEXTADD(m, pkt_va + txreq->offset,
812 			    txreq->size, M_DEVBUF, xennetback_tx_free, pkt);
813 			m->m_pkthdr.len = m->m_len = txreq->size;
814 			m->m_flags |= M_EXT_ROMAP;
815 		}
816 		if ((txreq->flags & NETTXF_csum_blank) != 0) {
817 			xennet_checksum_fill(&m);
818 			if (m == NULL) {
819 				ifp->if_ierrors++;
820 				continue;
821 			}
822 		}
823 		m->m_pkthdr.rcvif = ifp;
824 		ifp->if_ipackets++;
825 
826 		bpf_mtap(ifp, m);
827 		(*ifp->if_input)(ifp, m);
828 	}
829 	xen_rmb(); /* be sure to read the request before updating pointer */
830 	xneti->xni_txring.req_cons = req_cons;
831 	xen_wmb();
832 	/* check to see if we can transmit more packets */
833 	softint_schedule(xneti->xni_softintr);
834 
835 	return 1;
836 }
837 
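/*
 * External-storage free callback for the zero-copy TX path: called when
 * the network stack is done with the read-only mapped page, so we can
 * acknowledge the TX request and unmap the grant.
 */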
838 static void
839 xennetback_tx_free(struct mbuf *m, void *va, size_t size, void *arg)
840 {
841 	int s = splnet();
842 	struct xni_pkt *pkt = arg;
843 	struct xnetback_instance *xneti = pkt->pkt_xneti;
844 
845 	XENPRINTF(("xennetback_tx_free\n"));
846 
847 	xennetback_tx_response(xneti, pkt->pkt_id, NETIF_RSP_OKAY);
848 
849 	xni_pkt_unmap(pkt, (vaddr_t)va & ~PAGE_MASK);
850 
851 	if (m)
852 		pool_cache_put(mb_cache, m);
853 	splx(s);
854 }
855 
856 static int
857 xennetback_ifioctl(struct ifnet *ifp, u_long cmd, void *data)
858 {
859 	//struct xnetback_instance *xneti = ifp->if_softc;
860 	//struct ifreq *ifr = (struct ifreq *)data;
861 	int s, error;
862 
863 	s = splnet();
864 	error = ether_ioctl(ifp, cmd, data);
865 	if (error == ENETRESET)
866 		error = 0;
867 	splx(s);
868 	return error;
869 }
870 
871 static void
872 xennetback_ifstart(struct ifnet *ifp)
873 {
874 	struct xnetback_instance *xneti = ifp->if_softc;
875 
876 	/*
877 	 * The Xen communication channel is much more efficient if we can
878 	 * schedule a batch of packets for the domain. To achieve this, we
879 	 * schedule a soft interrupt, and just return. This way, the network
880 	 * stack will enqueue all pending mbufs in the interface's send queue
881 	 * before they are processed by the soft interrupt handler.
882 	 */
883 	softint_schedule(xneti->xni_softintr);
884 }
885 
886 static void
887 xennetback_ifsoftstart_transfer(void *arg)
888 {
889 	struct xnetback_instance *xneti = arg;
890 	struct ifnet *ifp = &xneti->xni_if;
891 	struct mbuf *m;
892 	vaddr_t xmit_va;
893 	paddr_t xmit_pa;
894 	paddr_t xmit_ma;
895 	paddr_t newp_ma = 0; /* XXX gcc */
896 	int i, j, nppitems;
897 	mmu_update_t *mmup;
898 	multicall_entry_t *mclp;
899 	netif_rx_response_t *rxresp;
900 	RING_IDX req_prod, resp_prod;
901 	int do_event = 0;
902 	gnttab_transfer_t *gop;
903 	int id, offset;
904 
905 	XENPRINTF(("xennetback_ifsoftstart_transfer "));
906 	int s = splnet();
907 	if (__predict_false(
908 	    (ifp->if_flags & (IFF_RUNNING|IFF_OACTIVE)) != IFF_RUNNING)) {
909 		splx(s);
910 		return;
911 	}
912 
913 	while (!IFQ_IS_EMPTY(&ifp->if_snd)) {
914 		XENPRINTF(("pkt\n"));
915 		req_prod = xneti->xni_rxring.sring->req_prod;
916 		resp_prod = xneti->xni_rxring.rsp_prod_pvt;
917 		xen_rmb();
918 
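		/*
		 * Build a batch of up to NB_XMIT_PAGES_BATCH packets: each
		 * one either gives its cluster page away or is copied into a
		 * transferable page, queueing the matching MMU update,
		 * multicall entry and grant transfer, which are all issued
		 * together below.
		 */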
919 		mmup = xstart_mmu;
920 		mclp = xstart_mcl;
921 		gop = xstart_gop_transfer;
922 		for (nppitems = 0, i = 0; !IFQ_IS_EMPTY(&ifp->if_snd);) {
923 			XENPRINTF(("have a packet\n"));
924 			IFQ_POLL(&ifp->if_snd, m);
925 			if (__predict_false(m == NULL))
926 				panic("xennetback_ifstart: IFQ_POLL");
927 			if (__predict_false(
928 			    req_prod == xneti->xni_rxring.req_cons ||
929 			    xneti->xni_rxring.req_cons - resp_prod ==
930 			    NET_RX_RING_SIZE)) {
931 				/* out of ring space */
932 				XENPRINTF(("xennetback_ifstart: ring full "
933 				    "req_prod 0x%x req_cons 0x%x resp_prod "
934 				    "0x%x\n",
935 				    req_prod, xneti->xni_rxring.req_cons,
936 				    resp_prod));
937 				ifp->if_timer = 1;
938 				break;
939 			}
940 			if (__predict_false(i == NB_XMIT_PAGES_BATCH))
941 				break; /* we filled the array */
942 			if (__predict_false(
943 			    xennetback_get_mcl_page(&newp_ma) != 0))
944 				break; /* out of memory */
945 			if ((m->m_flags & M_CLUSTER) != 0 &&
946 			    !M_READONLY(m) && MCLBYTES == PAGE_SIZE) {
947 				/* we can give this page away */
948 				xmit_pa = m->m_ext.ext_paddr;
949 				xmit_ma = xpmap_ptom(xmit_pa);
950 				xmit_va = (vaddr_t)m->m_ext.ext_buf;
951 				KASSERT(xmit_pa != M_PADDR_INVALID);
952 				KASSERT((xmit_va & PAGE_MASK) == 0);
953 				offset = m->m_data - m->m_ext.ext_buf;
954 			} else {
955 				/* we have to copy the packet */
956 				xmit_va = (vaddr_t)pool_cache_get_paddr(
957 				    xmit_pages_cachep,
958 				    PR_NOWAIT, &xmit_pa);
959 				if (__predict_false(xmit_va == 0))
960 					break; /* out of memory */
961 
962 				KASSERT(xmit_pa != POOL_PADDR_INVALID);
963 				xmit_ma = xpmap_ptom(xmit_pa);
964 				XENPRINTF(("xennetback_get_xmit_page: got va "
965 				    "0x%x ma 0x%x\n", (u_int)xmit_va,
966 				    (u_int)xmit_ma));
967 				m_copydata(m, 0, m->m_pkthdr.len,
968 				    (char *)xmit_va + LINUX_REQUESTED_OFFSET);
969 				offset = LINUX_REQUESTED_OFFSET;
970 				pages_pool_free[nppitems].va = xmit_va;
971 				pages_pool_free[nppitems].pa = xmit_pa;
972 				nppitems++;
973 			}
974 			/* start filling ring */
975 			gop->ref = RING_GET_REQUEST(&xneti->xni_rxring,
976 			    xneti->xni_rxring.req_cons)->gref;
977 			id = RING_GET_REQUEST(&xneti->xni_rxring,
978 			    xneti->xni_rxring.req_cons)->id;
979 			xen_rmb();
980 			xneti->xni_rxring.req_cons++;
981 			rxresp = RING_GET_RESPONSE(&xneti->xni_rxring,
982 			    resp_prod);
983 			rxresp->id = id;
984 			rxresp->offset = offset;
985 			rxresp->status = m->m_pkthdr.len;
986 			if ((m->m_pkthdr.csum_flags &
987 			    (M_CSUM_TCPv4 | M_CSUM_UDPv4)) != 0) {
988 				rxresp->flags = NETRXF_csum_blank;
989 			} else {
990 				rxresp->flags = 0;
991 			}
992 			/*
993 			 * Transfer the page containing the packet to the
994 			 * remote domain, and map newp in its place.
995 			 */
996 			xpmap_phys_to_machine_mapping[
997 			    (xmit_pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
998 			    newp_ma >> PAGE_SHIFT;
999 			MULTI_update_va_mapping(mclp, xmit_va,
1000 			    newp_ma | PG_V | PG_RW | PG_U | PG_M, 0);
1001 			mclp++;
1002 			gop->mfn = xmit_ma >> PAGE_SHIFT;
1003 			gop->domid = xneti->xni_domid;
1004 			gop++;
1005 
1006 			mmup->ptr = newp_ma | MMU_MACHPHYS_UPDATE;
1007 			mmup->val = (xmit_pa - XPMAP_OFFSET) >> PAGE_SHIFT;
1008 			mmup++;
1009 
1010 			/* done with this packet */
1011 			IFQ_DEQUEUE(&ifp->if_snd, m);
1012 			mbufs_sent[i] = m;
1013 			resp_prod++;
1014 			i++; /* this packet has been queued */
1015 			ifp->if_opackets++;
1016 			bpf_mtap(ifp, m);
1017 		}
1018 		if (i != 0) {
1019 			/*
1020 			 * We may have allocated buffers which have entries
1021 			 * outstanding in the page update queue -- make sure
1022 			 * we flush those first!
1023 			 */
1024 			int svm = splvm();
1025 			xpq_flush_queue();
1026 			splx(svm);
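			/*
			 * Ask for a TLB flush on the last of the queued
			 * update_va_mapping multicalls, then append the
			 * batched mmu_update as the final multicall entry.
			 */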
1027 			mclp[-1].args[MULTI_UVMFLAGS_INDEX] =
1028 			    UVMF_TLB_FLUSH|UVMF_ALL;
1029 			mclp->op = __HYPERVISOR_mmu_update;
1030 			mclp->args[0] = (unsigned long)xstart_mmu;
1031 			mclp->args[1] = i;
1032 			mclp->args[2] = 0;
1033 			mclp->args[3] = DOMID_SELF;
1034 			mclp++;
1035 			/* update the MMU */
1036 			if (HYPERVISOR_multicall(xstart_mcl, i + 1) != 0) {
1037 				panic("%s: HYPERVISOR_multicall failed",
1038 				    ifp->if_xname);
1039 			}
1040 			for (j = 0; j < i + 1; j++) {
1041 				if (xstart_mcl[j].result != 0) {
1042 					printf("%s: xstart_mcl[%d] "
1043 					    "failed (%lu)\n", ifp->if_xname,
1044 					    j, xstart_mcl[j].result);
1045 					printf("%s: req_prod %u req_cons "
1046 					    "%u rsp_prod %u rsp_prod_pvt %u "
1047 					    "i %u\n",
1048 					    ifp->if_xname,
1049 					    xneti->xni_rxring.sring->req_prod,
1050 					    xneti->xni_rxring.req_cons,
1051 					    xneti->xni_rxring.sring->rsp_prod,
1052 					    xneti->xni_rxring.rsp_prod_pvt,
1053 					    i);
1054 				}
1055 			}
1056 			if (HYPERVISOR_grant_table_op(GNTTABOP_transfer,
1057 			    xstart_gop_transfer, i) != 0) {
1058 				panic("%s: GNTTABOP_transfer failed",
1059 				    ifp->if_xname);
1060 			}
1061 
1062 			for (j = 0; j < i; j++) {
1063 				if (xstart_gop_transfer[j].status != GNTST_okay) {
1064 					printf("%s GNTTABOP_transfer[%d] %d\n",
1065 					    ifp->if_xname,
1066 					    j, xstart_gop_transfer[j].status);
1067 					printf("%s: req_prod %u req_cons "
1068 					    "%u rsp_prod %u rsp_prod_pvt %u "
1069 					    "i %d\n",
1070 					    ifp->if_xname,
1071 					    xneti->xni_rxring.sring->req_prod,
1072 					    xneti->xni_rxring.req_cons,
1073 					    xneti->xni_rxring.sring->rsp_prod,
1074 					    xneti->xni_rxring.rsp_prod_pvt,
1075 					    i);
1076 					rxresp = RING_GET_RESPONSE(
1077 					    &xneti->xni_rxring,
1078 					    xneti->xni_rxring.rsp_prod_pvt + j);
1079 					rxresp->status = NETIF_RSP_ERROR;
1080 				}
1081 			}
1082 
1083 			/* update pointer */
1084 			KASSERT(
1085 			    xneti->xni_rxring.rsp_prod_pvt + i == resp_prod);
1086 			xneti->xni_rxring.rsp_prod_pvt = resp_prod;
1087 			RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(
1088 			    &xneti->xni_rxring, j);
1089 			if (j)
1090 				do_event = 1;
1091 			/* now we can free the mbufs */
1092 			for (j = 0; j < i; j++) {
1093 				m_freem(mbufs_sent[j]);
1094 			}
1095 			for (j = 0; j < nppitems; j++) {
1096 				pool_cache_put_paddr(xmit_pages_cachep,
1097 				    (void *)pages_pool_free[j].va,
1098 				    pages_pool_free[j].pa);
1099 			}
1100 		}
1101 		/* send event */
1102 		if (do_event) {
1103 			xen_rmb();
1104 			XENPRINTF(("%s receive event\n",
1105 			    xneti->xni_if.if_xname));
1106 			hypervisor_notify_via_evtchn(xneti->xni_evtchn);
1107 			do_event = 0;
1108 		}
1109 		/* check if we need to get back some pages */
1110 		if (mcl_pages_alloc < 0) {
1111 			xennetback_get_new_mcl_pages();
1112 			if (mcl_pages_alloc < 0) {
1113 				/*
1114 				 * setup the watchdog to try again, because
1115 				 * xennetback_ifstart() will never be called
1116 				 * again if queue is full.
1117 				 */
1118 				printf("xennetback_ifstart: no mcl_pages\n");
1119 				ifp->if_timer = 1;
1120 				break;
1121 			}
1122 		}
1123 		/*
1124 		 * note that we don't use RING_FINAL_CHECK_FOR_REQUESTS()
1125 		 * here, as the frontend doesn't notify when adding
1126 		 * requests anyway
1127 		 */
1128 		if (__predict_false(
1129 		    !RING_HAS_UNCONSUMED_REQUESTS(&xneti->xni_rxring))) {
1130 			/* ring full */
1131 			break;
1132 		}
1133 	}
1134 	splx(s);
1135 }
1136 
1137 static void
1138 xennetback_ifsoftstart_copy(void *arg)
1139 {
1140 	struct xnetback_instance *xneti = arg;
1141 	struct ifnet *ifp = &xneti->xni_if;
1142 	struct mbuf *m, *new_m;
1143 	paddr_t xmit_pa;
1144 	paddr_t xmit_ma;
1145 	int i, j;
1146 	netif_rx_response_t *rxresp;
1147 	RING_IDX req_prod, resp_prod;
1148 	int do_event = 0;
1149 	gnttab_copy_t *gop;
1150 	int id, offset;
1151 
1152 	XENPRINTF(("xennetback_ifsoftstart_copy "));
1153 	int s = splnet();
1154 	if (__predict_false(
1155 	    (ifp->if_flags & (IFF_RUNNING|IFF_OACTIVE)) != IFF_RUNNING)) {
1156 		splx(s);
1157 		return;
1158 	}
1159 
1160 	while (!IFQ_IS_EMPTY(&ifp->if_snd)) {
1161 		XENPRINTF(("pkt\n"));
1162 		req_prod = xneti->xni_rxring.sring->req_prod;
1163 		resp_prod = xneti->xni_rxring.rsp_prod_pvt;
1164 		xen_rmb();
1165 
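		/*
		 * Copy mode: each packet must be contiguous and fit in one
		 * page, so fragmented or oversized mbufs are first
		 * linearized into a fresh mbuf (cluster). The hypervisor
		 * then copies the data into the page granted by the
		 * frontend (GNTTABOP_copy); no page flipping is needed.
		 */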
1166 		gop = xstart_gop_copy;
1167 		for (i = 0; !IFQ_IS_EMPTY(&ifp->if_snd);) {
1168 			XENPRINTF(("have a packet\n"));
1169 			IFQ_POLL(&ifp->if_snd, m);
1170 			if (__predict_false(m == NULL))
1171 				panic("xennetback_ifstart: IFQ_POLL");
1172 			if (__predict_false(
1173 			    req_prod == xneti->xni_rxring.req_cons ||
1174 			    xneti->xni_rxring.req_cons - resp_prod ==
1175 			    NET_RX_RING_SIZE)) {
1176 				/* out of ring space */
1177 				XENPRINTF(("xennetback_ifstart: ring full "
1178 				    "req_prod 0x%x req_cons 0x%x resp_prod "
1179 				    "0x%x\n",
1180 				    req_prod, xneti->xni_rxring.req_cons,
1181 				    resp_prod));
1182 				ifp->if_timer = 1;
1183 				break;
1184 			}
1185 			if (__predict_false(i == NB_XMIT_PAGES_BATCH))
1186 				break; /* we filled the array */
1187 			switch (m->m_flags & (M_EXT|M_EXT_CLUSTER)) {
1188 			case M_EXT|M_EXT_CLUSTER:
1189 				KASSERT(m->m_ext.ext_paddr != M_PADDR_INVALID);
1190 				xmit_pa = m->m_ext.ext_paddr;
1191 				offset = m->m_data - m->m_ext.ext_buf;
1192 				break;
1193 			case 0:
1194 				KASSERT(m->m_paddr != M_PADDR_INVALID);
1195 				xmit_pa = m->m_paddr;
1196 				offset = M_BUFOFFSET(m) +
1197 				    (m->m_data - M_BUFADDR(m));
1198 				break;
1199 			default:
1200 				if (__predict_false(
1201 				    !pmap_extract(pmap_kernel(),
1202 				    (vaddr_t)m->m_data, &xmit_pa))) {
1203 					panic("xennet_start: no pa");
1204 				}
1205 				offset = 0;
1206 				break;
1207 			}
1208 			offset += (xmit_pa & ~PG_FRAME);
1209 			xmit_pa = (xmit_pa & PG_FRAME);
1210 			if (m->m_pkthdr.len != m->m_len ||
1211 			    (offset + m->m_pkthdr.len) > PAGE_SIZE) {
1212 				MGETHDR(new_m, M_DONTWAIT, MT_DATA);
1213 				if (__predict_false(new_m == NULL)) {
1214 					printf("%s: cannot allocate new mbuf\n",
1215 					    ifp->if_xname);
1216 					break;
1217 				}
1218 				if (m->m_pkthdr.len > MHLEN) {
1219 					MCLGET(new_m, M_DONTWAIT);
1220 					if (__predict_false(
1221 					    (new_m->m_flags & M_EXT) == 0)) {
1222 						XENPRINTF((
1223 						    "%s: no mbuf cluster\n",
1224 						    ifp->if_xname));
1225 						m_freem(new_m);
1226 						break;
1227 					}
1228 					xmit_pa = new_m->m_ext.ext_paddr;
1229 					offset = new_m->m_data -
1230 					    new_m->m_ext.ext_buf;
1231 				} else {
1232 					xmit_pa = new_m->m_paddr;
1233 					offset = M_BUFOFFSET(new_m) +
1234 					    (new_m->m_data - M_BUFADDR(new_m));
1235 				}
1236 				offset += (xmit_pa & ~PG_FRAME);
1237 				xmit_pa = (xmit_pa & PG_FRAME);
1238 				m_copydata(m, 0, m->m_pkthdr.len,
1239 				    mtod(new_m, void *));
1240 				new_m->m_len = new_m->m_pkthdr.len =
1241 				    m->m_pkthdr.len;
1242 				IFQ_DEQUEUE(&ifp->if_snd, m);
1243 				m_freem(m);
1244 				m = new_m;
1245 			} else {
1246 				IFQ_DEQUEUE(&ifp->if_snd, m);
1247 			}
1248 
1249 			KASSERT(xmit_pa != POOL_PADDR_INVALID);
1250 			KASSERT((offset + m->m_pkthdr.len) <= PAGE_SIZE);
1251 			xmit_ma = xpmap_ptom(xmit_pa);
1252 			/* start filling ring */
1253 			gop->flags = GNTCOPY_dest_gref;
1254 			gop->source.offset = offset;
1255 			gop->source.domid = DOMID_SELF;
1256 			gop->source.u.gmfn = xmit_ma >> PAGE_SHIFT;
1257 
1258 			gop->dest.u.ref = RING_GET_REQUEST(&xneti->xni_rxring,
1259 			    xneti->xni_rxring.req_cons)->gref;
1260 			gop->dest.offset = 0;
1261 			gop->dest.domid = xneti->xni_domid;
1262 
1263 			gop->len = m->m_pkthdr.len;
1264 			gop++;
1265 
1266 			id = RING_GET_REQUEST(&xneti->xni_rxring,
1267 			    xneti->xni_rxring.req_cons)->id;
1268 			xen_rmb();
1269 			xneti->xni_rxring.req_cons++;
1270 			rxresp = RING_GET_RESPONSE(&xneti->xni_rxring,
1271 			    resp_prod);
1272 			rxresp->id = id;
1273 			rxresp->offset = 0;
1274 			rxresp->status = m->m_pkthdr.len;
1275 			if ((m->m_pkthdr.csum_flags &
1276 			    (M_CSUM_TCPv4 | M_CSUM_UDPv4)) != 0) {
1277 				rxresp->flags = NETRXF_csum_blank;
1278 			} else {
1279 				rxresp->flags = 0;
1280 			}
1281 
1282 			mbufs_sent[i] = m;
1283 			resp_prod++;
1284 			i++; /* this packet has been queued */
1285 			ifp->if_opackets++;
1286 			bpf_mtap(ifp, m);
1287 		}
1288 		if (i != 0) {
1289 			if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
1290 			    xstart_gop_copy, i) != 0) {
1291 				panic("%s: GNTTABOP_copy failed",
1292 				    ifp->if_xname);
1293 			}
1294 
1295 			for (j = 0; j < i; j++) {
1296 				if (xstart_gop_copy[j].status != GNTST_okay) {
1297 					printf("%s GNTTABOP_copy[%d] %d\n",
1298 					    ifp->if_xname,
1299 					    j, xstart_gop_copy[j].status);
1300 					printf("%s: req_prod %u req_cons "
1301 					    "%u rsp_prod %u rsp_prod_pvt %u "
1302 					    "i %d\n",
1303 					    ifp->if_xname,
1304 					    xneti->xni_rxring.sring->req_prod,
1305 					    xneti->xni_rxring.req_cons,
1306 					    xneti->xni_rxring.sring->rsp_prod,
1307 					    xneti->xni_rxring.rsp_prod_pvt,
1308 					    i);
1309 					rxresp = RING_GET_RESPONSE(
1310 					    &xneti->xni_rxring,
1311 					    xneti->xni_rxring.rsp_prod_pvt + j);
1312 					rxresp->status = NETIF_RSP_ERROR;
1313 				}
1314 			}
1315 
1316 			/* update pointer */
1317 			KASSERT(
1318 			    xneti->xni_rxring.rsp_prod_pvt + i == resp_prod);
1319 			xneti->xni_rxring.rsp_prod_pvt = resp_prod;
1320 			RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(
1321 			    &xneti->xni_rxring, j);
1322 			if (j)
1323 				do_event = 1;
1324 			/* now we can free the mbufs */
1325 			for (j = 0; j < i; j++) {
1326 				m_freem(mbufs_sent[j]);
1327 			}
1328 		}
1329 		/* send event */
1330 		if (do_event) {
1331 			xen_rmb();
1332 			XENPRINTF(("%s receive event\n",
1333 			    xneti->xni_if.if_xname));
1334 			hypervisor_notify_via_evtchn(xneti->xni_evtchn);
1335 			do_event = 0;
1336 		}
1337 		/*
1338 		 * note that we don't use RING_FINAL_CHECK_FOR_REQUESTS()
1339 		 * here, as the frontend doesn't notify when adding
1340 		 * requests anyway
1341 		 */
1342 		if (__predict_false(
1343 		    !RING_HAS_UNCONSUMED_REQUESTS(&xneti->xni_rxring))) {
1344 			/* ring full */
1345 			break;
1346 		}
1347 	}
1348 	splx(s);
1349 }
1350 
1351 
1352 static void
1353 xennetback_ifwatchdog(struct ifnet * ifp)
1354 {
1355 	/*
1356 	 * We can get into the following condition: transmit stalls
1357 	 * because the ring is full while the ifq is full too. In this
1358 	 * case (as, unfortunately, we don't get an interrupt from xen
1359 	 * on transmit) nothing will ever call xennetback_ifstart()
1360 	 * again. Here we abuse the watchdog to get out of this condition.
1361 	 */
1362 	XENPRINTF(("xennetback_ifwatchdog\n"));
1363 	xennetback_ifstart(ifp);
1364 }
1365 
1366 
1367 static int
1368 xennetback_ifinit(struct ifnet *ifp)
1369 {
1370 	struct xnetback_instance *xneti = ifp->if_softc;
1371 	int s = splnet();
1372 
1373 	if ((ifp->if_flags & IFF_UP) == 0) {
1374 		splx(s);
1375 		return 0;
1376 	}
1377 	if (xneti->xni_status == CONNECTED)
1378 		ifp->if_flags |= IFF_RUNNING;
1379 	splx(s);
1380 	return 0;
1381 }
1382 
1383 static void
1384 xennetback_ifstop(struct ifnet *ifp, int disable)
1385 {
1386 	struct xnetback_instance *xneti = ifp->if_softc;
1387 	int s = splnet();
1388 
1389 	ifp->if_flags &= ~(IFF_RUNNING | IFF_OACTIVE);
1390 	ifp->if_timer = 0;
1391 	if (xneti->xni_status == CONNECTED) {
1392 		XENPRINTF(("%s: req_prod 0x%x resp_prod 0x%x req_cons 0x%x "
1393 		    "event 0x%x\n", ifp->if_xname, xneti->xni_txring.sring->req_prod,
1394 		    xneti->xni_txring.sring->rsp_prod, xneti->xni_txring.req_cons,
1395 		    xneti->xni_txring.sring->rsp_event));
1396 		xennetback_evthandler(ifp->if_softc); /* flush pending RX requests */
1397 	}
1398 	splx(s);
1399 }
1400