xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision 48847494)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifdef DEBUG
28 #define	XNB_DEBUG 1
29 #endif /* DEBUG */
30 
31 #include "xnb.h"
32 
33 #include <sys/sunddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/modctl.h>
36 #include <sys/conf.h>
37 #include <sys/mac.h>
38 #include <sys/mac_impl.h> /* XXXXBOW - remove, included for mac_fix_cksum() */
39 #include <sys/dlpi.h>
40 #include <sys/strsubr.h>
41 #include <sys/strsun.h>
42 #include <sys/types.h>
43 #include <sys/pattr.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/hat_i86.h>
46 #include <xen/sys/xenbus_impl.h>
47 #include <xen/sys/xendev.h>
48 #include <sys/balloon_impl.h>
49 #include <sys/evtchn_impl.h>
50 #include <sys/gnttab.h>
51 #include <vm/vm_dep.h>
52 
53 #include <sys/gld.h>
54 #include <inet/ip.h>
55 #include <inet/ip_impl.h>
56 #include <sys/vnic_impl.h> /* blech. */
57 
58 /*
59  * The terms "transmit" and "receive" are used in alignment with domU,
60  * which means that packets originating from the peer domU are "transmitted"
61  * to other parts of the system and packets are "received" from them.
62  */
63 
64 /*
65  * XXPV dme: things to do, as well as various things indicated
66  * throughout the source:
67  * - copy avoidance outbound.
68  * - copy avoidance inbound.
69  * - transfer credit limiting.
70  * - MAC address based filtering.
71  */
72 
73 /*
74  * Linux expects to have some headroom in received buffers.  The Linux
75  * frontend driver (netfront) checks to see if the headroom is
76  * available and will re-allocate the buffer to make room if
77  * necessary.  To avoid this we add RX_BUFFER_HEADROOM bytes of
78  * headroom to each packet we pass to the peer.
79  */
80 #define	RX_BUFFER_HEADROOM	16
81 
82 /*
83  * Should we attempt to defer checksum calculation?
84  */
85 static boolean_t	xnb_cksum_offload = B_TRUE;
86 /*
87  * When receiving packets from a guest, should they be copied
88  * or used as-is (esballoc)?
89  */
90 static boolean_t	xnb_tx_always_copy = B_TRUE;
91 
92 static boolean_t	xnb_connect_rings(dev_info_t *);
93 static void		xnb_disconnect_rings(dev_info_t *);
94 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
95     void *, void *);
96 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
97     void *, void *);
98 
99 static int	xnb_txbuf_constructor(void *, void *, int);
100 static void	xnb_txbuf_destructor(void *, void *);
101 static xnb_txbuf_t *xnb_txbuf_get(xnb_t *, int);
102 static void	xnb_txbuf_put(xnb_t *, xnb_txbuf_t *);
103 static void	xnb_tx_notify_peer(xnb_t *);
104 static void	xnb_tx_complete(xnb_txbuf_t *);
105 static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
106 static void 	xnb_tx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
107     xnb_txbuf_t *);
108 static void	xnb_tx_perform_pending_unmop(xnb_t *);
109 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
110 
/*
 * Low and high watermarks: one quarter of NET_TX_RING_SIZE, and
 * NET_TX_RING_SIZE less one quarter, respectively.
 * NOTE(review): presumably thresholds governing when pending grant
 * unmap operations are flushed -- confirm against the users of these
 * variables, which are not visible from here.
 */
int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);


/*
 * Values written to xenstore by xnb_attach() as "feature-rx-copy"
 * (xnb_hv_copy) and "feature-rx-flip" (xnb_explicit_pageflip_set).
 */
boolean_t	xnb_hv_copy = B_TRUE;
boolean_t	xnb_explicit_pageflip_set = B_FALSE;

/* XXPV dme: are these really invalid? */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)

/* NOTE(review): presumably the kmem cache for xnb_txbuf_t -- confirm. */
static kmem_cache_t *xnb_txbuf_cachep;
/* Protects the page batch shared by all instances in xnb_alloc_page(). */
static kmutex_t	xnb_alloc_page_lock;
124 
125 /*
126  * Statistics.
127  */
/*
 * Names of the auxiliary named kstats.  xnb_ks_init() creates one
 * kstat entry per name, and xnb_ks_aux_update() stores the values
 * positionally, so the order here must match the order of the
 * assignments in xnb_ks_aux_update().
 */
static char *aux_statistics[] = {
	"rx_cksum_deferred",
	"tx_cksum_no_need",
	"rx_rsp_notok",
	"tx_notify_deferred",
	"tx_notify_sent",
	"rx_notify_deferred",
	"rx_notify_sent",
	"tx_too_early",
	"rx_too_early",
	"rx_allocb_failed",
	"tx_allocb_failed",
	"rx_foreign_page",
	"mac_full",
	"spurious_intr",
	"allocation_success",
	"allocation_failure",
	"small_allocation_success",
	"small_allocation_failure",
	"other_allocation_failure",
	"rx_pageboundary_crossed",
	"rx_cpoparea_grown",
	"csum_hardware",
	"csum_software",
};
153 
154 static int
155 xnb_ks_aux_update(kstat_t *ksp, int flag)
156 {
157 	xnb_t *xnbp;
158 	kstat_named_t *knp;
159 
160 	if (flag != KSTAT_READ)
161 		return (EACCES);
162 
163 	xnbp = ksp->ks_private;
164 	knp = ksp->ks_data;
165 
166 	/*
167 	 * Assignment order should match that of the names in
168 	 * aux_statistics.
169 	 */
170 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
171 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
172 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
173 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
192 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
193 
194 	return (0);
195 }
196 
197 static boolean_t
198 xnb_ks_init(xnb_t *xnbp)
199 {
200 	int nstat = sizeof (aux_statistics) /
201 	    sizeof (aux_statistics[0]);
202 	char **cp = aux_statistics;
203 	kstat_named_t *knp;
204 
205 	/*
206 	 * Create and initialise kstats.
207 	 */
208 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
209 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
210 	    KSTAT_TYPE_NAMED, nstat, 0);
211 	if (xnbp->xnb_kstat_aux == NULL)
212 		return (B_FALSE);
213 
214 	xnbp->xnb_kstat_aux->ks_private = xnbp;
215 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
216 
217 	knp = xnbp->xnb_kstat_aux->ks_data;
218 	while (nstat > 0) {
219 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
220 
221 		knp++;
222 		cp++;
223 		nstat--;
224 	}
225 
226 	kstat_install(xnbp->xnb_kstat_aux);
227 
228 	return (B_TRUE);
229 }
230 
231 static void
232 xnb_ks_free(xnb_t *xnbp)
233 {
234 	kstat_delete(xnbp->xnb_kstat_aux);
235 }
236 
237 /*
238  * Software checksum calculation and insertion for an arbitrary packet.
239  */
240 /*ARGSUSED*/
241 static mblk_t *
242 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
243 {
244 	/*
245 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
246 	 * because it doesn't cover all of the interesting cases :-(
247 	 */
248 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
249 	    HCK_FULLCKSUM, KM_NOSLEEP);
250 
251 	return (mac_fix_cksum(mp));
252 }
253 
254 mblk_t *
255 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
256 {
257 	struct ether_header *ehp;
258 	uint16_t sap;
259 	uint32_t offset;
260 	ipha_t *ipha;
261 
262 	ASSERT(mp->b_next == NULL);
263 
264 	/*
265 	 * Check that the packet is contained in a single mblk.  In
266 	 * the "from peer" path this is true today, but will change
267 	 * when scatter gather support is added.  In the "to peer"
268 	 * path we cannot be sure, but in most cases it will be true
269 	 * (in the xnbo case the packet has come from a MAC device
270 	 * which is unlikely to split packets).
271 	 */
272 	if (mp->b_cont != NULL)
273 		goto software;
274 
275 	/*
276 	 * If the MAC has no hardware capability don't do any further
277 	 * checking.
278 	 */
279 	if (capab == 0)
280 		goto software;
281 
282 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
283 	ehp = (struct ether_header *)mp->b_rptr;
284 
285 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
286 		struct ether_vlan_header *evhp;
287 
288 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
289 		evhp = (struct ether_vlan_header *)mp->b_rptr;
290 		sap = ntohs(evhp->ether_type);
291 		offset = sizeof (struct ether_vlan_header);
292 	} else {
293 		sap = ntohs(ehp->ether_type);
294 		offset = sizeof (struct ether_header);
295 	}
296 
297 	/*
298 	 * We only attempt to do IPv4 packets in hardware.
299 	 */
300 	if (sap != ETHERTYPE_IP)
301 		goto software;
302 
303 	/*
304 	 * We know that this is an IPv4 packet.
305 	 */
306 	ipha = (ipha_t *)(mp->b_rptr + offset);
307 
308 	switch (ipha->ipha_protocol) {
309 	case IPPROTO_TCP:
310 	case IPPROTO_UDP: {
311 		uint32_t start, length, stuff, cksum;
312 		uint16_t *stuffp;
313 
314 		/*
315 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
316 		 * can use full IPv4 and partial checksum offload.
317 		 */
318 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
319 			break;
320 
321 		start = IP_SIMPLE_HDR_LENGTH;
322 		length = ntohs(ipha->ipha_length);
323 		if (ipha->ipha_protocol == IPPROTO_TCP) {
324 			stuff = start + TCP_CHECKSUM_OFFSET;
325 			cksum = IP_TCP_CSUM_COMP;
326 		} else {
327 			stuff = start + UDP_CHECKSUM_OFFSET;
328 			cksum = IP_UDP_CSUM_COMP;
329 		}
330 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
331 
332 		if (capab & HCKSUM_INET_FULL_V4) {
333 			/*
334 			 * Some devices require that the checksum
335 			 * field of the packet is zero for full
336 			 * offload.
337 			 */
338 			*stuffp = 0;
339 
340 			(void) hcksum_assoc(mp, NULL, NULL,
341 			    0, 0, 0, 0,
342 			    HCK_FULLCKSUM, KM_NOSLEEP);
343 
344 			xnbp->xnb_stat_csum_hardware++;
345 
346 			return (mp);
347 		}
348 
349 		if (capab & HCKSUM_INET_PARTIAL) {
350 			if (*stuffp == 0) {
351 				ipaddr_t src, dst;
352 
353 				/*
354 				 * Older Solaris guests don't insert
355 				 * the pseudo-header checksum, so we
356 				 * calculate it here.
357 				 */
358 				src = ipha->ipha_src;
359 				dst = ipha->ipha_dst;
360 
361 				cksum += (dst >> 16) + (dst & 0xFFFF);
362 				cksum += (src >> 16) + (src & 0xFFFF);
363 				cksum += length - IP_SIMPLE_HDR_LENGTH;
364 
365 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
366 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
367 
368 				ASSERT(cksum <= 0xFFFF);
369 
370 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
371 			}
372 
373 			(void) hcksum_assoc(mp, NULL, NULL,
374 			    start, stuff, length, 0,
375 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
376 
377 			xnbp->xnb_stat_csum_hardware++;
378 
379 			return (mp);
380 		}
381 
382 		/* NOTREACHED */
383 		break;
384 	}
385 
386 	default:
387 		/* Use software. */
388 		break;
389 	}
390 
391 software:
392 	/*
393 	 * We are not able to use any offload so do the whole thing in
394 	 * software.
395 	 */
396 	xnbp->xnb_stat_csum_software++;
397 
398 	return (xnb_software_csum(xnbp, mp));
399 }
400 
401 int
402 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
403 {
404 	xnb_t *xnbp;
405 	char *xsname, mac[ETHERADDRL * 3];
406 
407 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
408 
409 	xnbp->xnb_flavour = flavour;
410 	xnbp->xnb_flavour_data = flavour_data;
411 	xnbp->xnb_devinfo = dip;
412 	xnbp->xnb_evtchn = INVALID_EVTCHN;
413 	xnbp->xnb_irq = B_FALSE;
414 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
415 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
416 	xnbp->xnb_cksum_offload = xnb_cksum_offload;
417 	xnbp->xnb_connected = B_FALSE;
418 	xnbp->xnb_hotplugged = B_FALSE;
419 	xnbp->xnb_detachable = B_FALSE;
420 	xnbp->xnb_peer = xvdi_get_oeid(dip);
421 	xnbp->xnb_tx_pages_writable = B_FALSE;
422 	xnbp->xnb_tx_always_copy = xnb_tx_always_copy;
423 
424 	xnbp->xnb_tx_buf_count = 0;
425 	xnbp->xnb_tx_unmop_count = 0;
426 
427 	xnbp->xnb_hv_copy = B_FALSE;
428 
429 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
430 	ASSERT(xnbp->xnb_rx_va != NULL);
431 
432 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
433 	    != DDI_SUCCESS)
434 		goto failure;
435 
436 	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
437 	xnbp->xnb_rx_cpop = NULL;
438 	xnbp->xnb_cpop_sz = 0;
439 
440 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
441 	    xnbp->xnb_icookie);
442 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
443 	    xnbp->xnb_icookie);
444 
445 	/* set driver private pointer now */
446 	ddi_set_driver_private(dip, xnbp);
447 
448 	if (!xnb_ks_init(xnbp))
449 		goto failure_1;
450 
451 	/*
452 	 * Receive notification of changes in the state of the
453 	 * driver in the guest domain.
454 	 */
455 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
456 	    NULL) != DDI_SUCCESS)
457 		goto failure_2;
458 
459 	/*
460 	 * Receive notification of hotplug events.
461 	 */
462 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
463 	    NULL) != DDI_SUCCESS)
464 		goto failure_2;
465 
466 	xsname = xvdi_get_xsname(dip);
467 
468 	if (xenbus_printf(XBT_NULL, xsname,
469 	    "feature-no-csum-offload", "%d",
470 	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
471 		goto failure_3;
472 
473 	/*
474 	 * Use global xnb_hv_copy to export this feature. This means that
475 	 * we have to decide what to do before starting up a guest domain
476 	 */
477 	if (xenbus_printf(XBT_NULL, xsname,
478 	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
479 		goto failure_3;
480 	/*
481 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
482 	 * in addition to "feature-rx-copy" being 1. It seems strange
483 	 * to use four possible states to describe a binary decision,
484 	 * but we might as well play nice.
485 	 */
486 	if (xenbus_printf(XBT_NULL, xsname,
487 	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
488 		goto failure_3;
489 
490 	if (xenbus_scanf(XBT_NULL, xsname,
491 	    "mac", "%s", mac) != 0) {
492 		cmn_err(CE_WARN, "xnb_attach: "
493 		    "cannot read mac address from %s",
494 		    xsname);
495 		goto failure_3;
496 	}
497 
498 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
499 		cmn_err(CE_WARN,
500 		    "xnb_attach: cannot parse mac address %s",
501 		    mac);
502 		goto failure_3;
503 	}
504 
505 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
506 	(void) xvdi_post_event(dip, XEN_HP_ADD);
507 
508 	return (DDI_SUCCESS);
509 
510 failure_3:
511 	xvdi_remove_event_handler(dip, NULL);
512 
513 failure_2:
514 	xnb_ks_free(xnbp);
515 
516 failure_1:
517 	mutex_destroy(&xnbp->xnb_rx_lock);
518 	mutex_destroy(&xnbp->xnb_tx_lock);
519 
520 failure:
521 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
522 	kmem_free(xnbp, sizeof (*xnbp));
523 	return (DDI_FAILURE);
524 }
525 
526 /*ARGSUSED*/
527 void
528 xnb_detach(dev_info_t *dip)
529 {
530 	xnb_t *xnbp = ddi_get_driver_private(dip);
531 
532 	ASSERT(xnbp != NULL);
533 	ASSERT(!xnbp->xnb_connected);
534 	ASSERT(xnbp->xnb_tx_buf_count == 0);
535 
536 	xnb_disconnect_rings(dip);
537 
538 	xvdi_remove_event_handler(dip, NULL);
539 
540 	xnb_ks_free(xnbp);
541 
542 	ddi_set_driver_private(dip, NULL);
543 
544 	mutex_destroy(&xnbp->xnb_tx_lock);
545 	mutex_destroy(&xnbp->xnb_rx_lock);
546 
547 	if (xnbp->xnb_cpop_sz > 0)
548 		kmem_free(xnbp->xnb_rx_cpop, sizeof (*xnbp->xnb_rx_cpop)
549 		    * xnbp->xnb_cpop_sz);
550 
551 	ASSERT(xnbp->xnb_rx_va != NULL);
552 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
553 
554 	kmem_free(xnbp, sizeof (*xnbp));
555 }
556 
557 
558 static mfn_t
559 xnb_alloc_page(xnb_t *xnbp)
560 {
561 #define	WARNING_RATE_LIMIT 100
562 #define	BATCH_SIZE 256
563 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
564 	static int nth = BATCH_SIZE;
565 	mfn_t mfn;
566 
567 	mutex_enter(&xnb_alloc_page_lock);
568 	if (nth == BATCH_SIZE) {
569 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
570 			xnbp->xnb_stat_allocation_failure++;
571 			mutex_exit(&xnb_alloc_page_lock);
572 
573 			/*
574 			 * Try for a single page in low memory situations.
575 			 */
576 			if (balloon_alloc_pages(1, &mfn) != 1) {
577 				if ((xnbp->xnb_stat_small_allocation_failure++
578 				    % WARNING_RATE_LIMIT) == 0)
579 					cmn_err(CE_WARN, "xnb_alloc_page: "
580 					    "Cannot allocate memory to "
581 					    "transfer packets to peer.");
582 				return (0);
583 			} else {
584 				xnbp->xnb_stat_small_allocation_success++;
585 				return (mfn);
586 			}
587 		}
588 
589 		nth = 0;
590 		xnbp->xnb_stat_allocation_success++;
591 	}
592 
593 	mfn = mfns[nth++];
594 	mutex_exit(&xnb_alloc_page_lock);
595 
596 	ASSERT(mfn != 0);
597 
598 	return (mfn);
599 #undef BATCH_SIZE
600 #undef WARNING_RATE_LIMIT
601 }
602 
603 /*ARGSUSED*/
604 static void
605 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
606 {
607 	int r;
608 	pfn_t pfn;
609 
610 	pfn = xen_assign_pfn(mfn);
611 	pfnzero(pfn, 0, PAGESIZE);
612 	xen_release_pfn(pfn);
613 
614 	/*
615 	 * This happens only in the error path, so batching is
616 	 * not worth the complication.
617 	 */
618 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
619 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
620 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
621 		    r, mfn);
622 	}
623 }
624 
/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
 * using local variables.
 *
 * Evaluates to the smaller of two quantities: the number of requests
 * the peer has produced beyond `loop' (sring->req_prod - loop), and
 * RING_SIZE(_r) - (loop - prod).  A non-zero result lets the caller's
 * loop continue.
 *
 * The expansion refers to variables named `loop' and `prod' that are
 * not macro arguments; they must be in scope wherever the macro is
 * used.
 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
	((((_r)->sring->req_prod - loop) <		\
		(RING_SIZE(_r) - (loop - prod))) ?	\
	    ((_r)->sring->req_prod - loop) :		\
	    (RING_SIZE(_r) - (loop - prod)))
634 
635 mblk_t *
636 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
637 {
638 	mblk_t *free = mp, *prev = NULL;
639 	size_t len;
640 	gnttab_transfer_t *gop;
641 	boolean_t notify;
642 	RING_IDX loop, prod, end;
643 
644 	/*
645 	 * For each packet the sequence of operations is:
646 	 *
647 	 * 1. get a new page from the hypervisor.
648 	 * 2. get a request slot from the ring.
649 	 * 3. copy the data into the new page.
650 	 * 4. transfer the page to the peer.
651 	 * 5. update the request slot.
652 	 * 6. kick the peer.
653 	 * 7. free mp.
654 	 *
655 	 * In order to reduce the number of hypercalls, we prepare
656 	 * several packets for the peer and perform a single hypercall
657 	 * to transfer them.
658 	 */
659 
660 	mutex_enter(&xnbp->xnb_rx_lock);
661 
662 	/*
663 	 * If we are not connected to the peer or have not yet
664 	 * finished hotplug it is too early to pass packets to the
665 	 * peer.
666 	 */
667 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
668 		mutex_exit(&xnbp->xnb_rx_lock);
669 		DTRACE_PROBE(flip_rx_too_early);
670 		xnbp->xnb_stat_rx_too_early++;
671 		return (mp);
672 	}
673 
674 	loop = xnbp->xnb_rx_ring.req_cons;
675 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
676 	gop = xnbp->xnb_rx_top;
677 
678 	while ((mp != NULL) &&
679 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
680 
681 		mfn_t mfn;
682 		pfn_t pfn;
683 		netif_rx_request_t *rxreq;
684 		netif_rx_response_t *rxresp;
685 		char *valoop;
686 		size_t offset;
687 		mblk_t *ml;
688 		uint16_t cksum_flags;
689 
690 		/* 1 */
691 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
692 			xnbp->xnb_stat_rx_defer++;
693 			break;
694 		}
695 
696 		/* 2 */
697 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
698 
699 #ifdef XNB_DEBUG
700 		if (!(rxreq->id < NET_RX_RING_SIZE))
701 			cmn_err(CE_PANIC, "xnb_to_peer: "
702 			    "id %d out of range in request 0x%p",
703 			    rxreq->id, (void *)rxreq);
704 #endif /* XNB_DEBUG */
705 
706 		/* Assign a pfn and map the new page at the allocated va. */
707 		pfn = xen_assign_pfn(mfn);
708 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
709 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
710 
711 		offset = RX_BUFFER_HEADROOM;
712 
713 		/* 3 */
714 		len = 0;
715 		valoop = xnbp->xnb_rx_va + offset;
716 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
717 			size_t chunk = ml->b_wptr - ml->b_rptr;
718 
719 			bcopy(ml->b_rptr, valoop, chunk);
720 			valoop += chunk;
721 			len += chunk;
722 		}
723 
724 		ASSERT(len + offset < PAGESIZE);
725 
726 		/* Release the pfn. */
727 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
728 		    HAT_UNLOAD_UNMAP);
729 		xen_release_pfn(pfn);
730 
731 		/* 4 */
732 		gop->mfn = mfn;
733 		gop->domid = xnbp->xnb_peer;
734 		gop->ref = rxreq->gref;
735 
736 		/* 5.1 */
737 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
738 		rxresp->offset = offset;
739 		rxresp->flags = 0;
740 
741 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
742 		if (cksum_flags != 0)
743 			xnbp->xnb_stat_rx_cksum_deferred++;
744 		rxresp->flags |= cksum_flags;
745 
746 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
747 		rxresp->status = len;
748 
749 		loop++;
750 		prod++;
751 		gop++;
752 		prev = mp;
753 		mp = mp->b_next;
754 	}
755 
756 	/*
757 	 * Did we actually do anything?
758 	 */
759 	if (loop == xnbp->xnb_rx_ring.req_cons) {
760 		mutex_exit(&xnbp->xnb_rx_lock);
761 		return (mp);
762 	}
763 
764 	end = loop;
765 
766 	/*
767 	 * Unlink the end of the 'done' list from the remainder.
768 	 */
769 	ASSERT(prev != NULL);
770 	prev->b_next = NULL;
771 
772 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
773 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
774 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
775 	}
776 
777 	loop = xnbp->xnb_rx_ring.req_cons;
778 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
779 	gop = xnbp->xnb_rx_top;
780 
781 	while (loop < end) {
782 		int16_t status = NETIF_RSP_OKAY;
783 
784 		if (gop->status != 0) {
785 			status = NETIF_RSP_ERROR;
786 
787 			/*
788 			 * If the status is anything other than
789 			 * GNTST_bad_page then we don't own the page
790 			 * any more, so don't try to give it back.
791 			 */
792 			if (gop->status != GNTST_bad_page)
793 				gop->mfn = 0;
794 		} else {
795 			/* The page is no longer ours. */
796 			gop->mfn = 0;
797 		}
798 
799 		if (gop->mfn != 0)
800 			/*
801 			 * Give back the page, as we won't be using
802 			 * it.
803 			 */
804 			xnb_free_page(xnbp, gop->mfn);
805 		else
806 			/*
807 			 * We gave away a page, update our accounting
808 			 * now.
809 			 */
810 			balloon_drv_subtracted(1);
811 
812 		/* 5.2 */
813 		if (status != NETIF_RSP_OKAY) {
814 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
815 			    status;
816 		} else {
817 			xnbp->xnb_stat_ipackets++;
818 			xnbp->xnb_stat_rbytes += len;
819 		}
820 
821 		loop++;
822 		prod++;
823 		gop++;
824 	}
825 
826 	xnbp->xnb_rx_ring.req_cons = loop;
827 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
828 
829 	/* 6 */
830 	/* LINTED: constant in conditional context */
831 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
832 	if (notify) {
833 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
834 		xnbp->xnb_stat_rx_notify_sent++;
835 	} else {
836 		xnbp->xnb_stat_rx_notify_deferred++;
837 	}
838 
839 	if (mp != NULL)
840 		xnbp->xnb_stat_rx_defer++;
841 
842 	mutex_exit(&xnbp->xnb_rx_lock);
843 
844 	/* Free mblk_t's that we consumed. */
845 	freemsgchain(free);
846 
847 	return (mp);
848 }
849 
850 /* helper functions for xnb_copy_to_peer */
851 
852 /*
853  * Grow the array of copy operation descriptors.
854  * Returns a pointer to the next available entry.
855  */
856 gnttab_copy_t *
857 grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
858 {
859 	/*
860 	 * o_cpop (arg.1) is a ptr to the area we would like to copy
861 	 * something into but cannot, because we haven't alloc'ed it
862 	 * yet, or NULL.
863 	 * old_cpop and new_cpop (local) are pointers to old/new
864 	 * versions of xnbp->xnb_rx_cpop.
865 	 */
866 	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
867 	size_t		newcount;
868 
869 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
870 
871 	old_cpop = xnbp->xnb_rx_cpop;
872 	/*
873 	 * o_cpop is a pointer into the array pointed to by old_cpop;
874 	 * it would be an error for exactly one of these pointers to be NULL.
875 	 * We shouldn't call this function if xnb_rx_cpop has already
876 	 * been allocated, but we're starting to fill it from the beginning
877 	 * again.
878 	 */
879 	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
880 	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
881 
882 	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
883 
884 	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
885 	if (new_cpop == NULL) {
886 		xnbp->xnb_stat_other_allocation_failure++;
887 		return (NULL);
888 	}
889 
890 	if (o_cpop != NULL) {
891 		size_t	 offset = (o_cpop - old_cpop);
892 
893 		/* we only need to move the parts in use ... */
894 		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
895 		    (sizeof (*old_cpop)));
896 
897 		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
898 
899 		ret_cpop = new_cpop + offset;
900 	} else {
901 		ret_cpop = new_cpop;
902 	}
903 
904 	xnbp->xnb_rx_cpop = new_cpop;
905 	xnbp->xnb_cpop_sz = newcount;
906 
907 	xnbp->xnb_stat_rx_cpoparea_grown++;
908 
909 	return (ret_cpop);
910 }
911 
912 /*
913  * Check whether an address is on a page that's foreign to this domain.
914  */
915 static boolean_t
916 is_foreign(void *addr)
917 {
918 	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
919 
920 	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
921 }
922 
923 /*
924  * Insert a newly allocated mblk into a chain, replacing the old one.
925  */
926 static mblk_t *
927 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
928 {
929 	uint32_t	start, stuff, end, value, flags;
930 	mblk_t		*new_mp;
931 
932 	new_mp = copyb(mp);
933 	if (new_mp == NULL)
934 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
935 		    "for %p, len %lu", (void *) mp, len);
936 
937 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
938 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
939 	    flags, KM_NOSLEEP);
940 
941 	new_mp->b_next = mp->b_next;
942 	new_mp->b_prev = mp->b_prev;
943 	new_mp->b_cont = mp->b_cont;
944 
945 	/* Make sure we only overwrite pointers to the mblk being replaced. */
946 	if (mp_prev != NULL && mp_prev->b_next == mp)
947 		mp_prev->b_next = new_mp;
948 
949 	if (ml_prev != NULL && ml_prev->b_cont == mp)
950 		ml_prev->b_cont = new_mp;
951 
952 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
953 	freemsg(mp);
954 
955 	return (new_mp);
956 }
957 
958 /*
959  * Set all the fields in a gnttab_copy_t.
960  */
961 static void
962 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
963     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
964 {
965 	ASSERT(xnbp != NULL && gp != NULL);
966 
967 	gp->source.offset = s_off;
968 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
969 	gp->source.domid = DOMID_SELF;
970 
971 	gp->len = (uint16_t)len;
972 	gp->flags = GNTCOPY_dest_gref;
973 	gp->status = 0;
974 
975 	gp->dest.u.ref = d_ref;
976 	gp->dest.offset = d_off;
977 	gp->dest.domid = xnbp->xnb_peer;
978 }
979 
980 mblk_t *
981 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
982 {
983 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
984 	mblk_t		*ml, *ml_prev;
985 	gnttab_copy_t	*gop_cp;
986 	boolean_t	notify;
987 	RING_IDX	loop, prod;
988 	int		i;
989 
990 	if (!xnbp->xnb_hv_copy)
991 		return (xnb_to_peer(xnbp, mp));
992 
993 	/*
994 	 * For each packet the sequence of operations is:
995 	 *
996 	 *  1. get a request slot from the ring.
997 	 *  2. set up data for hypercall (see NOTE below)
	 *  3. have the hypervisor copy the data
999 	 *  4. update the request slot.
1000 	 *  5. kick the peer.
1001 	 *
1002 	 * NOTE ad 2.
1003 	 *  In order to reduce the number of hypercalls, we prepare
1004 	 *  several packets (mp->b_cont != NULL) for the peer and
1005 	 *  perform a single hypercall to transfer them.
	 *  We also have to set up a separate copy operation for
1007 	 *  every page.
1008 	 *
1009 	 * If we have more than one message (mp->b_next != NULL),
1010 	 * we do this whole dance repeatedly.
1011 	 */
1012 
1013 	mutex_enter(&xnbp->xnb_rx_lock);
1014 
1015 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
1016 		mutex_exit(&xnbp->xnb_rx_lock);
1017 		DTRACE_PROBE(copy_rx_too_early);
1018 		xnbp->xnb_stat_rx_too_early++;
1019 		return (mp);
1020 	}
1021 
1022 	loop = xnbp->xnb_rx_ring.req_cons;
1023 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1024 
1025 	while ((mp != NULL) &&
1026 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1027 		netif_rx_request_t	*rxreq;
1028 		netif_rx_response_t	*rxresp;
1029 		size_t			offset, d_offset;
1030 		size_t			len;
1031 		uint16_t		cksum_flags;
1032 		int16_t			status = NETIF_RSP_OKAY;
1033 		int			item_count;
1034 
1035 		/* 1 */
1036 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1037 
1038 #ifdef XNB_DEBUG
1039 		if (!(rxreq->id < NET_RX_RING_SIZE))
1040 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1041 			    "id %d out of range in request 0x%p",
1042 			    rxreq->id, (void *)rxreq);
1043 #endif /* XNB_DEBUG */
1044 
1045 		/* 2 */
1046 		d_offset = offset = RX_BUFFER_HEADROOM;
1047 		len = 0;
1048 		item_count = 0;
1049 
1050 		gop_cp = xnbp->xnb_rx_cpop;
1051 
1052 		/*
1053 		 * We walk the b_cont pointers and set up a gop_cp
1054 		 * structure for every page in every data block we have.
1055 		 */
1056 		/* 2a */
1057 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1058 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1059 			uchar_t	*r_tmp,	*rpt_align;
1060 			size_t	r_offset;
1061 
1062 			/*
1063 			 * If we get an mblk on a page that doesn't belong to
1064 			 * this domain, get a new mblk to replace the old one.
1065 			 */
1066 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1067 				mblk_t *ml_new = replace_msg(ml, chunk,
1068 				    mp_prev, ml_prev);
1069 
1070 				/* We can still use old ml, but not *ml! */
1071 				if (free == ml)
1072 					free = ml_new;
1073 				if (mp == ml)
1074 					mp = ml_new;
1075 				ml = ml_new;
1076 
1077 				xnbp->xnb_stat_rx_foreign_page++;
1078 			}
1079 
1080 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1081 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1082 			r_tmp = ml->b_rptr;
1083 
1084 			if (d_offset + chunk > PAGESIZE)
1085 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1086 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1087 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1088 				    (void *)mp, (void *)saved_mp, (void *)ml,
1089 				    (void *)rpt_align,
1090 				    d_offset, chunk, (int)PAGESIZE);
1091 
1092 			while (chunk > 0) {
1093 				size_t part_len;
1094 
1095 				item_count++;
1096 				if (item_count > xnbp->xnb_cpop_sz) {
1097 					gop_cp = grow_cpop_area(xnbp, gop_cp);
1098 					if (gop_cp == NULL)
1099 						goto failure;
1100 				}
1101 				/*
1102 				 * If our mblk crosses a page boundary, we need
1103 				 * to do a seperate copy for every page.
1104 				 */
1105 				if (r_offset + chunk > PAGESIZE) {
1106 					part_len = PAGESIZE - r_offset;
1107 
1108 					DTRACE_PROBE3(mblk_page_crossed,
1109 					    (mblk_t *), ml, int, chunk, int,
1110 					    (int)r_offset);
1111 
1112 					xnbp->xnb_stat_rx_pagebndry_crossed++;
1113 				} else {
1114 					part_len = chunk;
1115 				}
1116 
1117 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1118 				    d_offset, part_len, rxreq->gref);
1119 
1120 				chunk -= part_len;
1121 
1122 				len += part_len;
1123 				d_offset += part_len;
1124 				r_tmp += part_len;
1125 				/*
1126 				 * The 2nd, 3rd ... last copies will always
1127 				 * start at r_tmp, therefore r_offset is 0.
1128 				 */
1129 				r_offset = 0;
1130 				gop_cp++;
1131 			}
1132 			ml_prev = ml;
1133 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1134 			    chunk, int, len, int, item_count);
1135 		}
1136 		/* 3 */
1137 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1138 		    item_count) != 0) {
1139 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1140 			DTRACE_PROBE(HV_granttableopfailed);
1141 		}
1142 
1143 		/* 4 */
1144 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1145 		rxresp->offset = offset;
1146 
1147 		rxresp->flags = 0;
1148 
1149 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1150 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1151 		    (int)rxresp->status);
1152 
1153 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1154 		if (cksum_flags != 0)
1155 			xnbp->xnb_stat_rx_cksum_deferred++;
1156 		rxresp->flags |= cksum_flags;
1157 
1158 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1159 		rxresp->status = len;
1160 
1161 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1162 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1163 		    (int)rxresp->status);
1164 
1165 		for (i = 0; i < item_count; i++) {
1166 			if (xnbp->xnb_rx_cpop[i].status != 0) {
1167 				DTRACE_PROBE2(cpop__status__nonnull, int,
1168 				    (int)xnbp->xnb_rx_cpop[i].status,
1169 				    int, i);
1170 				status = NETIF_RSP_ERROR;
1171 			}
1172 		}
1173 
1174 		/* 5.2 */
1175 		if (status != NETIF_RSP_OKAY) {
1176 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1177 			    status;
1178 			xnbp->xnb_stat_rx_rsp_notok++;
1179 		} else {
1180 			xnbp->xnb_stat_ipackets++;
1181 			xnbp->xnb_stat_rbytes += len;
1182 		}
1183 
1184 		loop++;
1185 		prod++;
1186 		mp_prev = mp;
1187 		mp = mp->b_next;
1188 	}
1189 failure:
1190 	/*
1191 	 * Did we actually do anything?
1192 	 */
1193 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1194 		mutex_exit(&xnbp->xnb_rx_lock);
1195 		return (mp);
1196 	}
1197 
1198 	/*
1199 	 * Unlink the end of the 'done' list from the remainder.
1200 	 */
1201 	ASSERT(mp_prev != NULL);
1202 	mp_prev->b_next = NULL;
1203 
1204 	xnbp->xnb_rx_ring.req_cons = loop;
1205 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1206 
1207 	/* 6 */
1208 	/* LINTED: constant in conditional context */
1209 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1210 	if (notify) {
1211 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1212 		xnbp->xnb_stat_rx_notify_sent++;
1213 	} else {
1214 		xnbp->xnb_stat_rx_notify_deferred++;
1215 	}
1216 
1217 	if (mp != NULL)
1218 		xnbp->xnb_stat_rx_defer++;
1219 
1220 	mutex_exit(&xnbp->xnb_rx_lock);
1221 
1222 	/* Free mblk_t structs we have consumed. */
1223 	freemsgchain(free);
1224 
1225 	return (mp);
1226 }
1227 
1228 /*ARGSUSED*/
1229 static int
1230 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
1231 {
1232 	xnb_txbuf_t *txp = buf;
1233 
1234 	bzero(txp, sizeof (*txp));
1235 
1236 	txp->xt_free_rtn.free_func = xnb_tx_complete;
1237 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
1238 
1239 	txp->xt_mop.host_addr =
1240 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
1241 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
1242 	    VM_NOSLEEP : VM_SLEEP);
1243 
1244 	if (txp->xt_mop.host_addr == NULL) {
1245 		cmn_err(CE_WARN, "xnb_txbuf_constructor: "
1246 		    "cannot get address space");
1247 		return (-1);
1248 	}
1249 
1250 	/*
1251 	 * Have the hat ensure that page table exists for the VA.
1252 	 */
1253 	hat_prepare_mapping(kas.a_hat,
1254 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr, NULL);
1255 
1256 	return (0);
1257 }
1258 
1259 /*ARGSUSED*/
1260 static void
1261 xnb_txbuf_destructor(void *buf, void *arg)
1262 {
1263 	xnb_txbuf_t *txp = buf;
1264 
1265 	ASSERT(txp->xt_mop.host_addr != NULL);
1266 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0);
1267 
1268 	hat_release_mapping(kas.a_hat,
1269 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr);
1270 	vmem_free(heap_arena,
1271 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr, PAGESIZE);
1272 }
1273 
1274 static void
1275 xnb_tx_notify_peer(xnb_t *xnbp)
1276 {
1277 	boolean_t notify;
1278 
1279 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1280 
1281 	/* LINTED: constant in conditional context */
1282 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1283 	if (notify) {
1284 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1285 		xnbp->xnb_stat_tx_notify_sent++;
1286 	} else {
1287 		xnbp->xnb_stat_tx_notify_deferred++;
1288 	}
1289 }
1290 
1291 static void
1292 xnb_tx_complete(xnb_txbuf_t *txp)
1293 {
1294 	xnb_t *xnbp = txp->xt_xnbp;
1295 
1296 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE);
1297 
1298 	mutex_enter(&xnbp->xnb_tx_lock);
1299 	xnb_tx_schedule_unmop(xnbp, &txp->xt_mop, txp);
1300 	mutex_exit(&xnbp->xnb_tx_lock);
1301 }
1302 
1303 static void
1304 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1305 {
1306 	RING_IDX i;
1307 	netif_tx_response_t *txresp;
1308 
1309 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1310 
1311 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1312 
1313 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1314 	txresp->id = id;
1315 	txresp->status = status;
1316 
1317 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1318 
1319 	/*
1320 	 * Note that we don't push the change to the peer here - that
1321 	 * is the callers responsibility.
1322 	 */
1323 }
1324 
1325 static void
1326 xnb_tx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
1327     xnb_txbuf_t *txp)
1328 {
1329 	gnttab_unmap_grant_ref_t	*unmop;
1330 	int				u_count;
1331 	int				reqs_on_ring;
1332 
1333 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1334 	ASSERT(xnbp->xnb_tx_unmop_count < NET_TX_RING_SIZE);
1335 
1336 	u_count = xnbp->xnb_tx_unmop_count++;
1337 
1338 	/* Cache data for the time when we actually unmap grant refs */
1339 	xnbp->xnb_tx_unmop_txp[u_count] = txp;
1340 
1341 	unmop = &xnbp->xnb_tx_unmop[u_count];
1342 	unmop->host_addr = mop->host_addr;
1343 	unmop->dev_bus_addr = mop->dev_bus_addr;
1344 	unmop->handle = mop->handle;
1345 
1346 	/*
1347 	 * We cannot check the ring once we're disconnected from it. Batching
1348 	 * doesn't seem to be a useful optimisation in this case either,
1349 	 * so we directly call into the actual unmap function.
1350 	 */
1351 	if (xnbp->xnb_connected) {
1352 		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_tx_ring);
1353 
1354 		/*
1355 		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
1356 		 * or (with N == 1) "immediate unmop" behaviour.
1357 		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
1358 		 */
1359 		if (xnbp->xnb_tx_unmop_count < xnb_unmop_hiwat &&
1360 		    reqs_on_ring > xnb_unmop_lowwat)
1361 			return;
1362 	}
1363 
1364 	xnb_tx_perform_pending_unmop(xnbp);
1365 }
1366 
1367 /*
1368  * Here we perform the actual unmapping of the data that was
1369  * accumulated in xnb_tx_schedule_unmop().
1370  * Note that it is the caller's responsibility to make sure that
1371  * there's actually something there to unmop.
1372  */
1373 static void
1374 xnb_tx_perform_pending_unmop(xnb_t *xnbp)
1375 {
1376 	RING_IDX loop;
1377 #ifdef XNB_DEBUG
1378 	gnttab_unmap_grant_ref_t *unmop;
1379 #endif /* XNB_DEBUG */
1380 
1381 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1382 	ASSERT(xnbp->xnb_tx_unmop_count > 0);
1383 
1384 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1385 	    xnbp->xnb_tx_unmop, xnbp->xnb_tx_unmop_count) < 0) {
1386 		cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: "
1387 		    "unmap grant operation failed, "
1388 		    "%d pages lost", xnbp->xnb_tx_unmop_count);
1389 	}
1390 
1391 #ifdef XNB_DEBUG
1392 	for (loop = 0, unmop = xnbp->xnb_tx_unmop;
1393 	    loop < xnbp->xnb_tx_unmop_count;
1394 	    loop++, unmop++) {
1395 		if (unmop->status != 0) {
1396 			cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: "
1397 			    "unmap grant reference failed (%d)",
1398 			    unmop->status);
1399 		}
1400 	}
1401 #endif /* XNB_DEBUG */
1402 
1403 	for (loop = 0; loop < xnbp->xnb_tx_unmop_count; loop++) {
1404 		xnb_txbuf_t	*txp = xnbp->xnb_tx_unmop_txp[loop];
1405 
1406 		if (txp == NULL)
1407 			cmn_err(CE_PANIC,
1408 			    "xnb_tx_perform_pending_unmop: "
1409 			    "unexpected NULL txp (loop %d; count %d)!",
1410 			    loop, xnbp->xnb_tx_unmop_count);
1411 
1412 		if (xnbp->xnb_connected)
1413 			xnb_tx_mark_complete(xnbp, txp->xt_id, txp->xt_status);
1414 		xnb_txbuf_put(xnbp, txp);
1415 	}
1416 	if (xnbp->xnb_connected)
1417 		xnb_tx_notify_peer(xnbp);
1418 
1419 	xnbp->xnb_tx_unmop_count = 0;
1420 
1421 #ifdef XNB_DEBUG
1422 	bzero(xnbp->xnb_tx_unmop, sizeof (xnbp->xnb_tx_unmop));
1423 	bzero(xnbp->xnb_tx_unmop_txp, sizeof (xnbp->xnb_tx_unmop_txp));
1424 #endif /* XNB_DEBUG */
1425 }
1426 
1427 static xnb_txbuf_t *
1428 xnb_txbuf_get(xnb_t *xnbp, int flags)
1429 {
1430 	xnb_txbuf_t *txp;
1431 
1432 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1433 
1434 	txp = kmem_cache_alloc(xnb_txbuf_cachep, flags);
1435 	if (txp != NULL) {
1436 		ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0);
1437 		txp->xt_flags |= XNB_TXBUF_INUSE;
1438 
1439 		txp->xt_xnbp = xnbp;
1440 		txp->xt_mop.dom = xnbp->xnb_peer;
1441 
1442 		txp->xt_mop.flags = GNTMAP_host_map;
1443 		if (!xnbp->xnb_tx_pages_writable)
1444 			txp->xt_mop.flags |= GNTMAP_readonly;
1445 
1446 		xnbp->xnb_tx_buf_count++;
1447 	}
1448 
1449 	return (txp);
1450 }
1451 
1452 static void
1453 xnb_txbuf_put(xnb_t *xnbp, xnb_txbuf_t *txp)
1454 {
1455 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1456 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE);
1457 
1458 	txp->xt_flags &= ~XNB_TXBUF_INUSE;
1459 	xnbp->xnb_tx_buf_count--;
1460 
1461 	kmem_cache_free(xnb_txbuf_cachep, txp);
1462 }
1463 
1464 static mblk_t *
1465 xnb_from_peer(xnb_t *xnbp)
1466 {
1467 	RING_IDX start, end, loop;
1468 	gnttab_map_grant_ref_t *mop;
1469 	xnb_txbuf_t **txpp;
1470 	netif_tx_request_t *txreq;
1471 	boolean_t work_to_do;
1472 	mblk_t *head, *tail;
1473 	/*
1474 	 * If the peer granted a read-only mapping to the page then we
1475 	 * must copy the data, as the local protocol stack (should the
1476 	 * packet be destined for this host) will modify the packet
1477 	 * 'in place'.
1478 	 */
1479 	boolean_t copy = xnbp->xnb_tx_always_copy ||
1480 	    !xnbp->xnb_tx_pages_writable;
1481 
1482 	/*
1483 	 * For each individual request, the sequence of actions is:
1484 	 *
1485 	 * 1. get the request.
1486 	 * 2. map the page based on the grant ref.
1487 	 * 3. allocate an mblk, copy the data to it.
1488 	 * 4. release the grant.
1489 	 * 5. update the ring.
1490 	 * 6. pass the packet upward.
1491 	 * 7. kick the peer.
1492 	 *
1493 	 * In fact, we try to perform the grant operations in batches,
1494 	 * so there are two loops.
1495 	 */
1496 
1497 	head = tail = NULL;
1498 around:
1499 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1500 
1501 	/* LINTED: constant in conditional context */
1502 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1503 	if (!work_to_do) {
1504 finished:
1505 		return (head);
1506 	}
1507 
1508 	start = xnbp->xnb_tx_ring.req_cons;
1509 	end = xnbp->xnb_tx_ring.sring->req_prod;
1510 
1511 	if ((end - start) > NET_TX_RING_SIZE) {
1512 		/*
1513 		 * This usually indicates that the frontend driver is
1514 		 * misbehaving, as it's not possible to have more than
1515 		 * NET_TX_RING_SIZE ring elements in play at any one
1516 		 * time.
1517 		 *
1518 		 * We reset the ring pointers to the state declared by
1519 		 * the frontend and try to carry on.
1520 		 */
1521 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1522 		    "items in the ring, resetting and trying to recover.",
1523 		    xnbp->xnb_peer, (end - start));
1524 
1525 		/* LINTED: constant in conditional context */
1526 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1527 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1528 
1529 		goto around;
1530 	}
1531 
1532 	for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp;
1533 	    loop != end;
1534 	    loop++, mop++, txpp++) {
1535 		xnb_txbuf_t *txp;
1536 
1537 		txp = xnb_txbuf_get(xnbp, KM_NOSLEEP);
1538 		if (txp == NULL)
1539 			break;
1540 
1541 		ASSERT(xnbp->xnb_tx_pages_writable ||
1542 		    ((txp->xt_mop.flags & GNTMAP_readonly)
1543 		    == GNTMAP_readonly));
1544 
1545 		txp->xt_mop.ref =
1546 		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
1547 
1548 		*mop = txp->xt_mop;
1549 		*txpp = txp;
1550 	}
1551 
1552 	if ((loop - start) == 0)
1553 		goto finished;
1554 
1555 	end = loop;
1556 
1557 	if (xen_map_gref(GNTTABOP_map_grant_ref, xnbp->xnb_tx_mop,
1558 	    end - start, B_FALSE) != 0) {
1559 
1560 		cmn_err(CE_WARN, "xnb_from_peer: map grant operation failed");
1561 
1562 		loop = start;
1563 		txpp = xnbp->xnb_tx_bufp;
1564 
1565 		while (loop != end) {
1566 			xnb_txbuf_put(xnbp, *txpp);
1567 
1568 			loop++;
1569 			txpp++;
1570 		}
1571 
1572 		goto finished;
1573 	}
1574 
1575 	for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp;
1576 	    loop != end;
1577 	    loop++, mop++, txpp++) {
1578 		mblk_t *mp = NULL;
1579 		int16_t status = NETIF_RSP_OKAY;
1580 		xnb_txbuf_t *txp = *txpp;
1581 
1582 		if (mop->status != 0) {
1583 			cmn_err(CE_WARN, "xnb_from_peer: "
1584 			    "failed to map buffer: %d",
1585 			    mop->status);
1586 			status = NETIF_RSP_ERROR;
1587 		}
1588 
1589 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1590 
1591 		if (status == NETIF_RSP_OKAY) {
1592 			if (copy) {
1593 				mp = allocb(txreq->size, BPRI_MED);
1594 				if (mp == NULL) {
1595 					status = NETIF_RSP_ERROR;
1596 					xnbp->xnb_stat_tx_allocb_failed++;
1597 				} else {
1598 					bcopy((caddr_t)(uintptr_t)
1599 					    mop->host_addr + txreq->offset,
1600 					    mp->b_wptr, txreq->size);
1601 					mp->b_wptr += txreq->size;
1602 				}
1603 			} else {
1604 				mp = desballoc((uchar_t *)(uintptr_t)
1605 				    mop->host_addr + txreq->offset,
1606 				    txreq->size, 0, &txp->xt_free_rtn);
1607 				if (mp == NULL) {
1608 					status = NETIF_RSP_ERROR;
1609 					xnbp->xnb_stat_tx_allocb_failed++;
1610 				} else {
1611 					txp->xt_id = txreq->id;
1612 					txp->xt_status = status;
1613 					txp->xt_mop = *mop;
1614 
1615 					mp->b_wptr += txreq->size;
1616 				}
1617 			}
1618 
1619 			/*
1620 			 * If we have a buffer and there are checksum
1621 			 * flags, process them appropriately.
1622 			 */
1623 			if ((mp != NULL) &&
1624 			    ((txreq->flags &
1625 			    (NETTXF_csum_blank | NETTXF_data_validated))
1626 			    != 0)) {
1627 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1628 				    mp, txreq->flags);
1629 				xnbp->xnb_stat_tx_cksum_no_need++;
1630 			}
1631 		}
1632 
1633 		if (copy || (mp == NULL)) {
1634 			txp->xt_status = status;
1635 			txp->xt_id = txreq->id;
1636 			xnb_tx_schedule_unmop(xnbp, mop, txp);
1637 		}
1638 
1639 		if (mp != NULL) {
1640 			xnbp->xnb_stat_opackets++;
1641 			xnbp->xnb_stat_obytes += txreq->size;
1642 
1643 			mp->b_next = NULL;
1644 			if (head == NULL) {
1645 				ASSERT(tail == NULL);
1646 				head = mp;
1647 			} else {
1648 				ASSERT(tail != NULL);
1649 				tail->b_next = mp;
1650 			}
1651 			tail = mp;
1652 		}
1653 	}
1654 
1655 	xnbp->xnb_tx_ring.req_cons = loop;
1656 
1657 	goto around;
1658 	/* NOTREACHED */
1659 }
1660 
1661 /*
1662  *  intr() -- ring interrupt service routine
1663  */
1664 static uint_t
1665 xnb_intr(caddr_t arg)
1666 {
1667 	xnb_t *xnbp = (xnb_t *)arg;
1668 	mblk_t *mp;
1669 
1670 	xnbp->xnb_stat_intr++;
1671 
1672 	mutex_enter(&xnbp->xnb_tx_lock);
1673 
1674 	ASSERT(xnbp->xnb_connected);
1675 
1676 	mp = xnb_from_peer(xnbp);
1677 
1678 	mutex_exit(&xnbp->xnb_tx_lock);
1679 
1680 	if (!xnbp->xnb_hotplugged) {
1681 		xnbp->xnb_stat_tx_too_early++;
1682 		goto fail;
1683 	}
1684 	if (mp == NULL) {
1685 		xnbp->xnb_stat_spurious_intr++;
1686 		goto fail;
1687 	}
1688 
1689 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1690 
1691 	return (DDI_INTR_CLAIMED);
1692 
1693 fail:
1694 	freemsgchain(mp);
1695 	return (DDI_INTR_CLAIMED);
1696 }
1697 
1698 static boolean_t
1699 xnb_connect_rings(dev_info_t *dip)
1700 {
1701 	xnb_t *xnbp = ddi_get_driver_private(dip);
1702 	char *oename;
1703 	struct gnttab_map_grant_ref map_op;
1704 	evtchn_port_t evtchn;
1705 	int i;
1706 
1707 	/*
1708 	 * Cannot attempt to connect the rings if already connected.
1709 	 */
1710 	ASSERT(!xnbp->xnb_connected);
1711 
1712 	oename = xvdi_get_oename(dip);
1713 
1714 	if (xenbus_gather(XBT_NULL, oename,
1715 	    "event-channel", "%u", &evtchn,
1716 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1717 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1718 	    NULL) != 0) {
1719 		cmn_err(CE_WARN, "xnb_connect_rings: "
1720 		    "cannot read other-end details from %s",
1721 		    oename);
1722 		goto fail;
1723 	}
1724 
1725 	if (xenbus_scanf(XBT_NULL, oename,
1726 	    "feature-tx-writable", "%d", &i) != 0)
1727 		i = 0;
1728 	if (i != 0)
1729 		xnbp->xnb_tx_pages_writable = B_TRUE;
1730 
1731 	if (xenbus_scanf(XBT_NULL, oename,
1732 	    "feature-no-csum-offload", "%d", &i) != 0)
1733 		i = 0;
1734 	if ((i == 1) || !xnbp->xnb_cksum_offload)
1735 		xnbp->xnb_cksum_offload = B_FALSE;
1736 
1737 	/* Check whether our peer knows and requests hypervisor copy */
1738 	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
1739 	    != 0)
1740 		i = 0;
1741 	if (i != 0)
1742 		xnbp->xnb_hv_copy = B_TRUE;
1743 
1744 	/*
1745 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1746 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1747 	 *    into the allocated vaddr (one for tx, one for rx).
1748 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1749 	 *    bound to this domain.
1750 	 * 4. associate the event channel with an interrupt.
1751 	 * 5. declare ourselves connected.
1752 	 * 6. enable the interrupt.
1753 	 */
1754 
1755 	/* 1.tx */
1756 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1757 	    0, 0, 0, 0, VM_SLEEP);
1758 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1759 
1760 	/* 2.tx */
1761 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1762 	map_op.flags = GNTMAP_host_map;
1763 	map_op.ref = xnbp->xnb_tx_ring_ref;
1764 	map_op.dom = xnbp->xnb_peer;
1765 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
1766 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1767 	    map_op.status != 0) {
1768 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1769 		goto fail;
1770 	}
1771 	xnbp->xnb_tx_ring_handle = map_op.handle;
1772 
1773 	/* LINTED: constant in conditional context */
1774 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1775 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1776 
1777 	/* 1.rx */
1778 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1779 	    0, 0, 0, 0, VM_SLEEP);
1780 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1781 
1782 	/* 2.rx */
1783 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1784 	map_op.flags = GNTMAP_host_map;
1785 	map_op.ref = xnbp->xnb_rx_ring_ref;
1786 	map_op.dom = xnbp->xnb_peer;
1787 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
1788 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1789 	    map_op.status != 0) {
1790 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1791 		goto fail;
1792 	}
1793 	xnbp->xnb_rx_ring_handle = map_op.handle;
1794 
1795 	/* LINTED: constant in conditional context */
1796 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1797 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1798 
1799 	/* 3 */
1800 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1801 		cmn_err(CE_WARN, "xnb_connect_rings: "
1802 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1803 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1804 		goto fail;
1805 	}
1806 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1807 
1808 	/*
1809 	 * It would be good to set the state to XenbusStateConnected
1810 	 * here as well, but then what if ddi_add_intr() failed?
1811 	 * Changing the state in the store will be noticed by the peer
1812 	 * and cannot be "taken back".
1813 	 */
1814 	mutex_enter(&xnbp->xnb_tx_lock);
1815 	mutex_enter(&xnbp->xnb_rx_lock);
1816 
1817 	/* 5.1 */
1818 	xnbp->xnb_connected = B_TRUE;
1819 
1820 	mutex_exit(&xnbp->xnb_rx_lock);
1821 	mutex_exit(&xnbp->xnb_tx_lock);
1822 
1823 	/* 4, 6 */
1824 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1825 	    != DDI_SUCCESS) {
1826 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1827 		goto fail;
1828 	}
1829 	xnbp->xnb_irq = B_TRUE;
1830 
1831 	/* 5.2 */
1832 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1833 
1834 	return (B_TRUE);
1835 
1836 fail:
1837 	mutex_enter(&xnbp->xnb_tx_lock);
1838 	mutex_enter(&xnbp->xnb_rx_lock);
1839 
1840 	xnbp->xnb_connected = B_FALSE;
1841 	mutex_exit(&xnbp->xnb_rx_lock);
1842 	mutex_exit(&xnbp->xnb_tx_lock);
1843 
1844 	return (B_FALSE);
1845 }
1846 
1847 static void
1848 xnb_disconnect_rings(dev_info_t *dip)
1849 {
1850 	xnb_t *xnbp = ddi_get_driver_private(dip);
1851 
1852 	if (xnbp->xnb_irq) {
1853 		ddi_remove_intr(dip, 0, NULL);
1854 		xnbp->xnb_irq = B_FALSE;
1855 	}
1856 
1857 	if (xnbp->xnb_tx_unmop_count > 0)
1858 		xnb_tx_perform_pending_unmop(xnbp);
1859 
1860 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1861 		xvdi_free_evtchn(dip);
1862 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1863 	}
1864 
1865 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1866 		struct gnttab_unmap_grant_ref unmap_op;
1867 
1868 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1869 		    xnbp->xnb_rx_ring_addr;
1870 		unmap_op.dev_bus_addr = 0;
1871 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1872 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1873 		    &unmap_op, 1) != 0)
1874 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1875 			    "cannot unmap rx-ring page (%d)",
1876 			    unmap_op.status);
1877 
1878 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1879 	}
1880 
1881 	if (xnbp->xnb_rx_ring_addr != NULL) {
1882 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1883 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1884 		xnbp->xnb_rx_ring_addr = NULL;
1885 	}
1886 
1887 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1888 		struct gnttab_unmap_grant_ref unmap_op;
1889 
1890 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1891 		    xnbp->xnb_tx_ring_addr;
1892 		unmap_op.dev_bus_addr = 0;
1893 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1894 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1895 		    &unmap_op, 1) != 0)
1896 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1897 			    "cannot unmap tx-ring page (%d)",
1898 			    unmap_op.status);
1899 
1900 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1901 	}
1902 
1903 	if (xnbp->xnb_tx_ring_addr != NULL) {
1904 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1905 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1906 		xnbp->xnb_tx_ring_addr = NULL;
1907 	}
1908 }
1909 
1910 /*ARGSUSED*/
1911 static void
1912 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1913     void *arg, void *impl_data)
1914 {
1915 	xnb_t *xnbp = ddi_get_driver_private(dip);
1916 	XenbusState new_state = *(XenbusState *)impl_data;
1917 
1918 	ASSERT(xnbp != NULL);
1919 
1920 	switch (new_state) {
1921 	case XenbusStateConnected:
1922 		/* spurious state change */
1923 		if (xnbp->xnb_connected)
1924 			return;
1925 
1926 		if (xnb_connect_rings(dip)) {
1927 			xnbp->xnb_flavour->xf_peer_connected(xnbp);
1928 		} else {
1929 			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1930 			xnb_disconnect_rings(dip);
1931 			(void) xvdi_switch_state(dip, XBT_NULL,
1932 			    XenbusStateClosed);
1933 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1934 		}
1935 
1936 		/*
1937 		 * Now that we've attempted to connect it's reasonable
1938 		 * to allow an attempt to detach.
1939 		 */
1940 		xnbp->xnb_detachable = B_TRUE;
1941 
1942 		break;
1943 
1944 	case XenbusStateClosing:
1945 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1946 
1947 		break;
1948 
1949 	case XenbusStateClosed:
1950 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1951 
1952 		mutex_enter(&xnbp->xnb_tx_lock);
1953 		mutex_enter(&xnbp->xnb_rx_lock);
1954 
1955 		xnb_disconnect_rings(dip);
1956 		xnbp->xnb_connected = B_FALSE;
1957 
1958 		mutex_exit(&xnbp->xnb_rx_lock);
1959 		mutex_exit(&xnbp->xnb_tx_lock);
1960 
1961 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1962 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1963 		/*
1964 		 * In all likelyhood this is already set (in the above
1965 		 * case), but if the peer never attempted to connect
1966 		 * and the domain is destroyed we get here without
1967 		 * having been through the case above, so we set it to
1968 		 * be sure.
1969 		 */
1970 		xnbp->xnb_detachable = B_TRUE;
1971 
1972 		break;
1973 
1974 	default:
1975 		break;
1976 	}
1977 }
1978 
1979 /*ARGSUSED*/
1980 static void
1981 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1982     void *arg, void *impl_data)
1983 {
1984 	xnb_t *xnbp = ddi_get_driver_private(dip);
1985 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1986 	boolean_t success;
1987 
1988 	ASSERT(xnbp != NULL);
1989 
1990 	switch (state) {
1991 	case Connected:
1992 
1993 		/* spurious hotplug event */
1994 		if (xnbp->xnb_hotplugged)
1995 			return;
1996 
1997 		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
1998 
1999 		mutex_enter(&xnbp->xnb_tx_lock);
2000 		mutex_enter(&xnbp->xnb_rx_lock);
2001 
2002 		xnbp->xnb_hotplugged = success;
2003 
2004 		mutex_exit(&xnbp->xnb_rx_lock);
2005 		mutex_exit(&xnbp->xnb_tx_lock);
2006 		break;
2007 
2008 	default:
2009 		break;
2010 	}
2011 }
2012 
2013 static struct modldrv modldrv = {
2014 	&mod_miscops, "xnb",
2015 };
2016 
2017 static struct modlinkage modlinkage = {
2018 	MODREV_1, &modldrv, NULL
2019 };
2020 
2021 int
2022 _init(void)
2023 {
2024 	int i;
2025 
2026 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2027 
2028 	xnb_txbuf_cachep = kmem_cache_create("xnb_txbuf_cachep",
2029 	    sizeof (xnb_txbuf_t), 0, xnb_txbuf_constructor,
2030 	    xnb_txbuf_destructor, NULL, NULL, NULL, 0);
2031 	ASSERT(xnb_txbuf_cachep != NULL);
2032 
2033 	i = mod_install(&modlinkage);
2034 	if (i != DDI_SUCCESS) {
2035 		kmem_cache_destroy(xnb_txbuf_cachep);
2036 		mutex_destroy(&xnb_alloc_page_lock);
2037 	}
2038 	return (i);
2039 }
2040 
2041 int
2042 _info(struct modinfo *modinfop)
2043 {
2044 	return (mod_info(&modlinkage, modinfop));
2045 }
2046 
2047 int
2048 _fini(void)
2049 {
2050 	int i;
2051 
2052 	i = mod_remove(&modlinkage);
2053 	if (i == DDI_SUCCESS) {
2054 		kmem_cache_destroy(xnb_txbuf_cachep);
2055 		mutex_destroy(&xnb_alloc_page_lock);
2056 	}
2057 	return (i);
2058 }
2059