xref: /illumos-gate/usr/src/uts/common/xen/io/xdb.c (revision 7257d1b4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Note: This is the backend part of the split PV disk driver. This driver
29  * is not a nexus driver, nor is it a leaf driver(block/char/stream driver).
30  * Currently, it does not create any minor node. So, although, it runs in
31  * backend domain, it will not be used directly from within dom0.
32  * It simply gets block I/O requests issued by frontend from a shared page
33  * (blkif ring buffer - defined by Xen) between backend and frontend domain,
34  * generates a buf, and push it down to underlying disk target driver via
35  * ldi interface. When buf is done, this driver will generate a response
36  * and put it into ring buffer to inform frontend of the status of the I/O
37  * request issued by it. When a new virtual device entry is added in xenstore,
38  * there will be a watch event sent from Xen to the xvdi framework, which will,
39  * in turn, create the devinfo node and try to attach this driver
40  * (see xvdi_create_dev). When frontend peer changes its state to
41  * XenbusStateClose, an event will also be sent from Xen to xvdi framework,
42  * who will detach and remove this devinfo node (see i_xvdi_oestate_handler).
43  * I/O requests taken from the ring buffer and events coming from xenstore
44  * cannot be trusted. We verify them in xdb_get_buf() and
45  * xdb_check_state_transition().
45  *
46  * Virtual device configuration is read/written from/to the database via
47  * xenbus_* interfaces. The driver also uses xvdi_* to interact with the
48  * hypervisor.
48  * There is an on-going effort to make xvdi_* cover all xenbus_*.
49  */
50 
51 #pragma ident	"%Z%%M%	%I%	%E% SMI"
52 
53 #include <sys/types.h>
54 #include <sys/conf.h>
55 #include <sys/ddi.h>
56 #include <sys/dditypes.h>
57 #include <sys/sunddi.h>
58 #include <sys/list.h>
59 #include <sys/dkio.h>
60 #include <sys/cmlb.h>
61 #include <sys/vtoc.h>
62 #include <sys/modctl.h>
63 #include <sys/bootconf.h>
64 #include <sys/promif.h>
65 #include <sys/sysmacros.h>
66 #include <public/io/xenbus.h>
67 #include <xen/sys/xenbus_impl.h>
68 #include <xen/sys/xendev.h>
69 #include <sys/gnttab.h>
70 #include <sys/scsi/generic/inquiry.h>
71 #include <vm/seg_kmem.h>
72 #include <vm/hat_i86.h>
73 #include <sys/gnttab.h>
74 #include <sys/lofi.h>
75 #include <io/xdf.h>
76 #include <xen/io/blkif_impl.h>
77 #include <io/xdb.h>
78 
79 static xdb_t *xdb_statep;
80 static int xdb_debug = 0;
81 
82 static int xdb_push_response(xdb_t *, uint64_t, uint8_t, uint16_t);
83 static int xdb_get_request(xdb_t *, blkif_request_t *);
84 static void blkif_get_x86_32_req(blkif_request_t *, blkif_x86_32_request_t *);
85 static void blkif_get_x86_64_req(blkif_request_t *, blkif_x86_64_request_t *);
86 
87 #ifdef DEBUG
88 /*
89  * debug aid functions
90  */
91 
92 static void
93 logva(xdb_t *vdp, uint64_t va)
94 {
95 	uint64_t *page_addrs;
96 	int i;
97 
98 	page_addrs = vdp->page_addrs;
99 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
100 		if (page_addrs[i] == va)
101 			debug_enter("VA remapping found!");
102 	}
103 
104 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
105 		if (page_addrs[i] == 0) {
106 			page_addrs[i] = va;
107 			break;
108 		}
109 	}
110 	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
111 }
112 
/*
 * DEBUG only: remove a previously logva()'d VA from vdp->page_addrs,
 * called when the corresponding grant page is unmapped.  Asserts that
 * the VA was actually found in the table.
 */
static void
unlogva(xdb_t *vdp, uint64_t va)
{
	uint64_t *page_addrs;
	int i;

	page_addrs = vdp->page_addrs;
	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
		if (page_addrs[i] == va) {
			/* clear the slot so it can be reused by logva() */
			page_addrs[i] = 0;
			break;
		}
	}
	/* unlogging a VA that was never logged is a driver bug */
	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
}
128 
129 static void
130 xdb_dump_request_oe(blkif_request_t *req)
131 {
132 	int i;
133 
134 	/*
135 	 * Exploit the public interface definitions for BLKIF_OP_READ
136 	 * etc..
137 	 */
138 	char *op_name[] = { "read", "write", "barrier", "flush" };
139 
140 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "op=%s", op_name[req->operation]));
141 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "num of segments=%d",
142 	    req->nr_segments));
143 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "handle=%d", req->handle));
144 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "id=%llu",
145 	    (unsigned long long)req->id));
146 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "start sector=%llu",
147 	    (unsigned long long)req->sector_number));
148 	for (i = 0; i < req->nr_segments; i++) {
149 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "gref=%d, first sec=%d,"
150 		    "last sec=%d", req->seg[i].gref, req->seg[i].first_sect,
151 		    req->seg[i].last_sect));
152 	}
153 }
154 #endif /* DEBUG */
155 
156 /*
157  * Statistics.
158  */
/* kstat entry names; order must match the assignments in xdb_kstat_update() */
static char *xdb_stats[] = {
	"rd_reqs",
	"wr_reqs",
	"br_reqs",
	"fl_reqs",
	"oo_reqs"
};
166 
167 static int
168 xdb_kstat_update(kstat_t *ksp, int flag)
169 {
170 	xdb_t *vdp;
171 	kstat_named_t *knp;
172 
173 	if (flag != KSTAT_READ)
174 		return (EACCES);
175 
176 	vdp = ksp->ks_private;
177 	knp = ksp->ks_data;
178 
179 	/*
180 	 * Assignment order should match that of the names in
181 	 * xdb_stats.
182 	 */
183 	(knp++)->value.ui64 = vdp->xs_stat_req_reads;
184 	(knp++)->value.ui64 = vdp->xs_stat_req_writes;
185 	(knp++)->value.ui64 = vdp->xs_stat_req_barriers;
186 	(knp++)->value.ui64 = vdp->xs_stat_req_flushes;
187 	(knp++)->value.ui64 = 0; /* oo_req */
188 
189 	return (0);
190 }
191 
192 static boolean_t
193 xdb_kstat_init(xdb_t *vdp)
194 {
195 	int nstat = sizeof (xdb_stats) / sizeof (xdb_stats[0]);
196 	char **cp = xdb_stats;
197 	kstat_named_t *knp;
198 
199 	if ((vdp->xs_kstats = kstat_create("xdb",
200 	    ddi_get_instance(vdp->xs_dip),
201 	    "req_statistics", "block", KSTAT_TYPE_NAMED,
202 	    nstat, 0)) == NULL)
203 		return (B_FALSE);
204 
205 	vdp->xs_kstats->ks_private = vdp;
206 	vdp->xs_kstats->ks_update = xdb_kstat_update;
207 
208 	knp = vdp->xs_kstats->ks_data;
209 	while (nstat > 0) {
210 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
211 		knp++;
212 		cp++;
213 		nstat--;
214 	}
215 
216 	kstat_install(vdp->xs_kstats);
217 
218 	return (B_TRUE);
219 }
220 
221 static int xdb_biodone(buf_t *);
222 
/*
 * Build a buf_t describing (part of) a blkif I/O request.
 *
 * If 'req' is non-NULL this is a fresh request: validate the untrusted
 * fields from the frontend, map in the granted I/O pages, and initialize
 * the buf embedded in 'xreq'.  If 'req' is NULL we are continuing a
 * request that had to be split into multiple bufs (because a non-first
 * segment had a non-zero first_sect); reuse the same buf for the next
 * run of contiguous segments, starting at xreq->xr_curseg.
 *
 * Returns the initialized buf on success, NULL if grant mapping fails
 * (in which case any pages already mapped have been unmapped).
 */
static buf_t *
xdb_get_buf(xdb_t *vdp, blkif_request_t *req, xdb_request_t *xreq)
{
	buf_t *bp;
	uint8_t segs, curseg;
	int sectors;
	int i, err;
	gnttab_map_grant_ref_t mapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	ddi_acc_handle_t acchdl;

	acchdl = vdp->xs_ring_hdl;
	bp = XDB_XREQ2BP(xreq);
	curseg = xreq->xr_curseg;
	/* init a new xdb request */
	if (req != NULL) {
		ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
		boolean_t pagemapok = B_TRUE;
		uint8_t op = ddi_get8(acchdl, &req->operation);

		xreq->xr_vdp = vdp;
		xreq->xr_op = op;
		xreq->xr_id = ddi_get64(acchdl, &req->id);
		segs = xreq->xr_buf_pages = ddi_get8(acchdl, &req->nr_segments);
		if (segs == 0) {
			/*
			 * Only a cache-flush legitimately carries no data;
			 * warn about anything else, but still complete it
			 * as a zero-length buf.
			 */
			if (op != BLKIF_OP_FLUSH_DISKCACHE)
				cmn_err(CE_WARN, "!non-BLKIF_OP_FLUSH_DISKCACHE"
				    " is seen from domain %d with zero "
				    "length data buffer!", vdp->xs_peer);
			bioinit(bp);
			bp->b_bcount = 0;
			bp->b_lblkno = 0;
			bp->b_un.b_addr = NULL;
			return (bp);
		} else if (op == BLKIF_OP_FLUSH_DISKCACHE) {
			cmn_err(CE_WARN, "!BLKIF_OP_FLUSH_DISKCACHE"
			    " is seen from domain %d with non-zero "
			    "length data buffer!", vdp->xs_peer);
		}

		/*
		 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
		 * according to the definition of blk interface by Xen
		 * we do sanity check here
		 */
		if (segs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
			segs = xreq->xr_buf_pages =
			    BLKIF_MAX_SEGMENTS_PER_REQUEST;

		/* set up one grant-map op per segment */
		for (i = 0; i < segs; i++) {
			uint8_t fs, ls;

			mapops[i].host_addr =
			    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
			    vdp->xs_iopage_va, xreq->xr_idx, i);
			mapops[i].dom = vdp->xs_peer;
			mapops[i].ref = ddi_get32(acchdl, &req->seg[i].gref);
			mapops[i].flags = GNTMAP_host_map;
			/* we never write to the frontend's pages on a write */
			if (op != BLKIF_OP_READ)
				mapops[i].flags |= GNTMAP_readonly;

			fs = ddi_get8(acchdl, &req->seg[i].first_sect);
			ls = ddi_get8(acchdl, &req->seg[i].last_sect);

			/*
			 * first_sect should be no bigger than last_sect and
			 * both of them should be no bigger than
			 * (PAGESIZE / XB_BSIZE - 1) according to definition
			 * of blk interface by Xen, so sanity check again
			 */
			if (fs > (PAGESIZE / XB_BSIZE - 1))
				fs = PAGESIZE / XB_BSIZE - 1;
			if (ls > (PAGESIZE / XB_BSIZE - 1))
				ls = PAGESIZE / XB_BSIZE - 1;
			if (fs > ls)
				fs = ls;

			xreq->xr_segs[i].fs = fs;
			xreq->xr_segs[i].ls = ls;
		}

		/* map in io pages */
		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
		    mapops, i);
		if (err != 0)
			return (NULL);
		for (i = 0; i < segs; i++) {
			/*
			 * Although HYPERVISOR_grant_table_op() returned no
			 * error, mapping of each single page can fail. So,
			 * we have to do the check here and handle the error
			 * if needed
			 */
			if (mapops[i].status != GNTST_okay) {
				int j;
				/* release pfns assigned for earlier segments */
				for (j = 0; j < i; j++) {
#ifdef DEBUG
					unlogva(vdp, mapops[j].host_addr);
#endif
					xen_release_pfn(
					    xreq->xr_plist[j].p_pagenum);
				}
				pagemapok = B_FALSE;
				break;
			}
			/* record page mapping handle for unmapping later */
			xreq->xr_page_hdls[i] = mapops[i].handle;
#ifdef DEBUG
			logva(vdp, mapops[i].host_addr);
#endif
			/*
			 * Pass the MFNs down using the shadow list (xr_pplist)
			 *
			 * This is pretty ugly since we have implict knowledge
			 * of how the rootnex binds buffers.
			 * The GNTTABOP_map_grant_ref op makes us do some ugly
			 * stuff since we're not allowed to touch these PTEs
			 * from the VM.
			 *
			 * Obviously, these aren't real page_t's. The rootnex
			 * only needs p_pagenum.
			 * Also, don't use btop() here or 32 bit PAE breaks.
			 */
			xreq->xr_pplist[i] = &xreq->xr_plist[i];
			xreq->xr_plist[i].p_pagenum =
			    xen_assign_pfn(mapops[i].dev_bus_addr >> PAGESHIFT);
		}

		/*
		 * not all pages mapped in successfully, unmap those mapped-in
		 * page and return failure
		 */
		if (!pagemapok) {
			gnttab_unmap_grant_ref_t unmapop;

			for (i = 0; i < segs; i++) {
				if (mapops[i].status != GNTST_okay)
					continue;
				unmapop.host_addr =
				    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
				    vdp->xs_iopage_va, xreq->xr_idx, i);
				unmapop.dev_bus_addr = NULL;
				unmapop.handle = mapops[i].handle;
				(void) HYPERVISOR_grant_table_op(
				    GNTTABOP_unmap_grant_ref, &unmapop, 1);
			}

			return (NULL);
		}
		bioinit(bp);
		bp->b_lblkno = ddi_get64(acchdl, &req->sector_number);
		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
		bp->b_flags |= (ddi_get8(acchdl, &req->operation) ==
		    BLKIF_OP_READ) ? B_READ : (B_WRITE | B_ASYNC);
	} else {
		uint64_t blkst;
		int isread;

		/* reuse this buf: continue right after the previous chunk */
		blkst = bp->b_lblkno + bp->b_bcount / DEV_BSIZE;
		isread = bp->b_flags & B_READ;
		bioreset(bp);
		bp->b_lblkno = blkst;
		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
		bp->b_flags |= isread ? B_READ : (B_WRITE | B_ASYNC);
		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "reuse buf, xreq is %d!!",
		    xreq->xr_idx));
	}

	/* form a buf */
	bp->b_un.b_addr = XDB_IOPAGE_VA(vdp->xs_iopage_va, xreq->xr_idx,
	    curseg) + xreq->xr_segs[curseg].fs * DEV_BSIZE;
	bp->b_shadow = &xreq->xr_pplist[curseg];
	bp->b_iodone = xdb_biodone;
	sectors = 0;
	/* accumulate contiguous segments into this buf */
	for (i = curseg; i < xreq->xr_buf_pages; i++) {
		/*
		 * The xreq->xr_segs[i].fs of the first seg can be non-zero
		 * otherwise, we'll break it into multiple bufs
		 */
		if ((i != curseg) && (xreq->xr_segs[i].fs != 0)) {
			break;
		}
		sectors += (xreq->xr_segs[i].ls - xreq->xr_segs[i].fs + 1);
	}
	/* remember where the next chunk (if any) starts */
	xreq->xr_curseg = i;
	bp->b_bcount = sectors * DEV_BSIZE;
	bp->b_bufsize = bp->b_bcount;

	return (bp);
}
413 
414 static xdb_request_t *
415 xdb_get_req(xdb_t *vdp)
416 {
417 	xdb_request_t *req;
418 	int idx;
419 
420 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
421 	ASSERT(vdp->xs_free_req != -1);
422 	req = &vdp->xs_req[vdp->xs_free_req];
423 	vdp->xs_free_req = req->xr_next;
424 	idx = req->xr_idx;
425 	bzero(req, sizeof (xdb_request_t));
426 	req->xr_idx = idx;
427 	return (req);
428 }
429 
/*
 * Return an xdb request to the per-device free list (push on head).
 * Caller must hold xs_iomutex.
 */
static void
xdb_free_req(xdb_request_t *req)
{
	xdb_t *vdp = req->xr_vdp;

	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
	req->xr_next = vdp->xs_free_req;
	vdp->xs_free_req = req->xr_idx;
}
439 
/*
 * Push a response for 'req' onto the ring and, if xdb_push_response()
 * says the other end needs notification, kick the frontend's event
 * channel.  'ok' is the status to report back.
 */
static void
xdb_response(xdb_t *vdp, blkif_request_t *req, boolean_t ok)
{
	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;

	if (xdb_push_response(vdp, ddi_get64(acchdl, &req->id),
	    ddi_get8(acchdl, &req->operation), ok))
		xvdi_notify_oe(vdp->xs_dip);
}
449 
/*
 * Set up per-connection I/O request resources: the xdb_request_t pool
 * (threaded into a free list via xr_next), and a VA range in the kernel
 * heap for mapping in the frontend's granted I/O pages.  Called when
 * connecting to the frontend; undone by xdb_uninit_ioreqs().
 */
static void
xdb_init_ioreqs(xdb_t *vdp)
{
	int i;

	ASSERT(vdp->xs_nentry);

	if (vdp->xs_req == NULL)
		vdp->xs_req = kmem_alloc(vdp->xs_nentry *
		    sizeof (xdb_request_t), KM_SLEEP);
#ifdef DEBUG
	/* VA-tracking table used by logva()/unlogva() */
	if (vdp->page_addrs == NULL)
		vdp->page_addrs = kmem_zalloc(XDB_MAX_IO_PAGES(vdp) *
		    sizeof (uint64_t), KM_SLEEP);
#endif
	/* thread all entries into the free list, terminated by -1 */
	for (i = 0; i < vdp->xs_nentry; i++) {
		vdp->xs_req[i].xr_idx = i;
		vdp->xs_req[i].xr_next = i + 1;
	}
	vdp->xs_req[vdp->xs_nentry - 1].xr_next = -1;
	vdp->xs_free_req = 0;

	/* alloc va in host dom for io page mapping */
	vdp->xs_iopage_va = vmem_xalloc(heap_arena,
	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE, PAGESIZE, 0, 0, 0, 0,
	    VM_SLEEP);
	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
		hat_prepare_mapping(kas.a_hat,
		    vdp->xs_iopage_va + i * PAGESIZE);
}
480 
/*
 * Tear down everything xdb_init_ioreqs() set up: release the prepared
 * HAT mappings, free the I/O page VA range, the request pool and (DEBUG)
 * the VA-tracking table.
 */
static void
xdb_uninit_ioreqs(xdb_t *vdp)
{
	int i;

	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
		hat_release_mapping(kas.a_hat,
		    vdp->xs_iopage_va + i * PAGESIZE);
	vmem_xfree(heap_arena, vdp->xs_iopage_va,
	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE);
	if (vdp->xs_req != NULL) {
		kmem_free(vdp->xs_req, vdp->xs_nentry * sizeof (xdb_request_t));
		vdp->xs_req = NULL;
	}
#ifdef DEBUG
	if (vdp->page_addrs != NULL) {
		kmem_free(vdp->page_addrs, XDB_MAX_IO_PAGES(vdp) *
		    sizeof (uint64_t));
		vdp->page_addrs = NULL;
	}
#endif
}
503 
/*
 * Event-channel interrupt handler: drain all pending requests from the
 * ring.  Valid I/O requests are turned into bufs and queued on the
 * xs_f_iobuf/xs_l_iobuf list for the taskq thread to push to the
 * underlying block driver; everything else gets an immediate failure
 * response.  Returns DDI_INTR_CLAIMED if at least one request was found.
 */
static uint_t
xdb_intr(caddr_t arg)
{
	blkif_request_t req;
	blkif_request_t *reqp = &req;
	xdb_request_t *xreq;
	buf_t *bp;
	uint8_t op;
	xdb_t *vdp = (xdb_t *)arg;
	int ret = DDI_INTR_UNCLAIMED;
	dev_info_t *dip = vdp->xs_dip;

	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
	    "xdb@%s: I/O request received from dom %d",
	    ddi_get_name_addr(dip), vdp->xs_peer));

	mutex_enter(&vdp->xs_iomutex);

	/* shouldn't touch ring buffer if not in connected state */
	if (vdp->xs_if_status != XDB_CONNECTED) {
		mutex_exit(&vdp->xs_iomutex);
		return (DDI_INTR_UNCLAIMED);
	}

	/*
	 * We'll loop till there is no more request in the ring.
	 * We won't be stuck in this loop forever since the size of the
	 * ring buffer is limited, and the frontend will stop pushing
	 * requests into it when the ring buffer is full.
	 */

	/* req_event will be increased in xvdi_ring_get_request() */
	while (xdb_get_request(vdp, reqp)) {
		ret = DDI_INTR_CLAIMED;

		op = ddi_get8(vdp->xs_ring_hdl, &reqp->operation);
		if (op == BLKIF_OP_READ			||
		    op == BLKIF_OP_WRITE		||
		    op == BLKIF_OP_WRITE_BARRIER	||
		    op == BLKIF_OP_FLUSH_DISKCACHE) {
#ifdef DEBUG
			xdb_dump_request_oe(reqp);
#endif
			xreq = xdb_get_req(vdp);
			ASSERT(xreq);
			/* bump the per-opcode statistics */
			switch (op) {
			case BLKIF_OP_READ:
				vdp->xs_stat_req_reads++;
				break;
			case BLKIF_OP_WRITE_BARRIER:
				vdp->xs_stat_req_barriers++;
				/* FALLTHRU */
			case BLKIF_OP_WRITE:
				vdp->xs_stat_req_writes++;
				break;
			case BLKIF_OP_FLUSH_DISKCACHE:
				vdp->xs_stat_req_flushes++;
				break;
			}

			xreq->xr_curseg = 0; /* start from first segment */
			bp = xdb_get_buf(vdp, reqp, xreq);
			if (bp == NULL) {
				/* failed to form a buf */
				xdb_free_req(xreq);
				xdb_response(vdp, reqp, B_FALSE);
				continue;
			}
			bp->av_forw = NULL;

			XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
			    " buf %p, blkno %lld, size %lu, addr %p",
			    (void *)bp, (longlong_t)bp->b_blkno,
			    (ulong_t)bp->b_bcount, (void *)bp->b_un.b_addr));

			/* send bp to underlying blk driver */
			if (vdp->xs_f_iobuf == NULL) {
				vdp->xs_f_iobuf = vdp->xs_l_iobuf = bp;
			} else {
				vdp->xs_l_iobuf->av_forw = bp;
				vdp->xs_l_iobuf = bp;
			}
		} else {
			/* unknown opcode: fail it back to the frontend */
			xdb_response(vdp, reqp, B_FALSE);
			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
			    "Unsupported cmd received from dom %d",
			    ddi_get_name_addr(dip), vdp->xs_peer));
		}
	}
	/* notify our taskq to push buf to underlying blk driver */
	if (ret == DDI_INTR_CLAIMED)
		cv_broadcast(&vdp->xs_iocv);

	mutex_exit(&vdp->xs_iomutex);

	return (ret);
}
601 
/*
 * b_iodone callback for bufs issued by this driver.  If the request was
 * split and more segments remain, issue the next chunk and return.
 * Otherwise unmap the granted I/O pages, honour barrier/flush semantics
 * by flushing the backend's write cache, push the response to the
 * frontend, and release the buf and xdb request.  Also wakes up the
 * close path waiting for outstanding I/O to drain.
 */
static int
xdb_biodone(buf_t *bp)
{
	int i, err, bioerr;
	uint8_t segs;
	gnttab_unmap_grant_ref_t unmapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	xdb_request_t *xreq = XDB_BP2XREQ(bp);
	xdb_t *vdp = xreq->xr_vdp;
	buf_t *nbp;

	bioerr = geterror(bp);
	if (bioerr)
		XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: I/O error %d",
		    ddi_get_name_addr(vdp->xs_dip), bioerr));

	/* check if we are done w/ this I/O request */
	if ((bioerr == 0) && (xreq->xr_curseg < xreq->xr_buf_pages)) {
		/* more segments remain: build and issue the next chunk */
		nbp = xdb_get_buf(vdp, NULL, xreq);
		if (nbp) {
			err = ldi_strategy(vdp->xs_ldi_hdl, nbp);
			if (err == 0) {
				XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
				    "sent buf to backend ok"));
				return (DDI_SUCCESS);
			}
			bioerr = EIO;
			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
			    "sent buf to backend dev failed, err=%d",
			    ddi_get_name_addr(vdp->xs_dip), err));
		} else {
			bioerr = EIO;
		}
	}

	/* unmap io pages */
	segs = xreq->xr_buf_pages;
	/*
	 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
	 * according to the definition of blk interface by Xen
	 */
	ASSERT(segs <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
	for (i = 0; i < segs; i++) {
		unmapops[i].host_addr = (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
		    vdp->xs_iopage_va, xreq->xr_idx, i);
#ifdef DEBUG
		mutex_enter(&vdp->xs_iomutex);
		unlogva(vdp, unmapops[i].host_addr);
		mutex_exit(&vdp->xs_iomutex);
#endif
		unmapops[i].dev_bus_addr = NULL;
		unmapops[i].handle = xreq->xr_page_hdls[i];
	}
	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
	    unmapops, segs);
	ASSERT(!err);

	/*
	 * If we have reached a barrier write or a cache flush , then we must
	 * flush all our I/Os.
	 */
	if (xreq->xr_op == BLKIF_OP_WRITE_BARRIER ||
	    xreq->xr_op == BLKIF_OP_FLUSH_DISKCACHE) {
		/*
		 * XXX At this point the write did succeed, so I don't
		 * believe we should report an error because the flush
		 * failed. However, this is a debatable point, so
		 * maybe we need to think more carefully about this.
		 * For now, just cast to void.
		 */
		(void) ldi_ioctl(vdp->xs_ldi_hdl,
		    DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, kcred, NULL);
	}

	mutex_enter(&vdp->xs_iomutex);

	/* send response back to frontend */
	if (vdp->xs_if_status == XDB_CONNECTED) {
		if (xdb_push_response(vdp, xreq->xr_id, xreq->xr_op, bioerr))
			xvdi_notify_oe(vdp->xs_dip);
		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
		    "sent resp back to frontend, id=%llu",
		    (unsigned long long)xreq->xr_id));
	}
	/* free io resources */
	biofini(bp);
	xdb_free_req(xreq);

	vdp->xs_ionum--;
	if ((vdp->xs_if_status != XDB_CONNECTED) && (vdp->xs_ionum == 0)) {
		/* we're closing, someone is waiting for I/O clean-up */
		cv_signal(&vdp->xs_ionumcv);
	}

	mutex_exit(&vdp->xs_iomutex);

	return (DDI_SUCCESS);
}
699 
/*
 * Bind to the frontend driver: read its ring reference, event channel
 * and (optionally) ring protocol from xenstore, map in the shared ring
 * with the matching geometry, and bind the event channel.  Returns
 * DDI_SUCCESS/DDI_FAILURE; on success xdb_unbindfrom_frontend() is the
 * inverse.
 */
static int
xdb_bindto_frontend(xdb_t *vdp)
{
	int err;
	char *oename;
	grant_ref_t gref;
	evtchn_port_t evtchn;
	dev_info_t *dip = vdp->xs_dip;
	char protocol[64] = "";

	/*
	 * Gather info from frontend
	 */
	oename = xvdi_get_oename(dip);
	if (oename == NULL)
		return (DDI_FAILURE);

	err = xenbus_gather(XBT_NULL, oename,
	    "ring-ref", "%lu", &gref, "event-channel", "%u", &evtchn, NULL);
	if (err != 0) {
		xvdi_fatal_error(dip, err,
		    "Getting ring-ref and evtchn from frontend");
		return (DDI_FAILURE);
	}

	/* default: native protocol (same ABI as the backend) */
	vdp->xs_blk_protocol = BLKIF_PROTOCOL_NATIVE;
	vdp->xs_nentry = BLKIF_RING_SIZE;
	vdp->xs_entrysize = sizeof (union blkif_sring_entry);

	err = xenbus_gather(XBT_NULL, oename,
	    "protocol", "%63s", protocol, NULL);
	if (err)
		(void) strcpy(protocol, "unspecified, assuming native");
	else {
		/*
		 * We must check for NATIVE first, so that the fast path
		 * is taken for copying data from the guest to the host.
		 */
		if (strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE) != 0) {
			if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_32;
				vdp->xs_nentry = BLKIF_X86_32_RING_SIZE;
				vdp->xs_entrysize =
				    sizeof (union blkif_x86_32_sring_entry);
			} else if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_64) ==
			    0) {
				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_64;
				vdp->xs_nentry = BLKIF_X86_64_RING_SIZE;
				vdp->xs_entrysize =
				    sizeof (union blkif_x86_64_sring_entry);
			} else {
				xvdi_fatal_error(dip, err, "unknown protocol");
				return (DDI_FAILURE);
			}
		}
	}
#ifdef DEBUG
	cmn_err(CE_NOTE, "!xdb@%s: blkif protocol '%s' ",
	    ddi_get_name_addr(dip), protocol);
#endif

	/*
	 * map and init ring
	 *
	 * The ring parameters must match those which have been allocated
	 * in the front end.
	 */
	err = xvdi_map_ring(dip, vdp->xs_nentry, vdp->xs_entrysize,
	    gref, &vdp->xs_ring);
	if (err != DDI_SUCCESS)
		return (DDI_FAILURE);
	/*
	 * This will be removed after we use shadow I/O ring request since
	 * we don't need to access the ring itself directly, thus the access
	 * handle is not needed
	 */
	vdp->xs_ring_hdl = vdp->xs_ring->xr_acc_hdl;

	/*
	 * bind event channel
	 */
	err = xvdi_bind_evtchn(dip, evtchn);
	if (err != DDI_SUCCESS) {
		/* undo the ring mapping before failing */
		xvdi_unmap_ring(vdp->xs_ring);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
789 
/*
 * Undo xdb_bindto_frontend(): free the event channel and unmap the
 * shared ring.
 */
static void
xdb_unbindfrom_frontend(xdb_t *vdp)
{
	xvdi_free_evtchn(vdp->xs_dip);
	xvdi_unmap_ring(vdp->xs_ring);
}
796 
797 #define	LOFI_CTRL_NODE	"/dev/lofictl"
798 #define	LOFI_DEV_NODE	"/devices/pseudo/lofi@0:"
799 #define	LOFI_MODE	FREAD | FWRITE | FEXCL
800 
/*
 * Resolve the device node path for this vbd and store it in 'path'
 * (a caller-supplied buffer of at least MAXPATHLEN + 1 bytes).
 *
 * For a plain block device backend the 'params' xenstore property is the
 * path itself.  For a file backend (XDB_IS_LOFI) we map the file through
 * lofi and return the /devices path of the new lofi node, which is also
 * published to xenstore as 'node'.
 */
static int
xdb_setup_node(xdb_t *vdp, char *path)
{
	dev_info_t *dip;
	char *xsnode, *node;
	ldi_handle_t ldi_hdl;
	struct lofi_ioctl *li;
	int minor;
	int err;
	unsigned int len;

	dip = vdp->xs_dip;
	xsnode = xvdi_get_xsname(dip);
	if (xsnode == NULL)
		return (DDI_FAILURE);

	err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node, &len);
	if (err != 0) {
		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
		return (DDI_FAILURE);
	}

	if (!XDB_IS_LOFI(vdp)) {
		/* block device backend: 'params' is the device path */
		(void) strlcpy(path, node, MAXPATHLEN + 1);
		kmem_free(node, len);
		return (DDI_SUCCESS);
	}

	/* lofictl may be transiently busy; retry until we get it */
	do {
		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
		    &ldi_hdl, vdp->xs_ldi_li);
	} while (err == EBUSY);
	if (err != 0) {
		kmem_free(node, len);
		return (DDI_FAILURE);
	}

	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
	(void) strlcpy(li->li_filename, node, MAXPATHLEN + 1);
	kmem_free(node, len);
	if (ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
	    LOFI_MODE | FKIOCTL, kcred, &minor) != 0) {
		cmn_err(CE_WARN, "xdb@%s: Failed to create lofi dev for %s",
		    ddi_get_name_addr(dip), li->li_filename);
		(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
		kmem_free(li, sizeof (*li));
		return (DDI_FAILURE);
	}
	/*
	 * return '/devices/...' instead of '/dev/lofi/...' since the
	 * former is available immediately after calling ldi_ioctl
	 */
	(void) snprintf(path, MAXPATHLEN + 1, LOFI_DEV_NODE "%d", minor);
	(void) xenbus_printf(XBT_NULL, xsnode, "node", "%s", path);
	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
	kmem_free(li, sizeof (*li));
	return (DDI_SUCCESS);
}
859 
/*
 * Undo the lofi mapping created by xdb_setup_node().  A no-op for
 * non-file (non-lofi) backends.  Errors are logged but otherwise
 * ignored since this is a teardown path.
 */
static void
xdb_teardown_node(xdb_t *vdp)
{
	dev_info_t *dip;
	char *xsnode, *node;
	ldi_handle_t ldi_hdl;
	struct lofi_ioctl *li;
	int err;
	unsigned int len;

	if (!XDB_IS_LOFI(vdp))
		return;

	dip = vdp->xs_dip;
	xsnode = xvdi_get_xsname(dip);
	if (xsnode == NULL)
		return;

	err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node, &len);
	if (err != 0) {
		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
		return;
	}

	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
	(void) strlcpy(li->li_filename, node, MAXPATHLEN + 1);
	kmem_free(node, len);

	/* lofictl may be transiently busy; retry until we get it */
	do {
		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
		    &ldi_hdl, vdp->xs_ldi_li);
	} while (err == EBUSY);

	if (err != 0) {
		kmem_free(li, sizeof (*li));
		return;
	}

	if (ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE, (intptr_t)li,
	    LOFI_MODE | FKIOCTL, kcred, NULL) != 0) {
		cmn_err(CE_WARN, "xdb@%s: Failed to delete lofi dev for %s",
		    ddi_get_name_addr(dip), li->li_filename);
	}

	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
	kmem_free(li, sizeof (*li));
}
907 
908 static int
909 xdb_open_device(xdb_t *vdp)
910 {
911 	uint64_t devsize;
912 	dev_info_t *dip;
913 	char *xsnode;
914 	char *nodepath;
915 	char *mode = NULL;
916 	char *type = NULL;
917 	int err;
918 
919 	dip = vdp->xs_dip;
920 	xsnode = xvdi_get_xsname(dip);
921 	if (xsnode == NULL)
922 		return (DDI_FAILURE);
923 
924 	err = xenbus_gather(XBT_NULL, xsnode,
925 	    "mode", NULL, &mode, "type", NULL, &type, NULL);
926 	if (err != 0) {
927 		if (mode)
928 			kmem_free(mode, strlen(mode) + 1);
929 		if (type)
930 			kmem_free(type, strlen(type) + 1);
931 		xvdi_fatal_error(dip, err,
932 		    "Getting mode and type from backend device");
933 		return (DDI_FAILURE);
934 	}
935 	if (strcmp(type, "file") == 0) {
936 		vdp->xs_type |= XDB_DEV_LOFI;
937 	}
938 	kmem_free(type, strlen(type) + 1);
939 	if ((strcmp(mode, "r") == NULL) || (strcmp(mode, "ro") == NULL)) {
940 		vdp->xs_type |= XDB_DEV_RO;
941 	}
942 	kmem_free(mode, strlen(mode) + 1);
943 
944 	/*
945 	 * try to open backend device
946 	 */
947 	if (ldi_ident_from_dip(dip, &vdp->xs_ldi_li) != 0)
948 		return (DDI_FAILURE);
949 
950 	nodepath = kmem_zalloc(MAXPATHLEN + 1, KM_SLEEP);
951 	err = xdb_setup_node(vdp, nodepath);
952 	if (err != DDI_SUCCESS) {
953 		xvdi_fatal_error(dip, err,
954 		    "Getting device path of backend device");
955 		ldi_ident_release(vdp->xs_ldi_li);
956 		kmem_free(nodepath, MAXPATHLEN + 1);
957 		return (DDI_FAILURE);
958 	}
959 
960 	if (ldi_open_by_name(nodepath,
961 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE),
962 	    kcred, &vdp->xs_ldi_hdl, vdp->xs_ldi_li) != 0) {
963 		xdb_teardown_node(vdp);
964 		ldi_ident_release(vdp->xs_ldi_li);
965 		cmn_err(CE_WARN, "xdb@%s: Failed to open: %s",
966 		    ddi_get_name_addr(dip), nodepath);
967 		kmem_free(nodepath, MAXPATHLEN + 1);
968 		return (DDI_FAILURE);
969 	}
970 
971 	/* check if it's a CD/DVD disc */
972 	if (ldi_prop_get_int(vdp->xs_ldi_hdl, LDI_DEV_T_ANY | DDI_PROP_DONTPASS,
973 	    "inquiry-device-type", DTYPE_DIRECT) == DTYPE_RODIRECT)
974 		vdp->xs_type |= XDB_DEV_CD;
975 	/* check if it's a removable disk */
976 	if (ldi_prop_exists(vdp->xs_ldi_hdl,
977 	    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
978 	    "removable-media"))
979 		vdp->xs_type |= XDB_DEV_RMB;
980 
981 	if (ldi_get_size(vdp->xs_ldi_hdl, &devsize) != DDI_SUCCESS) {
982 		(void) ldi_close(vdp->xs_ldi_hdl,
983 		    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
984 		xdb_teardown_node(vdp);
985 		ldi_ident_release(vdp->xs_ldi_li);
986 		kmem_free(nodepath, MAXPATHLEN + 1);
987 		return (DDI_FAILURE);
988 	}
989 	vdp->xs_sectors = devsize / XB_BSIZE;
990 
991 	kmem_free(nodepath, MAXPATHLEN + 1);
992 	return (DDI_SUCCESS);
993 }
994 
/*
 * Undo xdb_open_device(): close the LDI handle (with the same flags it
 * was opened with), tear down any lofi mapping, and release the LDI
 * identity.
 */
static void
xdb_close_device(xdb_t *vdp)
{
	(void) ldi_close(vdp->xs_ldi_hdl,
	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
	xdb_teardown_node(vdp);
	ldi_ident_release(vdp->xs_ldi_li);
	vdp->xs_ldi_li = NULL;
	vdp->xs_ldi_hdl = NULL;
}
1005 
1006 /*
1007  * Kick-off connect process
1008  * If xs_fe_status == XDB_FE_READY and xs_dev_status == XDB_DEV_READY
1009  * the xs_if_status will be changed to XDB_CONNECTED on success,
1010  * otherwise, xs_if_status will not be changed
1011  */
1012 static int
1013 xdb_start_connect(xdb_t *vdp)
1014 {
1015 	uint32_t dinfo;
1016 	xenbus_transaction_t xbt;
1017 	int err, svdst;
1018 	char *xsnode;
1019 	dev_info_t *dip = vdp->xs_dip;
1020 	char *barrier;
1021 	uint_t len;
1022 
1023 	/*
1024 	 * Start connect to frontend only when backend device are ready
1025 	 * and frontend has moved to XenbusStateInitialised, which means
1026 	 * ready to connect
1027 	 */
1028 	ASSERT((vdp->xs_fe_status == XDB_FE_READY) &&
1029 	    (vdp->xs_dev_status == XDB_DEV_READY));
1030 
1031 	if (((xsnode = xvdi_get_xsname(dip)) == NULL)		 ||
1032 	    ((vdp->xs_peer = xvdi_get_oeid(dip)) == (domid_t)-1) ||
1033 	    (xdb_open_device(vdp) != DDI_SUCCESS))
1034 		return (DDI_FAILURE);
1035 
1036 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialised);
1037 
1038 	if (xdb_bindto_frontend(vdp) != DDI_SUCCESS)
1039 		goto errout1;
1040 
1041 	/* init i/o requests */
1042 	xdb_init_ioreqs(vdp);
1043 
1044 	if (ddi_add_intr(dip, 0, NULL, NULL, xdb_intr, (caddr_t)vdp)
1045 	    != DDI_SUCCESS)
1046 		goto errout2;
1047 
1048 	/*
1049 	 * we can recieve intr any time from now on
1050 	 * mark that we're ready to take intr
1051 	 */
1052 	mutex_enter(&vdp->xs_iomutex);
1053 	/*
1054 	 * save it in case we need to restore when we
1055 	 * fail to write xenstore later
1056 	 */
1057 	svdst = vdp->xs_if_status;
1058 	vdp->xs_if_status = XDB_CONNECTED;
1059 	mutex_exit(&vdp->xs_iomutex);
1060 
1061 	/* write into xenstore the info needed by frontend */
1062 trans_retry:
1063 	if (xenbus_transaction_start(&xbt)) {
1064 		xvdi_fatal_error(dip, EIO, "transaction start");
1065 		goto errout3;
1066 	}
1067 
1068 	/*
1069 	 * If feature-barrier isn't present in xenstore, add it.
1070 	 */
1071 	if (xenbus_read(xbt, xsnode, "feature-barrier",
1072 	    (void **)&barrier, &len) != 0) {
1073 		if ((err = xenbus_printf(xbt, xsnode, "feature-barrier",
1074 		    "%d", 1)) != 0) {
1075 			cmn_err(CE_WARN, "xdb@%s: failed to write "
1076 			    "'feature-barrier'", ddi_get_name_addr(dip));
1077 			xvdi_fatal_error(dip, err, "writing 'feature-barrier'");
1078 			goto abort_trans;
1079 		}
1080 	} else
1081 		kmem_free(barrier, len);
1082 
1083 	dinfo = 0;
1084 	if (XDB_IS_RO(vdp))
1085 		dinfo |= VDISK_READONLY;
1086 	if (XDB_IS_CD(vdp))
1087 		dinfo |= VDISK_CDROM;
1088 	if (XDB_IS_RMB(vdp))
1089 		dinfo |= VDISK_REMOVABLE;
1090 	if (err = xenbus_printf(xbt, xsnode, "info", "%u", dinfo)) {
1091 		xvdi_fatal_error(dip, err, "writing 'info'");
1092 		goto abort_trans;
1093 	}
1094 
1095 	/* hard-coded 512-byte sector size */
1096 	if (err = xenbus_printf(xbt, xsnode, "sector-size", "%u", DEV_BSIZE)) {
1097 		xvdi_fatal_error(dip, err, "writing 'sector-size'");
1098 		goto abort_trans;
1099 	}
1100 
1101 	if (err = xenbus_printf(xbt, xsnode, "sectors", "%"PRIu64,
1102 	    vdp->xs_sectors)) {
1103 		xvdi_fatal_error(dip, err, "writing 'sectors'");
1104 		goto abort_trans;
1105 	}
1106 
1107 	if (err = xenbus_printf(xbt, xsnode, "instance", "%d",
1108 	    ddi_get_instance(dip))) {
1109 		xvdi_fatal_error(dip, err, "writing 'instance'");
1110 		goto abort_trans;
1111 	}
1112 
1113 	if ((err = xvdi_switch_state(dip, xbt, XenbusStateConnected)) > 0) {
1114 		xvdi_fatal_error(dip, err, "writing 'state'");
1115 		goto abort_trans;
1116 	}
1117 
1118 	if (err = xenbus_transaction_end(xbt, 0)) {
1119 		if (err == EAGAIN)
1120 			/* transaction is ended, don't need to abort it */
1121 			goto trans_retry;
1122 		xvdi_fatal_error(dip, err, "completing transaction");
1123 		goto errout3;
1124 	}
1125 
1126 	return (DDI_SUCCESS);
1127 
1128 abort_trans:
1129 	(void) xenbus_transaction_end(xbt, 1);
1130 errout3:
1131 	mutex_enter(&vdp->xs_iomutex);
1132 	vdp->xs_if_status = svdst;
1133 	mutex_exit(&vdp->xs_iomutex);
1134 	ddi_remove_intr(dip, 0, NULL);
1135 errout2:
1136 	xdb_uninit_ioreqs(vdp);
1137 	xdb_unbindfrom_frontend(vdp);
1138 errout1:
1139 	xdb_close_device(vdp);
1140 	return (DDI_FAILURE);
1141 }
1142 
1143 /*
1144  * Kick-off disconnect process
1145  * xs_if_status will not be changed
1146  */
1147 static int
1148 xdb_start_disconnect(xdb_t *vdp)
1149 {
1150 	/*
1151 	 * Kick-off disconnect process
1152 	 */
1153 	if (xvdi_switch_state(vdp->xs_dip, XBT_NULL, XenbusStateClosing) > 0)
1154 		return (DDI_FAILURE);
1155 
1156 	return (DDI_SUCCESS);
1157 }
1158 
1159 /*
1160  * Disconnect from frontend and close backend device
1161  * ifstatus will be changed to XDB_DISCONNECTED
1162  * Xenbus state will be changed to XenbusStateClosed
1163  */
1164 static void
1165 xdb_close(dev_info_t *dip)
1166 {
1167 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1168 
1169 	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));
1170 
1171 	mutex_enter(&vdp->xs_iomutex);
1172 
1173 	if (vdp->xs_if_status != XDB_CONNECTED) {
1174 		vdp->xs_if_status = XDB_DISCONNECTED;
1175 		cv_broadcast(&vdp->xs_iocv);
1176 		mutex_exit(&vdp->xs_iomutex);
1177 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1178 		return;
1179 	}
1180 	vdp->xs_if_status = XDB_DISCONNECTED;
1181 	cv_broadcast(&vdp->xs_iocv);
1182 
1183 	mutex_exit(&vdp->xs_iomutex);
1184 
1185 	/* stop accepting I/O request from frontend */
1186 	ddi_remove_intr(dip, 0, NULL);
1187 	/* clear all on-going I/Os, if any */
1188 	mutex_enter(&vdp->xs_iomutex);
1189 	while (vdp->xs_ionum > 0)
1190 		cv_wait(&vdp->xs_ionumcv, &vdp->xs_iomutex);
1191 	mutex_exit(&vdp->xs_iomutex);
1192 
1193 	/* clean up resources and close this interface */
1194 	xdb_uninit_ioreqs(vdp);
1195 	xdb_unbindfrom_frontend(vdp);
1196 	xdb_close_device(vdp);
1197 	vdp->xs_peer = (domid_t)-1;
1198 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1199 }
1200 
1201 /*
1202  * Xdb_check_state_transition will check the XenbusState change to see
1203  * if the change is a valid transition or not.
1204  * The new state is written by frontend domain, or by running xenstore-write
1205  * to change it manually in dom0
1206  */
1207 static int
1208 xdb_check_state_transition(xdb_t *vdp, XenbusState oestate)
1209 {
1210 	enum xdb_state status;
1211 	int stcheck;
1212 #define	STOK	0 /* need further process */
1213 #define	STNOP	1 /* no action need taking */
1214 #define	STBUG	2 /* unexpected state change, could be a bug */
1215 
1216 	status = vdp->xs_if_status;
1217 	stcheck = STOK;
1218 
1219 	switch (status) {
1220 	case XDB_UNKNOWN:
1221 		if (vdp->xs_fe_status == XDB_FE_UNKNOWN) {
1222 			if ((oestate == XenbusStateUnknown)		||
1223 			    (oestate == XenbusStateConnected))
1224 				stcheck = STBUG;
1225 			else if ((oestate == XenbusStateInitialising)	||
1226 			    (oestate == XenbusStateInitWait))
1227 				stcheck = STNOP;
1228 		} else {
1229 			if ((oestate == XenbusStateUnknown)		||
1230 			    (oestate == XenbusStateInitialising)	||
1231 			    (oestate == XenbusStateInitWait)		||
1232 			    (oestate == XenbusStateConnected))
1233 				stcheck = STBUG;
1234 			else if (oestate == XenbusStateInitialised)
1235 				stcheck = STNOP;
1236 		}
1237 		break;
1238 	case XDB_CONNECTED:
1239 		if ((oestate == XenbusStateUnknown)		||
1240 		    (oestate == XenbusStateInitialising)	||
1241 		    (oestate == XenbusStateInitWait)		||
1242 		    (oestate == XenbusStateInitialised))
1243 			stcheck = STBUG;
1244 		else if (oestate == XenbusStateConnected)
1245 			stcheck = STNOP;
1246 		break;
1247 	case XDB_DISCONNECTED:
1248 	default:
1249 			stcheck = STBUG;
1250 	}
1251 
1252 	if (stcheck == STOK)
1253 		return (DDI_SUCCESS);
1254 
1255 	if (stcheck == STBUG)
1256 		cmn_err(CE_NOTE, "xdb@%s: unexpected otherend "
1257 		    "state change to %d!, when status is %d",
1258 		    ddi_get_name_addr(vdp->xs_dip), oestate, status);
1259 
1260 	return (DDI_FAILURE);
1261 }
1262 
1263 static void
1264 xdb_send_buf(void *arg)
1265 {
1266 	buf_t *bp;
1267 	xdb_t *vdp = (xdb_t *)arg;
1268 
1269 	mutex_enter(&vdp->xs_iomutex);
1270 
1271 	while (vdp->xs_if_status != XDB_DISCONNECTED) {
1272 		while ((bp = vdp->xs_f_iobuf) != NULL) {
1273 			vdp->xs_f_iobuf = bp->av_forw;
1274 			bp->av_forw = NULL;
1275 			vdp->xs_ionum++;
1276 			mutex_exit(&vdp->xs_iomutex);
1277 			if (bp->b_bcount != 0) {
1278 				int err = ldi_strategy(vdp->xs_ldi_hdl, bp);
1279 				if (err != 0) {
1280 					bp->b_flags |= B_ERROR;
1281 					(void) xdb_biodone(bp);
1282 					XDB_DBPRINT(XDB_DBG_IO, (CE_WARN,
1283 					    "xdb@%s: sent buf to backend dev"
1284 					    "failed, err=%d",
1285 					    ddi_get_name_addr(vdp->xs_dip),
1286 					    err));
1287 				} else {
1288 					XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
1289 					    "sent buf to backend ok"));
1290 				}
1291 			} else /* no I/O need to be done */
1292 				(void) xdb_biodone(bp);
1293 
1294 			mutex_enter(&vdp->xs_iomutex);
1295 		}
1296 
1297 		if (vdp->xs_if_status != XDB_DISCONNECTED)
1298 			cv_wait(&vdp->xs_iocv, &vdp->xs_iomutex);
1299 	}
1300 
1301 	mutex_exit(&vdp->xs_iomutex);
1302 }
1303 
1304 /*ARGSUSED*/
1305 static void
1306 xdb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1307     void *impl_data)
1308 {
1309 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1310 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1311 
1312 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1313 	    "hotplug status change to %d!", ddi_get_name_addr(dip), state));
1314 
1315 	mutex_enter(&vdp->xs_cbmutex);
1316 	if (state == Connected) {
1317 		/* Hotplug script has completed successfully */
1318 		if (vdp->xs_dev_status == XDB_DEV_UNKNOWN) {
1319 			vdp->xs_dev_status = XDB_DEV_READY;
1320 			if (vdp->xs_fe_status == XDB_FE_READY)
1321 				/* try to connect to frontend */
1322 				if (xdb_start_connect(vdp) != DDI_SUCCESS)
1323 					(void) xdb_start_disconnect(vdp);
1324 		}
1325 	}
1326 	mutex_exit(&vdp->xs_cbmutex);
1327 }
1328 
1329 /*ARGSUSED*/
1330 static void
1331 xdb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1332     void *impl_data)
1333 {
1334 	XenbusState new_state = *(XenbusState *)impl_data;
1335 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1336 
1337 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1338 	    "otherend state change to %d!", ddi_get_name_addr(dip), new_state));
1339 
1340 	mutex_enter(&vdp->xs_cbmutex);
1341 
1342 	if (xdb_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1343 		mutex_exit(&vdp->xs_cbmutex);
1344 		return;
1345 	}
1346 
1347 	switch (new_state) {
1348 	case XenbusStateInitialised:
1349 		ASSERT(vdp->xs_if_status == XDB_UNKNOWN);
1350 
1351 		/* frontend is ready for connecting */
1352 		vdp->xs_fe_status = XDB_FE_READY;
1353 
1354 		if (vdp->xs_dev_status == XDB_DEV_READY)
1355 			if (xdb_start_connect(vdp) != DDI_SUCCESS)
1356 				(void) xdb_start_disconnect(vdp);
1357 		break;
1358 	case XenbusStateClosing:
1359 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1360 		break;
1361 	case XenbusStateClosed:
1362 		/* clean up */
1363 		xdb_close(dip);
1364 	}
1365 
1366 	mutex_exit(&vdp->xs_cbmutex);
1367 }
1368 
/*
 * attach(9E) entry point.
 *
 * Allocates per-instance soft state, initializes locks/CVs and the
 * kstat, starts the xdb_send_buf I/O taskq thread, registers xenbus
 * event handlers and kicks off the hotplug script.  DDI_RESUME is not
 * supported.  On failure, resources are released in reverse order via
 * the errout labels.
 */
static int
xdb_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	xdb_t *vdp;
	ddi_iblock_cookie_t ibc;
	int instance;

	switch (cmd) {
	case DDI_RESUME:
		return (DDI_FAILURE);
	case DDI_ATTACH:
		break;
	default:
		return (DDI_FAILURE);
	}

	/* DDI_ATTACH */
	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(xdb_statep, instance) != DDI_SUCCESS)
		return (DDI_FAILURE);

	vdp = ddi_get_soft_state(xdb_statep, instance);
	vdp->xs_dip = dip;
	/* interrupt block cookie so the mutexes can be used from intr */
	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
		goto errout1;

	if (!xdb_kstat_init(vdp))
		goto errout1;

	mutex_init(&vdp->xs_iomutex, NULL, MUTEX_DRIVER, (void *)ibc);
	mutex_init(&vdp->xs_cbmutex, NULL, MUTEX_DRIVER, (void *)ibc);
	cv_init(&vdp->xs_iocv, NULL, CV_DRIVER, NULL);
	cv_init(&vdp->xs_ionumcv, NULL, CV_DRIVER, NULL);

	ddi_set_driver_private(dip, vdp);

	/* single-threaded taskq running the xdb_send_buf dispatch loop */
	vdp->xs_iotaskq = ddi_taskq_create(dip, "xdb_iotask", 1,
	    TASKQ_DEFAULTPRI, 0);
	if (vdp->xs_iotaskq == NULL)
		goto errout2;
	(void) ddi_taskq_dispatch(vdp->xs_iotaskq, xdb_send_buf, vdp,
	    DDI_SLEEP);

	/* Watch frontend and hotplug state change */
	if (xvdi_add_event_handler(dip, XS_OE_STATE, xdb_oe_state_change) !=
	    DDI_SUCCESS)
		goto errout3;
	if (xvdi_add_event_handler(dip, XS_HP_STATE, xdb_hp_state_change) !=
	    DDI_SUCCESS) {
		goto errout4;
	}

	/*
	 * Kick-off hotplug script
	 */
	if (xvdi_post_event(dip, XEN_HP_ADD) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "xdb@%s: failed to start hotplug script",
		    ddi_get_name_addr(dip));
		goto errout4;
	}

	/*
	 * start waiting for hotplug event and otherend state event
	 * mainly for debugging, frontend will not take any op seeing this
	 */
	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: attached!",
	    ddi_get_name_addr(dip)));
	return (DDI_SUCCESS);

errout4:
	xvdi_remove_event_handler(dip, NULL);
errout3:
	/* tell the taskq thread to exit before destroying the taskq */
	mutex_enter(&vdp->xs_cbmutex);
	mutex_enter(&vdp->xs_iomutex);
	vdp->xs_if_status = XDB_DISCONNECTED;
	cv_broadcast(&vdp->xs_iocv);
	mutex_exit(&vdp->xs_iomutex);
	mutex_exit(&vdp->xs_cbmutex);
	ddi_taskq_destroy(vdp->xs_iotaskq);
errout2:
	ddi_set_driver_private(dip, NULL);
	cv_destroy(&vdp->xs_iocv);
	cv_destroy(&vdp->xs_ionumcv);
	mutex_destroy(&vdp->xs_cbmutex);
	mutex_destroy(&vdp->xs_iomutex);
	kstat_delete(vdp->xs_kstats);
errout1:
	ddi_soft_state_free(xdb_statep, instance);
	return (DDI_FAILURE);
}
1461 
/*
 * detach(9E) entry point.
 *
 * Refuses to detach while the interface is still connected to a
 * frontend; otherwise tears down event handlers, runs the hotplug
 * remove script, and frees all per-instance resources (reverse of
 * xdb_attach).  DDI_SUSPEND is not supported.
 */
/*ARGSUSED*/
static int
xdb_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance = ddi_get_instance(dip);
	xdb_t *vdp = XDB_INST2SOFTS(instance);

	switch (cmd) {
	case DDI_SUSPEND:
		return (DDI_FAILURE);
	case DDI_DETACH:
		break;
	default:
		return (DDI_FAILURE);
	}

	/* DDI_DETACH handling */

	/* shouldn't detach, if still used by frontend */
	mutex_enter(&vdp->xs_iomutex);
	if (vdp->xs_if_status != XDB_DISCONNECTED) {
		mutex_exit(&vdp->xs_iomutex);
		return (DDI_FAILURE);
	}
	mutex_exit(&vdp->xs_iomutex);

	xvdi_remove_event_handler(dip, NULL);
	/* can do nothing about it, if it fails */
	(void) xvdi_post_event(dip, XEN_HP_REMOVE);

	/* taskq thread has already exited since xs_if_status is DISCONNECTED */
	ddi_taskq_destroy(vdp->xs_iotaskq);
	cv_destroy(&vdp->xs_iocv);
	cv_destroy(&vdp->xs_ionumcv);
	mutex_destroy(&vdp->xs_cbmutex);
	mutex_destroy(&vdp->xs_iomutex);
	kstat_delete(vdp->xs_kstats);
	ddi_set_driver_private(dip, NULL);
	ddi_soft_state_free(xdb_statep, instance);

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: detached!",
	    ddi_get_name_addr(dip)));
	return (DDI_SUCCESS);
}
1505 
/*
 * Device operations vector.  xdb creates no minor nodes, so there are
 * no cb_ops; only attach/detach are meaningful.
 */
static struct dev_ops xdb_dev_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	ddi_getinfo_1to1, /* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	xdb_attach,	/* devo_attach */
	xdb_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	NULL,		/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	NULL		/* power */
};
1519 
1520 /*
1521  * Module linkage information for the kernel.
1522  */
1523 static struct modldrv modldrv = {
1524 	&mod_driverops,			/* Type of module. */
1525 	"vbd backend driver 1.4",	/* Name of the module */
1526 	&xdb_dev_ops			/* driver ops */
1527 };
1528 
1529 static struct modlinkage xdb_modlinkage = {
1530 	MODREV_1,
1531 	&modldrv,
1532 	NULL
1533 };
1534 
1535 int
1536 _init(void)
1537 {
1538 	int rv;
1539 
1540 	if ((rv = ddi_soft_state_init((void **)&xdb_statep,
1541 	    sizeof (xdb_t), 0)) == 0)
1542 		if ((rv = mod_install(&xdb_modlinkage)) != 0)
1543 			ddi_soft_state_fini((void **)&xdb_statep);
1544 	return (rv);
1545 }
1546 
1547 int
1548 _fini(void)
1549 {
1550 	int rv;
1551 
1552 	if ((rv = mod_remove(&xdb_modlinkage)) != 0)
1553 		return (rv);
1554 	ddi_soft_state_fini((void **)&xdb_statep);
1555 	return (rv);
1556 }
1557 
/*
 * _info(9E): report module information.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&xdb_modlinkage, modinfop));
}
1563 
1564 static int
1565 xdb_get_request(xdb_t *vdp, blkif_request_t *req)
1566 {
1567 	void *src = xvdi_ring_get_request(vdp->xs_ring);
1568 
1569 	if (src == NULL)
1570 		return (0);
1571 
1572 	switch (vdp->xs_blk_protocol) {
1573 	case BLKIF_PROTOCOL_NATIVE:
1574 		(void) memcpy(req, src, sizeof (*req));
1575 		break;
1576 	case BLKIF_PROTOCOL_X86_32:
1577 		blkif_get_x86_32_req(req, src);
1578 		break;
1579 	case BLKIF_PROTOCOL_X86_64:
1580 		blkif_get_x86_64_req(req, src);
1581 		break;
1582 	default:
1583 		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
1584 		    ddi_get_name_addr(vdp->xs_dip),
1585 		    vdp->xs_blk_protocol);
1586 	}
1587 	return (1);
1588 }
1589 
/*
 * Write a response onto the shared ring in the frontend's ABI and push
 * it.  A zero status is reported as BLKIF_RSP_OKAY, anything else as
 * BLKIF_RSP_ERROR.  All stores go through the ring's DDI access handle.
 * Returns the result of xvdi_ring_push_response() (presumably nonzero
 * when the frontend should be notified -- confirm against xvdi).
 */
static int
xdb_push_response(xdb_t *vdp, uint64_t id, uint8_t op, uint16_t status)
{
	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
	blkif_response_t *rsp = xvdi_ring_get_response(vdp->xs_ring);
	/* the three ABI layouts alias the same response slot */
	blkif_x86_32_response_t *rsp_32 = (blkif_x86_32_response_t *)rsp;
	blkif_x86_64_response_t *rsp_64 = (blkif_x86_64_response_t *)rsp;

	ASSERT(rsp);

	switch (vdp->xs_blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		ddi_put64(acchdl, &rsp->id, id);
		ddi_put8(acchdl, &rsp->operation, op);
		ddi_put16(acchdl, (uint16_t *)&rsp->status,
		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
		break;
	case BLKIF_PROTOCOL_X86_32:
		ddi_put64(acchdl, &rsp_32->id, id);
		ddi_put8(acchdl, &rsp_32->operation, op);
		ddi_put16(acchdl, (uint16_t *)&rsp_32->status,
		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
		break;
	case BLKIF_PROTOCOL_X86_64:
		ddi_put64(acchdl, &rsp_64->id, id);
		ddi_put8(acchdl, &rsp_64->operation, op);
		ddi_put16(acchdl, (uint16_t *)&rsp_64->status,
		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
		break;
	default:
		/* protocol was validated at bind time; this is fatal */
		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
		    ddi_get_name_addr(vdp->xs_dip),
		    vdp->xs_blk_protocol);
	}

	return (xvdi_ring_push_response(vdp->xs_ring));
}
1627 
1628 static void
1629 blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
1630 {
1631 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1632 	dst->operation = src->operation;
1633 	dst->nr_segments = src->nr_segments;
1634 	dst->handle = src->handle;
1635 	dst->id = src->id;
1636 	dst->sector_number = src->sector_number;
1637 	if (n > src->nr_segments)
1638 		n = src->nr_segments;
1639 	for (i = 0; i < n; i++)
1640 		dst->seg[i] = src->seg[i];
1641 }
1642 
1643 static void
1644 blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
1645 {
1646 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1647 	dst->operation = src->operation;
1648 	dst->nr_segments = src->nr_segments;
1649 	dst->handle = src->handle;
1650 	dst->id = src->id;
1651 	dst->sector_number = src->sector_number;
1652 	if (n > src->nr_segments)
1653 		n = src->nr_segments;
1654 	for (i = 0; i < n; i++)
1655 		dst->seg[i] = src->seg[i];
1656 }
1657