/*-
 * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/dev/virtio/block/virtio_blk.c 252707 2013-07-04 17:57:26Z bryanv $
 */

/* Driver for VirtIO block devices. */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/sysctl.h>
#include <sys/queue.h>
#include <sys/serialize.h>
#include <sys/buf2.h>
#include <sys/rman.h>
#include <sys/disk.h>
#include <sys/devicestat.h>

#include <dev/virtual/virtio/virtio/virtio.h>
#include <dev/virtual/virtio/virtio/virtqueue.h>
#include "virtio_blk.h"
#include "virtio_if.h"

struct vtblk_request {
	struct virtio_blk_outhdr	 vbr_hdr __aligned(16);
	struct bio			*vbr_bio;
	uint8_t				 vbr_ack;

	SLIST_ENTRY(vtblk_request)	 vbr_link;
};

enum vtblk_cache_mode {
	VTBLK_CACHE_WRITETHROUGH,
	VTBLK_CACHE_WRITEBACK,
	VTBLK_CACHE_MAX
};

struct vtblk_softc {
	device_t		 vtblk_dev;
	struct lwkt_serialize	 vtblk_slz;
	uint64_t		 vtblk_features;
	uint32_t		 vtblk_flags;
#define VTBLK_FLAG_INDIRECT	0x0001
#define VTBLK_FLAG_READONLY	0x0002
#define VTBLK_FLAG_DETACH	0x0004
#define VTBLK_FLAG_SUSPEND	0x0008
#define VTBLK_FLAG_DUMPING	0x0010
#define VTBLK_FLAG_WC_CONFIG	0x0020

	struct virtqueue	*vtblk_vq;
	struct sglist		*vtblk_sglist;
	struct disk		 vtblk_disk;
	cdev_t			 cdev;
	struct devstat		 stats;

	struct bio_queue_head	 vtblk_bioq;
	SLIST_HEAD(, vtblk_request)
				 vtblk_req_free;

	int			 vtblk_sector_size;
	int			 vtblk_max_nsegs;
	int			 vtblk_request_count;
	enum vtblk_cache_mode	 vtblk_write_cache;

	struct vtblk_request	 vtblk_dump_request;
};

static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_WCE,		"WriteCache"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},

	{ 0, NULL }
};

static int	vtblk_probe(device_t);
static int	vtblk_attach(device_t);
static int	vtblk_detach(device_t);
static int	vtblk_suspend(device_t);
static int	vtblk_resume(device_t);
static int	vtblk_shutdown(device_t);

static void	vtblk_negotiate_features(struct vtblk_softc *);
static int	vtblk_maximum_segments(struct vtblk_softc *,
		    struct virtio_blk_config *);
static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
static void	vtblk_set_write_cache(struct vtblk_softc *, int);
static int	vtblk_write_cache_enabled(struct vtblk_softc *sc,
		    struct virtio_blk_config *);
static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
static void	vtblk_alloc_disk(struct vtblk_softc *,
		    struct virtio_blk_config *);

/*
 * Interface to the device switch.
 */
static d_open_t		vtblk_open;
static d_strategy_t	vtblk_strategy;
static d_dump_t		vtblk_dump;

static struct dev_ops vbd_disk_ops = {
	{ "vbd", 200, D_DISK | D_MPSAFE },
	.d_open		= vtblk_open,
	.d_close	= nullclose,
	.d_read		= physread,
	.d_write	= physwrite,
	.d_strategy	= vtblk_strategy,
	.d_dump		= vtblk_dump,
};

static void	vtblk_startio(struct vtblk_softc *);
static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
static int	vtblk_execute_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_vq_intr(void *);
static void	vtblk_complete(void *);

static void	vtblk_stop(struct vtblk_softc *);

static void	vtblk_prepare_dump(struct vtblk_softc *);
static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
static int	vtblk_flush_dump(struct vtblk_softc *);
static int	vtblk_poll_request(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_drain_vq(struct vtblk_softc *, int);
static void	vtblk_drain(struct vtblk_softc *);

static int	vtblk_alloc_requests(struct vtblk_softc *);
static void	vtblk_free_requests(struct vtblk_softc *);
static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
static void	vtblk_enqueue_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_request_error(struct vtblk_request *);
static void	vtblk_finish_bio(struct bio *, int);

static void	vtblk_setup_sysctl(struct vtblk_softc *);
static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);

/* Tunables. */
static int vtblk_writecache_mode = -1;
TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);

/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_WCE			| \
     VIRTIO_BLK_F_CONFIG_WCE		| \
     VIRTIO_RING_F_INDIRECT_DESC)

/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.
 */
#define VTBLK_MIN_SEGMENTS	2

static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	DEVMETHOD_END
};

static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass, NULL, NULL);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);

static int
vtblk_probe(device_t dev)
{

	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
		return (ENXIO);

	device_set_desc(dev, "VirtIO Block Adapter");

	return (BUS_PROBE_DEFAULT);
}

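/*
 * Attach the device: negotiate features, size the request segment list
 * from the device config, allocate the sglist, virtqueue and request
 * pool, wire up the interrupt, and publish the disk. Any failure along
 * the way unwinds through vtblk_detach().
 */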
static int
vtblk_attach(device_t dev)
{
	struct vtblk_softc *sc;
	struct virtio_blk_config blkcfg;
	int error;

	sc = device_get_softc(dev);
	sc->vtblk_dev = dev;

	lwkt_serialize_init(&sc->vtblk_slz);

	bioq_init(&sc->vtblk_bioq);
	SLIST_INIT(&sc->vtblk_req_free);

	virtio_set_feature_desc(dev, vtblk_feature_desc);
	vtblk_negotiate_features(sc);

	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
		sc->vtblk_flags |= VTBLK_FLAG_READONLY;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
		sc->vtblk_flags |= VTBLK_FLAG_WC_CONFIG;

	vtblk_setup_sysctl(sc);

	/* Get local copy of config. */
	virtio_read_device_config(dev, 0, &blkcfg,
				  sizeof(struct virtio_blk_config));

	/*
	 * With the current sglist(9) implementation, it is not easy
	 * for us to support a maximum segment size as adjacent
	 * segments are coalesced. For now, just make sure it's larger
	 * than the maximum supported transfer size.
	 */
	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
		if (blkcfg.size_max < MAXPHYS) {
			error = ENOTSUP;
			device_printf(dev, "host requires unsupported "
			    "maximum segment size feature\n");
			goto fail;
		}
	}

	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
		error = EINVAL;
		device_printf(dev, "fewer than minimum number of segments "
		    "allowed: %d\n", sc->vtblk_max_nsegs);
		goto fail;
	}

	/*
	 * Allocate working sglist. The number of segments may be too
	 * large to safely store on the stack.
	 */
	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_INTWAIT);
	if (sc->vtblk_sglist == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate sglist\n");
		goto fail;
	}

	error = vtblk_alloc_virtqueue(sc);
	if (error) {
		device_printf(dev, "cannot allocate virtqueue\n");
		goto fail;
	}

	error = vtblk_alloc_requests(sc);
	if (error) {
		device_printf(dev, "cannot preallocate requests\n");
		goto fail;
	}

	error = virtio_setup_intr(dev, &sc->vtblk_slz);
	if (error) {
		device_printf(dev, "cannot setup virtqueue interrupt\n");
		goto fail;
	}

	virtqueue_enable_intr(sc->vtblk_vq);

	vtblk_alloc_disk(sc, &blkcfg);

fail:
	if (error)
		vtblk_detach(dev);

	return (error);
}

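/*
 * Detach the device: flag the driver as going away, stop the device,
 * complete all pending bios with ENXIO, and release the disk, request
 * pool, and sglist.
 */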
static int
vtblk_detach(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
	if (device_is_attached(dev))
		vtblk_stop(sc);
	lwkt_serialize_exit(&sc->vtblk_slz);

	vtblk_drain(sc);

	if (sc->cdev != NULL) {
		disk_destroy(&sc->vtblk_disk);
		sc->cdev = NULL;
	}

	if (sc->vtblk_sglist != NULL) {
		sglist_free(sc->vtblk_sglist);
		sc->vtblk_sglist = NULL;
	}

	return (0);
}

static int
vtblk_suspend(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
	/* XXX BMV: virtio_stop(), etc needed here? */
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static int
vtblk_resume(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	/* XXX BMV: virtio_reinit(), etc needed here? */
	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
#if 0 /* XXX Resume IO? */
	vtblk_startio(sc);
#endif
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static int
vtblk_shutdown(device_t dev)
{

	return (0);
}

static int
vtblk_open(struct dev_open_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;

	sc = dev->si_drv1;
	if (sc == NULL)
		return (ENXIO);

	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
}

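/*
 * Kernel crash dump entry point. The first call quiesces the device and
 * switches it to polled operation; subsequent calls write out dump
 * pages, and a zero-length call flushes the device's write cache.
 */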
static int
vtblk_dump(struct dev_dump_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;
	uint64_t buf_start, buf_len;
	int error;

	sc = dev->si_drv1;
	if (sc == NULL)
		return (ENXIO);

	buf_start = ap->a_offset;
	buf_len = ap->a_length;

//	lwkt_serialize_enter(&sc->vtblk_slz);

	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
		vtblk_prepare_dump(sc);
		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
	}

	if (buf_len > 0)
		error = vtblk_write_dump(sc, ap->a_virtual, buf_start,
		    buf_len);
	else if (buf_len == 0)
		error = vtblk_flush_dump(sc);
	else {
		error = EINVAL;
		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
	}

//	lwkt_serialize_exit(&sc->vtblk_slz);

	return (error);
}

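/*
 * Device strategy routine: sort the incoming bio onto our queue and
 * start I/O, unless the device is detaching or is read-only and the
 * request would modify it.
 */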
static int
vtblk_strategy(struct dev_strategy_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;
	struct bio *bio = ap->a_bio;
	struct buf *bp = bio->bio_buf;

	sc = dev->si_drv1;
	if (sc == NULL) {
		vtblk_finish_bio(bio, EINVAL);
		return EINVAL;
	}

	/*
	 * Fail any write or flush if the device is read-only. There
	 * does not seem to be a better way to report our read-only
	 * status to the disk layer above.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) &&
	    (bp->b_cmd == BUF_CMD_WRITE || bp->b_cmd == BUF_CMD_FLUSH)) {
		vtblk_finish_bio(bio, EROFS);
		return (EINVAL);
	}

	lwkt_serialize_enter(&sc->vtblk_slz);
	if ((sc->vtblk_flags & VTBLK_FLAG_DETACH) == 0) {
		bioqdisksort(&sc->vtblk_bioq, bio);
		vtblk_startio(sc);
		lwkt_serialize_exit(&sc->vtblk_slz);
	} else {
		lwkt_serialize_exit(&sc->vtblk_slz);
		vtblk_finish_bio(bio, ENXIO);
	}
	return 0;
}

static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
	device_t dev;
	uint64_t features;

	dev = sc->vtblk_dev;
	features = VTBLK_FEATURES;

	sc->vtblk_features = virtio_negotiate_features(dev, features);
}

/*
 * Calculate the maximum number of DMA segments supported.  Note
 * that the in/out header is encoded in the segment list.  We
 * assume that VTBLK_MIN_SEGMENTS covers that part of it, so
 * we add it into the desired total.  If the SEG_MAX feature
 * is not specified, we have to assume the host can handle the
 * maximum number of segments required for a MAXPHYS-sized
 * request.
 *
 * The additional + 1 is in case a MAXPHYS-sized buffer crosses
 * a page boundary.
 */
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	device_t dev;
	int nsegs;

	dev = sc->vtblk_dev;
	nsegs = VTBLK_MIN_SEGMENTS;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
		nsegs = MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1 + nsegs);
	} else {
		nsegs = MAXPHYS / PAGE_SIZE + 1 + nsegs;
	}
	if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
		nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);

	return (nsegs);
}

static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
	device_t dev;
	struct vq_alloc_info vq_info;

	dev = sc->vtblk_dev;

	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
	    vtblk_vq_intr, sc, &sc->vtblk_vq,
	    "%s request", device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

static void
vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
{

	/* Set either writeback (1) or writethrough (0) mode. */
	virtio_write_dev_config_1(sc->vtblk_dev,
	    offsetof(struct virtio_blk_config, writeback), wc);
}

static int
vtblk_write_cache_enabled(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	int wc;

	if (sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) {
		wc = vtblk_tunable_int(sc, "writecache_mode",
		    vtblk_writecache_mode);
		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
			vtblk_set_write_cache(sc, wc);
		else
			wc = blkcfg->writeback;
	} else
		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_WCE);

	return (wc);
}

static int
vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct vtblk_softc *sc;
	int wc, error;

	sc = oidp->oid_arg1;
	wc = sc->vtblk_write_cache;

	error = sysctl_handle_int(oidp, &wc, 0, req);
	if (error || req->newptr == NULL)
		return (error);
	if ((sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) == 0)
		return (EPERM);
	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
		return (EINVAL);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_write_cache = wc;
	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static void
vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
	struct disk_info info;

	/* Construct the disk_info. */
	bzero(&info, sizeof(info));

	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_BLK_SIZE))
		sc->vtblk_sector_size = blkcfg->blk_size;
	else
		sc->vtblk_sector_size = DEV_BSIZE;

	info.d_media_blksize = sc->vtblk_sector_size;
	info.d_media_blocks = blkcfg->capacity;

	info.d_ncylinders = blkcfg->geometry.cylinders;
	info.d_nheads = blkcfg->geometry.heads;
	info.d_secpertrack = blkcfg->geometry.sectors;
	info.d_secpercyl = info.d_secpertrack * info.d_nheads;

	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
	else
		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;

	devstat_add_entry(&sc->stats, "vbd", device_get_unit(sc->vtblk_dev),
			  DEV_BSIZE, DEVSTAT_ALL_SUPPORTED,
			  DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
			  DEVSTAT_PRIORITY_DISK);

	/* Attach a generic disk device to ourselves. */
	sc->cdev = disk_create(device_get_unit(sc->vtblk_dev), &sc->vtblk_disk,
			       &vbd_disk_ops);

	sc->cdev->si_drv1 = sc;
	sc->cdev->si_iosize_max = MAXPHYS;
	disk_setdiskinfo(&sc->vtblk_disk, &info);
}

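/*
 * Dispatch queued bios until the virtqueue fills or the bio queue
 * empties, then notify the host once for the whole batch. The caller
 * must hold the serializer.
 */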
static void
vtblk_startio(struct vtblk_softc *sc)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int enq;

	vq = sc->vtblk_vq;
	enq = 0;

	ASSERT_SERIALIZED(&sc->vtblk_slz);

	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
		return;

	while (!virtqueue_full(vq)) {
		req = vtblk_bio_request(sc);
		if (req == NULL)
			break;

		if (vtblk_execute_request(sc, req) != 0) {
			bioqdisksort(&sc->vtblk_bioq, req->vbr_bio);
			vtblk_enqueue_request(sc, req);
			break;
		}
		devstat_start_transaction(&sc->stats);

		enq++;
	}

	if (enq > 0)
		virtqueue_notify(vq, &sc->vtblk_slz);
}

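/*
 * Pair the next queued bio with a free request structure and translate
 * the buffer command into a virtio block header (request type, priority,
 * and starting sector). Returns NULL if either queue is empty.
 */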
static struct vtblk_request *
vtblk_bio_request(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bio;
	struct buf *bp;

	bioq = &sc->vtblk_bioq;

	if (bioq_first(bioq) == NULL)
		return (NULL);

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return (NULL);

	bio = bioq_takefirst(bioq);
	req->vbr_bio = bio;
	req->vbr_ack = -1;
	req->vbr_hdr.ioprio = 1;
	bp = bio->bio_buf;

	switch (bp->b_cmd) {
	case BUF_CMD_FLUSH:
		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
		break;
	case BUF_CMD_READ:
		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
		break;
	case BUF_CMD_WRITE:
		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
		break;
	default:
		KASSERT(0, ("bio with unhandled cmd: %d", bp->b_cmd));
		req->vbr_hdr.type = -1;
		break;
	}

	return (req);
}

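/*
 * Build the scatter/gather list for a request in the layout the virtio
 * block device expects: the driver-written header first, then the data
 * buffer (device-writable for reads), and the device-writable ack byte
 * last. The request is then enqueued on the virtqueue.
 */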
static int
vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct sglist *sg;
	struct bio *bio;
	struct buf *bp;
	int writable, error;

	sg = sc->vtblk_sglist;
	bio = req->vbr_bio;
	bp = bio->bio_buf;
	writable = 0;

	/*
	 * The sglist is live throughout this subroutine.
	 */
	sglist_reset(sg);

	error = sglist_append(sg, &req->vbr_hdr,
			      sizeof(struct virtio_blk_outhdr));
	KASSERT(error == 0, ("error adding header to sglist"));
	KASSERT(sg->sg_nseg == 1,
	    ("header spanned multiple segments: %d", sg->sg_nseg));

	if (bp->b_cmd == BUF_CMD_READ || bp->b_cmd == BUF_CMD_WRITE) {
		error = sglist_append(sg, bp->b_data, bp->b_bcount);
		KASSERT(error == 0, ("error adding buffer to sglist"));

		/* BUF_CMD_READ means the host writes into our buffer. */
		if (bp->b_cmd == BUF_CMD_READ)
			writable += sg->sg_nseg - 1;
	}

	error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
	KASSERT(error == 0, ("error adding ack to sglist"));
	writable++;

	KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
	    ("fewer than min segments: %d", sg->sg_nseg));

	error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
				  sg->sg_nseg - writable, writable);

	sglist_reset(sg);

	return (error);
}

static int
vtblk_vq_intr(void *xsc)
{
	vtblk_complete(xsc);

	return (1);
}

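/*
 * Completion processing, normally invoked from the virtqueue interrupt.
 * The interrupt handler is kept disabled while we run so the serializer
 * can be dropped around biodone() without another completion racing in.
 */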
static void
vtblk_complete(void *arg)
{
	struct vtblk_softc *sc;
	struct vtblk_request *req;
	struct virtqueue *vq;
	struct bio *bio;
	struct buf *bp;

	sc = arg;
	vq = sc->vtblk_vq;

	lwkt_serialize_handler_disable(&sc->vtblk_slz);
	virtqueue_disable_intr(sc->vtblk_vq);
	ASSERT_SERIALIZED(&sc->vtblk_slz);

retry:
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
		return;

	while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
		bio = req->vbr_bio;
		bp = bio->bio_buf;

		if (req->vbr_ack == VIRTIO_BLK_S_OK) {
			bp->b_resid = 0;
		} else {
			bp->b_flags |= B_ERROR;
			if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP)
				bp->b_error = ENOTSUP;
			else
				bp->b_error = EIO;
		}

		devstat_end_transaction_buf(&sc->stats, bio->bio_buf);

		lwkt_serialize_exit(&sc->vtblk_slz);
		/*
		 * Dropping the serializer around biodone() cannot allow
		 * further completion processing here: the virtqueue
		 * interrupt handler was disabled above. It does allow
		 * concurrent vtblk_strategy()/vtblk_startio() command
		 * dispatches.
		 */
		biodone(bio);
		lwkt_serialize_enter(&sc->vtblk_slz);

		vtblk_enqueue_request(sc, req);
	}

	vtblk_startio(sc);

	if (virtqueue_enable_intr(vq) != 0) {
		/*
		 * If new virtqueue entries appeared immediately after
		 * enabling interrupts, process them now. The serializer
		 * is released and retaken around each biodone() above
		 * to avoid blocking I/O dispatch for too long.
		 */
		virtqueue_disable_intr(vq);
		goto retry;
	}
	lwkt_serialize_handler_enable(&sc->vtblk_slz);
}

static void
vtblk_stop(struct vtblk_softc *sc)
{

	virtqueue_disable_intr(sc->vtblk_vq);
	virtio_stop(sc->vtblk_dev);
}

static void
vtblk_prepare_dump(struct vtblk_softc *sc)
{
	device_t dev;
	struct virtqueue *vq;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	vtblk_stop(sc);

	/*
	 * Drain all requests caught in-flight in the virtqueue,
	 * skipping biodone(). When dumping, only one request is
	 * outstanding at a time, and we just poll the virtqueue
	 * for the response.
	 */
	vtblk_drain_vq(sc, 1);

	if (virtio_reinit(dev, sc->vtblk_features) != 0) {
		panic("%s: cannot reinit VirtIO block device during dump",
		    device_get_nameunit(dev));
	}

	virtqueue_disable_intr(vq);
	virtio_reinit_complete(dev);
}

static int
vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
    size_t length)
{
	struct bio bio;
	struct buf bp;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = offset / 512;

	req->vbr_bio = &bio;
	bzero(&bio, sizeof(struct bio));
	bzero(&bp, sizeof(struct buf));

	bio.bio_buf = &bp;
	bp.b_cmd = BUF_CMD_WRITE;
	bp.b_data = virtual;
	bp.b_bcount = length;

	return (vtblk_poll_request(sc, req));
}

static int
vtblk_flush_dump(struct vtblk_softc *sc)
{
	struct bio bio;
	struct buf bp;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bio = &bio;
	bzero(&bio, sizeof(struct bio));
	bzero(&bp, sizeof(struct buf));

	bio.bio_buf = &bp;
	bp.b_cmd = BUF_CMD_FLUSH;

	return (vtblk_poll_request(sc, req));
}

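/*
 * Execute a single request synchronously by spinning on the virtqueue
 * rather than waiting for an interrupt. Only used while dumping, when
 * exactly one request is outstanding at a time.
 */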
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct virtqueue *vq;
	int error;

	vq = sc->vtblk_vq;

	if (!virtqueue_empty(vq))
		return (EBUSY);

	error = vtblk_execute_request(sc, req);
	if (error)
		return (error);

	virtqueue_notify(vq, NULL);
	virtqueue_poll(vq, NULL);

	error = vtblk_request_error(req);
	if (error && bootverbose) {
		device_printf(sc->vtblk_dev,
		    "%s: IO error: %d\n", __func__, error);
	}

	return (error);
}

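/*
 * Pull every in-flight request back off the virtqueue and return it to
 * the free list, completing the attached bios with ENXIO unless asked
 * to skip them (as the dump path does).
 */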
static void
vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int last;

	vq = sc->vtblk_vq;
	last = 0;

	while ((req = virtqueue_drain(vq, &last)) != NULL) {
		if (!skip_done)
			vtblk_finish_bio(req->vbr_bio, ENXIO);

		vtblk_enqueue_request(sc, req);
	}

	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}

static void
vtblk_drain(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct bio *bio;

	bioq = &sc->vtblk_bioq;

	if (sc->vtblk_vq != NULL)
		vtblk_drain_vq(sc, 0);

	while (bioq_first(bioq) != NULL) {
		bio = bioq_takefirst(bioq);
		vtblk_finish_bio(bio, ENXIO);
	}

	vtblk_free_requests(sc);
}

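/*
 * Preallocate the request pool. Requests are allocated with 16-byte
 * alignment so that the header and the ack byte each fit in a single
 * sglist segment, which the assertions below verify.
 */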
static int
vtblk_alloc_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	int i, nreqs;

	nreqs = virtqueue_size(sc->vtblk_vq);

	/*
	 * Preallocate sufficient requests to keep the virtqueue full. Each
	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors, so reduce
	 * the number allocated when indirect descriptors are not available.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
		nreqs /= VTBLK_MIN_SEGMENTS;

	for (i = 0; i < nreqs; i++) {
		req = contigmalloc(sizeof(struct vtblk_request), M_DEVBUF,
		    M_WAITOK, 0, BUS_SPACE_MAXADDR, 16, 0);
		if (req == NULL)
			return (ENOMEM);

		KKASSERT(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr))
		    == 1);
		KKASSERT(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack))
		    == 1);

		sc->vtblk_request_count++;
		vtblk_enqueue_request(sc, req);
	}

	return (0);
}

static void
vtblk_free_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	while ((req = vtblk_dequeue_request(sc)) != NULL) {
		sc->vtblk_request_count--;
		contigfree(req, sizeof(struct vtblk_request), M_DEVBUF);
	}

	KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
}

static struct vtblk_request *
vtblk_dequeue_request(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = SLIST_FIRST(&sc->vtblk_req_free);
	if (req != NULL)
		SLIST_REMOVE_HEAD(&sc->vtblk_req_free, vbr_link);

	return (req);
}

static void
vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
{

	bzero(req, sizeof(struct vtblk_request));
	SLIST_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}

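/*
 * Translate the status byte written by the host into an errno.
 */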
static int
vtblk_request_error(struct vtblk_request *req)
{
	int error;

	switch (req->vbr_ack) {
	case VIRTIO_BLK_S_OK:
		error = 0;
		break;
	case VIRTIO_BLK_S_UNSUPP:
		error = ENOTSUP;
		break;
	default:
		error = EIO;
		break;
	}

	return (error);
}

static void
vtblk_finish_bio(struct bio *bio, int error)
{
	struct buf *bp = bio->bio_buf;

	/* Flag the error on the buffer before completing the bio. */
	if (error != 0) {
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
	}
	biodone(bio);
}

static void
vtblk_setup_sysctl(struct vtblk_softc *sc)
{
	device_t dev;
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid *tree;
	struct sysctl_oid_list *child;

	dev = sc->vtblk_dev;
	ctx = device_get_sysctl_ctx(dev);
	tree = device_get_sysctl_tree(dev);
	child = SYSCTL_CHILDREN(tree);

	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, vtblk_write_cache_sysctl,
	    "I", "Write cache mode (writethrough (0) or writeback (1))");
}

static int
vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
{
	char path[64];

	ksnprintf(path, sizeof(path),
	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
	TUNABLE_INT_FETCH(path, &def);

	return (def);
}