/*-
 * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/dev/virtio/block/virtio_blk.c 252707 2013-07-04 17:57:26Z bryanv $
 */

/* Driver for VirtIO block devices. */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/serialize.h>
#include <sys/buf2.h>
#include <sys/rman.h>
#include <sys/disk.h>
#include <sys/devicestat.h>

#include <dev/virtual/virtio/virtio/virtio.h>
#include <dev/virtual/virtio/virtio/virtqueue.h>
#include "virtio_blk.h"
#include "virtio_if.h"

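/*
 * Each VirtIO block request is framed as three parts in the descriptor
 * chain: the device-readable request header (vbr_hdr), the data buffer
 * (device-readable for writes, device-writable for reads), and the one-byte
 * status (vbr_ack) the host fills in on completion.  The 16-byte header is
 * declared __aligned(16) so it never crosses a page boundary and always
 * maps to a single sglist segment.
 */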
struct vtblk_request {
	struct virtio_blk_outhdr	 vbr_hdr __aligned(16);
	struct bio			*vbr_bp;
	uint8_t				 vbr_ack;
	uint8_t				 vbr_barrier;

	TAILQ_ENTRY(vtblk_request)	 vbr_link;
};

enum vtblk_cache_mode {
	VTBLK_CACHE_WRITETHROUGH,
	VTBLK_CACHE_WRITEBACK,
	VTBLK_CACHE_MAX
};

struct vtblk_softc {
	device_t		 vtblk_dev;
	struct lwkt_serialize	 vtblk_slz;
	uint64_t		 vtblk_features;
	uint32_t		 vtblk_flags;
#define VTBLK_FLAG_INDIRECT	0x0001
#define VTBLK_FLAG_READONLY	0x0002
#define VTBLK_FLAG_DETACH	0x0004
#define VTBLK_FLAG_SUSPEND	0x0008
#define VTBLK_FLAG_DUMPING	0x0010
#define VTBLK_FLAG_BARRIER	0x0020
#define VTBLK_FLAG_WC_CONFIG	0x0040

	struct virtqueue	*vtblk_vq;
	struct sglist		*vtblk_sglist;
	struct disk		 vtblk_disk;
	cdev_t			 cdev;
	struct devstat		 stats;

	struct bio_queue_head	 vtblk_bioq;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_free;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_ready;
	struct vtblk_request	*vtblk_req_ordered;

	int			 vtblk_sector_size;
	int			 vtblk_max_nsegs;
	int			 vtblk_request_count;
	enum vtblk_cache_mode	 vtblk_write_cache;

	struct vtblk_request	 vtblk_dump_request;
};

static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_WCE,		"WriteCache"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},

	{ 0, NULL }
};

static int	vtblk_modevent(module_t, int, void *);

static int	vtblk_probe(device_t);
static int	vtblk_attach(device_t);
static int	vtblk_detach(device_t);
static int	vtblk_suspend(device_t);
static int	vtblk_resume(device_t);
static int	vtblk_shutdown(device_t);

static void	vtblk_negotiate_features(struct vtblk_softc *);
static int	vtblk_maximum_segments(struct vtblk_softc *,
		    struct virtio_blk_config *);
static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
static void	vtblk_set_write_cache(struct vtblk_softc *, int);
static int	vtblk_write_cache_enabled(struct vtblk_softc *sc,
		    struct virtio_blk_config *);
static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
static void	vtblk_alloc_disk(struct vtblk_softc *,
		    struct virtio_blk_config *);

/*
 * Interface to the device switch.
 */
static d_open_t		vtblk_open;
static d_strategy_t	vtblk_strategy;
static d_dump_t		vtblk_dump;

static struct dev_ops vbd_disk_ops = {
	{ "vbd", 200, D_DISK | D_MPSAFE },
	.d_open		= vtblk_open,
	.d_close	= nullclose,
	.d_read		= physread,
	.d_write	= physwrite,
	.d_strategy	= vtblk_strategy,
	.d_dump		= vtblk_dump,
};

static void	vtblk_startio(struct vtblk_softc *);
static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
static int	vtblk_execute_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_vq_intr(void *);
static void	vtblk_complete(void *);

static void	vtblk_stop(struct vtblk_softc *);

static void	vtblk_prepare_dump(struct vtblk_softc *);
static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
static int	vtblk_flush_dump(struct vtblk_softc *);
static int	vtblk_poll_request(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_drain_vq(struct vtblk_softc *, int);
static void	vtblk_drain(struct vtblk_softc *);

static int	vtblk_alloc_requests(struct vtblk_softc *);
static void	vtblk_free_requests(struct vtblk_softc *);
static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
static void	vtblk_enqueue_request(struct vtblk_softc *,
		    struct vtblk_request *);

static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
static void	vtblk_enqueue_ready(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_request_error(struct vtblk_request *);
static void	vtblk_finish_bio(struct bio *, int);

static void	vtblk_setup_sysctl(struct vtblk_softc *);
static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);

/* Tunables. */
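/*
 * hw.vtblk.writecache_mode: -1 leaves the device's current setting alone,
 * 0 forces writethrough, 1 forces writeback.  The setting is only honored
 * when the host offers VIRTIO_BLK_F_CONFIG_WCE; a per-device override,
 * hw.vtblk.<unit>.writecache_mode, is fetched via vtblk_tunable_int().
 */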
static int vtblk_writecache_mode = -1;
TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);

/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_BARRIER		| \
     VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_WCE			| \
     VIRTIO_BLK_F_CONFIG_WCE		| \
     VIRTIO_RING_F_INDIRECT_DESC)

/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.
 */
#define VTBLK_MIN_SEGMENTS	2

static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	DEVMETHOD_END
};

static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
    vtblk_modevent, NULL);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);

static int
vtblk_modevent(module_t mod, int type, void *unused)
{
	int error;

	error = 0;

	switch (type) {
	case MOD_LOAD:
		break;
	case MOD_UNLOAD:
		break;
	case MOD_SHUTDOWN:
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}

static int
vtblk_probe(device_t dev)
{

	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
		return (ENXIO);

	device_set_desc(dev, "VirtIO Block Adapter");

	return (BUS_PROBE_DEFAULT);
}

static int
vtblk_attach(device_t dev)
{
	struct vtblk_softc *sc;
	struct virtio_blk_config blkcfg;
	int error;

	sc = device_get_softc(dev);
	sc->vtblk_dev = dev;

	lwkt_serialize_init(&sc->vtblk_slz);

	bioq_init(&sc->vtblk_bioq);
	TAILQ_INIT(&sc->vtblk_req_free);
	TAILQ_INIT(&sc->vtblk_req_ready);

	virtio_set_feature_desc(dev, vtblk_feature_desc);
	vtblk_negotiate_features(sc);

	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
		sc->vtblk_flags |= VTBLK_FLAG_READONLY;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER))
		sc->vtblk_flags |= VTBLK_FLAG_BARRIER;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
		sc->vtblk_flags |= VTBLK_FLAG_WC_CONFIG;

	vtblk_setup_sysctl(sc);

	/* Get local copy of config. */
	virtio_read_device_config(dev, 0, &blkcfg,
				  sizeof(struct virtio_blk_config));

	/*
	 * With the current sglist(9) implementation, it is not easy
	 * for us to support a maximum segment size as adjacent
	 * segments are coalesced. For now, just make sure it's larger
	 * than the maximum supported transfer size.
	 */
	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
		if (blkcfg.size_max < MAXPHYS) {
			error = ENOTSUP;
			device_printf(dev, "host requires unsupported "
			    "maximum segment size feature\n");
			goto fail;
		}
	}

	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
		error = EINVAL;
		device_printf(dev, "fewer than minimum number of segments "
		    "allowed: %d\n", sc->vtblk_max_nsegs);
		goto fail;
	}

	/*
	 * Allocate working sglist. The number of segments may be too
	 * large to safely store on the stack.
	 */
	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_INTWAIT);
	if (sc->vtblk_sglist == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate sglist\n");
		goto fail;
	}

	error = vtblk_alloc_virtqueue(sc);
	if (error) {
		device_printf(dev, "cannot allocate virtqueue\n");
		goto fail;
	}

	error = vtblk_alloc_requests(sc);
	if (error) {
		device_printf(dev, "cannot preallocate requests\n");
		goto fail;
	}

	vtblk_alloc_disk(sc, &blkcfg);

	error = virtio_setup_intr(dev, &sc->vtblk_slz);
	if (error) {
		device_printf(dev, "cannot setup virtqueue interrupt\n");
		goto fail;
	}

	virtqueue_enable_intr(sc->vtblk_vq);

fail:
	if (error)
		vtblk_detach(dev);

	return (error);
}

static int
vtblk_detach(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
	if (device_is_attached(dev))
		vtblk_stop(sc);
	lwkt_serialize_exit(&sc->vtblk_slz);

	vtblk_drain(sc);

	if (sc->cdev != NULL) {
		disk_destroy(&sc->vtblk_disk);
		sc->cdev = NULL;
	}

	if (sc->vtblk_sglist != NULL) {
		sglist_free(sc->vtblk_sglist);
		sc->vtblk_sglist = NULL;
	}

	return (0);
}

static int
vtblk_suspend(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
	/* XXX BMV: virtio_stop(), etc needed here? */
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static int
vtblk_resume(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	/* XXX BMV: virtio_reinit(), etc needed here? */
	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
#if 0 /* XXX Resume IO? */
	vtblk_startio(sc);
#endif
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static int
vtblk_shutdown(device_t dev)
{

	return (0);
}

static int
vtblk_open(struct dev_open_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;

	sc = dev->si_drv1;
	if (sc == NULL)
		return (ENXIO);

	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
}

static int
vtblk_dump(struct dev_dump_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;
	uint64_t buf_start, buf_len;
	int error;

	sc = dev->si_drv1;
	if (sc == NULL)
		return (ENXIO);

	buf_start = ap->a_offset;
	buf_len = ap->a_length;

//	lwkt_serialize_enter(&sc->vtblk_slz);

	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
		vtblk_prepare_dump(sc);
		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
	}

	if (buf_len > 0)
		error = vtblk_write_dump(sc, ap->a_virtual, buf_start,
		    buf_len);
	else if (buf_len == 0)
		error = vtblk_flush_dump(sc);
	else {
		error = EINVAL;
		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
	}

//	lwkt_serialize_exit(&sc->vtblk_slz);

	return (error);
}

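/*
 * Strategy entry point: queue the bio and kick the device.  The bio is
 * sorted into the per-device queue under the serializer and submitted by
 * vtblk_startio(); completion happens later in vtblk_complete().
 */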
static int
vtblk_strategy(struct dev_strategy_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;
	struct bio *bio = ap->a_bio;
	struct buf *bp = bio->bio_buf;

	sc = dev->si_drv1;
	if (sc == NULL) {
		vtblk_finish_bio(bio, EINVAL);
		return EINVAL;
	}

	/*
	 * Fail any write if RO. Unfortunately, there does not seem to
	 * be a better way to report our readonly'ness to GEOM above.
	 *
	 * XXX: Is that true in DFly?
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) &&
	    (bp->b_cmd == BUF_CMD_WRITE || bp->b_cmd == BUF_CMD_FLUSH)) {
		vtblk_finish_bio(bio, EROFS);
		return (EINVAL);
	}

	lwkt_serialize_enter(&sc->vtblk_slz);
	if ((sc->vtblk_flags & VTBLK_FLAG_DETACH) == 0) {
		devstat_start_transaction(&sc->stats);
		bioqdisksort(&sc->vtblk_bioq, bio);
		vtblk_startio(sc);
		lwkt_serialize_exit(&sc->vtblk_slz);
	} else {
		lwkt_serialize_exit(&sc->vtblk_slz);
		vtblk_finish_bio(bio, ENXIO);
	}
	return 0;
}

static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
	device_t dev;
	uint64_t features;

	dev = sc->vtblk_dev;
	features = VTBLK_FEATURES;

	sc->vtblk_features = virtio_negotiate_features(dev, features);
}

/*
 * Calculate the maximum number of DMA segments supported.  Note that
 * the request header and status byte are also carried in the segment
 * list; VTBLK_MIN_SEGMENTS accounts for them, so it is added to the
 * desired data segment count.  If the SEG_MAX feature is not offered,
 * assume the host can handle the number of segments required for a
 * MAXPHYS-sized request.
 *
 * The additional + 1 is in case a MAXPHYS-sized buffer crosses
 * a page boundary.
 */
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	device_t dev;
	int nsegs;

	dev = sc->vtblk_dev;
	nsegs = VTBLK_MIN_SEGMENTS;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
		nsegs = MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1 + nsegs);
	} else {
		nsegs = MAXPHYS / PAGE_SIZE + 1 + nsegs;
	}
	if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
		nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);

	return (nsegs);
}

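/*
 * The block device uses a single virtqueue (index 0) for all requests;
 * vtblk_vq_intr is registered as its interrupt handler.
 */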
static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
	device_t dev;
	struct vq_alloc_info vq_info;

	dev = sc->vtblk_dev;

	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
	    vtblk_vq_intr, sc, &sc->vtblk_vq,
	    "%s request", device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

static void
vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
{

	/* Set either writeback (1) or writethrough (0) mode. */
	virtio_write_dev_config_1(sc->vtblk_dev,
	    offsetof(struct virtio_blk_config, writeback), wc);
}

static int
vtblk_write_cache_enabled(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	int wc;

	if (sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) {
		wc = vtblk_tunable_int(sc, "writecache_mode",
		    vtblk_writecache_mode);
		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
			vtblk_set_write_cache(sc, wc);
		else
			wc = blkcfg->writeback;
	} else
		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_WCE);

	return (wc);
}

static int
vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct vtblk_softc *sc;
	int wc, error;

	sc = oidp->oid_arg1;
	wc = sc->vtblk_write_cache;

	error = sysctl_handle_int(oidp, &wc, 0, req);
	if (error || req->newptr == NULL)
		return (error);
	if ((sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) == 0)
		return (EPERM);
	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
		return (EINVAL);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_write_cache = wc;
	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static void
vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
	struct disk_info info;

	/* construct the disk_info */
	bzero(&info, sizeof(info));

	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_BLK_SIZE))
		sc->vtblk_sector_size = blkcfg->blk_size;
	else
		sc->vtblk_sector_size = DEV_BSIZE;

	info.d_media_blksize = sc->vtblk_sector_size;
	info.d_media_blocks = blkcfg->capacity;

	info.d_ncylinders = blkcfg->geometry.cylinders;
	info.d_nheads = blkcfg->geometry.heads;
	info.d_secpertrack = blkcfg->geometry.sectors;

	info.d_secpercyl = info.d_secpertrack * info.d_nheads;

	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
	else
		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;

	devstat_add_entry(&sc->stats, "vbd", device_get_unit(sc->vtblk_dev),
			  DEV_BSIZE, DEVSTAT_ALL_SUPPORTED,
			  DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
			  DEVSTAT_PRIORITY_DISK);

	/* attach a generic disk device to ourselves */
	sc->cdev = disk_create(device_get_unit(sc->vtblk_dev), &sc->vtblk_disk,
			       &vbd_disk_ops);

	sc->cdev->si_drv1 = sc;
	sc->cdev->si_iosize_max = MAXPHYS;
	disk_setdiskinfo(&sc->vtblk_disk, &info);
}

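/*
 * Submit as many queued bios as possible.  Ready requests (built earlier
 * but deferred, e.g. by barrier handling or a full queue) are retried
 * before new bios are pulled from the queue; the host is only notified if
 * something was actually enqueued.  Called with the serializer held.
 */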
static void
vtblk_startio(struct vtblk_softc *sc)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int enq;

	vq = sc->vtblk_vq;
	enq = 0;

	ASSERT_SERIALIZED(&sc->vtblk_slz);

	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
		return;

	while (!virtqueue_full(vq)) {
		if ((req = vtblk_dequeue_ready(sc)) == NULL)
			req = vtblk_bio_request(sc);
		if (req == NULL)
			break;

		if (vtblk_execute_request(sc, req) != 0) {
			vtblk_enqueue_ready(sc, req);
			break;
		}

		enq++;
	}

	if (enq > 0)
		virtqueue_notify(vq, &sc->vtblk_slz);
}

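/*
 * Convert the first queued bio into a VirtIO block request.  VirtIO sector
 * numbers are always expressed in 512-byte units regardless of the
 * advertised block size, which is why bio_offset is divided by DEV_BSIZE.
 */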
static struct vtblk_request *
vtblk_bio_request(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bio;
	struct buf *bp;

	bioq = &sc->vtblk_bioq;

	if (bioq_first(bioq) == NULL)
		return (NULL);

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return (NULL);

	bio = bioq_takefirst(bioq);
	req->vbr_bp = bio;
	req->vbr_ack = -1;
	req->vbr_barrier = 0;
	req->vbr_hdr.ioprio = 1;
	bp = bio->bio_buf;

	switch (bp->b_cmd) {
	case BUF_CMD_FLUSH:
		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
		break;
	case BUF_CMD_READ:
		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
		break;
	case BUF_CMD_WRITE:
		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
		break;
	default:
		KASSERT(0, ("bio with unhandled cmd: %d", bp->b_cmd));
		req->vbr_hdr.type = -1;
		break;
	}

	if (bp->b_flags & B_ORDERED) {
		if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0)
			req->vbr_barrier = 1;
		else
			req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;
	}

	return (req);
}

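/*
 * Build the descriptor chain for a request and enqueue it:
 *
 *   [ vbr_hdr (device-readable) ][ data segments ][ vbr_ack (device-writable) ]
 *
 * For reads the data segments are device-writable as well, so they are
 * counted in 'writable' along with the ack byte.  Barrier requests that
 * must be emulated (vbr_barrier set) are only issued once the virtqueue
 * has drained.
 */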
static int
vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct sglist *sg;
	struct bio *bio;
	struct buf *bp;
	int ordered, writable, error;

	sg = sc->vtblk_sglist;
	bio = req->vbr_bp;
	bp = bio->bio_buf;
	ordered = 0;
	writable = 0;

	if (sc->vtblk_req_ordered != NULL)
		return (EBUSY);

	if (req->vbr_barrier) {
		/*
		 * This request will be executed once all
		 * the in-flight requests are completed.
		 */
		if (!virtqueue_empty(sc->vtblk_vq))
			return (EBUSY);
		ordered = 1;
	}

	/*
	 * sglist is live throughout this subroutine.
	 */
	sglist_reset(sg);

	error = sglist_append(sg, &req->vbr_hdr,
			      sizeof(struct virtio_blk_outhdr));
	KASSERT(error == 0, ("error adding header to sglist"));
	KASSERT(sg->sg_nseg == 1,
	    ("header spanned multiple segments: %d", sg->sg_nseg));

	if (bp->b_cmd == BUF_CMD_READ || bp->b_cmd == BUF_CMD_WRITE) {
		error = sglist_append(sg, bp->b_data, bp->b_bcount);
		KASSERT(error == 0, ("error adding buffer to sglist"));

		/* BUF_CMD_READ means the host writes into our buffer. */
		if (bp->b_cmd == BUF_CMD_READ)
			writable += sg->sg_nseg - 1;
	}

	error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
	KASSERT(error == 0, ("error adding ack to sglist"));
	writable++;

	KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
	    ("fewer than min segments: %d", sg->sg_nseg));

	error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
				  sg->sg_nseg - writable, writable);
	if (error == 0 && ordered)
		sc->vtblk_req_ordered = req;

	sglist_reset(sg);

	return (error);
}

static int
vtblk_vq_intr(void *xsc)
{
	vtblk_complete(xsc);

	return (1);
}

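/*
 * Completion path, run from the virtqueue interrupt.  Virtqueue interrupts
 * stay disabled while the used ring is drained; each completed bio is
 * finished with biodone() outside the serializer, and interrupts are only
 * re-enabled once no further entries are pending.
 */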
static void
vtblk_complete(void *arg)
{
	struct vtblk_softc *sc;
	struct vtblk_request *req;
	struct virtqueue *vq;
	struct bio *bio;
	struct buf *bp;

	sc = arg;
	vq = sc->vtblk_vq;

	lwkt_serialize_handler_disable(&sc->vtblk_slz);
	virtqueue_disable_intr(sc->vtblk_vq);
	ASSERT_SERIALIZED(&sc->vtblk_slz);

retry:
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
		return;

	while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
		bio = req->vbr_bp;
		bp = bio->bio_buf;

		if (sc->vtblk_req_ordered != NULL) {
			/* This should be the only outstanding request. */
			KKASSERT(sc->vtblk_req_ordered == req);
			sc->vtblk_req_ordered = NULL;
		}

		if (req->vbr_ack == VIRTIO_BLK_S_OK)
			bp->b_resid = 0;
		else {
			bp->b_flags |= B_ERROR;
			if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP) {
				bp->b_error = ENOTSUP;
			} else {
				bp->b_error = EIO;
			}
		}

		devstat_end_transaction_buf(&sc->stats, bio->bio_buf);

		lwkt_serialize_exit(&sc->vtblk_slz);
		/*
		 * Dropping the serializer around biodone() cannot cause this
		 * handler to be re-entered, since virtqueue interrupts were
		 * disabled above.  It does allow concurrent vtblk_strategy()
		 * and vtblk_startio() command dispatches.
		 */
		biodone(bio);
		lwkt_serialize_enter(&sc->vtblk_slz);

		vtblk_enqueue_request(sc, req);
	}

	vtblk_startio(sc);

	if (virtqueue_enable_intr(vq) != 0) {
		/*
		 * New virtqueue entries appeared while interrupts were being
		 * re-enabled; disable them again and process the entries now.
		 * The loop above still drops the serializer around biodone(),
		 * so I/O dispatch is not blocked for too long.
		 */
		virtqueue_disable_intr(vq);
		goto retry;
	}
	lwkt_serialize_handler_enable(&sc->vtblk_slz);
}

static void
vtblk_stop(struct vtblk_softc *sc)
{

	virtqueue_disable_intr(sc->vtblk_vq);
	virtio_stop(sc->vtblk_dev);
}

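/*
 * Kernel dump support.  The dump path may run from panic context, so the
 * device is reset and reinitialized with interrupts disabled, and each
 * request is polled synchronously instead of going through the normal
 * completion handler.
 */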
static void
vtblk_prepare_dump(struct vtblk_softc *sc)
{
	device_t dev;
	struct virtqueue *vq;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	vtblk_stop(sc);

	/*
	 * Drain all requests caught in-flight in the virtqueue,
	 * skipping biodone(). When dumping, only one request is
	 * outstanding at a time, and we just poll the virtqueue
	 * for the response.
	 */
	vtblk_drain_vq(sc, 1);

	if (virtio_reinit(dev, sc->vtblk_features) != 0) {
		panic("%s: cannot reinit VirtIO block device during dump",
		    device_get_nameunit(dev));
	}

	virtqueue_disable_intr(vq);
	virtio_reinit_complete(dev);
}

static int
vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
    size_t length)
{
	struct bio bio;
	struct buf bp;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = offset / 512;

	req->vbr_bp = &bio;
	bzero(&bio, sizeof(struct bio));
	bzero(&bp, sizeof(struct buf));

	bio.bio_buf = &bp;
	bp.b_cmd = BUF_CMD_WRITE;
	bp.b_data = virtual;
	bp.b_bcount = length;

	return (vtblk_poll_request(sc, req));
}

static int
vtblk_flush_dump(struct vtblk_softc *sc)
{
	struct bio bio;
	struct buf bp;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &bio;
	bzero(&bio, sizeof(struct bio));
	bzero(&bp, sizeof(struct buf));

	bio.bio_buf = &bp;
	bp.b_cmd = BUF_CMD_FLUSH;

	return (vtblk_poll_request(sc, req));
}

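/*
 * Issue a single request and busy-wait for its completion.  Only used by
 * the dump path, where exactly one request is outstanding at a time.
 */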
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct virtqueue *vq;
	int error;

	vq = sc->vtblk_vq;

	if (!virtqueue_empty(vq))
		return (EBUSY);

	error = vtblk_execute_request(sc, req);
	if (error)
		return (error);

	virtqueue_notify(vq, NULL);
	virtqueue_poll(vq, NULL);

	error = vtblk_request_error(req);
	if (error && bootverbose) {
		device_printf(sc->vtblk_dev,
		    "%s: IO error: %d\n", __func__, error);
	}

	return (error);
}

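/*
 * Pull any requests still sitting in the virtqueue back onto the free
 * list.  When skip_done is set (the dump path), biodone() is skipped for
 * the associated bios since the system is about to dump.
 */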
static void
vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int last;

	vq = sc->vtblk_vq;
	last = 0;

	while ((req = virtqueue_drain(vq, &last)) != NULL) {
		if (!skip_done)
			vtblk_finish_bio(req->vbr_bp, ENXIO);

		vtblk_enqueue_request(sc, req);
	}

	sc->vtblk_req_ordered = NULL;
	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}

static void
vtblk_drain(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (sc->vtblk_vq != NULL)
		vtblk_drain_vq(sc, 0);

	while ((req = vtblk_dequeue_ready(sc)) != NULL) {
		vtblk_finish_bio(req->vbr_bp, ENXIO);
		vtblk_enqueue_request(sc, req);
	}

	while (bioq_first(bioq) != NULL) {
		bp = bioq_takefirst(bioq);
		vtblk_finish_bio(bp, ENXIO);
	}

	vtblk_free_requests(sc);
}

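/*
 * Requests are allocated with contigmalloc() and 16-byte alignment,
 * presumably so that vbr_hdr honors its __aligned(16) attribute and both
 * the header and the ack byte each map to a single sglist segment, as
 * asserted below.
 */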
static int
vtblk_alloc_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	int i, nreqs;

	nreqs = virtqueue_size(sc->vtblk_vq);

	/*
	 * Preallocate sufficient requests to keep the virtqueue full. Each
	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
	 * the number allocated when indirect descriptors are not available.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
		nreqs /= VTBLK_MIN_SEGMENTS;

	for (i = 0; i < nreqs; i++) {
		req = contigmalloc(sizeof(struct vtblk_request), M_DEVBUF,
		    M_WAITOK, 0, BUS_SPACE_MAXADDR, 16, 0);
		if (req == NULL)
			return (ENOMEM);

		KKASSERT(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr))
		    == 1);
		KKASSERT(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack))
		    == 1);

		sc->vtblk_request_count++;
		vtblk_enqueue_request(sc, req);
	}

	return (0);
}

static void
vtblk_free_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	while ((req = vtblk_dequeue_request(sc)) != NULL) {
		sc->vtblk_request_count--;
		contigfree(req, sizeof(struct vtblk_request), M_DEVBUF);
	}

	KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
}

static struct vtblk_request *
vtblk_dequeue_request(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_free);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
{

	bzero(req, sizeof(struct vtblk_request));
	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}

static struct vtblk_request *
vtblk_dequeue_ready(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_ready);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
{

	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
}

static int
vtblk_request_error(struct vtblk_request *req)
{
	int error;

	switch (req->vbr_ack) {
	case VIRTIO_BLK_S_OK:
		error = 0;
		break;
	case VIRTIO_BLK_S_UNSUPP:
		error = ENOTSUP;
		break;
	default:
		error = EIO;
		break;
	}

	return (error);
}

static void
vtblk_finish_bio(struct bio *bp, int error)
{

	if (error) {
		bp->bio_buf->b_error = error;
		bp->bio_buf->b_flags |= B_ERROR;
	}
	biodone(bp);
}

static void
vtblk_setup_sysctl(struct vtblk_softc *sc)
{
	device_t dev;
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid *tree;
	struct sysctl_oid_list *child;

	dev = sc->vtblk_dev;
	ctx = device_get_sysctl_ctx(dev);
	tree = device_get_sysctl_tree(dev);
	child = SYSCTL_CHILDREN(tree);

	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, vtblk_write_cache_sysctl,
	    "I", "Write cache mode (writethrough (0) or writeback (1))");
}

static int
vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
{
	char path[64];

	ksnprintf(path, sizeof(path),
	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
	TUNABLE_INT_FETCH(path, &def);

	return (def);
}