1 /*-
2  * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * $FreeBSD: head/sys/dev/virtio/block/virtio_blk.c 252707 2013-07-04 17:57:26Z bryanv $
27  */
28 
29 /* Driver for VirtIO block devices. */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/bio.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/sglist.h>
38 #include <sys/sysctl.h>
39 #include <sys/lock.h>
40 #include <sys/queue.h>
41 #include <sys/serialize.h>
42 #include <sys/buf2.h>
43 #include <sys/rman.h>
44 #include <sys/disk.h>
45 #include <sys/devicestat.h>
46 
47 #include <dev/virtual/virtio/virtio/virtio.h>
48 #include <dev/virtual/virtio/virtio/virtqueue.h>
49 #include "virtio_blk.h"
50 #include "virtio_if.h"
51 
52 struct vtblk_request {
53 	struct virtio_blk_outhdr	 vbr_hdr __aligned(16);
54 	struct bio			*vbr_bp;
55 	uint8_t				 vbr_ack;
56 	uint8_t				 vbr_barrier;
57 
58 	TAILQ_ENTRY(vtblk_request)	 vbr_link;
59 };
60 
61 enum vtblk_cache_mode {
62 	VTBLK_CACHE_WRITETHROUGH,
63 	VTBLK_CACHE_WRITEBACK,
64 	VTBLK_CACHE_MAX
65 };
66 
67 struct vtblk_softc {
68 	device_t		 vtblk_dev;
69 	struct lwkt_serialize	 vtblk_slz;
70 	uint64_t		 vtblk_features;
71 	uint32_t		 vtblk_flags;
72 #define VTBLK_FLAG_INDIRECT	0x0001
73 #define VTBLK_FLAG_READONLY	0x0002
74 #define VTBLK_FLAG_DETACH	0x0004
75 #define VTBLK_FLAG_SUSPEND	0x0008
76 #define VTBLK_FLAG_DUMPING	0x0010
77 #define VTBLK_FLAG_BARRIER	0x0020
78 #define VTBLK_FLAG_WC_CONFIG	0x0040
79 
80 	struct virtqueue	*vtblk_vq;
81 	struct sglist		*vtblk_sglist;
82 	struct disk		 vtblk_disk;
83 	cdev_t			 cdev;
84 	struct devstat		 stats;
85 
86 	struct bio_queue_head	 vtblk_bioq;
87 	TAILQ_HEAD(, vtblk_request)
88 				 vtblk_req_free;
89 	TAILQ_HEAD(, vtblk_request)
90 				 vtblk_req_ready;
91 	struct vtblk_request	*vtblk_req_ordered;
92 
93 	int			 vtblk_sector_size;
94 	int			 vtblk_max_nsegs;
95 	int			 vtblk_request_count;
96 	enum vtblk_cache_mode	 vtblk_write_cache;
97 
98 	struct vtblk_request	 vtblk_dump_request;
99 };
100 
101 static struct virtio_feature_desc vtblk_feature_desc[] = {
102 	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
103 	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
104 	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
105 	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
106 	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
107 	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
108 	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
109 	{ VIRTIO_BLK_F_WCE,		"WriteCache"	},
110 	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
111 	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},
112 
113 	{ 0, NULL }
114 };
115 
116 static int	vtblk_modevent(module_t, int, void *);
117 
118 static int	vtblk_probe(device_t);
119 static int	vtblk_attach(device_t);
120 static int	vtblk_detach(device_t);
121 static int	vtblk_suspend(device_t);
122 static int	vtblk_resume(device_t);
123 static int	vtblk_shutdown(device_t);
124 
125 static void	vtblk_negotiate_features(struct vtblk_softc *);
126 static int	vtblk_maximum_segments(struct vtblk_softc *,
127 		    struct virtio_blk_config *);
128 static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
129 static void	vtblk_set_write_cache(struct vtblk_softc *, int);
130 static int	vtblk_write_cache_enabled(struct vtblk_softc *sc,
131 		    struct virtio_blk_config *);
132 static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
133 static void	vtblk_alloc_disk(struct vtblk_softc *,
134 		    struct virtio_blk_config *);
135 /*
136  * Interface to the device switch.
137  */
138 static d_open_t		vtblk_open;
139 static d_strategy_t	vtblk_strategy;
140 static d_dump_t		vtblk_dump;
141 
142 static struct dev_ops vbd_disk_ops = {
143 	{ "vbd", 200, D_DISK | D_MPSAFE },
144 	.d_open		= vtblk_open,
145 	.d_close	= nullclose,
146 	.d_read		= physread,
147 	.d_write	= physwrite,
148 	.d_strategy	= vtblk_strategy,
149 	.d_dump		= vtblk_dump,
150 };
151 
152 static void	vtblk_startio(struct vtblk_softc *);
153 static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
154 static int	vtblk_execute_request(struct vtblk_softc *,
155 		    struct vtblk_request *);
156 
157 static int	vtblk_vq_intr(void *);
158 static void	vtblk_complete(void *);
159 
160 static void	vtblk_stop(struct vtblk_softc *);
161 
162 static void	vtblk_prepare_dump(struct vtblk_softc *);
163 static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
164 static int	vtblk_flush_dump(struct vtblk_softc *);
165 static int	vtblk_poll_request(struct vtblk_softc *,
166 		    struct vtblk_request *);
167 
168 static void	vtblk_drain_vq(struct vtblk_softc *, int);
169 static void	vtblk_drain(struct vtblk_softc *);
170 
171 static int	vtblk_alloc_requests(struct vtblk_softc *);
172 static void	vtblk_free_requests(struct vtblk_softc *);
173 static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
174 static void	vtblk_enqueue_request(struct vtblk_softc *,
175 		    struct vtblk_request *);
176 
177 static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
178 static void	vtblk_enqueue_ready(struct vtblk_softc *,
179 		    struct vtblk_request *);
180 
181 static int	vtblk_request_error(struct vtblk_request *);
182 static void	vtblk_finish_bio(struct bio *, int);
183 
184 static void	vtblk_setup_sysctl(struct vtblk_softc *);
185 static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);
186 
187 /* Tunables. */
188 static int vtblk_writecache_mode = -1;
189 TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);
190 
191 /* Features desired/implemented by this driver. */
192 #define VTBLK_FEATURES \
193     (VIRTIO_BLK_F_BARRIER		| \
194      VIRTIO_BLK_F_SIZE_MAX		| \
195      VIRTIO_BLK_F_SEG_MAX		| \
196      VIRTIO_BLK_F_GEOMETRY		| \
197      VIRTIO_BLK_F_RO			| \
198      VIRTIO_BLK_F_BLK_SIZE		| \
199      VIRTIO_BLK_F_WCE			| \
200      VIRTIO_BLK_F_CONFIG_WCE		| \
201      VIRTIO_RING_F_INDIRECT_DESC)
202 
203 /*
204  * Each block request uses at least two segments - one for the header
205  * and one for the status.
206  */
207 #define VTBLK_MIN_SEGMENTS	2
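
/*
 * Descriptor layout of a single request as it is handed to the host
 * (built in vtblk_execute_request()):
 *
 *   [ virtio_blk_outhdr ]   driver-written, device-readable
 *   [ data buffer(s) ... ]  device-readable for writes, writable for reads
 *   [ 1-byte ack/status ]   device-writable
 */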
208 
209 static device_method_t vtblk_methods[] = {
210 	/* Device methods. */
211 	DEVMETHOD(device_probe,		vtblk_probe),
212 	DEVMETHOD(device_attach,	vtblk_attach),
213 	DEVMETHOD(device_detach,	vtblk_detach),
214 	DEVMETHOD(device_suspend,	vtblk_suspend),
215 	DEVMETHOD(device_resume,	vtblk_resume),
216 	DEVMETHOD(device_shutdown,	vtblk_shutdown),
217 
218 	DEVMETHOD_END
219 };
220 
221 static driver_t vtblk_driver = {
222 	"vtblk",
223 	vtblk_methods,
224 	sizeof(struct vtblk_softc)
225 };
226 static devclass_t vtblk_devclass;
227 
228 DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
229     vtblk_modevent, NULL);
230 MODULE_VERSION(virtio_blk, 1);
231 MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
232 
233 static int
234 vtblk_modevent(module_t mod, int type, void *unused)
235 {
236 	int error;
237 
238 	error = 0;
239 
240 	switch (type) {
241 	case MOD_LOAD:
242 		break;
243 	case MOD_UNLOAD:
244 		break;
245 	case MOD_SHUTDOWN:
246 		break;
247 	default:
248 		error = EOPNOTSUPP;
249 		break;
250 	}
251 
252 	return (error);
253 }
254 
255 static int
256 vtblk_probe(device_t dev)
257 {
258 
259 	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
260 		return (ENXIO);
261 
262 	device_set_desc(dev, "VirtIO Block Adapter");
263 
264 	return (BUS_PROBE_DEFAULT);
265 }
266 
267 static int
268 vtblk_attach(device_t dev)
269 {
270 	struct vtblk_softc *sc;
271 	struct virtio_blk_config blkcfg;
272 	int error;
273 
274 	sc = device_get_softc(dev);
275 	sc->vtblk_dev = dev;
276 
277 	lwkt_serialize_init(&sc->vtblk_slz);
278 
279 	bioq_init(&sc->vtblk_bioq);
280 	TAILQ_INIT(&sc->vtblk_req_free);
281 	TAILQ_INIT(&sc->vtblk_req_ready);
282 
283 	virtio_set_feature_desc(dev, vtblk_feature_desc);
284 	vtblk_negotiate_features(sc);
285 
286 	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
287 		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
288 	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
289 		sc->vtblk_flags |= VTBLK_FLAG_READONLY;
290 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER))
291 		sc->vtblk_flags |= VTBLK_FLAG_BARRIER;
292 	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
293 		sc->vtblk_flags |= VTBLK_FLAG_WC_CONFIG;
294 
295 	vtblk_setup_sysctl(sc);
296 
297 	/* Get local copy of config. */
298 	virtio_read_device_config(dev, 0, &blkcfg,
299 				  sizeof(struct virtio_blk_config));
300 
301 	/*
302 	 * With the current sglist(9) implementation, it is not easy
303 	 * for us to support a maximum segment size as adjacent
304 	 * segments are coalesced. For now, just make sure it's larger
305 	 * than the maximum supported transfer size.
306 	 */
307 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
308 		if (blkcfg.size_max < MAXPHYS) {
309 			error = ENOTSUP;
310 			device_printf(dev, "host requires unsupported "
311 			    "maximum segment size feature\n");
312 			goto fail;
313 		}
314 	}
315 
316 	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
317 	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
318 		error = EINVAL;
319 		device_printf(dev, "fewer than minimum number of segments "
320 		    "allowed: %d\n", sc->vtblk_max_nsegs);
321 		goto fail;
322 	}
323 
324 	/*
325 	 * Allocate working sglist. The number of segments may be too
326 	 * large to safely store on the stack.
327 	 */
328 	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_INTWAIT);
329 	if (sc->vtblk_sglist == NULL) {
330 		error = ENOMEM;
331 		device_printf(dev, "cannot allocate sglist\n");
332 		goto fail;
333 	}
334 
335 	error = vtblk_alloc_virtqueue(sc);
336 	if (error) {
337 		device_printf(dev, "cannot allocate virtqueue\n");
338 		goto fail;
339 	}
340 
341 	error = vtblk_alloc_requests(sc);
342 	if (error) {
343 		device_printf(dev, "cannot preallocate requests\n");
344 		goto fail;
345 	}
346 
347 	vtblk_alloc_disk(sc, &blkcfg);
348 
349 	error = virtio_setup_intr(dev, &sc->vtblk_slz);
350 	if (error) {
351 		device_printf(dev, "cannot setup virtqueue interrupt\n");
352 		goto fail;
353 	}
354 
355 	virtqueue_enable_intr(sc->vtblk_vq);
356 
357 fail:
358 	if (error)
359 		vtblk_detach(dev);
360 
361 	return (error);
362 }
363 
364 static int
365 vtblk_detach(device_t dev)
366 {
367 	struct vtblk_softc *sc;
368 
369 	sc = device_get_softc(dev);
370 
371 	lwkt_serialize_enter(&sc->vtblk_slz);
372 	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
373 	if (device_is_attached(dev))
374 		vtblk_stop(sc);
375 	lwkt_serialize_exit(&sc->vtblk_slz);
376 
377 	vtblk_drain(sc);
378 
379 	if (sc->cdev != NULL) {
380 		disk_destroy(&sc->vtblk_disk);
381 		sc->cdev = NULL;
382 	}
383 
384 	if (sc->vtblk_sglist != NULL) {
385 		sglist_free(sc->vtblk_sglist);
386 		sc->vtblk_sglist = NULL;
387 	}
388 
389 	return (0);
390 }
391 
392 static int
393 vtblk_suspend(device_t dev)
394 {
395 	struct vtblk_softc *sc;
396 
397 	sc = device_get_softc(dev);
398 
399 	lwkt_serialize_enter(&sc->vtblk_slz);
400 	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
401 	/* XXX BMV: virtio_stop(), etc needed here? */
402 	lwkt_serialize_exit(&sc->vtblk_slz);
403 
404 	return (0);
405 }
406 
407 static int
408 vtblk_resume(device_t dev)
409 {
410 	struct vtblk_softc *sc;
411 
412 	sc = device_get_softc(dev);
413 
414 	lwkt_serialize_enter(&sc->vtblk_slz);
415 	/* XXX BMV: virtio_reinit(), etc needed here? */
416 	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
417 #if 0 /* XXX Resume IO? */
418 	vtblk_startio(sc);
419 #endif
420 	lwkt_serialize_exit(&sc->vtblk_slz);
421 
422 	return (0);
423 }
424 
425 static int
426 vtblk_shutdown(device_t dev)
427 {
428 
429 	return (0);
430 }
431 
432 static int
433 vtblk_open(struct dev_open_args *ap)
434 {
435 	struct vtblk_softc *sc;
436 	cdev_t dev = ap->a_head.a_dev;
437 	sc = dev->si_drv1;
438 	if (sc == NULL)
439 		return (ENXIO);
440 
441 	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
442 }
443 
444 static int
445 vtblk_dump(struct dev_dump_args *ap)
446 {
447 	struct vtblk_softc *sc;
448 	cdev_t dev = ap->a_head.a_dev;
	uint64_t buf_start, buf_len;
	int error;
451 
452 	sc = dev->si_drv1;
453 	if (sc == NULL)
454 		return (ENXIO);
455 
	buf_start = ap->a_offset;
	buf_len = ap->a_length;
458 
459 //	lwkt_serialize_enter(&sc->vtblk_slz);
460 
461 	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
462 		vtblk_prepare_dump(sc);
463 		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
464 	}
465 
466 	if (buf_len > 0)
467 		error = vtblk_write_dump(sc, ap->a_virtual, buf_start,
468 		    buf_len);
469 	else if (buf_len == 0)
470 		error = vtblk_flush_dump(sc);
471 	else {
472 		error = EINVAL;
473 		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
474 	}
475 
476 //	lwkt_serialize_exit(&sc->vtblk_slz);
477 
478 	return (error);
479 }
480 
481 static int
482 vtblk_strategy(struct dev_strategy_args *ap)
483 {
484 	struct vtblk_softc *sc;
485 	cdev_t dev = ap->a_head.a_dev;
486 	sc = dev->si_drv1;
487 	struct bio *bio = ap->a_bio;
488 	struct buf *bp = bio->bio_buf;
489 
490 	if (sc == NULL) {
491 		vtblk_finish_bio(bio, EINVAL);
492 		return EINVAL;
493 	}
494 
	/*
	 * Fail any write or flush if the device is read-only. Unfortunately,
	 * there does not seem to be a better way to report our read-only
	 * status to GEOM above.
	 *
	 * XXX: Is that true in DFly?
	 */
	if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
	    (bp->b_cmd == BUF_CMD_WRITE || bp->b_cmd == BUF_CMD_FLUSH)) {
		vtblk_finish_bio(bio, EROFS);
		return (EINVAL);
	}
506 
507 	lwkt_serialize_enter(&sc->vtblk_slz);
508 	if ((sc->vtblk_flags & VTBLK_FLAG_DETACH) == 0) {
509 		devstat_start_transaction(&sc->stats);
510 		bioqdisksort(&sc->vtblk_bioq, bio);
511 		vtblk_startio(sc);
512 		lwkt_serialize_exit(&sc->vtblk_slz);
513 	} else {
514 		lwkt_serialize_exit(&sc->vtblk_slz);
515 		vtblk_finish_bio(bio, ENXIO);
516 	}
517 	return 0;
518 }
519 
520 static void
521 vtblk_negotiate_features(struct vtblk_softc *sc)
522 {
523 	device_t dev;
524 	uint64_t features;
525 
526 	dev = sc->vtblk_dev;
527 	features = VTBLK_FEATURES;
528 
529 	sc->vtblk_features = virtio_negotiate_features(dev, features);
530 }
531 
532 /*
 * Calculate the maximum number of DMA segments supported.  Note
534  * that the in/out header is encoded in the segment list.  We
535  * assume that VTBLK_MIN_SEGMENTS covers that part of it so
536  * we add it into the desired total.  If the SEG_MAX feature
537  * is not specified we have to just assume that the host can
538  * handle the maximum number of segments required for a MAXPHYS
539  * sized request.
540  *
541  * The additional + 1 is in case a MAXPHYS-sized buffer crosses
542  * a page boundary.
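 *
 * For example, assuming a 128KB MAXPHYS and 4KB pages, a transfer may
 * need up to 32 + 1 = 33 data segments, or 35 in total once the
 * VTBLK_MIN_SEGMENTS for the header and ack are added.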
543  */
544 static int
545 vtblk_maximum_segments(struct vtblk_softc *sc,
546     struct virtio_blk_config *blkcfg)
547 {
548 	device_t dev;
549 	int nsegs;
550 
551 	dev = sc->vtblk_dev;
552 	nsegs = VTBLK_MIN_SEGMENTS;
553 
554 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
555 		nsegs = MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1 + nsegs);
556 	} else {
557 		nsegs = MAXPHYS / PAGE_SIZE + 1 + nsegs;
558 	}
559 	if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
560 		nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
561 
562 	return (nsegs);
563 }
564 
565 static int
566 vtblk_alloc_virtqueue(struct vtblk_softc *sc)
567 {
568 	device_t dev;
569 	struct vq_alloc_info vq_info;
570 
571 	dev = sc->vtblk_dev;
572 
573 	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
574 	    vtblk_vq_intr, sc, &sc->vtblk_vq,
575 	    "%s request", device_get_nameunit(dev));
576 
577 	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
578 }
579 
580 static void
581 vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
582 {
583 
584 	/* Set either writeback (1) or writethrough (0) mode. */
585 	virtio_write_dev_config_1(sc->vtblk_dev,
586 	    offsetof(struct virtio_blk_config, writeback), wc);
587 }
588 
589 static int
590 vtblk_write_cache_enabled(struct vtblk_softc *sc,
591     struct virtio_blk_config *blkcfg)
592 {
593 	int wc;
594 
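	/*
	 * A valid writecache_mode tunable overrides the host's current
	 * setting; otherwise report the config-space value, or fall back
	 * to the negotiated WCE feature when CONFIG_WCE is absent.
	 */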
595 	if (sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) {
596 		wc = vtblk_tunable_int(sc, "writecache_mode",
597 		    vtblk_writecache_mode);
598 		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
599 			vtblk_set_write_cache(sc, wc);
600 		else
601 			wc = blkcfg->writeback;
602 	} else
603 		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_WCE);
604 
605 	return (wc);
606 }
607 
608 static int
609 vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
610 {
611 	struct vtblk_softc *sc;
612 	int wc, error;
613 
614 	sc = oidp->oid_arg1;
615 	wc = sc->vtblk_write_cache;
616 
617 	error = sysctl_handle_int(oidp, &wc, 0, req);
618 	if (error || req->newptr == NULL)
619 		return (error);
620 	if ((sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) == 0)
621 		return (EPERM);
622 	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
623 		return (EINVAL);
624 
625 	lwkt_serialize_enter(&sc->vtblk_slz);
626 	sc->vtblk_write_cache = wc;
627 	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
628 	lwkt_serialize_exit(&sc->vtblk_slz);
629 
630 	return (0);
631 }
632 
633 static void
634 vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
635 {
636 
637 	struct disk_info info;
638 
639 	/* construct the disk_info */
640 	bzero(&info, sizeof(info));
641 
642 	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_BLK_SIZE))
643 		sc->vtblk_sector_size = blkcfg->blk_size;
644 	else
645 		sc->vtblk_sector_size = DEV_BSIZE;
646 
647 	info.d_media_blksize = sc->vtblk_sector_size;
648 	info.d_media_blocks = blkcfg->capacity;
649 
650 	info.d_ncylinders = blkcfg->geometry.cylinders;
651 	info.d_nheads = blkcfg->geometry.heads;
652 	info.d_secpertrack = blkcfg->geometry.sectors;
653 
654 	info.d_secpercyl = info.d_secpertrack * info.d_nheads;
655 
656 	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
657 		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
658 	else
659 		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;
660 
661 	devstat_add_entry(&sc->stats, "vbd", device_get_unit(sc->vtblk_dev),
662 			  DEV_BSIZE, DEVSTAT_ALL_SUPPORTED,
663 			  DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
664 			  DEVSTAT_PRIORITY_DISK);
665 
666 	/* attach a generic disk device to ourselves */
667 	sc->cdev = disk_create(device_get_unit(sc->vtblk_dev), &sc->vtblk_disk,
668 			       &vbd_disk_ops);
669 
670 	sc->cdev->si_drv1 = sc;
671 	sc->cdev->si_iosize_max = MAXPHYS;
672 	disk_setdiskinfo(&sc->vtblk_disk, &info);
673 }
674 
675 static void
676 vtblk_startio(struct vtblk_softc *sc)
677 {
678 	struct virtqueue *vq;
679 	struct vtblk_request *req;
680 	int enq;
681 
682 	vq = sc->vtblk_vq;
683 	enq = 0;
684 
685 	ASSERT_SERIALIZED(&sc->vtblk_slz);
686 
687 	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
688 		return;
689 
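	/*
	 * Requests that cannot be issued yet (e.g. a barrier waiting for
	 * the virtqueue to drain, or no free descriptors) are parked on
	 * the ready queue and retried the next time this function runs,
	 * typically from vtblk_complete().
	 */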
690 	while (!virtqueue_full(vq)) {
691 		if ((req = vtblk_dequeue_ready(sc)) == NULL)
692 			req = vtblk_bio_request(sc);
693 		if (req == NULL)
694 			break;
695 
696 		if (vtblk_execute_request(sc, req) != 0) {
697 			vtblk_enqueue_ready(sc, req);
698 			break;
699 		}
700 
701 		enq++;
702 	}
703 
704 	if (enq > 0)
705 		virtqueue_notify(vq, &sc->vtblk_slz);
706 }
707 
708 static struct vtblk_request *
709 vtblk_bio_request(struct vtblk_softc *sc)
710 {
711 	struct bio_queue_head *bioq;
712 	struct vtblk_request *req;
713 	struct bio *bio;
714 	struct buf *bp;
715 
716 	bioq = &sc->vtblk_bioq;
717 
718 	if (bioq_first(bioq) == NULL)
719 		return (NULL);
720 
721 	req = vtblk_dequeue_request(sc);
722 	if (req == NULL)
723 		return (NULL);
724 
725 	bio = bioq_takefirst(bioq);
726 	req->vbr_bp = bio;
727 	req->vbr_ack = -1;
728 	req->vbr_barrier = 0;
729 	req->vbr_hdr.ioprio = 1;
730 	bp = bio->bio_buf;
731 
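	/*
	 * The virtio sector number is always expressed in 512-byte units
	 * (DEV_BSIZE), independent of the block size reported by the host.
	 */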
732 	switch (bp->b_cmd) {
733 	case BUF_CMD_FLUSH:
734 		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
735 		break;
736 	case BUF_CMD_READ:
737 		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
738 		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
739 		break;
740 	case BUF_CMD_WRITE:
741 		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
742 		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
743 		break;
744 	default:
745 		KASSERT(0, ("bio with unhandled cmd: %d", bp->b_cmd));
746 		req->vbr_hdr.type = -1;
747 		break;
748 	}
749 
750 #if 0
751 	if (bp->b_flags & B_ORDERED) {
752 		if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0)
753 			req->vbr_barrier = 1;
754 		else
755 			req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;
756 	}
757 #endif
758 
759 	return (req);
760 }
761 
762 static int
763 vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
764 {
765 	struct sglist *sg;
766 	struct bio *bio;
767 	struct buf *bp;
768 	int ordered, writable, error;
769 
770 	sg = sc->vtblk_sglist;
771 	bio = req->vbr_bp;
772 	bp = bio->bio_buf;
773 	ordered = 0;
774 	writable = 0;
775 
776 	if (sc->vtblk_req_ordered != NULL)
777 		return (EBUSY);
778 
779 	if (req->vbr_barrier) {
780 		/*
781 		 * This request will be executed once all
782 		 * the in-flight requests are completed.
783 		 */
784 		if (!virtqueue_empty(sc->vtblk_vq))
785 			return (EBUSY);
786 		ordered = 1;
787 	}
788 
789 	/*
790 	 * sglist is live throughout this subroutine.
791 	 */
792 	sglist_reset(sg);
793 
794 	error = sglist_append(sg, &req->vbr_hdr,
795 			      sizeof(struct virtio_blk_outhdr));
796 	KASSERT(error == 0, ("error adding header to sglist"));
797 	KASSERT(sg->sg_nseg == 1,
798 	    ("header spanned multiple segments: %d", sg->sg_nseg));
799 
800 	if (bp->b_cmd == BUF_CMD_READ || bp->b_cmd == BUF_CMD_WRITE) {
801 		error = sglist_append(sg, bp->b_data, bp->b_bcount);
802 		KASSERT(error == 0, ("error adding buffer to sglist"));
803 
804 		/* BUF_CMD_READ means the host writes into our buffer. */
805 		if (bp->b_cmd == BUF_CMD_READ)
806 			writable += sg->sg_nseg - 1;
807 	}
808 
809 	error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
810 	KASSERT(error == 0, ("error adding ack to sglist"));
811 	writable++;
812 
813 	KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
814 	    ("fewer than min segments: %d", sg->sg_nseg));
815 
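	/*
	 * Device-readable segments (the header and any write payload) must
	 * precede the device-writable ones (read payload and the ack byte),
	 * which is the ordering virtqueue_enqueue() expects below.
	 */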
816 	error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
817 				  sg->sg_nseg - writable, writable);
818 	if (error == 0 && ordered)
819 		sc->vtblk_req_ordered = req;
820 
821 	sglist_reset(sg);
822 
823 	return (error);
824 }
825 
826 static int
827 vtblk_vq_intr(void *xsc)
828 {
829 	vtblk_complete(xsc);
830 
831 	return (1);
832 }
833 
834 static void
835 vtblk_complete(void *arg)
836 {
837 	struct vtblk_softc *sc;
838 	struct vtblk_request *req;
839 	struct virtqueue *vq;
840 	struct bio *bio;
841 	struct buf *bp;
842 
843 	sc = arg;
844 	vq = sc->vtblk_vq;
845 
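	/*
	 * Mask further completions while the used ring is drained; unless
	 * the device is detaching, both are re-enabled at the bottom of
	 * this function once the ring is empty.
	 */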
846 	lwkt_serialize_handler_disable(&sc->vtblk_slz);
847 	virtqueue_disable_intr(sc->vtblk_vq);
848 	ASSERT_SERIALIZED(&sc->vtblk_slz);
849 
850 retry:
851 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
852 		return;
853 
854 	while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
855 		bio = req->vbr_bp;
856 		bp = bio->bio_buf;
857 
858 		if (sc->vtblk_req_ordered != NULL) {
859 			/* This should be the only outstanding request. */
860 			KKASSERT(sc->vtblk_req_ordered == req);
861 			sc->vtblk_req_ordered = NULL;
862 		}
863 
864 		if (req->vbr_ack == VIRTIO_BLK_S_OK)
865 			bp->b_resid = 0;
866 		else {
867 			bp->b_flags |= B_ERROR;
868 			if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP) {
869 				bp->b_error = ENOTSUP;
870 			} else {
871 				bp->b_error = EIO;
872 			}
873 		}
874 
875 		devstat_end_transaction_buf(&sc->stats, bio->bio_buf);
876 
877 		lwkt_serialize_exit(&sc->vtblk_slz);
		/*
		 * Dropping the serializer around biodone() cannot allow
		 * further device interrupts to be processed: the virtqueue
		 * interrupt and the serialized handler were disabled at the
		 * top of this function. It does, however, allow concurrent
		 * vtblk_strategy()/vtblk_startio() dispatches to proceed.
		 */
884 		biodone(bio);
885 		lwkt_serialize_enter(&sc->vtblk_slz);
886 
887 		vtblk_enqueue_request(sc, req);
888 	}
889 
890 	vtblk_startio(sc);
891 
892 	if (virtqueue_enable_intr(vq) != 0) {
893 		/*
894 		 * If new virtqueue entries appeared immediately after
		 * enabling interrupts, process them now. Release and
		 * retake the softc serializer to try to avoid blocking
		 * I/O dispatch for too long.
898 		 */
899 		virtqueue_disable_intr(vq);
900 		goto retry;
901 	}
902 	lwkt_serialize_handler_enable(&sc->vtblk_slz);
903 }
904 
905 static void
906 vtblk_stop(struct vtblk_softc *sc)
907 {
908 
909 	virtqueue_disable_intr(sc->vtblk_vq);
910 	virtio_stop(sc->vtblk_dev);
911 }
912 
913 static void
914 vtblk_prepare_dump(struct vtblk_softc *sc)
915 {
916 	device_t dev;
917 	struct virtqueue *vq;
918 
919 	dev = sc->vtblk_dev;
920 	vq = sc->vtblk_vq;
921 
922 	vtblk_stop(sc);
923 
924 	/*
925 	 * Drain all requests caught in-flight in the virtqueue,
926 	 * skipping biodone(). When dumping, only one request is
927 	 * outstanding at a time, and we just poll the virtqueue
928 	 * for the response.
929 	 */
930 	vtblk_drain_vq(sc, 1);
931 
932 	if (virtio_reinit(dev, sc->vtblk_features) != 0) {
933 		panic("%s: cannot reinit VirtIO block device during dump",
934 		    device_get_nameunit(dev));
935 	}
936 
937 	virtqueue_disable_intr(vq);
938 	virtio_reinit_complete(dev);
939 }
940 
941 static int
942 vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
943     size_t length)
944 {
945 	struct bio bio;
946 	struct buf bp;
947 	struct vtblk_request *req;
948 
949 	req = &sc->vtblk_dump_request;
950 	req->vbr_ack = -1;
951 	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
952 	req->vbr_hdr.ioprio = 1;
953 	req->vbr_hdr.sector = offset / 512;
954 
955 	req->vbr_bp = &bio;
956 	bzero(&bio, sizeof(struct bio));
	bzero(&bp, sizeof(struct buf));
958 
959 	bio.bio_buf = &bp;
960 	bp.b_cmd = BUF_CMD_WRITE;
961 	bp.b_data = virtual;
962 	bp.b_bcount = length;
963 
964 	return (vtblk_poll_request(sc, req));
965 }
966 
967 static int
968 vtblk_flush_dump(struct vtblk_softc *sc)
969 {
970 	struct bio bio;
971 	struct buf bp;
972 	struct vtblk_request *req;
973 
974 	req = &sc->vtblk_dump_request;
975 	req->vbr_ack = -1;
976 	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
977 	req->vbr_hdr.ioprio = 1;
978 	req->vbr_hdr.sector = 0;
979 
980 	req->vbr_bp = &bio;
981 	bzero(&bio, sizeof(struct bio));
982 	bzero(&bp, sizeof(struct buf));
983 
984 	bio.bio_buf = &bp;
985 	bp.b_cmd = BUF_CMD_FLUSH;
986 
987 	return (vtblk_poll_request(sc, req));
988 }
989 
990 static int
991 vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
992 {
993 	struct virtqueue *vq;
994 	int error;
995 
996 	vq = sc->vtblk_vq;
997 
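	/*
	 * Polling requires exclusive use of the virtqueue; this path is
	 * only used while dumping, after vtblk_prepare_dump() has drained
	 * any in-flight requests.
	 */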
998 	if (!virtqueue_empty(vq))
999 		return (EBUSY);
1000 
1001 	error = vtblk_execute_request(sc, req);
1002 	if (error)
1003 		return (error);
1004 
1005 	virtqueue_notify(vq, NULL);
1006 	virtqueue_poll(vq, NULL);
1007 
1008 	error = vtblk_request_error(req);
1009 	if (error && bootverbose) {
1010 		device_printf(sc->vtblk_dev,
1011 		    "%s: IO error: %d\n", __func__, error);
1012 	}
1013 
1014 	return (error);
1015 }
1016 
1017 static void
1018 vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
1019 {
1020 	struct virtqueue *vq;
1021 	struct vtblk_request *req;
1022 	int last;
1023 
1024 	vq = sc->vtblk_vq;
1025 	last = 0;
1026 
1027 	while ((req = virtqueue_drain(vq, &last)) != NULL) {
1028 		if (!skip_done)
1029 			vtblk_finish_bio(req->vbr_bp, ENXIO);
1030 
1031 		vtblk_enqueue_request(sc, req);
1032 	}
1033 
1034 	sc->vtblk_req_ordered = NULL;
1035 	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
1036 }
1037 
1038 static void
1039 vtblk_drain(struct vtblk_softc *sc)
1040 {
1041 	struct bio_queue_head *bioq;
1042 	struct vtblk_request *req;
1043 	struct bio *bp;
1044 
1045 	bioq = &sc->vtblk_bioq;
1046 
1047 	if (sc->vtblk_vq != NULL)
1048 		vtblk_drain_vq(sc, 0);
1049 
1050 	while ((req = vtblk_dequeue_ready(sc)) != NULL) {
1051 		vtblk_finish_bio(req->vbr_bp, ENXIO);
1052 		vtblk_enqueue_request(sc, req);
1053 	}
1054 
1055 	while (bioq_first(bioq) != NULL) {
1056 		bp = bioq_takefirst(bioq);
1057 		vtblk_finish_bio(bp, ENXIO);
1058 	}
1059 
1060 	vtblk_free_requests(sc);
1061 }
1062 
1063 static int
1064 vtblk_alloc_requests(struct vtblk_softc *sc)
1065 {
1066 	struct vtblk_request *req;
1067 	int i, nreqs;
1068 
1069 	nreqs = virtqueue_size(sc->vtblk_vq);
1070 
1071 	/*
1072 	 * Preallocate sufficient requests to keep the virtqueue full. Each
1073 	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
1074 	 * the number allocated when indirect descriptors are not available.
1075 	 */
1076 	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
1077 		nreqs /= VTBLK_MIN_SEGMENTS;
1078 
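	/*
	 * Requests are allocated physically contiguous and 16-byte aligned
	 * so the header and ack of each request map to exactly one sglist
	 * segment, which the assertions below verify.
	 */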
1079 	for (i = 0; i < nreqs; i++) {
1080 		req = contigmalloc(sizeof(struct vtblk_request), M_DEVBUF,
1081 		    M_WAITOK, 0, BUS_SPACE_MAXADDR, 16, 0);
1082 		if (req == NULL)
1083 			return (ENOMEM);
1084 
1085 		KKASSERT(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr))
1086 		    == 1);
1087 		KKASSERT(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack))
1088 		    == 1);
1089 
1090 		sc->vtblk_request_count++;
1091 		vtblk_enqueue_request(sc, req);
1092 	}
1093 
1094 	return (0);
1095 }
1096 
1097 static void
1098 vtblk_free_requests(struct vtblk_softc *sc)
1099 {
1100 	struct vtblk_request *req;
1101 
1102 	while ((req = vtblk_dequeue_request(sc)) != NULL) {
1103 		sc->vtblk_request_count--;
1104 		contigfree(req, sizeof(struct vtblk_request), M_DEVBUF);
1105 	}
1106 
1107 	KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
1108 }
1109 
1110 static struct vtblk_request *
1111 vtblk_dequeue_request(struct vtblk_softc *sc)
1112 {
1113 	struct vtblk_request *req;
1114 
1115 	req = TAILQ_FIRST(&sc->vtblk_req_free);
1116 	if (req != NULL)
1117 		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
1118 
1119 	return (req);
1120 }
1121 
1122 static void
1123 vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
1124 {
1125 
1126 	bzero(req, sizeof(struct vtblk_request));
1127 	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
1128 }
1129 
1130 static struct vtblk_request *
1131 vtblk_dequeue_ready(struct vtblk_softc *sc)
1132 {
1133 	struct vtblk_request *req;
1134 
1135 	req = TAILQ_FIRST(&sc->vtblk_req_ready);
1136 	if (req != NULL)
1137 		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
1138 
1139 	return (req);
1140 }
1141 
1142 static void
1143 vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
1144 {
1145 
1146 	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
1147 }
1148 
1149 static int
1150 vtblk_request_error(struct vtblk_request *req)
1151 {
1152 	int error;
1153 
1154 	switch (req->vbr_ack) {
1155 	case VIRTIO_BLK_S_OK:
1156 		error = 0;
1157 		break;
1158 	case VIRTIO_BLK_S_UNSUPP:
1159 		error = ENOTSUP;
1160 		break;
1161 	default:
1162 		error = EIO;
1163 		break;
1164 	}
1165 
1166 	return (error);
1167 }
1168 
static void
vtblk_finish_bio(struct bio *bp, int error)
{
	struct buf *bbp = bp->bio_buf;

	/* Propagate the error; otherwise a failed bio completes as success. */
	if (error != 0) {
		bbp->b_error = error;
		bbp->b_flags |= B_ERROR;
	}
	biodone(bp);
}
1175 
1176 static void
1177 vtblk_setup_sysctl(struct vtblk_softc *sc)
1178 {
1179 	device_t dev;
1180 	struct sysctl_ctx_list *ctx;
1181 	struct sysctl_oid *tree;
1182 	struct sysctl_oid_list *child;
1183 
1184 	dev = sc->vtblk_dev;
1185 	ctx = device_get_sysctl_ctx(dev);
1186 	tree = device_get_sysctl_tree(dev);
1187 	child = SYSCTL_CHILDREN(tree);
1188 
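	/*
	 * Exposes dev.vtblk.<unit>.writecache_mode; the per-unit
	 * hw.vtblk.<unit>.writecache_mode tunable (see vtblk_tunable_int())
	 * or the global hw.vtblk.writecache_mode can preset it at boot.
	 */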
1189 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
1190 	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, vtblk_write_cache_sysctl,
1191 	    "I", "Write cache mode (writethrough (0) or writeback (1))");
1192 }
1193 
1194 static int
1195 vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
1196 {
1197 	char path[64];
1198 
1199 	ksnprintf(path, sizeof(path),
1200 	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
1201 	TUNABLE_INT_FETCH(path, &def);
1202 
1203 	return (def);
1204 }
1205