/*-
 * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/dev/virtio/block/virtio_blk.c 252707 2013-07-04 17:57:26Z bryanv $
 */

/* Driver for VirtIO block devices. */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/sysctl.h>
#include <sys/queue.h>
#include <sys/serialize.h>
#include <sys/buf2.h>
#include <sys/rman.h>
#include <sys/disk.h>
#include <sys/devicestat.h>

#include <dev/virtual/virtio/virtio/virtio.h>
#include <dev/virtual/virtio/virtio/virtqueue.h>
#include "virtio_blk.h"
#include "virtio_if.h"

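/*
 * A block request as it travels through the virtqueue: a device-readable
 * header (struct virtio_blk_outhdr), followed by the data buffers for the
 * transfer, and finally a single device-writable status byte (vbr_ack)
 * that the host fills in on completion.  The header is aligned so that it
 * never spans more than one scatter/gather segment.
 */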
struct vtblk_request {
	struct virtio_blk_outhdr	 vbr_hdr __aligned(16);
	struct bio			*vbr_bio;
	uint8_t				 vbr_ack;

	SLIST_ENTRY(vtblk_request)	 vbr_link;
};

enum vtblk_cache_mode {
	VTBLK_CACHE_WRITETHROUGH,
	VTBLK_CACHE_WRITEBACK,
	VTBLK_CACHE_MAX
};

struct vtblk_softc {
	device_t		 vtblk_dev;
	struct lwkt_serialize	 vtblk_slz;
	uint64_t		 vtblk_features;
	uint32_t		 vtblk_flags;
#define VTBLK_FLAG_INDIRECT	0x0001
#define VTBLK_FLAG_READONLY	0x0002
#define VTBLK_FLAG_DETACH	0x0004
#define VTBLK_FLAG_SUSPEND	0x0008
#define VTBLK_FLAG_DUMPING	0x0010
#define VTBLK_FLAG_WC_CONFIG	0x0020

	struct virtqueue	*vtblk_vq;
	struct sglist		*vtblk_sglist;
	struct disk		 vtblk_disk;
	cdev_t			 cdev;
	struct devstat		 stats;

	struct bio_queue_head	 vtblk_bioq;
	SLIST_HEAD(, vtblk_request)
				 vtblk_req_free;

	int			 vtblk_sector_size;
	int			 vtblk_max_nsegs;
	int			 vtblk_request_count;
	enum vtblk_cache_mode	 vtblk_write_cache;

	struct vtblk_request	 vtblk_dump_request;
};

static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_WCE,		"WriteCache"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},

	{ 0, NULL }
};

static int	vtblk_probe(device_t);
static int	vtblk_attach(device_t);
static int	vtblk_detach(device_t);
static int	vtblk_suspend(device_t);
static int	vtblk_resume(device_t);
static int	vtblk_shutdown(device_t);

static void	vtblk_negotiate_features(struct vtblk_softc *);
static int	vtblk_maximum_segments(struct vtblk_softc *,
		    struct virtio_blk_config *);
static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
static void	vtblk_set_write_cache(struct vtblk_softc *, int);
static int	vtblk_write_cache_enabled(struct vtblk_softc *sc,
		    struct virtio_blk_config *);
static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
static void	vtblk_alloc_disk(struct vtblk_softc *,
		    struct virtio_blk_config *);
/*
 * Interface to the device switch.
 */
static d_open_t		vtblk_open;
static d_strategy_t	vtblk_strategy;
static d_dump_t		vtblk_dump;

static struct dev_ops vbd_disk_ops = {
	{ "vbd", 200, D_DISK | D_MPSAFE },
	.d_open		= vtblk_open,
	.d_close	= nullclose,
	.d_read		= physread,
	.d_write	= physwrite,
	.d_strategy	= vtblk_strategy,
	.d_dump		= vtblk_dump,
};

static void	vtblk_startio(struct vtblk_softc *);
static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
static int	vtblk_execute_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_vq_intr(void *);
static void	vtblk_complete(void *);

static void	vtblk_stop(struct vtblk_softc *);

static void	vtblk_prepare_dump(struct vtblk_softc *);
static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
static int	vtblk_flush_dump(struct vtblk_softc *);
static int	vtblk_poll_request(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_drain_vq(struct vtblk_softc *, int);
static void	vtblk_drain(struct vtblk_softc *);

static int	vtblk_alloc_requests(struct vtblk_softc *);
static void	vtblk_free_requests(struct vtblk_softc *);
static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
static void	vtblk_enqueue_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_request_error(struct vtblk_request *);
static void	vtblk_finish_bio(struct bio *, int);

static void	vtblk_setup_sysctl(struct vtblk_softc *);
static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);

/* Tunables. */
static int vtblk_writecache_mode = -1;
TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);

/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_WCE			| \
     VIRTIO_BLK_F_CONFIG_WCE		| \
     VIRTIO_RING_F_INDIRECT_DESC)

/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.
 */
#define VTBLK_MIN_SEGMENTS	2

static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	DEVMETHOD_END
};

static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass, NULL, NULL);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);

static int
vtblk_probe(device_t dev)
{

	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
		return (ENXIO);

	device_set_desc(dev, "VirtIO Block Adapter");

	return (BUS_PROBE_DEFAULT);
}

static int
vtblk_attach(device_t dev)
{
	struct vtblk_softc *sc;
	struct virtio_blk_config blkcfg;
	int error;

	sc = device_get_softc(dev);
	sc->vtblk_dev = dev;

	lwkt_serialize_init(&sc->vtblk_slz);

	bioq_init(&sc->vtblk_bioq);
	SLIST_INIT(&sc->vtblk_req_free);

	virtio_set_feature_desc(dev, vtblk_feature_desc);
	vtblk_negotiate_features(sc);

	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
		sc->vtblk_flags |= VTBLK_FLAG_READONLY;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
		sc->vtblk_flags |= VTBLK_FLAG_WC_CONFIG;

	vtblk_setup_sysctl(sc);

	/* Get local copy of config. */
	virtio_read_device_config(dev, 0, &blkcfg,
				  sizeof(struct virtio_blk_config));

	/*
	 * With the current sglist(9) implementation, it is not easy
	 * for us to support a maximum segment size as adjacent
	 * segments are coalesced. For now, just make sure it's larger
	 * than the maximum supported transfer size.
	 */
	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
		if (blkcfg.size_max < MAXPHYS) {
			error = ENOTSUP;
			device_printf(dev, "host requires unsupported "
			    "maximum segment size feature\n");
			goto fail;
		}
	}

	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
		error = EINVAL;
		device_printf(dev, "fewer than minimum number of segments "
		    "allowed: %d\n", sc->vtblk_max_nsegs);
		goto fail;
	}

	/*
	 * Allocate working sglist. The number of segments may be too
	 * large to safely store on the stack.
	 */
	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_INTWAIT);
	if (sc->vtblk_sglist == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate sglist\n");
		goto fail;
	}

	error = vtblk_alloc_virtqueue(sc);
	if (error) {
		device_printf(dev, "cannot allocate virtqueue\n");
		goto fail;
	}

	error = vtblk_alloc_requests(sc);
	if (error) {
		device_printf(dev, "cannot preallocate requests\n");
		goto fail;
	}

	error = virtio_setup_intr(dev, &sc->vtblk_slz);
	if (error) {
		device_printf(dev, "cannot setup virtqueue interrupt\n");
		goto fail;
	}

	virtqueue_enable_intr(sc->vtblk_vq);

	vtblk_alloc_disk(sc, &blkcfg);

fail:
	if (error)
		vtblk_detach(dev);

	return (error);
}

static int
vtblk_detach(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
	if (device_is_attached(dev))
		vtblk_stop(sc);
	lwkt_serialize_exit(&sc->vtblk_slz);

	vtblk_drain(sc);

	if (sc->cdev != NULL) {
		disk_destroy(&sc->vtblk_disk);
		sc->cdev = NULL;
	}

	if (sc->vtblk_sglist != NULL) {
		sglist_free(sc->vtblk_sglist);
		sc->vtblk_sglist = NULL;
	}

	return (0);
}

static int
vtblk_suspend(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
	/* XXX BMV: virtio_stop(), etc needed here? */
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static int
vtblk_resume(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	lwkt_serialize_enter(&sc->vtblk_slz);
	/* XXX BMV: virtio_reinit(), etc needed here? */
	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
#if 0 /* XXX Resume IO? */
	vtblk_startio(sc);
#endif
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static int
vtblk_shutdown(device_t dev)
{

	return (0);
}

static int
vtblk_open(struct dev_open_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;

	sc = dev->si_drv1;
	if (sc == NULL)
		return (ENXIO);

	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
}

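/*
 * Kernel dump entry point.  On the first call the device is reset and
 * reinitialized into a polled mode of operation; a zero-length request
 * is used to flush the write cache once the dump is complete.
 */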
static int
vtblk_dump(struct dev_dump_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;
	uint64_t buf_start, buf_len;
	int error;

	sc = dev->si_drv1;
	if (sc == NULL)
		return (ENXIO);

	buf_start = ap->a_offset;
	buf_len = ap->a_length;

//	lwkt_serialize_enter(&sc->vtblk_slz);

	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
		vtblk_prepare_dump(sc);
		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
	}

	if (buf_len > 0)
		error = vtblk_write_dump(sc, ap->a_virtual, buf_start,
		    buf_len);
	else if (buf_len == 0)
		error = vtblk_flush_dump(sc);
	else {
		error = EINVAL;
		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
	}

//	lwkt_serialize_exit(&sc->vtblk_slz);

	return (error);
}

static int
vtblk_strategy(struct dev_strategy_args *ap)
{
	struct vtblk_softc *sc;
	cdev_t dev = ap->a_head.a_dev;
	struct bio *bio = ap->a_bio;
	struct buf *bp = bio->bio_buf;

	sc = dev->si_drv1;
	if (sc == NULL) {
		vtblk_finish_bio(bio, EINVAL);
		return (EINVAL);
	}

	/*
	 * Fail any write or flush if the device is read-only.  There
	 * does not seem to be a better way to report our read-only
	 * status to the disk layer above.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) &&
	    (bp->b_cmd == BUF_CMD_WRITE || bp->b_cmd == BUF_CMD_FLUSH)) {
		vtblk_finish_bio(bio, EROFS);
		return (EINVAL);
	}

	lwkt_serialize_enter(&sc->vtblk_slz);
	if ((sc->vtblk_flags & VTBLK_FLAG_DETACH) == 0) {
		bioqdisksort(&sc->vtblk_bioq, bio);
		vtblk_startio(sc);
		lwkt_serialize_exit(&sc->vtblk_slz);
	} else {
		lwkt_serialize_exit(&sc->vtblk_slz);
		vtblk_finish_bio(bio, ENXIO);
	}
	return (0);
}

static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
	device_t dev;
	uint64_t features;

	dev = sc->vtblk_dev;
	features = VTBLK_FEATURES;

	sc->vtblk_features = virtio_negotiate_features(dev, features);
}

/*
 * Calculate the maximum number of DMA segments supported.  Note
 * that the in/out header is encoded in the segment list.  We
 * assume that VTBLK_MIN_SEGMENTS covers that part of it so
 * we add it into the desired total.  If the SEG_MAX feature
 * is not negotiated, we have to just assume that the host can
 * handle the maximum number of segments required for a MAXPHYS
 * sized request.
 *
 * The additional + 1 is in case a MAXPHYS-sized buffer crosses
 * a page boundary.
 */
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	device_t dev;
	int nsegs;

	dev = sc->vtblk_dev;
	nsegs = VTBLK_MIN_SEGMENTS;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
		nsegs = MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1 + nsegs);
	} else {
		nsegs = MAXPHYS / PAGE_SIZE + 1 + nsegs;
	}
	if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
		nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);

	return (nsegs);
}

static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
	device_t dev;
	struct vq_alloc_info vq_info;

	dev = sc->vtblk_dev;

	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
	    vtblk_vq_intr, sc, &sc->vtblk_vq,
	    "%s request", device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

static void
vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
{

	/* Set either writeback (1) or writethrough (0) mode. */
	virtio_write_dev_config_1(sc->vtblk_dev,
	    offsetof(struct virtio_blk_config, writeback), wc);
}

static int
vtblk_write_cache_enabled(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	int wc;

	if (sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) {
		wc = vtblk_tunable_int(sc, "writecache_mode",
		    vtblk_writecache_mode);
		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
			vtblk_set_write_cache(sc, wc);
		else
			wc = blkcfg->writeback;
	} else
		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_WCE);

	return (wc);
}

static int
vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct vtblk_softc *sc;
	int wc, error;

	sc = oidp->oid_arg1;
	wc = sc->vtblk_write_cache;

	error = sysctl_handle_int(oidp, &wc, 0, req);
	if (error || req->newptr == NULL)
		return (error);
	if ((sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) == 0)
		return (EPERM);
	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
		return (EINVAL);

	lwkt_serialize_enter(&sc->vtblk_slz);
	sc->vtblk_write_cache = wc;
	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
	lwkt_serialize_exit(&sc->vtblk_slz);

	return (0);
}

static void
vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
	struct disk_info info;

	/* construct the disk_info */
	bzero(&info, sizeof(info));

	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_BLK_SIZE))
		sc->vtblk_sector_size = blkcfg->blk_size;
	else
		sc->vtblk_sector_size = 512;

	/* blkcfg->capacity is always expressed in 512 byte sectors. */
	info.d_media_blksize = 512;
	info.d_media_blocks = blkcfg->capacity;

	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_GEOMETRY)) {
		info.d_ncylinders = blkcfg->geometry.cylinders;
		info.d_nheads = blkcfg->geometry.heads;
		info.d_secpertrack = blkcfg->geometry.sectors;
		info.d_secpercyl = info.d_secpertrack * info.d_nheads;
	} else {
		/* Fabricate a geometry */
		info.d_secpertrack = 1024;
		info.d_nheads = 1;
		info.d_secpercyl = info.d_secpertrack * info.d_nheads;
		info.d_ncylinders =
		    (u_int)(info.d_media_blocks / info.d_secpercyl);
	}

	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
	else
		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;

	devstat_add_entry(&sc->stats, "vbd", device_get_unit(sc->vtblk_dev),
			  DEV_BSIZE, DEVSTAT_ALL_SUPPORTED,
			  DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
			  DEVSTAT_PRIORITY_DISK);

	/* attach a generic disk device to ourselves */
	sc->cdev = disk_create(device_get_unit(sc->vtblk_dev), &sc->vtblk_disk,
			       &vbd_disk_ops);

	sc->cdev->si_drv1 = sc;
	sc->cdev->si_iosize_max = MAXPHYS;
	disk_setdiskinfo(&sc->vtblk_disk, &info);
	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_BLK_SIZE)) {
		device_printf(sc->vtblk_dev, "Block size: %u\n",
		    sc->vtblk_sector_size);
	}
	device_printf(sc->vtblk_dev,
	    "%juMB (%ju 512 byte sectors: %dH %dS/T %dC)\n",
	    ((uintmax_t)blkcfg->capacity * 512) / (1024*1024),
	    (uintmax_t)blkcfg->capacity, blkcfg->geometry.heads,
	    blkcfg->geometry.sectors, blkcfg->geometry.cylinders);
}

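/*
 * Pull bios off the queue and push them into the virtqueue until the
 * virtqueue fills up or the bio queue is emptied.  Called with the
 * serializer held; the host is notified only once, after the whole
 * batch has been enqueued.
 */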
static void
vtblk_startio(struct vtblk_softc *sc)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int enq;

	vq = sc->vtblk_vq;
	enq = 0;

	ASSERT_SERIALIZED(&sc->vtblk_slz);

	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
		return;

	while (!virtqueue_full(vq)) {
		req = vtblk_bio_request(sc);
		if (req == NULL)
			break;

		if (vtblk_execute_request(sc, req) != 0) {
			bioqdisksort(&sc->vtblk_bioq, req->vbr_bio);
			vtblk_enqueue_request(sc, req);
			break;
		}
		devstat_start_transaction(&sc->stats);

		enq++;
	}

	if (enq > 0)
		virtqueue_notify(vq, &sc->vtblk_slz);
}

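/*
 * Pair the next queued bio with a preallocated request, translating
 * the buf command into the matching VIRTIO_BLK_T_* request type.
 * Sector numbers are always expressed in 512-byte units, independent
 * of the device's reported block size.
 */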
static struct vtblk_request *
vtblk_bio_request(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bio;
	struct buf *bp;

	bioq = &sc->vtblk_bioq;

	if (bioq_first(bioq) == NULL)
		return (NULL);

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return (NULL);

	bio = bioq_takefirst(bioq);
	req->vbr_bio = bio;
	req->vbr_ack = -1;
	req->vbr_hdr.ioprio = 1;
	bp = bio->bio_buf;

	switch (bp->b_cmd) {
	case BUF_CMD_FLUSH:
		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
		break;
	case BUF_CMD_READ:
		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
		break;
	case BUF_CMD_WRITE:
		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
		break;
	default:
		KASSERT(0, ("bio with unhandled cmd: %d", bp->b_cmd));
		req->vbr_hdr.type = -1;
		break;
	}

	return (req);
}

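/*
 * Build the scatter/gather list for a request and enqueue it on the
 * virtqueue.  The descriptor chain is ordered as the device expects:
 * the read-only header first, then any data segments (device-writable
 * for a read), and the device-writable status byte last.
 */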
static int
vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct sglist *sg;
	struct bio *bio;
	struct buf *bp;
	int writable, error;

	sg = sc->vtblk_sglist;
	bio = req->vbr_bio;
	bp = bio->bio_buf;
	writable = 0;

	/*
	 * sglist is live throughout this subroutine.
	 */
	sglist_reset(sg);

	error = sglist_append(sg, &req->vbr_hdr,
			      sizeof(struct virtio_blk_outhdr));
	KASSERT(error == 0, ("error adding header to sglist"));
	KASSERT(sg->sg_nseg == 1,
	    ("header spanned multiple segments: %d", sg->sg_nseg));

	if (bp->b_cmd == BUF_CMD_READ || bp->b_cmd == BUF_CMD_WRITE) {
		error = sglist_append(sg, bp->b_data, bp->b_bcount);
		KASSERT(error == 0, ("error adding buffer to sglist"));

		/* BUF_CMD_READ means the host writes into our buffer. */
		if (bp->b_cmd == BUF_CMD_READ)
			writable += sg->sg_nseg - 1;
	}

	error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
	KASSERT(error == 0, ("error adding ack to sglist"));
	writable++;

	KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
	    ("fewer than min segments: %d", sg->sg_nseg));

	error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
				  sg->sg_nseg - writable, writable);

	sglist_reset(sg);

	return (error);
}

static int
vtblk_vq_intr(void *xsc)
{
	vtblk_complete(xsc);

	return (1);
}

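/*
 * Completion processing.  Runs with the serializer held and virtqueue
 * interrupts disabled; interrupts are re-enabled only once the ring has
 * been drained, retrying if more completions slipped in just before
 * they were enabled.
 */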
static void
vtblk_complete(void *arg)
{
	struct vtblk_softc *sc;
	struct vtblk_request *req;
	struct virtqueue *vq;
	struct bio *bio;
	struct buf *bp;

	sc = arg;
	vq = sc->vtblk_vq;

	lwkt_serialize_handler_disable(&sc->vtblk_slz);
	virtqueue_disable_intr(sc->vtblk_vq);
	ASSERT_SERIALIZED(&sc->vtblk_slz);

retry:
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
		return;

	while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
		bio = req->vbr_bio;
		bp = bio->bio_buf;

		if (req->vbr_ack == VIRTIO_BLK_S_OK)
			bp->b_resid = 0;
		else {
			bp->b_flags |= B_ERROR;
			if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP) {
				bp->b_error = ENOTSUP;
			} else {
				bp->b_error = EIO;
			}
		}

		devstat_end_transaction_buf(&sc->stats, bio->bio_buf);

		lwkt_serialize_exit(&sc->vtblk_slz);
		/*
		 * Unlocking the controller around biodone() does not
		 * allow further device interrupts to be processed,
		 * since they were disabled when vtblk_complete was
		 * queued.  It does, however, allow concurrent
		 * vtblk_strategy/vtblk_startio command dispatches.
		 */
		biodone(bio);
		lwkt_serialize_enter(&sc->vtblk_slz);

		vtblk_enqueue_request(sc, req);
	}

	vtblk_startio(sc);

	if (virtqueue_enable_intr(vq) != 0) {
		/*
		 * If new virtqueue entries appeared immediately after
		 * re-enabling interrupts, process them now.  The
		 * completion loop drops and retakes the softc
		 * serializer around biodone() to try to avoid blocking
		 * I/O dispatch for too long.
		 */
		virtqueue_disable_intr(vq);
		goto retry;
	}
	lwkt_serialize_handler_enable(&sc->vtblk_slz);
}

static void
vtblk_stop(struct vtblk_softc *sc)
{

	virtqueue_disable_intr(sc->vtblk_vq);
	virtio_stop(sc->vtblk_dev);
}

static void
vtblk_prepare_dump(struct vtblk_softc *sc)
{
	device_t dev;
	struct virtqueue *vq;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	vtblk_stop(sc);

	/*
	 * Drain all requests caught in-flight in the virtqueue,
	 * skipping biodone(). When dumping, only one request is
	 * outstanding at a time, and we just poll the virtqueue
	 * for the response.
	 */
	vtblk_drain_vq(sc, 1);

	if (virtio_reinit(dev, sc->vtblk_features) != 0) {
		panic("%s: cannot reinit VirtIO block device during dump",
		    device_get_nameunit(dev));
	}

	virtqueue_disable_intr(vq);
	virtio_reinit_complete(dev);
}

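/*
 * Write a buffer to the device during a kernel dump.  A bio and buf
 * are faked up on the stack since nothing may be allocated in the dump
 * path; the preallocated dump request is submitted and polled to
 * completion.
 */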
static int
vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
    size_t length)
{
	struct bio bio;
	struct buf bp;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = offset / 512;

	req->vbr_bio = &bio;
	bzero(&bio, sizeof(struct bio));
	bzero(&bp, sizeof(struct buf));

	bio.bio_buf = &bp;
	bp.b_cmd = BUF_CMD_WRITE;
	bp.b_data = virtual;
	bp.b_bcount = length;

	return (vtblk_poll_request(sc, req));
}

static int
vtblk_flush_dump(struct vtblk_softc *sc)
{
	struct bio bio;
	struct buf bp;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bio = &bio;
	bzero(&bio, sizeof(struct bio));
	bzero(&bp, sizeof(struct buf));

	bio.bio_buf = &bp;
	bp.b_cmd = BUF_CMD_FLUSH;

	return (vtblk_poll_request(sc, req));
}

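/*
 * Synchronously execute a single request, spinning on the virtqueue
 * until the host posts the completion.  Only safe when no other
 * requests are in flight, as in the dump path.
 */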
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct virtqueue *vq;
	int error;

	vq = sc->vtblk_vq;

	if (!virtqueue_empty(vq))
		return (EBUSY);

	error = vtblk_execute_request(sc, req);
	if (error)
		return (error);

	virtqueue_notify(vq, NULL);
	virtqueue_poll(vq, NULL);

	error = vtblk_request_error(req);
	if (error && bootverbose) {
		device_printf(sc->vtblk_dev,
		    "%s: IO error: %d\n", __func__, error);
	}

	return (error);
}

static void
vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int last;

	vq = sc->vtblk_vq;
	last = 0;

	while ((req = virtqueue_drain(vq, &last)) != NULL) {
		if (!skip_done)
			vtblk_finish_bio(req->vbr_bio, ENXIO);

		vtblk_enqueue_request(sc, req);
	}

	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}

static void
vtblk_drain(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct bio *bio;

	bioq = &sc->vtblk_bioq;

	if (sc->vtblk_vq != NULL)
		vtblk_drain_vq(sc, 0);

	while (bioq_first(bioq) != NULL) {
		bio = bioq_takefirst(bioq);
		vtblk_finish_bio(bio, ENXIO);
	}

	vtblk_free_requests(sc);
}

static int
vtblk_alloc_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	int i, nreqs;

	nreqs = virtqueue_size(sc->vtblk_vq);

	/*
	 * Preallocate sufficient requests to keep the virtqueue full. Each
	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
	 * the number allocated when indirect descriptors are not available.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
		nreqs /= VTBLK_MIN_SEGMENTS;

	for (i = 0; i < nreqs; i++) {
		req = contigmalloc(sizeof(struct vtblk_request), M_DEVBUF,
		    M_WAITOK, 0, BUS_SPACE_MAXADDR, 16, 0);
		if (req == NULL)
			return (ENOMEM);

		KKASSERT(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr))
		    == 1);
		KKASSERT(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack))
		    == 1);

		sc->vtblk_request_count++;
		vtblk_enqueue_request(sc, req);
	}

	return (0);
}

static void
vtblk_free_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	while ((req = vtblk_dequeue_request(sc)) != NULL) {
		sc->vtblk_request_count--;
		contigfree(req, sizeof(struct vtblk_request), M_DEVBUF);
	}

	KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
}

static struct vtblk_request *
vtblk_dequeue_request(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = SLIST_FIRST(&sc->vtblk_req_free);
	if (req != NULL)
		SLIST_REMOVE_HEAD(&sc->vtblk_req_free, vbr_link);

	return (req);
}

static void
vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
{

	bzero(req, sizeof(struct vtblk_request));
	SLIST_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}

static int
vtblk_request_error(struct vtblk_request *req)
{
	int error;

	switch (req->vbr_ack) {
	case VIRTIO_BLK_S_OK:
		error = 0;
		break;
	case VIRTIO_BLK_S_UNSUPP:
		error = ENOTSUP;
		break;
	default:
		error = EIO;
		break;
	}

	return (error);
}

static void
vtblk_finish_bio(struct bio *bio, int error)
{
	struct buf *bp = bio->bio_buf;

	bp->b_error = error;
	bp->b_flags |= B_ERROR;

	biodone(bio);
}

static void
vtblk_setup_sysctl(struct vtblk_softc *sc)
{
	device_t dev;
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid *tree;
	struct sysctl_oid_list *child;

	dev = sc->vtblk_dev;
	ctx = device_get_sysctl_ctx(dev);
	tree = device_get_sysctl_tree(dev);
	child = SYSCTL_CHILDREN(tree);

	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, vtblk_write_cache_sysctl,
	    "I", "Write cache mode (writethrough (0) or writeback (1))");
}

static int
vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
{
	char path[64];

	ksnprintf(path, sizeof(path),
	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
	TUNABLE_INT_FETCH(path, &def);

	return (def);
}