/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "debug.h"
#include "mevent.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t            be_tid;
	off_t		     be_block;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	int			bc_work_count;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_paused_cond;
	pthread_cond_t		bc_work_done_cond;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

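/*
 * Enqueue a request.  Each element records the offset just past the end
 * of its I/O range (OFF_MAX for a flush); if a pending or in-flight
 * request ends exactly where this one begins, the new request is marked
 * BST_BLOCK and is not dispatched until that predecessor completes.
 * Returns 1 if the request was queued runnable (BST_PEND) and a worker
 * thread should be signalled.
 */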
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

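/*
 * Find the first runnable (BST_PEND) request on the pending queue, mark
 * it BST_BUSY for the calling worker thread and move it to the busy
 * queue.  Returns 1 with *bep set on success, 0 if nothing is runnable.
 */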
static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

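/*
 * Retire a request: remove it from whichever queue it is on, unblock any
 * requests that were serialized behind it, and return the element to the
 * free queue.
 */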
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}

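/*
 * Execute a single request on behalf of a worker thread.  Reads and
 * writes normally go straight to preadv()/pwritev(); for GEOM backends,
 * multi-segment transfers are instead bounced through 'buf' (MAXPHYS
 * bytes) so that each transfer is issued as a single contiguous I/O.
 * Flush maps to DIOCGFLUSH or fsync(), delete to DIOCGDELETE.  The
 * request callback is invoked with an errno value when done.
 */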
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
	off_t arg[2];
	ssize_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
				   br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
				    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else
			err = EOPNOTSUPP;
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

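/*
 * Worker thread body.  Each of the BLOCKIF_NUMTHR threads dequeues and
 * processes runnable requests until the context is closing, parking on
 * bc_paused_cond while the interface is paused and on bc_cond when the
 * pending queue is empty.
 */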
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		bc->bc_work_count++;

		/* We cannot process work if the interface is paused */
		while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		bc->bc_work_count--;

		/* If none of the workers are busy, notify the main thread */
		if (bc->bc_work_count == 0)
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		/* Make all worker threads wait here if the device is paused */
		while (bc->bc_paused)
			pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

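/*
 * SIGCONT is used by blockif_cancel() to interrupt a worker that is
 * blocked in a slow system call.  This mevent handler drains the global
 * list of waiting cancellation records and wakes each waiter.
 */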
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

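/*
 * Open the backing file or device.  'optstr' is a comma-separated string
 * whose first element is the pathname; the optional elements that follow
 * are nocache (O_DIRECT), direct or sync (O_SYNC), ro, nodelete, and
 * sectorsize=logical[/physical].  A hypothetical invocation:
 *
 *	bc = blockif_open("disk.img,nocache,sectorsize=512/4096", "vblk0");
 *
 * On success, returns a context with BLOCKIF_NUMTHR worker threads
 * started; returns NULL on any error.
 */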
struct blockif_ctxt *
blockif_open(const char *optstr, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	char *nopt, *xopts, *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int nocache, sync, ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	ssopt = 0;
	nocache = 0;
	sync = 0;
	ro = 0;
	nodelete = 0;

	/*
	 * The first element in the optstring is always a pathname.
	 * Optional elements follow.
	 */
	nopt = xopts = strdup(optstr);
	while (xopts != NULL) {
		cp = strsep(&xopts, ",");
		if (cp == nopt)		/* file or device pathname */
			continue;
		else if (!strcmp(cp, "nocache"))
			nocache = 1;
		else if (!strcmp(cp, "nodelete"))
			nodelete = 1;
		else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
			sync = 1;
		else if (!strcmp(cp, "ro"))
			ro = 1;
		else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
			;
		else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
			pssopt = ssopt;
		else {
			EPRINTLN("Invalid device option \"%s\"", cp);
			goto err;
		}
	}

	extra = 0;
	if (nocache)
		extra |= O_DIRECT;
	if (sync)
		extra |= O_SYNC;

	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* The r/w open failed; retry as read-only */
		fd = open(nopt, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", nopt);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", nopt);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else
		psectsz = sbuf.st_blksize;

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	bc->bc_work_count = 0;
	pthread_cond_init(&bc->bc_paused_cond, NULL);
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	free(nopt);
	return (NULL);
}

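/*
 * Common entry point for the blockif_read/write/flush/delete wrappers:
 * queue the request if a free element is available, signalling a worker
 * when the request is immediately runnable, or fail with E2BIG when the
 * caller has exceeded the advertised queue depth.
 */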
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

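/*
 * Attempt to cancel an outstanding request.  A request still on the
 * pending queue is simply recycled (its callback is not invoked) and 0
 * is returned.  A request already being processed is interrupted with
 * SIGCONT until its worker leaves BST_BUSY, and EBUSY is returned since
 * the callback may still run.  EINVAL is returned if the request cannot
 * be found.
 */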
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

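/*
 * Tear down a blockif context: request worker-thread exit, join all
 * threads, then close the descriptor and free the context.  Queued but
 * unprocessed requests are not cancelled (see the XXX below).
 */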
int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
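/*
 * Quiesce the interface for snapshotting: mark it paused, wait until no
 * worker is actively processing a request, then flush the backing store.
 */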
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused. Wait for workers to finish their work */
	while (bc->bc_work_count)
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
			__func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	/* resume the threads waiting for paused */
	pthread_cond_broadcast(&bc->bc_paused_cond);
	/* kick the threads after restore */
	pthread_cond_broadcast(&bc->bc_cond);
	pthread_mutex_unlock(&bc->bc_mtx);
}

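/*
 * Snapshot the device-visible fields of a request, including the guest
 * buffers referenced by its iovecs.  The callback and its argument are
 * re-established by the owning device model on restore.
 */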
int
blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
{
	int i;
	struct iovec *iov;
	int ret;

	SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);

	/*
	 * XXX: The callback and parameter must be filled by the virtualized
	 * device that uses the interface, during its init; we're not touching
	 * them here.
	 */

	/* Snapshot the iovecs. */
	for (i = 0; i < br->br_iovcnt; i++) {
		iov = &br->br_iov[i];

		SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);

		/* We assume the iov is a guest-mapped address. */
		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
			false, meta, ret, done);
	}

done:
	return (ret);
}

int
blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
	int ret;

	if (bc->bc_paused == 0) {
		fprintf(stderr, "%s: Snapshot failed: "
			"interface not paused.\r\n", __func__);
		return (ENXIO);
	}

	pthread_mutex_lock(&bc->bc_mtx);

	SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);

done:
	pthread_mutex_unlock(&bc->bc_mtx);
	return (ret);
}
#endif