/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character devices
 * (volmode=dev).
 */
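/*
 * Example (illustrative): after "zfs create -V 10g tank/vol" the volume
 * shows up as /dev/zvol/tank/vol and can be used like any other disk,
 * e.g. "newfs /dev/zvol/tank/vol" or as backing storage for a VM.
 */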

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

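/*
 * With ZVOL_LOCK_DEBUG defined, all nominally shared acquisitions of
 * zv_suspend_lock below are taken exclusively instead.  This serializes
 * all I/O, but makes lock-ordering and missed-unlock bugs much easier
 * to catch during development.
 */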
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

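/*
 * Per-OS zvol state.  Exactly one arm of the union is valid for a given
 * volume, selected by zv_volmode: the cdev state for volmode=dev, or the
 * GEOM provider state (including the worker queue) for volmode=geom.
 */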
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");
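/*
 * Example (illustrative): the CTLFLAG_RWTUN knobs above live under
 * vfs.zfs.vol and may be set at runtime or as loader tunables, e.g.
 * "sysctl vfs.zfs.vol.mode=2" to prefer character devices, or
 * "vfs.zfs.vol.mode=2" in /boot/loader.conf at boot.
 */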

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/*
 * GEOM mode implementation
 */

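/*
 * Open/close locking: zvol_state_lock is taken first to resolve pp->private;
 * on first open spa_namespace_lock must also be held, and zv_suspend_lock
 * must be acquired before zv_state_lock.  Since the code may already hold a
 * later lock when it discovers it needs an earlier one, it uses trylock and,
 * on failure, drops what it holds and reacquires in the correct order (the
 * retry: label below).
 */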
/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still
		 * illegal to use a zvol as a vdev.  Deadlocks can result
		 * if another thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

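/*
 * GEOM access(9) entry point.  acr/acw/ace are deltas to the provider's
 * read/write/exclusive reference counts; a positive sum is mapped onto
 * zvol_geom_open() and a negative sum onto zvol_geom_close().
 */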
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

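/*
 * Dedicated per-volume worker thread.  zvol_geom_bio_start() queues bios
 * here when it is called from a context that must not sleep; the worker
 * runs in kthread context, where the DMU is free to block.
 */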
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

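	/*
	 * BIO_FLUSH jumps straight to the "sync:" label inside this if;
	 * for writes and deletes, sync was computed above from the
	 * dataset's ZFS_SYNC_ALWAYS setting.
	 */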
	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (err)
			goto out_zv_locked;
	}

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count++;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

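/*
 * Example (userland, illustrative only): trimming a region of a
 * volmode=dev zvol via DIOCGDELETE; both values must be multiples of
 * DEV_BSIZE and the offset must lie within the volume:
 *
 *	off_t args[2] = { 0, 1 << 20 };		// offset, length
 *	int fd = open("/dev/zvol/tank/vol", O_RDWR);
 *	if (fd == -1 || ioctl(fd, DIOCGDELETE, args) == -1)
 *		err(1, "DIOCGDELETE");
 */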
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int i, error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = B_FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc. helpers
 */

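/*
 * Note the upgrade pattern below: rw_tryupgrade() may fail, in which case
 * zv_suspend_lock is dropped and retaken as writer, so the zv_zilog == NULL
 * check must be repeated once the write lock is held.
 */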
static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_create_minor_impl() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

static boolean_t
zvol_is_zvol_impl(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX  need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		ASSERT3P(dev->si_drv2, ==, NULL);

		destroy_dev(dev);
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
static int
zvol_create_minor_impl(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
		if (error) {
			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
			mutex_destroy(&zv->zv_state_lock);
			kmem_free(zv, sizeof (*zv));
			dmu_objset_disown(os, B_TRUE, FTAG);
			goto out_doi;
		}
		dev->si_iosize_max = maxphys;
		zsd->zsd_cdev = dev;
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		dev->si_drv2 = NULL;
	}
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

static const zvol_platform_ops_t zvol_freebsd_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_create_minor_impl,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	zvol_register_ops(&zvol_freebsd_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}