/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character
 * devices (volmode=dev).
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

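/*
 * With ZVOL_LOCK_DEBUG defined, reader acquisitions through these wrappers
 * are promoted to writer acquisitions, so that locking mistakes surface
 * earlier during testing.
 */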
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

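/*
 * State of the per-volume GEOM worker thread.  zvol_os_clear_private()
 * requests termination by setting ZVOL_GEOM_STOPPED; the worker
 * acknowledges by setting the state back to ZVOL_GEOM_RUNNING just before
 * it exits.
 */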
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;

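/*
 * D_TRACKCLOSE makes the kernel call d_close for every close(2), not just
 * the last one, so zv_open_count stays balanced.
 */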
static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t	zvol_geom_bio_strategy; (declared above) */

/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

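	/*
	 * GEOM supplies an access-count delta, so a single call here may
	 * account for several opens.
	 */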
	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{
	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass the FEXCL flag to zvol_geom_open()/zvol_geom_close()
	 * when ace != 0, because GEOM already handles exclusivity, and does
	 * so a bit differently: GEOM allows multiple read/exclusive
	 * consumers, while ZFS allows only one exclusive consumer, reader or
	 * writer.  GEOM's model is preferable here, so leave the decision to
	 * GEOM.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

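	/*
	 * Service the bio queue until zvol_os_clear_private() sets the state
	 * to ZVOL_GEOM_STOPPED, then acknowledge and exit.
	 */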
	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

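	/*
	 * GEOM may issue bios from contexts that are not allowed to sleep;
	 * DMU I/O can sleep, so queue such requests to the worker thread.
	 */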
	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
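	/*
	 * Transfer the remaining data in chunks of at most zvol_maxphys
	 * bytes, using one DMU transaction per chunk for writes.
	 */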
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
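	/*
	 * Issue the read in chunks of at most DMU_MAX_ACCESS / 2 bytes so
	 * each DMU request stays within its transfer limit.
	 */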
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
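	/*
	 * Commit the ZIL after the write if the caller passed IO_SYNC or the
	 * dataset has sync=always set.
	 */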
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count++;
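	/*
	 * Count opens that requested synchronous semantics; on the first
	 * such open, force any queued async ZIL records to be written
	 * synchronously.
	 */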
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
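		/*
		 * Log the truncate in the ZIL before freeing the range, so
		 * that replay after a crash reproduces the deletion.
		 */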
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = B_FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
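		/*
		 * Re-check under the writer lock: another thread may have
		 * opened the ZIL while the lock was dropped above.
		 */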
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry.  */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

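	/*
	 * Neither GEOM providers nor cdevs are renamed in place here: tear
	 * down the old node and create a fresh one under the new name.
	 */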
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/*
				 * XXX: this should hold zv_suspend_lock,
				 * but lock ordering forbids taking it here.
				 */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * Allocate and initialize the zvol state (the equivalent of
	 * zvol_alloc() on other platforms).
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

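	/*
	 * Replay (or discard) any outstanding ZIL records now; later opens
	 * rely on this, since zvol_ensure_zilog() asserts that no replay is
	 * needed.
	 */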
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

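		/*
		 * Ask the worker thread to exit by setting ZVOL_GEOM_STOPPED
		 * and wait for it to acknowledge by flipping the state back
		 * to ZVOL_GEOM_RUNNING just before it exits.
		 */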
		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	/* Nothing to do on FreeBSD (Linux uses set_disk_ro() here). */
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	/* Nothing to do on FreeBSD (Linux uses set_capacity() here). */
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}
1558