/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character
 * devices (volmode=dev).
 */
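
/*
 * For reference, a typical administration sequence from userland
 * (illustrative only; the pool and dataset names are hypothetical):
 *
 *	# zfs create -V 10G tank/vol
 *	# dd if=/dev/zvol/tank/vol of=/dev/null bs=1M count=1
 *
 * No further setup is required; the device node appears as soon as the
 * minor is created (see zvol_os_create_minor() below).
 */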

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

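/*
 * With ZVOL_LOCK_DEBUG defined, nominally shared acquisitions of
 * zv_suspend_lock are promoted to exclusive ones, which makes lock
 * ordering and recursion mistakes easier to catch during testing, at
 * the cost of read-side concurrency.
 */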
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

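/*
 * Which arm of the union below is valid is selected by zv->zv_volmode
 * (ZFS_VOLMODE_GEOM vs. ZFS_VOLMODE_DEV); zso_dying is meaningful in
 * either mode.
 */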
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");
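
/*
 * These knobs surface as vfs.zfs.vol.* and, being CTLFLAG_RWTUN, may be
 * set as loader tunables or at runtime, e.g. (illustrative):
 *
 *	# sysctl vfs.zfs.vol.mode=2
 *	# sysctl vfs.zfs.vol.unmap_enabled=0
 */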

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void		zvol_filter_detach(struct knote *kn);
static int		zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */
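
/*
 * GEOM does not call the open/close routines below directly: when a
 * consumer's access counts change, g_access() invokes
 * zvol_geom_access(), which translates the (acr, acw, ace) deltas into
 * zvol_geom_open()/zvol_geom_close() calls.
 */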

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset read-only before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

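/*
 * Mark the zvol dying so that new opens fail with ENXIO, then wait
 * (bounded at 10 seconds by the msleep() timeout below) for any
 * outstanding opens to drain before the provider is torn down.
 */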
void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}
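
/*
 * Userland sketch (illustrative only; the device path is hypothetical):
 * a process can watch a volmode=dev zvol for attribute changes, such as
 * the NOTE_ATTRIB that zvol_os_update_volsize() posts on resize, using
 * kqueue(2):
 *
 *	int fd = open("/dev/zvol/tank/vol", O_RDONLY);
 *	int kq = kqueue();
 *	struct kevent ev;
 *
 *	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *	    NOTE_ATTRIB, 0, NULL);
 *	(void) kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * A later kevent(kq, NULL, 0, &ev, 1, NULL) call then blocks until the
 * volume's attributes change.
 */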

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = B_FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}
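
/*
 * Userland sketch (illustrative only; the device path is hypothetical):
 * the geometry reported above is reachable through the standard disk
 * ioctls from <sys/disk.h>:
 *
 *	off_t mediasize;
 *	u_int sectorsize;
 *	int fd = open("/dev/zvol/tank/vol", O_RDONLY);
 *
 *	(void) ioctl(fd, DIOCGMEDIASIZE, &mediasize);
 *	(void) ioctl(fd, DIOCGSECTORSIZE, &sectorsize);
 */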

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke a resize event when the initial size was
		 * zero.  ZVOL initializes the size on first open; this is
		 * not a real resize.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	/* XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); */
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	/* XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); */
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}