/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, zvols are exposed either as GEOM providers, like any other
 * storage device in the system, or as plain character devices when
 * volmode=dev.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif
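
/*
 * With ZVOL_LOCK_DEBUG defined, nominal reader acquisitions of
 * zv_suspend_lock are promoted to writer acquisitions: a debugging aid
 * that serializes paths which would normally share the lock, so lock
 * ordering mistakes surface deterministically.
 */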

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

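/*
 * Per-OS zvol state, a union keyed by volmode: zso_dev backs a plain
 * character device (volmode=dev); zso_geom backs a GEOM provider with
 * its own worker thread and bio queue (volmode=geom).  Exactly one arm
 * is active for a given zvol.
 */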
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");
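
/*
 * Both knobs above are loader tunables as well as sysctls (CTLFLAG_RWTUN).
 * Illustrative usage from the shell:
 *
 *	sysctl vfs.zfs.vol.mode=2	# expose new zvols as character devices
 *	sysctl vfs.zfs.vol.recursive=1	# allow zpools on zvols (DANGEROUS)
 */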

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void		zvol_filter_detach(struct knote *kn);
static int		zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

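	/* Wait up to 10 seconds (10*hz) for the last consumer to close. */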
	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */
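
	/*
	 * Example: a consumer going from r0w0e0 to r1w1e1 arrives here as
	 * acr=1, acw=1, ace=1, so count is 3 and flags becomes
	 * FREAD | FWRITE; zvol_geom_open() is then called once with that
	 * count.
	 */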

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
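			/*
			 * Sleep until more bios are queued; PDROP releases
			 * zsg_queue_mtx while asleep, and the loop
			 * re-acquires it at the top.
			 */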
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}
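
/*
 * Minimal userspace sketch (not part of this driver) of consuming the
 * filter above; the device path is hypothetical:
 *
 *	int fd = open("/dev/zvol/pool/vol", O_RDONLY);
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_ATTRIB,
 *	    0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	(register)
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	(returns on NOTE_ATTRIB)
 *
 * zvol_os_update_volsize() posts NOTE_ATTRIB to the same knlist when the
 * volume is resized.
 */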

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

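	/*
	 * BIO_FLUSH branches straight to the sync: label below; other
	 * commands commit the ZIL here only when the volume behaves
	 * synchronously (sync=always).
	 */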
	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
		    RL_READER);
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		zfs_rangelock_exit(lr);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}
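
/*
 * Illustrative userspace sketch (not part of this driver) of reaching the
 * DIOCGDELETE case above; fd is an open zvol character device:
 *
 *	off_t arg[2];
 *	arg[0] = offset;	(byte offset, DEV_BSIZE aligned)
 *	arg[1] = length;	(byte count, DEV_BSIZE multiple)
 *	ioctl(fd, DIOCGDELETE, arg);
 *
 * The freed range is logged as a truncate record and released with
 * dmu_free_long_range().
 */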

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
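		/*
		 * Re-check after the upgrade: another thread may have
		 * opened the ZIL while zv_suspend_lock was dropped above.
		 */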
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry.  */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX  need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	boolean_t replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

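		/*
		 * Notify kqueue watchers registered through
		 * zvol_cdev_kqfilter() that the device size changed.
		 */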
		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}