/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character
 * devices (volmode=dev).
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

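/*
 * With ZVOL_LOCK_DEBUG defined, every nominally shared acquisition of
 * zv_suspend_lock and zvol_state_lock below is promoted to an exclusive
 * one, which makes lock-ordering problems easier to catch at the cost
 * of concurrency.
 */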
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

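/*
 * States for the per-volume GEOM worker thread.  The state stays
 * UNINIT while the worker services its bio queue.  On teardown,
 * zvol_os_clear_private() sets STOPPED and waits; the worker
 * acknowledges by setting RUNNING before exiting, and
 * zvol_geom_destroy() verifies that this handshake completed.
 */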
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;

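/*
 * D_TRACKCLOSE asks the cdev layer to call d_close for every close(2)
 * rather than only the last one, so zv_open_count stays balanced with
 * the number of opens.
 */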
static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void		zvol_filter_detach(struct knote *kn);
static int		zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

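/*
 * GEOM ->access() method: translates the reference-count deltas
 * (acr/acw/ace) that GEOM hands us into zvol_geom_open()/zvol_geom_close()
 * calls carrying the aggregate count.
 */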
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

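/*
 * Per-volume worker thread servicing bios that zvol_geom_bio_start()
 * queued because the originating thread was not allowed to sleep.
 */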
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}

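/*
 * Common strategy routine: called with GEOM bios (bp->bio_to set) from
 * zvol_geom_bio_start() or the worker thread, and with cdev bios
 * (bp->bio_dev set) via zvol_cdevsw.d_strategy.
 */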
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t commit;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto commit;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	commit = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, commit);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (commit) {
commit:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

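/*
 * Read path for volmode=dev: range-lock the region and copy out in
 * chunks of at most half the DMU's per-tx access limit.
 */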
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t commit;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	commit = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, commit);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (commit)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

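/*
 * Handles the stock FreeBSD disk ioctls (DIOCG*) plus
 * FIOSEEKHOLE/FIOSEEKDATA for hole-aware copying.
 */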
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
		    RL_READER);
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		zfs_rangelock_exit(lr);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

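/*
 * Rename the minor: rehash the zvol, then recreate the GEOM provider or
 * the character device under the new name.  For volmode=dev any existing
 * opens are dropped, since the old cdev is destroyed outright.
 */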
void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry.  */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/*
				 * XXX: zvol_last_close() should be called
				 * with zv_suspend_lock held, but the lock
				 * ordering forbids taking it here.
				 */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
	dataset_kstats_rename(&zv->zv_kstat, newname);
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	boolean_t replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

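/*
 * Detach the zvol from its provider or cdev so that new opens and I/O
 * fail with ENXIO; for GEOM volumes this also stops the worker thread
 * via the STOPPED/RUNNING handshake described above.
 */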
void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}