1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 *
24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25 * All rights reserved.
26 *
27 * Portions Copyright 2010 Robert Milkowski
28 *
29 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
30 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
31 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
32 * Copyright (c) 2014 Integros [integros.com]
33 */
34
35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
36
37 /*
38 * ZFS volume emulation driver.
39 *
40 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
41 * Volumes are accessed through the symbolic links named:
42 *
43 * /dev/zvol/<pool_name>/<dataset_name>
44 *
45 * Volumes are persistent through reboot. No user command needs to be
46 * run before opening and using a device.
47 *
48 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
49 * in the system. Except when they're simply character devices (volmode=dev).
50 */
51
52 #include <sys/types.h>
53 #include <sys/param.h>
54 #include <sys/kernel.h>
55 #include <sys/errno.h>
56 #include <sys/uio.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/kmem.h>
60 #include <sys/conf.h>
61 #include <sys/cmn_err.h>
62 #include <sys/stat.h>
63 #include <sys/proc.h>
64 #include <sys/zap.h>
65 #include <sys/spa.h>
66 #include <sys/spa_impl.h>
67 #include <sys/zio.h>
68 #include <sys/disk.h>
69 #include <sys/dmu_traverse.h>
70 #include <sys/dnode.h>
71 #include <sys/dsl_dataset.h>
72 #include <sys/dsl_prop.h>
73 #include <sys/dsl_dir.h>
74 #include <sys/byteorder.h>
75 #include <sys/sunddi.h>
76 #include <sys/dirent.h>
77 #include <sys/policy.h>
78 #include <sys/queue.h>
79 #include <sys/fs/zfs.h>
80 #include <sys/zfs_ioctl.h>
81 #include <sys/zil.h>
82 #include <sys/zfs_znode.h>
83 #include <sys/zfs_rlock.h>
84 #include <sys/vdev_impl.h>
85 #include <sys/vdev_raidz.h>
86 #include <sys/zvol.h>
87 #include <sys/zil_impl.h>
88 #include <sys/dataset_kstats.h>
89 #include <sys/dbuf.h>
90 #include <sys/dmu_tx.h>
91 #include <sys/zfeature.h>
92 #include <sys/zio_checksum.h>
93 #include <sys/zil_impl.h>
94 #include <sys/filio.h>
95 #include <sys/freebsd_event.h>
96
97 #include <geom/geom.h>
98 #include <sys/zvol.h>
99 #include <sys/zvol_impl.h>
100
101 #include "zfs_namecheck.h"
102
/* Name of the ZAP property that records the dump device size. */
#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * In lock-debug builds, take all "reader" acquisitions of the zvol locks as
 * writers so lock-ordering mistakes surface as deadlocks instead of being
 * masked by shared mode.
 */
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

/*
 * Lifecycle state of the per-volume GEOM worker thread (volmode=geom).
 * NOTE(review): zvol_geom_worker() flips STOPPED back to RUNNING just before
 * exiting and wakes sleepers on zsg_state — this looks like a shutdown
 * handshake with code outside this chunk; confirm against the stop path.
 */
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,	/* worker thread not yet created */
	ZVOL_GEOM_STOPPED,	/* worker has been asked to exit */
	ZVOL_GEOM_RUNNING,	/* worker is servicing the bio queue */
};
118
/*
 * FreeBSD-specific per-volume state, hung off the platform-independent
 * zvol_state_t.  Exactly one member of the union is valid, selected by the
 * volume's volmode (GEOM provider vs. plain character device).
 */
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;		/* character device */
			struct selinfo zsd_selinfo;	/* kqueue/knote list */
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider; /* GEOM provider */
			struct bio_queue_head zsg_queue; /* deferred bios */
			struct mtx zsg_queue_mtx;	/* protects zsg_queue */
			enum zvol_geom_state zsg_state;	/* worker state */
		} _zso_geom;
	} _zso_state;
	int zso_dying;		/* set when the volume is being torn down */
};
139
/*
 * Presumably the count of materialized zvol minor devices; no updater is
 * visible in this chunk — verify against the create/remove paths.
 */
static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
/*
 * Using a zvol as a backing vdev for another pool can deadlock (see the
 * zfs_geom_probe_vdev_key check in zvol_geom_open()); disabled by default.
 */
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;
162
/* Opens the volume's ZIL on demand; defined elsewhere in this file. */
static void zvol_ensure_zilog(zvol_state_t *zv);

/* Character-device entry points (volmode=dev), wired up in zvol_cdevsw. */
static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;
static d_kqfilter_t zvol_cdev_kqfilter;

/*
 * cdevsw for volmode=dev volumes.  D_TRACKCLOSE makes the kernel call
 * d_close on every close so zv_open_count stays balanced.  Note the
 * strategy routine is shared with the GEOM path.
 */
static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

/* EVFILT_VNODE filter ops registered by zvol_cdev_kqfilter(). */
static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

/* TSD key set while ZFS probes GEOM providers looking for vdevs. */
extern uint_t zfs_geom_probe_vdev_key;

/* GEOM class under which zvol providers (volmode=geom) are published. */
struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
213
214 /*
215 * GEOM mode implementation
216 */
217
218 static int
zvol_geom_open(struct g_provider * pp,int flag,int count)219 zvol_geom_open(struct g_provider *pp, int flag, int count)
220 {
221 zvol_state_t *zv;
222 int err = 0;
223 boolean_t drop_suspend = B_FALSE;
224
225 if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
226 /*
227 * If zfs_geom_probe_vdev_key is set, that means that zfs is
228 * attempting to probe geom providers while looking for a
229 * replacement for a missing VDEV. In this case, the
230 * spa_namespace_lock will not be held, but it is still illegal
231 * to use a zvol as a vdev. Deadlocks can result if another
232 * thread has spa_namespace_lock.
233 */
234 return (SET_ERROR(EOPNOTSUPP));
235 }
236
237 retry:
238 rw_enter(&zvol_state_lock, ZVOL_RW_READER);
239 /*
240 * Obtain a copy of private under zvol_state_lock to make sure either
241 * the result of zvol free code setting private to NULL is observed,
242 * or the zv is protected from being freed because of the positive
243 * zv_open_count.
244 */
245 zv = pp->private;
246 if (zv == NULL) {
247 rw_exit(&zvol_state_lock);
248 err = SET_ERROR(ENXIO);
249 goto out_locked;
250 }
251
252 mutex_enter(&zv->zv_state_lock);
253 if (zv->zv_zso->zso_dying) {
254 rw_exit(&zvol_state_lock);
255 err = SET_ERROR(ENXIO);
256 goto out_zv_locked;
257 }
258 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
259
260 /*
261 * Make sure zvol is not suspended during first open
262 * (hold zv_suspend_lock) and respect proper lock acquisition
263 * ordering - zv_suspend_lock before zv_state_lock.
264 */
265 if (zv->zv_open_count == 0) {
266 drop_suspend = B_TRUE;
267 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
268 mutex_exit(&zv->zv_state_lock);
269 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
270 mutex_enter(&zv->zv_state_lock);
271 /* Check to see if zv_suspend_lock is needed. */
272 if (zv->zv_open_count != 0) {
273 rw_exit(&zv->zv_suspend_lock);
274 drop_suspend = B_FALSE;
275 }
276 }
277 }
278 rw_exit(&zvol_state_lock);
279
280 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
281
282 if (zv->zv_open_count == 0) {
283 boolean_t drop_namespace = B_FALSE;
284
285 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
286
287 /*
288 * Take spa_namespace_lock to prevent lock inversion when
289 * zvols from one pool are opened as vdevs in another.
290 */
291 if (!mutex_owned(&spa_namespace_lock)) {
292 if (!mutex_tryenter(&spa_namespace_lock)) {
293 mutex_exit(&zv->zv_state_lock);
294 rw_exit(&zv->zv_suspend_lock);
295 kern_yield(PRI_USER);
296 goto retry;
297 } else {
298 drop_namespace = B_TRUE;
299 }
300 }
301 err = zvol_first_open(zv, !(flag & FWRITE));
302 if (drop_namespace)
303 mutex_exit(&spa_namespace_lock);
304 if (err)
305 goto out_zv_locked;
306 pp->mediasize = zv->zv_volsize;
307 pp->stripeoffset = 0;
308 pp->stripesize = zv->zv_volblocksize;
309 }
310
311 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
312
313 /*
314 * Check for a bad on-disk format version now since we
315 * lied about owning the dataset readonly before.
316 */
317 if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
318 dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
319 err = SET_ERROR(EROFS);
320 goto out_opened;
321 }
322 if (zv->zv_flags & ZVOL_EXCL) {
323 err = SET_ERROR(EBUSY);
324 goto out_opened;
325 }
326 if (flag & O_EXCL) {
327 if (zv->zv_open_count != 0) {
328 err = SET_ERROR(EBUSY);
329 goto out_opened;
330 }
331 zv->zv_flags |= ZVOL_EXCL;
332 }
333
334 zv->zv_open_count += count;
335 out_opened:
336 if (zv->zv_open_count == 0) {
337 zvol_last_close(zv);
338 wakeup(zv);
339 }
340 out_zv_locked:
341 mutex_exit(&zv->zv_state_lock);
342 out_locked:
343 if (drop_suspend)
344 rw_exit(&zv->zv_suspend_lock);
345 return (err);
346 }
347
/*
 * GEOM backend for closing a zvol provider; `count' access references are
 * being dropped.  If this is the last close, zv_suspend_lock is taken
 * (shared) before zv_state_lock so zvol_last_close() cannot race with a
 * suspend, mirroring the ordering used in zvol_geom_open().
 */
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		/* An exclusive opener is by definition the only opener. */
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			/* Recompute: opens may have raced in while blocked. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		/* Wake zvol_wait_close(), which sleeps on zv. */
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}
418
419 static void
zvol_geom_run(zvol_state_t * zv)420 zvol_geom_run(zvol_state_t *zv)
421 {
422 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
423 struct g_provider *pp = zsg->zsg_provider;
424
425 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
426
427 g_error_provider(pp, 0);
428
429 kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
430 "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
431 }
432
433 static void
zvol_geom_destroy(zvol_state_t * zv)434 zvol_geom_destroy(zvol_state_t *zv)
435 {
436 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
437 struct g_provider *pp = zsg->zsg_provider;
438
439 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
440
441 g_topology_assert();
442
443 mutex_enter(&zv->zv_state_lock);
444 VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
445 mutex_exit(&zv->zv_state_lock);
446 zsg->zsg_provider = NULL;
447 g_wither_geom(pp->geom, ENXIO);
448 }
449
450 void
zvol_wait_close(zvol_state_t * zv)451 zvol_wait_close(zvol_state_t *zv)
452 {
453
454 if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
455 return;
456 mutex_enter(&zv->zv_state_lock);
457 zv->zv_zso->zso_dying = B_TRUE;
458
459 if (zv->zv_open_count)
460 msleep(zv, &zv->zv_state_lock,
461 PRIBIO, "zvol:dying", 10*hz);
462 mutex_exit(&zv->zv_state_lock);
463 }
464
465
466 static int
zvol_geom_access(struct g_provider * pp,int acr,int acw,int ace)467 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
468 {
469 int count, error, flags;
470
471 g_topology_assert();
472
473 /*
474 * To make it easier we expect either open or close, but not both
475 * at the same time.
476 */
477 KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
478 (acr <= 0 && acw <= 0 && ace <= 0),
479 ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
480 pp->name, acr, acw, ace));
481
482 if (pp->private == NULL) {
483 if (acr <= 0 && acw <= 0 && ace <= 0)
484 return (0);
485 return (pp->error);
486 }
487
488 /*
489 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
490 * ace != 0, because GEOM already handles that and handles it a bit
491 * differently. GEOM allows for multiple read/exclusive consumers and
492 * ZFS allows only one exclusive consumer, no matter if it is reader or
493 * writer. I like better the way GEOM works so I'll leave it for GEOM
494 * to decide what to do.
495 */
496
497 count = acr + acw + ace;
498 if (count == 0)
499 return (0);
500
501 flags = 0;
502 if (acr != 0 || ace != 0)
503 flags |= FREAD;
504 if (acw != 0)
505 flags |= FWRITE;
506
507 g_topology_unlock();
508 if (count > 0)
509 error = zvol_geom_open(pp, flags, count);
510 else
511 error = zvol_geom_close(pp, flags, -count);
512 g_topology_lock();
513 return (error);
514 }
515
/*
 * Per-volume worker thread: drains bios queued by zvol_geom_bio_start()
 * when the submitting context could not sleep, handing each to the
 * (sleepable) strategy routine.  Exits when zsg_state is set to STOPPED;
 * it flips the state back to RUNNING and wakes sleepers on &zsg_state as
 * an exit acknowledgement (NOTE(review): handshake peer not visible in
 * this chunk — confirm against the stop path).
 */
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			/* PDROP: msleep releases zsg_queue_mtx for us. */
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}
547
548 static void
zvol_geom_bio_start(struct bio * bp)549 zvol_geom_bio_start(struct bio *bp)
550 {
551 zvol_state_t *zv = bp->bio_to->private;
552 struct zvol_state_geom *zsg;
553 boolean_t first;
554
555 if (zv == NULL) {
556 g_io_deliver(bp, ENXIO);
557 return;
558 }
559 if (bp->bio_cmd == BIO_GETATTR) {
560 if (zvol_geom_bio_getattr(bp))
561 g_io_deliver(bp, EOPNOTSUPP);
562 return;
563 }
564
565 if (!THREAD_CAN_SLEEP()) {
566 zsg = &zv->zv_zso->zso_geom;
567 mtx_lock(&zsg->zsg_queue_mtx);
568 first = (bioq_first(&zsg->zsg_queue) == NULL);
569 bioq_insert_tail(&zsg->zsg_queue, bp);
570 mtx_unlock(&zsg->zsg_queue_mtx);
571 if (first)
572 wakeup_one(&zsg->zsg_queue);
573 return;
574 }
575
576 zvol_geom_bio_strategy(bp);
577 }
578
579 static int
zvol_geom_bio_getattr(struct bio * bp)580 zvol_geom_bio_getattr(struct bio *bp)
581 {
582 zvol_state_t *zv;
583
584 zv = bp->bio_to->private;
585 ASSERT3P(zv, !=, NULL);
586
587 spa_t *spa = dmu_objset_spa(zv->zv_objset);
588 uint64_t refd, avail, usedobjs, availobjs;
589
590 if (g_handleattr_int(bp, "GEOM::candelete", 1))
591 return (0);
592 if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
593 dmu_objset_space(zv->zv_objset, &refd, &avail,
594 &usedobjs, &availobjs);
595 if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
596 return (0);
597 } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
598 dmu_objset_space(zv->zv_objset, &refd, &avail,
599 &usedobjs, &availobjs);
600 if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
601 return (0);
602 } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
603 avail = metaslab_class_get_space(spa_normal_class(spa));
604 avail -= metaslab_class_get_alloc(spa_normal_class(spa));
605 if (g_handleattr_off_t(bp, "poolblocksavail",
606 avail / DEV_BSIZE))
607 return (0);
608 } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
609 refd = metaslab_class_get_alloc(spa_normal_class(spa));
610 if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
611 return (0);
612 }
613 return (1);
614 }
615
616 static void
zvol_filter_detach(struct knote * kn)617 zvol_filter_detach(struct knote *kn)
618 {
619 zvol_state_t *zv;
620 struct zvol_state_dev *zsd;
621
622 zv = kn->kn_hook;
623 zsd = &zv->zv_zso->zso_dev;
624
625 knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
626 }
627
628 static int
zvol_filter_vnode(struct knote * kn,long hint)629 zvol_filter_vnode(struct knote *kn, long hint)
630 {
631 kn->kn_fflags |= kn->kn_sfflags & hint;
632
633 return (kn->kn_fflags != 0);
634 }
635
636 static int
zvol_cdev_kqfilter(struct cdev * dev,struct knote * kn)637 zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
638 {
639 zvol_state_t *zv;
640 struct zvol_state_dev *zsd;
641
642 zv = dev->si_drv2;
643 zsd = &zv->zv_zso->zso_dev;
644
645 if (kn->kn_filter != EVFILT_VNODE)
646 return (EINVAL);
647
648 /* XXX: extend support for other NOTE_* events */
649 if (kn->kn_sfflags != NOTE_ATTRIB)
650 return (EINVAL);
651
652 kn->kn_fop = &zvol_filterops_vnode;
653 kn->kn_hook = zv;
654 knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);
655
656 return (0);
657 }
658
659 static void
zvol_geom_bio_strategy(struct bio * bp)660 zvol_geom_bio_strategy(struct bio *bp)
661 {
662 zvol_state_t *zv;
663 uint64_t off, volsize;
664 size_t resid;
665 char *addr;
666 objset_t *os;
667 zfs_locked_range_t *lr;
668 int error = 0;
669 boolean_t doread = B_FALSE;
670 boolean_t is_dumpified;
671 boolean_t commit;
672
673 if (bp->bio_to)
674 zv = bp->bio_to->private;
675 else
676 zv = bp->bio_dev->si_drv2;
677
678 if (zv == NULL) {
679 error = SET_ERROR(ENXIO);
680 goto out;
681 }
682
683 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
684
685 switch (bp->bio_cmd) {
686 case BIO_READ:
687 doread = B_TRUE;
688 break;
689 case BIO_WRITE:
690 case BIO_FLUSH:
691 case BIO_DELETE:
692 if (zv->zv_flags & ZVOL_RDONLY) {
693 error = SET_ERROR(EROFS);
694 goto resume;
695 }
696 zvol_ensure_zilog(zv);
697 if (bp->bio_cmd == BIO_FLUSH)
698 goto commit;
699 break;
700 default:
701 error = SET_ERROR(EOPNOTSUPP);
702 goto resume;
703 }
704
705 off = bp->bio_offset;
706 volsize = zv->zv_volsize;
707
708 os = zv->zv_objset;
709 ASSERT3P(os, !=, NULL);
710
711 addr = bp->bio_data;
712 resid = bp->bio_length;
713
714 if (resid > 0 && off >= volsize) {
715 error = SET_ERROR(EIO);
716 goto resume;
717 }
718
719 is_dumpified = B_FALSE;
720 commit = !doread && !is_dumpified &&
721 zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
722
723 /*
724 * There must be no buffer changes when doing a dmu_sync() because
725 * we can't change the data whilst calculating the checksum.
726 */
727 lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
728 doread ? RL_READER : RL_WRITER);
729
730 if (bp->bio_cmd == BIO_DELETE) {
731 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
732 error = dmu_tx_assign(tx, TXG_WAIT);
733 if (error != 0) {
734 dmu_tx_abort(tx);
735 } else {
736 zvol_log_truncate(zv, tx, off, resid);
737 dmu_tx_commit(tx);
738 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
739 off, resid);
740 resid = 0;
741 }
742 goto unlock;
743 }
744 while (resid != 0 && off < volsize) {
745 size_t size = MIN(resid, zvol_maxphys);
746 if (doread) {
747 error = dmu_read(os, ZVOL_OBJ, off, size, addr,
748 DMU_READ_PREFETCH);
749 } else {
750 dmu_tx_t *tx = dmu_tx_create(os);
751 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
752 error = dmu_tx_assign(tx, TXG_WAIT);
753 if (error) {
754 dmu_tx_abort(tx);
755 } else {
756 dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
757 zvol_log_write(zv, tx, off, size, commit);
758 dmu_tx_commit(tx);
759 }
760 }
761 if (error) {
762 /* Convert checksum errors into IO errors. */
763 if (error == ECKSUM)
764 error = SET_ERROR(EIO);
765 break;
766 }
767 off += size;
768 addr += size;
769 resid -= size;
770 }
771 unlock:
772 zfs_rangelock_exit(lr);
773
774 bp->bio_completed = bp->bio_length - resid;
775 if (bp->bio_completed < bp->bio_length && off > volsize)
776 error = SET_ERROR(EINVAL);
777
778 switch (bp->bio_cmd) {
779 case BIO_FLUSH:
780 break;
781 case BIO_READ:
782 dataset_kstats_update_read_kstats(&zv->zv_kstat,
783 bp->bio_completed);
784 break;
785 case BIO_WRITE:
786 dataset_kstats_update_write_kstats(&zv->zv_kstat,
787 bp->bio_completed);
788 break;
789 case BIO_DELETE:
790 break;
791 default:
792 break;
793 }
794
795 if (commit) {
796 commit:
797 zil_commit(zv->zv_zilog, ZVOL_OBJ);
798 }
799 resume:
800 rw_exit(&zv->zv_suspend_lock);
801 out:
802 if (bp->bio_to)
803 g_io_deliver(bp, error);
804 else
805 biofinish(bp, NULL, error);
806 }
807
808 /*
809 * Character device mode implementation
810 */
811
/*
 * Character-device read entry point (volmode=dev).  The transfer runs with
 * zv_suspend_lock held shared (fences suspend/resume) and a read range lock
 * over the requested extent; each DMU transfer is capped at half of
 * DMU_MAX_ACCESS.
 */
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	/* Account only what was actually transferred. */
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}
860
/*
 * Character-device write entry point (volmode=dev).  Writes are performed
 * under the shared suspend lock and a write range lock; each chunk is
 * logged to the ZIL, and the ZIL is committed afterwards when IO_SYNC was
 * requested or the dataset has sync=always.
 */
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t commit;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	/* Offset == volsize is allowed (EOF); beyond it is an error. */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	commit = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, commit);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	/* Account only what was actually transferred. */
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (commit)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}
920
921 static int
zvol_cdev_open(struct cdev * dev,int flags,int fmt,struct thread * td)922 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
923 {
924 zvol_state_t *zv;
925 int err = 0;
926 boolean_t drop_suspend = B_FALSE;
927
928 retry:
929 rw_enter(&zvol_state_lock, ZVOL_RW_READER);
930 /*
931 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
932 * the result of zvol free code setting si_drv2 to NULL is observed,
933 * or the zv is protected from being freed because of the positive
934 * zv_open_count.
935 */
936 zv = dev->si_drv2;
937 if (zv == NULL) {
938 rw_exit(&zvol_state_lock);
939 err = SET_ERROR(ENXIO);
940 goto out_locked;
941 }
942
943 mutex_enter(&zv->zv_state_lock);
944 if (zv->zv_zso->zso_dying) {
945 rw_exit(&zvol_state_lock);
946 err = SET_ERROR(ENXIO);
947 goto out_zv_locked;
948 }
949 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
950
951 /*
952 * Make sure zvol is not suspended during first open
953 * (hold zv_suspend_lock) and respect proper lock acquisition
954 * ordering - zv_suspend_lock before zv_state_lock.
955 */
956 if (zv->zv_open_count == 0) {
957 drop_suspend = B_TRUE;
958 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
959 mutex_exit(&zv->zv_state_lock);
960 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
961 mutex_enter(&zv->zv_state_lock);
962 /* Check to see if zv_suspend_lock is needed. */
963 if (zv->zv_open_count != 0) {
964 rw_exit(&zv->zv_suspend_lock);
965 drop_suspend = B_FALSE;
966 }
967 }
968 }
969 rw_exit(&zvol_state_lock);
970
971 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
972
973 if (zv->zv_open_count == 0) {
974 boolean_t drop_namespace = B_FALSE;
975
976 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
977
978 /*
979 * Take spa_namespace_lock to prevent lock inversion when
980 * zvols from one pool are opened as vdevs in another.
981 */
982 if (!mutex_owned(&spa_namespace_lock)) {
983 if (!mutex_tryenter(&spa_namespace_lock)) {
984 mutex_exit(&zv->zv_state_lock);
985 rw_exit(&zv->zv_suspend_lock);
986 kern_yield(PRI_USER);
987 goto retry;
988 } else {
989 drop_namespace = B_TRUE;
990 }
991 }
992 err = zvol_first_open(zv, !(flags & FWRITE));
993 if (drop_namespace)
994 mutex_exit(&spa_namespace_lock);
995 if (err)
996 goto out_zv_locked;
997 }
998
999 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1000
1001 if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
1002 err = SET_ERROR(EROFS);
1003 goto out_opened;
1004 }
1005 if (zv->zv_flags & ZVOL_EXCL) {
1006 err = SET_ERROR(EBUSY);
1007 goto out_opened;
1008 }
1009 if (flags & O_EXCL) {
1010 if (zv->zv_open_count != 0) {
1011 err = SET_ERROR(EBUSY);
1012 goto out_opened;
1013 }
1014 zv->zv_flags |= ZVOL_EXCL;
1015 }
1016
1017 zv->zv_open_count++;
1018 out_opened:
1019 if (zv->zv_open_count == 0) {
1020 zvol_last_close(zv);
1021 wakeup(zv);
1022 }
1023 out_zv_locked:
1024 mutex_exit(&zv->zv_state_lock);
1025 out_locked:
1026 if (drop_suspend)
1027 rw_exit(&zv->zv_suspend_lock);
1028 return (err);
1029 }
1030
/*
 * Character-device close entry point (volmode=dev).  D_TRACKCLOSE
 * guarantees one call per close; on the last close zv_suspend_lock is
 * taken (shared) before zv_state_lock so zvol_last_close() cannot race
 * with a suspend.
 */
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		/* An exclusive opener is by definition the only opener. */
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		/* Wake zvol_wait_close(), which sleeps on zv. */
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}
1097
/*
 * Character device (volmode=dev) ioctl entry point.
 *
 * Services the standard FreeBSD disk ioctls: sector/media size queries,
 * cache flush, delete (TRIM/UNMAP), stripe geometry, GEOM attribute
 * queries, and hole/data seeking.  Unknown commands return ENOIOCTL.
 */
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	/* si_drv2 was pointed at the zvol state when the cdev was created. */
	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		/*
		 * Commit any pending ZIL records to stable storage.  The
		 * suspend lock (reader) keeps zv_zilog valid for the call.
		 */
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		/* Free (TRIM/UNMAP) a range of the volume. */
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		/*
		 * Require sector alignment and a positive length with an
		 * in-range starting offset.
		 */
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		/* Exclude concurrent I/O over the range being freed. */
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			/*
			 * Log the truncate before freeing the data so a
			 * replay reproduces the hole; commit the ZIL below
			 * only if the dataset is sync=always.
			 */
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		/* Report the volume block size as the preferred stripe. */
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		/* zvols always support delete, see DIOCGDELETE above. */
		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			/* Dataset-level space accounting, in sectors. */
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			/* Pool-level accounting from the normal class. */
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		/*
		 * Lock the whole object so the on-disk hole/data layout
		 * cannot shift while dmu_offset_next() scans it.
		 */
		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
		    RL_READER);
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		zfs_rangelock_exit(lr);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}
1216
1217 /*
1218 * Misc. helpers
1219 */
1220
/*
 * Lazily open the ZIL on the first write to this zvol.
 *
 * Entered and exited with zv_suspend_lock held as reader; may
 * temporarily reacquire it as writer in between.
 */
static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		/*
		 * Upgrade to writer; if the upgrade fails we must drop and
		 * reacquire, during which another thread may have opened
		 * the ZIL — hence the recheck below.
		 */
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		/* Return to reader state, as the caller expects. */
		rw_downgrade(&zv->zv_suspend_lock);
	}
}
1248
1249 boolean_t
zvol_os_is_zvol(const char * device)1250 zvol_os_is_zvol(const char *device)
1251 {
1252 return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
1253 }
1254
/*
 * Rename a zvol's minor node: rehash the state under the new name and
 * recreate the GEOM provider or devfs node to match.
 *
 * Caller holds zvol_state_lock and zv_state_lock.
 */
void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		/* Destroy the old provider and attach a new one to the
		 * existing geom under the new name. */
		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			/*
			 * Destroying the cdev forcibly closes it; drop any
			 * recorded opens so the state stays consistent.
			 */
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		/* Recreate the devfs node under the new name. */
		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
	dataset_kstats_rename(&zv->zv_kstat, newname);
}
1324
1325 /*
1326 * Remove minor node for the specified volume.
1327 */
void
zvol_os_free(zvol_state_t *zv)
{
	/* Must be fully closed and unlocked before teardown. */
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		/* zvol_os_clear_private() must have run first. */
		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			/* si_drv2 was cleared by zvol_os_clear_private(). */
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}
1368
1369 /*
1370 * Create a minor node (plus a whole lot more) for the specified volume.
1371 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	bool replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	/* Refuse to create a second minor for an existing name. */
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	/* Fall back to the module default when volmode is unset/unreadable. */
	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		/*
		 * NOTE: the topology lock taken here is held across the
		 * rest of the setup and released after zvol_geom_run()
		 * below (only on the error == 0 path).
		 */
		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		/* Media size is published on first open / volsize update. */
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	/*
	 * NOTE(review): if dataset_kstats_create() fails here with
	 * volmode=geom, we goto out_dmu_objset_disown while still holding
	 * the topology lock taken above, and zv itself is never freed —
	 * looks like a lock/memory leak on this error path; verify.
	 */
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	/* Replay (or destroy, if replay is disabled) the ZIL up front. */
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	/* The objset is disowned below; reopened on first open. */
	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		/* Publish the new state in the global table. */
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}
1516
/*
 * Detach the zvol state from its GEOM provider or cdev so no new I/O
 * can reach it.  For GEOM, this also stops the per-zvol worker thread
 * via a stop/ack handshake on zsg_state.
 */
void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		/* Wake the worker and wait for it to acknowledge the stop. */
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		/* Subsequent cdev entry points will see a NULL zv. */
		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}
1545
1546 int
zvol_os_update_volsize(zvol_state_t * zv,uint64_t volsize)1547 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1548 {
1549 zv->zv_volsize = volsize;
1550 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1551 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1552 struct g_provider *pp = zsg->zsg_provider;
1553
1554 g_topology_lock();
1555
1556 if (pp->private == NULL) {
1557 g_topology_unlock();
1558 return (SET_ERROR(ENXIO));
1559 }
1560
1561 /*
1562 * Do not invoke resize event when initial size was zero.
1563 * ZVOL initializes the size on first open, this is not
1564 * real resizing.
1565 */
1566 if (pp->mediasize == 0)
1567 pp->mediasize = zv->zv_volsize;
1568 else
1569 g_resize_provider(pp, zv->zv_volsize);
1570
1571 g_topology_unlock();
1572 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1573 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1574
1575 KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
1576 }
1577 return (0);
1578 }
1579
/*
 * No-op on FreeBSD: there is no per-disk read-only flag to flip here;
 * read-only enforcement happens via ZVOL_RDONLY in zv_flags.
 */
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}
1585
/*
 * No-op on FreeBSD: capacity changes are published through
 * zvol_os_update_volsize() instead.
 */
void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}
1591
1592 /*
1593 * Public interfaces
1594 */
1595
1596 int
zvol_busy(void)1597 zvol_busy(void)
1598 {
1599 return (zvol_minors != 0);
1600 }
1601
/*
 * Module load hook: run the platform-independent zvol initialization.
 * Always succeeds.
 */
int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}
1608
/* Module unload hook: tear down the platform-independent zvol state. */
void
zvol_fini(void)
{
	zvol_fini_impl();
}
1614