1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 25 * All rights reserved. 26 * 27 * Portions Copyright 2010 Robert Milkowski 28 * 29 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 30 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 31 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 32 * Copyright (c) 2014 Integros [integros.com] 33 */ 34 35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ 36 37 /* 38 * ZFS volume emulation driver. 39 * 40 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. 41 * Volumes are accessed through the symbolic links named: 42 * 43 * /dev/zvol/<pool_name>/<dataset_name> 44 * 45 * Volumes are persistent through reboot. No user command needs to be 46 * run before opening and using a device. 47 * 48 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device 49 * in the system. Except when they're simply character devices (volmode=dev). 
50 */ 51 52 #include <sys/types.h> 53 #include <sys/param.h> 54 #include <sys/kernel.h> 55 #include <sys/errno.h> 56 #include <sys/uio.h> 57 #include <sys/bio.h> 58 #include <sys/buf.h> 59 #include <sys/kmem.h> 60 #include <sys/conf.h> 61 #include <sys/cmn_err.h> 62 #include <sys/stat.h> 63 #include <sys/proc.h> 64 #include <sys/zap.h> 65 #include <sys/spa.h> 66 #include <sys/spa_impl.h> 67 #include <sys/zio.h> 68 #include <sys/disk.h> 69 #include <sys/dmu_traverse.h> 70 #include <sys/dnode.h> 71 #include <sys/dsl_dataset.h> 72 #include <sys/dsl_prop.h> 73 #include <sys/dsl_dir.h> 74 #include <sys/byteorder.h> 75 #include <sys/sunddi.h> 76 #include <sys/dirent.h> 77 #include <sys/policy.h> 78 #include <sys/queue.h> 79 #include <sys/fs/zfs.h> 80 #include <sys/zfs_ioctl.h> 81 #include <sys/zil.h> 82 #include <sys/zfs_znode.h> 83 #include <sys/zfs_rlock.h> 84 #include <sys/vdev_impl.h> 85 #include <sys/vdev_raidz.h> 86 #include <sys/zvol.h> 87 #include <sys/zil_impl.h> 88 #include <sys/dataset_kstats.h> 89 #include <sys/dbuf.h> 90 #include <sys/dmu_tx.h> 91 #include <sys/zfeature.h> 92 #include <sys/zio_checksum.h> 93 #include <sys/zil_impl.h> 94 #include <sys/filio.h> 95 #include <sys/freebsd_event.h> 96 97 #include <geom/geom.h> 98 #include <sys/zvol.h> 99 #include <sys/zvol_impl.h> 100 101 #include "zfs_namecheck.h" 102 103 #define ZVOL_DUMPSIZE "dumpsize" 104 105 #ifdef ZVOL_LOCK_DEBUG 106 #define ZVOL_RW_READER RW_WRITER 107 #define ZVOL_RW_READ_HELD RW_WRITE_HELD 108 #else 109 #define ZVOL_RW_READER RW_READER 110 #define ZVOL_RW_READ_HELD RW_READ_HELD 111 #endif 112 113 enum zvol_geom_state { 114 ZVOL_GEOM_UNINIT, 115 ZVOL_GEOM_STOPPED, 116 ZVOL_GEOM_RUNNING, 117 }; 118 119 struct zvol_state_os { 120 #define zso_dev _zso_state._zso_dev 121 #define zso_geom _zso_state._zso_geom 122 union { 123 /* volmode=dev */ 124 struct zvol_state_dev { 125 struct cdev *zsd_cdev; 126 uint64_t zsd_sync_cnt; 127 struct selinfo zsd_selinfo; 128 } _zso_dev; 129 130 /* volmode=geom 
*/ 131 struct zvol_state_geom { 132 struct g_provider *zsg_provider; 133 struct bio_queue_head zsg_queue; 134 struct mtx zsg_queue_mtx; 135 enum zvol_geom_state zsg_state; 136 } _zso_geom; 137 } _zso_state; 138 int zso_dying; 139 }; 140 141 static uint32_t zvol_minors; 142 143 SYSCTL_DECL(_vfs_zfs); 144 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); 145 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, 146 "Expose as GEOM providers (1), device files (2) or neither"); 147 static boolean_t zpool_on_zvol = B_FALSE; 148 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, 149 "Allow zpools to use zvols as vdevs (DANGEROUS)"); 150 151 /* 152 * Toggle unmap functionality. 153 */ 154 boolean_t zvol_unmap_enabled = B_TRUE; 155 156 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, 157 &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); 158 159 /* 160 * zvol maximum transfer in one DMU tx. 161 */ 162 int zvol_maxphys = DMU_MAX_ACCESS / 2; 163 164 static void zvol_ensure_zilog(zvol_state_t *zv); 165 166 static d_open_t zvol_cdev_open; 167 static d_close_t zvol_cdev_close; 168 static d_ioctl_t zvol_cdev_ioctl; 169 static d_read_t zvol_cdev_read; 170 static d_write_t zvol_cdev_write; 171 static d_strategy_t zvol_geom_bio_strategy; 172 static d_kqfilter_t zvol_cdev_kqfilter; 173 174 static struct cdevsw zvol_cdevsw = { 175 .d_name = "zvol", 176 .d_version = D_VERSION, 177 .d_flags = D_DISK | D_TRACKCLOSE, 178 .d_open = zvol_cdev_open, 179 .d_close = zvol_cdev_close, 180 .d_ioctl = zvol_cdev_ioctl, 181 .d_read = zvol_cdev_read, 182 .d_write = zvol_cdev_write, 183 .d_strategy = zvol_geom_bio_strategy, 184 .d_kqfilter = zvol_cdev_kqfilter, 185 }; 186 187 static void zvol_filter_detach(struct knote *kn); 188 static int zvol_filter_vnode(struct knote *kn, long hint); 189 190 static struct filterops zvol_filterops_vnode = { 191 .f_isfd = 1, 192 .f_detach = zvol_filter_detach, 193 .f_event = 
zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

/*
 * Open the GEOM provider, adding `count' references.  Called from
 * zvol_geom_access() with the GEOM topology lock dropped.  The first
 * open takes zv_suspend_lock (shared) and, if not already owned,
 * spa_namespace_lock; if the latter cannot be acquired without blocking
 * we back off completely and retry, to avoid deadlocking against a
 * thread that holds spa_namespace_lock while probing providers.
 */
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				/*
				 * Drop all locks and retry from scratch
				 * rather than blocking here.
				 */
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	/*
	 * If we failed after zvol_first_open() succeeded, undo it and
	 * wake anyone sleeping in zvol_wait_close().
	 */
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*
 * Close the GEOM provider, dropping `count' references.  The close that
 * takes zv_open_count to zero holds zv_suspend_lock (shared) across
 * zvol_last_close() and wakes zvol_wait_close() sleepers.
 */
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

/*
 * Mark the provider error-free (usable) and start the per-zvol worker
 * thread that services bios queued from non-sleepable contexts.
 */
static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

/*
 * Detach the provider from the zvol and wither its geom.  Requires the
 * GEOM topology lock; the worker thread must be in ZVOL_GEOM_RUNNING
 * state (i.e. already told to stop and acknowledged, see
 * zvol_geom_worker()).
 */
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

/*
 * Flag the zvol as dying and wait for an outstanding open to go away.
 * GEOM mode only; the wait is bounded (10s) in case the last close
 * never arrives.
 */
void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}


/*
 * GEOM access method: translates reference-count deltas (acr, acw, ace)
 * into zvol_geom_open()/zvol_geom_close() calls.  Positive deltas open,
 * negative deltas close; the topology lock is dropped around the call.
 */
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

/*
 * Per-zvol kthread: drains the bio queue filled by zvol_geom_bio_start()
 * when the submitter could not sleep.  On seeing ZVOL_GEOM_STOPPED with
 * an empty queue it acknowledges by flipping the state back to
 * ZVOL_GEOM_RUNNING, wakes the waiter, and exits.
 */
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			/* PDROP releases the mutex while we sleep. */
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

/*
 * GEOM start method.  BIO_GETATTR is answered inline; other bios are
 * serviced directly when the current thread may sleep, otherwise they
 * are queued for the worker thread.
 */
static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		/* Only wake the worker when the queue was empty. */
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

/*
 * Answer GEOM::candelete plus the blocks{avail,used} and
 * poolblocks{avail,used} attributes (in DEV_BSIZE units).
 * Returns 0 if the attribute was handled, 1 otherwise.
 */
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

/* EVFILT_VNODE detach: unhook the knote from the dev-mode selinfo list. */
static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

/*
 * Knote event filter: latch the hinted events the caller subscribed to
 * and report the knote active when any have fired.
 */
static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

/*
 * d_kqfilter for dev-mode zvols.  Only EVFILT_VNODE with NOTE_ATTRIB
 * is supported for now.
 */
static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}

/*
 * Common bio path for GEOM providers (bio_to set) and dev-mode character
 * devices (bio_dev set).  Handles BIO_READ/BIO_WRITE/BIO_DELETE/BIO_FLUSH
 * under zv_suspend_lock (shared) and the zvol rangelock; anything else
 * gets EOPNOTSUPP.
 */
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		/*
		 * BIO_FLUSH jumps straight to the sync: label below,
		 * bypassing the data-transfer loop and the if (sync) test.
		 */
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			/* Log the truncate first, then free the range. */
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	/* Transfer in chunks of at most zvol_maxphys bytes per tx. */
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	/* Complete via GEOM or the cdev path, whichever submitted us. */
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

/*
 * d_read for dev-mode zvols: range-locked dmu_read_uio_dnode() loop.
 * An offset equal to volsize is not an error so EOF can be reported.
 */
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

/*
 * d_write for dev-mode zvols: one tx per chunk, each write logged to the
 * ZIL; the ZIL is committed when the open was O_SYNC or the dataset has
 * sync=always.  Holds zv_suspend_lock (shared) for the duration.
 */
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

/*
 * d_open for dev-mode zvols.  Mirrors zvol_geom_open(): the first open
 * takes zv_suspend_lock (shared) and, if needed, spa_namespace_lock,
 * backing off and retrying rather than blocking on the latter.  An
 * O_SYNC open bumps zsd_sync_cnt and forces async ZIL records to sync.
 */
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		/*
		 * First sync opener of an already written-to zvol:
		 * convert pending async ZIL records to sync.
		 */
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*
 * d_close for dev-mode zvols.  The last close holds zv_suspend_lock
 * (shared) across zvol_last_close() and wakes zvol_wait_close() sleepers.
 */
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

/*
 * d_ioctl for dev-mode zvols: disk geometry (DIOCGSECTORSIZE,
 * DIOCGMEDIASIZE, DIOCGSTRIPE*), cache flush (DIOCGFLUSH), TRIM-style
 * range delete (DIOCGDELETE), attribute queries (DIOCGATTR), and
 * hole/data seeking (FIOSEEKHOLE/FIOSEEKDATA).
 */
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc.
helpers 1228 */ 1229 1230 static void 1231 zvol_ensure_zilog(zvol_state_t *zv) 1232 { 1233 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 1234 1235 /* 1236 * Open a ZIL if this is the first time we have written to this 1237 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 1238 * than zv_state_lock so that we don't need to acquire an 1239 * additional lock in this path. 1240 */ 1241 if (zv->zv_zilog == NULL) { 1242 if (!rw_tryupgrade(&zv->zv_suspend_lock)) { 1243 rw_exit(&zv->zv_suspend_lock); 1244 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 1245 } 1246 if (zv->zv_zilog == NULL) { 1247 zv->zv_zilog = zil_open(zv->zv_objset, 1248 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1249 zv->zv_flags |= ZVOL_WRITTEN_TO; 1250 /* replay / destroy done in zvol_os_create_minor() */ 1251 VERIFY0(zv->zv_zilog->zl_header->zh_flags & 1252 ZIL_REPLAY_NEEDED); 1253 } 1254 rw_downgrade(&zv->zv_suspend_lock); 1255 } 1256 } 1257 1258 boolean_t 1259 zvol_os_is_zvol(const char *device) 1260 { 1261 return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); 1262 } 1263 1264 void 1265 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1266 { 1267 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1268 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1269 1270 /* Move to a new hashtable entry. 
*/ 1271 zv->zv_hash = zvol_name_hash(zv->zv_name); 1272 hlist_del(&zv->zv_hlink); 1273 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1274 1275 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1276 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1277 struct g_provider *pp = zsg->zsg_provider; 1278 struct g_geom *gp; 1279 1280 g_topology_lock(); 1281 gp = pp->geom; 1282 ASSERT3P(gp, !=, NULL); 1283 1284 zsg->zsg_provider = NULL; 1285 g_wither_provider(pp, ENXIO); 1286 1287 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); 1288 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; 1289 pp->sectorsize = DEV_BSIZE; 1290 pp->mediasize = zv->zv_volsize; 1291 pp->private = zv; 1292 zsg->zsg_provider = pp; 1293 g_error_provider(pp, 0); 1294 g_topology_unlock(); 1295 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1296 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1297 struct cdev *dev; 1298 struct make_dev_args args; 1299 1300 dev = zsd->zsd_cdev; 1301 if (dev != NULL) { 1302 destroy_dev(dev); 1303 dev = zsd->zsd_cdev = NULL; 1304 if (zv->zv_open_count > 0) { 1305 zv->zv_flags &= ~ZVOL_EXCL; 1306 zv->zv_open_count = 0; 1307 /* XXX need suspend lock but lock order */ 1308 zvol_last_close(zv); 1309 } 1310 } 1311 1312 make_dev_args_init(&args); 1313 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; 1314 args.mda_devsw = &zvol_cdevsw; 1315 args.mda_cr = NULL; 1316 args.mda_uid = UID_ROOT; 1317 args.mda_gid = GID_OPERATOR; 1318 args.mda_mode = 0640; 1319 args.mda_si_drv2 = zv; 1320 if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) 1321 == 0) { 1322 #if __FreeBSD_version > 1300130 1323 dev->si_iosize_max = maxphys; 1324 #else 1325 dev->si_iosize_max = MAXPHYS; 1326 #endif 1327 zsd->zsd_cdev = dev; 1328 } 1329 } 1330 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1331 } 1332 1333 /* 1334 * Remove minor node for the specified volume. 
1335 */ 1336 void 1337 zvol_os_free(zvol_state_t *zv) 1338 { 1339 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1340 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1341 ASSERT0(zv->zv_open_count); 1342 1343 ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); 1344 1345 rw_destroy(&zv->zv_suspend_lock); 1346 zfs_rangelock_fini(&zv->zv_rangelock); 1347 1348 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1349 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1350 struct g_provider *pp __maybe_unused = zsg->zsg_provider; 1351 1352 ASSERT3P(pp->private, ==, NULL); 1353 1354 g_topology_lock(); 1355 zvol_geom_destroy(zv); 1356 g_topology_unlock(); 1357 mtx_destroy(&zsg->zsg_queue_mtx); 1358 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1359 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1360 struct cdev *dev = zsd->zsd_cdev; 1361 1362 if (dev != NULL) { 1363 ASSERT3P(dev->si_drv2, ==, NULL); 1364 destroy_dev(dev); 1365 knlist_clear(&zsd->zsd_selinfo.si_note, 0); 1366 knlist_destroy(&zsd->zsd_selinfo.si_note); 1367 } 1368 } 1369 1370 mutex_destroy(&zv->zv_state_lock); 1371 dataset_kstats_destroy(&zv->zv_kstat); 1372 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1373 kmem_free(zv, sizeof (zvol_state_t)); 1374 zvol_minors--; 1375 } 1376 1377 /* 1378 * Create a minor node (plus a whole lot more) for the specified volume. 1379 */ 1380 int 1381 zvol_os_create_minor(const char *name) 1382 { 1383 zvol_state_t *zv; 1384 objset_t *os; 1385 dmu_object_info_t *doi; 1386 uint64_t volsize; 1387 uint64_t volmode, hash; 1388 int error; 1389 bool replayed_zil = B_FALSE; 1390 1391 ZFS_LOG(1, "Creating ZVOL %s...", name); 1392 hash = zvol_name_hash(name); 1393 if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { 1394 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1395 mutex_exit(&zv->zv_state_lock); 1396 return (SET_ERROR(EEXIST)); 1397 } 1398 1399 DROP_GIANT(); 1400 1401 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1402 1403 /* Lie and say we're read-only. 
*/ 1404 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1405 if (error) 1406 goto out_doi; 1407 1408 error = dmu_object_info(os, ZVOL_OBJ, doi); 1409 if (error) 1410 goto out_dmu_objset_disown; 1411 1412 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1413 if (error) 1414 goto out_dmu_objset_disown; 1415 1416 error = dsl_prop_get_integer(name, 1417 zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); 1418 if (error || volmode == ZFS_VOLMODE_DEFAULT) 1419 volmode = zvol_volmode; 1420 error = 0; 1421 1422 /* 1423 * zvol_alloc equivalent ... 1424 */ 1425 zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); 1426 zv->zv_hash = hash; 1427 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1428 zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1429 zv->zv_volmode = volmode; 1430 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1431 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1432 struct g_provider *pp; 1433 struct g_geom *gp; 1434 1435 zsg->zsg_state = ZVOL_GEOM_UNINIT; 1436 mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); 1437 1438 g_topology_lock(); 1439 gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); 1440 gp->start = zvol_geom_bio_start; 1441 gp->access = zvol_geom_access; 1442 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); 1443 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; 1444 pp->sectorsize = DEV_BSIZE; 1445 pp->mediasize = 0; 1446 pp->private = zv; 1447 1448 zsg->zsg_provider = pp; 1449 bioq_init(&zsg->zsg_queue); 1450 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1451 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1452 struct cdev *dev; 1453 struct make_dev_args args; 1454 1455 make_dev_args_init(&args); 1456 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; 1457 args.mda_devsw = &zvol_cdevsw; 1458 args.mda_cr = NULL; 1459 args.mda_uid = UID_ROOT; 1460 args.mda_gid = GID_OPERATOR; 1461 args.mda_mode = 0640; 1462 args.mda_si_drv2 = zv; 1463 if (make_dev_s(&args, &dev, 
"%s/%s", ZVOL_DRIVER, name) 1464 == 0) { 1465 #if __FreeBSD_version > 1300130 1466 dev->si_iosize_max = maxphys; 1467 #else 1468 dev->si_iosize_max = MAXPHYS; 1469 #endif 1470 zsd->zsd_cdev = dev; 1471 knlist_init_sx(&zsd->zsd_selinfo.si_note, 1472 &zv->zv_state_lock); 1473 } 1474 } 1475 (void) strlcpy(zv->zv_name, name, MAXPATHLEN); 1476 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1477 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1478 1479 if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) 1480 zv->zv_flags |= ZVOL_RDONLY; 1481 1482 zv->zv_volblocksize = doi->doi_data_block_size; 1483 zv->zv_volsize = volsize; 1484 zv->zv_objset = os; 1485 1486 ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); 1487 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1488 if (error) 1489 goto out_dmu_objset_disown; 1490 ASSERT3P(zv->zv_zilog, ==, NULL); 1491 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1492 if (spa_writeable(dmu_objset_spa(os))) { 1493 if (zil_replay_disable) 1494 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1495 else 1496 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1497 } 1498 if (replayed_zil) 1499 zil_close(zv->zv_zilog); 1500 zv->zv_zilog = NULL; 1501 1502 /* TODO: prefetch for geom tasting */ 1503 1504 zv->zv_objset = NULL; 1505 out_dmu_objset_disown: 1506 dmu_objset_disown(os, B_TRUE, FTAG); 1507 1508 if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { 1509 zvol_geom_run(zv); 1510 g_topology_unlock(); 1511 } 1512 out_doi: 1513 kmem_free(doi, sizeof (dmu_object_info_t)); 1514 if (error == 0) { 1515 rw_enter(&zvol_state_lock, RW_WRITER); 1516 zvol_insert(zv); 1517 zvol_minors++; 1518 rw_exit(&zvol_state_lock); 1519 ZFS_LOG(1, "ZVOL %s created.", name); 1520 } 1521 PICKUP_GIANT(); 1522 return (error); 1523 } 1524 1525 void 1526 zvol_os_clear_private(zvol_state_t *zv) 1527 { 1528 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1529 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1530 struct 
zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1531 struct g_provider *pp = zsg->zsg_provider; 1532 1533 if (pp->private == NULL) /* already cleared */ 1534 return; 1535 1536 mtx_lock(&zsg->zsg_queue_mtx); 1537 zsg->zsg_state = ZVOL_GEOM_STOPPED; 1538 pp->private = NULL; 1539 wakeup_one(&zsg->zsg_queue); 1540 while (zsg->zsg_state != ZVOL_GEOM_RUNNING) 1541 msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, 1542 0, "zvol:w", 0); 1543 mtx_unlock(&zsg->zsg_queue_mtx); 1544 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1545 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1546 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1547 struct cdev *dev = zsd->zsd_cdev; 1548 1549 if (dev != NULL) 1550 dev->si_drv2 = NULL; 1551 } 1552 } 1553 1554 int 1555 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 1556 { 1557 zv->zv_volsize = volsize; 1558 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1559 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1560 struct g_provider *pp = zsg->zsg_provider; 1561 1562 g_topology_lock(); 1563 1564 if (pp->private == NULL) { 1565 g_topology_unlock(); 1566 return (SET_ERROR(ENXIO)); 1567 } 1568 1569 /* 1570 * Do not invoke resize event when initial size was zero. 1571 * ZVOL initializes the size on first open, this is not 1572 * real resizing. 1573 */ 1574 if (pp->mediasize == 0) 1575 pp->mediasize = zv->zv_volsize; 1576 else 1577 g_resize_provider(pp, zv->zv_volsize); 1578 1579 g_topology_unlock(); 1580 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1581 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1582 1583 KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB); 1584 } 1585 return (0); 1586 } 1587 1588 void 1589 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1590 { 1591 // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); 1592 } 1593 1594 void 1595 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1596 { 1597 // XXX? 
set_capacity(zv->zv_zso->zvo_disk, capacity); 1598 } 1599 1600 /* 1601 * Public interfaces 1602 */ 1603 1604 int 1605 zvol_busy(void) 1606 { 1607 return (zvol_minors != 0); 1608 } 1609 1610 int 1611 zvol_init(void) 1612 { 1613 zvol_init_impl(); 1614 return (0); 1615 } 1616 1617 void 1618 zvol_fini(void) 1619 { 1620 zvol_fini_impl(); 1621 } 1622