1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 25 * All rights reserved. 26 * 27 * Portions Copyright 2010 Robert Milkowski 28 * 29 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 30 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 31 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 32 * Copyright (c) 2014 Integros [integros.com] 33 */ 34 35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ 36 37 /* 38 * ZFS volume emulation driver. 39 * 40 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. 41 * Volumes are accessed through the symbolic links named: 42 * 43 * /dev/zvol/<pool_name>/<dataset_name> 44 * 45 * Volumes are persistent through reboot. No user command needs to be 46 * run before opening and using a device. 47 * 48 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device 49 * in the system. Except when they're simply character devices (volmode=dev). 
50 */ 51 52 #include <sys/types.h> 53 #include <sys/param.h> 54 #include <sys/kernel.h> 55 #include <sys/errno.h> 56 #include <sys/uio.h> 57 #include <sys/bio.h> 58 #include <sys/buf.h> 59 #include <sys/kmem.h> 60 #include <sys/conf.h> 61 #include <sys/cmn_err.h> 62 #include <sys/stat.h> 63 #include <sys/proc.h> 64 #include <sys/zap.h> 65 #include <sys/spa.h> 66 #include <sys/spa_impl.h> 67 #include <sys/zio.h> 68 #include <sys/disk.h> 69 #include <sys/dmu_traverse.h> 70 #include <sys/dnode.h> 71 #include <sys/dsl_dataset.h> 72 #include <sys/dsl_prop.h> 73 #include <sys/dsl_dir.h> 74 #include <sys/byteorder.h> 75 #include <sys/sunddi.h> 76 #include <sys/dirent.h> 77 #include <sys/policy.h> 78 #include <sys/queue.h> 79 #include <sys/fs/zfs.h> 80 #include <sys/zfs_ioctl.h> 81 #include <sys/zil.h> 82 #include <sys/zfs_znode.h> 83 #include <sys/zfs_rlock.h> 84 #include <sys/vdev_impl.h> 85 #include <sys/vdev_raidz.h> 86 #include <sys/zvol.h> 87 #include <sys/zil_impl.h> 88 #include <sys/dataset_kstats.h> 89 #include <sys/dbuf.h> 90 #include <sys/dmu_tx.h> 91 #include <sys/zfeature.h> 92 #include <sys/zio_checksum.h> 93 #include <sys/zil_impl.h> 94 #include <sys/filio.h> 95 #include <sys/freebsd_event.h> 96 97 #include <geom/geom.h> 98 #include <sys/zvol.h> 99 #include <sys/zvol_impl.h> 100 101 #include "zfs_namecheck.h" 102 103 #define ZVOL_DUMPSIZE "dumpsize" 104 105 #ifdef ZVOL_LOCK_DEBUG 106 #define ZVOL_RW_READER RW_WRITER 107 #define ZVOL_RW_READ_HELD RW_WRITE_HELD 108 #else 109 #define ZVOL_RW_READER RW_READER 110 #define ZVOL_RW_READ_HELD RW_READ_HELD 111 #endif 112 113 enum zvol_geom_state { 114 ZVOL_GEOM_UNINIT, 115 ZVOL_GEOM_STOPPED, 116 ZVOL_GEOM_RUNNING, 117 }; 118 119 struct zvol_state_os { 120 #define zso_dev _zso_state._zso_dev 121 #define zso_geom _zso_state._zso_geom 122 union { 123 /* volmode=dev */ 124 struct zvol_state_dev { 125 struct cdev *zsd_cdev; 126 uint64_t zsd_sync_cnt; 127 struct selinfo zsd_selinfo; 128 } _zso_dev; 129 130 /* volmode=geom 
*/ 131 struct zvol_state_geom { 132 struct g_provider *zsg_provider; 133 struct bio_queue_head zsg_queue; 134 struct mtx zsg_queue_mtx; 135 enum zvol_geom_state zsg_state; 136 } _zso_geom; 137 } _zso_state; 138 int zso_dying; 139 }; 140 141 static uint32_t zvol_minors; 142 143 SYSCTL_DECL(_vfs_zfs); 144 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); 145 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, 146 "Expose as GEOM providers (1), device files (2) or neither"); 147 static boolean_t zpool_on_zvol = B_FALSE; 148 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, 149 "Allow zpools to use zvols as vdevs (DANGEROUS)"); 150 151 /* 152 * Toggle unmap functionality. 153 */ 154 boolean_t zvol_unmap_enabled = B_TRUE; 155 156 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, 157 &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); 158 159 /* 160 * zvol maximum transfer in one DMU tx. 161 */ 162 int zvol_maxphys = DMU_MAX_ACCESS / 2; 163 164 static void zvol_ensure_zilog(zvol_state_t *zv); 165 166 static d_open_t zvol_cdev_open; 167 static d_close_t zvol_cdev_close; 168 static d_ioctl_t zvol_cdev_ioctl; 169 static d_read_t zvol_cdev_read; 170 static d_write_t zvol_cdev_write; 171 static d_strategy_t zvol_geom_bio_strategy; 172 static d_kqfilter_t zvol_cdev_kqfilter; 173 174 static struct cdevsw zvol_cdevsw = { 175 .d_name = "zvol", 176 .d_version = D_VERSION, 177 .d_flags = D_DISK | D_TRACKCLOSE, 178 .d_open = zvol_cdev_open, 179 .d_close = zvol_cdev_close, 180 .d_ioctl = zvol_cdev_ioctl, 181 .d_read = zvol_cdev_read, 182 .d_write = zvol_cdev_write, 183 .d_strategy = zvol_geom_bio_strategy, 184 .d_kqfilter = zvol_cdev_kqfilter, 185 }; 186 187 static void zvol_filter_detach(struct knote *kn); 188 static int zvol_filter_vnode(struct knote *kn, long hint); 189 190 static struct filterops zvol_filterops_vnode = { 191 .f_isfd = 1, 192 .f_detach = zvol_filter_detach, 193 .f_event = 
zvol_filter_vnode, 194 }; 195 196 extern uint_t zfs_geom_probe_vdev_key; 197 198 struct g_class zfs_zvol_class = { 199 .name = "ZFS::ZVOL", 200 .version = G_VERSION, 201 }; 202 203 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); 204 205 static int zvol_geom_open(struct g_provider *pp, int flag, int count); 206 static int zvol_geom_close(struct g_provider *pp, int flag, int count); 207 static void zvol_geom_run(zvol_state_t *zv); 208 static void zvol_geom_destroy(zvol_state_t *zv); 209 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); 210 static void zvol_geom_worker(void *arg); 211 static void zvol_geom_bio_start(struct bio *bp); 212 static int zvol_geom_bio_getattr(struct bio *bp); 213 /* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ 214 215 /* 216 * GEOM mode implementation 217 */ 218 219 static int 220 zvol_geom_open(struct g_provider *pp, int flag, int count) 221 { 222 zvol_state_t *zv; 223 int err = 0; 224 boolean_t drop_suspend = B_FALSE; 225 226 if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { 227 /* 228 * If zfs_geom_probe_vdev_key is set, that means that zfs is 229 * attempting to probe geom providers while looking for a 230 * replacement for a missing VDEV. In this case, the 231 * spa_namespace_lock will not be held, but it is still illegal 232 * to use a zvol as a vdev. Deadlocks can result if another 233 * thread has spa_namespace_lock. 234 */ 235 return (SET_ERROR(EOPNOTSUPP)); 236 } 237 238 retry: 239 rw_enter(&zvol_state_lock, ZVOL_RW_READER); 240 /* 241 * Obtain a copy of private under zvol_state_lock to make sure either 242 * the result of zvol free code setting private to NULL is observed, 243 * or the zv is protected from being freed because of the positive 244 * zv_open_count. 
245 */ 246 zv = pp->private; 247 if (zv == NULL) { 248 rw_exit(&zvol_state_lock); 249 err = SET_ERROR(ENXIO); 250 goto out_locked; 251 } 252 253 mutex_enter(&zv->zv_state_lock); 254 if (zv->zv_zso->zso_dying) { 255 rw_exit(&zvol_state_lock); 256 err = SET_ERROR(ENXIO); 257 goto out_zv_locked; 258 } 259 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 260 261 /* 262 * Make sure zvol is not suspended during first open 263 * (hold zv_suspend_lock) and respect proper lock acquisition 264 * ordering - zv_suspend_lock before zv_state_lock. 265 */ 266 if (zv->zv_open_count == 0) { 267 drop_suspend = B_TRUE; 268 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 269 mutex_exit(&zv->zv_state_lock); 270 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 271 mutex_enter(&zv->zv_state_lock); 272 /* Check to see if zv_suspend_lock is needed. */ 273 if (zv->zv_open_count != 0) { 274 rw_exit(&zv->zv_suspend_lock); 275 drop_suspend = B_FALSE; 276 } 277 } 278 } 279 rw_exit(&zvol_state_lock); 280 281 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 282 283 if (zv->zv_open_count == 0) { 284 boolean_t drop_namespace = B_FALSE; 285 286 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 287 288 /* 289 * Take spa_namespace_lock to prevent lock inversion when 290 * zvols from one pool are opened as vdevs in another. 291 */ 292 if (!mutex_owned(&spa_namespace_lock)) { 293 if (!mutex_tryenter(&spa_namespace_lock)) { 294 mutex_exit(&zv->zv_state_lock); 295 rw_exit(&zv->zv_suspend_lock); 296 kern_yield(PRI_USER); 297 goto retry; 298 } else { 299 drop_namespace = B_TRUE; 300 } 301 } 302 err = zvol_first_open(zv, !(flag & FWRITE)); 303 if (drop_namespace) 304 mutex_exit(&spa_namespace_lock); 305 if (err) 306 goto out_zv_locked; 307 pp->mediasize = zv->zv_volsize; 308 pp->stripeoffset = 0; 309 pp->stripesize = zv->zv_volblocksize; 310 } 311 312 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 313 314 /* 315 * Check for a bad on-disk format version now since we 316 * lied about owning the dataset readonly before. 
317 */ 318 if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || 319 dmu_objset_incompatible_encryption_version(zv->zv_objset))) { 320 err = SET_ERROR(EROFS); 321 goto out_opened; 322 } 323 if (zv->zv_flags & ZVOL_EXCL) { 324 err = SET_ERROR(EBUSY); 325 goto out_opened; 326 } 327 if (flag & O_EXCL) { 328 if (zv->zv_open_count != 0) { 329 err = SET_ERROR(EBUSY); 330 goto out_opened; 331 } 332 zv->zv_flags |= ZVOL_EXCL; 333 } 334 335 zv->zv_open_count += count; 336 out_opened: 337 if (zv->zv_open_count == 0) { 338 zvol_last_close(zv); 339 wakeup(zv); 340 } 341 out_zv_locked: 342 mutex_exit(&zv->zv_state_lock); 343 out_locked: 344 if (drop_suspend) 345 rw_exit(&zv->zv_suspend_lock); 346 return (err); 347 } 348 349 static int 350 zvol_geom_close(struct g_provider *pp, int flag, int count) 351 { 352 (void) flag; 353 zvol_state_t *zv; 354 boolean_t drop_suspend = B_TRUE; 355 int new_open_count; 356 357 rw_enter(&zvol_state_lock, ZVOL_RW_READER); 358 zv = pp->private; 359 if (zv == NULL) { 360 rw_exit(&zvol_state_lock); 361 return (SET_ERROR(ENXIO)); 362 } 363 364 mutex_enter(&zv->zv_state_lock); 365 if (zv->zv_flags & ZVOL_EXCL) { 366 ASSERT3U(zv->zv_open_count, ==, 1); 367 zv->zv_flags &= ~ZVOL_EXCL; 368 } 369 370 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 371 372 /* 373 * If the open count is zero, this is a spurious close. 374 * That indicates a bug in the kernel / DDI framework. 375 */ 376 ASSERT3U(zv->zv_open_count, >, 0); 377 378 /* 379 * Make sure zvol is not suspended during last close 380 * (hold zv_suspend_lock) and respect proper lock acquisition 381 * ordering - zv_suspend_lock before zv_state_lock. 382 */ 383 new_open_count = zv->zv_open_count - count; 384 if (new_open_count == 0) { 385 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 386 mutex_exit(&zv->zv_state_lock); 387 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 388 mutex_enter(&zv->zv_state_lock); 389 /* Check to see if zv_suspend_lock is needed. 
*/ 390 new_open_count = zv->zv_open_count - count; 391 if (new_open_count != 0) { 392 rw_exit(&zv->zv_suspend_lock); 393 drop_suspend = B_FALSE; 394 } 395 } 396 } else { 397 drop_suspend = B_FALSE; 398 } 399 rw_exit(&zvol_state_lock); 400 401 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 402 403 /* 404 * You may get multiple opens, but only one close. 405 */ 406 zv->zv_open_count = new_open_count; 407 if (zv->zv_open_count == 0) { 408 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 409 zvol_last_close(zv); 410 wakeup(zv); 411 } 412 413 mutex_exit(&zv->zv_state_lock); 414 415 if (drop_suspend) 416 rw_exit(&zv->zv_suspend_lock); 417 return (0); 418 } 419 420 static void 421 zvol_geom_run(zvol_state_t *zv) 422 { 423 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 424 struct g_provider *pp = zsg->zsg_provider; 425 426 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 427 428 g_error_provider(pp, 0); 429 430 kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, 431 "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); 432 } 433 434 static void 435 zvol_geom_destroy(zvol_state_t *zv) 436 { 437 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 438 struct g_provider *pp = zsg->zsg_provider; 439 440 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 441 442 g_topology_assert(); 443 444 mutex_enter(&zv->zv_state_lock); 445 VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING); 446 mutex_exit(&zv->zv_state_lock); 447 zsg->zsg_provider = NULL; 448 g_wither_geom(pp->geom, ENXIO); 449 } 450 451 void 452 zvol_wait_close(zvol_state_t *zv) 453 { 454 455 if (zv->zv_volmode != ZFS_VOLMODE_GEOM) 456 return; 457 mutex_enter(&zv->zv_state_lock); 458 zv->zv_zso->zso_dying = B_TRUE; 459 460 if (zv->zv_open_count) 461 msleep(zv, &zv->zv_state_lock, 462 PRIBIO, "zvol:dying", 10*hz); 463 mutex_exit(&zv->zv_state_lock); 464 } 465 466 467 static int 468 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) 469 { 470 int count, error, flags; 471 472 g_topology_assert(); 473 
474 /* 475 * To make it easier we expect either open or close, but not both 476 * at the same time. 477 */ 478 KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || 479 (acr <= 0 && acw <= 0 && ace <= 0), 480 ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", 481 pp->name, acr, acw, ace)); 482 483 if (pp->private == NULL) { 484 if (acr <= 0 && acw <= 0 && ace <= 0) 485 return (0); 486 return (pp->error); 487 } 488 489 /* 490 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if 491 * ace != 0, because GEOM already handles that and handles it a bit 492 * differently. GEOM allows for multiple read/exclusive consumers and 493 * ZFS allows only one exclusive consumer, no matter if it is reader or 494 * writer. I like better the way GEOM works so I'll leave it for GEOM 495 * to decide what to do. 496 */ 497 498 count = acr + acw + ace; 499 if (count == 0) 500 return (0); 501 502 flags = 0; 503 if (acr != 0 || ace != 0) 504 flags |= FREAD; 505 if (acw != 0) 506 flags |= FWRITE; 507 508 g_topology_unlock(); 509 if (count > 0) 510 error = zvol_geom_open(pp, flags, count); 511 else 512 error = zvol_geom_close(pp, flags, -count); 513 g_topology_lock(); 514 return (error); 515 } 516 517 static void 518 zvol_geom_worker(void *arg) 519 { 520 zvol_state_t *zv = arg; 521 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 522 struct bio *bp; 523 524 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 525 526 thread_lock(curthread); 527 sched_prio(curthread, PRIBIO); 528 thread_unlock(curthread); 529 530 for (;;) { 531 mtx_lock(&zsg->zsg_queue_mtx); 532 bp = bioq_takefirst(&zsg->zsg_queue); 533 if (bp == NULL) { 534 if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { 535 zsg->zsg_state = ZVOL_GEOM_RUNNING; 536 wakeup(&zsg->zsg_state); 537 mtx_unlock(&zsg->zsg_queue_mtx); 538 kthread_exit(); 539 } 540 msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, 541 PRIBIO | PDROP, "zvol:io", 0); 542 continue; 543 } 544 mtx_unlock(&zsg->zsg_queue_mtx); 545 zvol_geom_bio_strategy(bp); 546 } 
547 } 548 549 static void 550 zvol_geom_bio_start(struct bio *bp) 551 { 552 zvol_state_t *zv = bp->bio_to->private; 553 struct zvol_state_geom *zsg; 554 boolean_t first; 555 556 if (zv == NULL) { 557 g_io_deliver(bp, ENXIO); 558 return; 559 } 560 if (bp->bio_cmd == BIO_GETATTR) { 561 if (zvol_geom_bio_getattr(bp)) 562 g_io_deliver(bp, EOPNOTSUPP); 563 return; 564 } 565 566 if (!THREAD_CAN_SLEEP()) { 567 zsg = &zv->zv_zso->zso_geom; 568 mtx_lock(&zsg->zsg_queue_mtx); 569 first = (bioq_first(&zsg->zsg_queue) == NULL); 570 bioq_insert_tail(&zsg->zsg_queue, bp); 571 mtx_unlock(&zsg->zsg_queue_mtx); 572 if (first) 573 wakeup_one(&zsg->zsg_queue); 574 return; 575 } 576 577 zvol_geom_bio_strategy(bp); 578 } 579 580 static int 581 zvol_geom_bio_getattr(struct bio *bp) 582 { 583 zvol_state_t *zv; 584 585 zv = bp->bio_to->private; 586 ASSERT3P(zv, !=, NULL); 587 588 spa_t *spa = dmu_objset_spa(zv->zv_objset); 589 uint64_t refd, avail, usedobjs, availobjs; 590 591 if (g_handleattr_int(bp, "GEOM::candelete", 1)) 592 return (0); 593 if (strcmp(bp->bio_attribute, "blocksavail") == 0) { 594 dmu_objset_space(zv->zv_objset, &refd, &avail, 595 &usedobjs, &availobjs); 596 if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) 597 return (0); 598 } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { 599 dmu_objset_space(zv->zv_objset, &refd, &avail, 600 &usedobjs, &availobjs); 601 if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) 602 return (0); 603 } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { 604 avail = metaslab_class_get_space(spa_normal_class(spa)); 605 avail -= metaslab_class_get_alloc(spa_normal_class(spa)); 606 if (g_handleattr_off_t(bp, "poolblocksavail", 607 avail / DEV_BSIZE)) 608 return (0); 609 } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { 610 refd = metaslab_class_get_alloc(spa_normal_class(spa)); 611 if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) 612 return (0); 613 } 614 return (1); 615 } 616 617 
static void 618 zvol_filter_detach(struct knote *kn) 619 { 620 zvol_state_t *zv; 621 struct zvol_state_dev *zsd; 622 623 zv = kn->kn_hook; 624 zsd = &zv->zv_zso->zso_dev; 625 626 knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0); 627 } 628 629 static int 630 zvol_filter_vnode(struct knote *kn, long hint) 631 { 632 kn->kn_fflags |= kn->kn_sfflags & hint; 633 634 return (kn->kn_fflags != 0); 635 } 636 637 static int 638 zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn) 639 { 640 zvol_state_t *zv; 641 struct zvol_state_dev *zsd; 642 643 zv = dev->si_drv2; 644 zsd = &zv->zv_zso->zso_dev; 645 646 if (kn->kn_filter != EVFILT_VNODE) 647 return (EINVAL); 648 649 /* XXX: extend support for other NOTE_* events */ 650 if (kn->kn_sfflags != NOTE_ATTRIB) 651 return (EINVAL); 652 653 kn->kn_fop = &zvol_filterops_vnode; 654 kn->kn_hook = zv; 655 knlist_add(&zsd->zsd_selinfo.si_note, kn, 0); 656 657 return (0); 658 } 659 660 static void 661 zvol_geom_bio_strategy(struct bio *bp) 662 { 663 zvol_state_t *zv; 664 uint64_t off, volsize; 665 size_t resid; 666 char *addr; 667 objset_t *os; 668 zfs_locked_range_t *lr; 669 int error = 0; 670 boolean_t doread = B_FALSE; 671 boolean_t is_dumpified; 672 boolean_t sync; 673 674 if (bp->bio_to) 675 zv = bp->bio_to->private; 676 else 677 zv = bp->bio_dev->si_drv2; 678 679 if (zv == NULL) { 680 error = SET_ERROR(ENXIO); 681 goto out; 682 } 683 684 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 685 686 switch (bp->bio_cmd) { 687 case BIO_READ: 688 doread = B_TRUE; 689 break; 690 case BIO_WRITE: 691 case BIO_FLUSH: 692 case BIO_DELETE: 693 if (zv->zv_flags & ZVOL_RDONLY) { 694 error = SET_ERROR(EROFS); 695 goto resume; 696 } 697 zvol_ensure_zilog(zv); 698 if (bp->bio_cmd == BIO_FLUSH) 699 goto sync; 700 break; 701 default: 702 error = SET_ERROR(EOPNOTSUPP); 703 goto resume; 704 } 705 706 off = bp->bio_offset; 707 volsize = zv->zv_volsize; 708 709 os = zv->zv_objset; 710 ASSERT3P(os, !=, NULL); 711 712 addr = bp->bio_data; 713 resid = 
bp->bio_length; 714 715 if (resid > 0 && off >= volsize) { 716 error = SET_ERROR(EIO); 717 goto resume; 718 } 719 720 is_dumpified = B_FALSE; 721 sync = !doread && !is_dumpified && 722 zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 723 724 /* 725 * There must be no buffer changes when doing a dmu_sync() because 726 * we can't change the data whilst calculating the checksum. 727 */ 728 lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, 729 doread ? RL_READER : RL_WRITER); 730 731 if (bp->bio_cmd == BIO_DELETE) { 732 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 733 error = dmu_tx_assign(tx, TXG_WAIT); 734 if (error != 0) { 735 dmu_tx_abort(tx); 736 } else { 737 zvol_log_truncate(zv, tx, off, resid, sync); 738 dmu_tx_commit(tx); 739 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 740 off, resid); 741 resid = 0; 742 } 743 goto unlock; 744 } 745 while (resid != 0 && off < volsize) { 746 size_t size = MIN(resid, zvol_maxphys); 747 if (doread) { 748 error = dmu_read(os, ZVOL_OBJ, off, size, addr, 749 DMU_READ_PREFETCH); 750 } else { 751 dmu_tx_t *tx = dmu_tx_create(os); 752 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); 753 error = dmu_tx_assign(tx, TXG_WAIT); 754 if (error) { 755 dmu_tx_abort(tx); 756 } else { 757 dmu_write(os, ZVOL_OBJ, off, size, addr, tx); 758 zvol_log_write(zv, tx, off, size, sync); 759 dmu_tx_commit(tx); 760 } 761 } 762 if (error) { 763 /* Convert checksum errors into IO errors. 
*/ 764 if (error == ECKSUM) 765 error = SET_ERROR(EIO); 766 break; 767 } 768 off += size; 769 addr += size; 770 resid -= size; 771 } 772 unlock: 773 zfs_rangelock_exit(lr); 774 775 bp->bio_completed = bp->bio_length - resid; 776 if (bp->bio_completed < bp->bio_length && off > volsize) 777 error = SET_ERROR(EINVAL); 778 779 switch (bp->bio_cmd) { 780 case BIO_FLUSH: 781 break; 782 case BIO_READ: 783 dataset_kstats_update_read_kstats(&zv->zv_kstat, 784 bp->bio_completed); 785 break; 786 case BIO_WRITE: 787 dataset_kstats_update_write_kstats(&zv->zv_kstat, 788 bp->bio_completed); 789 break; 790 case BIO_DELETE: 791 break; 792 default: 793 break; 794 } 795 796 if (sync) { 797 sync: 798 zil_commit(zv->zv_zilog, ZVOL_OBJ); 799 } 800 resume: 801 rw_exit(&zv->zv_suspend_lock); 802 out: 803 if (bp->bio_to) 804 g_io_deliver(bp, error); 805 else 806 biofinish(bp, NULL, error); 807 } 808 809 /* 810 * Character device mode implementation 811 */ 812 813 static int 814 zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) 815 { 816 zvol_state_t *zv; 817 uint64_t volsize; 818 zfs_locked_range_t *lr; 819 int error = 0; 820 zfs_uio_t uio; 821 822 zfs_uio_init(&uio, uio_s); 823 824 zv = dev->si_drv2; 825 826 volsize = zv->zv_volsize; 827 /* 828 * uio_loffset == volsize isn't an error as 829 * it's required for EOF processing. 830 */ 831 if (zfs_uio_resid(&uio) > 0 && 832 (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) 833 return (SET_ERROR(EIO)); 834 835 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 836 ssize_t start_resid = zfs_uio_resid(&uio); 837 lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), 838 zfs_uio_resid(&uio), RL_READER); 839 while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { 840 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); 841 842 /* Don't read past the end. 
*/ 843 if (bytes > volsize - zfs_uio_offset(&uio)) 844 bytes = volsize - zfs_uio_offset(&uio); 845 846 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); 847 if (error) { 848 /* Convert checksum errors into IO errors. */ 849 if (error == ECKSUM) 850 error = SET_ERROR(EIO); 851 break; 852 } 853 } 854 zfs_rangelock_exit(lr); 855 int64_t nread = start_resid - zfs_uio_resid(&uio); 856 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 857 rw_exit(&zv->zv_suspend_lock); 858 859 return (error); 860 } 861 862 static int 863 zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) 864 { 865 zvol_state_t *zv; 866 uint64_t volsize; 867 zfs_locked_range_t *lr; 868 int error = 0; 869 boolean_t sync; 870 zfs_uio_t uio; 871 872 zv = dev->si_drv2; 873 874 volsize = zv->zv_volsize; 875 876 zfs_uio_init(&uio, uio_s); 877 878 if (zfs_uio_resid(&uio) > 0 && 879 (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) 880 return (SET_ERROR(EIO)); 881 882 ssize_t start_resid = zfs_uio_resid(&uio); 883 sync = (ioflag & IO_SYNC) || 884 (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); 885 886 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 887 zvol_ensure_zilog(zv); 888 889 lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), 890 zfs_uio_resid(&uio), RL_WRITER); 891 while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { 892 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); 893 uint64_t off = zfs_uio_offset(&uio); 894 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 895 896 if (bytes > volsize - off) /* Don't write past the end. 
*/ 897 bytes = volsize - off; 898 899 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 900 error = dmu_tx_assign(tx, TXG_WAIT); 901 if (error) { 902 dmu_tx_abort(tx); 903 break; 904 } 905 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); 906 if (error == 0) 907 zvol_log_write(zv, tx, off, bytes, sync); 908 dmu_tx_commit(tx); 909 910 if (error) 911 break; 912 } 913 zfs_rangelock_exit(lr); 914 int64_t nwritten = start_resid - zfs_uio_resid(&uio); 915 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 916 if (sync) 917 zil_commit(zv->zv_zilog, ZVOL_OBJ); 918 rw_exit(&zv->zv_suspend_lock); 919 return (error); 920 } 921 922 static int 923 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) 924 { 925 zvol_state_t *zv; 926 struct zvol_state_dev *zsd; 927 int err = 0; 928 boolean_t drop_suspend = B_FALSE; 929 930 retry: 931 rw_enter(&zvol_state_lock, ZVOL_RW_READER); 932 /* 933 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either 934 * the result of zvol free code setting si_drv2 to NULL is observed, 935 * or the zv is protected from being freed because of the positive 936 * zv_open_count. 937 */ 938 zv = dev->si_drv2; 939 if (zv == NULL) { 940 rw_exit(&zvol_state_lock); 941 err = SET_ERROR(ENXIO); 942 goto out_locked; 943 } 944 945 mutex_enter(&zv->zv_state_lock); 946 if (zv->zv_zso->zso_dying) { 947 rw_exit(&zvol_state_lock); 948 err = SET_ERROR(ENXIO); 949 goto out_zv_locked; 950 } 951 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); 952 953 /* 954 * Make sure zvol is not suspended during first open 955 * (hold zv_suspend_lock) and respect proper lock acquisition 956 * ordering - zv_suspend_lock before zv_state_lock. 
957 */ 958 if (zv->zv_open_count == 0) { 959 drop_suspend = B_TRUE; 960 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 961 mutex_exit(&zv->zv_state_lock); 962 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 963 mutex_enter(&zv->zv_state_lock); 964 /* Check to see if zv_suspend_lock is needed. */ 965 if (zv->zv_open_count != 0) { 966 rw_exit(&zv->zv_suspend_lock); 967 drop_suspend = B_FALSE; 968 } 969 } 970 } 971 rw_exit(&zvol_state_lock); 972 973 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 974 975 if (zv->zv_open_count == 0) { 976 boolean_t drop_namespace = B_FALSE; 977 978 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 979 980 /* 981 * Take spa_namespace_lock to prevent lock inversion when 982 * zvols from one pool are opened as vdevs in another. 983 */ 984 if (!mutex_owned(&spa_namespace_lock)) { 985 if (!mutex_tryenter(&spa_namespace_lock)) { 986 mutex_exit(&zv->zv_state_lock); 987 rw_exit(&zv->zv_suspend_lock); 988 kern_yield(PRI_USER); 989 goto retry; 990 } else { 991 drop_namespace = B_TRUE; 992 } 993 } 994 err = zvol_first_open(zv, !(flags & FWRITE)); 995 if (drop_namespace) 996 mutex_exit(&spa_namespace_lock); 997 if (err) 998 goto out_zv_locked; 999 } 1000 1001 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1002 1003 if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { 1004 err = SET_ERROR(EROFS); 1005 goto out_opened; 1006 } 1007 if (zv->zv_flags & ZVOL_EXCL) { 1008 err = SET_ERROR(EBUSY); 1009 goto out_opened; 1010 } 1011 if (flags & O_EXCL) { 1012 if (zv->zv_open_count != 0) { 1013 err = SET_ERROR(EBUSY); 1014 goto out_opened; 1015 } 1016 zv->zv_flags |= ZVOL_EXCL; 1017 } 1018 1019 zv->zv_open_count++; 1020 if (flags & O_SYNC) { 1021 zsd = &zv->zv_zso->zso_dev; 1022 zsd->zsd_sync_cnt++; 1023 if (zsd->zsd_sync_cnt == 1 && 1024 (zv->zv_flags & ZVOL_WRITTEN_TO) != 0) 1025 zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); 1026 } 1027 out_opened: 1028 if (zv->zv_open_count == 0) { 1029 zvol_last_close(zv); 1030 wakeup(zv); 1031 } 1032 out_zv_locked: 1033 
mutex_exit(&zv->zv_state_lock); 1034 out_locked: 1035 if (drop_suspend) 1036 rw_exit(&zv->zv_suspend_lock); 1037 return (err); 1038 } 1039 1040 static int 1041 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) 1042 { 1043 zvol_state_t *zv; 1044 struct zvol_state_dev *zsd; 1045 boolean_t drop_suspend = B_TRUE; 1046 1047 rw_enter(&zvol_state_lock, ZVOL_RW_READER); 1048 zv = dev->si_drv2; 1049 if (zv == NULL) { 1050 rw_exit(&zvol_state_lock); 1051 return (SET_ERROR(ENXIO)); 1052 } 1053 1054 mutex_enter(&zv->zv_state_lock); 1055 if (zv->zv_flags & ZVOL_EXCL) { 1056 ASSERT3U(zv->zv_open_count, ==, 1); 1057 zv->zv_flags &= ~ZVOL_EXCL; 1058 } 1059 1060 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); 1061 1062 /* 1063 * If the open count is zero, this is a spurious close. 1064 * That indicates a bug in the kernel / DDI framework. 1065 */ 1066 ASSERT3U(zv->zv_open_count, >, 0); 1067 /* 1068 * Make sure zvol is not suspended during last close 1069 * (hold zv_suspend_lock) and respect proper lock acquisition 1070 * ordering - zv_suspend_lock before zv_state_lock. 1071 */ 1072 if (zv->zv_open_count == 1) { 1073 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 1074 mutex_exit(&zv->zv_state_lock); 1075 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1076 mutex_enter(&zv->zv_state_lock); 1077 /* Check to see if zv_suspend_lock is needed. */ 1078 if (zv->zv_open_count != 1) { 1079 rw_exit(&zv->zv_suspend_lock); 1080 drop_suspend = B_FALSE; 1081 } 1082 } 1083 } else { 1084 drop_suspend = B_FALSE; 1085 } 1086 rw_exit(&zvol_state_lock); 1087 1088 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1089 1090 /* 1091 * You may get multiple opens, but only one close. 
1092 */ 1093 zv->zv_open_count--; 1094 if (flags & O_SYNC) { 1095 zsd = &zv->zv_zso->zso_dev; 1096 zsd->zsd_sync_cnt--; 1097 } 1098 1099 if (zv->zv_open_count == 0) { 1100 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 1101 zvol_last_close(zv); 1102 wakeup(zv); 1103 } 1104 1105 mutex_exit(&zv->zv_state_lock); 1106 1107 if (drop_suspend) 1108 rw_exit(&zv->zv_suspend_lock); 1109 return (0); 1110 } 1111 1112 static int 1113 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, 1114 int fflag, struct thread *td) 1115 { 1116 zvol_state_t *zv; 1117 zfs_locked_range_t *lr; 1118 off_t offset, length; 1119 int error; 1120 boolean_t sync; 1121 1122 zv = dev->si_drv2; 1123 1124 error = 0; 1125 KASSERT(zv->zv_open_count > 0, 1126 ("Device with zero access count in %s", __func__)); 1127 1128 switch (cmd) { 1129 case DIOCGSECTORSIZE: 1130 *(uint32_t *)data = DEV_BSIZE; 1131 break; 1132 case DIOCGMEDIASIZE: 1133 *(off_t *)data = zv->zv_volsize; 1134 break; 1135 case DIOCGFLUSH: 1136 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1137 if (zv->zv_zilog != NULL) 1138 zil_commit(zv->zv_zilog, ZVOL_OBJ); 1139 rw_exit(&zv->zv_suspend_lock); 1140 break; 1141 case DIOCGDELETE: 1142 if (!zvol_unmap_enabled) 1143 break; 1144 1145 offset = ((off_t *)data)[0]; 1146 length = ((off_t *)data)[1]; 1147 if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || 1148 offset < 0 || offset >= zv->zv_volsize || 1149 length <= 0) { 1150 printf("%s: offset=%jd length=%jd\n", __func__, offset, 1151 length); 1152 error = SET_ERROR(EINVAL); 1153 break; 1154 } 1155 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1156 zvol_ensure_zilog(zv); 1157 lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, 1158 RL_WRITER); 1159 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 1160 error = dmu_tx_assign(tx, TXG_WAIT); 1161 if (error != 0) { 1162 sync = FALSE; 1163 dmu_tx_abort(tx); 1164 } else { 1165 sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); 1166 zvol_log_truncate(zv, tx, offset, length, 
sync); 1167 dmu_tx_commit(tx); 1168 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 1169 offset, length); 1170 } 1171 zfs_rangelock_exit(lr); 1172 if (sync) 1173 zil_commit(zv->zv_zilog, ZVOL_OBJ); 1174 rw_exit(&zv->zv_suspend_lock); 1175 break; 1176 case DIOCGSTRIPESIZE: 1177 *(off_t *)data = zv->zv_volblocksize; 1178 break; 1179 case DIOCGSTRIPEOFFSET: 1180 *(off_t *)data = 0; 1181 break; 1182 case DIOCGATTR: { 1183 spa_t *spa = dmu_objset_spa(zv->zv_objset); 1184 struct diocgattr_arg *arg = (struct diocgattr_arg *)data; 1185 uint64_t refd, avail, usedobjs, availobjs; 1186 1187 if (strcmp(arg->name, "GEOM::candelete") == 0) 1188 arg->value.i = 1; 1189 else if (strcmp(arg->name, "blocksavail") == 0) { 1190 dmu_objset_space(zv->zv_objset, &refd, &avail, 1191 &usedobjs, &availobjs); 1192 arg->value.off = avail / DEV_BSIZE; 1193 } else if (strcmp(arg->name, "blocksused") == 0) { 1194 dmu_objset_space(zv->zv_objset, &refd, &avail, 1195 &usedobjs, &availobjs); 1196 arg->value.off = refd / DEV_BSIZE; 1197 } else if (strcmp(arg->name, "poolblocksavail") == 0) { 1198 avail = metaslab_class_get_space(spa_normal_class(spa)); 1199 avail -= metaslab_class_get_alloc( 1200 spa_normal_class(spa)); 1201 arg->value.off = avail / DEV_BSIZE; 1202 } else if (strcmp(arg->name, "poolblocksused") == 0) { 1203 refd = metaslab_class_get_alloc(spa_normal_class(spa)); 1204 arg->value.off = refd / DEV_BSIZE; 1205 } else 1206 error = SET_ERROR(ENOIOCTL); 1207 break; 1208 } 1209 case FIOSEEKHOLE: 1210 case FIOSEEKDATA: { 1211 off_t *off = (off_t *)data; 1212 uint64_t noff; 1213 boolean_t hole; 1214 1215 hole = (cmd == FIOSEEKHOLE); 1216 noff = *off; 1217 lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, 1218 RL_READER); 1219 error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); 1220 zfs_rangelock_exit(lr); 1221 *off = noff; 1222 break; 1223 } 1224 default: 1225 error = SET_ERROR(ENOIOCTL); 1226 } 1227 1228 return (error); 1229 } 1230 1231 /* 1232 * Misc. 
 * helpers
 */

/*
 * Make sure zv->zv_zilog is open, opening it on first write if needed.
 * Entered with zv_suspend_lock held as reader; may transiently upgrade
 * to (or drop and re-take as) writer, but always returns with the
 * reader lock held again.
 */
static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		/*
		 * If the in-place upgrade fails we must drop the lock to
		 * take it as writer, so re-check zv_zilog afterwards in
		 * case another thread opened the ZIL meanwhile.
		 */
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

/*
 * Return B_TRUE if "device" is a path under the zvol device directory
 * (ZVOL_DIR).  A NULL device is never a zvol.
 */
boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

/*
 * Rename a minor: rehash the state under the new name and recreate the
 * GEOM provider or character device node with the new path.
 * Called with zvol_state_lock and zv_state_lock held.
 */
void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		/* Retire the old provider; create one with the new name. */
		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		g_error_provider(pp, 0);
		zsg->zsg_provider = pp;
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			/*
			 * Destroying the node invalidates any outstanding
			 * opens, so force-close the volume.
			 */
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		/*
		 * On failure the volume is left without a device node
		 * (zsd_cdev stays NULL).
		 */
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	/* Caller must have dropped all locks and closed all opens. */
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		/* zvol_os_clear_private() must already have detached us. */
		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		/* The knlist only exists when the cdev was created. */
		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	bool replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	/* Refuse to create a second minor for an existing name. */
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	/* An unset/default volmode property falls back to the module's. */
	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		/*
		 * The topology lock taken here stays held across the ZIL
		 * replay below and is released only after zvol_geom_run().
		 * NOTE(review): if an error occurs after this point (e.g.
		 * dataset_kstats_create() fails) the lock is never released
		 * and zv is leaked -- confirm against upstream fixes.
		 */
		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		/* Size is published on first open; see update_volsize(). */
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		/* On failure zsd_cdev stays NULL and no node exists. */
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	/*
	 * Replay (or, with replay disabled, destroy) any outstanding ZIL
	 * records now, then drop the zilog pointer; it is reopened lazily
	 * on first write by zvol_ensure_zilog().
	 */
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	/* Ownership of "os" is dropped just below; forget the reference. */
	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	/* For GEOM, start the worker and release the topology lock. */
	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		/* Publish the fully-constructed minor. */
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

/*
 * Detach the zvol state from its provider / device node so no new I/O
 * can reach it.  Called with zvol_state_lock held.
 */
void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		/*
		 * Signal the GEOM worker to stop and wait until it sets
		 * the state back to RUNNING (presumably once it has
		 * drained its queue -- see the geom worker thread).
		 */
		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		/* Later cdev calls will find no zvol behind si_drv2. */
		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

/*
 * Propagate a new volume size to the OS-visible device.
 * Returns 0, or ENXIO if the GEOM provider was already cleared.
 */
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		/* Notify kevent listeners that the attributes changed. */
		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}

/* No-op on FreeBSD; read-only state is carried by the ZVOL_RDONLY flag. */
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

/* No-op on FreeBSD; size changes go through zvol_os_update_volsize(). */
void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

/* Return nonzero while any zvol minor exists. */
int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

/* Module init: delegate to the platform-independent implementation. */
int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

/* Module teardown counterpart of zvol_init(). */
void
zvol_fini(void)
{
	zvol_fini_impl();
}