1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 25 * All rights reserved. 26 * 27 * Portions Copyright 2010 Robert Milkowski 28 * 29 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 30 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 31 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 32 * Copyright (c) 2014 Integros [integros.com] 33 */ 34 35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ 36 37 /* 38 * ZFS volume emulation driver. 39 * 40 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. 41 * Volumes are accessed through the symbolic links named: 42 * 43 * /dev/zvol/<pool_name>/<dataset_name> 44 * 45 * Volumes are persistent through reboot. No user command needs to be 46 * run before opening and using a device. 47 * 48 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device 49 * in the system. Except when they're simply character devices (volmode=dev). 
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/zil_impl.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * With ZVOL_LOCK_DEBUG defined, "reader" acquisitions of zv_suspend_lock
 * are promoted to writer so that lock-ordering problems surface as
 * deadlocks during testing instead of silent races.
 */
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

/* Lifecycle of the GEOM worker thread (see zvol_geom_worker()). */
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

/*
 * Per-zvol FreeBSD-specific state.  Exactly one member of the union is
 * active, selected by the volmode property (geom provider vs. plain
 * character device).
 */
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			/* number of opens requesting FSYNC/FDSYNC */
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			/* bios deferred to the worker thread */
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	/* set while the minor is being torn down; new opens fail */
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;

/* Entry points for volmode=dev character devices. */
static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void
zvol_geom_run(zvol_state_t *zv); 195 static void zvol_geom_destroy(zvol_state_t *zv); 196 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); 197 static void zvol_geom_worker(void *arg); 198 static void zvol_geom_bio_start(struct bio *bp); 199 static int zvol_geom_bio_getattr(struct bio *bp); 200 /* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ 201 202 /* 203 * GEOM mode implementation 204 */ 205 206 static int 207 zvol_geom_open(struct g_provider *pp, int flag, int count) 208 { 209 zvol_state_t *zv; 210 int err = 0; 211 boolean_t drop_suspend = B_FALSE; 212 213 if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { 214 /* 215 * If zfs_geom_probe_vdev_key is set, that means that zfs is 216 * attempting to probe geom providers while looking for a 217 * replacement for a missing VDEV. In this case, the 218 * spa_namespace_lock will not be held, but it is still illegal 219 * to use a zvol as a vdev. Deadlocks can result if another 220 * thread has spa_namespace_lock. 221 */ 222 return (SET_ERROR(EOPNOTSUPP)); 223 } 224 225 retry: 226 rw_enter(&zvol_state_lock, ZVOL_RW_READER); 227 /* 228 * Obtain a copy of private under zvol_state_lock to make sure either 229 * the result of zvol free code setting private to NULL is observed, 230 * or the zv is protected from being freed because of the positive 231 * zv_open_count. 232 */ 233 zv = pp->private; 234 if (zv == NULL) { 235 rw_exit(&zvol_state_lock); 236 err = SET_ERROR(ENXIO); 237 goto out_locked; 238 } 239 240 mutex_enter(&zv->zv_state_lock); 241 if (zv->zv_zso->zso_dying) { 242 rw_exit(&zvol_state_lock); 243 err = SET_ERROR(ENXIO); 244 goto out_zv_locked; 245 } 246 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 247 248 /* 249 * Make sure zvol is not suspended during first open 250 * (hold zv_suspend_lock) and respect proper lock acquisition 251 * ordering - zv_suspend_lock before zv_state_lock. 
252 */ 253 if (zv->zv_open_count == 0) { 254 drop_suspend = B_TRUE; 255 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 256 mutex_exit(&zv->zv_state_lock); 257 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 258 mutex_enter(&zv->zv_state_lock); 259 /* Check to see if zv_suspend_lock is needed. */ 260 if (zv->zv_open_count != 0) { 261 rw_exit(&zv->zv_suspend_lock); 262 drop_suspend = B_FALSE; 263 } 264 } 265 } 266 rw_exit(&zvol_state_lock); 267 268 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 269 270 if (zv->zv_open_count == 0) { 271 boolean_t drop_namespace = B_FALSE; 272 273 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 274 275 /* 276 * Take spa_namespace_lock to prevent lock inversion when 277 * zvols from one pool are opened as vdevs in another. 278 */ 279 if (!mutex_owned(&spa_namespace_lock)) { 280 if (!mutex_tryenter(&spa_namespace_lock)) { 281 mutex_exit(&zv->zv_state_lock); 282 rw_exit(&zv->zv_suspend_lock); 283 kern_yield(PRI_USER); 284 goto retry; 285 } else { 286 drop_namespace = B_TRUE; 287 } 288 } 289 err = zvol_first_open(zv, !(flag & FWRITE)); 290 if (drop_namespace) 291 mutex_exit(&spa_namespace_lock); 292 if (err) 293 goto out_zv_locked; 294 pp->mediasize = zv->zv_volsize; 295 pp->stripeoffset = 0; 296 pp->stripesize = zv->zv_volblocksize; 297 } 298 299 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 300 301 /* 302 * Check for a bad on-disk format version now since we 303 * lied about owning the dataset readonly before. 
304 */ 305 if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || 306 dmu_objset_incompatible_encryption_version(zv->zv_objset))) { 307 err = SET_ERROR(EROFS); 308 goto out_opened; 309 } 310 if (zv->zv_flags & ZVOL_EXCL) { 311 err = SET_ERROR(EBUSY); 312 goto out_opened; 313 } 314 #ifdef FEXCL 315 if (flag & FEXCL) { 316 if (zv->zv_open_count != 0) { 317 err = SET_ERROR(EBUSY); 318 goto out_opened; 319 } 320 zv->zv_flags |= ZVOL_EXCL; 321 } 322 #endif 323 324 zv->zv_open_count += count; 325 out_opened: 326 if (zv->zv_open_count == 0) { 327 zvol_last_close(zv); 328 wakeup(zv); 329 } 330 out_zv_locked: 331 mutex_exit(&zv->zv_state_lock); 332 out_locked: 333 if (drop_suspend) 334 rw_exit(&zv->zv_suspend_lock); 335 return (err); 336 } 337 338 static int 339 zvol_geom_close(struct g_provider *pp, int flag, int count) 340 { 341 (void) flag; 342 zvol_state_t *zv; 343 boolean_t drop_suspend = B_TRUE; 344 int new_open_count; 345 346 rw_enter(&zvol_state_lock, ZVOL_RW_READER); 347 zv = pp->private; 348 if (zv == NULL) { 349 rw_exit(&zvol_state_lock); 350 return (SET_ERROR(ENXIO)); 351 } 352 353 mutex_enter(&zv->zv_state_lock); 354 if (zv->zv_flags & ZVOL_EXCL) { 355 ASSERT3U(zv->zv_open_count, ==, 1); 356 zv->zv_flags &= ~ZVOL_EXCL; 357 } 358 359 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 360 361 /* 362 * If the open count is zero, this is a spurious close. 363 * That indicates a bug in the kernel / DDI framework. 364 */ 365 ASSERT3U(zv->zv_open_count, >, 0); 366 367 /* 368 * Make sure zvol is not suspended during last close 369 * (hold zv_suspend_lock) and respect proper lock acquisition 370 * ordering - zv_suspend_lock before zv_state_lock. 371 */ 372 new_open_count = zv->zv_open_count - count; 373 if (new_open_count == 0) { 374 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 375 mutex_exit(&zv->zv_state_lock); 376 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 377 mutex_enter(&zv->zv_state_lock); 378 /* Check to see if zv_suspend_lock is needed. 
*/ 379 new_open_count = zv->zv_open_count - count; 380 if (new_open_count != 0) { 381 rw_exit(&zv->zv_suspend_lock); 382 drop_suspend = B_FALSE; 383 } 384 } 385 } else { 386 drop_suspend = B_FALSE; 387 } 388 rw_exit(&zvol_state_lock); 389 390 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 391 392 /* 393 * You may get multiple opens, but only one close. 394 */ 395 zv->zv_open_count = new_open_count; 396 if (zv->zv_open_count == 0) { 397 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 398 zvol_last_close(zv); 399 wakeup(zv); 400 } 401 402 mutex_exit(&zv->zv_state_lock); 403 404 if (drop_suspend) 405 rw_exit(&zv->zv_suspend_lock); 406 return (0); 407 } 408 409 static void 410 zvol_geom_run(zvol_state_t *zv) 411 { 412 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 413 struct g_provider *pp = zsg->zsg_provider; 414 415 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 416 417 g_error_provider(pp, 0); 418 419 kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, 420 "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); 421 } 422 423 static void 424 zvol_geom_destroy(zvol_state_t *zv) 425 { 426 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 427 struct g_provider *pp = zsg->zsg_provider; 428 429 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); 430 431 g_topology_assert(); 432 433 mutex_enter(&zv->zv_state_lock); 434 VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING); 435 mutex_exit(&zv->zv_state_lock); 436 zsg->zsg_provider = NULL; 437 g_wither_geom(pp->geom, ENXIO); 438 } 439 440 void 441 zvol_wait_close(zvol_state_t *zv) 442 { 443 444 if (zv->zv_volmode != ZFS_VOLMODE_GEOM) 445 return; 446 mutex_enter(&zv->zv_state_lock); 447 zv->zv_zso->zso_dying = B_TRUE; 448 449 if (zv->zv_open_count) 450 msleep(zv, &zv->zv_state_lock, 451 PRIBIO, "zvol:dying", 10*hz); 452 mutex_exit(&zv->zv_state_lock); 453 } 454 455 456 static int 457 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) 458 { 459 int count, error, flags; 460 461 g_topology_assert(); 462 
	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		/* Provider is going away; only pure closes succeed. */
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	/* zvol_geom_open/close may sleep; drop the topology lock. */
	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

/*
 * Worker thread: services bios queued by zvol_geom_bio_start() from
 * contexts that cannot sleep.  Exits once zsg_state is set to
 * ZVOL_GEOM_STOPPED and the queue drains.
 */
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				/* Acknowledge the stop request and exit. */
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			/* PDROP: msleep releases zsg_queue_mtx for us. */
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

/*
 * GEOM ->start method.  Answers BIO_GETATTR inline; dispatches other
 * bios directly when the current thread may sleep, otherwise hands
 * them to the worker thread.
 */
static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		/* Only wake the worker when the queue was empty. */
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

/*
 * Answer GEOM attribute queries (candelete and space accounting).
 * Returns 0 if the attribute was handled, 1 if it is unknown.
 */
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

/*
 * Common bio path for GEOM providers and cdev-mode devices: handles
 * BIO_READ, BIO_WRITE, BIO_DELETE and BIO_FLUSH against the backing
 * DMU object, holding zv_suspend_lock across the whole operation.
 */
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	/* GEOM bios carry the provider; cdev bios carry the device. */
	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		/* A flush is just a ZIL commit; jump past the data path. */
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			/* Log first so replay sees the truncate. */
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		/* Cap each DMU transaction at zvol_maxphys bytes. */
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	/* BIO_FLUSH enters here via the sync label above. */
	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

/*
 * read(2) on a volmode=dev zvol: copy data from the DMU object into
 * the caller's uio under a reader range lock.
 */
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

/*
 * write(2) on a volmode=dev zvol: copy data into the DMU object under
 * a writer range lock, logging to the ZIL and committing when the
 * caller requested synchronous semantics.
 */
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end.
*/ 841 bytes = volsize - off; 842 843 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 844 error = dmu_tx_assign(tx, TXG_WAIT); 845 if (error) { 846 dmu_tx_abort(tx); 847 break; 848 } 849 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); 850 if (error == 0) 851 zvol_log_write(zv, tx, off, bytes, sync); 852 dmu_tx_commit(tx); 853 854 if (error) 855 break; 856 } 857 zfs_rangelock_exit(lr); 858 int64_t nwritten = start_resid - zfs_uio_resid(&uio); 859 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 860 if (sync) 861 zil_commit(zv->zv_zilog, ZVOL_OBJ); 862 rw_exit(&zv->zv_suspend_lock); 863 return (error); 864 } 865 866 static int 867 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) 868 { 869 zvol_state_t *zv; 870 struct zvol_state_dev *zsd; 871 int err = 0; 872 boolean_t drop_suspend = B_FALSE; 873 874 retry: 875 rw_enter(&zvol_state_lock, ZVOL_RW_READER); 876 /* 877 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either 878 * the result of zvol free code setting si_drv2 to NULL is observed, 879 * or the zv is protected from being freed because of the positive 880 * zv_open_count. 881 */ 882 zv = dev->si_drv2; 883 if (zv == NULL) { 884 rw_exit(&zvol_state_lock); 885 err = SET_ERROR(ENXIO); 886 goto out_locked; 887 } 888 889 mutex_enter(&zv->zv_state_lock); 890 if (zv->zv_zso->zso_dying) { 891 rw_exit(&zvol_state_lock); 892 err = SET_ERROR(ENXIO); 893 goto out_zv_locked; 894 } 895 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); 896 897 /* 898 * Make sure zvol is not suspended during first open 899 * (hold zv_suspend_lock) and respect proper lock acquisition 900 * ordering - zv_suspend_lock before zv_state_lock. 
901 */ 902 if (zv->zv_open_count == 0) { 903 drop_suspend = B_TRUE; 904 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 905 mutex_exit(&zv->zv_state_lock); 906 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 907 mutex_enter(&zv->zv_state_lock); 908 /* Check to see if zv_suspend_lock is needed. */ 909 if (zv->zv_open_count != 0) { 910 rw_exit(&zv->zv_suspend_lock); 911 drop_suspend = B_FALSE; 912 } 913 } 914 } 915 rw_exit(&zvol_state_lock); 916 917 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 918 919 if (zv->zv_open_count == 0) { 920 boolean_t drop_namespace = B_FALSE; 921 922 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 923 924 /* 925 * Take spa_namespace_lock to prevent lock inversion when 926 * zvols from one pool are opened as vdevs in another. 927 */ 928 if (!mutex_owned(&spa_namespace_lock)) { 929 if (!mutex_tryenter(&spa_namespace_lock)) { 930 mutex_exit(&zv->zv_state_lock); 931 rw_exit(&zv->zv_suspend_lock); 932 kern_yield(PRI_USER); 933 goto retry; 934 } else { 935 drop_namespace = B_TRUE; 936 } 937 } 938 err = zvol_first_open(zv, !(flags & FWRITE)); 939 if (drop_namespace) 940 mutex_exit(&spa_namespace_lock); 941 if (err) 942 goto out_zv_locked; 943 } 944 945 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 946 947 if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { 948 err = SET_ERROR(EROFS); 949 goto out_opened; 950 } 951 if (zv->zv_flags & ZVOL_EXCL) { 952 err = SET_ERROR(EBUSY); 953 goto out_opened; 954 } 955 #ifdef FEXCL 956 if (flags & FEXCL) { 957 if (zv->zv_open_count != 0) { 958 err = SET_ERROR(EBUSY); 959 goto out_opened; 960 } 961 zv->zv_flags |= ZVOL_EXCL; 962 } 963 #endif 964 965 zv->zv_open_count++; 966 if (flags & (FSYNC | FDSYNC)) { 967 zsd = &zv->zv_zso->zso_dev; 968 zsd->zsd_sync_cnt++; 969 if (zsd->zsd_sync_cnt == 1 && 970 (zv->zv_flags & ZVOL_WRITTEN_TO) != 0) 971 zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); 972 } 973 out_opened: 974 if (zv->zv_open_count == 0) { 975 zvol_last_close(zv); 976 wakeup(zv); 977 } 978 out_zv_locked: 979 
mutex_exit(&zv->zv_state_lock); 980 out_locked: 981 if (drop_suspend) 982 rw_exit(&zv->zv_suspend_lock); 983 return (err); 984 } 985 986 static int 987 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) 988 { 989 zvol_state_t *zv; 990 struct zvol_state_dev *zsd; 991 boolean_t drop_suspend = B_TRUE; 992 993 rw_enter(&zvol_state_lock, ZVOL_RW_READER); 994 zv = dev->si_drv2; 995 if (zv == NULL) { 996 rw_exit(&zvol_state_lock); 997 return (SET_ERROR(ENXIO)); 998 } 999 1000 mutex_enter(&zv->zv_state_lock); 1001 if (zv->zv_flags & ZVOL_EXCL) { 1002 ASSERT3U(zv->zv_open_count, ==, 1); 1003 zv->zv_flags &= ~ZVOL_EXCL; 1004 } 1005 1006 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); 1007 1008 /* 1009 * If the open count is zero, this is a spurious close. 1010 * That indicates a bug in the kernel / DDI framework. 1011 */ 1012 ASSERT3U(zv->zv_open_count, >, 0); 1013 /* 1014 * Make sure zvol is not suspended during last close 1015 * (hold zv_suspend_lock) and respect proper lock acquisition 1016 * ordering - zv_suspend_lock before zv_state_lock. 1017 */ 1018 if (zv->zv_open_count == 1) { 1019 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 1020 mutex_exit(&zv->zv_state_lock); 1021 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1022 mutex_enter(&zv->zv_state_lock); 1023 /* Check to see if zv_suspend_lock is needed. */ 1024 if (zv->zv_open_count != 1) { 1025 rw_exit(&zv->zv_suspend_lock); 1026 drop_suspend = B_FALSE; 1027 } 1028 } 1029 } else { 1030 drop_suspend = B_FALSE; 1031 } 1032 rw_exit(&zvol_state_lock); 1033 1034 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1035 1036 /* 1037 * You may get multiple opens, but only one close. 
1038 */ 1039 zv->zv_open_count--; 1040 if (flags & (FSYNC | FDSYNC)) { 1041 zsd = &zv->zv_zso->zso_dev; 1042 zsd->zsd_sync_cnt--; 1043 } 1044 1045 if (zv->zv_open_count == 0) { 1046 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 1047 zvol_last_close(zv); 1048 wakeup(zv); 1049 } 1050 1051 mutex_exit(&zv->zv_state_lock); 1052 1053 if (drop_suspend) 1054 rw_exit(&zv->zv_suspend_lock); 1055 return (0); 1056 } 1057 1058 static int 1059 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, 1060 int fflag, struct thread *td) 1061 { 1062 zvol_state_t *zv; 1063 zfs_locked_range_t *lr; 1064 off_t offset, length; 1065 int error; 1066 boolean_t sync; 1067 1068 zv = dev->si_drv2; 1069 1070 error = 0; 1071 KASSERT(zv->zv_open_count > 0, 1072 ("Device with zero access count in %s", __func__)); 1073 1074 switch (cmd) { 1075 case DIOCGSECTORSIZE: 1076 *(uint32_t *)data = DEV_BSIZE; 1077 break; 1078 case DIOCGMEDIASIZE: 1079 *(off_t *)data = zv->zv_volsize; 1080 break; 1081 case DIOCGFLUSH: 1082 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1083 if (zv->zv_zilog != NULL) 1084 zil_commit(zv->zv_zilog, ZVOL_OBJ); 1085 rw_exit(&zv->zv_suspend_lock); 1086 break; 1087 case DIOCGDELETE: 1088 if (!zvol_unmap_enabled) 1089 break; 1090 1091 offset = ((off_t *)data)[0]; 1092 length = ((off_t *)data)[1]; 1093 if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || 1094 offset < 0 || offset >= zv->zv_volsize || 1095 length <= 0) { 1096 printf("%s: offset=%jd length=%jd\n", __func__, offset, 1097 length); 1098 error = SET_ERROR(EINVAL); 1099 break; 1100 } 1101 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1102 zvol_ensure_zilog(zv); 1103 lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, 1104 RL_WRITER); 1105 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 1106 error = dmu_tx_assign(tx, TXG_WAIT); 1107 if (error != 0) { 1108 sync = FALSE; 1109 dmu_tx_abort(tx); 1110 } else { 1111 sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); 1112 zvol_log_truncate(zv, tx, offset, 
length, sync); 1113 dmu_tx_commit(tx); 1114 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 1115 offset, length); 1116 } 1117 zfs_rangelock_exit(lr); 1118 if (sync) 1119 zil_commit(zv->zv_zilog, ZVOL_OBJ); 1120 rw_exit(&zv->zv_suspend_lock); 1121 break; 1122 case DIOCGSTRIPESIZE: 1123 *(off_t *)data = zv->zv_volblocksize; 1124 break; 1125 case DIOCGSTRIPEOFFSET: 1126 *(off_t *)data = 0; 1127 break; 1128 case DIOCGATTR: { 1129 spa_t *spa = dmu_objset_spa(zv->zv_objset); 1130 struct diocgattr_arg *arg = (struct diocgattr_arg *)data; 1131 uint64_t refd, avail, usedobjs, availobjs; 1132 1133 if (strcmp(arg->name, "GEOM::candelete") == 0) 1134 arg->value.i = 1; 1135 else if (strcmp(arg->name, "blocksavail") == 0) { 1136 dmu_objset_space(zv->zv_objset, &refd, &avail, 1137 &usedobjs, &availobjs); 1138 arg->value.off = avail / DEV_BSIZE; 1139 } else if (strcmp(arg->name, "blocksused") == 0) { 1140 dmu_objset_space(zv->zv_objset, &refd, &avail, 1141 &usedobjs, &availobjs); 1142 arg->value.off = refd / DEV_BSIZE; 1143 } else if (strcmp(arg->name, "poolblocksavail") == 0) { 1144 avail = metaslab_class_get_space(spa_normal_class(spa)); 1145 avail -= metaslab_class_get_alloc( 1146 spa_normal_class(spa)); 1147 arg->value.off = avail / DEV_BSIZE; 1148 } else if (strcmp(arg->name, "poolblocksused") == 0) { 1149 refd = metaslab_class_get_alloc(spa_normal_class(spa)); 1150 arg->value.off = refd / DEV_BSIZE; 1151 } else 1152 error = SET_ERROR(ENOIOCTL); 1153 break; 1154 } 1155 case FIOSEEKHOLE: 1156 case FIOSEEKDATA: { 1157 off_t *off = (off_t *)data; 1158 uint64_t noff; 1159 boolean_t hole; 1160 1161 hole = (cmd == FIOSEEKHOLE); 1162 noff = *off; 1163 error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); 1164 *off = noff; 1165 break; 1166 } 1167 default: 1168 error = SET_ERROR(ENOIOCTL); 1169 } 1170 1171 return (error); 1172 } 1173 1174 /* 1175 * Misc. 
 */

/*
 * Lazily open the ZIL the first time this zvol is written to.  Entered
 * and exited with zv_suspend_lock held as reader; internally upgrades
 * (or drops and retakes) it as writer in order to publish zv->zv_zilog.
 */
static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			/*
			 * Upgrade failed: drop and retake as writer.  The
			 * NULL check is repeated below because another
			 * thread may have opened the ZIL while the lock
			 * was dropped.
			 */
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

/*
 * Return B_TRUE when "device" is a path under the zvol device directory
 * (prefix match against ZVOL_DIR).  NULL input is tolerated and yields
 * B_FALSE.
 */
boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

/*
 * Rename the minor backing "zv" to "newname": rehash the state under the
 * new name and recreate the GEOM provider or character device node.
 * Caller must hold zvol_state_lock and zv_state_lock (asserted below).
 */
void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		/* Retire the old provider, then publish a fresh one. */
		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				/* Open handles do not survive the rename. */
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		/* NOTE(review): a make_dev_s() failure leaves no cdev. */
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	/* Must be fully quiesced: no locks held, no open handles. */
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		/* zvol_os_clear_private() must already have detached us. */
		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 * Allocates the zvol_state, creates the GEOM provider or cdev according
 * to the volmode property, replays (or destroys) the intent log, and
 * inserts the new state into the global hash.  Returns 0 or an errno
 * (EEXIST if a minor for "name" already exists).
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	/* Fall back to the module-level default on error or "default". */
	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		/*
		 * The topology lock taken here is released only in the
		 * error == 0 path below, after zvol_geom_run().
		 */
		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		/* mediasize is set on first open / volsize update. */
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		/* NOTE(review): make_dev_s() failure is silently ignored. */
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	/*
	 * Replay (or discard, if replay is disabled) any outstanding log
	 * records now, so later opens find a clean ZIL (see the VERIFY0
	 * in zvol_ensure_zilog()).
	 */
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* TODO: prefetch for geom tasting */

	/* The objset is disowned below; drop our cached pointer. */
	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		/* Start the geom worker and release the topology lock. */
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

/*
 * Detach "zv" from its provider/cdev private pointer so that new I/O
 * can no longer reach this state.  Idempotent for GEOM minors.
 */
void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		/*
		 * NOTE(review): waits for the geom worker to observe
		 * STOPPED and flip the state back to RUNNING — confirm
		 * this handshake against zvol_geom_worker().
		 */
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

/*
 * Propagate a new volume size to the minor.  For GEOM minors the
 * provider's mediasize is updated (with a resize event once the size
 * was non-zero).  Returns ENXIO if the provider was already detached.
 */
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

/* No-op on FreeBSD; the Linux disk read-only flag has no analogue here. */
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

/* No-op on FreeBSD; capacity changes go through zvol_os_update_volsize(). */
void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

/* Nonzero while any zvol minor exists (blocks module unload). */
int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

/* Module init: delegate to the platform-independent implementation. */
int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

/* Module teardown counterpart to zvol_init(). */
void
zvol_fini(void)
{
	zvol_fini_impl();
}