1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 25 * All rights reserved. 26 * 27 * Portions Copyright 2010 Robert Milkowski 28 * 29 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 30 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 31 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 32 * Copyright (c) 2014 Integros [integros.com] 33 */ 34 35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ 36 37 /* 38 * ZFS volume emulation driver. 39 * 40 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. 41 * Volumes are accessed through the symbolic links named: 42 * 43 * /dev/zvol/<pool_name>/<dataset_name> 44 * 45 * Volumes are persistent through reboot. No user command needs to be 46 * run before opening and using a device. 47 * 48 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device 49 * in the system. Except when they're simply character devices (volmode=dev). 
50 */ 51 52 #include <sys/types.h> 53 #include <sys/param.h> 54 #include <sys/kernel.h> 55 #include <sys/errno.h> 56 #include <sys/uio.h> 57 #include <sys/bio.h> 58 #include <sys/buf.h> 59 #include <sys/kmem.h> 60 #include <sys/conf.h> 61 #include <sys/cmn_err.h> 62 #include <sys/stat.h> 63 #include <sys/proc.h> 64 #include <sys/zap.h> 65 #include <sys/spa.h> 66 #include <sys/spa_impl.h> 67 #include <sys/zio.h> 68 #include <sys/disk.h> 69 #include <sys/dmu_traverse.h> 70 #include <sys/dnode.h> 71 #include <sys/dsl_dataset.h> 72 #include <sys/dsl_prop.h> 73 #include <sys/dsl_dir.h> 74 #include <sys/byteorder.h> 75 #include <sys/sunddi.h> 76 #include <sys/dirent.h> 77 #include <sys/policy.h> 78 #include <sys/queue.h> 79 #include <sys/fs/zfs.h> 80 #include <sys/zfs_ioctl.h> 81 #include <sys/zil.h> 82 #include <sys/zfs_znode.h> 83 #include <sys/zfs_rlock.h> 84 #include <sys/vdev_impl.h> 85 #include <sys/vdev_raidz.h> 86 #include <sys/zvol.h> 87 #include <sys/zil_impl.h> 88 #include <sys/dataset_kstats.h> 89 #include <sys/dbuf.h> 90 #include <sys/dmu_tx.h> 91 #include <sys/zfeature.h> 92 #include <sys/zio_checksum.h> 93 #include <sys/zil_impl.h> 94 #include <sys/filio.h> 95 96 #include <geom/geom.h> 97 #include <sys/zvol.h> 98 #include <sys/zvol_impl.h> 99 100 #include "zfs_namecheck.h" 101 102 #define ZVOL_DUMPSIZE "dumpsize" 103 104 #ifdef ZVOL_LOCK_DEBUG 105 #define ZVOL_RW_READER RW_WRITER 106 #define ZVOL_RW_READ_HELD RW_WRITE_HELD 107 #else 108 #define ZVOL_RW_READER RW_READER 109 #define ZVOL_RW_READ_HELD RW_READ_HELD 110 #endif 111 112 enum zvol_geom_state { 113 ZVOL_GEOM_UNINIT, 114 ZVOL_GEOM_STOPPED, 115 ZVOL_GEOM_RUNNING, 116 }; 117 118 struct zvol_state_os { 119 #define zso_dev _zso_state._zso_dev 120 #define zso_geom _zso_state._zso_geom 121 union { 122 /* volmode=dev */ 123 struct zvol_state_dev { 124 struct cdev *zsd_cdev; 125 uint64_t zsd_sync_cnt; 126 } _zso_dev; 127 128 /* volmode=geom */ 129 struct zvol_state_geom { 130 struct g_provider 
*zsg_provider; 131 struct bio_queue_head zsg_queue; 132 struct mtx zsg_queue_mtx; 133 enum zvol_geom_state zsg_state; 134 } _zso_geom; 135 } _zso_state; 136 int zso_dying; 137 }; 138 139 static uint32_t zvol_minors; 140 141 SYSCTL_DECL(_vfs_zfs); 142 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); 143 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, 144 "Expose as GEOM providers (1), device files (2) or neither"); 145 static boolean_t zpool_on_zvol = B_FALSE; 146 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, 147 "Allow zpools to use zvols as vdevs (DANGEROUS)"); 148 149 /* 150 * Toggle unmap functionality. 151 */ 152 boolean_t zvol_unmap_enabled = B_TRUE; 153 154 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, 155 &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); 156 157 /* 158 * zvol maximum transfer in one DMU tx. 159 */ 160 int zvol_maxphys = DMU_MAX_ACCESS / 2; 161 162 static void zvol_ensure_zilog(zvol_state_t *zv); 163 164 static d_open_t zvol_cdev_open; 165 static d_close_t zvol_cdev_close; 166 static d_ioctl_t zvol_cdev_ioctl; 167 static d_read_t zvol_cdev_read; 168 static d_write_t zvol_cdev_write; 169 static d_strategy_t zvol_geom_bio_strategy; 170 171 static struct cdevsw zvol_cdevsw = { 172 .d_name = "zvol", 173 .d_version = D_VERSION, 174 .d_flags = D_DISK | D_TRACKCLOSE, 175 .d_open = zvol_cdev_open, 176 .d_close = zvol_cdev_close, 177 .d_ioctl = zvol_cdev_ioctl, 178 .d_read = zvol_cdev_read, 179 .d_write = zvol_cdev_write, 180 .d_strategy = zvol_geom_bio_strategy, 181 }; 182 183 extern uint_t zfs_geom_probe_vdev_key; 184 185 struct g_class zfs_zvol_class = { 186 .name = "ZFS::ZVOL", 187 .version = G_VERSION, 188 }; 189 190 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); 191 192 static int zvol_geom_open(struct g_provider *pp, int flag, int count); 193 static int zvol_geom_close(struct g_provider *pp, int flag, int count); 194 static void 
zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

/*
 * Open a GEOM-mode zvol, raising its open count by `count`.
 *
 * On the first open this acquires spa_namespace_lock (unless the caller
 * already owns it) so zvol_first_open() cannot race with pool operations,
 * and takes zv_suspend_lock as reader so the volume cannot be suspended
 * mid-open.  Lock order is zv_suspend_lock before zv_state_lock, hence the
 * tryenter/back-off/retry dances below.  Returns 0 or an errno.
 */
/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * if zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV. In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev. Deadlocks can result if another
		 * thread has spa_namespace_lock
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			/*
			 * Blocking on spa_namespace_lock while holding
			 * zvol_state_lock would invert lock order, so drop
			 * the state lock, wait, and restart from scratch.
			 */
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		/* zvol_wait_close() marked this volume for removal. */
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_zv_locked;
		/* Publish geometry to GEOM consumers. */
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count += count;
out_opened:
	/*
	 * If an error path left the count at zero after a successful
	 * zvol_first_open(), undo that open and wake zvol_wait_close().
	 */
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*
 * Close a GEOM-mode zvol, lowering its open count by `count`.  Takes
 * zv_suspend_lock (reader) when this would be the last close so that
 * zvol_last_close() cannot race with suspend/resume.  Always returns 0
 * once the zvol has been looked up.
 */
/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		/* An exclusive opener is by definition the only opener. */
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		/* Wake anyone blocked in zvol_wait_close(). */
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

/*
 * Bring a GEOM-mode zvol online: clear the provider's error state and
 * start the worker thread that services bios queued by
 * zvol_geom_bio_start() from non-sleepable contexts.
 */
static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	/* pp->name + sizeof(ZVOL_DRIVER) skips the "zvol/" prefix. */
	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

/*
 * Detach the provider from the zvol and wither its geom.  The caller must
 * hold the GEOM topology lock, and the worker must already be in the
 * ZVOL_GEOM_RUNNING state (i.e. started and not mid-shutdown).
 */
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

/*
 * Mark a GEOM-mode zvol as dying (so new opens fail with ENXIO) and give
 * any current opener a chance to close before the minor is torn down.
 * NOTE(review): this is a single bounded msleep (10*hz) woken by the
 * last-close wakeup(zv); it is not re-checked in a loop, so it may return
 * while the volume is still open — confirm callers tolerate that.
 */
void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}


/*
 * GEOM ->access method: translate the (acr, acw, ace) reference-count
 * deltas into a single zvol_geom_open() or zvol_geom_close() call.
 */
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		/* Provider already orphaned; allow only releases. */
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	/* open/close may sleep (txg waits), so drop the topology lock. */
	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

/*
 * Worker thread for one GEOM-mode zvol: drains bios that
 * zvol_geom_bio_start() queued from non-sleepable contexts and runs them
 * through zvol_geom_bio_strategy().  Exits when zsg_state is set to
 * ZVOL_GEOM_STOPPED, acknowledging shutdown by flipping the state back to
 * ZVOL_GEOM_RUNNING and waking the waiter.
 */
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			/* PDROP: msleep releases zsg_queue_mtx for us. */
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

/*
 * GEOM ->start method.  Answers BIO_GETATTR inline; all other bios are
 * either executed directly (when the current thread may sleep) or handed
 * to the per-zvol worker thread via zsg_queue.
 */
static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		/* Defer to the worker; wake it only on empty->non-empty. */
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

/*
 * Handle a BIO_GETATTR request: advertises TRIM support
 * ("GEOM::candelete") and exports dataset/pool space accounting in
 * DEV_BSIZE units.  Returns 0 when the attribute was handled, 1 to make
 * the caller deliver EOPNOTSUPP.
 */
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv =
bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

/*
 * Execute one bio against the zvol's DMU object.  Serves both GEOM bios
 * (bp->bio_to set, completed with g_io_deliver()) and plain device bios
 * (completed with biofinish()).  Handles BIO_READ, BIO_WRITE, BIO_DELETE
 * (as a logged free of the range) and BIO_FLUSH (ZIL commit only).  The
 * I/O range is serialized against concurrent access and dmu_sync() via
 * zv_rangelock; zv_suspend_lock (reader) is held across the whole request.
 */
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			/* Jumps straight to the zil_commit() below. */
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	/* Always false here; kept from the legacy dump-device support. */
	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			/* Log the truncate, then free the backing range. */
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	/* Read/write in chunks of at most zvol_maxphys bytes per tx. */
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

/*
 * cdev read entry point (volmode=dev).  Reads are serialized against
 * writers of the same range via zv_rangelock and accounted in the
 * dataset's read kstats.  Returns 0 or an errno.
 */
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	/* Account whatever was actually transferred, even on error. */
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

/*
 * cdev write entry point (volmode=dev).  Each chunk is written in its own
 * DMU transaction and logged to the ZIL; the ZIL is committed at the end
 * when O_SYNC was requested or the dataset has sync=always.
 * zv_suspend_lock (reader) is held across the whole request.  Returns 0
 * or an errno.
 */
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		/* Commit even on a partial write; the tx holds are spent. */
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

/*
 * cdev open entry point (volmode=dev).  Mirrors zvol_geom_open(): takes
 * spa_namespace_lock on first open, takes zv_suspend_lock before
 * zv_state_lock, handles FEXCL, and tracks O_(D)SYNC openers so the ZIL
 * can be switched to synchronous behaviour while any are present.
 */
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			/* Avoid lock-order reversal: back off and retry. */
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (err)
			goto out_zv_locked;
	}

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count++;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		/*
		 * First sync opener of an already-written zvol: flush
		 * pending async ZIL records so its sync expectations hold.
		 */
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	/* Undo a successful zvol_first_open() on the error paths above. */
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*
 * cdev close entry point (volmode=dev).  Drops one reference; takes
 * zv_suspend_lock (reader) when this is the last close so
 * zvol_last_close() cannot race with suspend/resume.
 */
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		/* An exclusive opener is by definition the only opener. */
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		/* Balance the sync-opener count taken in zvol_cdev_open(). */
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		/* Wake anyone blocked in zvol_wait_close(). */
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

/*
 * cdev ioctl entry point (volmode=dev).  Implements the disk(4)-style
 * queries (sector size, media size, stripe geometry, attributes), cache
 * flush (DIOCGFLUSH), BIO_DELETE-equivalent trim (DIOCGDELETE), and
 * hole/data seeking (FIOSEEKHOLE/FIOSEEKDATA).  Returns 0 or an errno;
 * unknown commands yield ENOIOCTL.
 */
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int i, error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		/* data is a pair of off_t: { offset, length }. */
		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			/* Log the truncate, then free the backing range. */
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		/* Same attributes zvol_geom_bio_getattr() exports. */
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc.
helpers 1150 */ 1151 1152 static void 1153 zvol_ensure_zilog(zvol_state_t *zv) 1154 { 1155 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 1156 1157 /* 1158 * Open a ZIL if this is the first time we have written to this 1159 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 1160 * than zv_state_lock so that we don't need to acquire an 1161 * additional lock in this path. 1162 */ 1163 if (zv->zv_zilog == NULL) { 1164 if (!rw_tryupgrade(&zv->zv_suspend_lock)) { 1165 rw_exit(&zv->zv_suspend_lock); 1166 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 1167 } 1168 if (zv->zv_zilog == NULL) { 1169 zv->zv_zilog = zil_open(zv->zv_objset, 1170 zvol_get_data); 1171 zv->zv_flags |= ZVOL_WRITTEN_TO; 1172 /* replay / destroy done in zvol_create_minor_impl() */ 1173 VERIFY0(zv->zv_zilog->zl_header->zh_flags & 1174 ZIL_REPLAY_NEEDED); 1175 } 1176 rw_downgrade(&zv->zv_suspend_lock); 1177 } 1178 } 1179 1180 static boolean_t 1181 zvol_is_zvol_impl(const char *device) 1182 { 1183 return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); 1184 } 1185 1186 static void 1187 zvol_rename_minor(zvol_state_t *zv, const char *newname) 1188 { 1189 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1190 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1191 1192 /* move to new hashtable entry */ 1193 zv->zv_hash = zvol_name_hash(zv->zv_name); 1194 hlist_del(&zv->zv_hlink); 1195 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1196 1197 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1198 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1199 struct g_provider *pp = zsg->zsg_provider; 1200 struct g_geom *gp; 1201 1202 g_topology_lock(); 1203 gp = pp->geom; 1204 ASSERT3P(gp, !=, NULL); 1205 1206 zsg->zsg_provider = NULL; 1207 g_wither_provider(pp, ENXIO); 1208 1209 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); 1210 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; 1211 pp->sectorsize = DEV_BSIZE; 1212 pp->mediasize = zv->zv_volsize; 1213 pp->private = zv; 1214 zsg->zsg_provider = 
pp; 1215 g_error_provider(pp, 0); 1216 g_topology_unlock(); 1217 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1218 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1219 struct cdev *dev; 1220 struct make_dev_args args; 1221 1222 dev = zsd->zsd_cdev; 1223 if (dev != NULL) { 1224 destroy_dev(dev); 1225 dev = zsd->zsd_cdev = NULL; 1226 if (zv->zv_open_count > 0) { 1227 zv->zv_flags &= ~ZVOL_EXCL; 1228 zv->zv_open_count = 0; 1229 /* XXX need suspend lock but lock order */ 1230 zvol_last_close(zv); 1231 } 1232 } 1233 1234 make_dev_args_init(&args); 1235 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; 1236 args.mda_devsw = &zvol_cdevsw; 1237 args.mda_cr = NULL; 1238 args.mda_uid = UID_ROOT; 1239 args.mda_gid = GID_OPERATOR; 1240 args.mda_mode = 0640; 1241 args.mda_si_drv2 = zv; 1242 if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) 1243 == 0) { 1244 dev->si_iosize_max = maxphys; 1245 zsd->zsd_cdev = dev; 1246 } 1247 } 1248 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1249 } 1250 1251 /* 1252 * Remove minor node for the specified volume. 
1253 */ 1254 static void 1255 zvol_free(zvol_state_t *zv) 1256 { 1257 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1258 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1259 ASSERT0(zv->zv_open_count); 1260 1261 ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); 1262 1263 rw_destroy(&zv->zv_suspend_lock); 1264 zfs_rangelock_fini(&zv->zv_rangelock); 1265 1266 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1267 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1268 struct g_provider *pp __maybe_unused = zsg->zsg_provider; 1269 1270 ASSERT3P(pp->private, ==, NULL); 1271 1272 g_topology_lock(); 1273 zvol_geom_destroy(zv); 1274 g_topology_unlock(); 1275 mtx_destroy(&zsg->zsg_queue_mtx); 1276 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1277 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1278 struct cdev *dev = zsd->zsd_cdev; 1279 1280 ASSERT3P(dev->si_drv2, ==, NULL); 1281 1282 destroy_dev(dev); 1283 } 1284 1285 mutex_destroy(&zv->zv_state_lock); 1286 dataset_kstats_destroy(&zv->zv_kstat); 1287 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1288 kmem_free(zv, sizeof (zvol_state_t)); 1289 zvol_minors--; 1290 } 1291 1292 /* 1293 * Create a minor node (plus a whole lot more) for the specified volume. 
1294 */ 1295 static int 1296 zvol_create_minor_impl(const char *name) 1297 { 1298 zvol_state_t *zv; 1299 objset_t *os; 1300 dmu_object_info_t *doi; 1301 uint64_t volsize; 1302 uint64_t volmode, hash; 1303 int error; 1304 1305 ZFS_LOG(1, "Creating ZVOL %s...", name); 1306 hash = zvol_name_hash(name); 1307 if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { 1308 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1309 mutex_exit(&zv->zv_state_lock); 1310 return (SET_ERROR(EEXIST)); 1311 } 1312 1313 DROP_GIANT(); 1314 1315 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1316 1317 /* lie and say we're read-only */ 1318 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1319 if (error) 1320 goto out_doi; 1321 1322 error = dmu_object_info(os, ZVOL_OBJ, doi); 1323 if (error) 1324 goto out_dmu_objset_disown; 1325 1326 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1327 if (error) 1328 goto out_dmu_objset_disown; 1329 1330 error = dsl_prop_get_integer(name, 1331 zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); 1332 if (error || volmode == ZFS_VOLMODE_DEFAULT) 1333 volmode = zvol_volmode; 1334 error = 0; 1335 1336 /* 1337 * zvol_alloc equivalent ... 
1338 */ 1339 zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); 1340 zv->zv_hash = hash; 1341 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1342 zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1343 zv->zv_volmode = volmode; 1344 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1345 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1346 struct g_provider *pp; 1347 struct g_geom *gp; 1348 1349 zsg->zsg_state = ZVOL_GEOM_UNINIT; 1350 mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); 1351 1352 g_topology_lock(); 1353 gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); 1354 gp->start = zvol_geom_bio_start; 1355 gp->access = zvol_geom_access; 1356 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); 1357 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; 1358 pp->sectorsize = DEV_BSIZE; 1359 pp->mediasize = 0; 1360 pp->private = zv; 1361 1362 zsg->zsg_provider = pp; 1363 bioq_init(&zsg->zsg_queue); 1364 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1365 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1366 struct cdev *dev; 1367 struct make_dev_args args; 1368 1369 make_dev_args_init(&args); 1370 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; 1371 args.mda_devsw = &zvol_cdevsw; 1372 args.mda_cr = NULL; 1373 args.mda_uid = UID_ROOT; 1374 args.mda_gid = GID_OPERATOR; 1375 args.mda_mode = 0640; 1376 args.mda_si_drv2 = zv; 1377 error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); 1378 if (error) { 1379 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1380 mutex_destroy(&zv->zv_state_lock); 1381 kmem_free(zv, sizeof (*zv)); 1382 dmu_objset_disown(os, B_TRUE, FTAG); 1383 goto out_doi; 1384 } 1385 dev->si_iosize_max = maxphys; 1386 zsd->zsd_cdev = dev; 1387 } 1388 (void) strlcpy(zv->zv_name, name, MAXPATHLEN); 1389 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1390 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1391 1392 if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) 1393 zv->zv_flags |= 
ZVOL_RDONLY; 1394 1395 zv->zv_volblocksize = doi->doi_data_block_size; 1396 zv->zv_volsize = volsize; 1397 zv->zv_objset = os; 1398 1399 ASSERT3P(zv->zv_zilog, ==, NULL); 1400 zv->zv_zilog = zil_open(os, zvol_get_data); 1401 if (spa_writeable(dmu_objset_spa(os))) { 1402 if (zil_replay_disable) 1403 zil_destroy(zv->zv_zilog, B_FALSE); 1404 else 1405 zil_replay(os, zv, zvol_replay_vector); 1406 } 1407 zil_close(zv->zv_zilog); 1408 zv->zv_zilog = NULL; 1409 ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); 1410 dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1411 1412 /* TODO: prefetch for geom tasting */ 1413 1414 zv->zv_objset = NULL; 1415 out_dmu_objset_disown: 1416 dmu_objset_disown(os, B_TRUE, FTAG); 1417 1418 if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { 1419 zvol_geom_run(zv); 1420 g_topology_unlock(); 1421 } 1422 out_doi: 1423 kmem_free(doi, sizeof (dmu_object_info_t)); 1424 if (error == 0) { 1425 rw_enter(&zvol_state_lock, RW_WRITER); 1426 zvol_insert(zv); 1427 zvol_minors++; 1428 rw_exit(&zvol_state_lock); 1429 ZFS_LOG(1, "ZVOL %s created.", name); 1430 } 1431 PICKUP_GIANT(); 1432 return (error); 1433 } 1434 1435 static void 1436 zvol_clear_private(zvol_state_t *zv) 1437 { 1438 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1439 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1440 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1441 struct g_provider *pp = zsg->zsg_provider; 1442 1443 if (pp->private == NULL) /* already cleared */ 1444 return; 1445 1446 mtx_lock(&zsg->zsg_queue_mtx); 1447 zsg->zsg_state = ZVOL_GEOM_STOPPED; 1448 pp->private = NULL; 1449 wakeup_one(&zsg->zsg_queue); 1450 while (zsg->zsg_state != ZVOL_GEOM_RUNNING) 1451 msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, 1452 0, "zvol:w", 0); 1453 mtx_unlock(&zsg->zsg_queue_mtx); 1454 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1455 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1456 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1457 struct cdev *dev = zsd->zsd_cdev; 1458 1459 dev->si_drv2 = NULL; 
1460 } 1461 } 1462 1463 static int 1464 zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) 1465 { 1466 zv->zv_volsize = volsize; 1467 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1468 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1469 struct g_provider *pp = zsg->zsg_provider; 1470 1471 g_topology_lock(); 1472 1473 if (pp->private == NULL) { 1474 g_topology_unlock(); 1475 return (SET_ERROR(ENXIO)); 1476 } 1477 1478 /* 1479 * Do not invoke resize event when initial size was zero. 1480 * ZVOL initializes the size on first open, this is not 1481 * real resizing. 1482 */ 1483 if (pp->mediasize == 0) 1484 pp->mediasize = zv->zv_volsize; 1485 else 1486 g_resize_provider(pp, zv->zv_volsize); 1487 1488 g_topology_unlock(); 1489 } 1490 return (0); 1491 } 1492 1493 static void 1494 zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) 1495 { 1496 // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); 1497 } 1498 1499 static void 1500 zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) 1501 { 1502 // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); 1503 } 1504 1505 const static zvol_platform_ops_t zvol_freebsd_ops = { 1506 .zv_free = zvol_free, 1507 .zv_rename_minor = zvol_rename_minor, 1508 .zv_create_minor = zvol_create_minor_impl, 1509 .zv_update_volsize = zvol_update_volsize, 1510 .zv_clear_private = zvol_clear_private, 1511 .zv_is_zvol = zvol_is_zvol_impl, 1512 .zv_set_disk_ro = zvol_set_disk_ro_impl, 1513 .zv_set_capacity = zvol_set_capacity_impl, 1514 }; 1515 1516 /* 1517 * Public interfaces 1518 */ 1519 1520 int 1521 zvol_busy(void) 1522 { 1523 return (zvol_minors != 0); 1524 } 1525 1526 int 1527 zvol_init(void) 1528 { 1529 zvol_init_impl(); 1530 zvol_register_ops(&zvol_freebsd_ops); 1531 return (0); 1532 } 1533 1534 void 1535 zvol_fini(void) 1536 { 1537 zvol_fini_impl(); 1538 } 1539