1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 25 * All rights reserved. 26 * 27 * Portions Copyright 2010 Robert Milkowski 28 * 29 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 30 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 31 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 32 * Copyright (c) 2014 Integros [integros.com] 33 */ 34 35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ 36 37 /* 38 * ZFS volume emulation driver. 39 * 40 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. 41 * Volumes are accessed through the symbolic links named: 42 * 43 * /dev/zvol/<pool_name>/<dataset_name> 44 * 45 * Volumes are persistent through reboot. No user command needs to be 46 * run before opening and using a device. 47 * 48 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device 49 * in the system. Except when they're simply character devices (volmode=dev). 
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/zil_impl.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * With ZVOL_LOCK_DEBUG defined, reader acquisitions of zv_suspend_lock are
 * promoted to writer acquisitions so that lock-ordering problems surface
 * immediately during development.  Normal builds use plain reader locks.
 */
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

/* Lifecycle of the per-zvol GEOM worker thread (see zvol_geom_worker()). */
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

/*
 * FreeBSD-specific per-zvol state.  Exactly one of the union members is
 * active, selected by zv_volmode (ZFS_VOLMODE_DEV vs. ZFS_VOLMODE_GEOM).
 */
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			/* kqueue EVFILT_VNOTE notification list */
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			/* bios deferred to the worker thread */
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	/* nonzero once teardown has begun; new opens fail with ENXIO */
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;
static d_kqfilter_t zvol_cdev_kqfilter;

/* Character-device entry points used in volmode=dev. */
static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

/*
 * Open path for a GEOM provider, called (indirectly) from zvol_geom_access().
 * Adds "count" references to zv_open_count; the first open also attaches the
 * objset via zvol_first_open() and publishes media geometry on the provider.
 *
 * Lock order is zv_suspend_lock before zv_state_lock; when the trylock on
 * spa_namespace_lock fails, everything is dropped and the whole sequence is
 * retried from scratch to avoid a lock inversion against pool operations.
 */
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV. In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev. Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	/*
	 * If we failed above before taking any references, undo the
	 * first-open work and wake zvol_wait_close() sleepers.
	 */
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*
 * Close path for a GEOM provider: drops "count" references from
 * zv_open_count and, on the last close, tears down via zvol_last_close()
 * while holding zv_suspend_lock to keep the teardown out of a suspend
 * window.  Mirrors the lock choreography of zvol_geom_open().
 */
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		/* Wake anyone blocked in zvol_wait_close(). */
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

/*
 * Mark the provider usable and start the per-zvol worker thread that
 * services bios queued by zvol_geom_bio_start().
 */
static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

/*
 * Detach and wither the GEOM provider.  Caller must hold the GEOM
 * topology lock (asserted below); the worker must already be running.
 */
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

/*
 * Flag the zvol as dying and wait (bounded, 10 s) for outstanding opens
 * to drain.  GEOM-mode only; dev-mode zvols return immediately.
 */
void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}


/*
 * GEOM ->access() method: translates (acr, acw, ace) deltas into
 * open/close calls with an aggregate reference count.
 */
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	/* Drop topology lock: open/close may sleep (I/O, namespace lock). */
	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

/*
 * Per-zvol worker thread: drains bios queued by zvol_geom_bio_start()
 * (those issued from contexts that cannot sleep) and exits when the
 * state machine is flipped to ZVOL_GEOM_STOPPED.
 */
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				/* Acknowledge the stop request and exit. */
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

/*
 * GEOM ->start() method.  Handles BIO_GETATTR inline; defers everything
 * else to the worker thread when the current context cannot sleep,
 * otherwise runs the strategy routine directly.
 */
static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		/* Only wake the worker on an empty->non-empty transition. */
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

/*
 * Answer GEOM attribute queries (BIO_GETATTR).  Returns 0 when the
 * attribute was handled, 1 when it is unknown (caller delivers
 * EOPNOTSUPP).  Space figures are reported in DEV_BSIZE units.
 */
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

/* kqueue detach: unhook the knote from the zvol's notification list. */
static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

/*
 * kqueue event filter: latch the intersection of the requested and
 * delivered NOTE_* bits; the knote fires while any bit is set.
 */
static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

/*
 * d_kqfilter entry point for volmode=dev zvols.  Only EVFILT_VNODE with
 * exactly NOTE_ATTRIB is supported at present.
 */
static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}

/*
 * Main bio dispatch for both GEOM (bio_to set) and cdev (bio_dev set)
 * paths.  Holds zv_suspend_lock as reader for the duration so the zvol
 * cannot be suspended mid-I/O, and the rangelock over the affected byte
 * range.  BIO_FLUSH bypasses straight to the ZIL commit via the
 * "goto commit" into the trailing if-block.
 */
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t commit;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto commit;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	commit = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			/* Treat the whole delete as completed. */
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, commit);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (commit) {
commit:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

/*
 * d_read entry point (volmode=dev).  Reads in DMU_MAX_ACCESS/2 sized
 * chunks under the rangelock; offset == volsize is not an error so EOF
 * behaves like a regular file.
 */
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

/*
 * d_write entry point (volmode=dev).  Each chunk is written in its own
 * tx and logged to the ZIL; a final zil_commit() is issued when IO_SYNC
 * was requested or the objset is sync=always.
 */
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t commit;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	commit = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, commit);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (commit)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

/*
 * d_open entry point (volmode=dev).  Single-reference analogue of
 * zvol_geom_open(): same lock ordering (zv_suspend_lock before
 * zv_state_lock, optional spa_namespace_lock with full retry), but
 * increments zv_open_count by exactly one.
 */
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
out_opened:
	/* Failed first open: undo zvol_first_open() and wake waiters. */
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*
 * d_close entry point (volmode=dev).  Last close runs zvol_last_close()
 * under zv_suspend_lock, mirroring zvol_geom_close().
 */
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		/* Wake anyone blocked in zvol_wait_close(). */
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

/*
 * d_ioctl entry point (volmode=dev).  Implements the standard disk
 * ioctls (sector/media/stripe geometry, flush, delete/TRIM, attribute
 * queries) plus FIOSEEKHOLE/FIOSEEKDATA.  All size figures are in
 * DEV_BSIZE units.
 */
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		/* No ZIL means nothing has been written; nothing to flush. */
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		/* Lock out all writers while the DMU scans for holes/data. */
		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
		    RL_READER);
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		zfs_rangelock_exit(lr);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc.
helpers
 */

/*
 * Lazily open the ZIL on first write.  Caller must hold zv_suspend_lock
 * as reader; the lock is temporarily upgraded to writer (or dropped and
 * retaken) and is held as reader again on return.
 */
static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			/* Upgrade failed: drop and reacquire as writer. */
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		/* Re-check: another thread may have raced us while dropped. */
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

/*
 * Return B_TRUE when the given device path lies under ZVOL_DIR,
 * i.e. the path names a zvol device node.
 */
boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

/*
 * Rename the minor node for a volume: rehash it under the new name and
 * recreate the GEOM provider or cdev accordingly.  Caller must hold
 * zvol_state_lock and zv_state_lock.
 */
void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		/* Wither the old provider, then publish one under newname. */
		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			/* Force-close any openers of the old node. */
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
	dataset_kstats_rename(&zv->zv_kstat, newname);
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	/* Must be called with no locks held and the device fully closed. */
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		/* zvol_os_clear_private() must have run already. */
		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	bool replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	/* Fail if a minor for this name already exists. */
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	/* Fall back to the module-wide default volmode if unset. */
	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		/*
		 * Topology lock is held until after zvol_geom_run() below
		 * (released only on the error paths via the goto labels).
		 */
		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		/* Media size is published on first open / volsize update. */
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	/* Replay (or destroy) the intent log once, at minor creation. */
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	/* The objset is released below; it is reacquired on first open. */
	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

/*
 * Detach the zvol state from its GEOM provider or cdev so that no new
 * I/O can reach it; for GEOM this also stops the worker thread.
 */
void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct
zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		/*
		 * Signal the GEOM worker to stop and wait until it
		 * acknowledges by flipping zsg_state back to RUNNING.
		 */
		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		/* NULL si_drv2 makes subsequent cdev entry points bail. */
		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

/*
 * Propagate a new volume size to the OS-visible device.  Returns 0, or
 * ENXIO if the GEOM provider has already been detached.
 */
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		/* Notify kevent listeners that device attributes changed. */
		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}

/* No-op on FreeBSD; read-only state is tracked via ZVOL_RDONLY. */
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

/* No-op on FreeBSD; capacity is pushed via zvol_os_update_volsize(). */
void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

/* Return non-zero while any zvol minor exists (blocks module unload). */
int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}