1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> 23 * All rights reserved. 24 * 25 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org> 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/param.h> 30 #include <sys/kernel.h> 31 #include <sys/bio.h> 32 #include <sys/buf.h> 33 #include <sys/file.h> 34 #include <sys/spa.h> 35 #include <sys/spa_impl.h> 36 #include <sys/vdev_impl.h> 37 #include <sys/vdev_os.h> 38 #include <sys/fs/zfs.h> 39 #include <sys/zio.h> 40 #include <vm/vm_page.h> 41 #include <geom/geom.h> 42 #include <geom/geom_disk.h> 43 #include <geom/geom_int.h> 44 45 #ifndef g_topology_locked 46 #define g_topology_locked() sx_xlocked(&topology_lock) 47 #endif 48 49 /* 50 * Virtual device vector for GEOM. 51 */ 52 53 static g_attrchanged_t vdev_geom_attrchanged; 54 struct g_class zfs_vdev_class = { 55 .name = "ZFS::VDEV", 56 .version = G_VERSION, 57 .attrchanged = vdev_geom_attrchanged, 58 }; 59 60 struct consumer_vdev_elem { 61 SLIST_ENTRY(consumer_vdev_elem) elems; 62 vdev_t *vd; 63 }; 64 65 SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); 66 /* BEGIN CSTYLED */ 67 _Static_assert(sizeof (((struct g_consumer *)NULL)->private) 68 == sizeof (struct consumer_priv_t*), 69 "consumer_priv_t* can't be stored in g_consumer.private"); 70 71 DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); 72 73 SYSCTL_DECL(_vfs_zfs_vdev); 74 /* Don't send BIO_FLUSH. */ 75 static int vdev_geom_bio_flush_disable; 76 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, 77 &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); 78 /* Don't send BIO_DELETE. */ 79 static int vdev_geom_bio_delete_disable; 80 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, 81 &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); 82 /* END CSTYLED */ 83 84 /* Declare local functions */ 85 static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); 86 87 /* 88 * Thread local storage used to indicate when a thread is probing geoms 89 * for their guids. If NULL, this thread is not tasting geoms. If non NULL, 90 * it is looking for a replacement for the vdev_t* that is its value. 91 */ 92 uint_t zfs_geom_probe_vdev_key; 93 94 static void 95 vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, 96 boolean_t do_null_update) 97 { 98 boolean_t needs_update = B_FALSE; 99 char *physpath; 100 int error, physpath_len; 101 102 physpath_len = MAXPATHLEN; 103 physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); 104 error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); 105 if (error == 0) { 106 char *old_physpath; 107 108 /* g_topology lock ensures that vdev has not been closed */ 109 g_topology_assert(); 110 old_physpath = vd->vdev_physpath; 111 vd->vdev_physpath = spa_strdup(physpath); 112 113 if (old_physpath != NULL) { 114 needs_update = (strcmp(old_physpath, 115 vd->vdev_physpath) != 0); 116 spa_strfree(old_physpath); 117 } else 118 needs_update = do_null_update; 119 } 120 g_free(physpath); 121 122 /* 123 * If the physical path changed, update the config. 124 * Only request an update for previously unset physpaths if 125 * requested by the caller. 126 */ 127 if (needs_update) 128 spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); 129 130 } 131 132 static void 133 vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) 134 { 135 struct consumer_priv_t *priv; 136 struct consumer_vdev_elem *elem; 137 138 priv = (struct consumer_priv_t *)&cp->private; 139 if (SLIST_EMPTY(priv)) 140 return; 141 142 SLIST_FOREACH(elem, priv, elems) { 143 vdev_t *vd = elem->vd; 144 if (strcmp(attr, "GEOM::physpath") == 0) { 145 vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE); 146 return; 147 } 148 } 149 } 150 151 static void 152 vdev_geom_resize(struct g_consumer *cp) 153 { 154 struct consumer_priv_t *priv; 155 struct consumer_vdev_elem *elem; 156 spa_t *spa; 157 vdev_t *vd; 158 159 priv = (struct consumer_priv_t *)&cp->private; 160 if (SLIST_EMPTY(priv)) 161 return; 162 163 SLIST_FOREACH(elem, priv, elems) { 164 vd = elem->vd; 165 if (vd->vdev_state != VDEV_STATE_HEALTHY) 166 continue; 167 spa = vd->vdev_spa; 168 if (!spa->spa_autoexpand) 169 continue; 170 vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); 171 } 172 } 173 174 static void 175 vdev_geom_orphan(struct g_consumer *cp) 176 { 177 struct consumer_priv_t *priv; 178 // cppcheck-suppress uninitvar 179 struct consumer_vdev_elem *elem; 180 181 g_topology_assert(); 182 183 priv = (struct consumer_priv_t *)&cp->private; 184 if (SLIST_EMPTY(priv)) 185 /* Vdev close in progress. Ignore the event. */ 186 return; 187 188 /* 189 * Orphan callbacks occur from the GEOM event thread. 190 * Concurrent with this call, new I/O requests may be 191 * working their way through GEOM about to find out 192 * (only once executed by the g_down thread) that we've 193 * been orphaned from our disk provider. These I/Os 194 * must be retired before we can detach our consumer. 195 * This is most easily achieved by acquiring the 196 * SPA ZIO configuration lock as a writer, but doing 197 * so with the GEOM topology lock held would cause 198 * a lock order reversal. Instead, rely on the SPA's 199 * async removal support to invoke a close on this 200 * vdev once it is safe to do so. 201 */ 202 // cppcheck-suppress All 203 SLIST_FOREACH(elem, priv, elems) { 204 // cppcheck-suppress uninitvar 205 vdev_t *vd = elem->vd; 206 207 vd->vdev_remove_wanted = B_TRUE; 208 spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); 209 } 210 } 211 212 static struct g_consumer * 213 vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) 214 { 215 struct g_geom *gp; 216 struct g_consumer *cp; 217 int error; 218 219 g_topology_assert(); 220 221 ZFS_LOG(1, "Attaching to %s.", pp->name); 222 223 if (sanity) { 224 if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { 225 ZFS_LOG(1, "Failing attach of %s. " 226 "Incompatible sectorsize %d\n", 227 pp->name, pp->sectorsize); 228 return (NULL); 229 } else if (pp->mediasize < SPA_MINDEVSIZE) { 230 ZFS_LOG(1, "Failing attach of %s. " 231 "Incompatible mediasize %ju\n", 232 pp->name, pp->mediasize); 233 return (NULL); 234 } 235 } 236 237 /* Do we have geom already? No? Create one. */ 238 LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { 239 if (gp->flags & G_GEOM_WITHER) 240 continue; 241 if (strcmp(gp->name, "zfs::vdev") != 0) 242 continue; 243 break; 244 } 245 if (gp == NULL) { 246 gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); 247 gp->orphan = vdev_geom_orphan; 248 gp->attrchanged = vdev_geom_attrchanged; 249 gp->resize = vdev_geom_resize; 250 cp = g_new_consumer(gp); 251 error = g_attach(cp, pp); 252 if (error != 0) { 253 ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, 254 __LINE__, error); 255 vdev_geom_detach(cp, B_FALSE); 256 return (NULL); 257 } 258 error = g_access(cp, 1, 0, 1); 259 if (error != 0) { 260 ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, 261 __LINE__, error); 262 vdev_geom_detach(cp, B_FALSE); 263 return (NULL); 264 } 265 ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); 266 } else { 267 /* Check if we are already connected to this provider. */ 268 LIST_FOREACH(cp, &gp->consumer, consumer) { 269 if (cp->provider == pp) { 270 ZFS_LOG(1, "Found consumer for %s.", pp->name); 271 break; 272 } 273 } 274 if (cp == NULL) { 275 cp = g_new_consumer(gp); 276 error = g_attach(cp, pp); 277 if (error != 0) { 278 ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", 279 __func__, __LINE__, error); 280 vdev_geom_detach(cp, B_FALSE); 281 return (NULL); 282 } 283 error = g_access(cp, 1, 0, 1); 284 if (error != 0) { 285 ZFS_LOG(1, "%s(%d): g_access failed: %d\n", 286 __func__, __LINE__, error); 287 vdev_geom_detach(cp, B_FALSE); 288 return (NULL); 289 } 290 ZFS_LOG(1, "Created consumer for %s.", pp->name); 291 } else { 292 error = g_access(cp, 1, 0, 1); 293 if (error != 0) { 294 ZFS_LOG(1, "%s(%d): g_access failed: %d\n", 295 __func__, __LINE__, error); 296 return (NULL); 297 } 298 ZFS_LOG(1, "Used existing consumer for %s.", pp->name); 299 } 300 } 301 302 if (vd != NULL) 303 vd->vdev_tsd = cp; 304 305 cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; 306 return (cp); 307 } 308 309 static void 310 vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) 311 { 312 struct g_geom *gp; 313 314 g_topology_assert(); 315 316 ZFS_LOG(1, "Detaching from %s.", 317 cp->provider && cp->provider->name ? cp->provider->name : "NULL"); 318 319 gp = cp->geom; 320 if (open_for_read) 321 g_access(cp, -1, 0, -1); 322 /* Destroy consumer on last close. */ 323 if (cp->acr == 0 && cp->ace == 0) { 324 if (cp->acw > 0) 325 g_access(cp, 0, -cp->acw, 0); 326 if (cp->provider != NULL) { 327 ZFS_LOG(1, "Destroying consumer for %s.", 328 cp->provider->name ? cp->provider->name : "NULL"); 329 g_detach(cp); 330 } 331 g_destroy_consumer(cp); 332 } 333 /* Destroy geom if there are no consumers left. */ 334 if (LIST_EMPTY(&gp->consumer)) { 335 ZFS_LOG(1, "Destroyed geom %s.", gp->name); 336 g_wither_geom(gp, ENXIO); 337 } 338 } 339 340 static void 341 vdev_geom_close_locked(vdev_t *vd) 342 { 343 struct g_consumer *cp; 344 struct consumer_priv_t *priv; 345 struct consumer_vdev_elem *elem, *elem_temp; 346 347 g_topology_assert(); 348 349 cp = vd->vdev_tsd; 350 vd->vdev_delayed_close = B_FALSE; 351 if (cp == NULL) 352 return; 353 354 ZFS_LOG(1, "Closing access to %s.", cp->provider->name); 355 KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); 356 priv = (struct consumer_priv_t *)&cp->private; 357 vd->vdev_tsd = NULL; 358 SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { 359 if (elem->vd == vd) { 360 SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); 361 g_free(elem); 362 } 363 } 364 365 vdev_geom_detach(cp, B_TRUE); 366 } 367 368 /* 369 * Issue one or more bios to the vdev in parallel 370 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO 371 * operation is described by parallel entries from each array. There may be 372 * more bios actually issued than entries in the array 373 */ 374 static void 375 vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, 376 off_t *sizes, int *errors, int ncmds) 377 { 378 struct bio **bios; 379 uint8_t *p; 380 off_t off, maxio, s, end; 381 int i, n_bios, j; 382 size_t bios_size; 383 384 #if __FreeBSD_version > 1300130 385 maxio = maxphys - (maxphys % cp->provider->sectorsize); 386 #else 387 maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); 388 #endif 389 n_bios = 0; 390 391 /* How many bios are required for all commands ? */ 392 for (i = 0; i < ncmds; i++) 393 n_bios += (sizes[i] + maxio - 1) / maxio; 394 395 /* Allocate memory for the bios */ 396 bios_size = n_bios * sizeof (struct bio *); 397 bios = kmem_zalloc(bios_size, KM_SLEEP); 398 399 /* Prepare and issue all of the bios */ 400 for (i = j = 0; i < ncmds; i++) { 401 off = offsets[i]; 402 p = datas[i]; 403 s = sizes[i]; 404 end = off + s; 405 ASSERT0(off % cp->provider->sectorsize); 406 ASSERT0(s % cp->provider->sectorsize); 407 408 for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { 409 bios[j] = g_alloc_bio(); 410 bios[j]->bio_cmd = cmds[i]; 411 bios[j]->bio_done = NULL; 412 bios[j]->bio_offset = off; 413 bios[j]->bio_length = MIN(s, maxio); 414 bios[j]->bio_data = (caddr_t)p; 415 g_io_request(bios[j], cp); 416 } 417 } 418 ASSERT3S(j, ==, n_bios); 419 420 /* Wait for all of the bios to complete, and clean them up */ 421 for (i = j = 0; i < ncmds; i++) { 422 off = offsets[i]; 423 s = sizes[i]; 424 end = off + s; 425 426 for (; off < end; off += maxio, s -= maxio, j++) { 427 errors[i] = biowait(bios[j], "vdev_geom_io") || 428 errors[i]; 429 g_destroy_bio(bios[j]); 430 } 431 } 432 kmem_free(bios, bios_size); 433 } 434 435 /* 436 * Read the vdev config from a device. Return the number of valid labels that 437 * were found. The vdev config will be returned in config if and only if at 438 * least one valid label was found. 439 */ 440 static int 441 vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) 442 { 443 struct g_provider *pp; 444 nvlist_t *config; 445 vdev_phys_t *vdev_lists[VDEV_LABELS]; 446 char *buf; 447 size_t buflen; 448 uint64_t psize, state, txg; 449 off_t offsets[VDEV_LABELS]; 450 off_t size; 451 off_t sizes[VDEV_LABELS]; 452 int cmds[VDEV_LABELS]; 453 int errors[VDEV_LABELS]; 454 int l, nlabels; 455 456 g_topology_assert_not(); 457 458 pp = cp->provider; 459 ZFS_LOG(1, "Reading config from %s...", pp->name); 460 461 psize = pp->mediasize; 462 psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); 463 464 size = sizeof (*vdev_lists[0]) + pp->sectorsize - 465 ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; 466 467 buflen = sizeof (vdev_lists[0]->vp_nvlist); 468 469 /* Create all of the IO requests */ 470 for (l = 0; l < VDEV_LABELS; l++) { 471 cmds[l] = BIO_READ; 472 vdev_lists[l] = kmem_alloc(size, KM_SLEEP); 473 offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; 474 sizes[l] = size; 475 errors[l] = 0; 476 ASSERT0(offsets[l] % pp->sectorsize); 477 } 478 479 /* Issue the IO requests */ 480 vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, 481 VDEV_LABELS); 482 483 /* Parse the labels */ 484 config = *configp = NULL; 485 nlabels = 0; 486 for (l = 0; l < VDEV_LABELS; l++) { 487 if (errors[l] != 0) 488 continue; 489 490 buf = vdev_lists[l]->vp_nvlist; 491 492 if (nvlist_unpack(buf, buflen, &config, 0) != 0) 493 continue; 494 495 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, 496 &state) != 0 || state > POOL_STATE_L2CACHE) { 497 nvlist_free(config); 498 continue; 499 } 500 501 if (state != POOL_STATE_SPARE && 502 state != POOL_STATE_L2CACHE && 503 (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 504 &txg) != 0 || txg == 0)) { 505 nvlist_free(config); 506 continue; 507 } 508 509 if (*configp != NULL) 510 nvlist_free(*configp); 511 *configp = config; 512 nlabels++; 513 } 514 515 /* Free the label storage */ 516 for (l = 0; l < VDEV_LABELS; l++) 517 kmem_free(vdev_lists[l], size); 518 519 return (nlabels); 520 } 521 522 static void 523 resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) 524 { 525 nvlist_t **new_configs; 526 uint64_t i; 527 528 if (id < *count) 529 return; 530 new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *), 531 KM_SLEEP); 532 for (i = 0; i < *count; i++) 533 new_configs[i] = (*configs)[i]; 534 if (*configs != NULL) 535 kmem_free(*configs, *count * sizeof (void *)); 536 *configs = new_configs; 537 *count = id + 1; 538 } 539 540 static void 541 process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, 542 const char *name, uint64_t *known_pool_guid) 543 { 544 nvlist_t *vdev_tree; 545 uint64_t pool_guid; 546 uint64_t vdev_guid; 547 uint64_t id, txg, known_txg; 548 char *pname; 549 550 if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || 551 strcmp(pname, name) != 0) 552 goto ignore; 553 554 if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) 555 goto ignore; 556 557 if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) 558 goto ignore; 559 560 if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) 561 goto ignore; 562 563 if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) 564 goto ignore; 565 566 txg = fnvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG); 567 568 if (*known_pool_guid != 0) { 569 if (pool_guid != *known_pool_guid) 570 goto ignore; 571 } else 572 *known_pool_guid = pool_guid; 573 574 resize_configs(configs, count, id); 575 576 if ((*configs)[id] != NULL) { 577 known_txg = fnvlist_lookup_uint64((*configs)[id], 578 ZPOOL_CONFIG_POOL_TXG); 579 if (txg <= known_txg) 580 goto ignore; 581 nvlist_free((*configs)[id]); 582 } 583 584 (*configs)[id] = cfg; 585 return; 586 587 ignore: 588 nvlist_free(cfg); 589 } 590 591 int 592 vdev_geom_read_pool_label(const char *name, 593 nvlist_t ***configs, uint64_t *count) 594 { 595 struct g_class *mp; 596 struct g_geom *gp; 597 struct g_provider *pp; 598 struct g_consumer *zcp; 599 nvlist_t *vdev_cfg; 600 uint64_t pool_guid; 601 int nlabels; 602 603 DROP_GIANT(); 604 g_topology_lock(); 605 606 *configs = NULL; 607 *count = 0; 608 pool_guid = 0; 609 LIST_FOREACH(mp, &g_classes, class) { 610 if (mp == &zfs_vdev_class) 611 continue; 612 LIST_FOREACH(gp, &mp->geom, geom) { 613 if (gp->flags & G_GEOM_WITHER) 614 continue; 615 LIST_FOREACH(pp, &gp->provider, provider) { 616 if (pp->flags & G_PF_WITHER) 617 continue; 618 zcp = vdev_geom_attach(pp, NULL, B_TRUE); 619 if (zcp == NULL) 620 continue; 621 g_topology_unlock(); 622 nlabels = vdev_geom_read_config(zcp, &vdev_cfg); 623 g_topology_lock(); 624 vdev_geom_detach(zcp, B_TRUE); 625 if (nlabels == 0) 626 continue; 627 ZFS_LOG(1, "successfully read vdev config"); 628 629 process_vdev_config(configs, count, 630 vdev_cfg, name, &pool_guid); 631 } 632 } 633 } 634 g_topology_unlock(); 635 PICKUP_GIANT(); 636 637 return (*count > 0 ? 0 : ENOENT); 638 } 639 640 enum match { 641 NO_MATCH = 0, /* No matching labels found */ 642 TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */ 643 ZERO_MATCH = 1, /* Should never be returned */ 644 ONE_MATCH = 2, /* 1 label matching the vdev_guid */ 645 TWO_MATCH = 3, /* 2 label matching the vdev_guid */ 646 THREE_MATCH = 4, /* 3 label matching the vdev_guid */ 647 FULL_MATCH = 5 /* all labels match the vdev_guid */ 648 }; 649 650 static enum match 651 vdev_attach_ok(vdev_t *vd, struct g_provider *pp) 652 { 653 nvlist_t *config; 654 uint64_t pool_guid, top_guid, vdev_guid; 655 struct g_consumer *cp; 656 int nlabels; 657 658 cp = vdev_geom_attach(pp, NULL, B_TRUE); 659 if (cp == NULL) { 660 ZFS_LOG(1, "Unable to attach tasting instance to %s.", 661 pp->name); 662 return (NO_MATCH); 663 } 664 g_topology_unlock(); 665 nlabels = vdev_geom_read_config(cp, &config); 666 g_topology_lock(); 667 vdev_geom_detach(cp, B_TRUE); 668 if (nlabels == 0) { 669 ZFS_LOG(1, "Unable to read config from %s.", pp->name); 670 return (NO_MATCH); 671 } 672 673 pool_guid = 0; 674 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); 675 top_guid = 0; 676 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); 677 vdev_guid = 0; 678 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); 679 nvlist_free(config); 680 681 /* 682 * Check that the label's pool guid matches the desired guid. 683 * Inactive spares and L2ARCs do not have any pool guid in the label. 684 */ 685 if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { 686 ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", 687 pp->name, 688 (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); 689 return (NO_MATCH); 690 } 691 692 /* 693 * Check that the label's vdev guid matches the desired guid. 694 * The second condition handles possible race on vdev detach, when 695 * remaining vdev receives GUID of destroyed top level mirror vdev. 696 */ 697 if (vdev_guid == vd->vdev_guid) { 698 ZFS_LOG(1, "guids match for provider %s.", pp->name); 699 return (ZERO_MATCH + nlabels); 700 } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { 701 ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); 702 return (TOPGUID_MATCH); 703 } 704 ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", 705 pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); 706 return (NO_MATCH); 707 } 708 709 static struct g_consumer * 710 vdev_geom_attach_by_guids(vdev_t *vd) 711 { 712 struct g_class *mp; 713 struct g_geom *gp; 714 struct g_provider *pp, *best_pp; 715 struct g_consumer *cp; 716 const char *vdpath; 717 enum match match, best_match; 718 719 g_topology_assert(); 720 721 vdpath = vd->vdev_path + sizeof ("/dev/") - 1; 722 cp = NULL; 723 best_pp = NULL; 724 best_match = NO_MATCH; 725 LIST_FOREACH(mp, &g_classes, class) { 726 if (mp == &zfs_vdev_class) 727 continue; 728 LIST_FOREACH(gp, &mp->geom, geom) { 729 if (gp->flags & G_GEOM_WITHER) 730 continue; 731 LIST_FOREACH(pp, &gp->provider, provider) { 732 match = vdev_attach_ok(vd, pp); 733 if (match > best_match) { 734 best_match = match; 735 best_pp = pp; 736 } else if (match == best_match) { 737 if (strcmp(pp->name, vdpath) == 0) { 738 best_pp = pp; 739 } 740 } 741 if (match == FULL_MATCH) 742 goto out; 743 } 744 } 745 } 746 747 out: 748 if (best_pp) { 749 cp = vdev_geom_attach(best_pp, vd, B_TRUE); 750 if (cp == NULL) { 751 printf("ZFS WARNING: Unable to attach to %s.\n", 752 best_pp->name); 753 } 754 } 755 return (cp); 756 } 757 758 static struct g_consumer * 759 vdev_geom_open_by_guids(vdev_t *vd) 760 { 761 struct g_consumer *cp; 762 char *buf; 763 size_t len; 764 765 g_topology_assert(); 766 767 ZFS_LOG(1, "Searching by guids [%ju:%ju].", 768 (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); 769 cp = vdev_geom_attach_by_guids(vd); 770 if (cp != NULL) { 771 len = strlen(cp->provider->name) + strlen("/dev/") + 1; 772 buf = kmem_alloc(len, KM_SLEEP); 773 774 snprintf(buf, len, "/dev/%s", cp->provider->name); 775 spa_strfree(vd->vdev_path); 776 vd->vdev_path = buf; 777 778 ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", 779 (uintmax_t)spa_guid(vd->vdev_spa), 780 (uintmax_t)vd->vdev_guid, cp->provider->name); 781 } else { 782 ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", 783 (uintmax_t)spa_guid(vd->vdev_spa), 784 (uintmax_t)vd->vdev_guid); 785 } 786 787 return (cp); 788 } 789 790 static struct g_consumer * 791 vdev_geom_open_by_path(vdev_t *vd, int check_guid) 792 { 793 struct g_provider *pp; 794 struct g_consumer *cp; 795 796 g_topology_assert(); 797 798 cp = NULL; 799 pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1); 800 if (pp != NULL) { 801 ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); 802 if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) 803 cp = vdev_geom_attach(pp, vd, B_FALSE); 804 } 805 806 return (cp); 807 } 808 809 static int 810 vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 811 uint64_t *logical_ashift, uint64_t *physical_ashift) 812 { 813 struct g_provider *pp; 814 struct g_consumer *cp; 815 int error, has_trim; 816 uint16_t rate; 817 818 /* 819 * Set the TLS to indicate downstack that we 820 * should not access zvols 821 */ 822 VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd)); 823 824 /* 825 * We must have a pathname, and it must be absolute. 826 */ 827 if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { 828 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 829 return (EINVAL); 830 } 831 832 /* 833 * Reopen the device if it's not currently open. Otherwise, 834 * just update the physical size of the device. 835 */ 836 if ((cp = vd->vdev_tsd) != NULL) { 837 ASSERT(vd->vdev_reopening); 838 goto skip_open; 839 } 840 841 DROP_GIANT(); 842 g_topology_lock(); 843 error = 0; 844 845 if (vd->vdev_spa->spa_is_splitting || 846 ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN && 847 (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || 848 vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) { 849 /* 850 * We are dealing with a vdev that hasn't been previously 851 * opened (since boot), and we are not loading an 852 * existing pool configuration. This looks like a 853 * vdev add operation to a new or existing pool. 854 * Assume the user really wants to do this, and find 855 * GEOM provider by its name, ignoring GUID mismatches. 856 * 857 * XXPOLICY: It would be safer to only allow a device 858 * that is unlabeled or labeled but missing 859 * GUID information to be opened in this fashion, 860 * unless we are doing a split, in which case we 861 * should allow any guid. 862 */ 863 cp = vdev_geom_open_by_path(vd, 0); 864 } else { 865 /* 866 * Try using the recorded path for this device, but only 867 * accept it if its label data contains the expected GUIDs. 868 */ 869 cp = vdev_geom_open_by_path(vd, 1); 870 if (cp == NULL) { 871 /* 872 * The device at vd->vdev_path doesn't have the 873 * expected GUIDs. The disks might have merely 874 * moved around so try all other GEOM providers 875 * to find one with the right GUIDs. 876 */ 877 cp = vdev_geom_open_by_guids(vd); 878 } 879 } 880 881 /* Clear the TLS now that tasting is done */ 882 VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL)); 883 884 if (cp == NULL) { 885 ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); 886 error = ENOENT; 887 } else { 888 struct consumer_priv_t *priv; 889 struct consumer_vdev_elem *elem; 890 int spamode; 891 892 priv = (struct consumer_priv_t *)&cp->private; 893 if (cp->private == NULL) 894 SLIST_INIT(priv); 895 elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO); 896 elem->vd = vd; 897 SLIST_INSERT_HEAD(priv, elem, elems); 898 899 spamode = spa_mode(vd->vdev_spa); 900 if (cp->provider->sectorsize > VDEV_PAD_SIZE || 901 !ISP2(cp->provider->sectorsize)) { 902 ZFS_LOG(1, "Provider %s has unsupported sectorsize.", 903 cp->provider->name); 904 905 vdev_geom_close_locked(vd); 906 error = EINVAL; 907 cp = NULL; 908 } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { 909 int i; 910 911 for (i = 0; i < 5; i++) { 912 error = g_access(cp, 0, 1, 0); 913 if (error == 0) 914 break; 915 g_topology_unlock(); 916 tsleep(vd, 0, "vdev", hz / 2); 917 g_topology_lock(); 918 } 919 if (error != 0) { 920 printf("ZFS WARNING: Unable to open %s for " 921 "writing (error=%d).\n", 922 cp->provider->name, error); 923 vdev_geom_close_locked(vd); 924 cp = NULL; 925 } 926 } 927 } 928 929 /* Fetch initial physical path information for this device. */ 930 if (cp != NULL) { 931 vdev_geom_attrchanged(cp, "GEOM::physpath"); 932 933 /* Set other GEOM characteristics */ 934 vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE); 935 } 936 937 g_topology_unlock(); 938 PICKUP_GIANT(); 939 if (cp == NULL) { 940 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 941 vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", 942 error); 943 return (error); 944 } 945 skip_open: 946 pp = cp->provider; 947 948 /* 949 * Determine the actual size of the device. 950 */ 951 *max_psize = *psize = pp->mediasize; 952 953 /* 954 * Determine the device's minimum transfer size and preferred 955 * transfer size. 956 */ 957 *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; 958 *physical_ashift = 0; 959 if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && 960 ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && 961 pp->stripeoffset == 0) 962 *physical_ashift = highbit(pp->stripesize) - 1; 963 964 /* 965 * Clear the nowritecache settings, so that on a vdev_reopen() 966 * we will try again. 967 */ 968 vd->vdev_nowritecache = B_FALSE; 969 970 /* Inform the ZIO pipeline that we are non-rotational. */ 971 error = g_getattr("GEOM::rotation_rate", cp, &rate); 972 if (error == 0 && rate == DISK_RR_NON_ROTATING) 973 vd->vdev_nonrot = B_TRUE; 974 else 975 vd->vdev_nonrot = B_FALSE; 976 977 /* Set when device reports it supports TRIM. */ 978 error = g_getattr("GEOM::candelete", cp, &has_trim); 979 vd->vdev_has_trim = (error == 0 && has_trim); 980 981 /* Set when device reports it supports secure TRIM. */ 982 /* unavailable on FreeBSD */ 983 vd->vdev_has_securetrim = B_FALSE; 984 985 return (0); 986 } 987 988 static void 989 vdev_geom_close(vdev_t *vd) 990 { 991 struct g_consumer *cp; 992 boolean_t locked; 993 994 cp = vd->vdev_tsd; 995 996 DROP_GIANT(); 997 locked = g_topology_locked(); 998 if (!locked) 999 g_topology_lock(); 1000 1001 if (!vd->vdev_reopening || 1002 (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || 1003 (cp->provider != NULL && cp->provider->error != 0)))) 1004 vdev_geom_close_locked(vd); 1005 1006 if (!locked) 1007 g_topology_unlock(); 1008 PICKUP_GIANT(); 1009 } 1010 1011 static void 1012 vdev_geom_io_intr(struct bio *bp) 1013 { 1014 vdev_t *vd; 1015 zio_t *zio; 1016 1017 zio = bp->bio_caller1; 1018 vd = zio->io_vd; 1019 zio->io_error = bp->bio_error; 1020 if (zio->io_error == 0 && bp->bio_resid != 0) 1021 zio->io_error = SET_ERROR(EIO); 1022 1023 switch (zio->io_error) { 1024 case ENOTSUP: 1025 /* 1026 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know 1027 * that future attempts will never succeed. In this case 1028 * we set a persistent flag so that we don't bother with 1029 * requests in the future. 1030 */ 1031 switch (bp->bio_cmd) { 1032 case BIO_FLUSH: 1033 vd->vdev_nowritecache = B_TRUE; 1034 break; 1035 case BIO_DELETE: 1036 break; 1037 } 1038 break; 1039 case ENXIO: 1040 if (!vd->vdev_remove_wanted) { 1041 /* 1042 * If provider's error is set we assume it is being 1043 * removed. 1044 */ 1045 if (bp->bio_to->error != 0) { 1046 vd->vdev_remove_wanted = B_TRUE; 1047 spa_async_request(zio->io_spa, 1048 SPA_ASYNC_REMOVE); 1049 } else if (!vd->vdev_delayed_close) { 1050 vd->vdev_delayed_close = B_TRUE; 1051 } 1052 } 1053 break; 1054 } 1055 1056 /* 1057 * We have to split bio freeing into two parts, because the ABD code 1058 * cannot be called in this context and vdev_op_io_done is not called 1059 * for ZIO_TYPE_IOCTL zio-s. 1060 */ 1061 if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { 1062 g_destroy_bio(bp); 1063 zio->io_bio = NULL; 1064 } 1065 zio_delay_interrupt(zio); 1066 } 1067 1068 struct vdev_geom_check_unmapped_cb_state { 1069 int pages; 1070 uint_t end; 1071 }; 1072 1073 /* 1074 * Callback to check the ABD segment size/alignment and count the pages. 1075 * GEOM requires data buffer to look virtually contiguous. It means only 1076 * the first page of the buffer may not start and only the last may not 1077 * end on a page boundary. All other physical pages must be full. 1078 */ 1079 static int 1080 vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv) 1081 { 1082 struct vdev_geom_check_unmapped_cb_state *s = priv; 1083 vm_offset_t off = (vm_offset_t)buf & PAGE_MASK; 1084 1085 if (s->pages != 0 && off != 0) 1086 return (1); 1087 if (s->end != 0) 1088 return (1); 1089 s->end = (off + len) & PAGE_MASK; 1090 s->pages += (off + len + PAGE_MASK) >> PAGE_SHIFT; 1091 return (0); 1092 } 1093 1094 /* 1095 * Check whether we can use unmapped I/O for this ZIO on this device to 1096 * avoid data copying between scattered and/or gang ABD buffer and linear. 1097 */ 1098 static int 1099 vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp) 1100 { 1101 struct vdev_geom_check_unmapped_cb_state s; 1102 1103 /* If unmapped I/O is administratively disabled, respect that. */ 1104 if (!unmapped_buf_allowed) 1105 return (0); 1106 1107 /* If the buffer is already linear, then nothing to do here. */ 1108 if (abd_is_linear(zio->io_abd)) 1109 return (0); 1110 1111 /* 1112 * If unmapped I/O is not supported by the GEOM provider, 1113 * then we can't do anything and have to copy the data. 1114 */ 1115 if ((cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0) 1116 return (0); 1117 1118 /* Check the buffer chunks sizes/alignments and count pages. */ 1119 s.pages = s.end = 0; 1120 if (abd_iterate_func(zio->io_abd, 0, zio->io_size, 1121 vdev_geom_check_unmapped_cb, &s)) 1122 return (0); 1123 return (s.pages); 1124 } 1125 1126 /* 1127 * Callback to translate the ABD segment into array of physical pages. 1128 */ 1129 static int 1130 vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv) 1131 { 1132 struct bio *bp = priv; 1133 vm_offset_t addr = (vm_offset_t)buf; 1134 vm_offset_t end = addr + len; 1135 1136 if (bp->bio_ma_n == 0) 1137 bp->bio_ma_offset = addr & PAGE_MASK; 1138 do { 1139 bp->bio_ma[bp->bio_ma_n++] = 1140 PHYS_TO_VM_PAGE(pmap_kextract(addr)); 1141 addr += PAGE_SIZE; 1142 } while (addr < end); 1143 return (0); 1144 } 1145 1146 static void 1147 vdev_geom_io_start(zio_t *zio) 1148 { 1149 vdev_t *vd; 1150 struct g_consumer *cp; 1151 struct bio *bp; 1152 1153 vd = zio->io_vd; 1154 1155 switch (zio->io_type) { 1156 case ZIO_TYPE_IOCTL: 1157 /* XXPOLICY */ 1158 if (!vdev_readable(vd)) { 1159 zio->io_error = SET_ERROR(ENXIO); 1160 zio_interrupt(zio); 1161 return; 1162 } else { 1163 switch (zio->io_cmd) { 1164 case DKIOCFLUSHWRITECACHE: 1165 if (zfs_nocacheflush || 1166 vdev_geom_bio_flush_disable) 1167 break; 1168 if (vd->vdev_nowritecache) { 1169 zio->io_error = SET_ERROR(ENOTSUP); 1170 break; 1171 } 1172 goto sendreq; 1173 default: 1174 zio->io_error = SET_ERROR(ENOTSUP); 1175 } 1176 } 1177 1178 zio_execute(zio); 1179 return; 1180 case ZIO_TYPE_TRIM: 1181 if (!vdev_geom_bio_delete_disable) { 1182 goto sendreq; 1183 } 1184 zio_execute(zio); 1185 return; 1186 default: 1187 ; 1188 /* PASSTHROUGH --- placate compiler */ 1189 } 1190 sendreq: 1191 ASSERT(zio->io_type == ZIO_TYPE_READ || 1192 zio->io_type == ZIO_TYPE_WRITE || 1193 zio->io_type == ZIO_TYPE_TRIM || 1194 zio->io_type == ZIO_TYPE_IOCTL); 1195 1196 cp = vd->vdev_tsd; 1197 if (cp == NULL) { 1198 zio->io_error = SET_ERROR(ENXIO); 1199 zio_interrupt(zio); 1200 return; 1201 } 1202 bp = g_alloc_bio(); 1203 bp->bio_caller1 = zio; 1204 switch (zio->io_type) { 1205 case ZIO_TYPE_READ: 1206 case ZIO_TYPE_WRITE: 1207 zio->io_target_timestamp = zio_handle_io_delay(zio); 1208 bp->bio_offset = zio->io_offset; 1209 bp->bio_length = zio->io_size; 1210 if (zio->io_type == ZIO_TYPE_READ) 1211 bp->bio_cmd = BIO_READ; 1212 else 1213 bp->bio_cmd = BIO_WRITE; 1214 1215 /* 1216 * If possible, represent scattered and/or gang ABD buffer to 1217 * GEOM as an array of physical pages. It allows to satisfy 1218 * requirement of virtually contiguous buffer without copying. 1219 */ 1220 int pgs = vdev_geom_check_unmapped(zio, cp); 1221 if (pgs > 0) { 1222 bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs, 1223 M_DEVBUF, M_WAITOK); 1224 bp->bio_ma_n = 0; 1225 bp->bio_ma_offset = 0; 1226 abd_iterate_func(zio->io_abd, 0, zio->io_size, 1227 vdev_geom_fill_unmap_cb, bp); 1228 bp->bio_data = unmapped_buf; 1229 bp->bio_flags |= BIO_UNMAPPED; 1230 } else { 1231 if (zio->io_type == ZIO_TYPE_READ) { 1232 bp->bio_data = abd_borrow_buf(zio->io_abd, 1233 zio->io_size); 1234 } else { 1235 bp->bio_data = abd_borrow_buf_copy(zio->io_abd, 1236 zio->io_size); 1237 } 1238 } 1239 break; 1240 case ZIO_TYPE_TRIM: 1241 bp->bio_cmd = BIO_DELETE; 1242 bp->bio_data = NULL; 1243 bp->bio_offset = zio->io_offset; 1244 bp->bio_length = zio->io_size; 1245 break; 1246 case ZIO_TYPE_IOCTL: 1247 bp->bio_cmd = BIO_FLUSH; 1248 bp->bio_data = NULL; 1249 bp->bio_offset = cp->provider->mediasize; 1250 bp->bio_length = 0; 1251 break; 1252 default: 1253 panic("invalid zio->io_type: %d\n", zio->io_type); 1254 } 1255 bp->bio_done = vdev_geom_io_intr; 1256 zio->io_bio = bp; 1257 1258 g_io_request(bp, cp); 1259 } 1260 1261 static void 1262 vdev_geom_io_done(zio_t *zio) 1263 { 1264 struct bio *bp = zio->io_bio; 1265 1266 if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { 1267 ASSERT3P(bp, ==, NULL); 1268 return; 1269 } 1270 1271 if (bp == NULL) { 1272 ASSERT3S(zio->io_error, ==, ENXIO); 1273 return; 1274 } 1275 1276 if (bp->bio_ma != NULL) { 1277 free(bp->bio_ma, M_DEVBUF); 1278 } else { 1279 if (zio->io_type == ZIO_TYPE_READ) { 1280 abd_return_buf_copy(zio->io_abd, bp->bio_data, 1281 zio->io_size); 1282 } else { 1283 abd_return_buf(zio->io_abd, bp->bio_data, 1284 zio->io_size); 1285 } 1286 } 1287 1288 g_destroy_bio(bp); 1289 zio->io_bio = NULL; 1290 } 1291 1292 static void 1293 vdev_geom_hold(vdev_t *vd) 1294 { 1295 } 1296 1297 static void 1298 vdev_geom_rele(vdev_t *vd) 1299 { 1300 } 1301 1302 vdev_ops_t vdev_disk_ops = { 1303 .vdev_op_init = NULL, 1304 .vdev_op_fini = NULL, 1305 .vdev_op_open = vdev_geom_open, 1306 .vdev_op_close = vdev_geom_close, 1307 .vdev_op_asize = vdev_default_asize, 1308 .vdev_op_min_asize = vdev_default_min_asize, 1309 .vdev_op_min_alloc = NULL, 1310 .vdev_op_io_start = vdev_geom_io_start, 1311 .vdev_op_io_done = vdev_geom_io_done, 1312 .vdev_op_state_change = NULL, 1313 .vdev_op_need_resilver = NULL, 1314 .vdev_op_hold = vdev_geom_hold, 1315 .vdev_op_rele = vdev_geom_rele, 1316 .vdev_op_remap = NULL, 1317 .vdev_op_xlate = vdev_default_xlate, 1318 .vdev_op_rebuild_asize = NULL, 1319 .vdev_op_metaslab_init = NULL, 1320 .vdev_op_config_generate = NULL, 1321 .vdev_op_nparity = NULL, 1322 .vdev_op_ndisks = NULL, 1323 .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 1324 .vdev_op_leaf = B_TRUE /* leaf vdev */ 1325 }; 1326