1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> 23 * All rights reserved. 24 * 25 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org> 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/param.h> 30 #include <sys/kernel.h> 31 #include <sys/bio.h> 32 #include <sys/buf.h> 33 #include <sys/file.h> 34 #include <sys/spa.h> 35 #include <sys/spa_impl.h> 36 #include <sys/vdev_impl.h> 37 #include <sys/vdev_os.h> 38 #include <sys/fs/zfs.h> 39 #include <sys/zio.h> 40 #include <vm/vm_page.h> 41 #include <geom/geom.h> 42 #include <geom/geom_disk.h> 43 #include <geom/geom_int.h> 44 45 #ifndef g_topology_locked 46 #define g_topology_locked() sx_xlocked(&topology_lock) 47 #endif 48 49 /* 50 * Virtual device vector for GEOM. 51 */ 52 53 static g_attrchanged_t vdev_geom_attrchanged; 54 struct g_class zfs_vdev_class = { 55 .name = "ZFS::VDEV", 56 .version = G_VERSION, 57 .attrchanged = vdev_geom_attrchanged, 58 }; 59 60 struct consumer_vdev_elem { 61 SLIST_ENTRY(consumer_vdev_elem) elems; 62 vdev_t *vd; 63 }; 64 65 SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); 66 _Static_assert( 67 sizeof (((struct g_consumer *)NULL)->private) == 68 sizeof (struct consumer_priv_t *), 69 "consumer_priv_t* can't be stored in g_consumer.private"); 70 71 DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); 72 73 SYSCTL_DECL(_vfs_zfs_vdev); 74 /* Don't send BIO_FLUSH. */ 75 static int vdev_geom_bio_flush_disable; 76 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, 77 &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); 78 /* Don't send BIO_DELETE. */ 79 static int vdev_geom_bio_delete_disable; 80 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, 81 &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); 82 83 /* Declare local functions */ 84 static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); 85 86 /* 87 * Thread local storage used to indicate when a thread is probing geoms 88 * for their guids. If NULL, this thread is not tasting geoms. If non NULL, 89 * it is looking for a replacement for the vdev_t* that is its value. 90 */ 91 uint_t zfs_geom_probe_vdev_key; 92 93 static void 94 vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, 95 boolean_t do_null_update) 96 { 97 boolean_t needs_update = B_FALSE; 98 char *physpath; 99 int error, physpath_len; 100 101 physpath_len = MAXPATHLEN; 102 physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); 103 error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); 104 if (error == 0) { 105 char *old_physpath; 106 107 /* g_topology lock ensures that vdev has not been closed */ 108 g_topology_assert(); 109 old_physpath = vd->vdev_physpath; 110 vd->vdev_physpath = spa_strdup(physpath); 111 112 if (old_physpath != NULL) { 113 needs_update = (strcmp(old_physpath, 114 vd->vdev_physpath) != 0); 115 spa_strfree(old_physpath); 116 } else 117 needs_update = do_null_update; 118 } 119 g_free(physpath); 120 121 /* 122 * If the physical path changed, update the config. 123 * Only request an update for previously unset physpaths if 124 * requested by the caller. 125 */ 126 if (needs_update) 127 spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); 128 129 } 130 131 static void 132 vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) 133 { 134 struct consumer_priv_t *priv; 135 struct consumer_vdev_elem *elem; 136 137 priv = (struct consumer_priv_t *)&cp->private; 138 if (SLIST_EMPTY(priv)) 139 return; 140 141 SLIST_FOREACH(elem, priv, elems) { 142 vdev_t *vd = elem->vd; 143 if (strcmp(attr, "GEOM::physpath") == 0) { 144 vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE); 145 return; 146 } 147 } 148 } 149 150 static void 151 vdev_geom_resize(struct g_consumer *cp) 152 { 153 struct consumer_priv_t *priv; 154 struct consumer_vdev_elem *elem; 155 spa_t *spa; 156 vdev_t *vd; 157 158 priv = (struct consumer_priv_t *)&cp->private; 159 if (SLIST_EMPTY(priv)) 160 return; 161 162 SLIST_FOREACH(elem, priv, elems) { 163 vd = elem->vd; 164 if (vd->vdev_state != VDEV_STATE_HEALTHY) 165 continue; 166 spa = vd->vdev_spa; 167 if (!spa->spa_autoexpand) 168 continue; 169 vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); 170 } 171 } 172 173 static void 174 vdev_geom_orphan(struct g_consumer *cp) 175 { 176 struct consumer_priv_t *priv; 177 // cppcheck-suppress uninitvar 178 struct consumer_vdev_elem *elem; 179 180 g_topology_assert(); 181 182 priv = (struct consumer_priv_t *)&cp->private; 183 if (SLIST_EMPTY(priv)) 184 /* Vdev close in progress. Ignore the event. */ 185 return; 186 187 /* 188 * Orphan callbacks occur from the GEOM event thread. 189 * Concurrent with this call, new I/O requests may be 190 * working their way through GEOM about to find out 191 * (only once executed by the g_down thread) that we've 192 * been orphaned from our disk provider. These I/Os 193 * must be retired before we can detach our consumer. 194 * This is most easily achieved by acquiring the 195 * SPA ZIO configuration lock as a writer, but doing 196 * so with the GEOM topology lock held would cause 197 * a lock order reversal. Instead, rely on the SPA's 198 * async removal support to invoke a close on this 199 * vdev once it is safe to do so. 200 */ 201 SLIST_FOREACH(elem, priv, elems) { 202 // cppcheck-suppress uninitvar 203 vdev_t *vd = elem->vd; 204 205 vd->vdev_remove_wanted = B_TRUE; 206 spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); 207 } 208 } 209 210 static struct g_consumer * 211 vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) 212 { 213 struct g_geom *gp; 214 struct g_consumer *cp; 215 int error; 216 217 g_topology_assert(); 218 219 ZFS_LOG(1, "Attaching to %s.", pp->name); 220 221 if (sanity) { 222 if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { 223 ZFS_LOG(1, "Failing attach of %s. " 224 "Incompatible sectorsize %d\n", 225 pp->name, pp->sectorsize); 226 return (NULL); 227 } else if (pp->mediasize < SPA_MINDEVSIZE) { 228 ZFS_LOG(1, "Failing attach of %s. " 229 "Incompatible mediasize %ju\n", 230 pp->name, pp->mediasize); 231 return (NULL); 232 } 233 } 234 235 /* Do we have geom already? No? Create one. */ 236 LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { 237 if (gp->flags & G_GEOM_WITHER) 238 continue; 239 if (strcmp(gp->name, "zfs::vdev") != 0) 240 continue; 241 break; 242 } 243 if (gp == NULL) { 244 gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); 245 gp->orphan = vdev_geom_orphan; 246 gp->attrchanged = vdev_geom_attrchanged; 247 gp->resize = vdev_geom_resize; 248 cp = g_new_consumer(gp); 249 error = g_attach(cp, pp); 250 if (error != 0) { 251 ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, 252 __LINE__, error); 253 vdev_geom_detach(cp, B_FALSE); 254 return (NULL); 255 } 256 error = g_access(cp, 1, 0, 1); 257 if (error != 0) { 258 ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, 259 __LINE__, error); 260 vdev_geom_detach(cp, B_FALSE); 261 return (NULL); 262 } 263 ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); 264 } else { 265 /* Check if we are already connected to this provider. */ 266 LIST_FOREACH(cp, &gp->consumer, consumer) { 267 if (cp->provider == pp) { 268 ZFS_LOG(1, "Found consumer for %s.", pp->name); 269 break; 270 } 271 } 272 if (cp == NULL) { 273 cp = g_new_consumer(gp); 274 error = g_attach(cp, pp); 275 if (error != 0) { 276 ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", 277 __func__, __LINE__, error); 278 vdev_geom_detach(cp, B_FALSE); 279 return (NULL); 280 } 281 error = g_access(cp, 1, 0, 1); 282 if (error != 0) { 283 ZFS_LOG(1, "%s(%d): g_access failed: %d\n", 284 __func__, __LINE__, error); 285 vdev_geom_detach(cp, B_FALSE); 286 return (NULL); 287 } 288 ZFS_LOG(1, "Created consumer for %s.", pp->name); 289 } else { 290 error = g_access(cp, 1, 0, 1); 291 if (error != 0) { 292 ZFS_LOG(1, "%s(%d): g_access failed: %d\n", 293 __func__, __LINE__, error); 294 return (NULL); 295 } 296 ZFS_LOG(1, "Used existing consumer for %s.", pp->name); 297 } 298 } 299 300 if (vd != NULL) 301 vd->vdev_tsd = cp; 302 303 cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; 304 return (cp); 305 } 306 307 static void 308 vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) 309 { 310 struct g_geom *gp; 311 312 g_topology_assert(); 313 314 ZFS_LOG(1, "Detaching from %s.", 315 cp->provider && cp->provider->name ? cp->provider->name : "NULL"); 316 317 gp = cp->geom; 318 if (open_for_read) 319 g_access(cp, -1, 0, -1); 320 /* Destroy consumer on last close. */ 321 if (cp->acr == 0 && cp->ace == 0) { 322 if (cp->acw > 0) 323 g_access(cp, 0, -cp->acw, 0); 324 if (cp->provider != NULL) { 325 ZFS_LOG(1, "Destroying consumer for %s.", 326 cp->provider->name ? cp->provider->name : "NULL"); 327 g_detach(cp); 328 } 329 g_destroy_consumer(cp); 330 } 331 /* Destroy geom if there are no consumers left. */ 332 if (LIST_EMPTY(&gp->consumer)) { 333 ZFS_LOG(1, "Destroyed geom %s.", gp->name); 334 g_wither_geom(gp, ENXIO); 335 } 336 } 337 338 static void 339 vdev_geom_close_locked(vdev_t *vd) 340 { 341 struct g_consumer *cp; 342 struct consumer_priv_t *priv; 343 struct consumer_vdev_elem *elem, *elem_temp; 344 345 g_topology_assert(); 346 347 cp = vd->vdev_tsd; 348 vd->vdev_delayed_close = B_FALSE; 349 if (cp == NULL) 350 return; 351 352 ZFS_LOG(1, "Closing access to %s.", cp->provider->name); 353 KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); 354 priv = (struct consumer_priv_t *)&cp->private; 355 vd->vdev_tsd = NULL; 356 SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { 357 if (elem->vd == vd) { 358 SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); 359 g_free(elem); 360 } 361 } 362 363 vdev_geom_detach(cp, B_TRUE); 364 } 365 366 /* 367 * Issue one or more bios to the vdev in parallel 368 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO 369 * operation is described by parallel entries from each array. There may be 370 * more bios actually issued than entries in the array 371 */ 372 static void 373 vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, 374 off_t *sizes, int *errors, int ncmds) 375 { 376 struct bio **bios; 377 uint8_t *p; 378 off_t off, maxio, s, end; 379 int i, n_bios, j; 380 size_t bios_size; 381 382 #if __FreeBSD_version > 1300130 383 maxio = maxphys - (maxphys % cp->provider->sectorsize); 384 #else 385 maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); 386 #endif 387 n_bios = 0; 388 389 /* How many bios are required for all commands ? */ 390 for (i = 0; i < ncmds; i++) 391 n_bios += (sizes[i] + maxio - 1) / maxio; 392 393 /* Allocate memory for the bios */ 394 bios_size = n_bios * sizeof (struct bio *); 395 bios = kmem_zalloc(bios_size, KM_SLEEP); 396 397 /* Prepare and issue all of the bios */ 398 for (i = j = 0; i < ncmds; i++) { 399 off = offsets[i]; 400 p = datas[i]; 401 s = sizes[i]; 402 end = off + s; 403 ASSERT0(off % cp->provider->sectorsize); 404 ASSERT0(s % cp->provider->sectorsize); 405 406 for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { 407 bios[j] = g_alloc_bio(); 408 bios[j]->bio_cmd = cmds[i]; 409 bios[j]->bio_done = NULL; 410 bios[j]->bio_offset = off; 411 bios[j]->bio_length = MIN(s, maxio); 412 bios[j]->bio_data = (caddr_t)p; 413 g_io_request(bios[j], cp); 414 } 415 } 416 ASSERT3S(j, ==, n_bios); 417 418 /* Wait for all of the bios to complete, and clean them up */ 419 for (i = j = 0; i < ncmds; i++) { 420 off = offsets[i]; 421 s = sizes[i]; 422 end = off + s; 423 424 for (; off < end; off += maxio, s -= maxio, j++) { 425 errors[i] = biowait(bios[j], "vdev_geom_io") || 426 errors[i]; 427 g_destroy_bio(bios[j]); 428 } 429 } 430 kmem_free(bios, bios_size); 431 } 432 433 /* 434 * Read the vdev config from a device. Return the number of valid labels that 435 * were found. The vdev config will be returned in config if and only if at 436 * least one valid label was found. 437 */ 438 static int 439 vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) 440 { 441 struct g_provider *pp; 442 nvlist_t *config; 443 vdev_phys_t *vdev_lists[VDEV_LABELS]; 444 char *buf; 445 size_t buflen; 446 uint64_t psize, state, txg; 447 off_t offsets[VDEV_LABELS]; 448 off_t size; 449 off_t sizes[VDEV_LABELS]; 450 int cmds[VDEV_LABELS]; 451 int errors[VDEV_LABELS]; 452 int l, nlabels; 453 454 g_topology_assert_not(); 455 456 pp = cp->provider; 457 ZFS_LOG(1, "Reading config from %s...", pp->name); 458 459 psize = pp->mediasize; 460 psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); 461 462 size = sizeof (*vdev_lists[0]) + pp->sectorsize - 463 ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; 464 465 buflen = sizeof (vdev_lists[0]->vp_nvlist); 466 467 /* Create all of the IO requests */ 468 for (l = 0; l < VDEV_LABELS; l++) { 469 cmds[l] = BIO_READ; 470 vdev_lists[l] = kmem_alloc(size, KM_SLEEP); 471 offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; 472 sizes[l] = size; 473 errors[l] = 0; 474 ASSERT0(offsets[l] % pp->sectorsize); 475 } 476 477 /* Issue the IO requests */ 478 vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, 479 VDEV_LABELS); 480 481 /* Parse the labels */ 482 config = *configp = NULL; 483 nlabels = 0; 484 for (l = 0; l < VDEV_LABELS; l++) { 485 if (errors[l] != 0) 486 continue; 487 488 buf = vdev_lists[l]->vp_nvlist; 489 490 if (nvlist_unpack(buf, buflen, &config, 0) != 0) 491 continue; 492 493 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, 494 &state) != 0 || state > POOL_STATE_L2CACHE) { 495 nvlist_free(config); 496 continue; 497 } 498 499 if (state != POOL_STATE_SPARE && 500 state != POOL_STATE_L2CACHE && 501 (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 502 &txg) != 0 || txg == 0)) { 503 nvlist_free(config); 504 continue; 505 } 506 507 if (*configp != NULL) 508 nvlist_free(*configp); 509 *configp = config; 510 nlabels++; 511 } 512 513 /* Free the label storage */ 514 for (l = 0; l < VDEV_LABELS; l++) 515 kmem_free(vdev_lists[l], size); 516 517 return (nlabels); 518 } 519 520 static void 521 resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) 522 { 523 nvlist_t **new_configs; 524 uint64_t i; 525 526 if (id < *count) 527 return; 528 new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *), 529 KM_SLEEP); 530 for (i = 0; i < *count; i++) 531 new_configs[i] = (*configs)[i]; 532 if (*configs != NULL) 533 kmem_free(*configs, *count * sizeof (void *)); 534 *configs = new_configs; 535 *count = id + 1; 536 } 537 538 static void 539 process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, 540 const char *name, uint64_t *known_pool_guid) 541 { 542 nvlist_t *vdev_tree; 543 uint64_t pool_guid; 544 uint64_t vdev_guid; 545 uint64_t id, txg, known_txg; 546 char *pname; 547 548 if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || 549 strcmp(pname, name) != 0) 550 goto ignore; 551 552 if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) 553 goto ignore; 554 555 if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) 556 goto ignore; 557 558 if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) 559 goto ignore; 560 561 if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) 562 goto ignore; 563 564 txg = fnvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG); 565 566 if (*known_pool_guid != 0) { 567 if (pool_guid != *known_pool_guid) 568 goto ignore; 569 } else 570 *known_pool_guid = pool_guid; 571 572 resize_configs(configs, count, id); 573 574 if ((*configs)[id] != NULL) { 575 known_txg = fnvlist_lookup_uint64((*configs)[id], 576 ZPOOL_CONFIG_POOL_TXG); 577 if (txg <= known_txg) 578 goto ignore; 579 nvlist_free((*configs)[id]); 580 } 581 582 (*configs)[id] = cfg; 583 return; 584 585 ignore: 586 nvlist_free(cfg); 587 } 588 589 int 590 vdev_geom_read_pool_label(const char *name, 591 nvlist_t ***configs, uint64_t *count) 592 { 593 struct g_class *mp; 594 struct g_geom *gp; 595 struct g_provider *pp; 596 struct g_consumer *zcp; 597 nvlist_t *vdev_cfg; 598 uint64_t pool_guid; 599 int nlabels; 600 601 DROP_GIANT(); 602 g_topology_lock(); 603 604 *configs = NULL; 605 *count = 0; 606 pool_guid = 0; 607 LIST_FOREACH(mp, &g_classes, class) { 608 if (mp == &zfs_vdev_class) 609 continue; 610 LIST_FOREACH(gp, &mp->geom, geom) { 611 if (gp->flags & G_GEOM_WITHER) 612 continue; 613 LIST_FOREACH(pp, &gp->provider, provider) { 614 if (pp->flags & G_PF_WITHER) 615 continue; 616 zcp = vdev_geom_attach(pp, NULL, B_TRUE); 617 if (zcp == NULL) 618 continue; 619 g_topology_unlock(); 620 nlabels = vdev_geom_read_config(zcp, &vdev_cfg); 621 g_topology_lock(); 622 vdev_geom_detach(zcp, B_TRUE); 623 if (nlabels == 0) 624 continue; 625 ZFS_LOG(1, "successfully read vdev config"); 626 627 process_vdev_config(configs, count, 628 vdev_cfg, name, &pool_guid); 629 } 630 } 631 } 632 g_topology_unlock(); 633 PICKUP_GIANT(); 634 635 return (*count > 0 ? 0 : ENOENT); 636 } 637 638 enum match { 639 NO_MATCH = 0, /* No matching labels found */ 640 TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */ 641 ZERO_MATCH = 1, /* Should never be returned */ 642 ONE_MATCH = 2, /* 1 label matching the vdev_guid */ 643 TWO_MATCH = 3, /* 2 label matching the vdev_guid */ 644 THREE_MATCH = 4, /* 3 label matching the vdev_guid */ 645 FULL_MATCH = 5 /* all labels match the vdev_guid */ 646 }; 647 648 static enum match 649 vdev_attach_ok(vdev_t *vd, struct g_provider *pp) 650 { 651 nvlist_t *config; 652 uint64_t pool_guid, top_guid, vdev_guid; 653 struct g_consumer *cp; 654 int nlabels; 655 656 cp = vdev_geom_attach(pp, NULL, B_TRUE); 657 if (cp == NULL) { 658 ZFS_LOG(1, "Unable to attach tasting instance to %s.", 659 pp->name); 660 return (NO_MATCH); 661 } 662 g_topology_unlock(); 663 nlabels = vdev_geom_read_config(cp, &config); 664 g_topology_lock(); 665 vdev_geom_detach(cp, B_TRUE); 666 if (nlabels == 0) { 667 ZFS_LOG(1, "Unable to read config from %s.", pp->name); 668 return (NO_MATCH); 669 } 670 671 pool_guid = 0; 672 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); 673 top_guid = 0; 674 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); 675 vdev_guid = 0; 676 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); 677 nvlist_free(config); 678 679 /* 680 * Check that the label's pool guid matches the desired guid. 681 * Inactive spares and L2ARCs do not have any pool guid in the label. 682 */ 683 if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { 684 ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", 685 pp->name, 686 (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); 687 return (NO_MATCH); 688 } 689 690 /* 691 * Check that the label's vdev guid matches the desired guid. 692 * The second condition handles possible race on vdev detach, when 693 * remaining vdev receives GUID of destroyed top level mirror vdev. 694 */ 695 if (vdev_guid == vd->vdev_guid) { 696 ZFS_LOG(1, "guids match for provider %s.", pp->name); 697 return (ZERO_MATCH + nlabels); 698 } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { 699 ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); 700 return (TOPGUID_MATCH); 701 } 702 ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", 703 pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); 704 return (NO_MATCH); 705 } 706 707 static struct g_consumer * 708 vdev_geom_attach_by_guids(vdev_t *vd) 709 { 710 struct g_class *mp; 711 struct g_geom *gp; 712 struct g_provider *pp, *best_pp; 713 struct g_consumer *cp; 714 const char *vdpath; 715 enum match match, best_match; 716 717 g_topology_assert(); 718 719 vdpath = vd->vdev_path + sizeof ("/dev/") - 1; 720 cp = NULL; 721 best_pp = NULL; 722 best_match = NO_MATCH; 723 LIST_FOREACH(mp, &g_classes, class) { 724 if (mp == &zfs_vdev_class) 725 continue; 726 LIST_FOREACH(gp, &mp->geom, geom) { 727 if (gp->flags & G_GEOM_WITHER) 728 continue; 729 LIST_FOREACH(pp, &gp->provider, provider) { 730 match = vdev_attach_ok(vd, pp); 731 if (match > best_match) { 732 best_match = match; 733 best_pp = pp; 734 } else if (match == best_match) { 735 if (strcmp(pp->name, vdpath) == 0) { 736 best_pp = pp; 737 } 738 } 739 if (match == FULL_MATCH) 740 goto out; 741 } 742 } 743 } 744 745 out: 746 if (best_pp) { 747 cp = vdev_geom_attach(best_pp, vd, B_TRUE); 748 if (cp == NULL) { 749 printf("ZFS WARNING: Unable to attach to %s.\n", 750 best_pp->name); 751 } 752 } 753 return (cp); 754 } 755 756 static struct g_consumer * 757 vdev_geom_open_by_guids(vdev_t *vd) 758 { 759 struct g_consumer *cp; 760 char *buf; 761 size_t len; 762 763 g_topology_assert(); 764 765 ZFS_LOG(1, "Searching by guids [%ju:%ju].", 766 (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); 767 cp = vdev_geom_attach_by_guids(vd); 768 if (cp != NULL) { 769 len = strlen(cp->provider->name) + strlen("/dev/") + 1; 770 buf = kmem_alloc(len, KM_SLEEP); 771 772 snprintf(buf, len, "/dev/%s", cp->provider->name); 773 spa_strfree(vd->vdev_path); 774 vd->vdev_path = buf; 775 776 ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", 777 (uintmax_t)spa_guid(vd->vdev_spa), 778 (uintmax_t)vd->vdev_guid, cp->provider->name); 779 } else { 780 ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", 781 (uintmax_t)spa_guid(vd->vdev_spa), 782 (uintmax_t)vd->vdev_guid); 783 } 784 785 return (cp); 786 } 787 788 static struct g_consumer * 789 vdev_geom_open_by_path(vdev_t *vd, int check_guid) 790 { 791 struct g_provider *pp; 792 struct g_consumer *cp; 793 794 g_topology_assert(); 795 796 cp = NULL; 797 pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1); 798 if (pp != NULL) { 799 ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); 800 if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) 801 cp = vdev_geom_attach(pp, vd, B_FALSE); 802 } 803 804 return (cp); 805 } 806 807 static int 808 vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 809 uint64_t *logical_ashift, uint64_t *physical_ashift) 810 { 811 struct g_provider *pp; 812 struct g_consumer *cp; 813 int error, has_trim; 814 uint16_t rate; 815 816 /* 817 * Set the TLS to indicate downstack that we 818 * should not access zvols 819 */ 820 VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd)); 821 822 /* 823 * We must have a pathname, and it must be absolute. 824 */ 825 if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { 826 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 827 return (EINVAL); 828 } 829 830 /* 831 * Reopen the device if it's not currently open. Otherwise, 832 * just update the physical size of the device. 833 */ 834 if ((cp = vd->vdev_tsd) != NULL) { 835 ASSERT(vd->vdev_reopening); 836 goto skip_open; 837 } 838 839 DROP_GIANT(); 840 g_topology_lock(); 841 error = 0; 842 843 if (vd->vdev_spa->spa_is_splitting || 844 ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN && 845 (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || 846 vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) { 847 /* 848 * We are dealing with a vdev that hasn't been previously 849 * opened (since boot), and we are not loading an 850 * existing pool configuration. This looks like a 851 * vdev add operation to a new or existing pool. 852 * Assume the user really wants to do this, and find 853 * GEOM provider by its name, ignoring GUID mismatches. 854 * 855 * XXPOLICY: It would be safer to only allow a device 856 * that is unlabeled or labeled but missing 857 * GUID information to be opened in this fashion, 858 * unless we are doing a split, in which case we 859 * should allow any guid. 860 */ 861 cp = vdev_geom_open_by_path(vd, 0); 862 } else { 863 /* 864 * Try using the recorded path for this device, but only 865 * accept it if its label data contains the expected GUIDs. 866 */ 867 cp = vdev_geom_open_by_path(vd, 1); 868 if (cp == NULL) { 869 /* 870 * The device at vd->vdev_path doesn't have the 871 * expected GUIDs. The disks might have merely 872 * moved around so try all other GEOM providers 873 * to find one with the right GUIDs. 874 */ 875 cp = vdev_geom_open_by_guids(vd); 876 } 877 } 878 879 /* Clear the TLS now that tasting is done */ 880 VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL)); 881 882 if (cp == NULL) { 883 ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); 884 error = ENOENT; 885 } else { 886 struct consumer_priv_t *priv; 887 struct consumer_vdev_elem *elem; 888 int spamode; 889 890 priv = (struct consumer_priv_t *)&cp->private; 891 if (cp->private == NULL) 892 SLIST_INIT(priv); 893 elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO); 894 elem->vd = vd; 895 SLIST_INSERT_HEAD(priv, elem, elems); 896 897 spamode = spa_mode(vd->vdev_spa); 898 if (cp->provider->sectorsize > VDEV_PAD_SIZE || 899 !ISP2(cp->provider->sectorsize)) { 900 ZFS_LOG(1, "Provider %s has unsupported sectorsize.", 901 cp->provider->name); 902 903 vdev_geom_close_locked(vd); 904 error = EINVAL; 905 cp = NULL; 906 } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { 907 int i; 908 909 for (i = 0; i < 5; i++) { 910 error = g_access(cp, 0, 1, 0); 911 if (error == 0) 912 break; 913 g_topology_unlock(); 914 tsleep(vd, 0, "vdev", hz / 2); 915 g_topology_lock(); 916 } 917 if (error != 0) { 918 printf("ZFS WARNING: Unable to open %s for " 919 "writing (error=%d).\n", 920 cp->provider->name, error); 921 vdev_geom_close_locked(vd); 922 cp = NULL; 923 } 924 } 925 } 926 927 /* Fetch initial physical path information for this device. */ 928 if (cp != NULL) { 929 vdev_geom_attrchanged(cp, "GEOM::physpath"); 930 931 /* Set other GEOM characteristics */ 932 vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE); 933 } 934 935 g_topology_unlock(); 936 PICKUP_GIANT(); 937 if (cp == NULL) { 938 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 939 vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", 940 error); 941 return (error); 942 } 943 skip_open: 944 pp = cp->provider; 945 946 /* 947 * Determine the actual size of the device. 948 */ 949 *max_psize = *psize = pp->mediasize; 950 951 /* 952 * Determine the device's minimum transfer size and preferred 953 * transfer size. 954 */ 955 *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; 956 *physical_ashift = 0; 957 if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && 958 ISP2(pp->stripesize) && pp->stripeoffset == 0) 959 *physical_ashift = highbit(pp->stripesize) - 1; 960 961 /* 962 * Clear the nowritecache settings, so that on a vdev_reopen() 963 * we will try again. 964 */ 965 vd->vdev_nowritecache = B_FALSE; 966 967 /* Inform the ZIO pipeline that we are non-rotational. */ 968 error = g_getattr("GEOM::rotation_rate", cp, &rate); 969 if (error == 0 && rate == DISK_RR_NON_ROTATING) 970 vd->vdev_nonrot = B_TRUE; 971 else 972 vd->vdev_nonrot = B_FALSE; 973 974 /* Set when device reports it supports TRIM. */ 975 error = g_getattr("GEOM::candelete", cp, &has_trim); 976 vd->vdev_has_trim = (error == 0 && has_trim); 977 978 /* Set when device reports it supports secure TRIM. */ 979 /* unavailable on FreeBSD */ 980 vd->vdev_has_securetrim = B_FALSE; 981 982 return (0); 983 } 984 985 static void 986 vdev_geom_close(vdev_t *vd) 987 { 988 struct g_consumer *cp; 989 boolean_t locked; 990 991 cp = vd->vdev_tsd; 992 993 DROP_GIANT(); 994 locked = g_topology_locked(); 995 if (!locked) 996 g_topology_lock(); 997 998 if (!vd->vdev_reopening || 999 (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || 1000 (cp->provider != NULL && cp->provider->error != 0)))) 1001 vdev_geom_close_locked(vd); 1002 1003 if (!locked) 1004 g_topology_unlock(); 1005 PICKUP_GIANT(); 1006 } 1007 1008 static void 1009 vdev_geom_io_intr(struct bio *bp) 1010 { 1011 vdev_t *vd; 1012 zio_t *zio; 1013 1014 zio = bp->bio_caller1; 1015 vd = zio->io_vd; 1016 zio->io_error = bp->bio_error; 1017 if (zio->io_error == 0 && bp->bio_resid != 0) 1018 zio->io_error = SET_ERROR(EIO); 1019 1020 switch (zio->io_error) { 1021 case ENOTSUP: 1022 /* 1023 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know 1024 * that future attempts will never succeed. In this case 1025 * we set a persistent flag so that we don't bother with 1026 * requests in the future. 1027 */ 1028 switch (bp->bio_cmd) { 1029 case BIO_FLUSH: 1030 vd->vdev_nowritecache = B_TRUE; 1031 break; 1032 case BIO_DELETE: 1033 break; 1034 } 1035 break; 1036 case ENXIO: 1037 if (!vd->vdev_remove_wanted) { 1038 /* 1039 * If provider's error is set we assume it is being 1040 * removed. 1041 */ 1042 if (bp->bio_to->error != 0) { 1043 vd->vdev_remove_wanted = B_TRUE; 1044 spa_async_request(zio->io_spa, 1045 SPA_ASYNC_REMOVE); 1046 } else if (!vd->vdev_delayed_close) { 1047 vd->vdev_delayed_close = B_TRUE; 1048 } 1049 } 1050 break; 1051 } 1052 1053 /* 1054 * We have to split bio freeing into two parts, because the ABD code 1055 * cannot be called in this context and vdev_op_io_done is not called 1056 * for ZIO_TYPE_IOCTL zio-s. 1057 */ 1058 if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { 1059 g_destroy_bio(bp); 1060 zio->io_bio = NULL; 1061 } 1062 zio_delay_interrupt(zio); 1063 } 1064 1065 struct vdev_geom_check_unmapped_cb_state { 1066 int pages; 1067 uint_t end; 1068 }; 1069 1070 /* 1071 * Callback to check the ABD segment size/alignment and count the pages. 1072 * GEOM requires data buffer to look virtually contiguous. It means only 1073 * the first page of the buffer may not start and only the last may not 1074 * end on a page boundary. All other physical pages must be full. 1075 */ 1076 static int 1077 vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv) 1078 { 1079 struct vdev_geom_check_unmapped_cb_state *s = priv; 1080 vm_offset_t off = (vm_offset_t)buf & PAGE_MASK; 1081 1082 if (s->pages != 0 && off != 0) 1083 return (1); 1084 if (s->end != 0) 1085 return (1); 1086 s->end = (off + len) & PAGE_MASK; 1087 s->pages += (off + len + PAGE_MASK) >> PAGE_SHIFT; 1088 return (0); 1089 } 1090 1091 /* 1092 * Check whether we can use unmapped I/O for this ZIO on this device to 1093 * avoid data copying between scattered and/or gang ABD buffer and linear. 1094 */ 1095 static int 1096 vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp) 1097 { 1098 struct vdev_geom_check_unmapped_cb_state s; 1099 1100 /* If unmapped I/O is administratively disabled, respect that. */ 1101 if (!unmapped_buf_allowed) 1102 return (0); 1103 1104 /* If the buffer is already linear, then nothing to do here. */ 1105 if (abd_is_linear(zio->io_abd)) 1106 return (0); 1107 1108 /* 1109 * If unmapped I/O is not supported by the GEOM provider, 1110 * then we can't do anything and have to copy the data. 1111 */ 1112 if ((cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0) 1113 return (0); 1114 1115 /* Check the buffer chunks sizes/alignments and count pages. */ 1116 s.pages = s.end = 0; 1117 if (abd_iterate_func(zio->io_abd, 0, zio->io_size, 1118 vdev_geom_check_unmapped_cb, &s)) 1119 return (0); 1120 return (s.pages); 1121 } 1122 1123 /* 1124 * Callback to translate the ABD segment into array of physical pages. 1125 */ 1126 static int 1127 vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv) 1128 { 1129 struct bio *bp = priv; 1130 vm_offset_t addr = (vm_offset_t)buf; 1131 vm_offset_t end = addr + len; 1132 1133 if (bp->bio_ma_n == 0) { 1134 bp->bio_ma_offset = addr & PAGE_MASK; 1135 addr &= ~PAGE_MASK; 1136 } else { 1137 ASSERT0(P2PHASE(addr, PAGE_SIZE)); 1138 } 1139 do { 1140 bp->bio_ma[bp->bio_ma_n++] = 1141 PHYS_TO_VM_PAGE(pmap_kextract(addr)); 1142 addr += PAGE_SIZE; 1143 } while (addr < end); 1144 return (0); 1145 } 1146 1147 static void 1148 vdev_geom_io_start(zio_t *zio) 1149 { 1150 vdev_t *vd; 1151 struct g_consumer *cp; 1152 struct bio *bp; 1153 1154 vd = zio->io_vd; 1155 1156 switch (zio->io_type) { 1157 case ZIO_TYPE_IOCTL: 1158 /* XXPOLICY */ 1159 if (!vdev_readable(vd)) { 1160 zio->io_error = SET_ERROR(ENXIO); 1161 zio_interrupt(zio); 1162 return; 1163 } else { 1164 switch (zio->io_cmd) { 1165 case DKIOCFLUSHWRITECACHE: 1166 if (zfs_nocacheflush || 1167 vdev_geom_bio_flush_disable) 1168 break; 1169 if (vd->vdev_nowritecache) { 1170 zio->io_error = SET_ERROR(ENOTSUP); 1171 break; 1172 } 1173 goto sendreq; 1174 default: 1175 zio->io_error = SET_ERROR(ENOTSUP); 1176 } 1177 } 1178 1179 zio_execute(zio); 1180 return; 1181 case ZIO_TYPE_TRIM: 1182 if (!vdev_geom_bio_delete_disable) { 1183 goto sendreq; 1184 } 1185 zio_execute(zio); 1186 return; 1187 default: 1188 ; 1189 /* PASSTHROUGH --- placate compiler */ 1190 } 1191 sendreq: 1192 ASSERT(zio->io_type == ZIO_TYPE_READ || 1193 zio->io_type == ZIO_TYPE_WRITE || 1194 zio->io_type == ZIO_TYPE_TRIM || 1195 zio->io_type == ZIO_TYPE_IOCTL); 1196 1197 cp = vd->vdev_tsd; 1198 if (cp == NULL) { 1199 zio->io_error = SET_ERROR(ENXIO); 1200 zio_interrupt(zio); 1201 return; 1202 } 1203 bp = g_alloc_bio(); 1204 bp->bio_caller1 = zio; 1205 switch (zio->io_type) { 1206 case ZIO_TYPE_READ: 1207 case ZIO_TYPE_WRITE: 1208 zio->io_target_timestamp = zio_handle_io_delay(zio); 1209 bp->bio_offset = zio->io_offset; 1210 bp->bio_length = zio->io_size; 1211 if (zio->io_type == ZIO_TYPE_READ) 1212 bp->bio_cmd = BIO_READ; 1213 else 1214 bp->bio_cmd = BIO_WRITE; 1215 1216 /* 1217 * If possible, represent scattered and/or gang ABD buffer to 1218 * GEOM as an array of physical pages. It allows to satisfy 1219 * requirement of virtually contiguous buffer without copying. 1220 */ 1221 int pgs = vdev_geom_check_unmapped(zio, cp); 1222 if (pgs > 0) { 1223 bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs, 1224 M_DEVBUF, M_WAITOK); 1225 bp->bio_ma_n = 0; 1226 bp->bio_ma_offset = 0; 1227 abd_iterate_func(zio->io_abd, 0, zio->io_size, 1228 vdev_geom_fill_unmap_cb, bp); 1229 bp->bio_data = unmapped_buf; 1230 bp->bio_flags |= BIO_UNMAPPED; 1231 } else { 1232 if (zio->io_type == ZIO_TYPE_READ) { 1233 bp->bio_data = abd_borrow_buf(zio->io_abd, 1234 zio->io_size); 1235 } else { 1236 bp->bio_data = abd_borrow_buf_copy(zio->io_abd, 1237 zio->io_size); 1238 } 1239 } 1240 break; 1241 case ZIO_TYPE_TRIM: 1242 bp->bio_cmd = BIO_DELETE; 1243 bp->bio_data = NULL; 1244 bp->bio_offset = zio->io_offset; 1245 bp->bio_length = zio->io_size; 1246 break; 1247 case ZIO_TYPE_IOCTL: 1248 bp->bio_cmd = BIO_FLUSH; 1249 bp->bio_data = NULL; 1250 bp->bio_offset = cp->provider->mediasize; 1251 bp->bio_length = 0; 1252 break; 1253 default: 1254 panic("invalid zio->io_type: %d\n", zio->io_type); 1255 } 1256 bp->bio_done = vdev_geom_io_intr; 1257 zio->io_bio = bp; 1258 1259 g_io_request(bp, cp); 1260 } 1261 1262 static void 1263 vdev_geom_io_done(zio_t *zio) 1264 { 1265 struct bio *bp = zio->io_bio; 1266 1267 if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { 1268 ASSERT3P(bp, ==, NULL); 1269 return; 1270 } 1271 1272 if (bp == NULL) { 1273 ASSERT3S(zio->io_error, ==, ENXIO); 1274 return; 1275 } 1276 1277 if (bp->bio_ma != NULL) { 1278 free(bp->bio_ma, M_DEVBUF); 1279 } else { 1280 if (zio->io_type == ZIO_TYPE_READ) { 1281 abd_return_buf_copy(zio->io_abd, bp->bio_data, 1282 zio->io_size); 1283 } else { 1284 abd_return_buf(zio->io_abd, bp->bio_data, 1285 zio->io_size); 1286 } 1287 } 1288 1289 g_destroy_bio(bp); 1290 zio->io_bio = NULL; 1291 } 1292 1293 static void 1294 vdev_geom_hold(vdev_t *vd) 1295 { 1296 } 1297 1298 static void 1299 vdev_geom_rele(vdev_t *vd) 1300 { 1301 } 1302 1303 vdev_ops_t vdev_disk_ops = { 1304 .vdev_op_init = NULL, 1305 .vdev_op_fini = NULL, 1306 .vdev_op_open = vdev_geom_open, 1307 .vdev_op_close = vdev_geom_close, 1308 .vdev_op_asize = vdev_default_asize, 1309 .vdev_op_min_asize = vdev_default_min_asize, 1310 .vdev_op_min_alloc = NULL, 1311 .vdev_op_io_start = vdev_geom_io_start, 1312 .vdev_op_io_done = vdev_geom_io_done, 1313 .vdev_op_state_change = NULL, 1314 .vdev_op_need_resilver = NULL, 1315 .vdev_op_hold = vdev_geom_hold, 1316 .vdev_op_rele = vdev_geom_rele, 1317 .vdev_op_remap = NULL, 1318 .vdev_op_xlate = vdev_default_xlate, 1319 .vdev_op_rebuild_asize = NULL, 1320 .vdev_op_metaslab_init = NULL, 1321 .vdev_op_config_generate = NULL, 1322 .vdev_op_nparity = NULL, 1323 .vdev_op_ndisks = NULL, 1324 .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 1325 .vdev_op_leaf = B_TRUE /* leaf vdev */ 1326 }; 1327