1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 31 * Copyright 2018 Joyent, Inc. 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 33 * Copyright 2017 Joyent, Inc. 34 * Copyright (c) 2017, Intel Corporation. 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 36 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. 37 */ 38 39 /* 40 * SPA: Storage Pool Allocator 41 * 42 * This file contains all the routines used when modifying on-disk SPA state. 43 * This includes opening, importing, destroying, exporting a pool, and syncing a 44 * pool. 
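44  * pool.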
45 */ 46 47 #include <sys/zfs_context.h> 48 #include <sys/fm/fs/zfs.h> 49 #include <sys/spa_impl.h> 50 #include <sys/zio.h> 51 #include <sys/zio_checksum.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_tx.h> 54 #include <sys/zap.h> 55 #include <sys/zil.h> 56 #include <sys/brt.h> 57 #include <sys/ddt.h> 58 #include <sys/vdev_impl.h> 59 #include <sys/vdev_removal.h> 60 #include <sys/vdev_indirect_mapping.h> 61 #include <sys/vdev_indirect_births.h> 62 #include <sys/vdev_initialize.h> 63 #include <sys/vdev_rebuild.h> 64 #include <sys/vdev_trim.h> 65 #include <sys/vdev_disk.h> 66 #include <sys/vdev_raidz.h> 67 #include <sys/vdev_draid.h> 68 #include <sys/metaslab.h> 69 #include <sys/metaslab_impl.h> 70 #include <sys/mmp.h> 71 #include <sys/uberblock_impl.h> 72 #include <sys/txg.h> 73 #include <sys/avl.h> 74 #include <sys/bpobj.h> 75 #include <sys/dmu_traverse.h> 76 #include <sys/dmu_objset.h> 77 #include <sys/unique.h> 78 #include <sys/dsl_pool.h> 79 #include <sys/dsl_dataset.h> 80 #include <sys/dsl_dir.h> 81 #include <sys/dsl_prop.h> 82 #include <sys/dsl_synctask.h> 83 #include <sys/fs/zfs.h> 84 #include <sys/arc.h> 85 #include <sys/callb.h> 86 #include <sys/systeminfo.h> 87 #include <sys/zfs_ioctl.h> 88 #include <sys/dsl_scan.h> 89 #include <sys/zfeature.h> 90 #include <sys/dsl_destroy.h> 91 #include <sys/zvol.h> 92 93 #ifdef _KERNEL 94 #include <sys/fm/protocol.h> 95 #include <sys/fm/util.h> 96 #include <sys/callb.h> 97 #include <sys/zone.h> 98 #include <sys/vmsystm.h> 99 #endif /* _KERNEL */ 100 101 #include "zfs_prop.h" 102 #include "zfs_comutil.h" 103 #include <cityhash.h> 104 105 /* 106 * spa_thread() existed on Illumos as a parent thread for the various worker 107 * threads that actually run the pool, as a way to both reference the entire 108 * pool work as a single object, and to share properties like scheduling 109 * options. It has not yet been adapted to Linux or FreeBSD. This define is 110 * used to mark related parts of the code to make things easier for the reader, 111 * and to compile this code out. It can be removed when someone implements it, 112 * moves it to some Illumos-specific place, or removes it entirely. 113 */ 114 #undef HAVE_SPA_THREAD 115 116 /* 117 * The "System Duty Cycle" scheduling class is an Illumos feature to help 118 * prevent CPU-intensive kernel threads from affecting latency on interactive 119 * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is 120 * gated behind a define. On Illumos SDC depends on spa_thread(), but 121 * spa_thread() also has other uses, so this is a separate define. 122 */ 123 #undef HAVE_SYSDC 124 125 /* 126 * The interval, in seconds, at which failed configuration cache file writes 127 * should be retried. 128 */ 129 int zfs_ccw_retry_interval = 300; 130 131 typedef enum zti_modes { 132 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 133 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. 
*/ 134 ZTI_MODE_SYNC, /* sync thread assigned */ 135 ZTI_MODE_NULL, /* don't create a taskq */ 136 ZTI_NMODES 137 } zti_modes_t; 138 139 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 140 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 141 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 142 #define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } 143 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 144 145 #define ZTI_N(n) ZTI_P(n, 1) 146 #define ZTI_ONE ZTI_N(1) 147 148 typedef struct zio_taskq_info { 149 zti_modes_t zti_mode; 150 uint_t zti_value; 151 uint_t zti_count; 152 } zio_taskq_info_t; 153 154 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 155 "iss", "iss_h", "int", "int_h" 156 }; 157 158 /* 159 * This table defines the taskq settings for each ZFS I/O type. When 160 * initializing a pool, we use this table to create an appropriately sized 161 * taskq. Some operations are low volume and therefore have a small, static 162 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 163 * macros. Other operations process a large amount of data; the ZTI_SCALE 164 * macro causes us to create a taskq oriented for throughput. Some operations 165 * are so high frequency and short-lived that the taskq itself can become a 166 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 167 * additional degree of parallelism specified by the number of threads per- 168 * taskq and the number of taskqs; when dispatching an event in this case, the 169 * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs 170 * that scales with the number of CPUs. 171 * 172 * The different taskq priorities are to handle the different contexts (issue 173 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 174 * need to be handled with minimum delay. 175 */ 176 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 177 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 178 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 179 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 180 { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 181 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 182 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 183 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 184 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 185 }; 186 187 static void spa_sync_version(void *arg, dmu_tx_t *tx); 188 static void spa_sync_props(void *arg, dmu_tx_t *tx); 189 static boolean_t spa_has_active_shared_spare(spa_t *spa); 190 static int spa_load_impl(spa_t *spa, spa_import_type_t type, 191 const char **ereport); 192 static void spa_vdev_resilver_done(spa_t *spa); 193 194 /* 195 * Percentage of all CPUs that can be used by the metaslab preload taskq. 196 */ 197 static uint_t metaslab_preload_pct = 50; 198 199 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 200 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 201 202 #ifdef HAVE_SYSDC 203 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 204 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 205 #endif 206 207 #ifdef HAVE_SPA_THREAD 208 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 209 #endif 210 211 static uint_t zio_taskq_wr_iss_ncpus = 0; 212 213 /* 214 * Report any spa_load_verify errors found, but do not fail spa_load. 215 * This is used by zdb to analyze non-idle pools. 
216 */ 217 boolean_t spa_load_verify_dryrun = B_FALSE; 218 219 /* 220 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 221 * This is used by zdb for spacemaps verification. 222 */ 223 boolean_t spa_mode_readable_spacemaps = B_FALSE; 224 225 /* 226 * This (illegal) pool name is used when temporarily importing a spa_t in order 227 * to get the vdev stats associated with the imported devices. 228 */ 229 #define TRYIMPORT_NAME "$import" 230 231 /* 232 * For debugging purposes: print out vdev tree during pool import. 233 */ 234 static int spa_load_print_vdev_tree = B_FALSE; 235 236 /* 237 * A non-zero value for zfs_max_missing_tvds means that we allow importing 238 * pools with missing top-level vdevs. This is strictly intended for advanced 239 * pool recovery cases since missing data is almost inevitable. Pools with 240 * missing devices can only be imported read-only for safety reasons, and their 241 * fail-mode will be automatically set to "continue". 242 * 243 * With 1 missing vdev we should be able to import the pool and mount all 244 * datasets. User data that was not modified after the missing device has been 245 * added should be recoverable. This means that snapshots created prior to the 246 * addition of that device should be completely intact. 247 * 248 * With 2 missing vdevs, some datasets may fail to mount since there are 249 * dataset statistics that are stored as regular metadata. Some data might be 250 * recoverable if those vdevs were added recently. 251 * 252 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 253 * may be missing entirely. Chances of data recovery are very low. Note that 254 * there are also risks of performing an inadvertent rewind as we might be 255 * missing all the vdevs with the latest uberblocks. 256 */ 257 uint64_t zfs_max_missing_tvds = 0; 258 259 /* 260 * The parameters below are similar to zfs_max_missing_tvds but are only 261 * intended for a preliminary open of the pool with an untrusted config which 262 * might be incomplete or out-dated. 263 * 264 * We are more tolerant for pools opened from a cachefile since we could have 265 * an out-dated cachefile where a device removal was not registered. 266 * We could have set the limit arbitrarily high but in the case where devices 267 * are really missing we would want to return the proper error codes; we chose 268 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 269 * and we get a chance to retrieve the trusted config. 270 */ 271 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 272 273 /* 274 * In the case where config was assembled by scanning device paths (/dev/dsks 275 * by default) we are less tolerant since all the existing devices should have 276 * been detected and we want spa_load to return the right error codes. 277 */ 278 uint64_t zfs_max_missing_tvds_scan = 0; 279 280 /* 281 * Debugging aid that pauses spa_sync() towards the end. 282 */ 283 static const boolean_t zfs_pause_spa_sync = B_FALSE; 284 285 /* 286 * Variables to indicate the livelist condense zthr func should wait at certain 287 * points for the livelist to be removed - used to test condense/destroy races 288 */ 289 static int zfs_livelist_condense_zthr_pause = 0; 290 static int zfs_livelist_condense_sync_pause = 0; 291 292 /* 293 * Variables to track whether or not condense cancellation has been 294 * triggered in testing. 
295 */ 296 static int zfs_livelist_condense_sync_cancel = 0; 297 static int zfs_livelist_condense_zthr_cancel = 0; 298 299 /* 300 * Variable to track whether or not extra ALLOC blkptrs were added to a 301 * livelist entry while it was being condensed (caused by the way we track 302 * remapped blkptrs in dbuf_remap_impl) 303 */ 304 static int zfs_livelist_condense_new_alloc = 0; 305 306 /* 307 * ========================================================================== 308 * SPA properties routines 309 * ========================================================================== 310 */ 311 312 /* 313 * Add a (source=src, propname=propval) list to an nvlist. 314 */ 315 static void 316 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 317 uint64_t intval, zprop_source_t src) 318 { 319 const char *propname = zpool_prop_to_name(prop); 320 nvlist_t *propval; 321 322 propval = fnvlist_alloc(); 323 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 324 325 if (strval != NULL) 326 fnvlist_add_string(propval, ZPROP_VALUE, strval); 327 else 328 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 329 330 fnvlist_add_nvlist(nvl, propname, propval); 331 nvlist_free(propval); 332 } 333 334 /* 335 * Add a user property (source=src, propname=propval) to an nvlist. 336 */ 337 static void 338 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 339 zprop_source_t src) 340 { 341 nvlist_t *propval; 342 343 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 344 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 345 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 346 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 347 nvlist_free(propval); 348 } 349 350 /* 351 * Get property values from the spa configuration. 352 */ 353 static void 354 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 355 { 356 vdev_t *rvd = spa->spa_root_vdev; 357 dsl_pool_t *pool = spa->spa_dsl_pool; 358 uint64_t size, alloc, cap, version; 359 const zprop_source_t src = ZPROP_SRC_NONE; 360 spa_config_dirent_t *dp; 361 metaslab_class_t *mc = spa_normal_class(spa); 362 363 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 364 365 if (rvd != NULL) { 366 alloc = metaslab_class_get_alloc(mc); 367 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 368 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 369 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 370 371 size = metaslab_class_get_space(mc); 372 size += metaslab_class_get_space(spa_special_class(spa)); 373 size += metaslab_class_get_space(spa_dedup_class(spa)); 374 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 375 376 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 377 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 378 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 379 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 380 size - alloc, src); 381 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 382 spa->spa_checkpoint_info.sci_dspace, src); 383 384 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 385 metaslab_class_fragmentation(mc), src); 386 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 387 metaslab_class_expandable_space(mc), src); 388 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 389 (spa_mode(spa) == SPA_MODE_READ), src); 390 391 cap = (size == 0) ? 
0 : (alloc * 100 / size); 392 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 393 394 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 395 ddt_get_pool_dedup_ratio(spa), src); 396 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, 397 brt_get_used(spa), src); 398 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, 399 brt_get_saved(spa), src); 400 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, 401 brt_get_ratio(spa), src); 402 403 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 404 rvd->vdev_state, src); 405 406 version = spa_version(spa); 407 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 408 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 409 version, ZPROP_SRC_DEFAULT); 410 } else { 411 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 412 version, ZPROP_SRC_LOCAL); 413 } 414 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 415 NULL, spa_load_guid(spa), src); 416 } 417 418 if (pool != NULL) { 419 /* 420 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 421 * when opening pools before this version freedir will be NULL. 422 */ 423 if (pool->dp_free_dir != NULL) { 424 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 425 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 426 src); 427 } else { 428 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 429 NULL, 0, src); 430 } 431 432 if (pool->dp_leak_dir != NULL) { 433 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 434 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 435 src); 436 } else { 437 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 438 NULL, 0, src); 439 } 440 } 441 442 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 443 444 if (spa->spa_comment != NULL) { 445 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 446 0, ZPROP_SRC_LOCAL); 447 } 448 449 if (spa->spa_compatibility != NULL) { 450 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 451 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 452 } 453 454 if (spa->spa_root != NULL) 455 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 456 0, ZPROP_SRC_LOCAL); 457 458 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 459 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 460 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 461 } else { 462 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 463 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 464 } 465 466 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 467 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 468 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 469 } else { 470 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 471 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 472 } 473 474 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 475 if (dp->scd_path == NULL) { 476 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 477 "none", 0, ZPROP_SRC_LOCAL); 478 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 479 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 480 dp->scd_path, 0, ZPROP_SRC_LOCAL); 481 } 482 } 483 } 484 485 /* 486 * Get zpool property values. 487 */ 488 int 489 spa_prop_get(spa_t *spa, nvlist_t **nvp) 490 { 491 objset_t *mos = spa->spa_meta_objset; 492 zap_cursor_t zc; 493 zap_attribute_t za; 494 dsl_pool_t *dp; 495 int err; 496 497 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 498 if (err) 499 return (err); 500 501 dp = spa_get_dsl(spa); 502 dsl_pool_config_enter(dp, FTAG); 503 mutex_enter(&spa->spa_props_lock); 504 505 /* 506 * Get properties from the spa config. 
507 */ 508 spa_prop_get_config(spa, nvp); 509 510 /* If no pool property object, no more prop to get. */ 511 if (mos == NULL || spa->spa_pool_props_object == 0) 512 goto out; 513 514 /* 515 * Get properties from the MOS pool property object. 516 */ 517 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 518 (err = zap_cursor_retrieve(&zc, &za)) == 0; 519 zap_cursor_advance(&zc)) { 520 uint64_t intval = 0; 521 char *strval = NULL; 522 zprop_source_t src = ZPROP_SRC_DEFAULT; 523 zpool_prop_t prop; 524 525 if ((prop = zpool_name_to_prop(za.za_name)) == 526 ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) 527 continue; 528 529 switch (za.za_integer_length) { 530 case 8: 531 /* integer property */ 532 if (za.za_first_integer != 533 zpool_prop_default_numeric(prop)) 534 src = ZPROP_SRC_LOCAL; 535 536 if (prop == ZPOOL_PROP_BOOTFS) { 537 dsl_dataset_t *ds = NULL; 538 539 err = dsl_dataset_hold_obj(dp, 540 za.za_first_integer, FTAG, &ds); 541 if (err != 0) 542 break; 543 544 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 545 KM_SLEEP); 546 dsl_dataset_name(ds, strval); 547 dsl_dataset_rele(ds, FTAG); 548 } else { 549 strval = NULL; 550 intval = za.za_first_integer; 551 } 552 553 spa_prop_add_list(*nvp, prop, strval, intval, src); 554 555 if (strval != NULL) 556 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 557 558 break; 559 560 case 1: 561 /* string property */ 562 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 563 err = zap_lookup(mos, spa->spa_pool_props_object, 564 za.za_name, 1, za.za_num_integers, strval); 565 if (err) { 566 kmem_free(strval, za.za_num_integers); 567 break; 568 } 569 if (prop != ZPOOL_PROP_INVAL) { 570 spa_prop_add_list(*nvp, prop, strval, 0, src); 571 } else { 572 src = ZPROP_SRC_LOCAL; 573 spa_prop_add_user(*nvp, za.za_name, strval, 574 src); 575 } 576 kmem_free(strval, za.za_num_integers); 577 break; 578 579 default: 580 break; 581 } 582 } 583 zap_cursor_fini(&zc); 584 out: 585 mutex_exit(&spa->spa_props_lock); 586 dsl_pool_config_exit(dp, FTAG); 587 if (err && err != ENOENT) { 588 nvlist_free(*nvp); 589 *nvp = NULL; 590 return (err); 591 } 592 593 return (0); 594 } 595 596 /* 597 * Validate the given pool properties nvlist and modify the list 598 * for the property values to be set. 599 */ 600 static int 601 spa_prop_validate(spa_t *spa, nvlist_t *props) 602 { 603 nvpair_t *elem; 604 int error = 0, reset_bootfs = 0; 605 uint64_t objnum = 0; 606 boolean_t has_feature = B_FALSE; 607 608 elem = NULL; 609 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 610 uint64_t intval; 611 const char *strval, *slash, *check, *fname; 612 const char *propname = nvpair_name(elem); 613 zpool_prop_t prop = zpool_name_to_prop(propname); 614 615 switch (prop) { 616 case ZPOOL_PROP_INVAL: 617 /* 618 * Sanitize the input. 
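 * (A summary of the checks below: user property names and values are
 * length-checked; feature properties must be uint64 values of zero that
 * name a known feature; anything else is rejected with EINVAL.)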
619 */ 620 if (zfs_prop_user(propname)) { 621 if (strlen(propname) >= ZAP_MAXNAMELEN) { 622 error = SET_ERROR(ENAMETOOLONG); 623 break; 624 } 625 626 if (strlen(fnvpair_value_string(elem)) >= 627 ZAP_MAXVALUELEN) { 628 error = SET_ERROR(E2BIG); 629 break; 630 } 631 } else if (zpool_prop_feature(propname)) { 632 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 633 error = SET_ERROR(EINVAL); 634 break; 635 } 636 637 if (nvpair_value_uint64(elem, &intval) != 0) { 638 error = SET_ERROR(EINVAL); 639 break; 640 } 641 642 if (intval != 0) { 643 error = SET_ERROR(EINVAL); 644 break; 645 } 646 647 fname = strchr(propname, '@') + 1; 648 if (zfeature_lookup_name(fname, NULL) != 0) { 649 error = SET_ERROR(EINVAL); 650 break; 651 } 652 653 has_feature = B_TRUE; 654 } else { 655 error = SET_ERROR(EINVAL); 656 break; 657 } 658 break; 659 660 case ZPOOL_PROP_VERSION: 661 error = nvpair_value_uint64(elem, &intval); 662 if (!error && 663 (intval < spa_version(spa) || 664 intval > SPA_VERSION_BEFORE_FEATURES || 665 has_feature)) 666 error = SET_ERROR(EINVAL); 667 break; 668 669 case ZPOOL_PROP_DELEGATION: 670 case ZPOOL_PROP_AUTOREPLACE: 671 case ZPOOL_PROP_LISTSNAPS: 672 case ZPOOL_PROP_AUTOEXPAND: 673 case ZPOOL_PROP_AUTOTRIM: 674 error = nvpair_value_uint64(elem, &intval); 675 if (!error && intval > 1) 676 error = SET_ERROR(EINVAL); 677 break; 678 679 case ZPOOL_PROP_MULTIHOST: 680 error = nvpair_value_uint64(elem, &intval); 681 if (!error && intval > 1) 682 error = SET_ERROR(EINVAL); 683 684 if (!error) { 685 uint32_t hostid = zone_get_hostid(NULL); 686 if (hostid) 687 spa->spa_hostid = hostid; 688 else 689 error = SET_ERROR(ENOTSUP); 690 } 691 692 break; 693 694 case ZPOOL_PROP_BOOTFS: 695 /* 696 * If the pool version is less than SPA_VERSION_BOOTFS, 697 * or the pool is still being created (version == 0), 698 * the bootfs property cannot be set. 699 */ 700 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 701 error = SET_ERROR(ENOTSUP); 702 break; 703 } 704 705 /* 706 * Make sure the vdev config is bootable 707 */ 708 if (!vdev_is_bootable(spa->spa_root_vdev)) { 709 error = SET_ERROR(ENOTSUP); 710 break; 711 } 712 713 reset_bootfs = 1; 714 715 error = nvpair_value_string(elem, &strval); 716 717 if (!error) { 718 objset_t *os; 719 720 if (strval == NULL || strval[0] == '\0') { 721 objnum = zpool_prop_default_numeric( 722 ZPOOL_PROP_BOOTFS); 723 break; 724 } 725 726 error = dmu_objset_hold(strval, FTAG, &os); 727 if (error != 0) 728 break; 729 730 /* Must be ZPL. */ 731 if (dmu_objset_type(os) != DMU_OST_ZFS) { 732 error = SET_ERROR(ENOTSUP); 733 } else { 734 objnum = dmu_objset_id(os); 735 } 736 dmu_objset_rele(os, FTAG); 737 } 738 break; 739 740 case ZPOOL_PROP_FAILUREMODE: 741 error = nvpair_value_uint64(elem, &intval); 742 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 743 error = SET_ERROR(EINVAL); 744 745 /* 746 * This is a special case which only occurs when 747 * the pool has completely failed. This allows 748 * the user to change the in-core failmode property 749 * without syncing it out to disk (I/Os might 750 * currently be blocked). We do this by returning 751 * EIO to the caller (spa_prop_set) to trick it 752 * into thinking we encountered a property validation 753 * error. 
754 */ 755 if (!error && spa_suspended(spa)) { 756 spa->spa_failmode = intval; 757 error = SET_ERROR(EIO); 758 } 759 break; 760 761 case ZPOOL_PROP_CACHEFILE: 762 if ((error = nvpair_value_string(elem, &strval)) != 0) 763 break; 764 765 if (strval[0] == '\0') 766 break; 767 768 if (strcmp(strval, "none") == 0) 769 break; 770 771 if (strval[0] != '/') { 772 error = SET_ERROR(EINVAL); 773 break; 774 } 775 776 slash = strrchr(strval, '/'); 777 ASSERT(slash != NULL); 778 779 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 780 strcmp(slash, "/..") == 0) 781 error = SET_ERROR(EINVAL); 782 break; 783 784 case ZPOOL_PROP_COMMENT: 785 if ((error = nvpair_value_string(elem, &strval)) != 0) 786 break; 787 for (check = strval; *check != '\0'; check++) { 788 if (!isprint(*check)) { 789 error = SET_ERROR(EINVAL); 790 break; 791 } 792 } 793 if (strlen(strval) > ZPROP_MAX_COMMENT) 794 error = SET_ERROR(E2BIG); 795 break; 796 797 default: 798 break; 799 } 800 801 if (error) 802 break; 803 } 804 805 (void) nvlist_remove_all(props, 806 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 807 808 if (!error && reset_bootfs) { 809 error = nvlist_remove(props, 810 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 811 812 if (!error) { 813 error = nvlist_add_uint64(props, 814 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 815 } 816 } 817 818 return (error); 819 } 820 821 void 822 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 823 { 824 const char *cachefile; 825 spa_config_dirent_t *dp; 826 827 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 828 &cachefile) != 0) 829 return; 830 831 dp = kmem_alloc(sizeof (spa_config_dirent_t), 832 KM_SLEEP); 833 834 if (cachefile[0] == '\0') 835 dp->scd_path = spa_strdup(spa_config_path); 836 else if (strcmp(cachefile, "none") == 0) 837 dp->scd_path = NULL; 838 else 839 dp->scd_path = spa_strdup(cachefile); 840 841 list_insert_head(&spa->spa_config_list, dp); 842 if (need_sync) 843 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 844 } 845 846 int 847 spa_prop_set(spa_t *spa, nvlist_t *nvp) 848 { 849 int error; 850 nvpair_t *elem = NULL; 851 boolean_t need_sync = B_FALSE; 852 853 if ((error = spa_prop_validate(spa, nvp)) != 0) 854 return (error); 855 856 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 857 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 858 859 if (prop == ZPOOL_PROP_CACHEFILE || 860 prop == ZPOOL_PROP_ALTROOT || 861 prop == ZPOOL_PROP_READONLY) 862 continue; 863 864 if (prop == ZPOOL_PROP_INVAL && 865 zfs_prop_user(nvpair_name(elem))) { 866 need_sync = B_TRUE; 867 break; 868 } 869 870 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 871 uint64_t ver = 0; 872 873 if (prop == ZPOOL_PROP_VERSION) { 874 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 875 } else { 876 ASSERT(zpool_prop_feature(nvpair_name(elem))); 877 ver = SPA_VERSION_FEATURES; 878 need_sync = B_TRUE; 879 } 880 881 /* Save time if the version is already set. */ 882 if (ver == spa_version(spa)) 883 continue; 884 885 /* 886 * In addition to the pool directory object, we might 887 * create the pool properties object, the features for 888 * read object, the features for write object, or the 889 * feature descriptions object. 
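 * This is why 6 is passed to dsl_sync_task() below as the count of
 * blocks that might be modified.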
890 */ 891 error = dsl_sync_task(spa->spa_name, NULL, 892 spa_sync_version, &ver, 893 6, ZFS_SPACE_CHECK_RESERVED); 894 if (error) 895 return (error); 896 continue; 897 } 898 899 need_sync = B_TRUE; 900 break; 901 } 902 903 if (need_sync) { 904 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 905 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 906 } 907 908 return (0); 909 } 910 911 /* 912 * If the bootfs property value is dsobj, clear it. 913 */ 914 void 915 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 916 { 917 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 918 VERIFY(zap_remove(spa->spa_meta_objset, 919 spa->spa_pool_props_object, 920 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 921 spa->spa_bootfs = 0; 922 } 923 } 924 925 static int 926 spa_change_guid_check(void *arg, dmu_tx_t *tx) 927 { 928 uint64_t *newguid __maybe_unused = arg; 929 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 930 vdev_t *rvd = spa->spa_root_vdev; 931 uint64_t vdev_state; 932 933 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 934 int error = (spa_has_checkpoint(spa)) ? 935 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 936 return (SET_ERROR(error)); 937 } 938 939 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 940 vdev_state = rvd->vdev_state; 941 spa_config_exit(spa, SCL_STATE, FTAG); 942 943 if (vdev_state != VDEV_STATE_HEALTHY) 944 return (SET_ERROR(ENXIO)); 945 946 ASSERT3U(spa_guid(spa), !=, *newguid); 947 948 return (0); 949 } 950 951 static void 952 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 953 { 954 uint64_t *newguid = arg; 955 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 956 uint64_t oldguid; 957 vdev_t *rvd = spa->spa_root_vdev; 958 959 oldguid = spa_guid(spa); 960 961 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 962 rvd->vdev_guid = *newguid; 963 rvd->vdev_guid_sum += (*newguid - oldguid); 964 vdev_config_dirty(rvd); 965 spa_config_exit(spa, SCL_STATE, FTAG); 966 967 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 968 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 969 } 970 971 /* 972 * Change the GUID for the pool. This is done so that we can later 973 * re-import a pool built from a clone of our own vdevs. We will modify 974 * the root vdev's guid, our own pool guid, and then mark all of our 975 * vdevs dirty. Note that we must make sure that all our vdevs are 976 * online when we do this, or else any vdevs that weren't present 977 * would be orphaned from our pool. We are also going to issue a 978 * sysevent to update any watchers. 979 */ 980 int 981 spa_change_guid(spa_t *spa) 982 { 983 int error; 984 uint64_t guid; 985 986 mutex_enter(&spa->spa_vdev_top_lock); 987 mutex_enter(&spa_namespace_lock); 988 guid = spa_generate_guid(NULL); 989 990 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 991 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 992 993 if (error == 0) { 994 /* 995 * Clear the kobj flag from all the vdevs to allow 996 * vdev_cache_process_kobj_evt() to post events to all the 997 * vdevs since GUID is updated. 
998 */ 999 vdev_clear_kobj_evt(spa->spa_root_vdev); 1000 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 1001 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 1002 1003 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 1004 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 1005 } 1006 1007 mutex_exit(&spa_namespace_lock); 1008 mutex_exit(&spa->spa_vdev_top_lock); 1009 1010 return (error); 1011 } 1012 1013 /* 1014 * ========================================================================== 1015 * SPA state manipulation (open/create/destroy/import/export) 1016 * ========================================================================== 1017 */ 1018 1019 static int 1020 spa_error_entry_compare(const void *a, const void *b) 1021 { 1022 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 1023 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 1024 int ret; 1025 1026 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 1027 sizeof (zbookmark_phys_t)); 1028 1029 return (TREE_ISIGN(ret)); 1030 } 1031 1032 /* 1033 * Utility function which retrieves copies of the current logs and 1034 * re-initializes them in the process. 1035 */ 1036 void 1037 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1038 { 1039 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1040 1041 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1042 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1043 1044 avl_create(&spa->spa_errlist_scrub, 1045 spa_error_entry_compare, sizeof (spa_error_entry_t), 1046 offsetof(spa_error_entry_t, se_avl)); 1047 avl_create(&spa->spa_errlist_last, 1048 spa_error_entry_compare, sizeof (spa_error_entry_t), 1049 offsetof(spa_error_entry_t, se_avl)); 1050 } 1051 1052 static void 1053 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1054 { 1055 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1056 enum zti_modes mode = ztip->zti_mode; 1057 uint_t value = ztip->zti_value; 1058 uint_t count = ztip->zti_count; 1059 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1060 uint_t cpus, flags = TASKQ_DYNAMIC; 1061 1062 switch (mode) { 1063 case ZTI_MODE_FIXED: 1064 ASSERT3U(value, >, 0); 1065 break; 1066 1067 case ZTI_MODE_SYNC: 1068 1069 /* 1070 * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus', 1071 * not to exceed the number of spa allocators. 1072 */ 1073 if (zio_taskq_wr_iss_ncpus == 0) { 1074 count = MAX(boot_ncpus / spa->spa_alloc_count, 1); 1075 } else { 1076 count = MAX(1, 1077 boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus)); 1078 } 1079 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1080 count = MIN(count, spa->spa_alloc_count); 1081 1082 /* 1083 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no 1084 * single taskq may have more threads than 100% of online cpus. 1085 */ 1086 value = (zio_taskq_batch_pct + count / 2) / count; 1087 value = MIN(value, 100); 1088 flags |= TASKQ_THREADS_CPU_PCT; 1089 break; 1090 1091 case ZTI_MODE_SCALE: 1092 flags |= TASKQ_THREADS_CPU_PCT; 1093 /* 1094 * We want more taskqs to reduce lock contention, but we want 1095 * less for better request ordering and CPU utilization. 1096 */ 1097 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1098 if (zio_taskq_batch_tpq > 0) { 1099 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 1100 zio_taskq_batch_tpq); 1101 } else { 1102 /* 1103 * Prefer 6 threads per taskq, but no more taskqs 1104 * than threads in them on large systems. 
For 80%: 1105 * 1106 * taskq taskq total 1107 * cpus taskqs percent threads threads 1108 * ------- ------- ------- ------- ------- 1109 * 1 1 80% 1 1 1110 * 2 1 80% 1 1 1111 * 4 1 80% 3 3 1112 * 8 2 40% 3 6 1113 * 16 3 27% 4 12 1114 * 32 5 16% 5 25 1115 * 64 7 11% 7 49 1116 * 128 10 8% 10 100 1117 * 256 14 6% 15 210 1118 */ 1119 count = 1 + cpus / 6; 1120 while (count * count > cpus) 1121 count--; 1122 } 1123 /* Limit each taskq within 100% to not trigger assertion. */ 1124 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1125 value = (zio_taskq_batch_pct + count / 2) / count; 1126 break; 1127 1128 case ZTI_MODE_NULL: 1129 tqs->stqs_count = 0; 1130 tqs->stqs_taskq = NULL; 1131 return; 1132 1133 default: 1134 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1135 "spa_taskqs_init()", 1136 zio_type_name[t], zio_taskq_types[q], mode, value); 1137 break; 1138 } 1139 1140 ASSERT3U(count, >, 0); 1141 tqs->stqs_count = count; 1142 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1143 1144 for (uint_t i = 0; i < count; i++) { 1145 taskq_t *tq; 1146 char name[32]; 1147 1148 if (count > 1) 1149 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1150 zio_type_name[t], zio_taskq_types[q], i); 1151 else 1152 (void) snprintf(name, sizeof (name), "%s_%s", 1153 zio_type_name[t], zio_taskq_types[q]); 1154 1155 #ifdef HAVE_SYSDC 1156 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1157 (void) zio_taskq_basedc; 1158 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1159 spa->spa_proc, zio_taskq_basedc, flags); 1160 } else { 1161 #endif 1162 pri_t pri = maxclsyspri; 1163 /* 1164 * The write issue taskq can be extremely CPU 1165 * intensive. Run it at slightly less important 1166 * priority than the other taskqs. 1167 * 1168 * Under Linux and FreeBSD this means incrementing 1169 * the priority value as opposed to platforms like 1170 * illumos where it should be decremented. 1171 * 1172 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1173 * are equal then a difference between them is 1174 * insignificant. 1175 */ 1176 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1177 #if defined(__linux__) 1178 pri++; 1179 #elif defined(__FreeBSD__) 1180 pri += 4; 1181 #else 1182 #error "unknown OS" 1183 #endif 1184 } 1185 tq = taskq_create_proc(name, value, pri, 50, 1186 INT_MAX, spa->spa_proc, flags); 1187 #ifdef HAVE_SYSDC 1188 } 1189 #endif 1190 1191 tqs->stqs_taskq[i] = tq; 1192 } 1193 } 1194 1195 static void 1196 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1197 { 1198 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1199 1200 if (tqs->stqs_taskq == NULL) { 1201 ASSERT3U(tqs->stqs_count, ==, 0); 1202 return; 1203 } 1204 1205 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1206 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1207 taskq_destroy(tqs->stqs_taskq[i]); 1208 } 1209 1210 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1211 tqs->stqs_taskq = NULL; 1212 } 1213 1214 #ifdef _KERNEL 1215 /* 1216 * The READ and WRITE rows of zio_taskqs are configurable at module load time 1217 * by setting zio_taskq_read or zio_taskq_write. 1218 * 1219 * Example (the defaults for READ and WRITE) 1220 * zio_taskq_read='fixed,1,8 null scale null' 1221 * zio_taskq_write='sync fixed,1,5 scale fixed,1,5' 1222 * 1223 * Each sets the entire row at a time. 1224 * 1225 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number 1226 * of threads per taskq. 
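 * For example, a (hypothetical) setting of 'fixed,2,16' selects two taskqs
 * with sixteen threads each.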
1227 * 1228 * 'null' can only be set on the high-priority queues (queue selection for 1229 * high-priority queues will fall back to the regular queue if the high-pri 1230 * is NULL. 1231 */ 1232 static const char *const modes[ZTI_NMODES] = { 1233 "fixed", "scale", "sync", "null" 1234 }; 1235 1236 /* Parse the incoming config string. Modifies cfg */ 1237 static int 1238 spa_taskq_param_set(zio_type_t t, char *cfg) 1239 { 1240 int err = 0; 1241 1242 zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; 1243 1244 char *next = cfg, *tok, *c; 1245 1246 /* 1247 * Parse out each element from the string and fill `row`. The entire 1248 * row has to be set at once, so any errors are flagged by just 1249 * breaking out of this loop early. 1250 */ 1251 uint_t q; 1252 for (q = 0; q < ZIO_TASKQ_TYPES; q++) { 1253 /* `next` is the start of the config */ 1254 if (next == NULL) 1255 break; 1256 1257 /* Eat up leading space */ 1258 while (isspace(*next)) 1259 next++; 1260 if (*next == '\0') 1261 break; 1262 1263 /* Mode ends at space or end of string */ 1264 tok = next; 1265 next = strchr(tok, ' '); 1266 if (next != NULL) *next++ = '\0'; 1267 1268 /* Parameters start after a comma */ 1269 c = strchr(tok, ','); 1270 if (c != NULL) *c++ = '\0'; 1271 1272 /* Match mode string */ 1273 uint_t mode; 1274 for (mode = 0; mode < ZTI_NMODES; mode++) 1275 if (strcmp(tok, modes[mode]) == 0) 1276 break; 1277 if (mode == ZTI_NMODES) 1278 break; 1279 1280 /* Invalid canary */ 1281 row[q].zti_mode = ZTI_NMODES; 1282 1283 /* Per-mode setup */ 1284 switch (mode) { 1285 1286 /* 1287 * FIXED is parameterised: number of queues, and number of 1288 * threads per queue. 1289 */ 1290 case ZTI_MODE_FIXED: { 1291 /* No parameters? */ 1292 if (c == NULL || *c == '\0') 1293 break; 1294 1295 /* Find next parameter */ 1296 tok = c; 1297 c = strchr(tok, ','); 1298 if (c == NULL) 1299 break; 1300 1301 /* Take digits and convert */ 1302 unsigned long long nq; 1303 if (!(isdigit(*tok))) 1304 break; 1305 err = ddi_strtoull(tok, &tok, 10, &nq); 1306 /* Must succeed and also end at the next param sep */ 1307 if (err != 0 || tok != c) 1308 break; 1309 1310 /* Move past the comma */ 1311 tok++; 1312 /* Need another number */ 1313 if (!(isdigit(*tok))) 1314 break; 1315 /* Remember start to make sure we moved */ 1316 c = tok; 1317 1318 /* Take digits */ 1319 unsigned long long ntpq; 1320 err = ddi_strtoull(tok, &tok, 10, &ntpq); 1321 /* Must succeed, and moved forward */ 1322 if (err != 0 || tok == c || *tok != '\0') 1323 break; 1324 1325 /* 1326 * sanity; zero queues/threads make no sense, and 1327 * 16K is almost certainly more than anyone will ever 1328 * need and avoids silly numbers like UINT32_MAX 1329 */ 1330 if (nq == 0 || nq >= 16384 || 1331 ntpq == 0 || ntpq >= 16384) 1332 break; 1333 1334 const zio_taskq_info_t zti = ZTI_P(ntpq, nq); 1335 row[q] = zti; 1336 break; 1337 } 1338 1339 case ZTI_MODE_SCALE: { 1340 const zio_taskq_info_t zti = ZTI_SCALE; 1341 row[q] = zti; 1342 break; 1343 } 1344 1345 case ZTI_MODE_SYNC: { 1346 const zio_taskq_info_t zti = ZTI_SYNC; 1347 row[q] = zti; 1348 break; 1349 } 1350 1351 case ZTI_MODE_NULL: { 1352 /* 1353 * Can only null the high-priority queues; the general- 1354 * purpose ones have to exist. 
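			 * (That is, only the ZIO_TASKQ_ISSUE_HIGH and
			 * ZIO_TASKQ_INTERRUPT_HIGH rows may be set to 'null',
			 * as checked below.)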
1355 */ 1356 if (q != ZIO_TASKQ_ISSUE_HIGH && 1357 q != ZIO_TASKQ_INTERRUPT_HIGH) 1358 break; 1359 1360 const zio_taskq_info_t zti = ZTI_NULL; 1361 row[q] = zti; 1362 break; 1363 } 1364 1365 default: 1366 break; 1367 } 1368 1369 /* Ensure we set a mode */ 1370 if (row[q].zti_mode == ZTI_NMODES) 1371 break; 1372 } 1373 1374 /* Didn't get a full row, fail */ 1375 if (q < ZIO_TASKQ_TYPES) 1376 return (SET_ERROR(EINVAL)); 1377 1378 /* Eat trailing space */ 1379 if (next != NULL) 1380 while (isspace(*next)) 1381 next++; 1382 1383 /* If there's anything left over then fail */ 1384 if (next != NULL && *next != '\0') 1385 return (SET_ERROR(EINVAL)); 1386 1387 /* Success! Copy it into the real config */ 1388 for (q = 0; q < ZIO_TASKQ_TYPES; q++) 1389 zio_taskqs[t][q] = row[q]; 1390 1391 return (0); 1392 } 1393 1394 static int 1395 spa_taskq_param_get(zio_type_t t, char *buf) 1396 { 1397 int pos = 0; 1398 1399 /* Build paramater string from live config */ 1400 const char *sep = ""; 1401 for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { 1402 const zio_taskq_info_t *zti = &zio_taskqs[t][q]; 1403 if (zti->zti_mode == ZTI_MODE_FIXED) 1404 pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, 1405 modes[zti->zti_mode], zti->zti_count, 1406 zti->zti_value); 1407 else 1408 pos += sprintf(&buf[pos], "%s%s", sep, 1409 modes[zti->zti_mode]); 1410 sep = " "; 1411 } 1412 1413 buf[pos++] = '\n'; 1414 buf[pos] = '\0'; 1415 1416 return (pos); 1417 } 1418 1419 #ifdef __linux__ 1420 static int 1421 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) 1422 { 1423 char *cfg = kmem_strdup(val); 1424 int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); 1425 kmem_free(cfg, strlen(val)+1); 1426 return (-err); 1427 } 1428 static int 1429 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) 1430 { 1431 return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); 1432 } 1433 1434 static int 1435 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) 1436 { 1437 char *cfg = kmem_strdup(val); 1438 int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); 1439 kmem_free(cfg, strlen(val)+1); 1440 return (-err); 1441 } 1442 static int 1443 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) 1444 { 1445 return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); 1446 } 1447 #else 1448 #include <sys/sbuf.h> 1449 1450 /* 1451 * On FreeBSD load-time parameters can be set up before malloc() is available, 1452 * so we have to do all the parsing work on the stack. 
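 * SPA_TASKQ_PARAM_MAX below bounds the on-stack buffer used by the read and
 * write sysctl handlers for both the get and set paths.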
1453 */ 1454 #define SPA_TASKQ_PARAM_MAX (128) 1455 1456 static int 1457 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) 1458 { 1459 char buf[SPA_TASKQ_PARAM_MAX]; 1460 int err = 0; 1461 1462 if (req->newptr == NULL) { 1463 int len = spa_taskq_param_get(ZIO_TYPE_READ, buf); 1464 struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); 1465 sbuf_cpy(s, buf); 1466 err = sbuf_finish(s); 1467 sbuf_delete(s); 1468 return (err); 1469 } 1470 1471 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1472 if (err) 1473 return (err); 1474 return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); 1475 } 1476 1477 static int 1478 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) 1479 { 1480 char buf[SPA_TASKQ_PARAM_MAX]; 1481 int err = 0; 1482 1483 if (req->newptr == NULL) { 1484 int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf); 1485 struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); 1486 sbuf_cpy(s, buf); 1487 err = sbuf_finish(s); 1488 sbuf_delete(s); 1489 return (err); 1490 } 1491 1492 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1493 if (err) 1494 return (err); 1495 return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); 1496 } 1497 #endif 1498 #endif /* _KERNEL */ 1499 1500 /* 1501 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1502 * Note that a type may have multiple discrete taskqs to avoid lock contention 1503 * on the taskq itself. 1504 */ 1505 static taskq_t * 1506 spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1507 zio_t *zio) 1508 { 1509 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1510 taskq_t *tq; 1511 1512 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1513 ASSERT3U(tqs->stqs_count, !=, 0); 1514 1515 if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && 1516 (zio != NULL) && (zio->io_wr_iss_tq != NULL)) { 1517 /* dispatch to assigned write issue taskq */ 1518 tq = zio->io_wr_iss_tq; 1519 return (tq); 1520 } 1521 1522 if (tqs->stqs_count == 1) { 1523 tq = tqs->stqs_taskq[0]; 1524 } else { 1525 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1526 } 1527 return (tq); 1528 } 1529 1530 void 1531 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1532 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, 1533 zio_t *zio) 1534 { 1535 taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); 1536 taskq_dispatch_ent(tq, func, arg, flags, ent); 1537 } 1538 1539 /* 1540 * Same as spa_taskq_dispatch_ent() but block on the task until completion. 
1541 */ 1542 void 1543 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1544 task_func_t *func, void *arg, uint_t flags) 1545 { 1546 taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); 1547 taskqid_t id = taskq_dispatch(tq, func, arg, flags); 1548 if (id) 1549 taskq_wait_id(tq, id); 1550 } 1551 1552 static void 1553 spa_create_zio_taskqs(spa_t *spa) 1554 { 1555 for (int t = 0; t < ZIO_TYPES; t++) { 1556 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1557 spa_taskqs_init(spa, t, q); 1558 } 1559 } 1560 } 1561 1562 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1563 static void 1564 spa_thread(void *arg) 1565 { 1566 psetid_t zio_taskq_psrset_bind = PS_NONE; 1567 callb_cpr_t cprinfo; 1568 1569 spa_t *spa = arg; 1570 user_t *pu = PTOU(curproc); 1571 1572 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1573 spa->spa_name); 1574 1575 ASSERT(curproc != &p0); 1576 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1577 "zpool-%s", spa->spa_name); 1578 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1579 1580 /* bind this thread to the requested psrset */ 1581 if (zio_taskq_psrset_bind != PS_NONE) { 1582 pool_lock(); 1583 mutex_enter(&cpu_lock); 1584 mutex_enter(&pidlock); 1585 mutex_enter(&curproc->p_lock); 1586 1587 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1588 0, NULL, NULL) == 0) { 1589 curthread->t_bind_pset = zio_taskq_psrset_bind; 1590 } else { 1591 cmn_err(CE_WARN, 1592 "Couldn't bind process for zfs pool \"%s\" to " 1593 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1594 } 1595 1596 mutex_exit(&curproc->p_lock); 1597 mutex_exit(&pidlock); 1598 mutex_exit(&cpu_lock); 1599 pool_unlock(); 1600 } 1601 1602 #ifdef HAVE_SYSDC 1603 if (zio_taskq_sysdc) { 1604 sysdc_thread_enter(curthread, 100, 0); 1605 } 1606 #endif 1607 1608 spa->spa_proc = curproc; 1609 spa->spa_did = curthread->t_did; 1610 1611 spa_create_zio_taskqs(spa); 1612 1613 mutex_enter(&spa->spa_proc_lock); 1614 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1615 1616 spa->spa_proc_state = SPA_PROC_ACTIVE; 1617 cv_broadcast(&spa->spa_proc_cv); 1618 1619 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1620 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1621 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1622 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1623 1624 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1625 spa->spa_proc_state = SPA_PROC_GONE; 1626 spa->spa_proc = &p0; 1627 cv_broadcast(&spa->spa_proc_cv); 1628 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1629 1630 mutex_enter(&curproc->p_lock); 1631 lwp_exit(); 1632 } 1633 #endif 1634 1635 extern metaslab_ops_t *metaslab_allocator(spa_t *spa); 1636 1637 /* 1638 * Activate an uninitialized pool. 
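 * Among other state, this sets up the metaslab classes, a covering process
 * where supported, the per-txg root zios, the dirty lists and error trees,
 * and the pool's auxiliary taskqs (zvol minors, metaslab preload, prefetch,
 * upgrade).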
1639 */ 1640 static void 1641 spa_activate(spa_t *spa, spa_mode_t mode) 1642 { 1643 metaslab_ops_t *msp = metaslab_allocator(spa); 1644 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1645 1646 spa->spa_state = POOL_STATE_ACTIVE; 1647 spa->spa_mode = mode; 1648 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1649 1650 spa->spa_normal_class = metaslab_class_create(spa, msp); 1651 spa->spa_log_class = metaslab_class_create(spa, msp); 1652 spa->spa_embedded_log_class = metaslab_class_create(spa, msp); 1653 spa->spa_special_class = metaslab_class_create(spa, msp); 1654 spa->spa_dedup_class = metaslab_class_create(spa, msp); 1655 1656 /* Try to create a covering process */ 1657 mutex_enter(&spa->spa_proc_lock); 1658 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1659 ASSERT(spa->spa_proc == &p0); 1660 spa->spa_did = 0; 1661 1662 #ifdef HAVE_SPA_THREAD 1663 /* Only create a process if we're going to be around a while. */ 1664 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1665 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1666 NULL, 0) == 0) { 1667 spa->spa_proc_state = SPA_PROC_CREATED; 1668 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1669 cv_wait(&spa->spa_proc_cv, 1670 &spa->spa_proc_lock); 1671 } 1672 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1673 ASSERT(spa->spa_proc != &p0); 1674 ASSERT(spa->spa_did != 0); 1675 } else { 1676 #ifdef _KERNEL 1677 cmn_err(CE_WARN, 1678 "Couldn't create process for zfs pool \"%s\"\n", 1679 spa->spa_name); 1680 #endif 1681 } 1682 } 1683 #endif /* HAVE_SPA_THREAD */ 1684 mutex_exit(&spa->spa_proc_lock); 1685 1686 /* If we didn't create a process, we need to create our taskqs. */ 1687 if (spa->spa_proc == &p0) { 1688 spa_create_zio_taskqs(spa); 1689 } 1690 1691 for (size_t i = 0; i < TXG_SIZE; i++) { 1692 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1693 ZIO_FLAG_CANFAIL); 1694 } 1695 1696 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1697 offsetof(vdev_t, vdev_config_dirty_node)); 1698 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1699 offsetof(objset_t, os_evicting_node)); 1700 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1701 offsetof(vdev_t, vdev_state_dirty_node)); 1702 1703 txg_list_create(&spa->spa_vdev_txg_list, spa, 1704 offsetof(struct vdev, vdev_txg_node)); 1705 1706 avl_create(&spa->spa_errlist_scrub, 1707 spa_error_entry_compare, sizeof (spa_error_entry_t), 1708 offsetof(spa_error_entry_t, se_avl)); 1709 avl_create(&spa->spa_errlist_last, 1710 spa_error_entry_compare, sizeof (spa_error_entry_t), 1711 offsetof(spa_error_entry_t, se_avl)); 1712 avl_create(&spa->spa_errlist_healed, 1713 spa_error_entry_compare, sizeof (spa_error_entry_t), 1714 offsetof(spa_error_entry_t, se_avl)); 1715 1716 spa_activate_os(spa); 1717 1718 spa_keystore_init(&spa->spa_keystore); 1719 1720 /* 1721 * This taskq is used to perform zvol-minor-related tasks 1722 * asynchronously. This has several advantages, including easy 1723 * resolution of various deadlocks. 1724 * 1725 * The taskq must be single threaded to ensure tasks are always 1726 * processed in the order in which they were dispatched. 1727 * 1728 * A taskq per pool allows one to keep the pools independent. 1729 * This way if one pool is suspended, it will not impact another. 1730 * 1731 * The preferred location to dispatch a zvol minor task is a sync 1732 * task. In this context, there is easy access to the spa_t and minimal 1733 * error handling is required because the sync task must succeed. 
1734 */ 1735 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1736 1, INT_MAX, 0); 1737 1738 /* 1739 * The taskq to preload metaslabs. 1740 */ 1741 spa->spa_metaslab_taskq = taskq_create("z_metaslab", 1742 metaslab_preload_pct, maxclsyspri, 1, INT_MAX, 1743 TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1744 1745 /* 1746 * Taskq dedicated to prefetcher threads: this is used to prevent the 1747 * pool traverse code from monopolizing the global (and limited) 1748 * system_taskq by inappropriately scheduling long running tasks on it. 1749 */ 1750 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1751 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1752 1753 /* 1754 * The taskq to upgrade datasets in this pool. Currently used by 1755 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1756 */ 1757 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1758 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1759 } 1760 1761 /* 1762 * Opposite of spa_activate(). 1763 */ 1764 static void 1765 spa_deactivate(spa_t *spa) 1766 { 1767 ASSERT(spa->spa_sync_on == B_FALSE); 1768 ASSERT(spa->spa_dsl_pool == NULL); 1769 ASSERT(spa->spa_root_vdev == NULL); 1770 ASSERT(spa->spa_async_zio_root == NULL); 1771 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1772 1773 spa_evicting_os_wait(spa); 1774 1775 if (spa->spa_zvol_taskq) { 1776 taskq_destroy(spa->spa_zvol_taskq); 1777 spa->spa_zvol_taskq = NULL; 1778 } 1779 1780 if (spa->spa_metaslab_taskq) { 1781 taskq_destroy(spa->spa_metaslab_taskq); 1782 spa->spa_metaslab_taskq = NULL; 1783 } 1784 1785 if (spa->spa_prefetch_taskq) { 1786 taskq_destroy(spa->spa_prefetch_taskq); 1787 spa->spa_prefetch_taskq = NULL; 1788 } 1789 1790 if (spa->spa_upgrade_taskq) { 1791 taskq_destroy(spa->spa_upgrade_taskq); 1792 spa->spa_upgrade_taskq = NULL; 1793 } 1794 1795 txg_list_destroy(&spa->spa_vdev_txg_list); 1796 1797 list_destroy(&spa->spa_config_dirty_list); 1798 list_destroy(&spa->spa_evicting_os_list); 1799 list_destroy(&spa->spa_state_dirty_list); 1800 1801 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1802 1803 for (int t = 0; t < ZIO_TYPES; t++) { 1804 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1805 spa_taskqs_fini(spa, t, q); 1806 } 1807 } 1808 1809 for (size_t i = 0; i < TXG_SIZE; i++) { 1810 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1811 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1812 spa->spa_txg_zio[i] = NULL; 1813 } 1814 1815 metaslab_class_destroy(spa->spa_normal_class); 1816 spa->spa_normal_class = NULL; 1817 1818 metaslab_class_destroy(spa->spa_log_class); 1819 spa->spa_log_class = NULL; 1820 1821 metaslab_class_destroy(spa->spa_embedded_log_class); 1822 spa->spa_embedded_log_class = NULL; 1823 1824 metaslab_class_destroy(spa->spa_special_class); 1825 spa->spa_special_class = NULL; 1826 1827 metaslab_class_destroy(spa->spa_dedup_class); 1828 spa->spa_dedup_class = NULL; 1829 1830 /* 1831 * If this was part of an import or the open otherwise failed, we may 1832 * still have errors left in the queues. Empty them just in case. 
1833 */ 1834 spa_errlog_drain(spa); 1835 avl_destroy(&spa->spa_errlist_scrub); 1836 avl_destroy(&spa->spa_errlist_last); 1837 avl_destroy(&spa->spa_errlist_healed); 1838 1839 spa_keystore_fini(&spa->spa_keystore); 1840 1841 spa->spa_state = POOL_STATE_UNINITIALIZED; 1842 1843 mutex_enter(&spa->spa_proc_lock); 1844 if (spa->spa_proc_state != SPA_PROC_NONE) { 1845 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1846 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1847 cv_broadcast(&spa->spa_proc_cv); 1848 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1849 ASSERT(spa->spa_proc != &p0); 1850 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1851 } 1852 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1853 spa->spa_proc_state = SPA_PROC_NONE; 1854 } 1855 ASSERT(spa->spa_proc == &p0); 1856 mutex_exit(&spa->spa_proc_lock); 1857 1858 /* 1859 * We want to make sure spa_thread() has actually exited the ZFS 1860 * module, so that the module can't be unloaded out from underneath 1861 * it. 1862 */ 1863 if (spa->spa_did != 0) { 1864 thread_join(spa->spa_did); 1865 spa->spa_did = 0; 1866 } 1867 1868 spa_deactivate_os(spa); 1869 1870 } 1871 1872 /* 1873 * Verify a pool configuration, and construct the vdev tree appropriately. This 1874 * will create all the necessary vdevs in the appropriate layout, with each vdev 1875 * in the CLOSED state. This will prep the pool before open/creation/import. 1876 * All vdev validation is done by the vdev_alloc() routine. 1877 */ 1878 int 1879 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1880 uint_t id, int atype) 1881 { 1882 nvlist_t **child; 1883 uint_t children; 1884 int error; 1885 1886 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1887 return (error); 1888 1889 if ((*vdp)->vdev_ops->vdev_op_leaf) 1890 return (0); 1891 1892 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1893 &child, &children); 1894 1895 if (error == ENOENT) 1896 return (0); 1897 1898 if (error) { 1899 vdev_free(*vdp); 1900 *vdp = NULL; 1901 return (SET_ERROR(EINVAL)); 1902 } 1903 1904 for (int c = 0; c < children; c++) { 1905 vdev_t *vd; 1906 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1907 atype)) != 0) { 1908 vdev_free(*vdp); 1909 *vdp = NULL; 1910 return (error); 1911 } 1912 } 1913 1914 ASSERT(*vdp != NULL); 1915 1916 return (0); 1917 } 1918 1919 static boolean_t 1920 spa_should_flush_logs_on_unload(spa_t *spa) 1921 { 1922 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1923 return (B_FALSE); 1924 1925 if (!spa_writeable(spa)) 1926 return (B_FALSE); 1927 1928 if (!spa->spa_sync_on) 1929 return (B_FALSE); 1930 1931 if (spa_state(spa) != POOL_STATE_EXPORTED) 1932 return (B_FALSE); 1933 1934 if (zfs_keep_log_spacemaps_at_export) 1935 return (B_FALSE); 1936 1937 return (B_TRUE); 1938 } 1939 1940 /* 1941 * Opens a transaction that will set the flag that will instruct 1942 * spa_sync to attempt to flush all the metaslabs for that txg. 
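 * The transaction's txg is recorded in spa_log_flushall_txg, and we wait
 * for that txg to sync before returning.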
1943 */ 1944 static void 1945 spa_unload_log_sm_flush_all(spa_t *spa) 1946 { 1947 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1948 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1949 1950 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1951 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1952 1953 dmu_tx_commit(tx); 1954 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1955 } 1956 1957 static void 1958 spa_unload_log_sm_metadata(spa_t *spa) 1959 { 1960 void *cookie = NULL; 1961 spa_log_sm_t *sls; 1962 log_summary_entry_t *e; 1963 1964 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1965 &cookie)) != NULL) { 1966 VERIFY0(sls->sls_mscount); 1967 kmem_free(sls, sizeof (spa_log_sm_t)); 1968 } 1969 1970 while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { 1971 VERIFY0(e->lse_mscount); 1972 kmem_free(e, sizeof (log_summary_entry_t)); 1973 } 1974 1975 spa->spa_unflushed_stats.sus_nblocks = 0; 1976 spa->spa_unflushed_stats.sus_memused = 0; 1977 spa->spa_unflushed_stats.sus_blocklimit = 0; 1978 } 1979 1980 static void 1981 spa_destroy_aux_threads(spa_t *spa) 1982 { 1983 if (spa->spa_condense_zthr != NULL) { 1984 zthr_destroy(spa->spa_condense_zthr); 1985 spa->spa_condense_zthr = NULL; 1986 } 1987 if (spa->spa_checkpoint_discard_zthr != NULL) { 1988 zthr_destroy(spa->spa_checkpoint_discard_zthr); 1989 spa->spa_checkpoint_discard_zthr = NULL; 1990 } 1991 if (spa->spa_livelist_delete_zthr != NULL) { 1992 zthr_destroy(spa->spa_livelist_delete_zthr); 1993 spa->spa_livelist_delete_zthr = NULL; 1994 } 1995 if (spa->spa_livelist_condense_zthr != NULL) { 1996 zthr_destroy(spa->spa_livelist_condense_zthr); 1997 spa->spa_livelist_condense_zthr = NULL; 1998 } 1999 if (spa->spa_raidz_expand_zthr != NULL) { 2000 zthr_destroy(spa->spa_raidz_expand_zthr); 2001 spa->spa_raidz_expand_zthr = NULL; 2002 } 2003 } 2004 2005 /* 2006 * Opposite of spa_load(). 2007 */ 2008 static void 2009 spa_unload(spa_t *spa) 2010 { 2011 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2012 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 2013 2014 spa_import_progress_remove(spa_guid(spa)); 2015 spa_load_note(spa, "UNLOADING"); 2016 2017 spa_wake_waiters(spa); 2018 2019 /* 2020 * If we have set the spa_final_txg, we have already performed the 2021 * tasks below in spa_export_common(). We should not redo it here since 2022 * we delay the final TXGs beyond what spa_final_txg is set at. 2023 */ 2024 if (spa->spa_final_txg == UINT64_MAX) { 2025 /* 2026 * If the log space map feature is enabled and the pool is 2027 * getting exported (but not destroyed), we want to spend some 2028 * time flushing as many metaslabs as we can in an attempt to 2029 * destroy log space maps and save import time. 2030 */ 2031 if (spa_should_flush_logs_on_unload(spa)) 2032 spa_unload_log_sm_flush_all(spa); 2033 2034 /* 2035 * Stop async tasks. 2036 */ 2037 spa_async_suspend(spa); 2038 2039 if (spa->spa_root_vdev) { 2040 vdev_t *root_vdev = spa->spa_root_vdev; 2041 vdev_initialize_stop_all(root_vdev, 2042 VDEV_INITIALIZE_ACTIVE); 2043 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 2044 vdev_autotrim_stop_all(spa); 2045 vdev_rebuild_stop_all(spa); 2046 } 2047 } 2048 2049 /* 2050 * Stop syncing. 2051 */ 2052 if (spa->spa_sync_on) { 2053 txg_sync_stop(spa->spa_dsl_pool); 2054 spa->spa_sync_on = B_FALSE; 2055 } 2056 2057 /* 2058 * This ensures that there is no async metaslab prefetching 2059 * while we attempt to unload the spa. 
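 *
 * (taskq_wait() blocks until every task already dispatched to
 * spa_metaslab_taskq, created in spa_activate(), has finished.)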
2060 */ 2061 taskq_wait(spa->spa_metaslab_taskq); 2062 2063 if (spa->spa_mmp.mmp_thread) 2064 mmp_thread_stop(spa); 2065 2066 /* 2067 * Wait for any outstanding async I/O to complete. 2068 */ 2069 if (spa->spa_async_zio_root != NULL) { 2070 for (int i = 0; i < max_ncpus; i++) 2071 (void) zio_wait(spa->spa_async_zio_root[i]); 2072 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 2073 spa->spa_async_zio_root = NULL; 2074 } 2075 2076 if (spa->spa_vdev_removal != NULL) { 2077 spa_vdev_removal_destroy(spa->spa_vdev_removal); 2078 spa->spa_vdev_removal = NULL; 2079 } 2080 2081 spa_destroy_aux_threads(spa); 2082 2083 spa_condense_fini(spa); 2084 2085 bpobj_close(&spa->spa_deferred_bpobj); 2086 2087 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 2088 2089 /* 2090 * Close all vdevs. 2091 */ 2092 if (spa->spa_root_vdev) 2093 vdev_free(spa->spa_root_vdev); 2094 ASSERT(spa->spa_root_vdev == NULL); 2095 2096 /* 2097 * Close the dsl pool. 2098 */ 2099 if (spa->spa_dsl_pool) { 2100 dsl_pool_close(spa->spa_dsl_pool); 2101 spa->spa_dsl_pool = NULL; 2102 spa->spa_meta_objset = NULL; 2103 } 2104 2105 ddt_unload(spa); 2106 brt_unload(spa); 2107 spa_unload_log_sm_metadata(spa); 2108 2109 /* 2110 * Drop and purge level 2 cache 2111 */ 2112 spa_l2cache_drop(spa); 2113 2114 if (spa->spa_spares.sav_vdevs) { 2115 for (int i = 0; i < spa->spa_spares.sav_count; i++) 2116 vdev_free(spa->spa_spares.sav_vdevs[i]); 2117 kmem_free(spa->spa_spares.sav_vdevs, 2118 spa->spa_spares.sav_count * sizeof (void *)); 2119 spa->spa_spares.sav_vdevs = NULL; 2120 } 2121 if (spa->spa_spares.sav_config) { 2122 nvlist_free(spa->spa_spares.sav_config); 2123 spa->spa_spares.sav_config = NULL; 2124 } 2125 spa->spa_spares.sav_count = 0; 2126 2127 if (spa->spa_l2cache.sav_vdevs) { 2128 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 2129 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 2130 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 2131 } 2132 kmem_free(spa->spa_l2cache.sav_vdevs, 2133 spa->spa_l2cache.sav_count * sizeof (void *)); 2134 spa->spa_l2cache.sav_vdevs = NULL; 2135 } 2136 if (spa->spa_l2cache.sav_config) { 2137 nvlist_free(spa->spa_l2cache.sav_config); 2138 spa->spa_l2cache.sav_config = NULL; 2139 } 2140 spa->spa_l2cache.sav_count = 0; 2141 2142 spa->spa_async_suspended = 0; 2143 2144 spa->spa_indirect_vdevs_loaded = B_FALSE; 2145 2146 if (spa->spa_comment != NULL) { 2147 spa_strfree(spa->spa_comment); 2148 spa->spa_comment = NULL; 2149 } 2150 if (spa->spa_compatibility != NULL) { 2151 spa_strfree(spa->spa_compatibility); 2152 spa->spa_compatibility = NULL; 2153 } 2154 2155 spa->spa_raidz_expand = NULL; 2156 2157 spa_config_exit(spa, SCL_ALL, spa); 2158 } 2159 2160 /* 2161 * Load (or re-load) the current list of vdevs describing the active spares for 2162 * this pool. When this is called, we have some form of basic information in 2163 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 2164 * then re-generate a more complete list including status information. 2165 */ 2166 void 2167 spa_load_spares(spa_t *spa) 2168 { 2169 nvlist_t **spares; 2170 uint_t nspares; 2171 int i; 2172 vdev_t *vd, *tvd; 2173 2174 #ifndef _KERNEL 2175 /* 2176 * zdb opens both the current state of the pool and the 2177 * checkpointed state (if present), with a different spa_t. 2178 * 2179 * As spare vdevs are shared among open pools, we skip loading 2180 * them when we load the checkpointed state of the pool. 
2181 */ 2182 if (!spa_writeable(spa)) 2183 return; 2184 #endif 2185 2186 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2187 2188 /* 2189 * First, close and free any existing spare vdevs. 2190 */ 2191 if (spa->spa_spares.sav_vdevs) { 2192 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2193 vd = spa->spa_spares.sav_vdevs[i]; 2194 2195 /* Undo the call to spa_activate() below */ 2196 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2197 B_FALSE)) != NULL && tvd->vdev_isspare) 2198 spa_spare_remove(tvd); 2199 vdev_close(vd); 2200 vdev_free(vd); 2201 } 2202 2203 kmem_free(spa->spa_spares.sav_vdevs, 2204 spa->spa_spares.sav_count * sizeof (void *)); 2205 } 2206 2207 if (spa->spa_spares.sav_config == NULL) 2208 nspares = 0; 2209 else 2210 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2211 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 2212 2213 spa->spa_spares.sav_count = (int)nspares; 2214 spa->spa_spares.sav_vdevs = NULL; 2215 2216 if (nspares == 0) 2217 return; 2218 2219 /* 2220 * Construct the array of vdevs, opening them to get status in the 2221 * process. For each spare, there are potentially two different vdev_t 2222 * structures associated with it: one in the list of spares (used only 2223 * for basic validation purposes) and one in the active vdev 2224 * configuration (if it's spared in). During this phase we open and 2225 * validate each vdev on the spare list. If the vdev also exists in the 2226 * active configuration, then we also mark this vdev as an active spare. 2227 */ 2228 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 2229 KM_SLEEP); 2230 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2231 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 2232 VDEV_ALLOC_SPARE) == 0); 2233 ASSERT(vd != NULL); 2234 2235 spa->spa_spares.sav_vdevs[i] = vd; 2236 2237 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2238 B_FALSE)) != NULL) { 2239 if (!tvd->vdev_isspare) 2240 spa_spare_add(tvd); 2241 2242 /* 2243 * We only mark the spare active if we were successfully 2244 * able to load the vdev. Otherwise, importing a pool 2245 * with a bad active spare would result in strange 2246 * behavior, because multiple pools would think the spare 2247 * is actively in use. 2248 * 2249 * There is a vulnerability here to an equally bizarre 2250 * circumstance, where a dead active spare is later 2251 * brought back to life (onlined or otherwise). Given 2252 * the rarity of this scenario, and the extra complexity 2253 * it adds, we ignore the possibility. 2254 */ 2255 if (!vdev_is_dead(tvd)) 2256 spa_spare_activate(tvd); 2257 } 2258 2259 vd->vdev_top = vd; 2260 vd->vdev_aux = &spa->spa_spares; 2261 2262 if (vdev_open(vd) != 0) 2263 continue; 2264 2265 if (vdev_validate_aux(vd) == 0) 2266 spa_spare_add(vd); 2267 } 2268 2269 /* 2270 * Recompute the stashed list of spares, with status information 2271 * this time.
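 *
 * "Status information" here means that each spare's nvlist is
 * regenerated from the now-opened vdev via vdev_config_generate(...,
 * VDEV_CONFIG_SPARE), so the stored config reflects the results of
 * the open and validation performed above.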
2272 */ 2273 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 2274 2275 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 2276 KM_SLEEP); 2277 for (i = 0; i < spa->spa_spares.sav_count; i++) 2278 spares[i] = vdev_config_generate(spa, 2279 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 2280 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 2281 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 2282 spa->spa_spares.sav_count); 2283 for (i = 0; i < spa->spa_spares.sav_count; i++) 2284 nvlist_free(spares[i]); 2285 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 2286 } 2287 2288 /* 2289 * Load (or re-load) the current list of vdevs describing the active l2cache for 2290 * this pool. When this is called, we have some form of basic information in 2291 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 2292 * then re-generate a more complete list including status information. 2293 * Devices which are already active have their details maintained, and are 2294 * not re-opened. 2295 */ 2296 void 2297 spa_load_l2cache(spa_t *spa) 2298 { 2299 nvlist_t **l2cache = NULL; 2300 uint_t nl2cache; 2301 int i, j, oldnvdevs; 2302 uint64_t guid; 2303 vdev_t *vd, **oldvdevs, **newvdevs; 2304 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2305 2306 #ifndef _KERNEL 2307 /* 2308 * zdb opens both the current state of the pool and the 2309 * checkpointed state (if present), with a different spa_t. 2310 * 2311 * As L2 caches are part of the ARC which is shared among open 2312 * pools, we skip loading them when we load the checkpointed 2313 * state of the pool. 2314 */ 2315 if (!spa_writeable(spa)) 2316 return; 2317 #endif 2318 2319 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2320 2321 oldvdevs = sav->sav_vdevs; 2322 oldnvdevs = sav->sav_count; 2323 sav->sav_vdevs = NULL; 2324 sav->sav_count = 0; 2325 2326 if (sav->sav_config == NULL) { 2327 nl2cache = 0; 2328 newvdevs = NULL; 2329 goto out; 2330 } 2331 2332 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 2333 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 2334 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 2335 2336 /* 2337 * Process new nvlist of vdevs. 2338 */ 2339 for (i = 0; i < nl2cache; i++) { 2340 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 2341 2342 newvdevs[i] = NULL; 2343 for (j = 0; j < oldnvdevs; j++) { 2344 vd = oldvdevs[j]; 2345 if (vd != NULL && guid == vd->vdev_guid) { 2346 /* 2347 * Retain previous vdev for add/remove ops. 2348 */ 2349 newvdevs[i] = vd; 2350 oldvdevs[j] = NULL; 2351 break; 2352 } 2353 } 2354 2355 if (newvdevs[i] == NULL) { 2356 /* 2357 * Create new vdev 2358 */ 2359 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2360 VDEV_ALLOC_L2CACHE) == 0); 2361 ASSERT(vd != NULL); 2362 newvdevs[i] = vd; 2363 2364 /* 2365 * Commit this vdev as an l2cache device, 2366 * even if it fails to open. 2367 */ 2368 spa_l2cache_add(vd); 2369 2370 vd->vdev_top = vd; 2371 vd->vdev_aux = sav; 2372 2373 spa_l2cache_activate(vd); 2374 2375 if (vdev_open(vd) != 0) 2376 continue; 2377 2378 (void) vdev_validate_aux(vd); 2379 2380 if (!vdev_is_dead(vd)) 2381 l2arc_add_vdev(spa, vd); 2382 2383 /* 2384 * Upon cache device addition to a pool or pool 2385 * creation with a cache device or if the header 2386 * of the device is invalid we issue an async 2387 * TRIM command for the whole device which will 2388 * execute if l2arc_trim_ahead > 0. 
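 *
 * (spa_async_request() only records the request here; the TRIM itself
 * is carried out later by the pool's async task thread.)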
2389 */ 2390 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2391 } 2392 } 2393 2394 sav->sav_vdevs = newvdevs; 2395 sav->sav_count = (int)nl2cache; 2396 2397 /* 2398 * Recompute the stashed list of l2cache devices, with status 2399 * information this time. 2400 */ 2401 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2402 2403 if (sav->sav_count > 0) 2404 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2405 KM_SLEEP); 2406 for (i = 0; i < sav->sav_count; i++) 2407 l2cache[i] = vdev_config_generate(spa, 2408 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2409 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2410 (const nvlist_t * const *)l2cache, sav->sav_count); 2411 2412 out: 2413 /* 2414 * Purge vdevs that were dropped 2415 */ 2416 if (oldvdevs) { 2417 for (i = 0; i < oldnvdevs; i++) { 2418 uint64_t pool; 2419 2420 vd = oldvdevs[i]; 2421 if (vd != NULL) { 2422 ASSERT(vd->vdev_isl2cache); 2423 2424 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2425 pool != 0ULL && l2arc_vdev_present(vd)) 2426 l2arc_remove_vdev(vd); 2427 vdev_clear_stats(vd); 2428 vdev_free(vd); 2429 } 2430 } 2431 2432 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2433 } 2434 2435 for (i = 0; i < sav->sav_count; i++) 2436 nvlist_free(l2cache[i]); 2437 if (sav->sav_count) 2438 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2439 } 2440 2441 static int 2442 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2443 { 2444 dmu_buf_t *db; 2445 char *packed = NULL; 2446 size_t nvsize = 0; 2447 int error; 2448 *value = NULL; 2449 2450 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2451 if (error) 2452 return (error); 2453 2454 nvsize = *(uint64_t *)db->db_data; 2455 dmu_buf_rele(db, FTAG); 2456 2457 packed = vmem_alloc(nvsize, KM_SLEEP); 2458 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2459 DMU_READ_PREFETCH); 2460 if (error == 0) 2461 error = nvlist_unpack(packed, nvsize, value, 0); 2462 vmem_free(packed, nvsize); 2463 2464 return (error); 2465 } 2466 2467 /* 2468 * Concrete top-level vdevs that are not missing and are not logs. At every 2469 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2470 */ 2471 static uint64_t 2472 spa_healthy_core_tvds(spa_t *spa) 2473 { 2474 vdev_t *rvd = spa->spa_root_vdev; 2475 uint64_t tvds = 0; 2476 2477 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2478 vdev_t *vd = rvd->vdev_child[i]; 2479 if (vd->vdev_islog) 2480 continue; 2481 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2482 tvds++; 2483 } 2484 2485 return (tvds); 2486 } 2487 2488 /* 2489 * Checks to see if the given vdev could not be opened, in which case we post a 2490 * sysevent to notify the autoreplace code that the device has been removed. 2491 */ 2492 static void 2493 spa_check_removed(vdev_t *vd) 2494 { 2495 for (uint64_t c = 0; c < vd->vdev_children; c++) 2496 spa_check_removed(vd->vdev_child[c]); 2497 2498 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2499 vdev_is_concrete(vd)) { 2500 zfs_post_autoreplace(vd->vdev_spa, vd); 2501 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2502 } 2503 } 2504 2505 static int 2506 spa_check_for_missing_logs(spa_t *spa) 2507 { 2508 vdev_t *rvd = spa->spa_root_vdev; 2509 2510 /* 2511 * If we're doing a normal import, then build up any additional 2512 * diagnostic information about missing log devices. 2513 * We'll pass this up to the user for further processing. 
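 *
 * The diagnostic list is attached to spa_load_info as
 * ZPOOL_CONFIG_MISSING_DEVICES and the load fails with ENXIO so the
 * caller can report exactly which log devices are absent. When
 * ZFS_IMPORT_MISSING_LOG is set we instead clear the log state and
 * continue the import without the missing logs.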
2514 */ 2515 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2516 nvlist_t **child, *nv; 2517 uint64_t idx = 0; 2518 2519 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2520 KM_SLEEP); 2521 nv = fnvlist_alloc(); 2522 2523 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2524 vdev_t *tvd = rvd->vdev_child[c]; 2525 2526 /* 2527 * We consider a device as missing only if it failed 2528 * to open (i.e. offline or faulted is not considered 2529 * as missing). 2530 */ 2531 if (tvd->vdev_islog && 2532 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2533 child[idx++] = vdev_config_generate(spa, tvd, 2534 B_FALSE, VDEV_CONFIG_MISSING); 2535 } 2536 } 2537 2538 if (idx > 0) { 2539 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2540 (const nvlist_t * const *)child, idx); 2541 fnvlist_add_nvlist(spa->spa_load_info, 2542 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2543 2544 for (uint64_t i = 0; i < idx; i++) 2545 nvlist_free(child[i]); 2546 } 2547 nvlist_free(nv); 2548 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2549 2550 if (idx > 0) { 2551 spa_load_failed(spa, "some log devices are missing"); 2552 vdev_dbgmsg_print_tree(rvd, 2); 2553 return (SET_ERROR(ENXIO)); 2554 } 2555 } else { 2556 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2557 vdev_t *tvd = rvd->vdev_child[c]; 2558 2559 if (tvd->vdev_islog && 2560 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2561 spa_set_log_state(spa, SPA_LOG_CLEAR); 2562 spa_load_note(spa, "some log devices are " 2563 "missing, ZIL is dropped."); 2564 vdev_dbgmsg_print_tree(rvd, 2); 2565 break; 2566 } 2567 } 2568 } 2569 2570 return (0); 2571 } 2572 2573 /* 2574 * Check for missing log devices 2575 */ 2576 static boolean_t 2577 spa_check_logs(spa_t *spa) 2578 { 2579 boolean_t rv = B_FALSE; 2580 dsl_pool_t *dp = spa_get_dsl(spa); 2581 2582 switch (spa->spa_log_state) { 2583 default: 2584 break; 2585 case SPA_LOG_MISSING: 2586 /* need to recheck in case slog has been restored */ 2587 case SPA_LOG_UNKNOWN: 2588 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2589 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2590 if (rv) 2591 spa_set_log_state(spa, SPA_LOG_MISSING); 2592 break; 2593 } 2594 return (rv); 2595 } 2596 2597 /* 2598 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2599 */ 2600 static boolean_t 2601 spa_passivate_log(spa_t *spa) 2602 { 2603 vdev_t *rvd = spa->spa_root_vdev; 2604 boolean_t slog_found = B_FALSE; 2605 2606 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2607 2608 for (int c = 0; c < rvd->vdev_children; c++) { 2609 vdev_t *tvd = rvd->vdev_child[c]; 2610 2611 if (tvd->vdev_islog) { 2612 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2613 metaslab_group_passivate(tvd->vdev_mg); 2614 slog_found = B_TRUE; 2615 } 2616 } 2617 2618 return (slog_found); 2619 } 2620 2621 /* 2622 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
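 *
 * This is the inverse of spa_passivate_log() above: the metaslab group
 * of every top-level log vdev is made available for allocations again.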
2623 */ 2624 static void 2625 spa_activate_log(spa_t *spa) 2626 { 2627 vdev_t *rvd = spa->spa_root_vdev; 2628 2629 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2630 2631 for (int c = 0; c < rvd->vdev_children; c++) { 2632 vdev_t *tvd = rvd->vdev_child[c]; 2633 2634 if (tvd->vdev_islog) { 2635 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2636 metaslab_group_activate(tvd->vdev_mg); 2637 } 2638 } 2639 } 2640 2641 int 2642 spa_reset_logs(spa_t *spa) 2643 { 2644 int error; 2645 2646 error = dmu_objset_find(spa_name(spa), zil_reset, 2647 NULL, DS_FIND_CHILDREN); 2648 if (error == 0) { 2649 /* 2650 * We successfully offlined the log device, sync out the 2651 * current txg so that the "stubby" block can be removed 2652 * by zil_sync(). 2653 */ 2654 txg_wait_synced(spa->spa_dsl_pool, 0); 2655 } 2656 return (error); 2657 } 2658 2659 static void 2660 spa_aux_check_removed(spa_aux_vdev_t *sav) 2661 { 2662 for (int i = 0; i < sav->sav_count; i++) 2663 spa_check_removed(sav->sav_vdevs[i]); 2664 } 2665 2666 void 2667 spa_claim_notify(zio_t *zio) 2668 { 2669 spa_t *spa = zio->io_spa; 2670 2671 if (zio->io_error) 2672 return; 2673 2674 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2675 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2676 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2677 mutex_exit(&spa->spa_props_lock); 2678 } 2679 2680 typedef struct spa_load_error { 2681 boolean_t sle_verify_data; 2682 uint64_t sle_meta_count; 2683 uint64_t sle_data_count; 2684 } spa_load_error_t; 2685 2686 static void 2687 spa_load_verify_done(zio_t *zio) 2688 { 2689 blkptr_t *bp = zio->io_bp; 2690 spa_load_error_t *sle = zio->io_private; 2691 dmu_object_type_t type = BP_GET_TYPE(bp); 2692 int error = zio->io_error; 2693 spa_t *spa = zio->io_spa; 2694 2695 abd_free(zio->io_abd); 2696 if (error) { 2697 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2698 type != DMU_OT_INTENT_LOG) 2699 atomic_inc_64(&sle->sle_meta_count); 2700 else 2701 atomic_inc_64(&sle->sle_data_count); 2702 } 2703 2704 mutex_enter(&spa->spa_scrub_lock); 2705 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2706 cv_broadcast(&spa->spa_scrub_io_cv); 2707 mutex_exit(&spa->spa_scrub_lock); 2708 } 2709 2710 /* 2711 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2712 * By default, we set it to 1/16th of the arc. 2713 */ 2714 static uint_t spa_load_verify_shift = 4; 2715 static int spa_load_verify_metadata = B_TRUE; 2716 static int spa_load_verify_data = B_TRUE; 2717 2718 static int 2719 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2720 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2721 { 2722 zio_t *rio = arg; 2723 spa_load_error_t *sle = rio->io_private; 2724 2725 (void) zilog, (void) dnp; 2726 2727 /* 2728 * Note: normally this routine will not be called if 2729 * spa_load_verify_metadata is not set. However, it may be useful 2730 * to manually set the flag after the traversal has begun. 2731 */ 2732 if (!spa_load_verify_metadata) 2733 return (0); 2734 2735 /* 2736 * Sanity check the block pointer in order to detect obvious damage 2737 * before using the contents in subsequent checks or in zio_read(). 2738 * When damaged consider it to be a metadata error since we cannot 2739 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 
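 *
 * (BLK_VERIFY_LOG asks zfs_blkptr_verify() to log the problem rather
 * than halt; returning 0 afterwards keeps the traversal going while
 * the damage is counted in sle_meta_count.)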
2740 */ 2741 if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { 2742 atomic_inc_64(&sle->sle_meta_count); 2743 return (0); 2744 } 2745 2746 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2747 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2748 return (0); 2749 2750 if (!BP_IS_METADATA(bp) && 2751 (!spa_load_verify_data || !sle->sle_verify_data)) 2752 return (0); 2753 2754 uint64_t maxinflight_bytes = 2755 arc_target_bytes() >> spa_load_verify_shift; 2756 size_t size = BP_GET_PSIZE(bp); 2757 2758 mutex_enter(&spa->spa_scrub_lock); 2759 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2760 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2761 spa->spa_load_verify_bytes += size; 2762 mutex_exit(&spa->spa_scrub_lock); 2763 2764 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2765 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2766 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2767 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2768 return (0); 2769 } 2770 2771 static int 2772 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2773 { 2774 (void) dp, (void) arg; 2775 2776 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2777 return (SET_ERROR(ENAMETOOLONG)); 2778 2779 return (0); 2780 } 2781 2782 static int 2783 spa_load_verify(spa_t *spa) 2784 { 2785 zio_t *rio; 2786 spa_load_error_t sle = { 0 }; 2787 zpool_load_policy_t policy; 2788 boolean_t verify_ok = B_FALSE; 2789 int error = 0; 2790 2791 zpool_get_load_policy(spa->spa_config, &policy); 2792 2793 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 2794 policy.zlp_maxmeta == UINT64_MAX) 2795 return (0); 2796 2797 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2798 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2799 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2800 DS_FIND_CHILDREN); 2801 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2802 if (error != 0) 2803 return (error); 2804 2805 /* 2806 * Verify data only if we are rewinding or error limit was set. 2807 * Otherwise nothing except dbgmsg care about it to waste time. 2808 */ 2809 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 2810 (policy.zlp_maxdata < UINT64_MAX); 2811 2812 rio = zio_root(spa, NULL, &sle, 2813 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2814 2815 if (spa_load_verify_metadata) { 2816 if (spa->spa_extreme_rewind) { 2817 spa_load_note(spa, "performing a complete scan of the " 2818 "pool since extreme rewind is on. 
This may take " 2819 "a very long time.\n (spa_load_verify_data=%u, " 2820 "spa_load_verify_metadata=%u)", 2821 spa_load_verify_data, spa_load_verify_metadata); 2822 } 2823 2824 error = traverse_pool(spa, spa->spa_verify_min_txg, 2825 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2826 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2827 } 2828 2829 (void) zio_wait(rio); 2830 ASSERT0(spa->spa_load_verify_bytes); 2831 2832 spa->spa_load_meta_errors = sle.sle_meta_count; 2833 spa->spa_load_data_errors = sle.sle_data_count; 2834 2835 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2836 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2837 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2838 (u_longlong_t)sle.sle_data_count); 2839 } 2840 2841 if (spa_load_verify_dryrun || 2842 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2843 sle.sle_data_count <= policy.zlp_maxdata)) { 2844 int64_t loss = 0; 2845 2846 verify_ok = B_TRUE; 2847 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2848 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2849 2850 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2851 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2852 spa->spa_load_txg_ts); 2853 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2854 loss); 2855 fnvlist_add_uint64(spa->spa_load_info, 2856 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2857 fnvlist_add_uint64(spa->spa_load_info, 2858 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2859 } else { 2860 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2861 } 2862 2863 if (spa_load_verify_dryrun) 2864 return (0); 2865 2866 if (error) { 2867 if (error != ENXIO && error != EIO) 2868 error = SET_ERROR(EIO); 2869 return (error); 2870 } 2871 2872 return (verify_ok ? 0 : EIO); 2873 } 2874 2875 /* 2876 * Find a value in the pool props object. 2877 */ 2878 static void 2879 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2880 { 2881 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2882 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2883 } 2884 2885 /* 2886 * Find a value in the pool directory object. 
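 *
 * Unlike spa_prop_find() above, a lookup failure is returned to the
 * caller and, in most cases, recorded with spa_load_failed(). A sketch
 * of typical use during load (the key and the rvd variable are purely
 * illustrative here):
 *
 *	uint64_t obj;
 *	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &obj, B_TRUE) != 0)
 *		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));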
2887 */ 2888 static int 2889 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2890 { 2891 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2892 name, sizeof (uint64_t), 1, val); 2893 2894 if (error != 0 && (error != ENOENT || log_enoent)) { 2895 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2896 "[error=%d]", name, error); 2897 } 2898 2899 return (error); 2900 } 2901 2902 static int 2903 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2904 { 2905 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2906 return (SET_ERROR(err)); 2907 } 2908 2909 boolean_t 2910 spa_livelist_delete_check(spa_t *spa) 2911 { 2912 return (spa->spa_livelists_to_delete != 0); 2913 } 2914 2915 static boolean_t 2916 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2917 { 2918 (void) z; 2919 spa_t *spa = arg; 2920 return (spa_livelist_delete_check(spa)); 2921 } 2922 2923 static int 2924 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2925 { 2926 spa_t *spa = arg; 2927 zio_free(spa, tx->tx_txg, bp); 2928 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2929 -bp_get_dsize_sync(spa, bp), 2930 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2931 return (0); 2932 } 2933 2934 static int 2935 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2936 { 2937 int err; 2938 zap_cursor_t zc; 2939 zap_attribute_t za; 2940 zap_cursor_init(&zc, os, zap_obj); 2941 err = zap_cursor_retrieve(&zc, &za); 2942 zap_cursor_fini(&zc); 2943 if (err == 0) 2944 *llp = za.za_first_integer; 2945 return (err); 2946 } 2947 2948 /* 2949 * Components of livelist deletion that must be performed in syncing 2950 * context: freeing block pointers and updating the pool-wide data 2951 * structures to indicate how much work is left to do 2952 */ 2953 typedef struct sublist_delete_arg { 2954 spa_t *spa; 2955 dsl_deadlist_t *ll; 2956 uint64_t key; 2957 bplist_t *to_free; 2958 } sublist_delete_arg_t; 2959 2960 static void 2961 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2962 { 2963 sublist_delete_arg_t *sda = arg; 2964 spa_t *spa = sda->spa; 2965 dsl_deadlist_t *ll = sda->ll; 2966 uint64_t key = sda->key; 2967 bplist_t *to_free = sda->to_free; 2968 2969 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2970 dsl_deadlist_remove_entry(ll, key, tx); 2971 } 2972 2973 typedef struct livelist_delete_arg { 2974 spa_t *spa; 2975 uint64_t ll_obj; 2976 uint64_t zap_obj; 2977 } livelist_delete_arg_t; 2978 2979 static void 2980 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2981 { 2982 livelist_delete_arg_t *lda = arg; 2983 spa_t *spa = lda->spa; 2984 uint64_t ll_obj = lda->ll_obj; 2985 uint64_t zap_obj = lda->zap_obj; 2986 objset_t *mos = spa->spa_meta_objset; 2987 uint64_t count; 2988 2989 /* free the livelist and decrement the feature count */ 2990 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2991 dsl_deadlist_free(mos, ll_obj, tx); 2992 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2993 VERIFY0(zap_count(mos, zap_obj, &count)); 2994 if (count == 0) { 2995 /* no more livelists to delete */ 2996 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2997 DMU_POOL_DELETED_CLONES, tx)); 2998 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2999 spa->spa_livelists_to_delete = 0; 3000 spa_notify_waiters(spa); 3001 } 3002 } 3003 3004 /* 3005 * Load in the value for the livelist to be removed and open it. Then, 3006 * load its first sublist and determine which block pointers should actually 3007 * be freed. 
Then, call a synctask which performs the actual frees and updates 3008 * the pool-wide livelist data. 3009 */ 3010 static void 3011 spa_livelist_delete_cb(void *arg, zthr_t *z) 3012 { 3013 spa_t *spa = arg; 3014 uint64_t ll_obj = 0, count; 3015 objset_t *mos = spa->spa_meta_objset; 3016 uint64_t zap_obj = spa->spa_livelists_to_delete; 3017 /* 3018 * Determine the next livelist to delete. This function should only 3019 * be called if there is at least one deleted clone. 3020 */ 3021 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 3022 VERIFY0(zap_count(mos, ll_obj, &count)); 3023 if (count > 0) { 3024 dsl_deadlist_t *ll; 3025 dsl_deadlist_entry_t *dle; 3026 bplist_t to_free; 3027 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 3028 dsl_deadlist_open(ll, mos, ll_obj); 3029 dle = dsl_deadlist_first(ll); 3030 ASSERT3P(dle, !=, NULL); 3031 bplist_create(&to_free); 3032 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 3033 z, NULL); 3034 if (err == 0) { 3035 sublist_delete_arg_t sync_arg = { 3036 .spa = spa, 3037 .ll = ll, 3038 .key = dle->dle_mintxg, 3039 .to_free = &to_free 3040 }; 3041 zfs_dbgmsg("deleting sublist (id %llu) from" 3042 " livelist %llu, %lld remaining", 3043 (u_longlong_t)dle->dle_bpobj.bpo_object, 3044 (u_longlong_t)ll_obj, (longlong_t)count - 1); 3045 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 3046 sublist_delete_sync, &sync_arg, 0, 3047 ZFS_SPACE_CHECK_DESTROY)); 3048 } else { 3049 VERIFY3U(err, ==, EINTR); 3050 } 3051 bplist_clear(&to_free); 3052 bplist_destroy(&to_free); 3053 dsl_deadlist_close(ll); 3054 kmem_free(ll, sizeof (dsl_deadlist_t)); 3055 } else { 3056 livelist_delete_arg_t sync_arg = { 3057 .spa = spa, 3058 .ll_obj = ll_obj, 3059 .zap_obj = zap_obj 3060 }; 3061 zfs_dbgmsg("deletion of livelist %llu completed", 3062 (u_longlong_t)ll_obj); 3063 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 3064 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 3065 } 3066 } 3067 3068 static void 3069 spa_start_livelist_destroy_thread(spa_t *spa) 3070 { 3071 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 3072 spa->spa_livelist_delete_zthr = 3073 zthr_create("z_livelist_destroy", 3074 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 3075 minclsyspri); 3076 } 3077 3078 typedef struct livelist_new_arg { 3079 bplist_t *allocs; 3080 bplist_t *frees; 3081 } livelist_new_arg_t; 3082 3083 static int 3084 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3085 dmu_tx_t *tx) 3086 { 3087 ASSERT(tx == NULL); 3088 livelist_new_arg_t *lna = arg; 3089 if (bp_freed) { 3090 bplist_append(lna->frees, bp); 3091 } else { 3092 bplist_append(lna->allocs, bp); 3093 zfs_livelist_condense_new_alloc++; 3094 } 3095 return (0); 3096 } 3097 3098 typedef struct livelist_condense_arg { 3099 spa_t *spa; 3100 bplist_t to_keep; 3101 uint64_t first_size; 3102 uint64_t next_size; 3103 } livelist_condense_arg_t; 3104 3105 static void 3106 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 3107 { 3108 livelist_condense_arg_t *lca = arg; 3109 spa_t *spa = lca->spa; 3110 bplist_t new_frees; 3111 dsl_dataset_t *ds = spa->spa_to_condense.ds; 3112 3113 /* Have we been cancelled? 
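 * (If so, bump zfs_livelist_condense_sync_cancel and jump to the out
 * label, which still releases the dataset hold and frees the bplist
 * that was prepared in open context.)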
*/ 3114 if (spa->spa_to_condense.cancelled) { 3115 zfs_livelist_condense_sync_cancel++; 3116 goto out; 3117 } 3118 3119 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3120 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3121 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 3122 3123 /* 3124 * It's possible that the livelist was changed while the zthr was 3125 * running. Therefore, we need to check for new blkptrs in the two 3126 * entries being condensed and continue to track them in the livelist. 3127 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 3128 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 3129 * we need to sort them into two different bplists. 3130 */ 3131 uint64_t first_obj = first->dle_bpobj.bpo_object; 3132 uint64_t next_obj = next->dle_bpobj.bpo_object; 3133 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3134 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3135 3136 bplist_create(&new_frees); 3137 livelist_new_arg_t new_bps = { 3138 .allocs = &lca->to_keep, 3139 .frees = &new_frees, 3140 }; 3141 3142 if (cur_first_size > lca->first_size) { 3143 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 3144 livelist_track_new_cb, &new_bps, lca->first_size)); 3145 } 3146 if (cur_next_size > lca->next_size) { 3147 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 3148 livelist_track_new_cb, &new_bps, lca->next_size)); 3149 } 3150 3151 dsl_deadlist_clear_entry(first, ll, tx); 3152 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 3153 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 3154 3155 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 3156 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 3157 bplist_destroy(&new_frees); 3158 3159 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 3160 dsl_dataset_name(ds, dsname); 3161 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 3162 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 3163 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 3164 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 3165 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 3166 (u_longlong_t)cur_next_size, 3167 (u_longlong_t)first->dle_bpobj.bpo_object, 3168 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 3169 out: 3170 dmu_buf_rele(ds->ds_dbuf, spa); 3171 spa->spa_to_condense.ds = NULL; 3172 bplist_clear(&lca->to_keep); 3173 bplist_destroy(&lca->to_keep); 3174 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3175 spa->spa_to_condense.syncing = B_FALSE; 3176 } 3177 3178 static void 3179 spa_livelist_condense_cb(void *arg, zthr_t *t) 3180 { 3181 while (zfs_livelist_condense_zthr_pause && 3182 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3183 delay(1); 3184 3185 spa_t *spa = arg; 3186 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3187 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3188 uint64_t first_size, next_size; 3189 3190 livelist_condense_arg_t *lca = 3191 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 3192 bplist_create(&lca->to_keep); 3193 3194 /* 3195 * Process the livelists (matching FREEs and ALLOCs) in open context 3196 * so we have minimal work in syncing context to condense. 3197 * 3198 * We save bpobj sizes (first_size and next_size) to use later in 3199 * syncing context to determine if entries were added to these sublists 3200 * while in open context. 
This is possible because the clone is still 3201 * active and open for normal writes and we want to make sure the new, 3202 * unprocessed blockpointers are inserted into the livelist normally. 3203 * 3204 * Note that dsl_process_sub_livelist() both stores the size (the number 3205 * of blockpointers) and iterates over them while the bpobj's lock is held, so 3206 * the sizes returned to us are consistent with what was actually 3207 * processed. 3208 */ 3209 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 3210 &first_size); 3211 if (err == 0) 3212 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 3213 t, &next_size); 3214 3215 if (err == 0) { 3216 while (zfs_livelist_condense_sync_pause && 3217 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3218 delay(1); 3219 3220 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3221 dmu_tx_mark_netfree(tx); 3222 dmu_tx_hold_space(tx, 1); 3223 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); 3224 if (err == 0) { 3225 /* 3226 * Prevent the condense zthr from restarting before 3227 * the synctask completes. 3228 */ 3229 spa->spa_to_condense.syncing = B_TRUE; 3230 lca->spa = spa; 3231 lca->first_size = first_size; 3232 lca->next_size = next_size; 3233 dsl_sync_task_nowait(spa_get_dsl(spa), 3234 spa_livelist_condense_sync, lca, tx); 3235 dmu_tx_commit(tx); 3236 return; 3237 } 3238 } 3239 /* 3240 * Condensing cannot continue: either it was externally stopped or 3241 * we were unable to assign to a tx because the pool has run out of 3242 * space. In the second case, we'll just end up trying to condense 3243 * again in a later txg. 3244 */ 3245 ASSERT(err != 0); 3246 bplist_clear(&lca->to_keep); 3247 bplist_destroy(&lca->to_keep); 3248 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3249 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 3250 spa->spa_to_condense.ds = NULL; 3251 if (err == EINTR) 3252 zfs_livelist_condense_zthr_cancel++; 3253 } 3254 3255 /* 3256 * Check that there is something to condense but that a condense is not 3257 * already in progress and that condensing has not been cancelled.
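 *
 * All three conditions are read from spa->spa_to_condense: ds must be
 * set (there is work queued), syncing must be clear (the previous
 * synctask has finished), and cancelled must be clear.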
3258 */ 3259 static boolean_t 3260 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 3261 { 3262 (void) z; 3263 spa_t *spa = arg; 3264 if ((spa->spa_to_condense.ds != NULL) && 3265 (spa->spa_to_condense.syncing == B_FALSE) && 3266 (spa->spa_to_condense.cancelled == B_FALSE)) { 3267 return (B_TRUE); 3268 } 3269 return (B_FALSE); 3270 } 3271 3272 static void 3273 spa_start_livelist_condensing_thread(spa_t *spa) 3274 { 3275 spa->spa_to_condense.ds = NULL; 3276 spa->spa_to_condense.first = NULL; 3277 spa->spa_to_condense.next = NULL; 3278 spa->spa_to_condense.syncing = B_FALSE; 3279 spa->spa_to_condense.cancelled = B_FALSE; 3280 3281 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 3282 spa->spa_livelist_condense_zthr = 3283 zthr_create("z_livelist_condense", 3284 spa_livelist_condense_cb_check, 3285 spa_livelist_condense_cb, spa, minclsyspri); 3286 } 3287 3288 static void 3289 spa_spawn_aux_threads(spa_t *spa) 3290 { 3291 ASSERT(spa_writeable(spa)); 3292 3293 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3294 3295 spa_start_raidz_expansion_thread(spa); 3296 spa_start_indirect_condensing_thread(spa); 3297 spa_start_livelist_destroy_thread(spa); 3298 spa_start_livelist_condensing_thread(spa); 3299 3300 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 3301 spa->spa_checkpoint_discard_zthr = 3302 zthr_create("z_checkpoint_discard", 3303 spa_checkpoint_discard_thread_check, 3304 spa_checkpoint_discard_thread, spa, minclsyspri); 3305 } 3306 3307 /* 3308 * Fix up config after a partly-completed split. This is done with the 3309 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 3310 * pool have that entry in their config, but only the splitting one contains 3311 * a list of all the guids of the vdevs that are being split off. 3312 * 3313 * This function determines what to do with that list: either rejoin 3314 * all the disks to the pool, or complete the splitting process. To attempt 3315 * the rejoin, each disk that is offlined is marked online again, and 3316 * we do a reopen() call. If the vdev label for every disk that was 3317 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 3318 * then we call vdev_split() on each disk, and complete the split. 3319 * 3320 * Otherwise we leave the config alone, with all the vdevs in place in 3321 * the original pool. 3322 */ 3323 static void 3324 spa_try_repair(spa_t *spa, nvlist_t *config) 3325 { 3326 uint_t extracted; 3327 uint64_t *glist; 3328 uint_t i, gcount; 3329 nvlist_t *nvl; 3330 vdev_t **vd; 3331 boolean_t attempt_reopen; 3332 3333 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 3334 return; 3335 3336 /* check that the config is complete */ 3337 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 3338 &glist, &gcount) != 0) 3339 return; 3340 3341 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 3342 3343 /* attempt to online all the vdevs & validate */ 3344 attempt_reopen = B_TRUE; 3345 for (i = 0; i < gcount; i++) { 3346 if (glist[i] == 0) /* vdev is hole */ 3347 continue; 3348 3349 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 3350 if (vd[i] == NULL) { 3351 /* 3352 * Don't bother attempting to reopen the disks; 3353 * just do the split. 
3354 */ 3355 attempt_reopen = B_FALSE; 3356 } else { 3357 /* attempt to re-online it */ 3358 vd[i]->vdev_offline = B_FALSE; 3359 } 3360 } 3361 3362 if (attempt_reopen) { 3363 vdev_reopen(spa->spa_root_vdev); 3364 3365 /* check each device to see what state it's in */ 3366 for (extracted = 0, i = 0; i < gcount; i++) { 3367 if (vd[i] != NULL && 3368 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3369 break; 3370 ++extracted; 3371 } 3372 } 3373 3374 /* 3375 * If every disk has been moved to the new pool, or if we never 3376 * even attempted to look at them, then we split them off for 3377 * good. 3378 */ 3379 if (!attempt_reopen || gcount == extracted) { 3380 for (i = 0; i < gcount; i++) 3381 if (vd[i] != NULL) 3382 vdev_split(vd[i]); 3383 vdev_reopen(spa->spa_root_vdev); 3384 } 3385 3386 kmem_free(vd, gcount * sizeof (vdev_t *)); 3387 } 3388 3389 static int 3390 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3391 { 3392 const char *ereport = FM_EREPORT_ZFS_POOL; 3393 int error; 3394 3395 spa->spa_load_state = state; 3396 (void) spa_import_progress_set_state(spa_guid(spa), 3397 spa_load_state(spa)); 3398 spa_import_progress_set_notes(spa, "spa_load()"); 3399 3400 gethrestime(&spa->spa_loaded_ts); 3401 error = spa_load_impl(spa, type, &ereport); 3402 3403 /* 3404 * Don't count references from objsets that are already closed 3405 * and are making their way through the eviction process. 3406 */ 3407 spa_evicting_os_wait(spa); 3408 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3409 if (error) { 3410 if (error != EEXIST) { 3411 spa->spa_loaded_ts.tv_sec = 0; 3412 spa->spa_loaded_ts.tv_nsec = 0; 3413 } 3414 if (error != EBADF) { 3415 (void) zfs_ereport_post(ereport, spa, 3416 NULL, NULL, NULL, 0); 3417 } 3418 } 3419 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3420 spa->spa_ena = 0; 3421 3422 (void) spa_import_progress_set_state(spa_guid(spa), 3423 spa_load_state(spa)); 3424 3425 return (error); 3426 } 3427 3428 #ifdef ZFS_DEBUG 3429 /* 3430 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3431 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3432 * spa's per-vdev ZAP list. 3433 */ 3434 static uint64_t 3435 vdev_count_verify_zaps(vdev_t *vd) 3436 { 3437 spa_t *spa = vd->vdev_spa; 3438 uint64_t total = 0; 3439 3440 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3441 vd->vdev_root_zap != 0) { 3442 total++; 3443 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3444 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3445 } 3446 if (vd->vdev_top_zap != 0) { 3447 total++; 3448 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3449 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3450 } 3451 if (vd->vdev_leaf_zap != 0) { 3452 total++; 3453 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3454 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3455 } 3456 3457 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3458 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3459 } 3460 3461 return (total); 3462 } 3463 #else 3464 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3465 #endif 3466 3467 /* 3468 * Determine whether the activity check is required. 
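 *
 * The check is skipped (B_FALSE) when ZFS_IMPORT_SKIP_MMP is set, when
 * MMP is disabled on the pool, when an earlier tryimport already saw
 * the same uberblock, when the pool was last imported by this host, or
 * when the pool was cleanly exported; otherwise B_TRUE is returned.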
3469 */ 3470 static boolean_t 3471 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3472 nvlist_t *config) 3473 { 3474 uint64_t state = 0; 3475 uint64_t hostid = 0; 3476 uint64_t tryconfig_txg = 0; 3477 uint64_t tryconfig_timestamp = 0; 3478 uint16_t tryconfig_mmp_seq = 0; 3479 nvlist_t *nvinfo; 3480 3481 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3482 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3483 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3484 &tryconfig_txg); 3485 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3486 &tryconfig_timestamp); 3487 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3488 &tryconfig_mmp_seq); 3489 } 3490 3491 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3492 3493 /* 3494 * Disable the MMP activity check - This is used by zdb which 3495 * is intended to be used on potentially active pools. 3496 */ 3497 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3498 return (B_FALSE); 3499 3500 /* 3501 * Skip the activity check when the MMP feature is disabled. 3502 */ 3503 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3504 return (B_FALSE); 3505 3506 /* 3507 * If the tryconfig_ values are nonzero, they are the results of an 3508 * earlier tryimport. If they all match the uberblock we just found, 3509 * then the pool has not changed and we return false so we do not test 3510 * a second time. 3511 */ 3512 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3513 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3514 tryconfig_mmp_seq && tryconfig_mmp_seq == 3515 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3516 return (B_FALSE); 3517 3518 /* 3519 * Allow the activity check to be skipped when importing the pool 3520 * on the same host which last imported it. Since the hostid from 3521 * configuration may be stale use the one read from the label. 3522 */ 3523 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3524 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3525 3526 if (hostid == spa_get_hostid(spa)) 3527 return (B_FALSE); 3528 3529 /* 3530 * Skip the activity test when the pool was cleanly exported. 3531 */ 3532 if (state != POOL_STATE_ACTIVE) 3533 return (B_FALSE); 3534 3535 return (B_TRUE); 3536 } 3537 3538 /* 3539 * Nanoseconds the activity check must watch for changes on-disk. 3540 */ 3541 static uint64_t 3542 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3543 { 3544 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3545 uint64_t multihost_interval = MSEC2NSEC( 3546 MMP_INTERVAL_OK(zfs_multihost_interval)); 3547 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3548 multihost_interval); 3549 3550 /* 3551 * Local tunables determine a minimum duration except for the case 3552 * where we know when the remote host will suspend the pool if MMP 3553 * writes do not land. 3554 * 3555 * See Big Theory comment at the top of mmp.c for the reasoning behind 3556 * these cases and times. 
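 *
 * Worked example with illustrative (not necessarily default) values:
 * with zfs_multihost_interval = 1000 ms and
 * zfs_multihost_import_intervals = 10, the baseline computed above is
 * MAX(1 s, 10 * 1 s) = 10 s, before any of the uberblock-driven
 * adjustments below.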
3557 */ 3558 3559 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3560 3561 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3562 MMP_FAIL_INT(ub) > 0) { 3563 3564 /* MMP on remote host will suspend pool after failed writes */ 3565 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3566 MMP_IMPORT_SAFETY_FACTOR / 100; 3567 3568 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3569 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3570 "import_intervals=%llu", (u_longlong_t)import_delay, 3571 (u_longlong_t)MMP_FAIL_INT(ub), 3572 (u_longlong_t)MMP_INTERVAL(ub), 3573 (u_longlong_t)import_intervals); 3574 3575 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3576 MMP_FAIL_INT(ub) == 0) { 3577 3578 /* MMP on remote host will never suspend pool */ 3579 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3580 ub->ub_mmp_delay) * import_intervals); 3581 3582 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3583 "mmp_interval=%llu ub_mmp_delay=%llu " 3584 "import_intervals=%llu", (u_longlong_t)import_delay, 3585 (u_longlong_t)MMP_INTERVAL(ub), 3586 (u_longlong_t)ub->ub_mmp_delay, 3587 (u_longlong_t)import_intervals); 3588 3589 } else if (MMP_VALID(ub)) { 3590 /* 3591 * zfs-0.7 compatibility case 3592 */ 3593 3594 import_delay = MAX(import_delay, (multihost_interval + 3595 ub->ub_mmp_delay) * import_intervals); 3596 3597 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3598 "import_intervals=%llu leaves=%u", 3599 (u_longlong_t)import_delay, 3600 (u_longlong_t)ub->ub_mmp_delay, 3601 (u_longlong_t)import_intervals, 3602 vdev_count_leaves(spa)); 3603 } else { 3604 /* Using local tunings is the only reasonable option */ 3605 zfs_dbgmsg("pool last imported on non-MMP aware " 3606 "host using import_delay=%llu multihost_interval=%llu " 3607 "import_intervals=%llu", (u_longlong_t)import_delay, 3608 (u_longlong_t)multihost_interval, 3609 (u_longlong_t)import_intervals); 3610 } 3611 3612 return (import_delay); 3613 } 3614 3615 /* 3616 * Perform the import activity check. If the user canceled the import or 3617 * we detected activity then fail. 3618 */ 3619 static int 3620 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) 3621 { 3622 uint64_t txg = ub->ub_txg; 3623 uint64_t timestamp = ub->ub_timestamp; 3624 uint64_t mmp_config = ub->ub_mmp_config; 3625 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3626 uint64_t import_delay; 3627 hrtime_t import_expire, now; 3628 nvlist_t *mmp_label = NULL; 3629 vdev_t *rvd = spa->spa_root_vdev; 3630 kcondvar_t cv; 3631 kmutex_t mtx; 3632 int error = 0; 3633 3634 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3635 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3636 mutex_enter(&mtx); 3637 3638 /* 3639 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3640 * during the earlier tryimport. If the txg recorded there is 0 then 3641 * the pool is known to be active on another host. 3642 * 3643 * Otherwise, the pool might be in use on another host. Check for 3644 * changes in the uberblocks on disk if necessary. 
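 *
 * In the "known active" case we still load the best uberblock label
 * below, but only so the remote hostname/hostid can be copied into
 * spa_load_info before failing with EREMOTEIO.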
3645 */ 3646 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3647 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3648 ZPOOL_CONFIG_LOAD_INFO); 3649 3650 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3651 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3652 vdev_uberblock_load(rvd, ub, &mmp_label); 3653 error = SET_ERROR(EREMOTEIO); 3654 goto out; 3655 } 3656 } 3657 3658 import_delay = spa_activity_check_duration(spa, ub); 3659 3660 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3661 import_delay += import_delay * random_in_range(250) / 1000; 3662 3663 import_expire = gethrtime() + import_delay; 3664 3665 spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " 3666 "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); 3667 3668 int iterations = 0; 3669 while ((now = gethrtime()) < import_expire) { 3670 if (iterations++ % 30 == 0) { 3671 spa_import_progress_set_notes(spa, "Checking MMP " 3672 "activity, %llu ms remaining", 3673 (u_longlong_t)NSEC2MSEC(import_expire - now)); 3674 } 3675 3676 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3677 NSEC2SEC(import_expire - gethrtime())); 3678 3679 vdev_uberblock_load(rvd, ub, &mmp_label); 3680 3681 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3682 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3683 zfs_dbgmsg("multihost activity detected " 3684 "txg %llu ub_txg %llu " 3685 "timestamp %llu ub_timestamp %llu " 3686 "mmp_config %#llx ub_mmp_config %#llx", 3687 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3688 (u_longlong_t)timestamp, 3689 (u_longlong_t)ub->ub_timestamp, 3690 (u_longlong_t)mmp_config, 3691 (u_longlong_t)ub->ub_mmp_config); 3692 3693 error = SET_ERROR(EREMOTEIO); 3694 break; 3695 } 3696 3697 if (mmp_label) { 3698 nvlist_free(mmp_label); 3699 mmp_label = NULL; 3700 } 3701 3702 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3703 if (error != -1) { 3704 error = SET_ERROR(EINTR); 3705 break; 3706 } 3707 error = 0; 3708 } 3709 3710 out: 3711 mutex_exit(&mtx); 3712 mutex_destroy(&mtx); 3713 cv_destroy(&cv); 3714 3715 /* 3716 * If the pool is determined to be active, store the status in the 3717 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3718 * available from the configuration read from disk, store them as well. 3719 * This allows 'zpool import' to generate a more useful message.
3720 * 3721 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3722 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3723 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3724 */ 3725 if (error == EREMOTEIO) { 3726 const char *hostname = "<unknown>"; 3727 uint64_t hostid = 0; 3728 3729 if (mmp_label) { 3730 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3731 hostname = fnvlist_lookup_string(mmp_label, 3732 ZPOOL_CONFIG_HOSTNAME); 3733 fnvlist_add_string(spa->spa_load_info, 3734 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3735 } 3736 3737 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3738 hostid = fnvlist_lookup_uint64(mmp_label, 3739 ZPOOL_CONFIG_HOSTID); 3740 fnvlist_add_uint64(spa->spa_load_info, 3741 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3742 } 3743 } 3744 3745 fnvlist_add_uint64(spa->spa_load_info, 3746 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3747 fnvlist_add_uint64(spa->spa_load_info, 3748 ZPOOL_CONFIG_MMP_TXG, 0); 3749 3750 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3751 } 3752 3753 if (mmp_label) 3754 nvlist_free(mmp_label); 3755 3756 return (error); 3757 } 3758 3759 static int 3760 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3761 { 3762 uint64_t hostid; 3763 const char *hostname; 3764 uint64_t myhostid = 0; 3765 3766 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3767 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3768 hostname = fnvlist_lookup_string(mos_config, 3769 ZPOOL_CONFIG_HOSTNAME); 3770 3771 myhostid = zone_get_hostid(NULL); 3772 3773 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3774 cmn_err(CE_WARN, "pool '%s' could not be " 3775 "loaded as it was last accessed by " 3776 "another system (host: %s hostid: 0x%llx). " 3777 "See: https://openzfs.github.io/openzfs-docs/msg/" 3778 "ZFS-8000-EY", 3779 spa_name(spa), hostname, (u_longlong_t)hostid); 3780 spa_load_failed(spa, "hostid verification failed: pool " 3781 "last accessed by host: %s (hostid: 0x%llx)", 3782 hostname, (u_longlong_t)hostid); 3783 return (SET_ERROR(EBADF)); 3784 } 3785 } 3786 3787 return (0); 3788 } 3789 3790 static int 3791 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3792 { 3793 int error = 0; 3794 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3795 int parse; 3796 vdev_t *rvd; 3797 uint64_t pool_guid; 3798 const char *comment; 3799 const char *compatibility; 3800 3801 /* 3802 * Versioning wasn't explicitly added to the label until later, so if 3803 * it's not present treat it as the initial version. 3804 */ 3805 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3806 &spa->spa_ubsync.ub_version) != 0) 3807 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3808 3809 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3810 spa_load_failed(spa, "invalid config provided: '%s' missing", 3811 ZPOOL_CONFIG_POOL_GUID); 3812 return (SET_ERROR(EINVAL)); 3813 } 3814 3815 /* 3816 * If we are doing an import, ensure that the pool is not already 3817 * imported by checking if its pool guid already exists in the 3818 * spa namespace. 3819 * 3820 * The only case that we allow an already imported pool to be 3821 * imported again, is when the pool is checkpointed and we want to 3822 * look at its checkpointed state from userland tools like zdb. 
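 *
 * That exception only applies to userland builds: the #ifndef _KERNEL
 * variant below additionally permits the import when
 * spa_importing_readonly_checkpoint() is true.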
3823 */ 3824 #ifdef _KERNEL 3825 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3826 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3827 spa_guid_exists(pool_guid, 0)) { 3828 #else 3829 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3830 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3831 spa_guid_exists(pool_guid, 0) && 3832 !spa_importing_readonly_checkpoint(spa)) { 3833 #endif 3834 spa_load_failed(spa, "a pool with guid %llu is already open", 3835 (u_longlong_t)pool_guid); 3836 return (SET_ERROR(EEXIST)); 3837 } 3838 3839 spa->spa_config_guid = pool_guid; 3840 3841 nvlist_free(spa->spa_load_info); 3842 spa->spa_load_info = fnvlist_alloc(); 3843 3844 ASSERT(spa->spa_comment == NULL); 3845 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3846 spa->spa_comment = spa_strdup(comment); 3847 3848 ASSERT(spa->spa_compatibility == NULL); 3849 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3850 &compatibility) == 0) 3851 spa->spa_compatibility = spa_strdup(compatibility); 3852 3853 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3854 &spa->spa_config_txg); 3855 3856 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3857 spa->spa_config_splitting = fnvlist_dup(nvl); 3858 3859 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3860 spa_load_failed(spa, "invalid config provided: '%s' missing", 3861 ZPOOL_CONFIG_VDEV_TREE); 3862 return (SET_ERROR(EINVAL)); 3863 } 3864 3865 /* 3866 * Create "The Godfather" zio to hold all async IOs 3867 */ 3868 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3869 KM_SLEEP); 3870 for (int i = 0; i < max_ncpus; i++) { 3871 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3872 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3873 ZIO_FLAG_GODFATHER); 3874 } 3875 3876 /* 3877 * Parse the configuration into a vdev tree. We explicitly set the 3878 * value that will be returned by spa_version() since parsing the 3879 * configuration requires knowing the version number. 3880 */ 3881 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3882 parse = (type == SPA_IMPORT_EXISTING ? 3883 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3884 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3885 spa_config_exit(spa, SCL_ALL, FTAG); 3886 3887 if (error != 0) { 3888 spa_load_failed(spa, "unable to parse config [error=%d]", 3889 error); 3890 return (error); 3891 } 3892 3893 ASSERT(spa->spa_root_vdev == rvd); 3894 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3895 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3896 3897 if (type != SPA_IMPORT_ASSEMBLE) { 3898 ASSERT(spa_guid(spa) == pool_guid); 3899 } 3900 3901 return (0); 3902 } 3903 3904 /* 3905 * Recursively open all vdevs in the vdev tree. This function is called twice: 3906 * first with the untrusted config, then with the trusted config. 3907 */ 3908 static int 3909 spa_ld_open_vdevs(spa_t *spa) 3910 { 3911 int error = 0; 3912 3913 /* 3914 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3915 * missing/unopenable for the root vdev to be still considered openable. 
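 * The allowed count depends on how much we trust the config: a trusted config uses zfs_max_missing_tvds, while untrusted configs taken from the cachefile or from a device scan are governed by their own tunables below.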
3916 */ 3917 if (spa->spa_trust_config) { 3918 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3919 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3920 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3921 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3922 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3923 } else { 3924 spa->spa_missing_tvds_allowed = 0; 3925 } 3926 3927 spa->spa_missing_tvds_allowed = 3928 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3929 3930 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3931 error = vdev_open(spa->spa_root_vdev); 3932 spa_config_exit(spa, SCL_ALL, FTAG); 3933 3934 if (spa->spa_missing_tvds != 0) { 3935 spa_load_note(spa, "vdev tree has %lld missing top-level " 3936 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3937 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3938 /* 3939 * Although theoretically we could allow users to open 3940 * incomplete pools in RW mode, we'd need to add a lot 3941 * of extra logic (e.g. adjust pool space to account 3942 * for missing vdevs). 3943 * This limitation also prevents users from accidentally 3944 * opening the pool in RW mode during data recovery and 3945 * damaging it further. 3946 */ 3947 spa_load_note(spa, "pools with missing top-level " 3948 "vdevs can only be opened in read-only mode."); 3949 error = SET_ERROR(ENXIO); 3950 } else { 3951 spa_load_note(spa, "current settings allow for maximum " 3952 "%lld missing top-level vdevs at this stage.", 3953 (u_longlong_t)spa->spa_missing_tvds_allowed); 3954 } 3955 } 3956 if (error != 0) { 3957 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3958 error); 3959 } 3960 if (spa->spa_missing_tvds != 0 || error != 0) 3961 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 3962 3963 return (error); 3964 } 3965 3966 /* 3967 * We need to validate the vdev labels against the configuration that 3968 * we have in hand. This function is called twice: first with an untrusted 3969 * config, then with a trusted config. The validation is more strict when the 3970 * config is trusted. 3971 */ 3972 static int 3973 spa_ld_validate_vdevs(spa_t *spa) 3974 { 3975 int error = 0; 3976 vdev_t *rvd = spa->spa_root_vdev; 3977 3978 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3979 error = vdev_validate(rvd); 3980 spa_config_exit(spa, SCL_ALL, FTAG); 3981 3982 if (error != 0) { 3983 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 3984 return (error); 3985 } 3986 3987 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 3988 spa_load_failed(spa, "cannot open vdev tree after invalidating " 3989 "some vdevs"); 3990 vdev_dbgmsg_print_tree(rvd, 2); 3991 return (SET_ERROR(ENXIO)); 3992 } 3993 3994 return (0); 3995 } 3996 3997 static void 3998 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 3999 { 4000 spa->spa_state = POOL_STATE_ACTIVE; 4001 spa->spa_ubsync = spa->spa_uberblock; 4002 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 4003 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 4004 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
4005 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 4006 spa->spa_claim_max_txg = spa->spa_first_txg; 4007 spa->spa_prev_software_version = ub->ub_software_version; 4008 } 4009 4010 static int 4011 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 4012 { 4013 vdev_t *rvd = spa->spa_root_vdev; 4014 nvlist_t *label; 4015 uberblock_t *ub = &spa->spa_uberblock; 4016 boolean_t activity_check = B_FALSE; 4017 4018 /* 4019 * If we are opening the checkpointed state of the pool by 4020 * rewinding to it, at this point we will have written the 4021 * checkpointed uberblock to the vdev labels, so searching 4022 * the labels will find the right uberblock. However, if 4023 * we are opening the checkpointed state read-only, we have 4024 * not modified the labels. Therefore, we must ignore the 4025 * labels and continue using the spa_uberblock that was set 4026 * by spa_ld_checkpoint_rewind. 4027 * 4028 * Note that it would be fine to ignore the labels when 4029 * rewinding (opening writeable) as well. However, if we 4030 * crash just after writing the labels, we will end up 4031 * searching the labels. Doing so in the common case means 4032 * that this code path gets exercised normally, rather than 4033 * just in the edge case. 4034 */ 4035 if (ub->ub_checkpoint_txg != 0 && 4036 spa_importing_readonly_checkpoint(spa)) { 4037 spa_ld_select_uberblock_done(spa, ub); 4038 return (0); 4039 } 4040 4041 /* 4042 * Find the best uberblock. 4043 */ 4044 vdev_uberblock_load(rvd, ub, &label); 4045 4046 /* 4047 * If we weren't able to find a single valid uberblock, return failure. 4048 */ 4049 if (ub->ub_txg == 0) { 4050 nvlist_free(label); 4051 spa_load_failed(spa, "no valid uberblock found"); 4052 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 4053 } 4054 4055 if (spa->spa_load_max_txg != UINT64_MAX) { 4056 (void) spa_import_progress_set_max_txg(spa_guid(spa), 4057 (u_longlong_t)spa->spa_load_max_txg); 4058 } 4059 spa_load_note(spa, "using uberblock with txg=%llu", 4060 (u_longlong_t)ub->ub_txg); 4061 if (ub->ub_raidz_reflow_info != 0) { 4062 spa_load_note(spa, "uberblock raidz_reflow_info: " 4063 "state=%u offset=%llu", 4064 (int)RRSS_GET_STATE(ub), 4065 (u_longlong_t)RRSS_GET_OFFSET(ub)); 4066 } 4067 4068 4069 /* 4070 * For pools which have the multihost property on determine if the 4071 * pool is truly inactive and can be safely imported. Prevent 4072 * hosts which don't have a hostid set from importing the pool. 4073 */ 4074 activity_check = spa_activity_check_required(spa, ub, label, 4075 spa->spa_config); 4076 if (activity_check) { 4077 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 4078 spa_get_hostid(spa) == 0) { 4079 nvlist_free(label); 4080 fnvlist_add_uint64(spa->spa_load_info, 4081 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4082 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4083 } 4084 4085 int error = spa_activity_check(spa, ub, spa->spa_config); 4086 if (error) { 4087 nvlist_free(label); 4088 return (error); 4089 } 4090 4091 fnvlist_add_uint64(spa->spa_load_info, 4092 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 4093 fnvlist_add_uint64(spa->spa_load_info, 4094 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 4095 fnvlist_add_uint16(spa->spa_load_info, 4096 ZPOOL_CONFIG_MMP_SEQ, 4097 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 4098 } 4099 4100 /* 4101 * If the pool has an unsupported version we can't open it. 
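 * (SPA_VERSION_IS_SUPPORTED() accepts the legacy versions up to SPA_VERSION_BEFORE_FEATURES as well as the feature-flags range from SPA_VERSION_FEATURES through SPA_VERSION.)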
4102 */ 4103 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 4104 nvlist_free(label); 4105 spa_load_failed(spa, "version %llu is not supported", 4106 (u_longlong_t)ub->ub_version); 4107 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 4108 } 4109 4110 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4111 nvlist_t *features; 4112 4113 /* 4114 * If we weren't able to find what's necessary for reading the 4115 * MOS in the label, return failure. 4116 */ 4117 if (label == NULL) { 4118 spa_load_failed(spa, "label config unavailable"); 4119 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4120 ENXIO)); 4121 } 4122 4123 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 4124 &features) != 0) { 4125 nvlist_free(label); 4126 spa_load_failed(spa, "invalid label: '%s' missing", 4127 ZPOOL_CONFIG_FEATURES_FOR_READ); 4128 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4129 ENXIO)); 4130 } 4131 4132 /* 4133 * Update our in-core representation with the definitive values 4134 * from the label. 4135 */ 4136 nvlist_free(spa->spa_label_features); 4137 spa->spa_label_features = fnvlist_dup(features); 4138 } 4139 4140 nvlist_free(label); 4141 4142 /* 4143 * Look through entries in the label nvlist's features_for_read. If 4144 * there is a feature listed there which we don't understand then we 4145 * cannot open a pool. 4146 */ 4147 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4148 nvlist_t *unsup_feat; 4149 4150 unsup_feat = fnvlist_alloc(); 4151 4152 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 4153 NULL); nvp != NULL; 4154 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 4155 if (!zfeature_is_supported(nvpair_name(nvp))) { 4156 fnvlist_add_string(unsup_feat, 4157 nvpair_name(nvp), ""); 4158 } 4159 } 4160 4161 if (!nvlist_empty(unsup_feat)) { 4162 fnvlist_add_nvlist(spa->spa_load_info, 4163 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4164 nvlist_free(unsup_feat); 4165 spa_load_failed(spa, "some features are unsupported"); 4166 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4167 ENOTSUP)); 4168 } 4169 4170 nvlist_free(unsup_feat); 4171 } 4172 4173 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 4174 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4175 spa_try_repair(spa, spa->spa_config); 4176 spa_config_exit(spa, SCL_ALL, FTAG); 4177 nvlist_free(spa->spa_config_splitting); 4178 spa->spa_config_splitting = NULL; 4179 } 4180 4181 /* 4182 * Initialize internal SPA structures. 
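 * spa_ld_select_uberblock_done() copies the chosen uberblock into spa_ubsync and derives spa_first_txg and spa_claim_max_txg from it.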
4183 */ 4184 spa_ld_select_uberblock_done(spa, ub); 4185 4186 return (0); 4187 } 4188 4189 static int 4190 spa_ld_open_rootbp(spa_t *spa) 4191 { 4192 int error = 0; 4193 vdev_t *rvd = spa->spa_root_vdev; 4194 4195 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 4196 if (error != 0) { 4197 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 4198 "[error=%d]", error); 4199 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4200 } 4201 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 4202 4203 return (0); 4204 } 4205 4206 static int 4207 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 4208 boolean_t reloading) 4209 { 4210 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 4211 nvlist_t *nv, *mos_config, *policy; 4212 int error = 0, copy_error; 4213 uint64_t healthy_tvds, healthy_tvds_mos; 4214 uint64_t mos_config_txg; 4215 4216 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 4217 != 0) 4218 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4219 4220 /* 4221 * If we're assembling a pool from a split, the config provided is 4222 * already trusted so there is nothing to do. 4223 */ 4224 if (type == SPA_IMPORT_ASSEMBLE) 4225 return (0); 4226 4227 healthy_tvds = spa_healthy_core_tvds(spa); 4228 4229 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 4230 != 0) { 4231 spa_load_failed(spa, "unable to retrieve MOS config"); 4232 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4233 } 4234 4235 /* 4236 * If we are doing an open, pool owner wasn't verified yet, thus do 4237 * the verification here. 4238 */ 4239 if (spa->spa_load_state == SPA_LOAD_OPEN) { 4240 error = spa_verify_host(spa, mos_config); 4241 if (error != 0) { 4242 nvlist_free(mos_config); 4243 return (error); 4244 } 4245 } 4246 4247 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 4248 4249 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4250 4251 /* 4252 * Build a new vdev tree from the trusted config 4253 */ 4254 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 4255 if (error != 0) { 4256 nvlist_free(mos_config); 4257 spa_config_exit(spa, SCL_ALL, FTAG); 4258 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 4259 error); 4260 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4261 } 4262 4263 /* 4264 * Vdev paths in the MOS may be obsolete. If the untrusted config was 4265 * obtained by scanning /dev/dsk, then it will have the right vdev 4266 * paths. We update the trusted MOS config with this information. 4267 * We first try to copy the paths with vdev_copy_path_strict, which 4268 * succeeds only when both configs have exactly the same vdev tree. 4269 * If that fails, we fall back to a more flexible method that has a 4270 * best effort policy. 
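 * (vdev_copy_path_relaxed() matches vdevs by guid rather than by position in the tree, so it can usually recover the paths even when the two trees differ in shape.)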
4271 */ 4272 copy_error = vdev_copy_path_strict(rvd, mrvd); 4273 if (copy_error != 0 || spa_load_print_vdev_tree) { 4274 spa_load_note(spa, "provided vdev tree:"); 4275 vdev_dbgmsg_print_tree(rvd, 2); 4276 spa_load_note(spa, "MOS vdev tree:"); 4277 vdev_dbgmsg_print_tree(mrvd, 2); 4278 } 4279 if (copy_error != 0) { 4280 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 4281 "back to vdev_copy_path_relaxed"); 4282 vdev_copy_path_relaxed(rvd, mrvd); 4283 } 4284 4285 vdev_close(rvd); 4286 vdev_free(rvd); 4287 spa->spa_root_vdev = mrvd; 4288 rvd = mrvd; 4289 spa_config_exit(spa, SCL_ALL, FTAG); 4290 4291 /* 4292 * If 'zpool import' used a cached config, then the on-disk hostid and 4293 * hostname may be different to the cached config in ways that should 4294 * prevent import. Userspace can't discover this without a scan, but 4295 * we know, so we add these values to LOAD_INFO so the caller can know 4296 * the difference. 4297 * 4298 * Note that we have to do this before the config is regenerated, 4299 * because the new config will have the hostid and hostname for this 4300 * host, in readiness for import. 4301 */ 4302 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) 4303 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, 4304 fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); 4305 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) 4306 fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, 4307 fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); 4308 4309 /* 4310 * We will use spa_config if we decide to reload the spa or if spa_load 4311 * fails and we rewind. We must thus regenerate the config using the 4312 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 4313 * pass settings on how to load the pool and is not stored in the MOS. 4314 * We copy it over to our new, trusted config. 4315 */ 4316 mos_config_txg = fnvlist_lookup_uint64(mos_config, 4317 ZPOOL_CONFIG_POOL_TXG); 4318 nvlist_free(mos_config); 4319 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 4320 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 4321 &policy) == 0) 4322 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 4323 spa_config_set(spa, mos_config); 4324 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 4325 4326 /* 4327 * Now that we got the config from the MOS, we should be more strict 4328 * in checking blkptrs and can make assumptions about the consistency 4329 * of the vdev tree. spa_trust_config must be set to true before opening 4330 * vdevs in order for them to be writeable. 4331 */ 4332 spa->spa_trust_config = B_TRUE; 4333 4334 /* 4335 * Open and validate the new vdev tree 4336 */ 4337 error = spa_ld_open_vdevs(spa); 4338 if (error != 0) 4339 return (error); 4340 4341 error = spa_ld_validate_vdevs(spa); 4342 if (error != 0) 4343 return (error); 4344 4345 if (copy_error != 0 || spa_load_print_vdev_tree) { 4346 spa_load_note(spa, "final vdev tree:"); 4347 vdev_dbgmsg_print_tree(rvd, 2); 4348 } 4349 4350 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 4351 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 4352 /* 4353 * Sanity check to make sure that we are indeed loading the 4354 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 4355 * in the config provided and they happened to be the only ones 4356 * to have the latest uberblock, we could involuntarily perform 4357 * an extreme rewind. 
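 * A shortfall of SPA_SYNC_MIN_VDEVS or more healthy top-level vdevs (provided config vs. MOS config) is treated as suspicious below: we either abort if we are already reloading, or return EAGAIN to redo the load from the MOS config.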
4358 */ 4359 healthy_tvds_mos = spa_healthy_core_tvds(spa); 4360 if (healthy_tvds_mos - healthy_tvds >= 4361 SPA_SYNC_MIN_VDEVS) { 4362 spa_load_note(spa, "config provided misses too many " 4363 "top-level vdevs compared to MOS (%lld vs %lld). ", 4364 (u_longlong_t)healthy_tvds, 4365 (u_longlong_t)healthy_tvds_mos); 4366 spa_load_note(spa, "vdev tree:"); 4367 vdev_dbgmsg_print_tree(rvd, 2); 4368 if (reloading) { 4369 spa_load_failed(spa, "config was already " 4370 "provided from MOS. Aborting."); 4371 return (spa_vdev_err(rvd, 4372 VDEV_AUX_CORRUPT_DATA, EIO)); 4373 } 4374 spa_load_note(spa, "spa must be reloaded using MOS " 4375 "config"); 4376 return (SET_ERROR(EAGAIN)); 4377 } 4378 } 4379 4380 error = spa_check_for_missing_logs(spa); 4381 if (error != 0) 4382 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 4383 4384 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 4385 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 4386 "guid sum (%llu != %llu)", 4387 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4388 (u_longlong_t)rvd->vdev_guid_sum); 4389 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4390 ENXIO)); 4391 } 4392 4393 return (0); 4394 } 4395 4396 static int 4397 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4398 { 4399 int error = 0; 4400 vdev_t *rvd = spa->spa_root_vdev; 4401 4402 /* 4403 * Everything that we read before spa_remove_init() must be stored 4404 * on concrete vdevs. Therefore we do this as early as possible. 4405 */ 4406 error = spa_remove_init(spa); 4407 if (error != 0) { 4408 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4409 error); 4410 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4411 } 4412 4413 /* 4414 * Retrieve information needed to condense indirect vdev mappings.
4415 */ 4416 error = spa_condense_init(spa); 4417 if (error != 0) { 4418 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4419 error); 4420 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4421 } 4422 4423 return (0); 4424 } 4425 4426 static int 4427 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4428 { 4429 int error = 0; 4430 vdev_t *rvd = spa->spa_root_vdev; 4431 4432 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4433 boolean_t missing_feat_read = B_FALSE; 4434 nvlist_t *unsup_feat, *enabled_feat; 4435 4436 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4437 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4438 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4439 } 4440 4441 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4442 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4443 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4444 } 4445 4446 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4447 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4448 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4449 } 4450 4451 enabled_feat = fnvlist_alloc(); 4452 unsup_feat = fnvlist_alloc(); 4453 4454 if (!spa_features_check(spa, B_FALSE, 4455 unsup_feat, enabled_feat)) 4456 missing_feat_read = B_TRUE; 4457 4458 if (spa_writeable(spa) || 4459 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4460 if (!spa_features_check(spa, B_TRUE, 4461 unsup_feat, enabled_feat)) { 4462 *missing_feat_writep = B_TRUE; 4463 } 4464 } 4465 4466 fnvlist_add_nvlist(spa->spa_load_info, 4467 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4468 4469 if (!nvlist_empty(unsup_feat)) { 4470 fnvlist_add_nvlist(spa->spa_load_info, 4471 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4472 } 4473 4474 fnvlist_free(enabled_feat); 4475 fnvlist_free(unsup_feat); 4476 4477 if (!missing_feat_read) { 4478 fnvlist_add_boolean(spa->spa_load_info, 4479 ZPOOL_CONFIG_CAN_RDONLY); 4480 } 4481 4482 /* 4483 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4484 * twofold: to determine whether the pool is available for 4485 * import in read-write mode and (if it is not) whether the 4486 * pool is available for import in read-only mode. If the pool 4487 * is available for import in read-write mode, it is displayed 4488 * as available in userland; if it is not available for import 4489 * in read-only mode, it is displayed as unavailable in 4490 * userland. If the pool is available for import in read-only 4491 * mode but not read-write mode, it is displayed as unavailable 4492 * in userland with a special note that the pool is actually 4493 * available for open in read-only mode. 4494 * 4495 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4496 * missing a feature for write, we must first determine whether 4497 * the pool can be opened read-only before returning to 4498 * userland in order to know whether to display the 4499 * abovementioned note. 4500 */ 4501 if (missing_feat_read || (*missing_feat_writep && 4502 spa_writeable(spa))) { 4503 spa_load_failed(spa, "pool uses unsupported features"); 4504 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4505 ENOTSUP)); 4506 } 4507 4508 /* 4509 * Load refcounts for ZFS features from disk into an in-memory 4510 * cache during SPA initialization. 
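 * Caching the refcounts means later feature queries do not need to go back to the MOS ZAPs; a feature object that is not present at all (ENOTSUP below) is simply recorded as disabled.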
4511 */ 4512 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4513 uint64_t refcount; 4514 4515 error = feature_get_refcount_from_disk(spa, 4516 &spa_feature_table[i], &refcount); 4517 if (error == 0) { 4518 spa->spa_feat_refcount_cache[i] = refcount; 4519 } else if (error == ENOTSUP) { 4520 spa->spa_feat_refcount_cache[i] = 4521 SPA_FEATURE_DISABLED; 4522 } else { 4523 spa_load_failed(spa, "error getting refcount " 4524 "for feature %s [error=%d]", 4525 spa_feature_table[i].fi_guid, error); 4526 return (spa_vdev_err(rvd, 4527 VDEV_AUX_CORRUPT_DATA, EIO)); 4528 } 4529 } 4530 } 4531 4532 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4533 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4534 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4535 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4536 } 4537 4538 /* 4539 * Encryption was added before bookmark_v2, even though bookmark_v2 4540 * is now a dependency. If this pool has encryption enabled without 4541 * bookmark_v2, trigger an errata message. 4542 */ 4543 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4544 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4545 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4546 } 4547 4548 return (0); 4549 } 4550 4551 static int 4552 spa_ld_load_special_directories(spa_t *spa) 4553 { 4554 int error = 0; 4555 vdev_t *rvd = spa->spa_root_vdev; 4556 4557 spa->spa_is_initializing = B_TRUE; 4558 error = dsl_pool_open(spa->spa_dsl_pool); 4559 spa->spa_is_initializing = B_FALSE; 4560 if (error != 0) { 4561 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4562 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4563 } 4564 4565 return (0); 4566 } 4567 4568 static int 4569 spa_ld_get_props(spa_t *spa) 4570 { 4571 int error = 0; 4572 uint64_t obj; 4573 vdev_t *rvd = spa->spa_root_vdev; 4574 4575 /* Grab the checksum salt from the MOS. */ 4576 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4577 DMU_POOL_CHECKSUM_SALT, 1, 4578 sizeof (spa->spa_cksum_salt.zcs_bytes), 4579 spa->spa_cksum_salt.zcs_bytes); 4580 if (error == ENOENT) { 4581 /* Generate a new salt for subsequent use */ 4582 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4583 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4584 } else if (error != 0) { 4585 spa_load_failed(spa, "unable to retrieve checksum salt from " 4586 "MOS [error=%d]", error); 4587 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4588 } 4589 4590 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4591 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4592 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4593 if (error != 0) { 4594 spa_load_failed(spa, "error opening deferred-frees bpobj " 4595 "[error=%d]", error); 4596 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4597 } 4598 4599 /* 4600 * Load the bit that tells us to use the new accounting function 4601 * (raid-z deflation). If we have an older pool, this will not 4602 * be present. 4603 */ 4604 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4605 if (error != 0 && error != ENOENT) 4606 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4607 4608 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4609 &spa->spa_creation_version, B_FALSE); 4610 if (error != 0 && error != ENOENT) 4611 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4612 4613 /* 4614 * Load the persistent error log. If we have an older pool, this will 4615 * not be present. 
4616 */ 4617 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4618 B_FALSE); 4619 if (error != 0 && error != ENOENT) 4620 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4621 4622 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4623 &spa->spa_errlog_scrub, B_FALSE); 4624 if (error != 0 && error != ENOENT) 4625 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4626 4627 /* 4628 * Load the livelist deletion field. If a livelist is queued for 4629 * deletion, indicate that in the spa 4630 */ 4631 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4632 &spa->spa_livelists_to_delete, B_FALSE); 4633 if (error != 0 && error != ENOENT) 4634 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4635 4636 /* 4637 * Load the history object. If we have an older pool, this 4638 * will not be present. 4639 */ 4640 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4641 if (error != 0 && error != ENOENT) 4642 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4643 4644 /* 4645 * Load the per-vdev ZAP map. If we have an older pool, this will not 4646 * be present; in this case, defer its creation to a later time to 4647 * avoid dirtying the MOS this early / out of sync context. See 4648 * spa_sync_config_object. 4649 */ 4650 4651 /* The sentinel is only available in the MOS config. */ 4652 nvlist_t *mos_config; 4653 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4654 spa_load_failed(spa, "unable to retrieve MOS config"); 4655 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4656 } 4657 4658 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4659 &spa->spa_all_vdev_zaps, B_FALSE); 4660 4661 if (error == ENOENT) { 4662 VERIFY(!nvlist_exists(mos_config, 4663 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4664 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4665 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4666 } else if (error != 0) { 4667 nvlist_free(mos_config); 4668 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4669 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4670 /* 4671 * An older version of ZFS overwrote the sentinel value, so 4672 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4673 * destruction to later; see spa_sync_config_object. 4674 */ 4675 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4676 /* 4677 * We're assuming that no vdevs have had their ZAPs created 4678 * before this. Better be sure of it. 
4679 */ 4680 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4681 } 4682 nvlist_free(mos_config); 4683 4684 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4685 4686 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4687 B_FALSE); 4688 if (error && error != ENOENT) 4689 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4690 4691 if (error == 0) { 4692 uint64_t autoreplace = 0; 4693 4694 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4695 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4696 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4697 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4698 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4699 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4700 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4701 spa->spa_autoreplace = (autoreplace != 0); 4702 } 4703 4704 /* 4705 * If we are importing a pool with missing top-level vdevs, 4706 * we enforce that the pool doesn't panic or get suspended on 4707 * error since the likelihood of missing data is extremely high. 4708 */ 4709 if (spa->spa_missing_tvds > 0 && 4710 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4711 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4712 spa_load_note(spa, "forcing failmode to 'continue' " 4713 "as some top level vdevs are missing"); 4714 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4715 } 4716 4717 return (0); 4718 } 4719 4720 static int 4721 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4722 { 4723 int error = 0; 4724 vdev_t *rvd = spa->spa_root_vdev; 4725 4726 /* 4727 * If we're assembling the pool from the split-off vdevs of 4728 * an existing pool, we don't want to attach the spares & cache 4729 * devices. 4730 */ 4731 4732 /* 4733 * Load any hot spares for this pool. 4734 */ 4735 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4736 B_FALSE); 4737 if (error != 0 && error != ENOENT) 4738 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4739 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4740 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4741 if (load_nvlist(spa, spa->spa_spares.sav_object, 4742 &spa->spa_spares.sav_config) != 0) { 4743 spa_load_failed(spa, "error loading spares nvlist"); 4744 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4745 } 4746 4747 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4748 spa_load_spares(spa); 4749 spa_config_exit(spa, SCL_ALL, FTAG); 4750 } else if (error == 0) { 4751 spa->spa_spares.sav_sync = B_TRUE; 4752 } 4753 4754 /* 4755 * Load any level 2 ARC devices for this pool. 
4756 */ 4757 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4758 &spa->spa_l2cache.sav_object, B_FALSE); 4759 if (error != 0 && error != ENOENT) 4760 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4761 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4762 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4763 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4764 &spa->spa_l2cache.sav_config) != 0) { 4765 spa_load_failed(spa, "error loading l2cache nvlist"); 4766 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4767 } 4768 4769 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4770 spa_load_l2cache(spa); 4771 spa_config_exit(spa, SCL_ALL, FTAG); 4772 } else if (error == 0) { 4773 spa->spa_l2cache.sav_sync = B_TRUE; 4774 } 4775 4776 return (0); 4777 } 4778 4779 static int 4780 spa_ld_load_vdev_metadata(spa_t *spa) 4781 { 4782 int error = 0; 4783 vdev_t *rvd = spa->spa_root_vdev; 4784 4785 /* 4786 * If the 'multihost' property is set, then never allow a pool to 4787 * be imported when the system hostid is zero. The exception to 4788 * this rule is zdb which is always allowed to access pools. 4789 */ 4790 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4791 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4792 fnvlist_add_uint64(spa->spa_load_info, 4793 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4794 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4795 } 4796 4797 /* 4798 * If the 'autoreplace' property is set, then post a resource notifying 4799 * the ZFS DE that it should not issue any faults for unopenable 4800 * devices. We also iterate over the vdevs, and post a sysevent for any 4801 * unopenable vdevs so that the normal autoreplace handler can take 4802 * over. 4803 */ 4804 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4805 spa_check_removed(spa->spa_root_vdev); 4806 /* 4807 * For the import case, this is done in spa_import(), because 4808 * at this point we're using the spare definitions from 4809 * the MOS config, not necessarily from the userland config. 4810 */ 4811 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4812 spa_aux_check_removed(&spa->spa_spares); 4813 spa_aux_check_removed(&spa->spa_l2cache); 4814 } 4815 } 4816 4817 /* 4818 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4819 */ 4820 error = vdev_load(rvd); 4821 if (error != 0) { 4822 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4823 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4824 } 4825 4826 error = spa_ld_log_spacemaps(spa); 4827 if (error != 0) { 4828 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4829 error); 4830 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4831 } 4832 4833 /* 4834 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
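 * In effect, only leaf vdevs persist DTLs on disk; the DTLs of interior vdevs are recomputed here from their children.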
4835 */ 4836 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4837 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4838 spa_config_exit(spa, SCL_ALL, FTAG); 4839 4840 return (0); 4841 } 4842 4843 static int 4844 spa_ld_load_dedup_tables(spa_t *spa) 4845 { 4846 int error = 0; 4847 vdev_t *rvd = spa->spa_root_vdev; 4848 4849 error = ddt_load(spa); 4850 if (error != 0) { 4851 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4852 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4853 } 4854 4855 return (0); 4856 } 4857 4858 static int 4859 spa_ld_load_brt(spa_t *spa) 4860 { 4861 int error = 0; 4862 vdev_t *rvd = spa->spa_root_vdev; 4863 4864 error = brt_load(spa); 4865 if (error != 0) { 4866 spa_load_failed(spa, "brt_load failed [error=%d]", error); 4867 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4868 } 4869 4870 return (0); 4871 } 4872 4873 static int 4874 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4875 { 4876 vdev_t *rvd = spa->spa_root_vdev; 4877 4878 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4879 boolean_t missing = spa_check_logs(spa); 4880 if (missing) { 4881 if (spa->spa_missing_tvds != 0) { 4882 spa_load_note(spa, "spa_check_logs failed " 4883 "so dropping the logs"); 4884 } else { 4885 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4886 spa_load_failed(spa, "spa_check_logs failed"); 4887 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4888 ENXIO)); 4889 } 4890 } 4891 } 4892 4893 return (0); 4894 } 4895 4896 static int 4897 spa_ld_verify_pool_data(spa_t *spa) 4898 { 4899 int error = 0; 4900 vdev_t *rvd = spa->spa_root_vdev; 4901 4902 /* 4903 * We've successfully opened the pool, verify that we're ready 4904 * to start pushing transactions. 4905 */ 4906 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4907 error = spa_load_verify(spa); 4908 if (error != 0) { 4909 spa_load_failed(spa, "spa_load_verify failed " 4910 "[error=%d]", error); 4911 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4912 error)); 4913 } 4914 } 4915 4916 return (0); 4917 } 4918 4919 static void 4920 spa_ld_claim_log_blocks(spa_t *spa) 4921 { 4922 dmu_tx_t *tx; 4923 dsl_pool_t *dp = spa_get_dsl(spa); 4924 4925 /* 4926 * Claim log blocks that haven't been committed yet. 4927 * This must all happen in a single txg. 4928 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4929 * invoked from zil_claim_log_block()'s i/o done callback. 4930 * Price of rollback is that we abandon the log. 4931 */ 4932 spa->spa_claiming = B_TRUE; 4933 4934 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4935 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4936 zil_claim, tx, DS_FIND_CHILDREN); 4937 dmu_tx_commit(tx); 4938 4939 spa->spa_claiming = B_FALSE; 4940 4941 spa_set_log_state(spa, SPA_LOG_GOOD); 4942 } 4943 4944 static void 4945 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4946 boolean_t update_config_cache) 4947 { 4948 vdev_t *rvd = spa->spa_root_vdev; 4949 int need_update = B_FALSE; 4950 4951 /* 4952 * If the config cache is stale, or we have uninitialized 4953 * metaslabs (see spa_vdev_add()), then update the config. 4954 * 4955 * If this is a verbatim import, trust the current 4956 * in-core spa_config and update the disk labels. 
4957 */ 4958 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 4959 spa->spa_load_state == SPA_LOAD_IMPORT || 4960 spa->spa_load_state == SPA_LOAD_RECOVER || 4961 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 4962 need_update = B_TRUE; 4963 4964 for (int c = 0; c < rvd->vdev_children; c++) 4965 if (rvd->vdev_child[c]->vdev_ms_array == 0) 4966 need_update = B_TRUE; 4967 4968 /* 4969 * Update the config cache asynchronously in case we're the 4970 * root pool, in which case the config cache isn't writable yet. 4971 */ 4972 if (need_update) 4973 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4974 } 4975 4976 static void 4977 spa_ld_prepare_for_reload(spa_t *spa) 4978 { 4979 spa_mode_t mode = spa->spa_mode; 4980 int async_suspended = spa->spa_async_suspended; 4981 4982 spa_unload(spa); 4983 spa_deactivate(spa); 4984 spa_activate(spa, mode); 4985 4986 /* 4987 * We save the value of spa_async_suspended as it gets reset to 0 by 4988 * spa_unload(). We want to restore it back to the original value before 4989 * returning as we might be calling spa_async_resume() later. 4990 */ 4991 spa->spa_async_suspended = async_suspended; 4992 } 4993 4994 static int 4995 spa_ld_read_checkpoint_txg(spa_t *spa) 4996 { 4997 uberblock_t checkpoint; 4998 int error = 0; 4999 5000 ASSERT0(spa->spa_checkpoint_txg); 5001 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5002 5003 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5004 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5005 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5006 5007 if (error == ENOENT) 5008 return (0); 5009 5010 if (error != 0) 5011 return (error); 5012 5013 ASSERT3U(checkpoint.ub_txg, !=, 0); 5014 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 5015 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 5016 spa->spa_checkpoint_txg = checkpoint.ub_txg; 5017 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 5018 5019 return (0); 5020 } 5021 5022 static int 5023 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 5024 { 5025 int error = 0; 5026 5027 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5028 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5029 5030 /* 5031 * Never trust the config that is provided unless we are assembling 5032 * a pool following a split. 5033 * This means don't trust blkptrs and the vdev tree in general. This 5034 * also effectively puts the spa in read-only mode since 5035 * spa_writeable() checks for spa_trust_config to be true. 5036 * We will later load a trusted config from the MOS. 5037 */ 5038 if (type != SPA_IMPORT_ASSEMBLE) 5039 spa->spa_trust_config = B_FALSE; 5040 5041 /* 5042 * Parse the config provided to create a vdev tree. 5043 */ 5044 error = spa_ld_parse_config(spa, type); 5045 if (error != 0) 5046 return (error); 5047 5048 spa_import_progress_add(spa); 5049 5050 /* 5051 * Now that we have the vdev tree, try to open each vdev. This involves 5052 * opening the underlying physical device, retrieving its geometry and 5053 * probing the vdev with a dummy I/O. The state of each vdev will be set 5054 * based on the success of those operations. After this we'll be ready 5055 * to read from the vdevs. 5056 */ 5057 error = spa_ld_open_vdevs(spa); 5058 if (error != 0) 5059 return (error); 5060 5061 /* 5062 * Read the label of each vdev and make sure that the GUIDs stored 5063 * there match the GUIDs in the config provided. 
5064 * If we're assembling a new pool that's been split off from an 5065 * existing pool, the labels haven't yet been updated so we skip 5066 * validation for now. 5067 */ 5068 if (type != SPA_IMPORT_ASSEMBLE) { 5069 error = spa_ld_validate_vdevs(spa); 5070 if (error != 0) 5071 return (error); 5072 } 5073 5074 /* 5075 * Read all vdev labels to find the best uberblock (i.e. latest, 5076 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 5077 * get the list of features required to read blkptrs in the MOS from 5078 * the vdev label with the best uberblock and verify that our version 5079 * of zfs supports them all. 5080 */ 5081 error = spa_ld_select_uberblock(spa, type); 5082 if (error != 0) 5083 return (error); 5084 5085 /* 5086 * Pass that uberblock to the dsl_pool layer which will open the root 5087 * blkptr. This blkptr points to the latest version of the MOS and will 5088 * allow us to read its contents. 5089 */ 5090 error = spa_ld_open_rootbp(spa); 5091 if (error != 0) 5092 return (error); 5093 5094 return (0); 5095 } 5096 5097 static int 5098 spa_ld_checkpoint_rewind(spa_t *spa) 5099 { 5100 uberblock_t checkpoint; 5101 int error = 0; 5102 5103 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5104 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5105 5106 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5107 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5108 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5109 5110 if (error != 0) { 5111 spa_load_failed(spa, "unable to retrieve checkpointed " 5112 "uberblock from the MOS config [error=%d]", error); 5113 5114 if (error == ENOENT) 5115 error = ZFS_ERR_NO_CHECKPOINT; 5116 5117 return (error); 5118 } 5119 5120 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 5121 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 5122 5123 /* 5124 * We need to update the txg and timestamp of the checkpointed 5125 * uberblock to be higher than the latest one. This ensures that 5126 * the checkpointed uberblock is selected if we were to close and 5127 * reopen the pool right after we've written it in the vdev labels. 5128 * (also see block comment in vdev_uberblock_compare) 5129 */ 5130 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 5131 checkpoint.ub_timestamp = gethrestime_sec(); 5132 5133 /* 5134 * Set current uberblock to be the checkpointed uberblock. 5135 */ 5136 spa->spa_uberblock = checkpoint; 5137 5138 /* 5139 * If we are doing a normal rewind, then the pool is open for 5140 * writing and we sync the "updated" checkpointed uberblock to 5141 * disk. Once this is done, we've basically rewound the whole 5142 * pool and there is no way back. 5143 * 5144 * There are cases when we don't want to attempt and sync the 5145 * checkpointed uberblock to disk because we are opening a 5146 * pool as read-only. Specifically, verifying the checkpointed 5147 * state with zdb, and importing the checkpointed state to get 5148 * a "preview" of its content. 
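 * The spa_writeable() check below covers both read-only cases: nothing is written to the labels, so the rewind leaves the on-disk state untouched.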
5149 */ 5150 if (spa_writeable(spa)) { 5151 vdev_t *rvd = spa->spa_root_vdev; 5152 5153 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5154 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 5155 int svdcount = 0; 5156 int children = rvd->vdev_children; 5157 int c0 = random_in_range(children); 5158 5159 for (int c = 0; c < children; c++) { 5160 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 5161 5162 /* Stop when revisiting the first vdev */ 5163 if (c > 0 && svd[0] == vd) 5164 break; 5165 5166 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 5167 !vdev_is_concrete(vd)) 5168 continue; 5169 5170 svd[svdcount++] = vd; 5171 if (svdcount == SPA_SYNC_MIN_VDEVS) 5172 break; 5173 } 5174 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 5175 if (error == 0) 5176 spa->spa_last_synced_guid = rvd->vdev_guid; 5177 spa_config_exit(spa, SCL_ALL, FTAG); 5178 5179 if (error != 0) { 5180 spa_load_failed(spa, "failed to write checkpointed " 5181 "uberblock to the vdev labels [error=%d]", error); 5182 return (error); 5183 } 5184 } 5185 5186 return (0); 5187 } 5188 5189 static int 5190 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 5191 boolean_t *update_config_cache) 5192 { 5193 int error; 5194 5195 /* 5196 * Parse the config for pool, open and validate vdevs, 5197 * select an uberblock, and use that uberblock to open 5198 * the MOS. 5199 */ 5200 error = spa_ld_mos_init(spa, type); 5201 if (error != 0) 5202 return (error); 5203 5204 /* 5205 * Retrieve the trusted config stored in the MOS and use it to create 5206 * a new, exact version of the vdev tree, then reopen all vdevs. 5207 */ 5208 error = spa_ld_trusted_config(spa, type, B_FALSE); 5209 if (error == EAGAIN) { 5210 if (update_config_cache != NULL) 5211 *update_config_cache = B_TRUE; 5212 5213 /* 5214 * Redo the loading process with the trusted config if it is 5215 * too different from the untrusted config. 5216 */ 5217 spa_ld_prepare_for_reload(spa); 5218 spa_load_note(spa, "RELOADING"); 5219 error = spa_ld_mos_init(spa, type); 5220 if (error != 0) 5221 return (error); 5222 5223 error = spa_ld_trusted_config(spa, type, B_TRUE); 5224 if (error != 0) 5225 return (error); 5226 5227 } else if (error != 0) { 5228 return (error); 5229 } 5230 5231 return (0); 5232 } 5233 5234 /* 5235 * Load an existing storage pool, using the config provided. This config 5236 * describes which vdevs are part of the pool and is later validated against 5237 * partial configs present in each vdev's label and an entire copy of the 5238 * config stored in the MOS. 5239 */ 5240 static int 5241 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 5242 { 5243 int error = 0; 5244 boolean_t missing_feat_write = B_FALSE; 5245 boolean_t checkpoint_rewind = 5246 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5247 boolean_t update_config_cache = B_FALSE; 5248 5249 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5250 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5251 5252 spa_load_note(spa, "LOADING"); 5253 5254 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 5255 if (error != 0) 5256 return (error); 5257 5258 /* 5259 * If we are rewinding to the checkpoint then we need to repeat 5260 * everything we've done so far in this function but this time 5261 * selecting the checkpointed uberblock and using that to open 5262 * the MOS. 5263 */ 5264 if (checkpoint_rewind) { 5265 /* 5266 * If we are rewinding to the checkpoint update config cache 5267 * anyway. 
5268 */ 5269 update_config_cache = B_TRUE; 5270 5271 /* 5272 * Extract the checkpointed uberblock from the current MOS 5273 * and use this as the pool's uberblock from now on. If the 5274 * pool is imported as writeable we also write the checkpoint 5275 * uberblock to the labels, making the rewind permanent. 5276 */ 5277 error = spa_ld_checkpoint_rewind(spa); 5278 if (error != 0) 5279 return (error); 5280 5281 /* 5282 * Redo the loading process again with the 5283 * checkpointed uberblock. 5284 */ 5285 spa_ld_prepare_for_reload(spa); 5286 spa_load_note(spa, "LOADING checkpointed uberblock"); 5287 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 5288 if (error != 0) 5289 return (error); 5290 } 5291 5292 /* 5293 * Retrieve the checkpoint txg if the pool has a checkpoint. 5294 */ 5295 spa_import_progress_set_notes(spa, "Loading checkpoint txg"); 5296 error = spa_ld_read_checkpoint_txg(spa); 5297 if (error != 0) 5298 return (error); 5299 5300 /* 5301 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 5302 * from the pool and their contents were re-mapped to other vdevs. Note 5303 * that everything that we read before this step must have been 5304 * rewritten on concrete vdevs after the last device removal was 5305 * initiated. Otherwise we could be reading from indirect vdevs before 5306 * we have loaded their mappings. 5307 */ 5308 spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); 5309 error = spa_ld_open_indirect_vdev_metadata(spa); 5310 if (error != 0) 5311 return (error); 5312 5313 /* 5314 * Retrieve the full list of active features from the MOS and check if 5315 * they are all supported. 5316 */ 5317 spa_import_progress_set_notes(spa, "Checking feature flags"); 5318 error = spa_ld_check_features(spa, &missing_feat_write); 5319 if (error != 0) 5320 return (error); 5321 5322 /* 5323 * Load several special directories from the MOS needed by the dsl_pool 5324 * layer. 5325 */ 5326 spa_import_progress_set_notes(spa, "Loading special MOS directories"); 5327 error = spa_ld_load_special_directories(spa); 5328 if (error != 0) 5329 return (error); 5330 5331 /* 5332 * Retrieve pool properties from the MOS. 5333 */ 5334 spa_import_progress_set_notes(spa, "Loading properties"); 5335 error = spa_ld_get_props(spa); 5336 if (error != 0) 5337 return (error); 5338 5339 /* 5340 * Retrieve the list of auxiliary devices - cache devices and spares - 5341 * and open them. 5342 */ 5343 spa_import_progress_set_notes(spa, "Loading AUX vdevs"); 5344 error = spa_ld_open_aux_vdevs(spa, type); 5345 if (error != 0) 5346 return (error); 5347 5348 /* 5349 * Load the metadata for all vdevs. Also check if unopenable devices 5350 * should be autoreplaced. 5351 */ 5352 spa_import_progress_set_notes(spa, "Loading vdev metadata"); 5353 error = spa_ld_load_vdev_metadata(spa); 5354 if (error != 0) 5355 return (error); 5356 5357 spa_import_progress_set_notes(spa, "Loading dedup tables"); 5358 error = spa_ld_load_dedup_tables(spa); 5359 if (error != 0) 5360 return (error); 5361 5362 spa_import_progress_set_notes(spa, "Loading BRT"); 5363 error = spa_ld_load_brt(spa); 5364 if (error != 0) 5365 return (error); 5366 5367 /* 5368 * Verify the logs now to make sure we don't have any unexpected errors 5369 * when we claim log blocks later. 
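 * (spa_ld_verify_logs() fails the load with VDEV_AUX_BAD_LOG unless top-level vdevs are already known to be missing, in which case the logs are simply dropped.)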
5370 */ 5371 spa_import_progress_set_notes(spa, "Verifying Log Devices"); 5372 error = spa_ld_verify_logs(spa, type, ereport); 5373 if (error != 0) 5374 return (error); 5375 5376 if (missing_feat_write) { 5377 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 5378 5379 /* 5380 * At this point, we know that we can open the pool in 5381 * read-only mode but not read-write mode. We now have enough 5382 * information and can return to userland. 5383 */ 5384 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 5385 ENOTSUP)); 5386 } 5387 5388 /* 5389 * Traverse the last txgs to make sure the pool was left off in a safe 5390 * state. When performing an extreme rewind, we verify the whole pool, 5391 * which can take a very long time. 5392 */ 5393 spa_import_progress_set_notes(spa, "Verifying pool data"); 5394 error = spa_ld_verify_pool_data(spa); 5395 if (error != 0) 5396 return (error); 5397 5398 /* 5399 * Calculate the deflated space for the pool. This must be done before 5400 * we write anything to the pool because we'd need to update the space 5401 * accounting using the deflated sizes. 5402 */ 5403 spa_import_progress_set_notes(spa, "Calculating deflated space"); 5404 spa_update_dspace(spa); 5405 5406 /* 5407 * We have now retrieved all the information we needed to open the 5408 * pool. If we are importing the pool in read-write mode, a few 5409 * additional steps must be performed to finish the import. 5410 */ 5411 spa_import_progress_set_notes(spa, "Starting import"); 5412 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5413 spa->spa_load_max_txg == UINT64_MAX)) { 5414 uint64_t config_cache_txg = spa->spa_config_txg; 5415 5416 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5417 5418 /* 5419 * Before we do any zio_write's, complete the raidz expansion 5420 * scratch space copying, if necessary. 5421 */ 5422 if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) 5423 vdev_raidz_reflow_copy_scratch(spa); 5424 5425 /* 5426 * In case of a checkpoint rewind, log the original txg 5427 * of the checkpointed uberblock. 5428 */ 5429 if (checkpoint_rewind) { 5430 spa_history_log_internal(spa, "checkpoint rewind", 5431 NULL, "rewound state to txg=%llu", 5432 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5433 } 5434 5435 spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); 5436 /* 5437 * Traverse the ZIL and claim all blocks. 5438 */ 5439 spa_ld_claim_log_blocks(spa); 5440 5441 /* 5442 * Kick-off the syncing thread. 5443 */ 5444 spa->spa_sync_on = B_TRUE; 5445 txg_sync_start(spa->spa_dsl_pool); 5446 mmp_thread_start(spa); 5447 5448 /* 5449 * Wait for all claims to sync. We sync up to the highest 5450 * claimed log block birth time so that claimed log blocks 5451 * don't appear to be from the future. spa_claim_max_txg 5452 * will have been set for us by ZIL traversal operations 5453 * performed above. 5454 */ 5455 spa_import_progress_set_notes(spa, "Syncing ZIL claims"); 5456 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5457 5458 /* 5459 * Check if we need to request an update of the config. On the 5460 * next sync, we would update the config stored in vdev labels 5461 * and the cachefile (by default /etc/zfs/zpool.cache). 5462 */ 5463 spa_import_progress_set_notes(spa, "Updating configs"); 5464 spa_ld_check_for_config_update(spa, config_cache_txg, 5465 update_config_cache); 5466 5467 /* 5468 * Check if a rebuild was in progress and if so resume it. 5469 * Then check all DTLs to see if anything needs resilvering. 
5470 * The resilver will be deferred if a rebuild was started. 5471 */ 5472 spa_import_progress_set_notes(spa, "Starting resilvers"); 5473 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5474 vdev_rebuild_restart(spa); 5475 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5476 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5477 spa_async_request(spa, SPA_ASYNC_RESILVER); 5478 } 5479 5480 /* 5481 * Log the fact that we booted up (so that we can detect if 5482 * we rebooted in the middle of an operation). 5483 */ 5484 spa_history_log_version(spa, "open", NULL); 5485 5486 spa_import_progress_set_notes(spa, 5487 "Restarting device removals"); 5488 spa_restart_removal(spa); 5489 spa_spawn_aux_threads(spa); 5490 5491 /* 5492 * Delete any inconsistent datasets. 5493 * 5494 * Note: 5495 * Since we may be issuing deletes for clones here, 5496 * we make sure to do so after we've spawned all the 5497 * auxiliary threads above (from which the livelist 5498 * deletion zthr is part of). 5499 */ 5500 spa_import_progress_set_notes(spa, 5501 "Cleaning up inconsistent objsets"); 5502 (void) dmu_objset_find(spa_name(spa), 5503 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5504 5505 /* 5506 * Clean up any stale temporary dataset userrefs. 5507 */ 5508 spa_import_progress_set_notes(spa, 5509 "Cleaning up temporary userrefs"); 5510 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5511 5512 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5513 spa_import_progress_set_notes(spa, "Restarting initialize"); 5514 vdev_initialize_restart(spa->spa_root_vdev); 5515 spa_import_progress_set_notes(spa, "Restarting TRIM"); 5516 vdev_trim_restart(spa->spa_root_vdev); 5517 vdev_autotrim_restart(spa); 5518 spa_config_exit(spa, SCL_CONFIG, FTAG); 5519 spa_import_progress_set_notes(spa, "Finished importing"); 5520 } 5521 5522 spa_import_progress_remove(spa_guid(spa)); 5523 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5524 5525 spa_load_note(spa, "LOADED"); 5526 5527 return (0); 5528 } 5529 5530 static int 5531 spa_load_retry(spa_t *spa, spa_load_state_t state) 5532 { 5533 spa_mode_t mode = spa->spa_mode; 5534 5535 spa_unload(spa); 5536 spa_deactivate(spa); 5537 5538 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5539 5540 spa_activate(spa, mode); 5541 spa_async_suspend(spa); 5542 5543 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5544 (u_longlong_t)spa->spa_load_max_txg); 5545 5546 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5547 } 5548 5549 /* 5550 * If spa_load() fails this function will try loading prior txg's. If 5551 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5552 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5553 * function will not rewind the pool and will return the same error as 5554 * spa_load(). 
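 * The rewind normally stops at the last synced txg minus TXG_DEFER_SIZE; only an extreme rewind (ZPOOL_EXTREME_REWIND) will walk uberblocks all the way back to TXG_INITIAL.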
5555 */ 5556 static int 5557 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5558 int rewind_flags) 5559 { 5560 nvlist_t *loadinfo = NULL; 5561 nvlist_t *config = NULL; 5562 int load_error, rewind_error; 5563 uint64_t safe_rewind_txg; 5564 uint64_t min_txg; 5565 5566 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5567 spa->spa_load_max_txg = spa->spa_load_txg; 5568 spa_set_log_state(spa, SPA_LOG_CLEAR); 5569 } else { 5570 spa->spa_load_max_txg = max_request; 5571 if (max_request != UINT64_MAX) 5572 spa->spa_extreme_rewind = B_TRUE; 5573 } 5574 5575 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5576 if (load_error == 0) 5577 return (0); 5578 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5579 /* 5580 * When attempting checkpoint-rewind on a pool with no 5581 * checkpoint, we should not attempt to load uberblocks 5582 * from previous txgs when spa_load fails. 5583 */ 5584 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5585 spa_import_progress_remove(spa_guid(spa)); 5586 return (load_error); 5587 } 5588 5589 if (spa->spa_root_vdev != NULL) 5590 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5591 5592 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5593 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5594 5595 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5596 nvlist_free(config); 5597 spa_import_progress_remove(spa_guid(spa)); 5598 return (load_error); 5599 } 5600 5601 if (state == SPA_LOAD_RECOVER) { 5602 /* Price of rolling back is discarding txgs, including log */ 5603 spa_set_log_state(spa, SPA_LOG_CLEAR); 5604 } else { 5605 /* 5606 * If we aren't rolling back save the load info from our first 5607 * import attempt so that we can restore it after attempting 5608 * to rewind. 5609 */ 5610 loadinfo = spa->spa_load_info; 5611 spa->spa_load_info = fnvlist_alloc(); 5612 } 5613 5614 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5615 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5616 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 5617 TXG_INITIAL : safe_rewind_txg; 5618 5619 /* 5620 * Continue as long as we're finding errors, we're still within 5621 * the acceptable rewind range, and we're still finding uberblocks 5622 */ 5623 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5624 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5625 if (spa->spa_load_max_txg < safe_rewind_txg) 5626 spa->spa_extreme_rewind = B_TRUE; 5627 rewind_error = spa_load_retry(spa, state); 5628 } 5629 5630 spa->spa_extreme_rewind = B_FALSE; 5631 spa->spa_load_max_txg = UINT64_MAX; 5632 5633 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5634 spa_config_set(spa, config); 5635 else 5636 nvlist_free(config); 5637 5638 if (state == SPA_LOAD_RECOVER) { 5639 ASSERT3P(loadinfo, ==, NULL); 5640 spa_import_progress_remove(spa_guid(spa)); 5641 return (rewind_error); 5642 } else { 5643 /* Store the rewind info as part of the initial load info */ 5644 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5645 spa->spa_load_info); 5646 5647 /* Restore the initial load info */ 5648 fnvlist_free(spa->spa_load_info); 5649 spa->spa_load_info = loadinfo; 5650 5651 spa_import_progress_remove(spa_guid(spa)); 5652 return (load_error); 5653 } 5654 } 5655 5656 /* 5657 * Pool Open/Import 5658 * 5659 * The import case is identical to an open except that the configuration is sent 5660 * down from userland, instead of grabbed from the configuration cache. 
For the 5661 * case of an open, the pool configuration will exist in the 5662 * POOL_STATE_UNINITIALIZED state. 5663 * 5664 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5665 * the same time open the pool, without having to keep around the spa_t in some 5666 * ambiguous state. 5667 */ 5668 static int 5669 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5670 nvlist_t *nvpolicy, nvlist_t **config) 5671 { 5672 spa_t *spa; 5673 spa_load_state_t state = SPA_LOAD_OPEN; 5674 int error; 5675 int locked = B_FALSE; 5676 int firstopen = B_FALSE; 5677 5678 *spapp = NULL; 5679 5680 /* 5681 * As disgusting as this is, we need to support recursive calls to this 5682 * function because dsl_dir_open() is called during spa_load(), and ends 5683 * up calling spa_open() again. The real fix is to figure out how to 5684 * avoid dsl_dir_open() calling this in the first place. 5685 */ 5686 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5687 mutex_enter(&spa_namespace_lock); 5688 locked = B_TRUE; 5689 } 5690 5691 if ((spa = spa_lookup(pool)) == NULL) { 5692 if (locked) 5693 mutex_exit(&spa_namespace_lock); 5694 return (SET_ERROR(ENOENT)); 5695 } 5696 5697 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5698 zpool_load_policy_t policy; 5699 5700 firstopen = B_TRUE; 5701 5702 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5703 &policy); 5704 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5705 state = SPA_LOAD_RECOVER; 5706 5707 spa_activate(spa, spa_mode_global); 5708 5709 if (state != SPA_LOAD_RECOVER) 5710 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5711 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5712 5713 zfs_dbgmsg("spa_open_common: opening %s", pool); 5714 error = spa_load_best(spa, state, policy.zlp_txg, 5715 policy.zlp_rewind); 5716 5717 if (error == EBADF) { 5718 /* 5719 * If vdev_validate() returns failure (indicated by 5720 * EBADF), it indicates that one of the vdevs indicates 5721 * that the pool has been exported or destroyed. If 5722 * this is the case, the config cache is out of sync and 5723 * we should remove the pool from the namespace. 5724 */ 5725 spa_unload(spa); 5726 spa_deactivate(spa); 5727 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 5728 spa_remove(spa); 5729 if (locked) 5730 mutex_exit(&spa_namespace_lock); 5731 return (SET_ERROR(ENOENT)); 5732 } 5733 5734 if (error) { 5735 /* 5736 * We can't open the pool, but we still have useful 5737 * information: the state of each vdev after the 5738 * attempted vdev_open(). Return this to the user. 5739 */ 5740 if (config != NULL && spa->spa_config) { 5741 *config = fnvlist_dup(spa->spa_config); 5742 fnvlist_add_nvlist(*config, 5743 ZPOOL_CONFIG_LOAD_INFO, 5744 spa->spa_load_info); 5745 } 5746 spa_unload(spa); 5747 spa_deactivate(spa); 5748 spa->spa_last_open_failed = error; 5749 if (locked) 5750 mutex_exit(&spa_namespace_lock); 5751 *spapp = NULL; 5752 return (error); 5753 } 5754 } 5755 5756 spa_open_ref(spa, tag); 5757 5758 if (config != NULL) 5759 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5760 5761 /* 5762 * If we've recovered the pool, pass back any information we 5763 * gathered while doing the load. 
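 * (The information travels back as ZPOOL_CONFIG_LOAD_INFO; userland,
 * e.g. libzfs, can use it to report what a rewind discarded. Descriptive
 * note; the exact consumers may vary.)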
5764 */ 5765 if (state == SPA_LOAD_RECOVER && config != NULL) { 5766 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5767 spa->spa_load_info); 5768 } 5769 5770 if (locked) { 5771 spa->spa_last_open_failed = 0; 5772 spa->spa_last_ubsync_txg = 0; 5773 spa->spa_load_txg = 0; 5774 mutex_exit(&spa_namespace_lock); 5775 } 5776 5777 if (firstopen) 5778 zvol_create_minors_recursive(spa_name(spa)); 5779 5780 *spapp = spa; 5781 5782 return (0); 5783 } 5784 5785 int 5786 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5787 nvlist_t *policy, nvlist_t **config) 5788 { 5789 return (spa_open_common(name, spapp, tag, policy, config)); 5790 } 5791 5792 int 5793 spa_open(const char *name, spa_t **spapp, const void *tag) 5794 { 5795 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5796 } 5797 5798 /* 5799 * Lookup the given spa_t, incrementing the inject count in the process, 5800 * preventing it from being exported or destroyed. 5801 */ 5802 spa_t * 5803 spa_inject_addref(char *name) 5804 { 5805 spa_t *spa; 5806 5807 mutex_enter(&spa_namespace_lock); 5808 if ((spa = spa_lookup(name)) == NULL) { 5809 mutex_exit(&spa_namespace_lock); 5810 return (NULL); 5811 } 5812 spa->spa_inject_ref++; 5813 mutex_exit(&spa_namespace_lock); 5814 5815 return (spa); 5816 } 5817 5818 void 5819 spa_inject_delref(spa_t *spa) 5820 { 5821 mutex_enter(&spa_namespace_lock); 5822 spa->spa_inject_ref--; 5823 mutex_exit(&spa_namespace_lock); 5824 } 5825 5826 /* 5827 * Add spares device information to the nvlist. 5828 */ 5829 static void 5830 spa_add_spares(spa_t *spa, nvlist_t *config) 5831 { 5832 nvlist_t **spares; 5833 uint_t i, nspares; 5834 nvlist_t *nvroot; 5835 uint64_t guid; 5836 vdev_stat_t *vs; 5837 uint_t vsc; 5838 uint64_t pool; 5839 5840 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5841 5842 if (spa->spa_spares.sav_count == 0) 5843 return; 5844 5845 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5846 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5847 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5848 if (nspares != 0) { 5849 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5850 (const nvlist_t * const *)spares, nspares); 5851 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5852 &spares, &nspares)); 5853 5854 /* 5855 * Go through and find any spares which have since been 5856 * repurposed as an active spare. If this is the case, update 5857 * their status appropriately. 5858 */ 5859 for (i = 0; i < nspares; i++) { 5860 guid = fnvlist_lookup_uint64(spares[i], 5861 ZPOOL_CONFIG_GUID); 5862 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5863 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5864 if (spa_spare_exists(guid, &pool, NULL) && 5865 pool != 0ULL) { 5866 vs->vs_state = VDEV_STATE_CANT_OPEN; 5867 vs->vs_aux = VDEV_AUX_SPARED; 5868 } else { 5869 vs->vs_state = 5870 spa->spa_spares.sav_vdevs[i]->vdev_state; 5871 } 5872 } 5873 } 5874 } 5875 5876 /* 5877 * Add l2cache device information to the nvlist, including vdev stats. 
5878 */ 5879 static void 5880 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5881 { 5882 nvlist_t **l2cache; 5883 uint_t i, j, nl2cache; 5884 nvlist_t *nvroot; 5885 uint64_t guid; 5886 vdev_t *vd; 5887 vdev_stat_t *vs; 5888 uint_t vsc; 5889 5890 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5891 5892 if (spa->spa_l2cache.sav_count == 0) 5893 return; 5894 5895 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5896 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5897 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5898 if (nl2cache != 0) { 5899 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5900 (const nvlist_t * const *)l2cache, nl2cache); 5901 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5902 &l2cache, &nl2cache)); 5903 5904 /* 5905 * Update level 2 cache device stats. 5906 */ 5907 5908 for (i = 0; i < nl2cache; i++) { 5909 guid = fnvlist_lookup_uint64(l2cache[i], 5910 ZPOOL_CONFIG_GUID); 5911 5912 vd = NULL; 5913 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5914 if (guid == 5915 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5916 vd = spa->spa_l2cache.sav_vdevs[j]; 5917 break; 5918 } 5919 } 5920 ASSERT(vd != NULL); 5921 5922 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5923 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5924 vdev_get_stats(vd, vs); 5925 vdev_config_generate_stats(vd, l2cache[i]); 5926 5927 } 5928 } 5929 } 5930 5931 static void 5932 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5933 { 5934 zap_cursor_t zc; 5935 zap_attribute_t za; 5936 5937 if (spa->spa_feat_for_read_obj != 0) { 5938 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5939 spa->spa_feat_for_read_obj); 5940 zap_cursor_retrieve(&zc, &za) == 0; 5941 zap_cursor_advance(&zc)) { 5942 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5943 za.za_num_integers == 1); 5944 VERIFY0(nvlist_add_uint64(features, za.za_name, 5945 za.za_first_integer)); 5946 } 5947 zap_cursor_fini(&zc); 5948 } 5949 5950 if (spa->spa_feat_for_write_obj != 0) { 5951 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5952 spa->spa_feat_for_write_obj); 5953 zap_cursor_retrieve(&zc, &za) == 0; 5954 zap_cursor_advance(&zc)) { 5955 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5956 za.za_num_integers == 1); 5957 VERIFY0(nvlist_add_uint64(features, za.za_name, 5958 za.za_first_integer)); 5959 } 5960 zap_cursor_fini(&zc); 5961 } 5962 } 5963 5964 static void 5965 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 5966 { 5967 int i; 5968 5969 for (i = 0; i < SPA_FEATURES; i++) { 5970 zfeature_info_t feature = spa_feature_table[i]; 5971 uint64_t refcount; 5972 5973 if (feature_get_refcount(spa, &feature, &refcount) != 0) 5974 continue; 5975 5976 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 5977 } 5978 } 5979 5980 /* 5981 * Store a list of pool features and their reference counts in the 5982 * config. 5983 * 5984 * The first time this is called on a spa, allocate a new nvlist, fetch 5985 * the pool features and reference counts from disk, then save the list 5986 * in the spa. In subsequent calls on the same spa use the saved nvlist 5987 * and refresh its values from the cached reference counts. This 5988 * ensures we don't block here on I/O on a suspended pool so 'zpool 5989 * clear' can resume the pool. 
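 *
 * Illustrative sequence (an assumption about typical callers): the first
 * 'zpool status' or 'zpool get' after import pays the cost of reading the
 * feature ZAPs; later calls, even while the pool is suspended, only copy
 * the in-core refcounts via spa_feature_stats_from_cache().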
5990 */ 5991 static void 5992 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 5993 { 5994 nvlist_t *features; 5995 5996 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5997 5998 mutex_enter(&spa->spa_feat_stats_lock); 5999 features = spa->spa_feat_stats; 6000 6001 if (features != NULL) { 6002 spa_feature_stats_from_cache(spa, features); 6003 } else { 6004 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 6005 spa->spa_feat_stats = features; 6006 spa_feature_stats_from_disk(spa, features); 6007 } 6008 6009 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 6010 features)); 6011 6012 mutex_exit(&spa->spa_feat_stats_lock); 6013 } 6014 6015 int 6016 spa_get_stats(const char *name, nvlist_t **config, 6017 char *altroot, size_t buflen) 6018 { 6019 int error; 6020 spa_t *spa; 6021 6022 *config = NULL; 6023 error = spa_open_common(name, &spa, FTAG, NULL, config); 6024 6025 if (spa != NULL) { 6026 /* 6027 * This still leaves a window of inconsistency where the spares 6028 * or l2cache devices could change and the config would be 6029 * self-inconsistent. 6030 */ 6031 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6032 6033 if (*config != NULL) { 6034 uint64_t loadtimes[2]; 6035 6036 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 6037 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 6038 fnvlist_add_uint64_array(*config, 6039 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 6040 6041 fnvlist_add_uint64(*config, 6042 ZPOOL_CONFIG_ERRCOUNT, 6043 spa_approx_errlog_size(spa)); 6044 6045 if (spa_suspended(spa)) { 6046 fnvlist_add_uint64(*config, 6047 ZPOOL_CONFIG_SUSPENDED, 6048 spa->spa_failmode); 6049 fnvlist_add_uint64(*config, 6050 ZPOOL_CONFIG_SUSPENDED_REASON, 6051 spa->spa_suspended); 6052 } 6053 6054 spa_add_spares(spa, *config); 6055 spa_add_l2cache(spa, *config); 6056 spa_add_feature_stats(spa, *config); 6057 } 6058 } 6059 6060 /* 6061 * We want to get the alternate root even for faulted pools, so we cheat 6062 * and call spa_lookup() directly. 6063 */ 6064 if (altroot) { 6065 if (spa == NULL) { 6066 mutex_enter(&spa_namespace_lock); 6067 spa = spa_lookup(name); 6068 if (spa) 6069 spa_altroot(spa, altroot, buflen); 6070 else 6071 altroot[0] = '\0'; 6072 spa = NULL; 6073 mutex_exit(&spa_namespace_lock); 6074 } else { 6075 spa_altroot(spa, altroot, buflen); 6076 } 6077 } 6078 6079 if (spa != NULL) { 6080 spa_config_exit(spa, SCL_CONFIG, FTAG); 6081 spa_close(spa, FTAG); 6082 } 6083 6084 return (error); 6085 } 6086 6087 /* 6088 * Validate that the auxiliary device array is well formed. We must have an 6089 * array of nvlists, each which describes a valid leaf vdev. If this is an 6090 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 6091 * specified, as long as they are well-formed. 6092 */ 6093 static int 6094 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 6095 spa_aux_vdev_t *sav, const char *config, uint64_t version, 6096 vdev_labeltype_t label) 6097 { 6098 nvlist_t **dev; 6099 uint_t i, ndev; 6100 vdev_t *vd; 6101 int error; 6102 6103 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6104 6105 /* 6106 * It's acceptable to have no devs specified. 6107 */ 6108 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 6109 return (0); 6110 6111 if (ndev == 0) 6112 return (SET_ERROR(EINVAL)); 6113 6114 /* 6115 * Make sure the pool is formatted with a version that supports this 6116 * device type. 
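 * (The required 'version' is supplied by the caller, e.g.
 * SPA_VERSION_SPARES for spares and SPA_VERSION_L2CACHE for cache
 * devices; see spa_validate_aux() below.)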
6117 */ 6118 if (spa_version(spa) < version) 6119 return (SET_ERROR(ENOTSUP)); 6120 6121 /* 6122 * Set the pending device list so we correctly handle device in-use 6123 * checking. 6124 */ 6125 sav->sav_pending = dev; 6126 sav->sav_npending = ndev; 6127 6128 for (i = 0; i < ndev; i++) { 6129 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 6130 mode)) != 0) 6131 goto out; 6132 6133 if (!vd->vdev_ops->vdev_op_leaf) { 6134 vdev_free(vd); 6135 error = SET_ERROR(EINVAL); 6136 goto out; 6137 } 6138 6139 vd->vdev_top = vd; 6140 6141 if ((error = vdev_open(vd)) == 0 && 6142 (error = vdev_label_init(vd, crtxg, label)) == 0) { 6143 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 6144 vd->vdev_guid); 6145 } 6146 6147 vdev_free(vd); 6148 6149 if (error && 6150 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 6151 goto out; 6152 else 6153 error = 0; 6154 } 6155 6156 out: 6157 sav->sav_pending = NULL; 6158 sav->sav_npending = 0; 6159 return (error); 6160 } 6161 6162 static int 6163 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 6164 { 6165 int error; 6166 6167 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6168 6169 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6170 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 6171 VDEV_LABEL_SPARE)) != 0) { 6172 return (error); 6173 } 6174 6175 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6176 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 6177 VDEV_LABEL_L2CACHE)); 6178 } 6179 6180 static void 6181 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 6182 const char *config) 6183 { 6184 int i; 6185 6186 if (sav->sav_config != NULL) { 6187 nvlist_t **olddevs; 6188 uint_t oldndevs; 6189 nvlist_t **newdevs; 6190 6191 /* 6192 * Generate new dev list by concatenating with the 6193 * current dev list. 6194 */ 6195 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 6196 &olddevs, &oldndevs)); 6197 6198 newdevs = kmem_alloc(sizeof (void *) * 6199 (ndevs + oldndevs), KM_SLEEP); 6200 for (i = 0; i < oldndevs; i++) 6201 newdevs[i] = fnvlist_dup(olddevs[i]); 6202 for (i = 0; i < ndevs; i++) 6203 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 6204 6205 fnvlist_remove(sav->sav_config, config); 6206 6207 fnvlist_add_nvlist_array(sav->sav_config, config, 6208 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 6209 for (i = 0; i < oldndevs + ndevs; i++) 6210 nvlist_free(newdevs[i]); 6211 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 6212 } else { 6213 /* 6214 * Generate a new dev list. 6215 */ 6216 sav->sav_config = fnvlist_alloc(); 6217 fnvlist_add_nvlist_array(sav->sav_config, config, 6218 (const nvlist_t * const *)devs, ndevs); 6219 } 6220 } 6221 6222 /* 6223 * Stop and drop level 2 ARC devices 6224 */ 6225 void 6226 spa_l2cache_drop(spa_t *spa) 6227 { 6228 vdev_t *vd; 6229 int i; 6230 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6231 6232 for (i = 0; i < sav->sav_count; i++) { 6233 uint64_t pool; 6234 6235 vd = sav->sav_vdevs[i]; 6236 ASSERT(vd != NULL); 6237 6238 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 6239 pool != 0ULL && l2arc_vdev_present(vd)) 6240 l2arc_remove_vdev(vd); 6241 } 6242 } 6243 6244 /* 6245 * Verify encryption parameters for spa creation. If we are encrypting, we must 6246 * have the encryption feature flag enabled. 
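 *
 * Illustrative example: 'zpool create -O encryption=on -O
 * keyformat=passphrase tank ...' only succeeds when feature@encryption is
 * enabled on the new pool; otherwise ENOTSUP is returned here.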
6247 */ 6248 static int 6249 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 6250 boolean_t has_encryption) 6251 { 6252 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 6253 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 6254 !has_encryption) 6255 return (SET_ERROR(ENOTSUP)); 6256 6257 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 6258 } 6259 6260 /* 6261 * Pool Creation 6262 */ 6263 int 6264 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 6265 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 6266 { 6267 spa_t *spa; 6268 const char *altroot = NULL; 6269 vdev_t *rvd; 6270 dsl_pool_t *dp; 6271 dmu_tx_t *tx; 6272 int error = 0; 6273 uint64_t txg = TXG_INITIAL; 6274 nvlist_t **spares, **l2cache; 6275 uint_t nspares, nl2cache; 6276 uint64_t version, obj, ndraid = 0; 6277 boolean_t has_features; 6278 boolean_t has_encryption; 6279 boolean_t has_allocclass; 6280 spa_feature_t feat; 6281 const char *feat_name; 6282 const char *poolname; 6283 nvlist_t *nvl; 6284 6285 if (props == NULL || 6286 nvlist_lookup_string(props, "tname", &poolname) != 0) 6287 poolname = (char *)pool; 6288 6289 /* 6290 * If this pool already exists, return failure. 6291 */ 6292 mutex_enter(&spa_namespace_lock); 6293 if (spa_lookup(poolname) != NULL) { 6294 mutex_exit(&spa_namespace_lock); 6295 return (SET_ERROR(EEXIST)); 6296 } 6297 6298 /* 6299 * Allocate a new spa_t structure. 6300 */ 6301 nvl = fnvlist_alloc(); 6302 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 6303 (void) nvlist_lookup_string(props, 6304 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6305 spa = spa_add(poolname, nvl, altroot); 6306 fnvlist_free(nvl); 6307 spa_activate(spa, spa_mode_global); 6308 6309 if (props && (error = spa_prop_validate(spa, props))) { 6310 spa_deactivate(spa); 6311 spa_remove(spa); 6312 mutex_exit(&spa_namespace_lock); 6313 return (error); 6314 } 6315 6316 /* 6317 * Temporary pool names should never be written to disk. 
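 * (The temporary name comes in via the hidden "tname" property, e.g. from
 * 'zpool create -t <tempname>'; only the real name ends up in the labels
 * and the cachefile.)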
6318 */ 6319 if (poolname != pool) 6320 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 6321 6322 has_features = B_FALSE; 6323 has_encryption = B_FALSE; 6324 has_allocclass = B_FALSE; 6325 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 6326 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 6327 if (zpool_prop_feature(nvpair_name(elem))) { 6328 has_features = B_TRUE; 6329 6330 feat_name = strchr(nvpair_name(elem), '@') + 1; 6331 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 6332 if (feat == SPA_FEATURE_ENCRYPTION) 6333 has_encryption = B_TRUE; 6334 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 6335 has_allocclass = B_TRUE; 6336 } 6337 } 6338 6339 /* verify encryption params, if they were provided */ 6340 if (dcp != NULL) { 6341 error = spa_create_check_encryption_params(dcp, has_encryption); 6342 if (error != 0) { 6343 spa_deactivate(spa); 6344 spa_remove(spa); 6345 mutex_exit(&spa_namespace_lock); 6346 return (error); 6347 } 6348 } 6349 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 6350 spa_deactivate(spa); 6351 spa_remove(spa); 6352 mutex_exit(&spa_namespace_lock); 6353 return (ENOTSUP); 6354 } 6355 6356 if (has_features || nvlist_lookup_uint64(props, 6357 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 6358 version = SPA_VERSION; 6359 } 6360 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6361 6362 spa->spa_first_txg = txg; 6363 spa->spa_uberblock.ub_txg = txg - 1; 6364 spa->spa_uberblock.ub_version = version; 6365 spa->spa_ubsync = spa->spa_uberblock; 6366 spa->spa_load_state = SPA_LOAD_CREATE; 6367 spa->spa_removing_phys.sr_state = DSS_NONE; 6368 spa->spa_removing_phys.sr_removing_vdev = -1; 6369 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 6370 spa->spa_indirect_vdevs_loaded = B_TRUE; 6371 6372 /* 6373 * Create "The Godfather" zio to hold all async IOs 6374 */ 6375 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 6376 KM_SLEEP); 6377 for (int i = 0; i < max_ncpus; i++) { 6378 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 6379 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 6380 ZIO_FLAG_GODFATHER); 6381 } 6382 6383 /* 6384 * Create the root vdev. 6385 */ 6386 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6387 6388 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 6389 6390 ASSERT(error != 0 || rvd != NULL); 6391 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 6392 6393 if (error == 0 && !zfs_allocatable_devs(nvroot)) 6394 error = SET_ERROR(EINVAL); 6395 6396 if (error == 0 && 6397 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 6398 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 6399 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 6400 /* 6401 * instantiate the metaslab groups (this will dirty the vdevs) 6402 * we can no longer error exit past this point 6403 */ 6404 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 6405 vdev_t *vd = rvd->vdev_child[c]; 6406 6407 vdev_metaslab_set_size(vd); 6408 vdev_expand(vd, txg); 6409 } 6410 } 6411 6412 spa_config_exit(spa, SCL_ALL, FTAG); 6413 6414 if (error != 0) { 6415 spa_unload(spa); 6416 spa_deactivate(spa); 6417 spa_remove(spa); 6418 mutex_exit(&spa_namespace_lock); 6419 return (error); 6420 } 6421 6422 /* 6423 * Get the list of spares, if specified. 
6424 */ 6425 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6426 &spares, &nspares) == 0) { 6427 spa->spa_spares.sav_config = fnvlist_alloc(); 6428 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6429 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6430 nspares); 6431 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6432 spa_load_spares(spa); 6433 spa_config_exit(spa, SCL_ALL, FTAG); 6434 spa->spa_spares.sav_sync = B_TRUE; 6435 } 6436 6437 /* 6438 * Get the list of level 2 cache devices, if specified. 6439 */ 6440 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6441 &l2cache, &nl2cache) == 0) { 6442 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6443 NV_UNIQUE_NAME, KM_SLEEP)); 6444 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6445 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6446 nl2cache); 6447 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6448 spa_load_l2cache(spa); 6449 spa_config_exit(spa, SCL_ALL, FTAG); 6450 spa->spa_l2cache.sav_sync = B_TRUE; 6451 } 6452 6453 spa->spa_is_initializing = B_TRUE; 6454 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6455 spa->spa_is_initializing = B_FALSE; 6456 6457 /* 6458 * Create DDTs (dedup tables). 6459 */ 6460 ddt_create(spa); 6461 /* 6462 * Create BRT table and BRT table object. 6463 */ 6464 brt_create(spa); 6465 6466 spa_update_dspace(spa); 6467 6468 tx = dmu_tx_create_assigned(dp, txg); 6469 6470 /* 6471 * Create the pool's history object. 6472 */ 6473 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6474 spa_history_create_obj(spa, tx); 6475 6476 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6477 spa_history_log_version(spa, "create", tx); 6478 6479 /* 6480 * Create the pool config object. 6481 */ 6482 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6483 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6484 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6485 6486 if (zap_add(spa->spa_meta_objset, 6487 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6488 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6489 cmn_err(CE_PANIC, "failed to add pool config"); 6490 } 6491 6492 if (zap_add(spa->spa_meta_objset, 6493 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6494 sizeof (uint64_t), 1, &version, tx) != 0) { 6495 cmn_err(CE_PANIC, "failed to add pool version"); 6496 } 6497 6498 /* Newly created pools with the right version are always deflated. */ 6499 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6500 spa->spa_deflate = TRUE; 6501 if (zap_add(spa->spa_meta_objset, 6502 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6503 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6504 cmn_err(CE_PANIC, "failed to add deflate"); 6505 } 6506 } 6507 6508 /* 6509 * Create the deferred-free bpobj. Turn off compression 6510 * because sync-to-convergence takes longer if the blocksize 6511 * keeps changing. 6512 */ 6513 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6514 dmu_object_set_compress(spa->spa_meta_objset, obj, 6515 ZIO_COMPRESS_OFF, tx); 6516 if (zap_add(spa->spa_meta_objset, 6517 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6518 sizeof (uint64_t), 1, &obj, tx) != 0) { 6519 cmn_err(CE_PANIC, "failed to add bpobj"); 6520 } 6521 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6522 spa->spa_meta_objset, obj)); 6523 6524 /* 6525 * Generate some random noise for salted checksums to operate on. 
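 * (The salt feeds the salted checksum algorithms, e.g. edonr, skein and
 * blake3; unsalted checksums such as fletcher4 and sha256 ignore it.)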
6526 */ 6527 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6528 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6529 6530 /* 6531 * Set pool properties. 6532 */ 6533 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6534 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6535 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6536 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6537 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6538 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6539 6540 if (props != NULL) { 6541 spa_configfile_set(spa, props, B_FALSE); 6542 spa_sync_props(props, tx); 6543 } 6544 6545 for (int i = 0; i < ndraid; i++) 6546 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6547 6548 dmu_tx_commit(tx); 6549 6550 spa->spa_sync_on = B_TRUE; 6551 txg_sync_start(dp); 6552 mmp_thread_start(spa); 6553 txg_wait_synced(dp, txg); 6554 6555 spa_spawn_aux_threads(spa); 6556 6557 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6558 6559 /* 6560 * Don't count references from objsets that are already closed 6561 * and are making their way through the eviction process. 6562 */ 6563 spa_evicting_os_wait(spa); 6564 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6565 spa->spa_load_state = SPA_LOAD_NONE; 6566 6567 spa_import_os(spa); 6568 6569 mutex_exit(&spa_namespace_lock); 6570 6571 return (0); 6572 } 6573 6574 /* 6575 * Import a non-root pool into the system. 6576 */ 6577 int 6578 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6579 { 6580 spa_t *spa; 6581 const char *altroot = NULL; 6582 spa_load_state_t state = SPA_LOAD_IMPORT; 6583 zpool_load_policy_t policy; 6584 spa_mode_t mode = spa_mode_global; 6585 uint64_t readonly = B_FALSE; 6586 int error; 6587 nvlist_t *nvroot; 6588 nvlist_t **spares, **l2cache; 6589 uint_t nspares, nl2cache; 6590 6591 /* 6592 * If a pool with this name exists, return failure. 6593 */ 6594 mutex_enter(&spa_namespace_lock); 6595 if (spa_lookup(pool) != NULL) { 6596 mutex_exit(&spa_namespace_lock); 6597 return (SET_ERROR(EEXIST)); 6598 } 6599 6600 /* 6601 * Create and initialize the spa structure. 6602 */ 6603 (void) nvlist_lookup_string(props, 6604 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6605 (void) nvlist_lookup_uint64(props, 6606 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6607 if (readonly) 6608 mode = SPA_MODE_READ; 6609 spa = spa_add(pool, config, altroot); 6610 spa->spa_import_flags = flags; 6611 6612 /* 6613 * Verbatim import - Take a pool and insert it into the namespace 6614 * as if it had been loaded at boot. 6615 */ 6616 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6617 if (props != NULL) 6618 spa_configfile_set(spa, props, B_FALSE); 6619 6620 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6621 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6622 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6623 mutex_exit(&spa_namespace_lock); 6624 return (0); 6625 } 6626 6627 spa_activate(spa, mode); 6628 6629 /* 6630 * Don't start async tasks until we know everything is healthy. 
6631 */ 6632 spa_async_suspend(spa); 6633 6634 zpool_get_load_policy(config, &policy); 6635 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6636 state = SPA_LOAD_RECOVER; 6637 6638 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6639 6640 if (state != SPA_LOAD_RECOVER) { 6641 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6642 zfs_dbgmsg("spa_import: importing %s", pool); 6643 } else { 6644 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6645 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6646 } 6647 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6648 6649 /* 6650 * Propagate anything learned while loading the pool and pass it 6651 * back to caller (i.e. rewind info, missing devices, etc). 6652 */ 6653 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6654 6655 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6656 /* 6657 * Toss any existing sparelist, as it doesn't have any validity 6658 * anymore, and conflicts with spa_has_spare(). 6659 */ 6660 if (spa->spa_spares.sav_config) { 6661 nvlist_free(spa->spa_spares.sav_config); 6662 spa->spa_spares.sav_config = NULL; 6663 spa_load_spares(spa); 6664 } 6665 if (spa->spa_l2cache.sav_config) { 6666 nvlist_free(spa->spa_l2cache.sav_config); 6667 spa->spa_l2cache.sav_config = NULL; 6668 spa_load_l2cache(spa); 6669 } 6670 6671 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6672 spa_config_exit(spa, SCL_ALL, FTAG); 6673 6674 if (props != NULL) 6675 spa_configfile_set(spa, props, B_FALSE); 6676 6677 if (error != 0 || (props && spa_writeable(spa) && 6678 (error = spa_prop_set(spa, props)))) { 6679 spa_unload(spa); 6680 spa_deactivate(spa); 6681 spa_remove(spa); 6682 mutex_exit(&spa_namespace_lock); 6683 return (error); 6684 } 6685 6686 spa_async_resume(spa); 6687 6688 /* 6689 * Override any spares and level 2 cache devices as specified by 6690 * the user, as these may have correct device names/devids, etc. 6691 */ 6692 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6693 &spares, &nspares) == 0) { 6694 if (spa->spa_spares.sav_config) 6695 fnvlist_remove(spa->spa_spares.sav_config, 6696 ZPOOL_CONFIG_SPARES); 6697 else 6698 spa->spa_spares.sav_config = fnvlist_alloc(); 6699 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6700 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6701 nspares); 6702 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6703 spa_load_spares(spa); 6704 spa_config_exit(spa, SCL_ALL, FTAG); 6705 spa->spa_spares.sav_sync = B_TRUE; 6706 } 6707 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6708 &l2cache, &nl2cache) == 0) { 6709 if (spa->spa_l2cache.sav_config) 6710 fnvlist_remove(spa->spa_l2cache.sav_config, 6711 ZPOOL_CONFIG_L2CACHE); 6712 else 6713 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6714 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6715 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6716 nl2cache); 6717 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6718 spa_load_l2cache(spa); 6719 spa_config_exit(spa, SCL_ALL, FTAG); 6720 spa->spa_l2cache.sav_sync = B_TRUE; 6721 } 6722 6723 /* 6724 * Check for any removed devices. 6725 */ 6726 if (spa->spa_autoreplace) { 6727 spa_aux_check_removed(&spa->spa_spares); 6728 spa_aux_check_removed(&spa->spa_l2cache); 6729 } 6730 6731 if (spa_writeable(spa)) { 6732 /* 6733 * Update the config cache to include the newly-imported pool. 
6734 */ 6735 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6736 } 6737 6738 /* 6739 * It's possible that the pool was expanded while it was exported. 6740 * We kick off an async task to handle this for us. 6741 */ 6742 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6743 6744 spa_history_log_version(spa, "import", NULL); 6745 6746 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6747 6748 mutex_exit(&spa_namespace_lock); 6749 6750 zvol_create_minors_recursive(pool); 6751 6752 spa_import_os(spa); 6753 6754 return (0); 6755 } 6756 6757 nvlist_t * 6758 spa_tryimport(nvlist_t *tryconfig) 6759 { 6760 nvlist_t *config = NULL; 6761 const char *poolname, *cachefile; 6762 spa_t *spa; 6763 uint64_t state; 6764 int error; 6765 zpool_load_policy_t policy; 6766 6767 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6768 return (NULL); 6769 6770 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6771 return (NULL); 6772 6773 /* 6774 * Create and initialize the spa structure. 6775 */ 6776 mutex_enter(&spa_namespace_lock); 6777 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 6778 spa_activate(spa, SPA_MODE_READ); 6779 6780 /* 6781 * Rewind pool if a max txg was provided. 6782 */ 6783 zpool_get_load_policy(spa->spa_config, &policy); 6784 if (policy.zlp_txg != UINT64_MAX) { 6785 spa->spa_load_max_txg = policy.zlp_txg; 6786 spa->spa_extreme_rewind = B_TRUE; 6787 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6788 poolname, (longlong_t)policy.zlp_txg); 6789 } else { 6790 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6791 } 6792 6793 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6794 == 0) { 6795 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6796 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6797 } else { 6798 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6799 } 6800 6801 /* 6802 * spa_import() relies on a pool config fetched by spa_try_import() 6803 * for spare/cache devices. Import flags are not passed to 6804 * spa_tryimport(), which makes it return early due to a missing log 6805 * device and missing retrieving the cache device and spare eventually. 6806 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch 6807 * the correct configuration regardless of the missing log device. 6808 */ 6809 spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; 6810 6811 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6812 6813 /* 6814 * If 'tryconfig' was at least parsable, return the current config. 6815 */ 6816 if (spa->spa_root_vdev != NULL) { 6817 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6818 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6819 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6820 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6821 spa->spa_uberblock.ub_timestamp); 6822 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6823 spa->spa_load_info); 6824 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6825 spa->spa_errata); 6826 6827 /* 6828 * If the bootfs property exists on this pool then we 6829 * copy it out so that external consumers can tell which 6830 * pools are bootable. 6831 */ 6832 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6833 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6834 6835 /* 6836 * We have to play games with the name since the 6837 * pool was opened as TRYIMPORT_NAME. 
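 * For example (illustrative): a bootfs that resolves to
 * "$import/ROOT/default" here is rewritten below so that the caller sees
 * "<poolname>/ROOT/default" instead.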
6838 */ 6839 if (dsl_dsobj_to_dsname(spa_name(spa), 6840 spa->spa_bootfs, tmpname) == 0) { 6841 char *cp; 6842 char *dsname; 6843 6844 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6845 6846 cp = strchr(tmpname, '/'); 6847 if (cp == NULL) { 6848 (void) strlcpy(dsname, tmpname, 6849 MAXPATHLEN); 6850 } else { 6851 (void) snprintf(dsname, MAXPATHLEN, 6852 "%s/%s", poolname, ++cp); 6853 } 6854 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6855 dsname); 6856 kmem_free(dsname, MAXPATHLEN); 6857 } 6858 kmem_free(tmpname, MAXPATHLEN); 6859 } 6860 6861 /* 6862 * Add the list of hot spares and level 2 cache devices. 6863 */ 6864 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6865 spa_add_spares(spa, config); 6866 spa_add_l2cache(spa, config); 6867 spa_config_exit(spa, SCL_CONFIG, FTAG); 6868 } 6869 6870 spa_unload(spa); 6871 spa_deactivate(spa); 6872 spa_remove(spa); 6873 mutex_exit(&spa_namespace_lock); 6874 6875 return (config); 6876 } 6877 6878 /* 6879 * Pool export/destroy 6880 * 6881 * The act of destroying or exporting a pool is very simple. We make sure there 6882 * is no more pending I/O and any references to the pool are gone. Then, we 6883 * update the pool state and sync all the labels to disk, removing the 6884 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6885 * we don't sync the labels or remove the configuration cache. 6886 */ 6887 static int 6888 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6889 boolean_t force, boolean_t hardforce) 6890 { 6891 int error; 6892 spa_t *spa; 6893 6894 if (oldconfig) 6895 *oldconfig = NULL; 6896 6897 if (!(spa_mode_global & SPA_MODE_WRITE)) 6898 return (SET_ERROR(EROFS)); 6899 6900 mutex_enter(&spa_namespace_lock); 6901 if ((spa = spa_lookup(pool)) == NULL) { 6902 mutex_exit(&spa_namespace_lock); 6903 return (SET_ERROR(ENOENT)); 6904 } 6905 6906 if (spa->spa_is_exporting) { 6907 /* the pool is being exported by another thread */ 6908 mutex_exit(&spa_namespace_lock); 6909 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6910 } 6911 spa->spa_is_exporting = B_TRUE; 6912 6913 /* 6914 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6915 * reacquire the namespace lock, and see if we can export. 6916 */ 6917 spa_open_ref(spa, FTAG); 6918 mutex_exit(&spa_namespace_lock); 6919 spa_async_suspend(spa); 6920 if (spa->spa_zvol_taskq) { 6921 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6922 taskq_wait(spa->spa_zvol_taskq); 6923 } 6924 mutex_enter(&spa_namespace_lock); 6925 spa_close(spa, FTAG); 6926 6927 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6928 goto export_spa; 6929 /* 6930 * The pool will be in core if it's openable, in which case we can 6931 * modify its state. Objsets may be open only because they're dirty, 6932 * so we have to force it to sync before checking spa_refcnt. 6933 */ 6934 if (spa->spa_sync_on) { 6935 txg_wait_synced(spa->spa_dsl_pool, 0); 6936 spa_evicting_os_wait(spa); 6937 } 6938 6939 /* 6940 * A pool cannot be exported or destroyed if there are active 6941 * references. If we are resetting a pool, allow references by 6942 * fault injection handlers. 6943 */ 6944 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 6945 error = SET_ERROR(EBUSY); 6946 goto fail; 6947 } 6948 6949 if (spa->spa_sync_on) { 6950 vdev_t *rvd = spa->spa_root_vdev; 6951 /* 6952 * A pool cannot be exported if it has an active shared spare. 6953 * This is to prevent other pools stealing the active spare 6954 * from an exported pool. 
At user's own will, such pool can 6955 * be forcedly exported. 6956 */ 6957 if (!force && new_state == POOL_STATE_EXPORTED && 6958 spa_has_active_shared_spare(spa)) { 6959 error = SET_ERROR(EXDEV); 6960 goto fail; 6961 } 6962 6963 /* 6964 * We're about to export or destroy this pool. Make sure 6965 * we stop all initialization and trim activity here before 6966 * we set the spa_final_txg. This will ensure that all 6967 * dirty data resulting from the initialization is 6968 * committed to disk before we unload the pool. 6969 */ 6970 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 6971 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 6972 vdev_autotrim_stop_all(spa); 6973 vdev_rebuild_stop_all(spa); 6974 6975 /* 6976 * We want this to be reflected on every label, 6977 * so mark them all dirty. spa_unload() will do the 6978 * final sync that pushes these changes out. 6979 */ 6980 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6981 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6982 spa->spa_state = new_state; 6983 vdev_config_dirty(rvd); 6984 spa_config_exit(spa, SCL_ALL, FTAG); 6985 } 6986 6987 /* 6988 * If the log space map feature is enabled and the pool is 6989 * getting exported (but not destroyed), we want to spend some 6990 * time flushing as many metaslabs as we can in an attempt to 6991 * destroy log space maps and save import time. This has to be 6992 * done before we set the spa_final_txg, otherwise 6993 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 6994 * spa_should_flush_logs_on_unload() should be called after 6995 * spa_state has been set to the new_state. 6996 */ 6997 if (spa_should_flush_logs_on_unload(spa)) 6998 spa_unload_log_sm_flush_all(spa); 6999 7000 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7001 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7002 spa->spa_final_txg = spa_last_synced_txg(spa) + 7003 TXG_DEFER_SIZE + 1; 7004 spa_config_exit(spa, SCL_ALL, FTAG); 7005 } 7006 } 7007 7008 export_spa: 7009 spa_export_os(spa); 7010 7011 if (new_state == POOL_STATE_DESTROYED) 7012 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 7013 else if (new_state == POOL_STATE_EXPORTED) 7014 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 7015 7016 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7017 spa_unload(spa); 7018 spa_deactivate(spa); 7019 } 7020 7021 if (oldconfig && spa->spa_config) 7022 *oldconfig = fnvlist_dup(spa->spa_config); 7023 7024 if (new_state != POOL_STATE_UNINITIALIZED) { 7025 if (!hardforce) 7026 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 7027 spa_remove(spa); 7028 } else { 7029 /* 7030 * If spa_remove() is not called for this spa_t and 7031 * there is any possibility that it can be reused, 7032 * we make sure to reset the exporting flag. 7033 */ 7034 spa->spa_is_exporting = B_FALSE; 7035 } 7036 7037 mutex_exit(&spa_namespace_lock); 7038 return (0); 7039 7040 fail: 7041 spa->spa_is_exporting = B_FALSE; 7042 spa_async_resume(spa); 7043 mutex_exit(&spa_namespace_lock); 7044 return (error); 7045 } 7046 7047 /* 7048 * Destroy a storage pool. 7049 */ 7050 int 7051 spa_destroy(const char *pool) 7052 { 7053 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 7054 B_FALSE, B_FALSE)); 7055 } 7056 7057 /* 7058 * Export a storage pool. 
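 *
 * Minimal usage sketch (flags roughly as 'zpool export -f' would pass
 * them):
 *
 *	nvlist_t *oldconfig = NULL;
 *	int err = spa_export("tank", &oldconfig, B_TRUE, B_FALSE);
 *
 * 'force' overrides the shared-spare check; 'hardforce' additionally skips
 * syncing the labels and updating the cachefile (see spa_export_common()).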
7059 */ 7060 int 7061 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 7062 boolean_t hardforce) 7063 { 7064 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 7065 force, hardforce)); 7066 } 7067 7068 /* 7069 * Similar to spa_export(), this unloads the spa_t without actually removing it 7070 * from the namespace in any way. 7071 */ 7072 int 7073 spa_reset(const char *pool) 7074 { 7075 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 7076 B_FALSE, B_FALSE)); 7077 } 7078 7079 /* 7080 * ========================================================================== 7081 * Device manipulation 7082 * ========================================================================== 7083 */ 7084 7085 /* 7086 * This is called as a synctask to increment the draid feature flag 7087 */ 7088 static void 7089 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 7090 { 7091 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7092 int draid = (int)(uintptr_t)arg; 7093 7094 for (int c = 0; c < draid; c++) 7095 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 7096 } 7097 7098 /* 7099 * Add a device to a storage pool. 7100 */ 7101 int 7102 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 7103 { 7104 uint64_t txg, ndraid = 0; 7105 int error; 7106 vdev_t *rvd = spa->spa_root_vdev; 7107 vdev_t *vd, *tvd; 7108 nvlist_t **spares, **l2cache; 7109 uint_t nspares, nl2cache; 7110 7111 ASSERT(spa_writeable(spa)); 7112 7113 txg = spa_vdev_enter(spa); 7114 7115 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 7116 VDEV_ALLOC_ADD)) != 0) 7117 return (spa_vdev_exit(spa, NULL, txg, error)); 7118 7119 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 7120 7121 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 7122 &nspares) != 0) 7123 nspares = 0; 7124 7125 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 7126 &nl2cache) != 0) 7127 nl2cache = 0; 7128 7129 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 7130 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7131 7132 if (vd->vdev_children != 0 && 7133 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 7134 return (spa_vdev_exit(spa, vd, txg, error)); 7135 } 7136 7137 /* 7138 * The virtual dRAID spares must be added after vdev tree is created 7139 * and the vdev guids are generated. The guid of their associated 7140 * dRAID is stored in the config and used when opening the spare. 7141 */ 7142 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 7143 rvd->vdev_children)) == 0) { 7144 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 7145 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 7146 nspares = 0; 7147 } else { 7148 return (spa_vdev_exit(spa, vd, txg, error)); 7149 } 7150 7151 /* 7152 * We must validate the spares and l2cache devices after checking the 7153 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 7154 */ 7155 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 7156 return (spa_vdev_exit(spa, vd, txg, error)); 7157 7158 /* 7159 * If we are in the middle of a device removal, we can only add 7160 * devices which match the existing devices in the pool. 7161 * If we are in the middle of a removal, or have some indirect 7162 * vdevs, we can not add raidz or dRAID top levels. 
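 * For example (illustrative): while a removal is active, 'zpool add tank
 * raidz1 d1 d2 d3' fails with EINVAL below, whereas adding a plain disk or
 * mirror whose ashift matches the pool is still allowed.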
7163 */ 7164 if (spa->spa_vdev_removal != NULL || 7165 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7166 for (int c = 0; c < vd->vdev_children; c++) { 7167 tvd = vd->vdev_child[c]; 7168 if (spa->spa_vdev_removal != NULL && 7169 tvd->vdev_ashift != spa->spa_max_ashift) { 7170 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7171 } 7172 /* Fail if top level vdev is raidz or a dRAID */ 7173 if (vdev_get_nparity(tvd) != 0) 7174 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7175 7176 /* 7177 * Need the top level mirror to be 7178 * a mirror of leaf vdevs only 7179 */ 7180 if (tvd->vdev_ops == &vdev_mirror_ops) { 7181 for (uint64_t cid = 0; 7182 cid < tvd->vdev_children; cid++) { 7183 vdev_t *cvd = tvd->vdev_child[cid]; 7184 if (!cvd->vdev_ops->vdev_op_leaf) { 7185 return (spa_vdev_exit(spa, vd, 7186 txg, EINVAL)); 7187 } 7188 } 7189 } 7190 } 7191 } 7192 7193 for (int c = 0; c < vd->vdev_children; c++) { 7194 tvd = vd->vdev_child[c]; 7195 vdev_remove_child(vd, tvd); 7196 tvd->vdev_id = rvd->vdev_children; 7197 vdev_add_child(rvd, tvd); 7198 vdev_config_dirty(tvd); 7199 } 7200 7201 if (nspares != 0) { 7202 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 7203 ZPOOL_CONFIG_SPARES); 7204 spa_load_spares(spa); 7205 spa->spa_spares.sav_sync = B_TRUE; 7206 } 7207 7208 if (nl2cache != 0) { 7209 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 7210 ZPOOL_CONFIG_L2CACHE); 7211 spa_load_l2cache(spa); 7212 spa->spa_l2cache.sav_sync = B_TRUE; 7213 } 7214 7215 /* 7216 * We can't increment a feature while holding spa_vdev so we 7217 * have to do it in a synctask. 7218 */ 7219 if (ndraid != 0) { 7220 dmu_tx_t *tx; 7221 7222 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 7223 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 7224 (void *)(uintptr_t)ndraid, tx); 7225 dmu_tx_commit(tx); 7226 } 7227 7228 /* 7229 * We have to be careful when adding new vdevs to an existing pool. 7230 * If other threads start allocating from these vdevs before we 7231 * sync the config cache, and we lose power, then upon reboot we may 7232 * fail to open the pool because there are DVAs that the config cache 7233 * can't translate. Therefore, we first add the vdevs without 7234 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 7235 * and then let spa_config_update() initialize the new metaslabs. 7236 * 7237 * spa_load() checks for added-but-not-initialized vdevs, so that 7238 * if we lose power at any point in this sequence, the remaining 7239 * steps will be completed the next time we load the pool. 7240 */ 7241 (void) spa_vdev_exit(spa, vd, txg, 0); 7242 7243 mutex_enter(&spa_namespace_lock); 7244 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7245 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 7246 mutex_exit(&spa_namespace_lock); 7247 7248 return (0); 7249 } 7250 7251 /* 7252 * Attach a device to a vdev specified by its guid. The vdev type can be 7253 * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a 7254 * single device). When the vdev is a single device, a mirror vdev will be 7255 * automatically inserted. 
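 * For example (illustrative): 'zpool attach tank sda sdb' turns the
 * single-disk top-level sda into a two-way mirror of sda and sdb.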
7256 * 7257 * If 'replacing' is specified, the new device is intended to replace the 7258 * existing device; in this case the two devices are made into their own 7259 * mirror using the 'replacing' vdev, which is functionally identical to 7260 * the mirror vdev (it actually reuses all the same ops) but has a few 7261 * extra rules: you can't attach to it after it's been created, and upon 7262 * completion of resilvering, the first disk (the one being replaced) 7263 * is automatically detached. 7264 * 7265 * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild) 7266 * should be performed instead of traditional healing reconstruction. From 7267 * an administrator's perspective, these are both resilver operations. 7268 */ 7269 int 7270 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 7271 int rebuild) 7272 { 7273 uint64_t txg, dtl_max_txg; 7274 vdev_t *rvd = spa->spa_root_vdev; 7275 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 7276 vdev_ops_t *pvops; 7277 char *oldvdpath, *newvdpath; 7278 int newvd_isspare = B_FALSE; 7279 int error; 7280 7281 ASSERT(spa_writeable(spa)); 7282 7283 txg = spa_vdev_enter(spa); 7284 7285 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 7286 7287 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7288 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7289 error = (spa_has_checkpoint(spa)) ? 7290 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7291 return (spa_vdev_exit(spa, NULL, txg, error)); 7292 } 7293 7294 if (rebuild) { 7295 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 7296 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7297 7298 if (dsl_scan_resilvering(spa_get_dsl(spa)) || 7299 dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { 7300 return (spa_vdev_exit(spa, NULL, txg, 7301 ZFS_ERR_RESILVER_IN_PROGRESS)); 7302 } 7303 } else { 7304 if (vdev_rebuild_active(rvd)) 7305 return (spa_vdev_exit(spa, NULL, txg, 7306 ZFS_ERR_REBUILD_IN_PROGRESS)); 7307 } 7308 7309 if (spa->spa_vdev_removal != NULL) { 7310 return (spa_vdev_exit(spa, NULL, txg, 7311 ZFS_ERR_DEVRM_IN_PROGRESS)); 7312 } 7313 7314 if (oldvd == NULL) 7315 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7316 7317 boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; 7318 7319 if (raidz) { 7320 if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) 7321 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7322 7323 /* 7324 * Can't expand a raidz while prior expand is in progress. 7325 */ 7326 if (spa->spa_raidz_expand != NULL) { 7327 return (spa_vdev_exit(spa, NULL, txg, 7328 ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); 7329 } 7330 } else if (!oldvd->vdev_ops->vdev_op_leaf) { 7331 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7332 } 7333 7334 if (raidz) 7335 pvd = oldvd; 7336 else 7337 pvd = oldvd->vdev_parent; 7338 7339 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 7340 VDEV_ALLOC_ATTACH) != 0) 7341 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7342 7343 if (newrootvd->vdev_children != 1) 7344 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7345 7346 newvd = newrootvd->vdev_child[0]; 7347 7348 if (!newvd->vdev_ops->vdev_op_leaf) 7349 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7350 7351 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 7352 return (spa_vdev_exit(spa, newrootvd, txg, error)); 7353 7354 /* 7355 * log, dedup and special vdevs should not be replaced by spares.
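 * (Such an attach attempt is rejected with ENOTSUP just below.)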
7356 */ 7357 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 7358 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 7359 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7360 } 7361 7362 /* 7363 * A dRAID spare can only replace a child of its parent dRAID vdev. 7364 */ 7365 if (newvd->vdev_ops == &vdev_draid_spare_ops && 7366 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 7367 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7368 } 7369 7370 if (rebuild) { 7371 /* 7372 * For rebuilds, the top vdev must support reconstruction 7373 * using only space maps. This means the only allowable 7374 * vdevs types are the root vdev, a mirror, or dRAID. 7375 */ 7376 tvd = pvd; 7377 if (pvd->vdev_top != NULL) 7378 tvd = pvd->vdev_top; 7379 7380 if (tvd->vdev_ops != &vdev_mirror_ops && 7381 tvd->vdev_ops != &vdev_root_ops && 7382 tvd->vdev_ops != &vdev_draid_ops) { 7383 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7384 } 7385 } 7386 7387 if (!replacing) { 7388 /* 7389 * For attach, the only allowable parent is a mirror or 7390 * the root vdev. A raidz vdev can be attached to, but 7391 * you cannot attach to a raidz child. 7392 */ 7393 if (pvd->vdev_ops != &vdev_mirror_ops && 7394 pvd->vdev_ops != &vdev_root_ops && 7395 !raidz) 7396 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7397 7398 pvops = &vdev_mirror_ops; 7399 } else { 7400 /* 7401 * Active hot spares can only be replaced by inactive hot 7402 * spares. 7403 */ 7404 if (pvd->vdev_ops == &vdev_spare_ops && 7405 oldvd->vdev_isspare && 7406 !spa_has_spare(spa, newvd->vdev_guid)) 7407 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7408 7409 /* 7410 * If the source is a hot spare, and the parent isn't already a 7411 * spare, then we want to create a new hot spare. Otherwise, we 7412 * want to create a replacing vdev. The user is not allowed to 7413 * attach to a spared vdev child unless the 'isspare' state is 7414 * the same (spare replaces spare, non-spare replaces 7415 * non-spare). 7416 */ 7417 if (pvd->vdev_ops == &vdev_replacing_ops && 7418 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 7419 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7420 } else if (pvd->vdev_ops == &vdev_spare_ops && 7421 newvd->vdev_isspare != oldvd->vdev_isspare) { 7422 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7423 } 7424 7425 if (newvd->vdev_isspare) 7426 pvops = &vdev_spare_ops; 7427 else 7428 pvops = &vdev_replacing_ops; 7429 } 7430 7431 /* 7432 * Make sure the new device is big enough. 7433 */ 7434 vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; 7435 if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) 7436 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 7437 7438 /* 7439 * The new device cannot have a higher alignment requirement 7440 * than the top-level vdev. 7441 */ 7442 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 7443 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7444 7445 /* 7446 * RAIDZ-expansion-specific checks. 
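 * (Reached via e.g. 'zpool attach tank raidz1-0 sdd' with
 * feature@raidz_expansion enabled; the expansion is refused if any
 * existing child is dead or not a leaf, or if the reserved boot area is
 * in use.)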
7447 */ 7448 if (raidz) { 7449 if (vdev_raidz_attach_check(newvd) != 0) 7450 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7451 7452 /* 7453 * Fail early if a child is not healthy or being replaced 7454 */ 7455 for (int i = 0; i < oldvd->vdev_children; i++) { 7456 if (vdev_is_dead(oldvd->vdev_child[i]) || 7457 !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { 7458 return (spa_vdev_exit(spa, newrootvd, txg, 7459 ENXIO)); 7460 } 7461 /* Also fail if reserved boot area is in-use */ 7462 if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) 7463 != 0) { 7464 return (spa_vdev_exit(spa, newrootvd, txg, 7465 EADDRINUSE)); 7466 } 7467 } 7468 } 7469 7470 if (raidz) { 7471 /* 7472 * Note: oldvdpath is freed by spa_strfree(), but 7473 * kmem_asprintf() is freed by kmem_strfree(), so we have to 7474 * move it to a spa_strdup-ed string. 7475 */ 7476 char *tmp = kmem_asprintf("raidz%u-%u", 7477 (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); 7478 oldvdpath = spa_strdup(tmp); 7479 kmem_strfree(tmp); 7480 } else { 7481 oldvdpath = spa_strdup(oldvd->vdev_path); 7482 } 7483 newvdpath = spa_strdup(newvd->vdev_path); 7484 7485 /* 7486 * If this is an in-place replacement, update oldvd's path and devid 7487 * to make it distinguishable from newvd, and unopenable from now on. 7488 */ 7489 if (strcmp(oldvdpath, newvdpath) == 0) { 7490 spa_strfree(oldvd->vdev_path); 7491 oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, 7492 KM_SLEEP); 7493 (void) sprintf(oldvd->vdev_path, "%s/old", 7494 newvdpath); 7495 if (oldvd->vdev_devid != NULL) { 7496 spa_strfree(oldvd->vdev_devid); 7497 oldvd->vdev_devid = NULL; 7498 } 7499 spa_strfree(oldvdpath); 7500 oldvdpath = spa_strdup(oldvd->vdev_path); 7501 } 7502 7503 /* 7504 * If the parent is not a mirror, or if we're replacing, insert the new 7505 * mirror/replacing/spare vdev above oldvd. 7506 */ 7507 if (!raidz && pvd->vdev_ops != pvops) { 7508 pvd = vdev_add_parent(oldvd, pvops); 7509 ASSERT(pvd->vdev_ops == pvops); 7510 ASSERT(oldvd->vdev_parent == pvd); 7511 } 7512 7513 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7514 7515 /* 7516 * Extract the new device from its root and add it to pvd. 7517 */ 7518 vdev_remove_child(newrootvd, newvd); 7519 newvd->vdev_id = pvd->vdev_children; 7520 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7521 vdev_add_child(pvd, newvd); 7522 7523 /* 7524 * Reevaluate the parent vdev state. 7525 */ 7526 vdev_propagate_state(pvd); 7527 7528 tvd = newvd->vdev_top; 7529 ASSERT(pvd->vdev_top == tvd); 7530 ASSERT(tvd->vdev_parent == rvd); 7531 7532 vdev_config_dirty(tvd); 7533 7534 /* 7535 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7536 * for any dmu_sync-ed blocks. It will propagate upward when 7537 * spa_vdev_exit() calls vdev_dtl_reassess(). 7538 */ 7539 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7540 7541 if (raidz) { 7542 /* 7543 * Wait for the youngest allocations and frees to sync, 7544 * and then wait for the deferral of those frees to finish. 
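 * Worked example (constants are illustrative; see the definitions of
 * TXG_CONCURRENT_STATES and TXG_DEFER_SIZE for the authoritative values):
 * with TXG_CONCURRENT_STATES == 3 and TXG_DEFER_SIZE == 2, a config
 * transaction entered at txg N is exited below at N + 5, i.e. only after
 * every txg that could still hold in-flight allocations or deferred
 * frees has synced.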
7545 */ 7546 spa_vdev_config_exit(spa, NULL, 7547 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 7548 7549 vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); 7550 vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); 7551 vdev_autotrim_stop_wait(tvd); 7552 7553 dtl_max_txg = spa_vdev_config_enter(spa); 7554 7555 tvd->vdev_rz_expanding = B_TRUE; 7556 7557 vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); 7558 vdev_config_dirty(tvd); 7559 7560 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 7561 dtl_max_txg); 7562 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, 7563 newvd, tx); 7564 dmu_tx_commit(tx); 7565 } else { 7566 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 7567 dtl_max_txg - TXG_INITIAL); 7568 7569 if (newvd->vdev_isspare) { 7570 spa_spare_activate(newvd); 7571 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7572 } 7573 7574 newvd_isspare = newvd->vdev_isspare; 7575 7576 /* 7577 * Mark newvd's DTL dirty in this txg. 7578 */ 7579 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7580 7581 /* 7582 * Schedule the resilver or rebuild to restart in the future. 7583 * We do this to ensure that dmu_sync-ed blocks have been 7584 * stitched into the respective datasets. 7585 */ 7586 if (rebuild) { 7587 newvd->vdev_rebuild_txg = txg; 7588 7589 vdev_rebuild(tvd); 7590 } else { 7591 newvd->vdev_resilver_txg = txg; 7592 7593 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 7594 spa_feature_is_enabled(spa, 7595 SPA_FEATURE_RESILVER_DEFER)) { 7596 vdev_defer_resilver(newvd); 7597 } else { 7598 dsl_scan_restart_resilver(spa->spa_dsl_pool, 7599 dtl_max_txg); 7600 } 7601 } 7602 } 7603 7604 if (spa->spa_bootfs) 7605 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 7606 7607 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 7608 7609 /* 7610 * Commit the config 7611 */ 7612 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 7613 7614 spa_history_log_internal(spa, "vdev attach", NULL, 7615 "%s vdev=%s %s vdev=%s", 7616 replacing && newvd_isspare ? "spare in" : 7617 replacing ? "replace" : "attach", newvdpath, 7618 replacing ? "for" : "to", oldvdpath); 7619 7620 spa_strfree(oldvdpath); 7621 spa_strfree(newvdpath); 7622 7623 return (0); 7624 } 7625 7626 /* 7627 * Detach a device from a mirror or replacing vdev. 7628 * 7629 * If 'replace_done' is specified, only detach if the parent 7630 * is a replacing or a spare vdev. 7631 */ 7632 int 7633 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 7634 { 7635 uint64_t txg; 7636 int error; 7637 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 7638 vdev_t *vd, *pvd, *cvd, *tvd; 7639 boolean_t unspare = B_FALSE; 7640 uint64_t unspare_guid = 0; 7641 char *vdpath; 7642 7643 ASSERT(spa_writeable(spa)); 7644 7645 txg = spa_vdev_detach_enter(spa, guid); 7646 7647 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7648 7649 /* 7650 * Besides being called directly from the userland through the 7651 * ioctl interface, spa_vdev_detach() can be potentially called 7652 * at the end of spa_vdev_resilver_done(). 7653 * 7654 * In the regular case, when we have a checkpoint this shouldn't 7655 * happen as we never empty the DTLs of a vdev during the scrub 7656 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 7657 * should never get here when we have a checkpoint. 7658 * 7659 * That said, even in a case when we checkpoint the pool exactly 7660 * as spa_vdev_resilver_done() calls this function everything 7661 * should be fine as the resilver will return right away. 
7662 */ 7663 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7664 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7665 error = (spa_has_checkpoint(spa)) ? 7666 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7667 return (spa_vdev_exit(spa, NULL, txg, error)); 7668 } 7669 7670 if (vd == NULL) 7671 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7672 7673 if (!vd->vdev_ops->vdev_op_leaf) 7674 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7675 7676 pvd = vd->vdev_parent; 7677 7678 /* 7679 * If the parent/child relationship is not as expected, don't do it. 7680 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 7681 * vdev that's replacing B with C. The user's intent in replacing 7682 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7683 * the replace by detaching C, the expected behavior is to end up 7684 * M(A,B). But suppose that right after deciding to detach C, 7685 * the replacement of B completes. We would have M(A,C), and then 7686 * ask to detach C, which would leave us with just A -- not what 7687 * the user wanted. To prevent this, we make sure that the 7688 * parent/child relationship hasn't changed -- in this example, 7689 * that C's parent is still the replacing vdev R. 7690 */ 7691 if (pvd->vdev_guid != pguid && pguid != 0) 7692 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7693 7694 /* 7695 * Only 'replacing' or 'spare' vdevs can be replaced. 7696 */ 7697 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7698 pvd->vdev_ops != &vdev_spare_ops) 7699 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7700 7701 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7702 spa_version(spa) >= SPA_VERSION_SPARES); 7703 7704 /* 7705 * Only mirror, replacing, and spare vdevs support detach. 7706 */ 7707 if (pvd->vdev_ops != &vdev_replacing_ops && 7708 pvd->vdev_ops != &vdev_mirror_ops && 7709 pvd->vdev_ops != &vdev_spare_ops) 7710 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7711 7712 /* 7713 * If this device has the only valid copy of some data, 7714 * we cannot safely detach it. 7715 */ 7716 if (vdev_dtl_required(vd)) 7717 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7718 7719 ASSERT(pvd->vdev_children >= 2); 7720 7721 /* 7722 * If we are detaching the second disk from a replacing vdev, then 7723 * check to see if we changed the original vdev's path to have "/old" 7724 * at the end in spa_vdev_attach(). If so, undo that change now. 7725 */ 7726 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7727 vd->vdev_path != NULL) { 7728 size_t len = strlen(vd->vdev_path); 7729 7730 for (int c = 0; c < pvd->vdev_children; c++) { 7731 cvd = pvd->vdev_child[c]; 7732 7733 if (cvd == vd || cvd->vdev_path == NULL) 7734 continue; 7735 7736 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7737 strcmp(cvd->vdev_path + len, "/old") == 0) { 7738 spa_strfree(cvd->vdev_path); 7739 cvd->vdev_path = spa_strdup(vd->vdev_path); 7740 break; 7741 } 7742 } 7743 } 7744 7745 /* 7746 * If we are detaching the original disk from a normal spare, then it 7747 * implies that the spare should become a real disk, and be removed 7748 * from the active spare list for the pool. dRAID spares on the 7749 * other hand are coupled to the pool and thus should never be removed 7750 * from the spares list. 
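 * Illustrative contrast: a traditional hot spare that finished taking
 * over is promoted to an ordinary child and dropped from the pool's
 * spare list, whereas a dRAID distributed spare is backed by the dRAID
 * vdev's own disks and therefore must remain on that list.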
7751 */ 7752 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7753 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7754 7755 if (last_cvd->vdev_isspare && 7756 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7757 unspare = B_TRUE; 7758 } 7759 } 7760 7761 /* 7762 * Erase the disk labels so the disk can be used for other things. 7763 * This must be done after all other error cases are handled, 7764 * but before we disembowel vd (so we can still do I/O to it). 7765 * But if we can't do it, don't treat the error as fatal -- 7766 * it may be that the unwritability of the disk is the reason 7767 * it's being detached! 7768 */ 7769 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7770 7771 /* 7772 * Remove vd from its parent and compact the parent's children. 7773 */ 7774 vdev_remove_child(pvd, vd); 7775 vdev_compact_children(pvd); 7776 7777 /* 7778 * Remember one of the remaining children so we can get tvd below. 7779 */ 7780 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7781 7782 /* 7783 * If we need to remove the remaining child from the list of hot spares, 7784 * do it now, marking the vdev as no longer a spare in the process. 7785 * We must do this before vdev_remove_parent(), because that can 7786 * change the GUID if it creates a new toplevel GUID. For a similar 7787 * reason, we must remove the spare now, in the same txg as the detach; 7788 * otherwise someone could attach a new sibling, change the GUID, and 7789 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7790 */ 7791 if (unspare) { 7792 ASSERT(cvd->vdev_isspare); 7793 spa_spare_remove(cvd); 7794 unspare_guid = cvd->vdev_guid; 7795 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7796 cvd->vdev_unspare = B_TRUE; 7797 } 7798 7799 /* 7800 * If the parent mirror/replacing vdev only has one child, 7801 * the parent is no longer needed. Remove it from the tree. 7802 */ 7803 if (pvd->vdev_children == 1) { 7804 if (pvd->vdev_ops == &vdev_spare_ops) 7805 cvd->vdev_unspare = B_FALSE; 7806 vdev_remove_parent(cvd); 7807 } 7808 7809 /* 7810 * We don't set tvd until now because the parent we just removed 7811 * may have been the previous top-level vdev. 7812 */ 7813 tvd = cvd->vdev_top; 7814 ASSERT(tvd->vdev_parent == rvd); 7815 7816 /* 7817 * Reevaluate the parent vdev state. 7818 */ 7819 vdev_propagate_state(cvd); 7820 7821 /* 7822 * If the 'autoexpand' property is set on the pool then automatically 7823 * try to expand the size of the pool. For example if the device we 7824 * just detached was smaller than the others, it may be possible to 7825 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7826 * first so that we can obtain the updated sizes of the leaf vdevs. 7827 */ 7828 if (spa->spa_autoexpand) { 7829 vdev_reopen(tvd); 7830 vdev_expand(tvd, txg); 7831 } 7832 7833 vdev_config_dirty(tvd); 7834 7835 /* 7836 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7837 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7838 * But first make sure we're not on any *other* txg's DTL list, to 7839 * prevent vd from being accessed after it's freed. 7840 */ 7841 vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); 7842 for (int t = 0; t < TXG_SIZE; t++) 7843 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7844 vd->vdev_detached = B_TRUE; 7845 vdev_dirty(tvd, VDD_DTL, vd, txg); 7846 7847 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7848 spa_notify_waiters(spa); 7849 7850 /* hang on to the spa before we release the lock */ 7851 spa_open_ref(spa, FTAG); 7852 7853 error = spa_vdev_exit(spa, vd, txg, 0); 7854 7855 spa_history_log_internal(spa, "detach", NULL, 7856 "vdev=%s", vdpath); 7857 spa_strfree(vdpath); 7858 7859 /* 7860 * If this was the removal of the original device in a hot spare vdev, 7861 * then we want to go through and remove the device from the hot spare 7862 * list of every other pool. 7863 */ 7864 if (unspare) { 7865 spa_t *altspa = NULL; 7866 7867 mutex_enter(&spa_namespace_lock); 7868 while ((altspa = spa_next(altspa)) != NULL) { 7869 if (altspa->spa_state != POOL_STATE_ACTIVE || 7870 altspa == spa) 7871 continue; 7872 7873 spa_open_ref(altspa, FTAG); 7874 mutex_exit(&spa_namespace_lock); 7875 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7876 mutex_enter(&spa_namespace_lock); 7877 spa_close(altspa, FTAG); 7878 } 7879 mutex_exit(&spa_namespace_lock); 7880 7881 /* search the rest of the vdevs for spares to remove */ 7882 spa_vdev_resilver_done(spa); 7883 } 7884 7885 /* all done with the spa; OK to release */ 7886 mutex_enter(&spa_namespace_lock); 7887 spa_close(spa, FTAG); 7888 mutex_exit(&spa_namespace_lock); 7889 7890 return (error); 7891 } 7892 7893 static int 7894 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7895 list_t *vd_list) 7896 { 7897 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7898 7899 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7900 7901 /* Look up vdev and ensure it's a leaf. */ 7902 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7903 if (vd == NULL || vd->vdev_detached) { 7904 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7905 return (SET_ERROR(ENODEV)); 7906 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7907 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7908 return (SET_ERROR(EINVAL)); 7909 } else if (!vdev_writeable(vd)) { 7910 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7911 return (SET_ERROR(EROFS)); 7912 } 7913 mutex_enter(&vd->vdev_initialize_lock); 7914 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7915 7916 /* 7917 * When we activate an initialize action we check to see 7918 * if the vdev_initialize_thread is NULL. We do this instead 7919 * of using the vdev_initialize_state since there might be 7920 * a previous initialization process which has completed but 7921 * the thread is not exited. 
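 * Illustrative sequence (not tied to a specific bug): an initialize pass
 * finishes and records a terminal state while its worker thread is still
 * unwinding; a POOL_INITIALIZE_START issued in that window must observe
 * the lingering thread and fail with EBUSY below rather than race with
 * the thread's exit.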
7922 */ 7923 if (cmd_type == POOL_INITIALIZE_START && 7924 (vd->vdev_initialize_thread != NULL || 7925 vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { 7926 mutex_exit(&vd->vdev_initialize_lock); 7927 return (SET_ERROR(EBUSY)); 7928 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 7929 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 7930 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 7931 mutex_exit(&vd->vdev_initialize_lock); 7932 return (SET_ERROR(ESRCH)); 7933 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 7934 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 7935 mutex_exit(&vd->vdev_initialize_lock); 7936 return (SET_ERROR(ESRCH)); 7937 } else if (cmd_type == POOL_INITIALIZE_UNINIT && 7938 vd->vdev_initialize_thread != NULL) { 7939 mutex_exit(&vd->vdev_initialize_lock); 7940 return (SET_ERROR(EBUSY)); 7941 } 7942 7943 switch (cmd_type) { 7944 case POOL_INITIALIZE_START: 7945 vdev_initialize(vd); 7946 break; 7947 case POOL_INITIALIZE_CANCEL: 7948 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 7949 break; 7950 case POOL_INITIALIZE_SUSPEND: 7951 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 7952 break; 7953 case POOL_INITIALIZE_UNINIT: 7954 vdev_uninitialize(vd); 7955 break; 7956 default: 7957 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7958 } 7959 mutex_exit(&vd->vdev_initialize_lock); 7960 7961 return (0); 7962 } 7963 7964 int 7965 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 7966 nvlist_t *vdev_errlist) 7967 { 7968 int total_errors = 0; 7969 list_t vd_list; 7970 7971 list_create(&vd_list, sizeof (vdev_t), 7972 offsetof(vdev_t, vdev_initialize_node)); 7973 7974 /* 7975 * We hold the namespace lock through the whole function 7976 * to prevent any changes to the pool while we're starting or 7977 * stopping initialization. The config and state locks are held so that 7978 * we can properly assess the vdev state before we commit to 7979 * the initializing operation. 7980 */ 7981 mutex_enter(&spa_namespace_lock); 7982 7983 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7984 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7985 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7986 7987 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 7988 &vd_list); 7989 if (error != 0) { 7990 char guid_as_str[MAXNAMELEN]; 7991 7992 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7993 "%llu", (unsigned long long)vdev_guid); 7994 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7995 total_errors++; 7996 } 7997 } 7998 7999 /* Wait for all initialize threads to stop. */ 8000 vdev_initialize_stop_wait(spa, &vd_list); 8001 8002 /* Sync out the initializing state */ 8003 txg_wait_synced(spa->spa_dsl_pool, 0); 8004 mutex_exit(&spa_namespace_lock); 8005 8006 list_destroy(&vd_list); 8007 8008 return (total_errors); 8009 } 8010 8011 static int 8012 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8013 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 8014 { 8015 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8016 8017 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8018 8019 /* Look up vdev and ensure it's a leaf. 
*/ 8020 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8021 if (vd == NULL || vd->vdev_detached) { 8022 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8023 return (SET_ERROR(ENODEV)); 8024 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8025 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8026 return (SET_ERROR(EINVAL)); 8027 } else if (!vdev_writeable(vd)) { 8028 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8029 return (SET_ERROR(EROFS)); 8030 } else if (!vd->vdev_has_trim) { 8031 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8032 return (SET_ERROR(EOPNOTSUPP)); 8033 } else if (secure && !vd->vdev_has_securetrim) { 8034 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8035 return (SET_ERROR(EOPNOTSUPP)); 8036 } 8037 mutex_enter(&vd->vdev_trim_lock); 8038 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8039 8040 /* 8041 * When we activate a TRIM action we check to see if the 8042 * vdev_trim_thread is NULL. We do this instead of using the 8043 * vdev_trim_state since there might be a previous TRIM process 8044 * which has completed but whose thread has not yet exited. 8045 */ 8046 if (cmd_type == POOL_TRIM_START && 8047 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || 8048 vd->vdev_top->vdev_rz_expanding)) { 8049 mutex_exit(&vd->vdev_trim_lock); 8050 return (SET_ERROR(EBUSY)); 8051 } else if (cmd_type == POOL_TRIM_CANCEL && 8052 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 8053 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 8054 mutex_exit(&vd->vdev_trim_lock); 8055 return (SET_ERROR(ESRCH)); 8056 } else if (cmd_type == POOL_TRIM_SUSPEND && 8057 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 8058 mutex_exit(&vd->vdev_trim_lock); 8059 return (SET_ERROR(ESRCH)); 8060 } 8061 8062 switch (cmd_type) { 8063 case POOL_TRIM_START: 8064 vdev_trim(vd, rate, partial, secure); 8065 break; 8066 case POOL_TRIM_CANCEL: 8067 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 8068 break; 8069 case POOL_TRIM_SUSPEND: 8070 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 8071 break; 8072 default: 8073 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8074 } 8075 mutex_exit(&vd->vdev_trim_lock); 8076 8077 return (0); 8078 } 8079 8080 /* 8081 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 8082 * TRIM threads for each child vdev. These threads pass over all of the free 8083 * space in the vdev's metaslabs and issue TRIM commands for that space. 8084 */ 8085 int 8086 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 8087 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 8088 { 8089 int total_errors = 0; 8090 list_t vd_list; 8091 8092 list_create(&vd_list, sizeof (vdev_t), 8093 offsetof(vdev_t, vdev_trim_node)); 8094 8095 /* 8096 * We hold the namespace lock through the whole function 8097 * to prevent any changes to the pool while we're starting or 8098 * stopping TRIM. The config and state locks are held so that 8099 * we can properly assess the vdev state before we commit to 8100 * the TRIM operation.
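 * Illustrative lock-ordering summary: spa_namespace_lock is taken first
 * and held across the entire operation; SCL_CONFIG|SCL_STATE are then
 * taken as reader in spa_vdev_trim_impl() only long enough to validate
 * each vdev, and the per-vdev trim lock is acquired before those config
 * locks are dropped so the TRIM state cannot change underneath us.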
8101 */ 8102 mutex_enter(&spa_namespace_lock); 8103 8104 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8105 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8106 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8107 8108 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 8109 rate, partial, secure, &vd_list); 8110 if (error != 0) { 8111 char guid_as_str[MAXNAMELEN]; 8112 8113 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8114 "%llu", (unsigned long long)vdev_guid); 8115 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8116 total_errors++; 8117 } 8118 } 8119 8120 /* Wait for all TRIM threads to stop. */ 8121 vdev_trim_stop_wait(spa, &vd_list); 8122 8123 /* Sync out the TRIM state */ 8124 txg_wait_synced(spa->spa_dsl_pool, 0); 8125 mutex_exit(&spa_namespace_lock); 8126 8127 list_destroy(&vd_list); 8128 8129 return (total_errors); 8130 } 8131 8132 /* 8133 * Split a set of devices from their mirrors, and create a new pool from them. 8134 */ 8135 int 8136 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 8137 nvlist_t *props, boolean_t exp) 8138 { 8139 int error = 0; 8140 uint64_t txg, *glist; 8141 spa_t *newspa; 8142 uint_t c, children, lastlog; 8143 nvlist_t **child, *nvl, *tmp; 8144 dmu_tx_t *tx; 8145 const char *altroot = NULL; 8146 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 8147 boolean_t activate_slog; 8148 8149 ASSERT(spa_writeable(spa)); 8150 8151 txg = spa_vdev_enter(spa); 8152 8153 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8154 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 8155 error = (spa_has_checkpoint(spa)) ? 8156 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 8157 return (spa_vdev_exit(spa, NULL, txg, error)); 8158 } 8159 8160 /* clear the log and flush everything up to now */ 8161 activate_slog = spa_passivate_log(spa); 8162 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8163 error = spa_reset_logs(spa); 8164 txg = spa_vdev_config_enter(spa); 8165 8166 if (activate_slog) 8167 spa_activate_log(spa); 8168 8169 if (error != 0) 8170 return (spa_vdev_exit(spa, NULL, txg, error)); 8171 8172 /* check new spa name before going any further */ 8173 if (spa_lookup(newname) != NULL) 8174 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 8175 8176 /* 8177 * scan through all the children to ensure they're all mirrors 8178 */ 8179 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 8180 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 8181 &children) != 0) 8182 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8183 8184 /* first, check to ensure we've got the right child count */ 8185 rvd = spa->spa_root_vdev; 8186 lastlog = 0; 8187 for (c = 0; c < rvd->vdev_children; c++) { 8188 vdev_t *vd = rvd->vdev_child[c]; 8189 8190 /* don't count the holes & logs as children */ 8191 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 8192 !vdev_is_concrete(vd))) { 8193 if (lastlog == 0) 8194 lastlog = c; 8195 continue; 8196 } 8197 8198 lastlog = 0; 8199 } 8200 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 8201 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8202 8203 /* next, ensure no spare or cache devices are part of the split */ 8204 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 8205 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 8206 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8207 8208 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 8209 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 8210 8211 /* then, loop over each vdev and validate it */ 8212 for (c = 0; c < children; c++) { 8213 uint64_t is_hole = 0; 8214 8215 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 8216 &is_hole); 8217 8218 if (is_hole != 0) { 8219 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 8220 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 8221 continue; 8222 } else { 8223 error = SET_ERROR(EINVAL); 8224 break; 8225 } 8226 } 8227 8228 /* deal with indirect vdevs */ 8229 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 8230 &vdev_indirect_ops) 8231 continue; 8232 8233 /* which disk is going to be split? */ 8234 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 8235 &glist[c]) != 0) { 8236 error = SET_ERROR(EINVAL); 8237 break; 8238 } 8239 8240 /* look it up in the spa */ 8241 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 8242 if (vml[c] == NULL) { 8243 error = SET_ERROR(ENODEV); 8244 break; 8245 } 8246 8247 /* make sure there's nothing stopping the split */ 8248 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 8249 vml[c]->vdev_islog || 8250 !vdev_is_concrete(vml[c]) || 8251 vml[c]->vdev_isspare || 8252 vml[c]->vdev_isl2cache || 8253 !vdev_writeable(vml[c]) || 8254 vml[c]->vdev_children != 0 || 8255 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 8256 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 8257 error = SET_ERROR(EINVAL); 8258 break; 8259 } 8260 8261 if (vdev_dtl_required(vml[c]) || 8262 vdev_resilver_needed(vml[c], NULL, NULL)) { 8263 error = SET_ERROR(EBUSY); 8264 break; 8265 } 8266 8267 /* we need certain info from the top level */ 8268 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 8269 vml[c]->vdev_top->vdev_ms_array); 8270 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 8271 vml[c]->vdev_top->vdev_ms_shift); 8272 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 8273 vml[c]->vdev_top->vdev_asize); 8274 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 8275 vml[c]->vdev_top->vdev_ashift); 8276 8277 /* transfer per-vdev ZAPs */ 8278 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 8279 VERIFY0(nvlist_add_uint64(child[c], 8280 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 8281 8282 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 8283 VERIFY0(nvlist_add_uint64(child[c], 8284 ZPOOL_CONFIG_VDEV_TOP_ZAP, 8285 vml[c]->vdev_parent->vdev_top_zap)); 8286 } 8287 8288 if (error != 0) { 8289 kmem_free(vml, children * sizeof (vdev_t *)); 8290 kmem_free(glist, children * sizeof (uint64_t)); 8291 return (spa_vdev_exit(spa, NULL, txg, error)); 8292 } 8293 8294 /* stop writers from using the disks */ 8295 for (c = 0; c < children; c++) { 8296 if (vml[c] != NULL) 8297 vml[c]->vdev_offline = B_TRUE; 8298 } 8299 vdev_reopen(spa->spa_root_vdev); 8300 8301 /* 8302 * Temporarily record the splitting vdevs in the spa config. This 8303 * will disappear once the config is regenerated. 
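 * Illustrative shape of the temporary entry, as built by the fnvlist
 * calls below:
 *   ZPOOL_CONFIG_SPLIT -> { ZPOOL_CONFIG_SPLIT_LIST -> [guid, guid, ...] }
 * with one guid per top-level vdev being split off.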
8304 */ 8305 nvl = fnvlist_alloc(); 8306 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 8307 kmem_free(glist, children * sizeof (uint64_t)); 8308 8309 mutex_enter(&spa->spa_props_lock); 8310 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 8311 mutex_exit(&spa->spa_props_lock); 8312 spa->spa_config_splitting = nvl; 8313 vdev_config_dirty(spa->spa_root_vdev); 8314 8315 /* configure and create the new pool */ 8316 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 8317 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 8318 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 8319 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 8320 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 8321 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 8322 spa_generate_guid(NULL)); 8323 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 8324 (void) nvlist_lookup_string(props, 8325 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 8326 8327 /* add the new pool to the namespace */ 8328 newspa = spa_add(newname, config, altroot); 8329 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 8330 newspa->spa_config_txg = spa->spa_config_txg; 8331 spa_set_log_state(newspa, SPA_LOG_CLEAR); 8332 8333 /* release the spa config lock, retaining the namespace lock */ 8334 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8335 8336 if (zio_injection_enabled) 8337 zio_handle_panic_injection(spa, FTAG, 1); 8338 8339 spa_activate(newspa, spa_mode_global); 8340 spa_async_suspend(newspa); 8341 8342 /* 8343 * Temporarily stop the initializing and TRIM activity. We set the 8344 * state to ACTIVE so that we know to resume initializing or TRIM 8345 * once the split has completed. 8346 */ 8347 list_t vd_initialize_list; 8348 list_create(&vd_initialize_list, sizeof (vdev_t), 8349 offsetof(vdev_t, vdev_initialize_node)); 8350 8351 list_t vd_trim_list; 8352 list_create(&vd_trim_list, sizeof (vdev_t), 8353 offsetof(vdev_t, vdev_trim_node)); 8354 8355 for (c = 0; c < children; c++) { 8356 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8357 mutex_enter(&vml[c]->vdev_initialize_lock); 8358 vdev_initialize_stop(vml[c], 8359 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 8360 mutex_exit(&vml[c]->vdev_initialize_lock); 8361 8362 mutex_enter(&vml[c]->vdev_trim_lock); 8363 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 8364 mutex_exit(&vml[c]->vdev_trim_lock); 8365 } 8366 } 8367 8368 vdev_initialize_stop_wait(spa, &vd_initialize_list); 8369 vdev_trim_stop_wait(spa, &vd_trim_list); 8370 8371 list_destroy(&vd_initialize_list); 8372 list_destroy(&vd_trim_list); 8373 8374 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 8375 newspa->spa_is_splitting = B_TRUE; 8376 8377 /* create the new pool from the disks of the original pool */ 8378 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 8379 if (error) 8380 goto out; 8381 8382 /* if that worked, generate a real config for the new pool */ 8383 if (newspa->spa_root_vdev != NULL) { 8384 newspa->spa_config_splitting = fnvlist_alloc(); 8385 fnvlist_add_uint64(newspa->spa_config_splitting, 8386 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 8387 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 8388 B_TRUE)); 8389 } 8390 8391 /* set the props */ 8392 if (props != NULL) { 8393 spa_configfile_set(newspa, props, B_FALSE); 8394 error = spa_prop_set(newspa, props); 8395 if (error) 8396 goto out; 8397 } 8398 8399 /* flush everything */ 8400 txg = 
spa_vdev_config_enter(newspa); 8401 vdev_config_dirty(newspa->spa_root_vdev); 8402 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 8403 8404 if (zio_injection_enabled) 8405 zio_handle_panic_injection(spa, FTAG, 2); 8406 8407 spa_async_resume(newspa); 8408 8409 /* finally, update the original pool's config */ 8410 txg = spa_vdev_config_enter(spa); 8411 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 8412 error = dmu_tx_assign(tx, TXG_WAIT); 8413 if (error != 0) 8414 dmu_tx_abort(tx); 8415 for (c = 0; c < children; c++) { 8416 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8417 vdev_t *tvd = vml[c]->vdev_top; 8418 8419 /* 8420 * Need to be sure the detachable VDEV is not 8421 * on any *other* txg's DTL list to prevent it 8422 * from being accessed after it's freed. 8423 */ 8424 for (int t = 0; t < TXG_SIZE; t++) { 8425 (void) txg_list_remove_this( 8426 &tvd->vdev_dtl_list, vml[c], t); 8427 } 8428 8429 vdev_split(vml[c]); 8430 if (error == 0) 8431 spa_history_log_internal(spa, "detach", tx, 8432 "vdev=%s", vml[c]->vdev_path); 8433 8434 vdev_free(vml[c]); 8435 } 8436 } 8437 spa->spa_avz_action = AVZ_ACTION_REBUILD; 8438 vdev_config_dirty(spa->spa_root_vdev); 8439 spa->spa_config_splitting = NULL; 8440 nvlist_free(nvl); 8441 if (error == 0) 8442 dmu_tx_commit(tx); 8443 (void) spa_vdev_exit(spa, NULL, txg, 0); 8444 8445 if (zio_injection_enabled) 8446 zio_handle_panic_injection(spa, FTAG, 3); 8447 8448 /* split is complete; log a history record */ 8449 spa_history_log_internal(newspa, "split", NULL, 8450 "from pool %s", spa_name(spa)); 8451 8452 newspa->spa_is_splitting = B_FALSE; 8453 kmem_free(vml, children * sizeof (vdev_t *)); 8454 8455 /* if we're not going to mount the filesystems in userland, export */ 8456 if (exp) 8457 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 8458 B_FALSE, B_FALSE); 8459 8460 return (error); 8461 8462 out: 8463 spa_unload(newspa); 8464 spa_deactivate(newspa); 8465 spa_remove(newspa); 8466 8467 txg = spa_vdev_config_enter(spa); 8468 8469 /* re-online all offlined disks */ 8470 for (c = 0; c < children; c++) { 8471 if (vml[c] != NULL) 8472 vml[c]->vdev_offline = B_FALSE; 8473 } 8474 8475 /* restart initializing or trimming disks as necessary */ 8476 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 8477 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 8478 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 8479 8480 vdev_reopen(spa->spa_root_vdev); 8481 8482 nvlist_free(spa->spa_config_splitting); 8483 spa->spa_config_splitting = NULL; 8484 (void) spa_vdev_exit(spa, NULL, txg, error); 8485 8486 kmem_free(vml, children * sizeof (vdev_t *)); 8487 return (error); 8488 } 8489 8490 /* 8491 * Find any device that's done replacing, or a vdev marked 'unspare' that's 8492 * currently spared, so we can detach it. 8493 */ 8494 static vdev_t * 8495 spa_vdev_resilver_done_hunt(vdev_t *vd) 8496 { 8497 vdev_t *newvd, *oldvd; 8498 8499 for (int c = 0; c < vd->vdev_children; c++) { 8500 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 8501 if (oldvd != NULL) 8502 return (oldvd); 8503 } 8504 8505 /* 8506 * Check for a completed replacement. We always consider the first 8507 * vdev in the list to be the oldest vdev, and the last one to be 8508 * the newest (see spa_vdev_attach() for how that works). In 8509 * the case where the newest vdev is faulted, we will not automatically 8510 * remove it after a resilver completes. This is OK as it will require 8511 * user intervention to determine which disk the admin wishes to keep. 
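 * Illustrative case (command and names are examples only): after
 * 'zpool replace tank A B' the replacing vdev's children are [A, B];
 * once B's DTL_MISSING and DTL_OUTAGE are empty and A is no longer
 * required, A is returned below so the caller can detach it.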
8512 */ 8513 if (vd->vdev_ops == &vdev_replacing_ops) { 8514 ASSERT(vd->vdev_children > 1); 8515 8516 newvd = vd->vdev_child[vd->vdev_children - 1]; 8517 oldvd = vd->vdev_child[0]; 8518 8519 if (vdev_dtl_empty(newvd, DTL_MISSING) && 8520 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8521 !vdev_dtl_required(oldvd)) 8522 return (oldvd); 8523 } 8524 8525 /* 8526 * Check for a completed resilver with the 'unspare' flag set. 8527 * Also potentially update faulted state. 8528 */ 8529 if (vd->vdev_ops == &vdev_spare_ops) { 8530 vdev_t *first = vd->vdev_child[0]; 8531 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 8532 8533 if (last->vdev_unspare) { 8534 oldvd = first; 8535 newvd = last; 8536 } else if (first->vdev_unspare) { 8537 oldvd = last; 8538 newvd = first; 8539 } else { 8540 oldvd = NULL; 8541 } 8542 8543 if (oldvd != NULL && 8544 vdev_dtl_empty(newvd, DTL_MISSING) && 8545 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8546 !vdev_dtl_required(oldvd)) 8547 return (oldvd); 8548 8549 vdev_propagate_state(vd); 8550 8551 /* 8552 * If there are more than two spares attached to a disk, 8553 * and those spares are not required, then we want to 8554 * attempt to free them up now so that they can be used 8555 * by other pools. Once we're back down to a single 8556 * disk+spare, we stop removing them. 8557 */ 8558 if (vd->vdev_children > 2) { 8559 newvd = vd->vdev_child[1]; 8560 8561 if (newvd->vdev_isspare && last->vdev_isspare && 8562 vdev_dtl_empty(last, DTL_MISSING) && 8563 vdev_dtl_empty(last, DTL_OUTAGE) && 8564 !vdev_dtl_required(newvd)) 8565 return (newvd); 8566 } 8567 } 8568 8569 return (NULL); 8570 } 8571 8572 static void 8573 spa_vdev_resilver_done(spa_t *spa) 8574 { 8575 vdev_t *vd, *pvd, *ppvd; 8576 uint64_t guid, sguid, pguid, ppguid; 8577 8578 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8579 8580 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8581 pvd = vd->vdev_parent; 8582 ppvd = pvd->vdev_parent; 8583 guid = vd->vdev_guid; 8584 pguid = pvd->vdev_guid; 8585 ppguid = ppvd->vdev_guid; 8586 sguid = 0; 8587 /* 8588 * If we have just finished replacing a hot spared device, then 8589 * we need to detach the parent's first child (the original hot 8590 * spare) as well. 8591 */ 8592 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 8593 ppvd->vdev_children == 2) { 8594 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 8595 sguid = ppvd->vdev_child[1]->vdev_guid; 8596 } 8597 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 8598 8599 spa_config_exit(spa, SCL_ALL, FTAG); 8600 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 8601 return; 8602 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 8603 return; 8604 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8605 } 8606 8607 spa_config_exit(spa, SCL_ALL, FTAG); 8608 8609 /* 8610 * If a detach was not performed above replace waiters will not have 8611 * been notified. In which case we must do so now. 8612 */ 8613 spa_notify_waiters(spa); 8614 } 8615 8616 /* 8617 * Update the stored path or FRU for this vdev. 
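 * Illustrative usage (the path shown is an example only): callers go
 * through the thin wrappers below, e.g.
 *   spa_vdev_setpath(spa, guid, "/dev/disk/by-id/ata-EXAMPLE");
 *   spa_vdev_setfru(spa, guid, fru);
 * and this common helper only requests a config sync when the stored
 * value actually changes.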
8618 */ 8619 static int 8620 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 8621 boolean_t ispath) 8622 { 8623 vdev_t *vd; 8624 boolean_t sync = B_FALSE; 8625 8626 ASSERT(spa_writeable(spa)); 8627 8628 spa_vdev_state_enter(spa, SCL_ALL); 8629 8630 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8631 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8632 8633 if (!vd->vdev_ops->vdev_op_leaf) 8634 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8635 8636 if (ispath) { 8637 if (strcmp(value, vd->vdev_path) != 0) { 8638 spa_strfree(vd->vdev_path); 8639 vd->vdev_path = spa_strdup(value); 8640 sync = B_TRUE; 8641 } 8642 } else { 8643 if (vd->vdev_fru == NULL) { 8644 vd->vdev_fru = spa_strdup(value); 8645 sync = B_TRUE; 8646 } else if (strcmp(value, vd->vdev_fru) != 0) { 8647 spa_strfree(vd->vdev_fru); 8648 vd->vdev_fru = spa_strdup(value); 8649 sync = B_TRUE; 8650 } 8651 } 8652 8653 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8654 } 8655 8656 int 8657 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8658 { 8659 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8660 } 8661 8662 int 8663 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8664 { 8665 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8666 } 8667 8668 /* 8669 * ========================================================================== 8670 * SPA Scanning 8671 * ========================================================================== 8672 */ 8673 int 8674 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8675 { 8676 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8677 8678 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8679 return (SET_ERROR(EBUSY)); 8680 8681 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8682 } 8683 8684 int 8685 spa_scan_stop(spa_t *spa) 8686 { 8687 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8688 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8689 return (SET_ERROR(EBUSY)); 8690 8691 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8692 } 8693 8694 int 8695 spa_scan(spa_t *spa, pool_scan_func_t func) 8696 { 8697 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8698 8699 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8700 return (SET_ERROR(ENOTSUP)); 8701 8702 if (func == POOL_SCAN_RESILVER && 8703 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8704 return (SET_ERROR(ENOTSUP)); 8705 8706 /* 8707 * If a resilver was requested, but there is no DTL on a 8708 * writeable leaf device, we have nothing to do. 
8709 */ 8710 if (func == POOL_SCAN_RESILVER && 8711 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8712 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8713 return (0); 8714 } 8715 8716 if (func == POOL_SCAN_ERRORSCRUB && 8717 !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) 8718 return (SET_ERROR(ENOTSUP)); 8719 8720 return (dsl_scan(spa->spa_dsl_pool, func)); 8721 } 8722 8723 /* 8724 * ========================================================================== 8725 * SPA async task processing 8726 * ========================================================================== 8727 */ 8728 8729 static void 8730 spa_async_remove(spa_t *spa, vdev_t *vd) 8731 { 8732 if (vd->vdev_remove_wanted) { 8733 vd->vdev_remove_wanted = B_FALSE; 8734 vd->vdev_delayed_close = B_FALSE; 8735 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8736 8737 /* 8738 * We want to clear the stats, but we don't want to do a full 8739 * vdev_clear() as that will cause us to throw away 8740 * degraded/faulted state as well as attempt to reopen the 8741 * device, all of which is a waste. 8742 */ 8743 vd->vdev_stat.vs_read_errors = 0; 8744 vd->vdev_stat.vs_write_errors = 0; 8745 vd->vdev_stat.vs_checksum_errors = 0; 8746 8747 vdev_state_dirty(vd->vdev_top); 8748 8749 /* Tell userspace that the vdev is gone. */ 8750 zfs_post_remove(spa, vd); 8751 } 8752 8753 for (int c = 0; c < vd->vdev_children; c++) 8754 spa_async_remove(spa, vd->vdev_child[c]); 8755 } 8756 8757 static void 8758 spa_async_probe(spa_t *spa, vdev_t *vd) 8759 { 8760 if (vd->vdev_probe_wanted) { 8761 vd->vdev_probe_wanted = B_FALSE; 8762 vdev_reopen(vd); /* vdev_open() does the actual probe */ 8763 } 8764 8765 for (int c = 0; c < vd->vdev_children; c++) 8766 spa_async_probe(spa, vd->vdev_child[c]); 8767 } 8768 8769 static void 8770 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8771 { 8772 if (!spa->spa_autoexpand) 8773 return; 8774 8775 for (int c = 0; c < vd->vdev_children; c++) { 8776 vdev_t *cvd = vd->vdev_child[c]; 8777 spa_async_autoexpand(spa, cvd); 8778 } 8779 8780 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8781 return; 8782 8783 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8784 } 8785 8786 static __attribute__((noreturn)) void 8787 spa_async_thread(void *arg) 8788 { 8789 spa_t *spa = (spa_t *)arg; 8790 dsl_pool_t *dp = spa->spa_dsl_pool; 8791 int tasks; 8792 8793 ASSERT(spa->spa_sync_on); 8794 8795 mutex_enter(&spa->spa_async_lock); 8796 tasks = spa->spa_async_tasks; 8797 spa->spa_async_tasks = 0; 8798 mutex_exit(&spa->spa_async_lock); 8799 8800 /* 8801 * See if the config needs to be updated. 8802 */ 8803 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8804 uint64_t old_space, new_space; 8805 8806 mutex_enter(&spa_namespace_lock); 8807 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8808 old_space += metaslab_class_get_space(spa_special_class(spa)); 8809 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8810 old_space += metaslab_class_get_space( 8811 spa_embedded_log_class(spa)); 8812 8813 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8814 8815 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8816 new_space += metaslab_class_get_space(spa_special_class(spa)); 8817 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8818 new_space += metaslab_class_get_space( 8819 spa_embedded_log_class(spa)); 8820 mutex_exit(&spa_namespace_lock); 8821 8822 /* 8823 * If the pool grew as a result of the config update, 8824 * then log an internal history event. 
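 * Illustrative output (numbers are examples only): growing a pool from
 * 20 GiB to 30 GiB of raw space would log roughly
 *   "pool 'tank' size: 32212254720(+10737418240)".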
8825 */ 8826 if (new_space != old_space) { 8827 spa_history_log_internal(spa, "vdev online", NULL, 8828 "pool '%s' size: %llu(+%llu)", 8829 spa_name(spa), (u_longlong_t)new_space, 8830 (u_longlong_t)(new_space - old_space)); 8831 } 8832 } 8833 8834 /* 8835 * See if any devices need to be marked REMOVED. 8836 */ 8837 if (tasks & SPA_ASYNC_REMOVE) { 8838 spa_vdev_state_enter(spa, SCL_NONE); 8839 spa_async_remove(spa, spa->spa_root_vdev); 8840 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8841 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8842 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8843 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8844 (void) spa_vdev_state_exit(spa, NULL, 0); 8845 } 8846 8847 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8848 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8849 spa_async_autoexpand(spa, spa->spa_root_vdev); 8850 spa_config_exit(spa, SCL_CONFIG, FTAG); 8851 } 8852 8853 /* 8854 * See if any devices need to be probed. 8855 */ 8856 if (tasks & SPA_ASYNC_PROBE) { 8857 spa_vdev_state_enter(spa, SCL_NONE); 8858 spa_async_probe(spa, spa->spa_root_vdev); 8859 (void) spa_vdev_state_exit(spa, NULL, 0); 8860 } 8861 8862 /* 8863 * If any devices are done replacing, detach them. 8864 */ 8865 if (tasks & SPA_ASYNC_RESILVER_DONE || 8866 tasks & SPA_ASYNC_REBUILD_DONE || 8867 tasks & SPA_ASYNC_DETACH_SPARE) { 8868 spa_vdev_resilver_done(spa); 8869 } 8870 8871 /* 8872 * Kick off a resilver. 8873 */ 8874 if (tasks & SPA_ASYNC_RESILVER && 8875 !vdev_rebuild_active(spa->spa_root_vdev) && 8876 (!dsl_scan_resilvering(dp) || 8877 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8878 dsl_scan_restart_resilver(dp, 0); 8879 8880 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8881 mutex_enter(&spa_namespace_lock); 8882 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8883 vdev_initialize_restart(spa->spa_root_vdev); 8884 spa_config_exit(spa, SCL_CONFIG, FTAG); 8885 mutex_exit(&spa_namespace_lock); 8886 } 8887 8888 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8889 mutex_enter(&spa_namespace_lock); 8890 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8891 vdev_trim_restart(spa->spa_root_vdev); 8892 spa_config_exit(spa, SCL_CONFIG, FTAG); 8893 mutex_exit(&spa_namespace_lock); 8894 } 8895 8896 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8897 mutex_enter(&spa_namespace_lock); 8898 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8899 vdev_autotrim_restart(spa); 8900 spa_config_exit(spa, SCL_CONFIG, FTAG); 8901 mutex_exit(&spa_namespace_lock); 8902 } 8903 8904 /* 8905 * Kick off L2 cache whole device TRIM. 8906 */ 8907 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8908 mutex_enter(&spa_namespace_lock); 8909 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8910 vdev_trim_l2arc(spa); 8911 spa_config_exit(spa, SCL_CONFIG, FTAG); 8912 mutex_exit(&spa_namespace_lock); 8913 } 8914 8915 /* 8916 * Kick off L2 cache rebuilding. 8917 */ 8918 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8919 mutex_enter(&spa_namespace_lock); 8920 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8921 l2arc_spa_rebuild_start(spa); 8922 spa_config_exit(spa, SCL_L2ARC, FTAG); 8923 mutex_exit(&spa_namespace_lock); 8924 } 8925 8926 /* 8927 * Let the world know that we're done. 
8928 */ 8929 mutex_enter(&spa->spa_async_lock); 8930 spa->spa_async_thread = NULL; 8931 cv_broadcast(&spa->spa_async_cv); 8932 mutex_exit(&spa->spa_async_lock); 8933 thread_exit(); 8934 } 8935 8936 void 8937 spa_async_suspend(spa_t *spa) 8938 { 8939 mutex_enter(&spa->spa_async_lock); 8940 spa->spa_async_suspended++; 8941 while (spa->spa_async_thread != NULL) 8942 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 8943 mutex_exit(&spa->spa_async_lock); 8944 8945 spa_vdev_remove_suspend(spa); 8946 8947 zthr_t *condense_thread = spa->spa_condense_zthr; 8948 if (condense_thread != NULL) 8949 zthr_cancel(condense_thread); 8950 8951 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 8952 if (raidz_expand_thread != NULL) 8953 zthr_cancel(raidz_expand_thread); 8954 8955 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8956 if (discard_thread != NULL) 8957 zthr_cancel(discard_thread); 8958 8959 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8960 if (ll_delete_thread != NULL) 8961 zthr_cancel(ll_delete_thread); 8962 8963 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8964 if (ll_condense_thread != NULL) 8965 zthr_cancel(ll_condense_thread); 8966 } 8967 8968 void 8969 spa_async_resume(spa_t *spa) 8970 { 8971 mutex_enter(&spa->spa_async_lock); 8972 ASSERT(spa->spa_async_suspended != 0); 8973 spa->spa_async_suspended--; 8974 mutex_exit(&spa->spa_async_lock); 8975 spa_restart_removal(spa); 8976 8977 zthr_t *condense_thread = spa->spa_condense_zthr; 8978 if (condense_thread != NULL) 8979 zthr_resume(condense_thread); 8980 8981 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 8982 if (raidz_expand_thread != NULL) 8983 zthr_resume(raidz_expand_thread); 8984 8985 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8986 if (discard_thread != NULL) 8987 zthr_resume(discard_thread); 8988 8989 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8990 if (ll_delete_thread != NULL) 8991 zthr_resume(ll_delete_thread); 8992 8993 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8994 if (ll_condense_thread != NULL) 8995 zthr_resume(ll_condense_thread); 8996 } 8997 8998 static boolean_t 8999 spa_async_tasks_pending(spa_t *spa) 9000 { 9001 uint_t non_config_tasks; 9002 uint_t config_task; 9003 boolean_t config_task_suspended; 9004 9005 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 9006 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 9007 if (spa->spa_ccw_fail_time == 0) { 9008 config_task_suspended = B_FALSE; 9009 } else { 9010 config_task_suspended = 9011 (gethrtime() - spa->spa_ccw_fail_time) < 9012 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 9013 } 9014 9015 return (non_config_tasks || (config_task && !config_task_suspended)); 9016 } 9017 9018 static void 9019 spa_async_dispatch(spa_t *spa) 9020 { 9021 mutex_enter(&spa->spa_async_lock); 9022 if (spa_async_tasks_pending(spa) && 9023 !spa->spa_async_suspended && 9024 spa->spa_async_thread == NULL) 9025 spa->spa_async_thread = thread_create(NULL, 0, 9026 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 9027 mutex_exit(&spa->spa_async_lock); 9028 } 9029 9030 void 9031 spa_async_request(spa_t *spa, int task) 9032 { 9033 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 9034 mutex_enter(&spa->spa_async_lock); 9035 spa->spa_async_tasks |= task; 9036 mutex_exit(&spa->spa_async_lock); 9037 } 9038 9039 int 9040 spa_async_tasks(spa_t *spa) 9041 { 9042 return (spa->spa_async_tasks); 9043 } 9044 9045 /* 9046 * 
========================================================================== 9047 * SPA syncing routines 9048 * ========================================================================== 9049 */ 9050 9051 9052 static int 9053 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9054 dmu_tx_t *tx) 9055 { 9056 bpobj_t *bpo = arg; 9057 bpobj_enqueue(bpo, bp, bp_freed, tx); 9058 return (0); 9059 } 9060 9061 int 9062 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9063 { 9064 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 9065 } 9066 9067 int 9068 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9069 { 9070 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 9071 } 9072 9073 static int 9074 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9075 { 9076 zio_t *pio = arg; 9077 9078 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 9079 pio->io_flags)); 9080 return (0); 9081 } 9082 9083 static int 9084 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9085 dmu_tx_t *tx) 9086 { 9087 ASSERT(!bp_freed); 9088 return (spa_free_sync_cb(arg, bp, tx)); 9089 } 9090 9091 /* 9092 * Note: this simple function is not inlined to make it easier to dtrace the 9093 * amount of time spent syncing frees. 9094 */ 9095 static void 9096 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 9097 { 9098 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9099 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 9100 VERIFY(zio_wait(zio) == 0); 9101 } 9102 9103 /* 9104 * Note: this simple function is not inlined to make it easier to dtrace the 9105 * amount of time spent syncing deferred frees. 9106 */ 9107 static void 9108 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 9109 { 9110 if (spa_sync_pass(spa) != 1) 9111 return; 9112 9113 /* 9114 * Note: 9115 * If the log space map feature is active, we stop deferring 9116 * frees to the next TXG and therefore running this function 9117 * would be considered a no-op as spa_deferred_bpobj should 9118 * not have any entries. 9119 * 9120 * That said we run this function anyway (instead of returning 9121 * immediately) for the edge-case scenario where we just 9122 * activated the log space map feature in this TXG but we have 9123 * deferred frees from the previous TXG. 9124 */ 9125 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9126 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 9127 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 9128 VERIFY0(zio_wait(zio)); 9129 } 9130 9131 static void 9132 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 9133 { 9134 char *packed = NULL; 9135 size_t bufsize; 9136 size_t nvsize = 0; 9137 dmu_buf_t *db; 9138 9139 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 9140 9141 /* 9142 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 9143 * information. This avoids the dmu_buf_will_dirty() path and 9144 * saves us a pre-read to get data we don't actually care about. 
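 * For example (sizes illustrative), a packed nvlist slightly larger than
 * one block is rounded up by P2ROUNDUP() to the next SPA_CONFIG_BLOCKSIZE
 * multiple and the tail is zero-filled, so the dmu_write() below always
 * covers whole blocks and never needs to read-modify-write a partial
 * block.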
9145 */ 9146 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 9147 packed = vmem_alloc(bufsize, KM_SLEEP); 9148 9149 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 9150 KM_SLEEP) == 0); 9151 memset(packed + nvsize, 0, bufsize - nvsize); 9152 9153 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 9154 9155 vmem_free(packed, bufsize); 9156 9157 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 9158 dmu_buf_will_dirty(db, tx); 9159 *(uint64_t *)db->db_data = nvsize; 9160 dmu_buf_rele(db, FTAG); 9161 } 9162 9163 static void 9164 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 9165 const char *config, const char *entry) 9166 { 9167 nvlist_t *nvroot; 9168 nvlist_t **list; 9169 int i; 9170 9171 if (!sav->sav_sync) 9172 return; 9173 9174 /* 9175 * Update the MOS nvlist describing the list of available devices. 9176 * spa_validate_aux() will have already made sure this nvlist is 9177 * valid and the vdevs are labeled appropriately. 9178 */ 9179 if (sav->sav_object == 0) { 9180 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 9181 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 9182 sizeof (uint64_t), tx); 9183 VERIFY(zap_update(spa->spa_meta_objset, 9184 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 9185 &sav->sav_object, tx) == 0); 9186 } 9187 9188 nvroot = fnvlist_alloc(); 9189 if (sav->sav_count == 0) { 9190 fnvlist_add_nvlist_array(nvroot, config, 9191 (const nvlist_t * const *)NULL, 0); 9192 } else { 9193 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 9194 for (i = 0; i < sav->sav_count; i++) 9195 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 9196 B_FALSE, VDEV_CONFIG_L2CACHE); 9197 fnvlist_add_nvlist_array(nvroot, config, 9198 (const nvlist_t * const *)list, sav->sav_count); 9199 for (i = 0; i < sav->sav_count; i++) 9200 nvlist_free(list[i]); 9201 kmem_free(list, sav->sav_count * sizeof (void *)); 9202 } 9203 9204 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 9205 nvlist_free(nvroot); 9206 9207 sav->sav_sync = B_FALSE; 9208 } 9209 9210 /* 9211 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 9212 * The all-vdev ZAP must be empty. 9213 */ 9214 static void 9215 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 9216 { 9217 spa_t *spa = vd->vdev_spa; 9218 9219 if (vd->vdev_root_zap != 0 && 9220 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { 9221 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9222 vd->vdev_root_zap, tx)); 9223 } 9224 if (vd->vdev_top_zap != 0) { 9225 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9226 vd->vdev_top_zap, tx)); 9227 } 9228 if (vd->vdev_leaf_zap != 0) { 9229 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9230 vd->vdev_leaf_zap, tx)); 9231 } 9232 for (uint64_t i = 0; i < vd->vdev_children; i++) { 9233 spa_avz_build(vd->vdev_child[i], avz, tx); 9234 } 9235 } 9236 9237 static void 9238 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 9239 { 9240 nvlist_t *config; 9241 9242 /* 9243 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 9244 * its config may not be dirty but we still need to build per-vdev ZAPs. 9245 * Similarly, if the pool is being assembled (e.g. after a split), we 9246 * need to rebuild the AVZ although the config may not be dirty. 
9247 */ 9248 if (list_is_empty(&spa->spa_config_dirty_list) && 9249 spa->spa_avz_action == AVZ_ACTION_NONE) 9250 return; 9251 9252 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9253 9254 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 9255 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 9256 spa->spa_all_vdev_zaps != 0); 9257 9258 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 9259 /* Make and build the new AVZ */ 9260 uint64_t new_avz = zap_create(spa->spa_meta_objset, 9261 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 9262 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 9263 9264 /* Diff old AVZ with new one */ 9265 zap_cursor_t zc; 9266 zap_attribute_t za; 9267 9268 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9269 spa->spa_all_vdev_zaps); 9270 zap_cursor_retrieve(&zc, &za) == 0; 9271 zap_cursor_advance(&zc)) { 9272 uint64_t vdzap = za.za_first_integer; 9273 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 9274 vdzap) == ENOENT) { 9275 /* 9276 * ZAP is listed in old AVZ but not in new one; 9277 * destroy it 9278 */ 9279 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 9280 tx)); 9281 } 9282 } 9283 9284 zap_cursor_fini(&zc); 9285 9286 /* Destroy the old AVZ */ 9287 VERIFY0(zap_destroy(spa->spa_meta_objset, 9288 spa->spa_all_vdev_zaps, tx)); 9289 9290 /* Replace the old AVZ in the dir obj with the new one */ 9291 VERIFY0(zap_update(spa->spa_meta_objset, 9292 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 9293 sizeof (new_avz), 1, &new_avz, tx)); 9294 9295 spa->spa_all_vdev_zaps = new_avz; 9296 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 9297 zap_cursor_t zc; 9298 zap_attribute_t za; 9299 9300 /* Walk through the AVZ and destroy all listed ZAPs */ 9301 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9302 spa->spa_all_vdev_zaps); 9303 zap_cursor_retrieve(&zc, &za) == 0; 9304 zap_cursor_advance(&zc)) { 9305 uint64_t zap = za.za_first_integer; 9306 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 9307 } 9308 9309 zap_cursor_fini(&zc); 9310 9311 /* Destroy and unlink the AVZ itself */ 9312 VERIFY0(zap_destroy(spa->spa_meta_objset, 9313 spa->spa_all_vdev_zaps, tx)); 9314 VERIFY0(zap_remove(spa->spa_meta_objset, 9315 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 9316 spa->spa_all_vdev_zaps = 0; 9317 } 9318 9319 if (spa->spa_all_vdev_zaps == 0) { 9320 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 9321 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 9322 DMU_POOL_VDEV_ZAP_MAP, tx); 9323 } 9324 spa->spa_avz_action = AVZ_ACTION_NONE; 9325 9326 /* Create ZAPs for vdevs that don't have them. */ 9327 vdev_construct_zaps(spa->spa_root_vdev, tx); 9328 9329 config = spa_config_generate(spa, spa->spa_root_vdev, 9330 dmu_tx_get_txg(tx), B_FALSE); 9331 9332 /* 9333 * If we're upgrading the spa version then make sure that 9334 * the config object gets updated with the correct version. 9335 */ 9336 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 9337 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 9338 spa->spa_uberblock.ub_version); 9339 9340 spa_config_exit(spa, SCL_STATE, FTAG); 9341 9342 nvlist_free(spa->spa_config_syncing); 9343 spa->spa_config_syncing = config; 9344 9345 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 9346 } 9347 9348 static void 9349 spa_sync_version(void *arg, dmu_tx_t *tx) 9350 { 9351 uint64_t *versionp = arg; 9352 uint64_t version = *versionp; 9353 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9354 9355 /* 9356 * Setting the version is special cased when first creating the pool. 
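 *
 * (When the pool is first created the version is written as part of
 * pool creation itself rather than through this sync task, which is
 * why the assertion below requires tx_txg != TXG_INITIAL.)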
9357 */ 9358 ASSERT(tx->tx_txg != TXG_INITIAL); 9359 9360 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 9361 ASSERT(version >= spa_version(spa)); 9362 9363 spa->spa_uberblock.ub_version = version; 9364 vdev_config_dirty(spa->spa_root_vdev); 9365 spa_history_log_internal(spa, "set", tx, "version=%lld", 9366 (longlong_t)version); 9367 } 9368 9369 /* 9370 * Set zpool properties. 9371 */ 9372 static void 9373 spa_sync_props(void *arg, dmu_tx_t *tx) 9374 { 9375 nvlist_t *nvp = arg; 9376 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9377 objset_t *mos = spa->spa_meta_objset; 9378 nvpair_t *elem = NULL; 9379 9380 mutex_enter(&spa->spa_props_lock); 9381 9382 while ((elem = nvlist_next_nvpair(nvp, elem))) { 9383 uint64_t intval; 9384 const char *strval, *fname; 9385 zpool_prop_t prop; 9386 const char *propname; 9387 const char *elemname = nvpair_name(elem); 9388 zprop_type_t proptype; 9389 spa_feature_t fid; 9390 9391 switch (prop = zpool_name_to_prop(elemname)) { 9392 case ZPOOL_PROP_VERSION: 9393 intval = fnvpair_value_uint64(elem); 9394 /* 9395 * The version is synced separately before other 9396 * properties and should be correct by now. 9397 */ 9398 ASSERT3U(spa_version(spa), >=, intval); 9399 break; 9400 9401 case ZPOOL_PROP_ALTROOT: 9402 /* 9403 * 'altroot' is a non-persistent property. It should 9404 * have been set temporarily at creation or import time. 9405 */ 9406 ASSERT(spa->spa_root != NULL); 9407 break; 9408 9409 case ZPOOL_PROP_READONLY: 9410 case ZPOOL_PROP_CACHEFILE: 9411 /* 9412 * 'readonly' and 'cachefile' are also non-persistent 9413 * properties. 9414 */ 9415 break; 9416 case ZPOOL_PROP_COMMENT: 9417 strval = fnvpair_value_string(elem); 9418 if (spa->spa_comment != NULL) 9419 spa_strfree(spa->spa_comment); 9420 spa->spa_comment = spa_strdup(strval); 9421 /* 9422 * We need to dirty the configuration on all the vdevs 9423 * so that their labels get updated. We also need to 9424 * update the cache file to keep it in sync with the 9425 * MOS version. It's unnecessary to do this for pool 9426 * creation since the vdev's configuration has already 9427 * been dirtied. 9428 */ 9429 if (tx->tx_txg != TXG_INITIAL) { 9430 vdev_config_dirty(spa->spa_root_vdev); 9431 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9432 } 9433 spa_history_log_internal(spa, "set", tx, 9434 "%s=%s", elemname, strval); 9435 break; 9436 case ZPOOL_PROP_COMPATIBILITY: 9437 strval = fnvpair_value_string(elem); 9438 if (spa->spa_compatibility != NULL) 9439 spa_strfree(spa->spa_compatibility); 9440 spa->spa_compatibility = spa_strdup(strval); 9441 /* 9442 * Dirty the configuration on vdevs as above. 9443 */ 9444 if (tx->tx_txg != TXG_INITIAL) { 9445 vdev_config_dirty(spa->spa_root_vdev); 9446 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9447 } 9448 9449 spa_history_log_internal(spa, "set", tx, 9450 "%s=%s", nvpair_name(elem), strval); 9451 break; 9452 9453 case ZPOOL_PROP_INVAL: 9454 if (zpool_prop_feature(elemname)) { 9455 fname = strchr(elemname, '@') + 1; 9456 VERIFY0(zfeature_lookup_name(fname, &fid)); 9457 9458 spa_feature_enable(spa, fid, tx); 9459 spa_history_log_internal(spa, "set", tx, 9460 "%s=enabled", elemname); 9461 break; 9462 } else if (!zfs_prop_user(elemname)) { 9463 ASSERT(zpool_prop_feature(elemname)); 9464 break; 9465 } 9466 zfs_fallthrough; 9467 default: 9468 /* 9469 * Set pool property values in the poolprops mos object. 
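 *
 * For example (illustrative values only): a user property such as
 * com.example:note=weekly takes the string path below and is stored
 * with an integer size of 1 and strlen(strval) + 1 integers,
 *
 *    zap_update(mos, spa->spa_pool_props_object,
 *        "com.example:note", 1, 7, "weekly", tx);
 *
 * while a numeric property such as autoexpand is stored as a single
 * 8-byte integer,
 *
 *    zap_update(mos, spa->spa_pool_props_object,
 *        "autoexpand", 8, 1, &intval, tx);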
9470 */ 9471 if (spa->spa_pool_props_object == 0) { 9472 spa->spa_pool_props_object = 9473 zap_create_link(mos, DMU_OT_POOL_PROPS, 9474 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 9475 tx); 9476 } 9477 9478 /* normalize the property name */ 9479 if (prop == ZPOOL_PROP_INVAL) { 9480 propname = elemname; 9481 proptype = PROP_TYPE_STRING; 9482 } else { 9483 propname = zpool_prop_to_name(prop); 9484 proptype = zpool_prop_get_type(prop); 9485 } 9486 9487 if (nvpair_type(elem) == DATA_TYPE_STRING) { 9488 ASSERT(proptype == PROP_TYPE_STRING); 9489 strval = fnvpair_value_string(elem); 9490 VERIFY0(zap_update(mos, 9491 spa->spa_pool_props_object, propname, 9492 1, strlen(strval) + 1, strval, tx)); 9493 spa_history_log_internal(spa, "set", tx, 9494 "%s=%s", elemname, strval); 9495 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 9496 intval = fnvpair_value_uint64(elem); 9497 9498 if (proptype == PROP_TYPE_INDEX) { 9499 const char *unused; 9500 VERIFY0(zpool_prop_index_to_string( 9501 prop, intval, &unused)); 9502 } 9503 VERIFY0(zap_update(mos, 9504 spa->spa_pool_props_object, propname, 9505 8, 1, &intval, tx)); 9506 spa_history_log_internal(spa, "set", tx, 9507 "%s=%lld", elemname, 9508 (longlong_t)intval); 9509 9510 switch (prop) { 9511 case ZPOOL_PROP_DELEGATION: 9512 spa->spa_delegation = intval; 9513 break; 9514 case ZPOOL_PROP_BOOTFS: 9515 spa->spa_bootfs = intval; 9516 break; 9517 case ZPOOL_PROP_FAILUREMODE: 9518 spa->spa_failmode = intval; 9519 break; 9520 case ZPOOL_PROP_AUTOTRIM: 9521 spa->spa_autotrim = intval; 9522 spa_async_request(spa, 9523 SPA_ASYNC_AUTOTRIM_RESTART); 9524 break; 9525 case ZPOOL_PROP_AUTOEXPAND: 9526 spa->spa_autoexpand = intval; 9527 if (tx->tx_txg != TXG_INITIAL) 9528 spa_async_request(spa, 9529 SPA_ASYNC_AUTOEXPAND); 9530 break; 9531 case ZPOOL_PROP_MULTIHOST: 9532 spa->spa_multihost = intval; 9533 break; 9534 default: 9535 break; 9536 } 9537 } else { 9538 ASSERT(0); /* not allowed */ 9539 } 9540 } 9541 9542 } 9543 9544 mutex_exit(&spa->spa_props_lock); 9545 } 9546 9547 /* 9548 * Perform one-time upgrade on-disk changes. spa_version() does not 9549 * reflect the new version this txg, so there must be no changes this 9550 * txg to anything that the upgrade code depends on after it executes. 9551 * Therefore this must be called after dsl_pool_sync() does the sync 9552 * tasks. 
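 *
 * Every upgrade below is gated on the pool version crossing the
 * relevant threshold in this txg; the pattern (a sketch, with
 * SPA_VERSION_X standing in for the specific version) is:
 *
 *    if (spa->spa_ubsync.ub_version < SPA_VERSION_X &&
 *        spa->spa_uberblock.ub_version >= SPA_VERSION_X) {
 *            perform the one-time upgrade in this txg
 *    }
 *
 * spa_ubsync is the last-synced uberblock and spa_uberblock is the one
 * being synced, so each body runs exactly once, in the txg where the
 * version first reaches SPA_VERSION_X.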
9553 */ 9554 static void 9555 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 9556 { 9557 if (spa_sync_pass(spa) != 1) 9558 return; 9559 9560 dsl_pool_t *dp = spa->spa_dsl_pool; 9561 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 9562 9563 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 9564 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 9565 dsl_pool_create_origin(dp, tx); 9566 9567 /* Keeping the origin open increases spa_minref */ 9568 spa->spa_minref += 3; 9569 } 9570 9571 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 9572 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 9573 dsl_pool_upgrade_clones(dp, tx); 9574 } 9575 9576 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 9577 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 9578 dsl_pool_upgrade_dir_clones(dp, tx); 9579 9580 /* Keeping the freedir open increases spa_minref */ 9581 spa->spa_minref += 3; 9582 } 9583 9584 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 9585 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9586 spa_feature_create_zap_objects(spa, tx); 9587 } 9588 9589 /* 9590 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 9591 * when the ability to use lz4 compression for metadata was added. 9592 * Old pools that have this feature enabled must be upgraded to have 9593 * this feature active. 9594 */ 9595 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9596 boolean_t lz4_en = spa_feature_is_enabled(spa, 9597 SPA_FEATURE_LZ4_COMPRESS); 9598 boolean_t lz4_ac = spa_feature_is_active(spa, 9599 SPA_FEATURE_LZ4_COMPRESS); 9600 9601 if (lz4_en && !lz4_ac) 9602 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 9603 } 9604 9605 /* 9606 * If we haven't written the salt, do so now. Note that the 9607 * feature may not be activated yet, but that's fine since 9608 * the presence of this ZAP entry is backwards compatible. 9609 */ 9610 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 9611 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 9612 VERIFY0(zap_add(spa->spa_meta_objset, 9613 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 9614 sizeof (spa->spa_cksum_salt.zcs_bytes), 9615 spa->spa_cksum_salt.zcs_bytes, tx)); 9616 } 9617 9618 rrw_exit(&dp->dp_config_rwlock, FTAG); 9619 } 9620 9621 static void 9622 vdev_indirect_state_sync_verify(vdev_t *vd) 9623 { 9624 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 9625 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 9626 9627 if (vd->vdev_ops == &vdev_indirect_ops) { 9628 ASSERT(vim != NULL); 9629 ASSERT(vib != NULL); 9630 } 9631 9632 uint64_t obsolete_sm_object = 0; 9633 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 9634 if (obsolete_sm_object != 0) { 9635 ASSERT(vd->vdev_obsolete_sm != NULL); 9636 ASSERT(vd->vdev_removing || 9637 vd->vdev_ops == &vdev_indirect_ops); 9638 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 9639 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 9640 ASSERT3U(obsolete_sm_object, ==, 9641 space_map_object(vd->vdev_obsolete_sm)); 9642 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 9643 space_map_allocated(vd->vdev_obsolete_sm)); 9644 } 9645 ASSERT(vd->vdev_obsolete_segments != NULL); 9646 9647 /* 9648 * Since frees / remaps to an indirect vdev can only 9649 * happen in syncing context, the obsolete segments 9650 * tree must be empty when we start syncing.
9651 */ 9652 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 9653 } 9654 9655 /* 9656 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9657 * async write queue depth in case it changed. The max queue depth will 9658 * not change in the middle of syncing out this txg. 9659 */ 9660 static void 9661 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9662 { 9663 ASSERT(spa_writeable(spa)); 9664 9665 vdev_t *rvd = spa->spa_root_vdev; 9666 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 9667 zfs_vdev_queue_depth_pct / 100; 9668 metaslab_class_t *normal = spa_normal_class(spa); 9669 metaslab_class_t *special = spa_special_class(spa); 9670 metaslab_class_t *dedup = spa_dedup_class(spa); 9671 9672 uint64_t slots_per_allocator = 0; 9673 for (int c = 0; c < rvd->vdev_children; c++) { 9674 vdev_t *tvd = rvd->vdev_child[c]; 9675 9676 metaslab_group_t *mg = tvd->vdev_mg; 9677 if (mg == NULL || !metaslab_group_initialized(mg)) 9678 continue; 9679 9680 metaslab_class_t *mc = mg->mg_class; 9681 if (mc != normal && mc != special && mc != dedup) 9682 continue; 9683 9684 /* 9685 * It is safe to do a lock-free check here because only async 9686 * allocations look at mg_max_alloc_queue_depth, and async 9687 * allocations all happen from spa_sync(). 9688 */ 9689 for (int i = 0; i < mg->mg_allocators; i++) { 9690 ASSERT0(zfs_refcount_count( 9691 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 9692 } 9693 mg->mg_max_alloc_queue_depth = max_queue_depth; 9694 9695 for (int i = 0; i < mg->mg_allocators; i++) { 9696 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 9697 zfs_vdev_def_queue_depth; 9698 } 9699 slots_per_allocator += zfs_vdev_def_queue_depth; 9700 } 9701 9702 for (int i = 0; i < spa->spa_alloc_count; i++) { 9703 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 9704 mca_alloc_slots)); 9705 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 9706 mca_alloc_slots)); 9707 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
9708 mca_alloc_slots)); 9709 normal->mc_allocator[i].mca_alloc_max_slots = 9710 slots_per_allocator; 9711 special->mc_allocator[i].mca_alloc_max_slots = 9712 slots_per_allocator; 9713 dedup->mc_allocator[i].mca_alloc_max_slots = 9714 slots_per_allocator; 9715 } 9716 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9717 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9718 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9719 } 9720 9721 static void 9722 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9723 { 9724 ASSERT(spa_writeable(spa)); 9725 9726 vdev_t *rvd = spa->spa_root_vdev; 9727 for (int c = 0; c < rvd->vdev_children; c++) { 9728 vdev_t *vd = rvd->vdev_child[c]; 9729 vdev_indirect_state_sync_verify(vd); 9730 9731 if (vdev_indirect_should_condense(vd)) { 9732 spa_condense_indirect_start_sync(vd, tx); 9733 break; 9734 } 9735 } 9736 } 9737 9738 static void 9739 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9740 { 9741 objset_t *mos = spa->spa_meta_objset; 9742 dsl_pool_t *dp = spa->spa_dsl_pool; 9743 uint64_t txg = tx->tx_txg; 9744 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9745 9746 do { 9747 int pass = ++spa->spa_sync_pass; 9748 9749 spa_sync_config_object(spa, tx); 9750 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9751 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9752 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9753 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9754 spa_errlog_sync(spa, txg); 9755 dsl_pool_sync(dp, txg); 9756 9757 if (pass < zfs_sync_pass_deferred_free || 9758 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9759 /* 9760 * If the log space map feature is active we don't 9761 * care about deferred frees and the deferred bpobj 9762 * as the log space map should effectively have the 9763 * same results (i.e. appending only to one object). 9764 */ 9765 spa_sync_frees(spa, free_bpl, tx); 9766 } else { 9767 /* 9768 * We can not defer frees in pass 1, because 9769 * we sync the deferred frees later in pass 1. 9770 */ 9771 ASSERT3U(pass, >, 1); 9772 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9773 &spa->spa_deferred_bpobj, tx); 9774 } 9775 9776 brt_sync(spa, txg); 9777 ddt_sync(spa, txg); 9778 dsl_scan_sync(dp, tx); 9779 dsl_errorscrub_sync(dp, tx); 9780 svr_sync(spa, tx); 9781 spa_sync_upgrades(spa, tx); 9782 9783 spa_flush_metaslabs(spa, tx); 9784 9785 vdev_t *vd = NULL; 9786 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9787 != NULL) 9788 vdev_sync(vd, txg); 9789 9790 if (pass == 1) { 9791 /* 9792 * dsl_pool_sync() -> dp_sync_tasks may have dirtied 9793 * the config. If that happens, this txg should not 9794 * be a no-op. So we must sync the config to the MOS 9795 * before checking for no-op. 9796 * 9797 * Note that when the config is dirty, it will 9798 * be written to the MOS (i.e. the MOS will be 9799 * dirtied) every time we call spa_sync_config_object() 9800 * in this txg. Therefore we can't call this after 9801 * dsl_pool_sync() every pass, because it would 9802 * prevent us from converging, since we'd dirty 9803 * the MOS every pass. 9804 * 9805 * Sync tasks can only be processed in pass 1, so 9806 * there's no need to do this in later passes. 9807 */ 9808 spa_sync_config_object(spa, tx); 9809 } 9810 9811 /* 9812 * Note: We need to check if the MOS is dirty because we could 9813 * have marked the MOS dirty without updating the uberblock 9814 * (e.g. if we have sync tasks but no dirty user data). 
We need 9815 * to check the uberblock's rootbp because it is updated if we 9816 * have synced out dirty data (though in this case the MOS will 9817 * most likely also be dirty due to second order effects, we 9818 * don't want to rely on that here). 9819 */ 9820 if (pass == 1 && 9821 spa->spa_uberblock.ub_rootbp.blk_birth < txg && 9822 !dmu_objset_is_dirty(mos, txg)) { 9823 /* 9824 * Nothing changed on the first pass, therefore this 9825 * TXG is a no-op. Avoid syncing deferred frees, so 9826 * that we can keep this TXG as a no-op. 9827 */ 9828 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9829 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9830 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9831 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9832 break; 9833 } 9834 9835 spa_sync_deferred_frees(spa, tx); 9836 } while (dmu_objset_is_dirty(mos, txg)); 9837 } 9838 9839 /* 9840 * Rewrite the vdev configuration (which includes the uberblock) to 9841 * commit the transaction group. 9842 * 9843 * If there are no dirty vdevs, we sync the uberblock to a few random 9844 * top-level vdevs that are known to be visible in the config cache 9845 * (see spa_vdev_add() for a complete description). If there *are* dirty 9846 * vdevs, sync the uberblock to all vdevs. 9847 */ 9848 static void 9849 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9850 { 9851 vdev_t *rvd = spa->spa_root_vdev; 9852 uint64_t txg = tx->tx_txg; 9853 9854 for (;;) { 9855 int error = 0; 9856 9857 /* 9858 * We hold SCL_STATE to prevent vdev open/close/etc. 9859 * while we're attempting to write the vdev labels. 9860 */ 9861 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9862 9863 if (list_is_empty(&spa->spa_config_dirty_list)) { 9864 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9865 int svdcount = 0; 9866 int children = rvd->vdev_children; 9867 int c0 = random_in_range(children); 9868 9869 for (int c = 0; c < children; c++) { 9870 vdev_t *vd = 9871 rvd->vdev_child[(c0 + c) % children]; 9872 9873 /* Stop when revisiting the first vdev */ 9874 if (c > 0 && svd[0] == vd) 9875 break; 9876 9877 if (vd->vdev_ms_array == 0 || 9878 vd->vdev_islog || 9879 !vdev_is_concrete(vd)) 9880 continue; 9881 9882 svd[svdcount++] = vd; 9883 if (svdcount == SPA_SYNC_MIN_VDEVS) 9884 break; 9885 } 9886 error = vdev_config_sync(svd, svdcount, txg); 9887 } else { 9888 error = vdev_config_sync(rvd->vdev_child, 9889 rvd->vdev_children, txg); 9890 } 9891 9892 if (error == 0) 9893 spa->spa_last_synced_guid = rvd->vdev_guid; 9894 9895 spa_config_exit(spa, SCL_STATE, FTAG); 9896 9897 if (error == 0) 9898 break; 9899 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9900 zio_resume_wait(spa); 9901 } 9902 } 9903 9904 /* 9905 * Sync the specified transaction group. New blocks may be dirtied as 9906 * part of the process, so we iterate until it converges. 9907 */ 9908 void 9909 spa_sync(spa_t *spa, uint64_t txg) 9910 { 9911 vdev_t *vd = NULL; 9912 9913 VERIFY(spa_writeable(spa)); 9914 9915 /* 9916 * Wait for i/os issued in open context that need to complete 9917 * before this txg syncs. 9918 */ 9919 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9920 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9921 ZIO_FLAG_CANFAIL); 9922 9923 /* 9924 * Now that there can be no more cloning in this transaction group, 9925 * but we are still before issuing frees, we can process pending BRT 9926 * updates. 9927 */ 9928 brt_pending_apply(spa, txg); 9929 9930 /* 9931 * Lock out configuration changes. 
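 *
 * (SCL_CONFIG is taken as RW_READER here: configuration changes need
 * the lock as writer, so holding it as reader for the duration of the
 * sync is enough to keep the config stable without blocking other
 * readers.)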
9932 */ 9933 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9934 9935 spa->spa_syncing_txg = txg; 9936 spa->spa_sync_pass = 0; 9937 9938 for (int i = 0; i < spa->spa_alloc_count; i++) { 9939 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9940 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9941 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9942 } 9943 9944 /* 9945 * If there are any pending vdev state changes, convert them 9946 * into config changes that go out with this transaction group. 9947 */ 9948 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9949 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9950 /* Avoid holding the write lock unless actually necessary */ 9951 if (vd->vdev_aux == NULL) { 9952 vdev_state_clean(vd); 9953 vdev_config_dirty(vd); 9954 continue; 9955 } 9956 /* 9957 * We need the write lock here because, for aux vdevs, 9958 * calling vdev_config_dirty() modifies sav_config. 9959 * This is ugly and will become unnecessary when we 9960 * eliminate the aux vdev wart by integrating all vdevs 9961 * into the root vdev tree. 9962 */ 9963 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9964 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 9965 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9966 vdev_state_clean(vd); 9967 vdev_config_dirty(vd); 9968 } 9969 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9970 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9971 } 9972 spa_config_exit(spa, SCL_STATE, FTAG); 9973 9974 dsl_pool_t *dp = spa->spa_dsl_pool; 9975 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 9976 9977 spa->spa_sync_starttime = gethrtime(); 9978 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9979 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 9980 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 9981 NSEC_TO_TICK(spa->spa_deadman_synctime)); 9982 9983 /* 9984 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 9985 * set spa_deflate if we have no raid-z vdevs. 9986 */ 9987 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 9988 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 9989 vdev_t *rvd = spa->spa_root_vdev; 9990 9991 int i; 9992 for (i = 0; i < rvd->vdev_children; i++) { 9993 vd = rvd->vdev_child[i]; 9994 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 9995 break; 9996 } 9997 if (i == rvd->vdev_children) { 9998 spa->spa_deflate = TRUE; 9999 VERIFY0(zap_add(spa->spa_meta_objset, 10000 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 10001 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 10002 } 10003 } 10004 10005 spa_sync_adjust_vdev_max_queue_depth(spa); 10006 10007 spa_sync_condense_indirect(spa, tx); 10008 10009 spa_sync_iterate_to_convergence(spa, tx); 10010 10011 #ifdef ZFS_DEBUG 10012 if (!list_is_empty(&spa->spa_config_dirty_list)) { 10013 /* 10014 * Make sure that the number of ZAPs for all the vdevs matches 10015 * the number of ZAPs in the per-vdev ZAP list. This only gets 10016 * called if the config is dirty; otherwise there may be 10017 * outstanding AVZ operations that weren't completed in 10018 * spa_sync_config_object. 
10019 */ 10020 uint64_t all_vdev_zap_entry_count; 10021 ASSERT0(zap_count(spa->spa_meta_objset, 10022 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 10023 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 10024 all_vdev_zap_entry_count); 10025 } 10026 #endif 10027 10028 if (spa->spa_vdev_removal != NULL) { 10029 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 10030 } 10031 10032 spa_sync_rewrite_vdev_config(spa, tx); 10033 dmu_tx_commit(tx); 10034 10035 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10036 spa->spa_deadman_tqid = 0; 10037 10038 /* 10039 * Clear the dirty config list. 10040 */ 10041 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 10042 vdev_config_clean(vd); 10043 10044 /* 10045 * Now that the new config has synced transactionally, 10046 * let it become visible to the config cache. 10047 */ 10048 if (spa->spa_config_syncing != NULL) { 10049 spa_config_set(spa, spa->spa_config_syncing); 10050 spa->spa_config_txg = txg; 10051 spa->spa_config_syncing = NULL; 10052 } 10053 10054 dsl_pool_sync_done(dp, txg); 10055 10056 for (int i = 0; i < spa->spa_alloc_count; i++) { 10057 mutex_enter(&spa->spa_allocs[i].spaa_lock); 10058 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 10059 mutex_exit(&spa->spa_allocs[i].spaa_lock); 10060 } 10061 10062 /* 10063 * Update usable space statistics. 10064 */ 10065 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 10066 != NULL) 10067 vdev_sync_done(vd, txg); 10068 10069 metaslab_class_evict_old(spa->spa_normal_class, txg); 10070 metaslab_class_evict_old(spa->spa_log_class, txg); 10071 10072 spa_sync_close_syncing_log_sm(spa); 10073 10074 spa_update_dspace(spa); 10075 10076 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 10077 vdev_autotrim_kick(spa); 10078 10079 /* 10080 * It had better be the case that we didn't dirty anything 10081 * since vdev_config_sync(). 10082 */ 10083 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 10084 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 10085 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 10086 10087 while (zfs_pause_spa_sync) 10088 delay(1); 10089 10090 spa->spa_sync_pass = 0; 10091 10092 /* 10093 * Update the last synced uberblock here. We want to do this at 10094 * the end of spa_sync() so that consumers of spa_last_synced_txg() 10095 * will be guaranteed that all the processing associated with 10096 * that txg has been completed. 10097 */ 10098 spa->spa_ubsync = spa->spa_uberblock; 10099 spa_config_exit(spa, SCL_CONFIG, FTAG); 10100 10101 spa_handle_ignored_writes(spa); 10102 10103 /* 10104 * If any async tasks have been requested, kick them off. 10105 */ 10106 spa_async_dispatch(spa); 10107 } 10108 10109 /* 10110 * Sync all pools. We don't want to hold the namespace lock across these 10111 * operations, so we take a reference on the spa_t and drop the lock during the 10112 * sync. 
10113 */ 10114 void 10115 spa_sync_allpools(void) 10116 { 10117 spa_t *spa = NULL; 10118 mutex_enter(&spa_namespace_lock); 10119 while ((spa = spa_next(spa)) != NULL) { 10120 if (spa_state(spa) != POOL_STATE_ACTIVE || 10121 !spa_writeable(spa) || spa_suspended(spa)) 10122 continue; 10123 spa_open_ref(spa, FTAG); 10124 mutex_exit(&spa_namespace_lock); 10125 txg_wait_synced(spa_get_dsl(spa), 0); 10126 mutex_enter(&spa_namespace_lock); 10127 spa_close(spa, FTAG); 10128 } 10129 mutex_exit(&spa_namespace_lock); 10130 } 10131 10132 taskq_t * 10133 spa_sync_tq_create(spa_t *spa, const char *name) 10134 { 10135 kthread_t **kthreads; 10136 10137 ASSERT(spa->spa_sync_tq == NULL); 10138 ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); 10139 10140 /* 10141 * - do not allow more allocators than cpus. 10142 * - there may be more cpus than allocators. 10143 * - do not allow more sync taskq threads than allocators or cpus. 10144 */ 10145 int nthreads = spa->spa_alloc_count; 10146 spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * 10147 nthreads, KM_SLEEP); 10148 10149 spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, 10150 nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); 10151 VERIFY(spa->spa_sync_tq != NULL); 10152 VERIFY(kthreads != NULL); 10153 10154 spa_taskqs_t *tqs = 10155 &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE]; 10156 10157 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10158 for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) { 10159 ti->sti_thread = kthreads[i]; 10160 if (w == tqs->stqs_count) { 10161 w = 0; 10162 } 10163 ti->sti_wr_iss_tq = tqs->stqs_taskq[w]; 10164 } 10165 10166 kmem_free(kthreads, sizeof (*kthreads) * nthreads); 10167 return (spa->spa_sync_tq); 10168 } 10169 10170 void 10171 spa_sync_tq_destroy(spa_t *spa) 10172 { 10173 ASSERT(spa->spa_sync_tq != NULL); 10174 10175 taskq_wait(spa->spa_sync_tq); 10176 taskq_destroy(spa->spa_sync_tq); 10177 kmem_free(spa->spa_syncthreads, 10178 sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); 10179 spa->spa_sync_tq = NULL; 10180 } 10181 10182 void 10183 spa_select_allocator(zio_t *zio) 10184 { 10185 zbookmark_phys_t *bm = &zio->io_bookmark; 10186 spa_t *spa = zio->io_spa; 10187 10188 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 10189 10190 /* 10191 * A gang block (for example) may have inherited its parent's 10192 * allocator, in which case there is nothing further to do here. 10193 */ 10194 if (ZIO_HAS_ALLOCATOR(zio)) 10195 return; 10196 10197 ASSERT(spa != NULL); 10198 ASSERT(bm != NULL); 10199 10200 /* 10201 * First try to use an allocator assigned to the syncthread, and set 10202 * the corresponding write issue taskq for the allocator. 10203 * Note, we must have an open pool to do this. 10204 */ 10205 if (spa->spa_sync_tq != NULL) { 10206 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10207 for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { 10208 if (ti->sti_thread == curthread) { 10209 zio->io_allocator = i; 10210 zio->io_wr_iss_tq = ti->sti_wr_iss_tq; 10211 return; 10212 } 10213 } 10214 } 10215 10216 /* 10217 * We want to try to use as many allocators as possible to help improve 10218 * performance, but we also want logically adjacent IOs to be physically 10219 * adjacent to improve sequential read performance. We chunk each object 10220 * into 2^20 block regions, and then hash based on the objset, object, 10221 * level, and region to accomplish both of these goals. 
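 *
 * A minimal editorial sketch of that bucketing (hash4() below is a
 * stand-in for cityhash4(), and nallocators for spa->spa_alloc_count):
 *
 *    static uint_t
 *    pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
 *        uint64_t blkid, uint_t nallocators)
 *    {
 *            uint64_t region = blkid >> 20;
 *            uint64_t hv = hash4(objset, object, level, region);
 *            return ((uint_t)(hv % nallocators));
 *    }
 *
 * Blocks 0 through (1 << 20) - 1 of an object therefore share one
 * allocator, the next 2^20 blocks usually land on a different one, and
 * so on, keeping logically adjacent blocks together while still
 * spreading independent streams across allocators.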
10222 */ 10223 uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, 10224 bm->zb_blkid >> 20); 10225 10226 zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; 10227 zio->io_wr_iss_tq = NULL; 10228 } 10229 10230 /* 10231 * ========================================================================== 10232 * Miscellaneous routines 10233 * ========================================================================== 10234 */ 10235 10236 /* 10237 * Remove all pools in the system. 10238 */ 10239 void 10240 spa_evict_all(void) 10241 { 10242 spa_t *spa; 10243 10244 /* 10245 * Remove all cached state. All pools should be closed now, 10246 * so every spa in the AVL tree should be unreferenced. 10247 */ 10248 mutex_enter(&spa_namespace_lock); 10249 while ((spa = spa_next(NULL)) != NULL) { 10250 /* 10251 * Stop async tasks. The async thread may need to detach 10252 * a device that's been replaced, which requires grabbing 10253 * spa_namespace_lock, so we must drop it here. 10254 */ 10255 spa_open_ref(spa, FTAG); 10256 mutex_exit(&spa_namespace_lock); 10257 spa_async_suspend(spa); 10258 mutex_enter(&spa_namespace_lock); 10259 spa_close(spa, FTAG); 10260 10261 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 10262 spa_unload(spa); 10263 spa_deactivate(spa); 10264 } 10265 spa_remove(spa); 10266 } 10267 mutex_exit(&spa_namespace_lock); 10268 } 10269 10270 vdev_t * 10271 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 10272 { 10273 vdev_t *vd; 10274 int i; 10275 10276 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 10277 return (vd); 10278 10279 if (aux) { 10280 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 10281 vd = spa->spa_l2cache.sav_vdevs[i]; 10282 if (vd->vdev_guid == guid) 10283 return (vd); 10284 } 10285 10286 for (i = 0; i < spa->spa_spares.sav_count; i++) { 10287 vd = spa->spa_spares.sav_vdevs[i]; 10288 if (vd->vdev_guid == guid) 10289 return (vd); 10290 } 10291 } 10292 10293 return (NULL); 10294 } 10295 10296 void 10297 spa_upgrade(spa_t *spa, uint64_t version) 10298 { 10299 ASSERT(spa_writeable(spa)); 10300 10301 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 10302 10303 /* 10304 * This should only be called for a non-faulted pool, and since a 10305 * future version would result in an unopenable pool, this shouldn't be 10306 * possible. 
10307 */ 10308 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 10309 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 10310 10311 spa->spa_uberblock.ub_version = version; 10312 vdev_config_dirty(spa->spa_root_vdev); 10313 10314 spa_config_exit(spa, SCL_ALL, FTAG); 10315 10316 txg_wait_synced(spa_get_dsl(spa), 0); 10317 } 10318 10319 static boolean_t 10320 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 10321 { 10322 (void) spa; 10323 int i; 10324 uint64_t vdev_guid; 10325 10326 for (i = 0; i < sav->sav_count; i++) 10327 if (sav->sav_vdevs[i]->vdev_guid == guid) 10328 return (B_TRUE); 10329 10330 for (i = 0; i < sav->sav_npending; i++) { 10331 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 10332 &vdev_guid) == 0 && vdev_guid == guid) 10333 return (B_TRUE); 10334 } 10335 10336 return (B_FALSE); 10337 } 10338 10339 boolean_t 10340 spa_has_l2cache(spa_t *spa, uint64_t guid) 10341 { 10342 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 10343 } 10344 10345 boolean_t 10346 spa_has_spare(spa_t *spa, uint64_t guid) 10347 { 10348 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 10349 } 10350 10351 /* 10352 * Check if a pool has an active shared spare device. 10353 * Note: reference count of an active spare is 2, as a spare and as a replace 10354 */ 10355 static boolean_t 10356 spa_has_active_shared_spare(spa_t *spa) 10357 { 10358 int i, refcnt; 10359 uint64_t pool; 10360 spa_aux_vdev_t *sav = &spa->spa_spares; 10361 10362 for (i = 0; i < sav->sav_count; i++) { 10363 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 10364 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 10365 refcnt > 2) 10366 return (B_TRUE); 10367 } 10368 10369 return (B_FALSE); 10370 } 10371 10372 uint64_t 10373 spa_total_metaslabs(spa_t *spa) 10374 { 10375 vdev_t *rvd = spa->spa_root_vdev; 10376 10377 uint64_t m = 0; 10378 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 10379 vdev_t *vd = rvd->vdev_child[c]; 10380 if (!vdev_is_concrete(vd)) 10381 continue; 10382 m += vd->vdev_ms_count; 10383 } 10384 return (m); 10385 } 10386 10387 /* 10388 * Notify any waiting threads that some activity has switched from being in- 10389 * progress to not-in-progress so that the thread can wake up and determine 10390 * whether it is finished waiting. 10391 */ 10392 void 10393 spa_notify_waiters(spa_t *spa) 10394 { 10395 /* 10396 * Acquiring spa_activities_lock here prevents the cv_broadcast from 10397 * happening between the waiting thread's check and cv_wait. 10398 */ 10399 mutex_enter(&spa->spa_activities_lock); 10400 cv_broadcast(&spa->spa_activities_cv); 10401 mutex_exit(&spa->spa_activities_lock); 10402 } 10403 10404 /* 10405 * Notify any waiting threads that the pool is exporting, and then block until 10406 * they are finished using the spa_t. 10407 */ 10408 void 10409 spa_wake_waiters(spa_t *spa) 10410 { 10411 mutex_enter(&spa->spa_activities_lock); 10412 spa->spa_waiters_cancel = B_TRUE; 10413 cv_broadcast(&spa->spa_activities_cv); 10414 while (spa->spa_waiters != 0) 10415 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 10416 spa->spa_waiters_cancel = B_FALSE; 10417 mutex_exit(&spa->spa_activities_lock); 10418 } 10419 10420 /* Whether the vdev or any of its descendants are being initialized/trimmed. 
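 * (To check a vdev, the helper below temporarily drops
 * spa_activities_lock, takes that vdev's initialize or trim lock,
 * re-takes spa_activities_lock, and only then reads the state; the same
 * check recurses over the children. See the lock-ordering discussion
 * further down.)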
*/ 10421 static boolean_t 10422 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 10423 { 10424 spa_t *spa = vd->vdev_spa; 10425 10426 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 10427 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10428 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 10429 activity == ZPOOL_WAIT_TRIM); 10430 10431 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 10432 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 10433 10434 mutex_exit(&spa->spa_activities_lock); 10435 mutex_enter(lock); 10436 mutex_enter(&spa->spa_activities_lock); 10437 10438 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 10439 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 10440 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 10441 mutex_exit(lock); 10442 10443 if (in_progress) 10444 return (B_TRUE); 10445 10446 for (int i = 0; i < vd->vdev_children; i++) { 10447 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 10448 activity)) 10449 return (B_TRUE); 10450 } 10451 10452 return (B_FALSE); 10453 } 10454 10455 /* 10456 * If use_guid is true, this checks whether the vdev specified by guid is 10457 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 10458 * is being initialized/trimmed. The caller must hold the config lock and 10459 * spa_activities_lock. 10460 */ 10461 static int 10462 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 10463 zpool_wait_activity_t activity, boolean_t *in_progress) 10464 { 10465 mutex_exit(&spa->spa_activities_lock); 10466 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10467 mutex_enter(&spa->spa_activities_lock); 10468 10469 vdev_t *vd; 10470 if (use_guid) { 10471 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 10472 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 10473 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10474 return (EINVAL); 10475 } 10476 } else { 10477 vd = spa->spa_root_vdev; 10478 } 10479 10480 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 10481 10482 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10483 return (0); 10484 } 10485 10486 /* 10487 * Locking for waiting threads 10488 * --------------------------- 10489 * 10490 * Waiting threads need a way to check whether a given activity is in progress, 10491 * and then, if it is, wait for it to complete. Each activity will have some 10492 * in-memory representation of the relevant on-disk state which can be used to 10493 * determine whether or not the activity is in progress. The in-memory state and 10494 * the locking used to protect it will be different for each activity, and may 10495 * not be suitable for use with a cvar (e.g., some state is protected by the 10496 * config lock). To allow waiting threads to wait without any races, another 10497 * lock, spa_activities_lock, is used. 10498 * 10499 * When the state is checked, both the activity-specific lock (if there is one) 10500 * and spa_activities_lock are held. In some cases, the activity-specific lock 10501 * is acquired explicitly (e.g. the config lock). In others, the locking is 10502 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 10503 * thread releases the activity-specific lock and, if the activity is in 10504 * progress, then cv_waits using spa_activities_lock. 
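 *
 * Schematically (an editorial sketch of the protocol described above,
 * not additional code in this file; activity_is_in_progress() stands in
 * for the per-activity check):
 *
 *    Waiting thread:
 *            mutex_enter(&spa->spa_activities_lock);
 *            while (activity_is_in_progress(spa))
 *                    cv_wait(&spa->spa_activities_cv,
 *                        &spa->spa_activities_lock);
 *            mutex_exit(&spa->spa_activities_lock);
 *
 *    Completing thread:
 *            update the activity's in-memory state;
 *            spa_notify_waiters(spa);
 *
 * cv_wait() drops spa_activities_lock atomically, so the broadcast in
 * spa_notify_waiters() can only run before the waiter's check or after
 * the waiter is already asleep; the wakeup cannot be missed.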
10505 * 10506 * The waiting thread is woken when another thread, one completing some 10507 * activity, updates the state of the activity and then calls 10508 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 10509 * needs to hold its activity-specific lock when updating the state, and this 10510 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 10511 * 10512 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 10513 * and because it is held when the waiting thread checks the state of the 10514 * activity, it can never be the case that the completing thread both updates 10515 * the activity state and cv_broadcasts in between the waiting thread's check 10516 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 10517 * 10518 * In order to prevent deadlock, when the waiting thread does its check, in some 10519 * cases it will temporarily drop spa_activities_lock in order to acquire the 10520 * activity-specific lock. The order in which spa_activities_lock and the 10521 * activity specific lock are acquired in the waiting thread is determined by 10522 * the order in which they are acquired in the completing thread; if the 10523 * completing thread calls spa_notify_waiters with the activity-specific lock 10524 * held, then the waiting thread must also acquire the activity-specific lock 10525 * first. 10526 */ 10527 10528 static int 10529 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 10530 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 10531 { 10532 int error = 0; 10533 10534 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10535 10536 switch (activity) { 10537 case ZPOOL_WAIT_CKPT_DISCARD: 10538 *in_progress = 10539 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 10540 zap_contains(spa_meta_objset(spa), 10541 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 10542 ENOENT); 10543 break; 10544 case ZPOOL_WAIT_FREE: 10545 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 10546 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 10547 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 10548 spa_livelist_delete_check(spa)); 10549 break; 10550 case ZPOOL_WAIT_INITIALIZE: 10551 case ZPOOL_WAIT_TRIM: 10552 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 10553 activity, in_progress); 10554 break; 10555 case ZPOOL_WAIT_REPLACE: 10556 mutex_exit(&spa->spa_activities_lock); 10557 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10558 mutex_enter(&spa->spa_activities_lock); 10559 10560 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 10561 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10562 break; 10563 case ZPOOL_WAIT_REMOVE: 10564 *in_progress = (spa->spa_removing_phys.sr_state == 10565 DSS_SCANNING); 10566 break; 10567 case ZPOOL_WAIT_RESILVER: 10568 *in_progress = vdev_rebuild_active(spa->spa_root_vdev); 10569 if (*in_progress) 10570 break; 10571 zfs_fallthrough; 10572 case ZPOOL_WAIT_SCRUB: 10573 { 10574 boolean_t scanning, paused, is_scrub; 10575 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 10576 10577 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 10578 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 10579 paused = dsl_scan_is_paused_scrub(scn); 10580 *in_progress = (scanning && !paused && 10581 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 10582 break; 10583 } 10584 case ZPOOL_WAIT_RAIDZ_EXPAND: 10585 { 10586 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 10587 *in_progress = (vre 
!= NULL && vre->vre_state == DSS_SCANNING); 10588 break; 10589 } 10590 default: 10591 panic("unrecognized value for activity %d", activity); 10592 } 10593 10594 return (error); 10595 } 10596 10597 static int 10598 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 10599 boolean_t use_tag, uint64_t tag, boolean_t *waited) 10600 { 10601 /* 10602 * The tag is used to distinguish between instances of an activity. 10603 * 'initialize' and 'trim' are the only activities that we use this for. 10604 * The other activities can only have a single instance in progress in a 10605 * pool at one time, making the tag unnecessary. 10606 * 10607 * There can be multiple devices being replaced at once, but since they 10608 * all finish once resilvering finishes, we don't bother keeping track 10609 * of them individually, we just wait for them all to finish. 10610 */ 10611 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 10612 activity != ZPOOL_WAIT_TRIM) 10613 return (EINVAL); 10614 10615 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 10616 return (EINVAL); 10617 10618 spa_t *spa; 10619 int error = spa_open(pool, &spa, FTAG); 10620 if (error != 0) 10621 return (error); 10622 10623 /* 10624 * Increment the spa's waiter count so that we can call spa_close and 10625 * still ensure that the spa_t doesn't get freed before this thread is 10626 * finished with it when the pool is exported. We want to call spa_close 10627 * before we start waiting because otherwise the additional ref would 10628 * prevent the pool from being exported or destroyed throughout the 10629 * potentially long wait. 10630 */ 10631 mutex_enter(&spa->spa_activities_lock); 10632 spa->spa_waiters++; 10633 spa_close(spa, FTAG); 10634 10635 *waited = B_FALSE; 10636 for (;;) { 10637 boolean_t in_progress; 10638 error = spa_activity_in_progress(spa, activity, use_tag, tag, 10639 &in_progress); 10640 10641 if (error || !in_progress || spa->spa_waiters_cancel) 10642 break; 10643 10644 *waited = B_TRUE; 10645 10646 if (cv_wait_sig(&spa->spa_activities_cv, 10647 &spa->spa_activities_lock) == 0) { 10648 error = EINTR; 10649 break; 10650 } 10651 } 10652 10653 spa->spa_waiters--; 10654 cv_signal(&spa->spa_waiters_cv); 10655 mutex_exit(&spa->spa_activities_lock); 10656 10657 return (error); 10658 } 10659 10660 /* 10661 * Wait for a particular instance of the specified activity to complete, where 10662 * the instance is identified by 'tag'. 10663 */ 10664 int 10665 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, 10666 boolean_t *waited) 10667 { 10668 return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); 10669 } 10670 10671 /* 10672 * Wait for all instances of the specified activity to complete. 10673 */ 10674 int 10675 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) 10676 { 10677 10678 return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); 10679 } 10680 10681 sysevent_t * 10682 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10683 { 10684 sysevent_t *ev = NULL; 10685 #ifdef _KERNEL 10686 nvlist_t *resource; 10687 10688 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); 10689 if (resource) { 10690 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); 10691 ev->resource = resource; 10692 } 10693 #else 10694 (void) spa, (void) vd, (void) hist_nvl, (void) name; 10695 #endif 10696 return (ev); 10697 } 10698 10699 void 10700 spa_event_post(sysevent_t *ev) 10701 { 10702 #ifdef _KERNEL 10703 if (ev) { 10704
zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); 10705 kmem_free(ev, sizeof (*ev)); 10706 } 10707 #else 10708 (void) ev; 10709 #endif 10710 } 10711 10712 /* 10713 * Post a zevent corresponding to the given sysevent. The 'name' must be one 10714 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be 10715 * filled in from the spa and (optionally) the vdev. This doesn't do anything 10716 * in the userland libzpool, as we don't want consumers to misinterpret ztest 10717 * or zdb as real changes. 10718 */ 10719 void 10720 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10721 { 10722 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 10723 } 10724 10725 /* state manipulation functions */ 10726 EXPORT_SYMBOL(spa_open); 10727 EXPORT_SYMBOL(spa_open_rewind); 10728 EXPORT_SYMBOL(spa_get_stats); 10729 EXPORT_SYMBOL(spa_create); 10730 EXPORT_SYMBOL(spa_import); 10731 EXPORT_SYMBOL(spa_tryimport); 10732 EXPORT_SYMBOL(spa_destroy); 10733 EXPORT_SYMBOL(spa_export); 10734 EXPORT_SYMBOL(spa_reset); 10735 EXPORT_SYMBOL(spa_async_request); 10736 EXPORT_SYMBOL(spa_async_suspend); 10737 EXPORT_SYMBOL(spa_async_resume); 10738 EXPORT_SYMBOL(spa_inject_addref); 10739 EXPORT_SYMBOL(spa_inject_delref); 10740 EXPORT_SYMBOL(spa_scan_stat_init); 10741 EXPORT_SYMBOL(spa_scan_get_stats); 10742 10743 /* device manipulation */ 10744 EXPORT_SYMBOL(spa_vdev_add); 10745 EXPORT_SYMBOL(spa_vdev_attach); 10746 EXPORT_SYMBOL(spa_vdev_detach); 10747 EXPORT_SYMBOL(spa_vdev_setpath); 10748 EXPORT_SYMBOL(spa_vdev_setfru); 10749 EXPORT_SYMBOL(spa_vdev_split_mirror); 10750 10751 /* spare state (which is global across all pools) */ 10752 EXPORT_SYMBOL(spa_spare_add); 10753 EXPORT_SYMBOL(spa_spare_remove); 10754 EXPORT_SYMBOL(spa_spare_exists); 10755 EXPORT_SYMBOL(spa_spare_activate); 10756 10757 /* L2ARC state (which is global across all pools) */ 10758 EXPORT_SYMBOL(spa_l2cache_add); 10759 EXPORT_SYMBOL(spa_l2cache_remove); 10760 EXPORT_SYMBOL(spa_l2cache_exists); 10761 EXPORT_SYMBOL(spa_l2cache_activate); 10762 EXPORT_SYMBOL(spa_l2cache_drop); 10763 10764 /* scanning */ 10765 EXPORT_SYMBOL(spa_scan); 10766 EXPORT_SYMBOL(spa_scan_stop); 10767 10768 /* spa syncing */ 10769 EXPORT_SYMBOL(spa_sync); /* only for DMU use */ 10770 EXPORT_SYMBOL(spa_sync_allpools); 10771 10772 /* properties */ 10773 EXPORT_SYMBOL(spa_prop_set); 10774 EXPORT_SYMBOL(spa_prop_get); 10775 EXPORT_SYMBOL(spa_prop_clear_bootfs); 10776 10777 /* asynchronous event notification */ 10778 EXPORT_SYMBOL(spa_event_notify); 10779 10780 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, 10781 "Percentage of CPUs to run a metaslab preload taskq"); 10782 10783 /* BEGIN CSTYLED */ 10784 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, 10785 "log2 fraction of arc that can be used by inflight I/Os when " 10786 "verifying pool during import"); 10787 /* END CSTYLED */ 10788 10789 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, 10790 "Set to traverse metadata on pool import"); 10791 10792 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, 10793 "Set to traverse data on pool import"); 10794 10795 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, 10796 "Print vdev tree to zfs_dbgmsg during pool import"); 10797 10798 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, 10799 "Percentage of CPUs to run an IO worker thread"); 10800 10801 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, 10802 "Number of threads per IO
worker taskqueue"); 10803 10804 /* BEGIN CSTYLED */ 10805 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, 10806 "Allow importing pool with up to this number of missing top-level " 10807 "vdevs (in read-only mode)"); 10808 /* END CSTYLED */ 10809 10810 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, 10811 ZMOD_RW, "Set the livelist condense zthr to pause"); 10812 10813 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, 10814 ZMOD_RW, "Set the livelist condense synctask to pause"); 10815 10816 /* BEGIN CSTYLED */ 10817 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, 10818 INT, ZMOD_RW, 10819 "Whether livelist condensing was canceled in the synctask"); 10820 10821 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 10822 INT, ZMOD_RW, 10823 "Whether livelist condensing was canceled in the zthr function"); 10824 10825 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 10826 ZMOD_RW, 10827 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 10828 "was being condensed"); 10829 10830 #ifdef _KERNEL 10831 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, 10832 spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, 10833 "Configure IO queues for read IO"); 10834 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, 10835 spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, 10836 "Configure IO queues for write IO"); 10837 #endif 10838 /* END CSTYLED */ 10839 10840 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW, 10841 "Number of CPUs to run write issue taskqs"); 10842