/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
    &metaslab_gang_bang, 0,
    "Force gang block allocation for blocks larger than or equal to this value");
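/*
 * Note: with the default of SPA_MAXBLOCKSIZE + 1 no allocation can meet
 * this threshold, so forced gang allocation is effectively disabled
 * unless the tunable is lowered.
 */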

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;
SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
    &zfs_condense_pct, 0,
    "Condense the on-disk space map when it exceeds this percentage"
    " of its in-memory counterpart");
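/*
 * For example, with the default of 200 a space map is only condensed
 * once its on-disk representation is roughly twice the size of its
 * in-core equivalent.
 */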

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
    &zfs_mg_noalloc_threshold, 0,
    "Percentage of metaslab group size that should be free"
    " to make it eligible for allocation");

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_mg_fragmentation_threshold, 0,
    "Maximum fragmentation, as a percentage, at which a metaslab group is "
    "still considered eligible for allocations unless all metaslab groups "
    "within the metaslab class have also crossed this threshold");

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_metaslab_fragmentation_threshold, 0,
    "Maximum fragmentation percentage at which a metaslab keeps its active state");

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
    &metaslab_debug_load, 0,
    "Load all metaslabs when pool is first opened");

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
    &metaslab_debug_unload, 0,
    "Prevent metaslabs from being unloaded");

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
    &metaslab_df_alloc_threshold, 0,
    "Minimum size which forces the dynamic allocator to change its allocation strategy");

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
    &metaslab_df_free_pct, 0,
    "The minimum free space, in percent, which must be available in a "
    "space map to continue allocations in a first-fit fashion");

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
    &metaslab_min_alloc_size, 0,
    "A metaslab is considered \"free\" if it contains a contiguous "
    "segment which is greater than vfs.zfs.metaslab.min_alloc_size");

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
    &metaslab_load_pct, 0,
    "Percentage of cpus that can be used by the metaslab taskq");

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
    &metaslab_unload_delay, 0,
    "Number of TXGs that an unused metaslab can be kept in memory");

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
    &metaslab_preload_limit, 0,
    "Max number of metaslabs per group to preload");

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
    &metaslab_preload_enabled, 0,
    "Enable/disable preloading of metaslabs");

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
    &metaslab_fragmentation_factor_enabled, 0,
    "Enable fragmentation weighting on metaslabs");

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
    &metaslab_lba_weighting_enabled, 0,
    "Enable LBA weighting (i.e. outer tracks are given preference)");

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
    &metaslab_bias_enabled, 0,
    "Enable metaslab group biasing");

/*
 * Enable/disable segment-based metaslab selection.
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);

kmem_cache_t *metaslab_alloc_trace_cache;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	refcount_create_tracked(&mc->mc_alloc_slots);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	refcount_destroy(&mc->mc_alloc_slots);
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

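/*
 * Sanity-check the metaslab groups attached to this class: every group
 * in the rotor list must belong to a top-level, non-hole vdev and point
 * back at this class.
 */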
int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

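/*
 * Recompute the class's minimum block size as 2^ashift of the smallest
 * ashift among all metaslab groups in the rotor list, or SPA_MINBLOCKSIZE
 * if the class currently has no groups.
 */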
void
metaslab_class_minblocksize_update(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;
	uint64_t minashift = UINT64_MAX;

	if ((mg = mc->mc_rotor) == NULL) {
		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
		return;
	}

	do {
		vd = mg->mg_vd;
		if (vd->vdev_ashift < minashift)
			minashift = vd->vdev_ashift;
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	mc->mc_minblocksize = 1ULL << minashift;
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

uint64_t
metaslab_class_get_minblocksize(metaslab_class_t *mc)
{
	return (mc->mc_minblocksize);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 */
		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
		    1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

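/*
 * AVL comparison function for a metaslab group's tree: order metaslabs
 * by descending weight, breaking ties by start offset to keep entries
 * unique.
 */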
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_start < m2->ms_start)
		return (-1);
	if (m1->ms_start > m2->ms_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocated = 0;
	uint64_t freed = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an allocated
	 * space map. Calling this in non-syncing context does not
	 * provide a consistent view of the metaslab since we're performing
	 * allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
	    space_map_alloc_delta(msp->ms_sm);

	/*
	 * Account for future allocations since we would have already
	 * deducted that space from the ms_freetree.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocated +=
		    range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
	}
	freed = range_tree_space(msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]);

	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
	    msp->ms_deferspace + freed;

	VERIFY3U(sm_free_space, ==, msp_free_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the free capacity is above
 * the zfs_mg_noalloc_threshold and the fragmentation value is less than
 * or equal to zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations.  We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	refcount_create_tracked(&mg->mg_alloc_queue_depth);

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	refcount_destroy(&mg->mg_alloc_queue_depth);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	metaslab_class_minblocksize_update(mc);
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	taskq_wait(mg->mg_taskq);
	metaslab_group_alloc_update(mg);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	metaslab_class_minblocksize_update(mc);
}

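/*
 * A metaslab group is considered initialized once its vdev reports a
 * non-zero amount of space and the group has been activated.
 */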
boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

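/*
 * Insert a metaslab into its group's AVL tree and fold the metaslab's
 * space map histogram into the group and class histograms.
 */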
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

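/*
 * Remove a metaslab from its group, subtracting its histogram
 * contribution from the group and class first.
 */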
static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

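/*
 * Re-sort a metaslab within its group's AVL tree after assigning it a
 * new weight.
 */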
static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
 * group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_max_alloc_queue_depth;

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = refcount_count(&mg->mg_alloc_queue_depth);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_max_alloc_queue_depth;

			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	if (rs_size1 < rs_size2)
		return (-1);
	if (rs_size1 > rs_size2)
		return (1);

	if (r1->rs_start < r2->rs_start)
		return (-1);

	if (r1->rs_start > r2->rs_start)
		return (1);

	return (0);
}

/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT(msp->ms_tree == NULL);

	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/*
 * Destroy the block allocator specific components.
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	ASSERT0(avl_numnodes(&msp->ms_size_tree));

	avl_destroy(&msp->ms_size_tree);
}

static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_add(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_remove(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);

	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_size_tree;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

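/*
 * Return the segment that matches [start, start + size) in the given
 * tree, or the next segment after that position if there is no exact
 * match.
 */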
117293f3d2b8Schs static range_seg_t *
metaslab_block_find(avl_tree_t * t,uint64_t start,uint64_t size)117393f3d2b8Schs metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
117493f3d2b8Schs {
117593f3d2b8Schs range_seg_t *rs, rsearch;
117693f3d2b8Schs avl_index_t where;
117793f3d2b8Schs
117893f3d2b8Schs rsearch.rs_start = start;
117993f3d2b8Schs rsearch.rs_end = start + size;
118093f3d2b8Schs
118193f3d2b8Schs rs = avl_find(t, &rsearch, &where);
118293f3d2b8Schs if (rs == NULL) {
118393f3d2b8Schs rs = avl_nearest(t, where, AVL_AFTER);
118493f3d2b8Schs }
118593f3d2b8Schs
118693f3d2b8Schs return (rs);
1187c1cb2cd8Shaad }
1188c1cb2cd8Shaad
1189f59c7639Shaad /*
1190f59c7639Shaad * This is a helper function that can be used by the allocator to find
1191f59c7639Shaad * a suitable block to allocate. This will search the specified AVL
1192f59c7639Shaad * tree looking for a block that matches the specified criteria.
1193f59c7639Shaad */
1194c1cb2cd8Shaad static uint64_t
metaslab_block_picker(avl_tree_t * t,uint64_t * cursor,uint64_t size,uint64_t align)1195f59c7639Shaad metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1196f59c7639Shaad uint64_t align)
1197c1cb2cd8Shaad {
119893f3d2b8Schs range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1199c1cb2cd8Shaad
120093f3d2b8Schs while (rs != NULL) {
120193f3d2b8Schs uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1202c1cb2cd8Shaad
120393f3d2b8Schs if (offset + size <= rs->rs_end) {
1204c1cb2cd8Shaad *cursor = offset + size;
1205c1cb2cd8Shaad return (offset);
1206c1cb2cd8Shaad }
120793f3d2b8Schs rs = AVL_NEXT(t, rs);
1208c1cb2cd8Shaad }
1209c1cb2cd8Shaad
1210c1cb2cd8Shaad /*
1211c1cb2cd8Shaad * If we know we've searched the whole map (*cursor == 0), give up.
1212c1cb2cd8Shaad * Otherwise, reset the cursor to the beginning and try again.
1213c1cb2cd8Shaad */
1214c1cb2cd8Shaad if (*cursor == 0)
1215c1cb2cd8Shaad return (-1ULL);
1216c1cb2cd8Shaad
1217c1cb2cd8Shaad *cursor = 0;
1218f59c7639Shaad return (metaslab_block_picker(t, cursor, size, align));
1219f59c7639Shaad }
1220f59c7639Shaad
1221f59c7639Shaad /*
1222f59c7639Shaad * ==========================================================================
1223f59c7639Shaad * The first-fit block allocator
1224f59c7639Shaad * ==========================================================================
1225f59c7639Shaad */
1226f59c7639Shaad static uint64_t
122793f3d2b8Schs metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1228f59c7639Shaad {
122993f3d2b8Schs /*
123093f3d2b8Schs * Find the largest power of 2 block size that evenly divides the
123193f3d2b8Schs * requested size. This is used to try to allocate blocks with similar
123293f3d2b8Schs * alignment from the same area of the metaslab (i.e. same cursor
123393f3d2b8Schs * bucket), but it does not guarantee that other allocation sizes
123493f3d2b8Schs * will not exist in the same region.
123593f3d2b8Schs */
1236f59c7639Shaad uint64_t align = size & -size;
123793f3d2b8Schs uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
123893f3d2b8Schs avl_tree_t *t = &msp->ms_tree->rt_root;
1239f59c7639Shaad
1240f59c7639Shaad return (metaslab_block_picker(t, cursor, size, align));
1241f59c7639Shaad }
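
/*
 * Example of the cursor-bucket selection above (illustrative only): for
 * a request of size 0x6000 (24K), size & -size is 0x2000 (8K), so the
 * allocation uses the 8K-alignment cursor, ms_lbas[highbit64(0x2000) - 1].
 * Requests whose sizes share the same largest power-of-2 divisor share a
 * cursor and therefore tend to pack into the same region of the metaslab.
 */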
1242f59c7639Shaad
124393f3d2b8Schs static metaslab_ops_t metaslab_ff_ops = {
124493f3d2b8Schs metaslab_ff_alloc
1245c1cb2cd8Shaad };
1246c1cb2cd8Shaad
1247c1cb2cd8Shaad /*
1248c1cb2cd8Shaad * ==========================================================================
1249f59c7639Shaad * Dynamic block allocator -
1250f59c7639Shaad * Uses the first fit allocation scheme until space get low and then
1251f59c7639Shaad * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1252f59c7639Shaad * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1253f59c7639Shaad * ==========================================================================
1254f59c7639Shaad */
1255f59c7639Shaad static uint64_t
125693f3d2b8Schs metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1257f59c7639Shaad {
125893f3d2b8Schs /*
125993f3d2b8Schs * Find the largest power of 2 block size that evenly divides the
126093f3d2b8Schs * requested size. This is used to try to allocate blocks with similar
126193f3d2b8Schs * alignment from the same area of the metaslab (i.e. same cursor
126293f3d2b8Schs * bucket), but it does not guarantee that other allocation sizes
126393f3d2b8Schs * will not exist in the same region.
126493f3d2b8Schs */
1265f59c7639Shaad uint64_t align = size & -size;
126693f3d2b8Schs uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
126793f3d2b8Schs range_tree_t *rt = msp->ms_tree;
126893f3d2b8Schs avl_tree_t *t = &rt->rt_root;
126993f3d2b8Schs uint64_t max_size = metaslab_block_maxsize(msp);
127093f3d2b8Schs int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1271f59c7639Shaad
127293f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
127393f3d2b8Schs ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1274f59c7639Shaad
1275f59c7639Shaad if (max_size < size)
1276f59c7639Shaad return (-1ULL);
1277f59c7639Shaad
1278f59c7639Shaad /*
1279f59c7639Shaad * If we're running low on space switch to using the size
1280f59c7639Shaad * sorted AVL tree (best-fit).
1281f59c7639Shaad */
1282f59c7639Shaad if (max_size < metaslab_df_alloc_threshold ||
1283f59c7639Shaad free_pct < metaslab_df_free_pct) {
128493f3d2b8Schs t = &msp->ms_size_tree;
1285f59c7639Shaad *cursor = 0;
1286f59c7639Shaad }
1287f59c7639Shaad
1288f59c7639Shaad return (metaslab_block_picker(t, cursor, size, 1ULL));
1289f59c7639Shaad }
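
/*
 * Sketch of the allocator switch above (illustrative): while the largest
 * free segment is at least metaslab_df_alloc_threshold and the free
 * percentage is at least metaslab_df_free_pct, requests walk the
 * offset-ordered tree first-fit from the per-alignment cursor. Once
 * either bound is crossed, the cursor is zeroed and the size-ordered
 * ms_size_tree is searched instead, which degenerates to best-fit for
 * the remaining space.
 */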
1290f59c7639Shaad
129193f3d2b8Schs static metaslab_ops_t metaslab_df_ops = {
129293f3d2b8Schs metaslab_df_alloc
1293f59c7639Shaad };
1294f59c7639Shaad
1295f59c7639Shaad /*
1296f59c7639Shaad * ==========================================================================
129793f3d2b8Schs * Cursor fit block allocator -
129893f3d2b8Schs * Select the largest region in the metaslab, set the cursor to the beginning
129993f3d2b8Schs * of the range and the cursor_end to the end of the range. As allocations
130093f3d2b8Schs * are made advance the cursor. Continue allocating from the cursor until
130193f3d2b8Schs * the range is exhausted and then find a new range.
1302f59c7639Shaad * ==========================================================================
1303f59c7639Shaad */
1304f59c7639Shaad static uint64_t
130593f3d2b8Schs metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1306f59c7639Shaad {
130793f3d2b8Schs range_tree_t *rt = msp->ms_tree;
130893f3d2b8Schs avl_tree_t *t = &msp->ms_size_tree;
130993f3d2b8Schs uint64_t *cursor = &msp->ms_lbas[0];
131093f3d2b8Schs uint64_t *cursor_end = &msp->ms_lbas[1];
1311f59c7639Shaad uint64_t offset = 0;
1312f59c7639Shaad
131393f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
131493f3d2b8Schs ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1315f59c7639Shaad
131693f3d2b8Schs ASSERT3U(*cursor_end, >=, *cursor);
131793f3d2b8Schs
131893f3d2b8Schs if ((*cursor + size) > *cursor_end) {
131993f3d2b8Schs range_seg_t *rs;
132093f3d2b8Schs
132193f3d2b8Schs rs = avl_last(&msp->ms_size_tree);
132293f3d2b8Schs if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1323f59c7639Shaad return (-1ULL);
1324f59c7639Shaad
132593f3d2b8Schs *cursor = rs->rs_start;
132693f3d2b8Schs *cursor_end = rs->rs_end;
1327f59c7639Shaad }
132893f3d2b8Schs
132993f3d2b8Schs offset = *cursor;
133093f3d2b8Schs *cursor += size;
133193f3d2b8Schs
1332f59c7639Shaad return (offset);
1333f59c7639Shaad }
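
/*
 * Illustrative example for the cursor fit scheme above: if the largest
 * free segment is [0x100000, 0x180000), the first allocation claims it
 * by setting the cursors to its bounds, returns offset 0x100000 and
 * advances *cursor past the allocation. Subsequent requests are carved
 * sequentially from the same segment until one no longer fits before
 * *cursor_end, at which point the then-largest segment is claimed and
 * the cursors are reset to it.
 */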
1334f59c7639Shaad
133593f3d2b8Schs static metaslab_ops_t metaslab_cf_ops = {
133693f3d2b8Schs metaslab_cf_alloc
1337f59c7639Shaad };
1338f59c7639Shaad
133993f3d2b8Schs /*
134093f3d2b8Schs * ==========================================================================
134193f3d2b8Schs * New dynamic fit allocator -
134293f3d2b8Schs * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
134393f3d2b8Schs * contiguous blocks. If no region is found then just use the largest segment
134493f3d2b8Schs * that remains.
134593f3d2b8Schs * ==========================================================================
134693f3d2b8Schs */
1347f59c7639Shaad
134893f3d2b8Schs /*
134993f3d2b8Schs * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
135093f3d2b8Schs * to request from the allocator.
135193f3d2b8Schs */
135293f3d2b8Schs uint64_t metaslab_ndf_clump_shift = 4;
135393f3d2b8Schs
135493f3d2b8Schs static uint64_t
135593f3d2b8Schs metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
135693f3d2b8Schs {
135793f3d2b8Schs avl_tree_t *t = &msp->ms_tree->rt_root;
135893f3d2b8Schs avl_index_t where;
135993f3d2b8Schs range_seg_t *rs, rsearch;
136093f3d2b8Schs uint64_t hbit = highbit64(size);
136193f3d2b8Schs uint64_t *cursor = &msp->ms_lbas[hbit - 1];
136293f3d2b8Schs uint64_t max_size = metaslab_block_maxsize(msp);
136393f3d2b8Schs
136493f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
136593f3d2b8Schs ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1366f59c7639Shaad
1367f59c7639Shaad if (max_size < size)
1368f59c7639Shaad return (-1ULL);
1369f59c7639Shaad
137093f3d2b8Schs rsearch.rs_start = *cursor;
137193f3d2b8Schs rsearch.rs_end = *cursor + size;
1372f59c7639Shaad
137393f3d2b8Schs rs = avl_find(t, &rsearch, &where);
137493f3d2b8Schs if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
137593f3d2b8Schs t = &msp->ms_size_tree;
1376f59c7639Shaad
137793f3d2b8Schs rsearch.rs_start = 0;
137893f3d2b8Schs rsearch.rs_end = MIN(max_size,
137993f3d2b8Schs 1ULL << (hbit + metaslab_ndf_clump_shift));
138093f3d2b8Schs rs = avl_find(t, &rsearch, &where);
138193f3d2b8Schs if (rs == NULL)
138293f3d2b8Schs rs = avl_nearest(t, where, AVL_AFTER);
138393f3d2b8Schs ASSERT(rs != NULL);
1384f59c7639Shaad }
1385f59c7639Shaad
138693f3d2b8Schs if ((rs->rs_end - rs->rs_start) >= size) {
138793f3d2b8Schs *cursor = rs->rs_start + size;
138893f3d2b8Schs return (rs->rs_start);
1389f59c7639Shaad }
1390f59c7639Shaad return (-1ULL);
1391f59c7639Shaad }
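
/*
 * Worked example of the clump sizing above (illustrative): for an 8K
 * request, hbit = highbit64(8192) = 14, so when the cursor does not
 * point at a usable segment the allocator searches the size-ordered
 * tree for a segment of at least MIN(max_size,
 * 1ULL << (14 + metaslab_ndf_clump_shift)) bytes, i.e. 256K with the
 * value of 4 declared above. Allocating from such a clump leaves room
 * for subsequent 8K allocations to land contiguously.
 */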
1392f59c7639Shaad
139393f3d2b8Schs static metaslab_ops_t metaslab_ndf_ops = {
139493f3d2b8Schs metaslab_ndf_alloc
1395f59c7639Shaad };
1396f59c7639Shaad
139793f3d2b8Schs metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1398f59c7639Shaad
1399f59c7639Shaad /*
1400f59c7639Shaad * ==========================================================================
1401c1cb2cd8Shaad * Metaslabs
1402c1cb2cd8Shaad * ==========================================================================
1403c1cb2cd8Shaad */
1404c1cb2cd8Shaad
1405c1cb2cd8Shaad /*
140693f3d2b8Schs * Wait for any in-progress metaslab loads to complete.
140793f3d2b8Schs */
140893f3d2b8Schs void
140993f3d2b8Schs metaslab_load_wait(metaslab_t *msp)
141093f3d2b8Schs {
141193f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
141293f3d2b8Schs
141393f3d2b8Schs while (msp->ms_loading) {
141493f3d2b8Schs ASSERT(!msp->ms_loaded);
141593f3d2b8Schs cv_wait(&msp->ms_load_cv, &msp->ms_lock);
141693f3d2b8Schs }
141793f3d2b8Schs }
141893f3d2b8Schs
141993f3d2b8Schs int
142093f3d2b8Schs metaslab_load(metaslab_t *msp)
142193f3d2b8Schs {
142293f3d2b8Schs int error = 0;
142393f3d2b8Schs boolean_t success = B_FALSE;
142493f3d2b8Schs
142593f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
142693f3d2b8Schs ASSERT(!msp->ms_loaded);
142793f3d2b8Schs ASSERT(!msp->ms_loading);
142893f3d2b8Schs
142993f3d2b8Schs msp->ms_loading = B_TRUE;
143093f3d2b8Schs
143193f3d2b8Schs /*
143293f3d2b8Schs * If the space map has not been allocated yet, then treat
143393f3d2b8Schs * all the space in the metaslab as free and add it to the
143493f3d2b8Schs * ms_tree.
143593f3d2b8Schs */
143693f3d2b8Schs if (msp->ms_sm != NULL)
143793f3d2b8Schs error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
143893f3d2b8Schs else
143993f3d2b8Schs range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
144093f3d2b8Schs
144193f3d2b8Schs success = (error == 0);
144293f3d2b8Schs msp->ms_loading = B_FALSE;
144393f3d2b8Schs
144493f3d2b8Schs if (success) {
144593f3d2b8Schs ASSERT3P(msp->ms_group, !=, NULL);
144693f3d2b8Schs msp->ms_loaded = B_TRUE;
144793f3d2b8Schs
144893f3d2b8Schs for (int t = 0; t < TXG_DEFER_SIZE; t++) {
144993f3d2b8Schs range_tree_walk(msp->ms_defertree[t],
145093f3d2b8Schs range_tree_remove, msp->ms_tree);
145193f3d2b8Schs }
145293f3d2b8Schs msp->ms_max_size = metaslab_block_maxsize(msp);
145393f3d2b8Schs }
145493f3d2b8Schs cv_broadcast(&msp->ms_load_cv);
145593f3d2b8Schs return (error);
145693f3d2b8Schs }
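
/*
 * Typical caller pattern for the two routines above, sketching what
 * metaslab_activate() and metaslab_preload() below already do: hold the
 * metaslab lock, wait out any load already in flight, and only then
 * start a load if one is still needed.
 *
 *	mutex_enter(&msp->ms_lock);
 *	metaslab_load_wait(msp);
 *	if (!msp->ms_loaded)
 *		(void) metaslab_load(msp);
 *	mutex_exit(&msp->ms_lock);
 */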
145793f3d2b8Schs
145893f3d2b8Schs void
145993f3d2b8Schs metaslab_unload(metaslab_t *msp)
146093f3d2b8Schs {
146193f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
146293f3d2b8Schs range_tree_vacate(msp->ms_tree, NULL, NULL);
146393f3d2b8Schs msp->ms_loaded = B_FALSE;
146493f3d2b8Schs msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
146593f3d2b8Schs msp->ms_max_size = 0;
146693f3d2b8Schs }
146793f3d2b8Schs
146893f3d2b8Schs int
146993f3d2b8Schs metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
147093f3d2b8Schs metaslab_t **msp)
147193f3d2b8Schs {
147293f3d2b8Schs vdev_t *vd = mg->mg_vd;
147393f3d2b8Schs objset_t *mos = vd->vdev_spa->spa_meta_objset;
147493f3d2b8Schs metaslab_t *ms;
147593f3d2b8Schs int error;
147693f3d2b8Schs
147793f3d2b8Schs ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
147893f3d2b8Schs mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
147993f3d2b8Schs cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
148093f3d2b8Schs ms->ms_id = id;
148193f3d2b8Schs ms->ms_start = id << vd->vdev_ms_shift;
148293f3d2b8Schs ms->ms_size = 1ULL << vd->vdev_ms_shift;
148393f3d2b8Schs
148493f3d2b8Schs /*
148593f3d2b8Schs * We only open space map objects that already exist. All others
148693f3d2b8Schs * will be opened when we finally allocate an object for them.
148793f3d2b8Schs */
148893f3d2b8Schs if (object != 0) {
148993f3d2b8Schs error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
149093f3d2b8Schs ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
149193f3d2b8Schs
149293f3d2b8Schs if (error != 0) {
149393f3d2b8Schs kmem_free(ms, sizeof (metaslab_t));
149493f3d2b8Schs return (error);
149593f3d2b8Schs }
149693f3d2b8Schs
149793f3d2b8Schs ASSERT(ms->ms_sm != NULL);
149893f3d2b8Schs }
149993f3d2b8Schs
150093f3d2b8Schs /*
150193f3d2b8Schs * We create the main range tree here, but we don't create the
150293f3d2b8Schs * alloctree and freetree until metaslab_sync_done(). This serves
1503c1cb2cd8Shaad * two purposes: it allows metaslab_sync_done() to detect the
1504c1cb2cd8Shaad * addition of new space; and for debugging, it ensures that we'd
1505c1cb2cd8Shaad * data fault on any attempt to use this metaslab before it's ready.
1506c1cb2cd8Shaad */
150793f3d2b8Schs ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
150893f3d2b8Schs metaslab_group_add(mg, ms);
1509c1cb2cd8Shaad
151093f3d2b8Schs metaslab_set_fragmentation(ms);
1511f59c7639Shaad
1512c1cb2cd8Shaad /*
1513c1cb2cd8Shaad * If we're opening an existing pool (txg == 0) or creating
1514c1cb2cd8Shaad * a new one (txg == TXG_INITIAL), all space is available now.
1515c1cb2cd8Shaad * If we're adding space to an existing pool, the new space
1516c1cb2cd8Shaad * does not become available until after this txg has synced.
151793f3d2b8Schs * The metaslab's weight will also be initialized when we sync
151893f3d2b8Schs * out this txg. This ensures that we don't attempt to allocate
151993f3d2b8Schs * from it before we have initialized it completely.
1520c1cb2cd8Shaad */
1521c1cb2cd8Shaad if (txg <= TXG_INITIAL)
152293f3d2b8Schs metaslab_sync_done(ms, 0);
152393f3d2b8Schs
152493f3d2b8Schs /*
152593f3d2b8Schs * If metaslab_debug_load is set and we're initializing a metaslab
152693f3d2b8Schs * that has an allocated space map object, then load its space
152793f3d2b8Schs * map so that we can verify frees.
152893f3d2b8Schs */
152993f3d2b8Schs if (metaslab_debug_load && ms->ms_sm != NULL) {
153093f3d2b8Schs mutex_enter(&ms->ms_lock);
153193f3d2b8Schs VERIFY0(metaslab_load(ms));
153293f3d2b8Schs mutex_exit(&ms->ms_lock);
153393f3d2b8Schs }
1534c1cb2cd8Shaad
1535c1cb2cd8Shaad if (txg != 0) {
1536c1cb2cd8Shaad vdev_dirty(vd, 0, NULL, txg);
153793f3d2b8Schs vdev_dirty(vd, VDD_METASLAB, ms, txg);
1538c1cb2cd8Shaad }
1539c1cb2cd8Shaad
154093f3d2b8Schs *msp = ms;
154193f3d2b8Schs
154293f3d2b8Schs return (0);
1543c1cb2cd8Shaad }
1544c1cb2cd8Shaad
1545c1cb2cd8Shaad void
1546c1cb2cd8Shaad metaslab_fini(metaslab_t *msp)
1547c1cb2cd8Shaad {
1548c1cb2cd8Shaad metaslab_group_t *mg = msp->ms_group;
1549c1cb2cd8Shaad
1550c1cb2cd8Shaad metaslab_group_remove(mg, msp);
1551c1cb2cd8Shaad
1552c1cb2cd8Shaad mutex_enter(&msp->ms_lock);
155393f3d2b8Schs VERIFY(msp->ms_group == NULL);
155493f3d2b8Schs vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
155593f3d2b8Schs 0, -msp->ms_size);
155693f3d2b8Schs space_map_close(msp->ms_sm);
1557c1cb2cd8Shaad
155893f3d2b8Schs metaslab_unload(msp);
155993f3d2b8Schs range_tree_destroy(msp->ms_tree);
1560c1cb2cd8Shaad
1561f59c7639Shaad for (int t = 0; t < TXG_SIZE; t++) {
156293f3d2b8Schs range_tree_destroy(msp->ms_alloctree[t]);
156393f3d2b8Schs range_tree_destroy(msp->ms_freetree[t]);
1564c1cb2cd8Shaad }
1565c1cb2cd8Shaad
156693f3d2b8Schs for (int t = 0; t < TXG_DEFER_SIZE; t++) {
156793f3d2b8Schs range_tree_destroy(msp->ms_defertree[t]);
156893f3d2b8Schs }
1569f59c7639Shaad
157093f3d2b8Schs ASSERT0(msp->ms_deferspace);
1571f59c7639Shaad
1572c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
157393f3d2b8Schs cv_destroy(&msp->ms_load_cv);
1574c1cb2cd8Shaad mutex_destroy(&msp->ms_lock);
1575c1cb2cd8Shaad
1576c1cb2cd8Shaad kmem_free(msp, sizeof (metaslab_t));
1577c1cb2cd8Shaad }
1578c1cb2cd8Shaad
157993f3d2b8Schs #define FRAGMENTATION_TABLE_SIZE 17
1580c1cb2cd8Shaad
158193f3d2b8Schs /*
158293f3d2b8Schs * This table defines a segment size based fragmentation metric that will
158393f3d2b8Schs * allow each metaslab to derive its own fragmentation value. This is done
158493f3d2b8Schs * by calculating the space in each bucket of the spacemap histogram and
158593f3d2b8Schs * multiplying that by the fragmentation metric in this table. Doing
158693f3d2b8Schs * this for all buckets and dividing it by the total amount of free
158793f3d2b8Schs * space in this metaslab (i.e. the total free space in all buckets) gives
158893f3d2b8Schs * us the fragmentation metric. This means that a high fragmentation metric
158993f3d2b8Schs * equates to most of the free space being comprised of small segments.
159093f3d2b8Schs * Conversely, if the metric is low, then most of the free space is in
159193f3d2b8Schs * large segments. A 10% change in fragmentation equates to approximately
159293f3d2b8Schs * double the number of segments.
159393f3d2b8Schs *
159493f3d2b8Schs * This table defines 0% fragmented space using 16MB segments. Testing has
159593f3d2b8Schs * shown that segments that are greater than or equal to 16MB do not suffer
159693f3d2b8Schs * from drastic performance problems. Using this value, we derive the rest
159793f3d2b8Schs * of the table. Since the fragmentation value is never stored on disk, it
159893f3d2b8Schs * is possible to change these calculations in the future.
159993f3d2b8Schs */
160093f3d2b8Schs int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
160193f3d2b8Schs 100, /* 512B */
160293f3d2b8Schs 100, /* 1K */
160393f3d2b8Schs 98, /* 2K */
160493f3d2b8Schs 95, /* 4K */
160593f3d2b8Schs 90, /* 8K */
160693f3d2b8Schs 80, /* 16K */
160793f3d2b8Schs 70, /* 32K */
160893f3d2b8Schs 60, /* 64K */
160993f3d2b8Schs 50, /* 128K */
161093f3d2b8Schs 40, /* 256K */
161193f3d2b8Schs 30, /* 512K */
161293f3d2b8Schs 20, /* 1M */
161393f3d2b8Schs 15, /* 2M */
161493f3d2b8Schs 10, /* 4M */
161593f3d2b8Schs 5, /* 8M */
161693f3d2b8Schs 0 /* 16M */
161793f3d2b8Schs };
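
/*
 * Worked example (illustrative numbers): if the histogram attributes 5GB
 * of a metaslab's 10GB of free space to the 512K row (factor 30) and the
 * other 5GB to the 1M row (factor 20), metaslab_set_fragmentation()
 * below computes (5G * 30 + 5G * 20) / 10G = 25, i.e. 25% fragmented.
 * Space accounted to the 16M row contributes nothing, so a metaslab
 * whose free space is entirely in segments of 16MB or more scores 0.
 */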
161893f3d2b8Schs
161993f3d2b8Schs /*
162093f3d2b8Schs * Calculate the metaslab's fragmentation metric. A value
162193f3d2b8Schs * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
162293f3d2b8Schs * not support this metric. Otherwise, the stored value should be in the
162393f3d2b8Schs * range [0, 100].
162493f3d2b8Schs */
162593f3d2b8Schs static void
162693f3d2b8Schs metaslab_set_fragmentation(metaslab_t *msp)
162793f3d2b8Schs {
162893f3d2b8Schs spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
162993f3d2b8Schs uint64_t fragmentation = 0;
163093f3d2b8Schs uint64_t total = 0;
163193f3d2b8Schs boolean_t feature_enabled = spa_feature_is_enabled(spa,
163293f3d2b8Schs SPA_FEATURE_SPACEMAP_HISTOGRAM);
163393f3d2b8Schs
163493f3d2b8Schs if (!feature_enabled) {
163593f3d2b8Schs msp->ms_fragmentation = ZFS_FRAG_INVALID;
163693f3d2b8Schs return;
163793f3d2b8Schs }
163893f3d2b8Schs
163993f3d2b8Schs /*
164093f3d2b8Schs * A null space map means that the entire metaslab is free
164193f3d2b8Schs * and thus is not fragmented.
164293f3d2b8Schs */
164393f3d2b8Schs if (msp->ms_sm == NULL) {
164493f3d2b8Schs msp->ms_fragmentation = 0;
164593f3d2b8Schs return;
164693f3d2b8Schs }
164793f3d2b8Schs
164893f3d2b8Schs /*
164993f3d2b8Schs * If this metaslab's space map has not been upgraded, flag it
165093f3d2b8Schs * so that we upgrade next time we encounter it.
165193f3d2b8Schs */
165293f3d2b8Schs if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
165393f3d2b8Schs uint64_t txg = spa_syncing_txg(spa);
165493f3d2b8Schs vdev_t *vd = msp->ms_group->mg_vd;
165593f3d2b8Schs
165693f3d2b8Schs if (spa_writeable(spa)) {
165793f3d2b8Schs msp->ms_condense_wanted = B_TRUE;
165893f3d2b8Schs vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
165993f3d2b8Schs spa_dbgmsg(spa, "txg %llu, requesting force condense: "
166093f3d2b8Schs "msp %p, vd %p", txg, msp, vd);
166193f3d2b8Schs }
166293f3d2b8Schs msp->ms_fragmentation = ZFS_FRAG_INVALID;
166393f3d2b8Schs return;
166493f3d2b8Schs }
166593f3d2b8Schs
166693f3d2b8Schs for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
166793f3d2b8Schs uint64_t space = 0;
166893f3d2b8Schs uint8_t shift = msp->ms_sm->sm_shift;
166993f3d2b8Schs
167093f3d2b8Schs int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
167193f3d2b8Schs FRAGMENTATION_TABLE_SIZE - 1);
167293f3d2b8Schs
167393f3d2b8Schs if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
167493f3d2b8Schs continue;
167593f3d2b8Schs
167693f3d2b8Schs space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
167793f3d2b8Schs total += space;
167893f3d2b8Schs
167993f3d2b8Schs ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
168093f3d2b8Schs fragmentation += space * zfs_frag_table[idx];
168193f3d2b8Schs }
168293f3d2b8Schs
168393f3d2b8Schs if (total > 0)
168493f3d2b8Schs fragmentation /= total;
168593f3d2b8Schs ASSERT3U(fragmentation, <=, 100);
168693f3d2b8Schs
168793f3d2b8Schs msp->ms_fragmentation = fragmentation;
168893f3d2b8Schs }
168993f3d2b8Schs
169093f3d2b8Schs /*
169193f3d2b8Schs * Compute a weight -- a selection preference value -- for the given metaslab.
169293f3d2b8Schs * This is based on the amount of free space, the level of fragmentation,
169393f3d2b8Schs * the LBA range, and whether the metaslab is loaded.
169493f3d2b8Schs */
1695c1cb2cd8Shaad static uint64_t
169693f3d2b8Schs metaslab_space_weight(metaslab_t *msp)
1697c1cb2cd8Shaad {
1698c1cb2cd8Shaad metaslab_group_t *mg = msp->ms_group;
1699c1cb2cd8Shaad vdev_t *vd = mg->mg_vd;
1700c1cb2cd8Shaad uint64_t weight, space;
1701c1cb2cd8Shaad
1702c1cb2cd8Shaad ASSERT(MUTEX_HELD(&msp->ms_lock));
170393f3d2b8Schs ASSERT(!vd->vdev_removing);
1704c1cb2cd8Shaad
1705c1cb2cd8Shaad /*
1706c1cb2cd8Shaad * The baseline weight is the metaslab's free space.
1707c1cb2cd8Shaad */
170893f3d2b8Schs space = msp->ms_size - space_map_allocated(msp->ms_sm);
170993f3d2b8Schs
171093f3d2b8Schs if (metaslab_fragmentation_factor_enabled &&
171193f3d2b8Schs msp->ms_fragmentation != ZFS_FRAG_INVALID) {
171293f3d2b8Schs /*
171393f3d2b8Schs * Use the fragmentation information to inversely scale
171493f3d2b8Schs * down the baseline weight. We need to ensure that we
171593f3d2b8Schs * don't exclude this metaslab completely when it's 100%
171693f3d2b8Schs * fragmented. To avoid this we reduce the fragmented value
171793f3d2b8Schs * by 1.
171893f3d2b8Schs */
171993f3d2b8Schs space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
172093f3d2b8Schs
172193f3d2b8Schs /*
172293f3d2b8Schs * If space < SPA_MINBLOCKSIZE, then we will not allocate from
172393f3d2b8Schs * this metaslab again. The fragmentation metric may have
172493f3d2b8Schs * decreased the space to something smaller than
172593f3d2b8Schs * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
172693f3d2b8Schs * so that we can consume any remaining space.
172793f3d2b8Schs */
172893f3d2b8Schs if (space > 0 && space < SPA_MINBLOCKSIZE)
172993f3d2b8Schs space = SPA_MINBLOCKSIZE;
173093f3d2b8Schs }
1731c1cb2cd8Shaad weight = space;
1732c1cb2cd8Shaad
1733c1cb2cd8Shaad /*
1734c1cb2cd8Shaad * Modern disks have uniform bit density and constant angular velocity.
1735c1cb2cd8Shaad * Therefore, the outer recording zones are faster (higher bandwidth)
1736c1cb2cd8Shaad * than the inner zones by the ratio of outer to inner track diameter,
1737c1cb2cd8Shaad * which is typically around 2:1. We account for this by assigning
1738c1cb2cd8Shaad * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1739c1cb2cd8Shaad * In effect, this means that we'll select the metaslab with the most
1740c1cb2cd8Shaad * free bandwidth rather than simply the one with the most free space.
1741c1cb2cd8Shaad */
174293f3d2b8Schs if (metaslab_lba_weighting_enabled) {
174393f3d2b8Schs weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1744c1cb2cd8Shaad ASSERT(weight >= space && weight <= 2 * space);
174593f3d2b8Schs }
1746c1cb2cd8Shaad
1747c1cb2cd8Shaad /*
1748f59c7639Shaad * If this metaslab is one we're actively using, adjust its
1749f59c7639Shaad * weight to make it preferable to any inactive metaslab so
175093f3d2b8Schs * we'll polish it off. If the fragmentation on this metaslab
175193f3d2b8Schs * has exceeded our threshold, then don't mark it active.
1752c1cb2cd8Shaad */
175393f3d2b8Schs if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
175493f3d2b8Schs msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1755c1cb2cd8Shaad weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1756f59c7639Shaad }
175793f3d2b8Schs
175893f3d2b8Schs WEIGHT_SET_SPACEBASED(weight);
175993f3d2b8Schs return (weight);
176093f3d2b8Schs }
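
/*
 * Illustrative effect of the LBA weighting above: on a vdev with 200
 * metaslabs, metaslab 0 has its space-based weight doubled, metaslab 100
 * gets roughly 1.5x (2w - 100w/200), and the last metaslab gets just
 * over 1x, so lower-LBA metaslabs with equal free space sort ahead of
 * higher-LBA ones.
 */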
176193f3d2b8Schs
176293f3d2b8Schs /*
176393f3d2b8Schs * Return the weight of the specified metaslab, according to the segment-based
176493f3d2b8Schs * weighting algorithm. The metaslab must be loaded. This function can
176593f3d2b8Schs * be called within a sync pass since it relies only on the metaslab's
176693f3d2b8Schs * range tree which is always accurate when the metaslab is loaded.
176793f3d2b8Schs */
176893f3d2b8Schs static uint64_t
176993f3d2b8Schs metaslab_weight_from_range_tree(metaslab_t *msp)
177093f3d2b8Schs {
177193f3d2b8Schs uint64_t weight = 0;
177293f3d2b8Schs uint32_t segments = 0;
177393f3d2b8Schs
177493f3d2b8Schs ASSERT(msp->ms_loaded);
177593f3d2b8Schs
177693f3d2b8Schs for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
177793f3d2b8Schs i--) {
177893f3d2b8Schs uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
177993f3d2b8Schs int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
178093f3d2b8Schs
178193f3d2b8Schs segments <<= 1;
178293f3d2b8Schs segments += msp->ms_tree->rt_histogram[i];
178393f3d2b8Schs
178493f3d2b8Schs /*
178593f3d2b8Schs * The range tree provides more precision than the space map
178693f3d2b8Schs * and must be downgraded so that all values fit within the
178793f3d2b8Schs * space map's histogram. This allows us to compare loaded
178893f3d2b8Schs * vs. unloaded metaslabs to determine which metaslab is
178993f3d2b8Schs * considered "best".
179093f3d2b8Schs */
179193f3d2b8Schs if (i > max_idx)
179293f3d2b8Schs continue;
179393f3d2b8Schs
179493f3d2b8Schs if (segments != 0) {
179593f3d2b8Schs WEIGHT_SET_COUNT(weight, segments);
179693f3d2b8Schs WEIGHT_SET_INDEX(weight, i);
179793f3d2b8Schs WEIGHT_SET_ACTIVE(weight, 0);
179893f3d2b8Schs break;
179993f3d2b8Schs }
180093f3d2b8Schs }
1801c1cb2cd8Shaad return (weight);
1802c1cb2cd8Shaad }
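
/*
 * Example of the downshifting above (with a hypothetical max_idx of 36):
 * a range tree histogram holding one segment at index 38 and two at
 * index 36 accumulates segments = ((1 << 1) << 1) + 2 = 6 by the time i
 * reaches 36, so the weight encodes a count of 6 at index 36. Each
 * oversized segment is counted as if it were split into pieces of the
 * largest size the space map histogram can represent.
 */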
1803c1cb2cd8Shaad
180493f3d2b8Schs /*
180593f3d2b8Schs * Calculate the weight based on the on-disk histogram. This should only
180693f3d2b8Schs * be called after a sync pass has completely finished since the on-disk
180793f3d2b8Schs * information is updated in metaslab_sync().
180893f3d2b8Schs */
180993f3d2b8Schs static uint64_t
181093f3d2b8Schs metaslab_weight_from_spacemap(metaslab_t *msp)
1811c1cb2cd8Shaad {
181293f3d2b8Schs uint64_t weight = 0;
1813f59c7639Shaad
181493f3d2b8Schs for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
181593f3d2b8Schs if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
181693f3d2b8Schs WEIGHT_SET_COUNT(weight,
181793f3d2b8Schs msp->ms_sm->sm_phys->smp_histogram[i]);
181893f3d2b8Schs WEIGHT_SET_INDEX(weight, i +
181993f3d2b8Schs msp->ms_sm->sm_shift);
182093f3d2b8Schs WEIGHT_SET_ACTIVE(weight, 0);
182193f3d2b8Schs break;
182293f3d2b8Schs }
182393f3d2b8Schs }
182493f3d2b8Schs return (weight);
182593f3d2b8Schs }
1826f59c7639Shaad
1827f59c7639Shaad /*
182893f3d2b8Schs * Compute a segment-based weight for the specified metaslab. The weight
182993f3d2b8Schs * is determined by the highest bucket in the histogram. The information
183093f3d2b8Schs * for the highest bucket is encoded into the weight value.
1831f59c7639Shaad */
183293f3d2b8Schs static uint64_t
183393f3d2b8Schs metaslab_segment_weight(metaslab_t *msp)
1834f59c7639Shaad {
1835f59c7639Shaad metaslab_group_t *mg = msp->ms_group;
183693f3d2b8Schs uint64_t weight = 0;
183793f3d2b8Schs uint8_t shift = mg->mg_vd->vdev_ashift;
1838c1cb2cd8Shaad
1839c1cb2cd8Shaad ASSERT(MUTEX_HELD(&msp->ms_lock));
1840c1cb2cd8Shaad
184193f3d2b8Schs /*
184293f3d2b8Schs * The metaslab is completely free.
184393f3d2b8Schs */
184493f3d2b8Schs if (space_map_allocated(msp->ms_sm) == 0) {
184593f3d2b8Schs int idx = highbit64(msp->ms_size) - 1;
184693f3d2b8Schs int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
184793f3d2b8Schs
184893f3d2b8Schs if (idx < max_idx) {
184993f3d2b8Schs WEIGHT_SET_COUNT(weight, 1ULL);
185093f3d2b8Schs WEIGHT_SET_INDEX(weight, idx);
185193f3d2b8Schs } else {
185293f3d2b8Schs WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
185393f3d2b8Schs WEIGHT_SET_INDEX(weight, max_idx);
185493f3d2b8Schs }
185593f3d2b8Schs WEIGHT_SET_ACTIVE(weight, 0);
185693f3d2b8Schs ASSERT(!WEIGHT_IS_SPACEBASED(weight));
185793f3d2b8Schs
185893f3d2b8Schs return (weight);
185993f3d2b8Schs }
186093f3d2b8Schs
186193f3d2b8Schs ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
186293f3d2b8Schs
186393f3d2b8Schs /*
186493f3d2b8Schs * If the metaslab is fully allocated then just make the weight 0.
186593f3d2b8Schs */
186693f3d2b8Schs if (space_map_allocated(msp->ms_sm) == msp->ms_size)
186793f3d2b8Schs return (0);
186893f3d2b8Schs /*
186993f3d2b8Schs * If the metaslab is already loaded, then use the range tree to
187093f3d2b8Schs * determine the weight. Otherwise, we rely on the space map information
187193f3d2b8Schs * to generate the weight.
187293f3d2b8Schs */
187393f3d2b8Schs if (msp->ms_loaded) {
187493f3d2b8Schs weight = metaslab_weight_from_range_tree(msp);
187593f3d2b8Schs } else {
187693f3d2b8Schs weight = metaslab_weight_from_spacemap(msp);
187793f3d2b8Schs }
187893f3d2b8Schs
187993f3d2b8Schs /*
188093f3d2b8Schs * If the metaslab was active the last time we calculated its weight
188193f3d2b8Schs * then keep it active. We want to consume the entire region that
188293f3d2b8Schs * is associated with this weight.
188393f3d2b8Schs */
188493f3d2b8Schs if (msp->ms_activation_weight != 0 && weight != 0)
188593f3d2b8Schs WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
188693f3d2b8Schs return (weight);
188793f3d2b8Schs }
188893f3d2b8Schs
188993f3d2b8Schs /*
189093f3d2b8Schs * Determine if we should attempt to allocate from this metaslab. If the
189193f3d2b8Schs * metaslab has a maximum size then we can quickly determine if the desired
189293f3d2b8Schs * allocation size can be satisfied. Otherwise, if we're using segment-based
189393f3d2b8Schs * weighting then we can determine the maximum allocation that this metaslab
189493f3d2b8Schs * can accommodate based on the index encoded in the weight. If we're using
189593f3d2b8Schs * space-based weights then rely on the entire weight (excluding the weight
189693f3d2b8Schs * type bit).
189793f3d2b8Schs */
189893f3d2b8Schs boolean_t
189993f3d2b8Schs metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
190093f3d2b8Schs {
190193f3d2b8Schs boolean_t should_allocate;
190293f3d2b8Schs
190393f3d2b8Schs if (msp->ms_max_size != 0)
190493f3d2b8Schs return (msp->ms_max_size >= asize);
190593f3d2b8Schs
190693f3d2b8Schs if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
190793f3d2b8Schs /*
190893f3d2b8Schs * The metaslab segment weight indicates segments in the
190993f3d2b8Schs * range [2^i, 2^(i+1)), where i is the index in the weight.
191093f3d2b8Schs * Since the asize might be in the middle of the range, we
191193f3d2b8Schs * should attempt the allocation if asize < 2^(i+1).
191293f3d2b8Schs */
191393f3d2b8Schs should_allocate = (asize <
191493f3d2b8Schs 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
191593f3d2b8Schs } else {
191693f3d2b8Schs should_allocate = (asize <=
191793f3d2b8Schs (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
191893f3d2b8Schs }
191993f3d2b8Schs return (should_allocate);
192093f3d2b8Schs }
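
/*
 * Example (illustrative): a segment-based weight with index 17 promises
 * free segments somewhere in [128K, 256K), so any request smaller than
 * 256K (asize < 1ULL << 18) is worth attempting even though it may still
 * be larger than the segments actually present. A space-based weight, by
 * contrast, is compared directly against asize once the weight type bit
 * is masked off.
 */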
192193f3d2b8Schs
192293f3d2b8Schs static uint64_t
192393f3d2b8Schs metaslab_weight(metaslab_t *msp)
192493f3d2b8Schs {
192593f3d2b8Schs vdev_t *vd = msp->ms_group->mg_vd;
192693f3d2b8Schs spa_t *spa = vd->vdev_spa;
192793f3d2b8Schs uint64_t weight;
192893f3d2b8Schs
192993f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
193093f3d2b8Schs
193193f3d2b8Schs /*
193293f3d2b8Schs * This vdev is in the process of being removed so there is nothing
193393f3d2b8Schs * for us to do here.
193493f3d2b8Schs */
193593f3d2b8Schs if (vd->vdev_removing) {
193693f3d2b8Schs ASSERT0(space_map_allocated(msp->ms_sm));
193793f3d2b8Schs ASSERT0(vd->vdev_ms_shift);
193893f3d2b8Schs return (0);
193993f3d2b8Schs }
194093f3d2b8Schs
194193f3d2b8Schs metaslab_set_fragmentation(msp);
194293f3d2b8Schs
194393f3d2b8Schs /*
194493f3d2b8Schs * Update the maximum size if the metaslab is loaded. This will
194593f3d2b8Schs * ensure that we get an accurate maximum size if newly freed space
194693f3d2b8Schs * has been added back into the free tree.
194793f3d2b8Schs */
194893f3d2b8Schs if (msp->ms_loaded)
194993f3d2b8Schs msp->ms_max_size = metaslab_block_maxsize(msp);
195093f3d2b8Schs
195193f3d2b8Schs /*
195293f3d2b8Schs * Segment-based weighting requires space map histogram support.
195393f3d2b8Schs */
195493f3d2b8Schs if (zfs_metaslab_segment_weight_enabled &&
195593f3d2b8Schs spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
195693f3d2b8Schs (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
195793f3d2b8Schs sizeof (space_map_phys_t))) {
195893f3d2b8Schs weight = metaslab_segment_weight(msp);
195993f3d2b8Schs } else {
196093f3d2b8Schs weight = metaslab_space_weight(msp);
196193f3d2b8Schs }
196293f3d2b8Schs return (weight);
196393f3d2b8Schs }
196493f3d2b8Schs
196593f3d2b8Schs static int
196693f3d2b8Schs metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
196793f3d2b8Schs {
196893f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
196993f3d2b8Schs
1970c1cb2cd8Shaad if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
197193f3d2b8Schs metaslab_load_wait(msp);
197293f3d2b8Schs if (!msp->ms_loaded) {
197393f3d2b8Schs int error = metaslab_load(msp);
1974c1cb2cd8Shaad if (error) {
1975c1cb2cd8Shaad metaslab_group_sort(msp->ms_group, msp, 0);
1976c1cb2cd8Shaad return (error);
1977c1cb2cd8Shaad }
1978f59c7639Shaad }
1979f59c7639Shaad
198093f3d2b8Schs msp->ms_activation_weight = msp->ms_weight;
1981c1cb2cd8Shaad metaslab_group_sort(msp->ms_group, msp,
1982c1cb2cd8Shaad msp->ms_weight | activation_weight);
1983c1cb2cd8Shaad }
198493f3d2b8Schs ASSERT(msp->ms_loaded);
1985c1cb2cd8Shaad ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1986c1cb2cd8Shaad
1987c1cb2cd8Shaad return (0);
1988c1cb2cd8Shaad }
1989c1cb2cd8Shaad
1990c1cb2cd8Shaad static void
199193f3d2b8Schs metaslab_passivate(metaslab_t *msp, uint64_t weight)
1992c1cb2cd8Shaad {
199393f3d2b8Schs uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
199493f3d2b8Schs
1995c1cb2cd8Shaad /*
1996c1cb2cd8Shaad * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1997c1cb2cd8Shaad * this metaslab again. In that case, it had better be empty,
1998c1cb2cd8Shaad * or we would be leaving space on the table.
1999c1cb2cd8Shaad */
200093f3d2b8Schs ASSERT(size >= SPA_MINBLOCKSIZE ||
200193f3d2b8Schs range_tree_space(msp->ms_tree) == 0);
200293f3d2b8Schs ASSERT0(weight & METASLAB_ACTIVE_MASK);
200393f3d2b8Schs
200493f3d2b8Schs msp->ms_activation_weight = 0;
200593f3d2b8Schs metaslab_group_sort(msp->ms_group, msp, weight);
2006c1cb2cd8Shaad ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2007c1cb2cd8Shaad }
2008c1cb2cd8Shaad
2009c1cb2cd8Shaad /*
201093f3d2b8Schs * Segment-based metaslabs are activated once and remain active until
201193f3d2b8Schs * we either fail an allocation attempt (similar to space-based metaslabs)
201293f3d2b8Schs * or have exhausted the free space in zfs_metaslab_switch_threshold
201393f3d2b8Schs * buckets since the metaslab was activated. This function checks to see
201493f3d2b8Schs * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
201593f3d2b8Schs * metaslab and passivates it proactively. This will allow us to select a
201693f3d2b8Schs * metaslab with a larger contiguous region, if any remain within this
201793f3d2b8Schs * metaslab group. If we're in sync pass > 1, then we continue using this
201893f3d2b8Schs * metaslab so that we don't dirty more blocks and cause more sync passes.
201993f3d2b8Schs */
202093f3d2b8Schs void
202193f3d2b8Schs metaslab_segment_may_passivate(metaslab_t *msp)
202293f3d2b8Schs {
202393f3d2b8Schs spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
202493f3d2b8Schs
202593f3d2b8Schs if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
202693f3d2b8Schs return;
202793f3d2b8Schs
202893f3d2b8Schs /*
202993f3d2b8Schs * Since we are in the middle of a sync pass, the most accurate
203093f3d2b8Schs * information that is accessible to us is the in-core range tree
203193f3d2b8Schs * histogram; calculate the new weight based on that information.
203293f3d2b8Schs */
203393f3d2b8Schs uint64_t weight = metaslab_weight_from_range_tree(msp);
203493f3d2b8Schs int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
203593f3d2b8Schs int current_idx = WEIGHT_GET_INDEX(weight);
203693f3d2b8Schs
203793f3d2b8Schs if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
203893f3d2b8Schs metaslab_passivate(msp, weight);
203993f3d2b8Schs }
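
/*
 * Example (illustrative): a metaslab activated with a segment weight at
 * index 23 (8M-16M segments) keeps its activation until the in-core
 * histogram shows its largest remaining bucket has dropped to index
 * 23 - zfs_metaslab_switch_threshold or below; at that point it is
 * passivated here so the group can promote a metaslab that still has
 * larger contiguous regions.
 */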
204093f3d2b8Schs
204193f3d2b8Schs static void
204293f3d2b8Schs metaslab_preload(void *arg)
204393f3d2b8Schs {
204493f3d2b8Schs metaslab_t *msp = arg;
204593f3d2b8Schs spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
204693f3d2b8Schs
204793f3d2b8Schs ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
204893f3d2b8Schs
204993f3d2b8Schs mutex_enter(&msp->ms_lock);
205093f3d2b8Schs metaslab_load_wait(msp);
205193f3d2b8Schs if (!msp->ms_loaded)
205293f3d2b8Schs (void) metaslab_load(msp);
205393f3d2b8Schs msp->ms_selected_txg = spa_syncing_txg(spa);
205493f3d2b8Schs mutex_exit(&msp->ms_lock);
205593f3d2b8Schs }
205693f3d2b8Schs
205793f3d2b8Schs static void
205893f3d2b8Schs metaslab_group_preload(metaslab_group_t *mg)
205993f3d2b8Schs {
206093f3d2b8Schs spa_t *spa = mg->mg_vd->vdev_spa;
206193f3d2b8Schs metaslab_t *msp;
206293f3d2b8Schs avl_tree_t *t = &mg->mg_metaslab_tree;
206393f3d2b8Schs int m = 0;
206493f3d2b8Schs
206593f3d2b8Schs if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
206693f3d2b8Schs taskq_wait(mg->mg_taskq);
206793f3d2b8Schs return;
206893f3d2b8Schs }
206993f3d2b8Schs
207093f3d2b8Schs mutex_enter(&mg->mg_lock);
207193f3d2b8Schs /*
207293f3d2b8Schs * Load the next potential metaslabs
207393f3d2b8Schs */
207493f3d2b8Schs for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
207593f3d2b8Schs /*
207693f3d2b8Schs * We preload only the maximum number of metaslabs specified
207793f3d2b8Schs * by metaslab_preload_limit. If a metaslab is being forced
207893f3d2b8Schs * to condense then we preload it too. This will ensure
207993f3d2b8Schs * that force condensing happens in the next txg.
208093f3d2b8Schs */
208193f3d2b8Schs if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
208293f3d2b8Schs continue;
208393f3d2b8Schs }
208493f3d2b8Schs
208593f3d2b8Schs VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
208693f3d2b8Schs msp, TQ_SLEEP) != 0);
208793f3d2b8Schs }
208893f3d2b8Schs mutex_exit(&mg->mg_lock);
208993f3d2b8Schs }
209093f3d2b8Schs
209193f3d2b8Schs /*
209293f3d2b8Schs * Determine if the space map's on-disk footprint is past our tolerance
209393f3d2b8Schs * for inefficiency. We would like to use the following criteria to make
209493f3d2b8Schs * our decision:
209593f3d2b8Schs *
209693f3d2b8Schs * 1. The size of the space map object should not dramatically increase as a
209793f3d2b8Schs * result of writing out the free space range tree.
209893f3d2b8Schs *
209993f3d2b8Schs * 2. The minimal on-disk space map representation is zfs_condense_pct/100
210093f3d2b8Schs * times the size than the free space range tree representation
210193f3d2b8Schs * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB).
210293f3d2b8Schs *
210393f3d2b8Schs * 3. The on-disk size of the space map should actually decrease.
210493f3d2b8Schs *
210593f3d2b8Schs * Checking the first condition is tricky since we don't want to walk
210693f3d2b8Schs * the entire AVL tree calculating the estimated on-disk size. Instead we
210793f3d2b8Schs * use the size-ordered range tree in the metaslab and calculate the
210893f3d2b8Schs * size required to write out the largest segment in our free tree. If the
210993f3d2b8Schs * size required to represent that segment on disk is larger than the space
211093f3d2b8Schs * map object then we avoid condensing this map.
211193f3d2b8Schs *
211293f3d2b8Schs * To determine the second criterion we use a best-case estimate and assume
211393f3d2b8Schs * each segment can be represented on-disk as a single 64-bit entry. We refer
211493f3d2b8Schs * to this best-case estimate as the space map's minimal form.
211593f3d2b8Schs *
211693f3d2b8Schs * Unfortunately, we cannot compute the on-disk size of the space map in this
211793f3d2b8Schs * context because we cannot accurately compute the effects of compression, etc.
211893f3d2b8Schs * Instead, we apply the heuristic described in the block comment for
211993f3d2b8Schs * zfs_metaslab_condense_block_threshold - we only condense if the space used
212093f3d2b8Schs * is greater than a threshold number of blocks.
212193f3d2b8Schs */
212293f3d2b8Schs static boolean_t
212393f3d2b8Schs metaslab_should_condense(metaslab_t *msp)
212493f3d2b8Schs {
212593f3d2b8Schs space_map_t *sm = msp->ms_sm;
212693f3d2b8Schs range_seg_t *rs;
212793f3d2b8Schs uint64_t size, entries, segsz, object_size, optimal_size, record_size;
212893f3d2b8Schs dmu_object_info_t doi;
212993f3d2b8Schs uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
213093f3d2b8Schs
213193f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
213293f3d2b8Schs ASSERT(msp->ms_loaded);
213393f3d2b8Schs
213493f3d2b8Schs /*
213593f3d2b8Schs * Use the ms_size_tree range tree, which is ordered by size, to
213693f3d2b8Schs * obtain the largest segment in the free tree. We always condense
213793f3d2b8Schs * metaslabs that are empty and metaslabs for which a condense
213893f3d2b8Schs * request has been made.
213993f3d2b8Schs */
214093f3d2b8Schs rs = avl_last(&msp->ms_size_tree);
214193f3d2b8Schs if (rs == NULL || msp->ms_condense_wanted)
214293f3d2b8Schs return (B_TRUE);
214393f3d2b8Schs
214493f3d2b8Schs /*
214593f3d2b8Schs * Calculate the number of 64-bit entries this segment would
214693f3d2b8Schs * require when written to disk. If this single segment would be
214793f3d2b8Schs * larger on-disk than the entire current on-disk structure, then
214893f3d2b8Schs * clearly condensing will increase the on-disk structure size.
214993f3d2b8Schs */
215093f3d2b8Schs size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
215193f3d2b8Schs entries = size / (MIN(size, SM_RUN_MAX));
215293f3d2b8Schs segsz = entries * sizeof (uint64_t);
215393f3d2b8Schs
215493f3d2b8Schs optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
215593f3d2b8Schs object_size = space_map_length(msp->ms_sm);
215693f3d2b8Schs
215793f3d2b8Schs dmu_object_info_from_db(sm->sm_dbuf, &doi);
215893f3d2b8Schs record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
215993f3d2b8Schs
216093f3d2b8Schs return (segsz <= object_size &&
216193f3d2b8Schs object_size >= (optimal_size * zfs_condense_pct / 100) &&
216293f3d2b8Schs object_size > zfs_metaslab_condense_block_threshold * record_size);
216393f3d2b8Schs }
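
/*
 * Worked example of the test above (illustrative figures): a loaded
 * metaslab whose free tree holds 100,000 segments has an optimal_size of
 * 100,000 * 8 bytes = 800,000 bytes. With zfs_condense_pct at, say, 200,
 * the current space map object must be at least 1,600,000 bytes long,
 * and must also exceed the block-count threshold, before condensing is
 * considered worthwhile; below that, rewriting the map would buy too
 * little to justify the extra I/O.
 */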
216493f3d2b8Schs
216593f3d2b8Schs /*
216693f3d2b8Schs * Condense the on-disk space map representation to its minimized form.
216793f3d2b8Schs * The minimized form consists of a small number of allocations followed by
216893f3d2b8Schs * the entries of the free range tree.
216993f3d2b8Schs */
217093f3d2b8Schs static void
217193f3d2b8Schs metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
217293f3d2b8Schs {
217393f3d2b8Schs spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
217493f3d2b8Schs range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
217593f3d2b8Schs range_tree_t *condense_tree;
217693f3d2b8Schs space_map_t *sm = msp->ms_sm;
217793f3d2b8Schs
217893f3d2b8Schs ASSERT(MUTEX_HELD(&msp->ms_lock));
217993f3d2b8Schs ASSERT3U(spa_sync_pass(spa), ==, 1);
218093f3d2b8Schs ASSERT(msp->ms_loaded);
218193f3d2b8Schs
218293f3d2b8Schs
218393f3d2b8Schs spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
218493f3d2b8Schs "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
218593f3d2b8Schs msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
218693f3d2b8Schs msp->ms_group->mg_vd->vdev_spa->spa_name,
218793f3d2b8Schs space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
218893f3d2b8Schs msp->ms_condense_wanted ? "TRUE" : "FALSE");
218993f3d2b8Schs
219093f3d2b8Schs msp->ms_condense_wanted = B_FALSE;
219193f3d2b8Schs
219293f3d2b8Schs /*
219393f3d2b8Schs * Create a range tree that is 100% allocated. We remove segments
219493f3d2b8Schs * that have been freed in this txg, any deferred frees that exist,
219593f3d2b8Schs * and any allocation in the future. Removing segments should be
219693f3d2b8Schs * a relatively inexpensive operation since we expect these trees to
219793f3d2b8Schs * have a small number of nodes.
219893f3d2b8Schs */
219993f3d2b8Schs condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
220093f3d2b8Schs range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
220193f3d2b8Schs
220293f3d2b8Schs /*
220393f3d2b8Schs * Remove what's been freed in this txg from the condense_tree.
220493f3d2b8Schs * Since we're in sync_pass 1, we know that all the frees from
220593f3d2b8Schs * this txg are in the freetree.
220693f3d2b8Schs */
220793f3d2b8Schs range_tree_walk(freetree, range_tree_remove, condense_tree);
220893f3d2b8Schs
220993f3d2b8Schs for (int t = 0; t < TXG_DEFER_SIZE; t++) {
221093f3d2b8Schs range_tree_walk(msp->ms_defertree[t],
221193f3d2b8Schs range_tree_remove, condense_tree);
221293f3d2b8Schs }
221393f3d2b8Schs
221493f3d2b8Schs for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
221593f3d2b8Schs range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
221693f3d2b8Schs range_tree_remove, condense_tree);
221793f3d2b8Schs }
221893f3d2b8Schs
221993f3d2b8Schs /*
222093f3d2b8Schs * We're about to drop the metaslab's lock thus allowing
222193f3d2b8Schs * other consumers to change its content. Set the
222293f3d2b8Schs * metaslab's ms_condensing flag to ensure that
222393f3d2b8Schs * allocations on this metaslab do not occur while we're
222493f3d2b8Schs * in the middle of committing it to disk. This is only critical
222593f3d2b8Schs * for the ms_tree as all other range trees use per txg
222693f3d2b8Schs * views of their content.
222793f3d2b8Schs */
222893f3d2b8Schs msp->ms_condensing = B_TRUE;
222993f3d2b8Schs
223093f3d2b8Schs mutex_exit(&msp->ms_lock);
223193f3d2b8Schs space_map_truncate(sm, tx);
223293f3d2b8Schs mutex_enter(&msp->ms_lock);
223393f3d2b8Schs
223493f3d2b8Schs /*
223593f3d2b8Schs * While we would ideally like to create a space map representation
223693f3d2b8Schs * that consists only of allocation records, doing so can be
223793f3d2b8Schs * prohibitively expensive because the in-core free tree can be
223893f3d2b8Schs * large, and therefore computationally expensive to subtract
223993f3d2b8Schs * from the condense_tree. Instead we sync out two trees, a cheap
224093f3d2b8Schs * allocation only tree followed by the in-core free tree. While not
224193f3d2b8Schs * optimal, this is typically close to optimal, and much cheaper to
224293f3d2b8Schs * compute.
224393f3d2b8Schs */
224493f3d2b8Schs space_map_write(sm, condense_tree, SM_ALLOC, tx);
224593f3d2b8Schs range_tree_vacate(condense_tree, NULL, NULL);
224693f3d2b8Schs range_tree_destroy(condense_tree);
224793f3d2b8Schs
224893f3d2b8Schs space_map_write(sm, msp->ms_tree, SM_FREE, tx);
224993f3d2b8Schs msp->ms_condensing = B_FALSE;
225093f3d2b8Schs }
225193f3d2b8Schs
225293f3d2b8Schs /*
2253c1cb2cd8Shaad * Write a metaslab to disk in the context of the specified transaction group.
2254c1cb2cd8Shaad */
2255c1cb2cd8Shaad void
2256c1cb2cd8Shaad metaslab_sync(metaslab_t *msp, uint64_t txg)
2257c1cb2cd8Shaad {
225893f3d2b8Schs metaslab_group_t *mg = msp->ms_group;
225993f3d2b8Schs vdev_t *vd = mg->mg_vd;
2260c1cb2cd8Shaad spa_t *spa = vd->vdev_spa;
2261f59c7639Shaad objset_t *mos = spa_meta_objset(spa);
226293f3d2b8Schs range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
226393f3d2b8Schs range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
226493f3d2b8Schs range_tree_t **freed_tree =
226593f3d2b8Schs &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2266c1cb2cd8Shaad dmu_tx_t *tx;
226793f3d2b8Schs uint64_t object = space_map_object(msp->ms_sm);
2268c1cb2cd8Shaad
2269f59c7639Shaad ASSERT(!vd->vdev_ishole);
2270f59c7639Shaad
227193f3d2b8Schs /*
227293f3d2b8Schs * This metaslab has just been added so there's no work to do now.
227393f3d2b8Schs */
227493f3d2b8Schs if (*freetree == NULL) {
227593f3d2b8Schs ASSERT3P(alloctree, ==, NULL);
227693f3d2b8Schs return;
227793f3d2b8Schs }
227893f3d2b8Schs
227993f3d2b8Schs ASSERT3P(alloctree, !=, NULL);
228093f3d2b8Schs ASSERT3P(*freetree, !=, NULL);
228193f3d2b8Schs ASSERT3P(*freed_tree, !=, NULL);
228293f3d2b8Schs
228393f3d2b8Schs /*
228493f3d2b8Schs * Normally, we don't want to process a metaslab if there
228593f3d2b8Schs * are no allocations or frees to perform. However, if the metaslab
228693f3d2b8Schs * is being forced to condense we need to let it through.
228793f3d2b8Schs */
228893f3d2b8Schs if (range_tree_space(alloctree) == 0 &&
228993f3d2b8Schs range_tree_space(*freetree) == 0 &&
229093f3d2b8Schs !msp->ms_condense_wanted)
2291f59c7639Shaad return;
2292c1cb2cd8Shaad
2293c1cb2cd8Shaad /*
2294c1cb2cd8Shaad * The only state that can actually be changing concurrently with
229593f3d2b8Schs * metaslab_sync() is the metaslab's ms_tree. No other thread can
229693f3d2b8Schs * be modifying this txg's alloctree, freetree, freed_tree, or
229793f3d2b8Schs * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
229893f3d2b8Schs * space map ASSERTs. We drop it whenever we call into the DMU,
229993f3d2b8Schs * because the DMU can call down to us (e.g. via zio_free()) at
230093f3d2b8Schs * any time.
2301c1cb2cd8Shaad */
2302f59c7639Shaad
2303f59c7639Shaad tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2304c1cb2cd8Shaad
230593f3d2b8Schs if (msp->ms_sm == NULL) {
230693f3d2b8Schs uint64_t new_object;
230793f3d2b8Schs
230893f3d2b8Schs new_object = space_map_alloc(mos, tx);
230993f3d2b8Schs VERIFY3U(new_object, !=, 0);
231093f3d2b8Schs
231193f3d2b8Schs VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
231293f3d2b8Schs msp->ms_start, msp->ms_size, vd->vdev_ashift,
231393f3d2b8Schs &msp->ms_lock));
231493f3d2b8Schs ASSERT(msp->ms_sm != NULL);
2315c1cb2cd8Shaad }
2316c1cb2cd8Shaad
2317f59c7639Shaad mutex_enter(&msp->ms_lock);
2318f59c7639Shaad
2319c1cb2cd8Shaad /*
232093f3d2b8Schs * Note: metaslab_condense() clears the space map's histogram.
232193f3d2b8Schs * Therefore we must verify and remove this histogram before
232293f3d2b8Schs * condensing.
2323c1cb2cd8Shaad */
232493f3d2b8Schs metaslab_group_histogram_verify(mg);
232593f3d2b8Schs metaslab_class_histogram_verify(mg->mg_class);
232693f3d2b8Schs metaslab_group_histogram_remove(mg, msp);
2327c1cb2cd8Shaad
232893f3d2b8Schs if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
232993f3d2b8Schs metaslab_should_condense(msp)) {
233093f3d2b8Schs metaslab_condense(msp, txg, tx);
233193f3d2b8Schs } else {
233293f3d2b8Schs space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
233393f3d2b8Schs space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
2334c1cb2cd8Shaad }
2335c1cb2cd8Shaad
233693f3d2b8Schs if (msp->ms_loaded) {
233793f3d2b8Schs /*
233893f3d2b8Schs * When the space map is loaded, we have an accurate
233993f3d2b8Schs * histogram in the range tree. This gives us an opportunity
234093f3d2b8Schs * to bring the space map's histogram up-to-date so we clear
234193f3d2b8Schs * it first before updating it.
234293f3d2b8Schs */
234393f3d2b8Schs space_map_histogram_clear(msp->ms_sm);
234493f3d2b8Schs space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
234593f3d2b8Schs
234693f3d2b8Schs /*
234793f3d2b8Schs * Since we've cleared the histogram we need to add back
234893f3d2b8Schs * any free space that has already been processed, plus
234993f3d2b8Schs * any deferred space. This allows the on-disk histogram
235093f3d2b8Schs * to accurately reflect all free space even if some space
235193f3d2b8Schs * is not yet available for allocation (i.e. deferred).
235293f3d2b8Schs */
235393f3d2b8Schs space_map_histogram_add(msp->ms_sm, *freed_tree, tx);
235493f3d2b8Schs
235593f3d2b8Schs /*
235693f3d2b8Schs * Add back any deferred free space that has not been
235793f3d2b8Schs * added back into the in-core free tree yet. This will
235893f3d2b8Schs * ensure that we don't end up with a space map histogram
235993f3d2b8Schs * that is completely empty unless the metaslab is fully
236093f3d2b8Schs * allocated.
236193f3d2b8Schs */
236293f3d2b8Schs for (int t = 0; t < TXG_DEFER_SIZE; t++) {
236393f3d2b8Schs space_map_histogram_add(msp->ms_sm,
236493f3d2b8Schs msp->ms_defertree[t], tx);
236593f3d2b8Schs }
236693f3d2b8Schs }
236793f3d2b8Schs
236893f3d2b8Schs /*
236993f3d2b8Schs * Always add the free space from this sync pass to the space
237093f3d2b8Schs * map histogram. We want to make sure that the on-disk histogram
237193f3d2b8Schs * accounts for all free space. If the space map is not loaded,
237293f3d2b8Schs * then we will lose some accuracy but will correct it the next
237393f3d2b8Schs * time we load the space map.
237493f3d2b8Schs */
237593f3d2b8Schs space_map_histogram_add(msp->ms_sm, *freetree, tx);
237693f3d2b8Schs
237793f3d2b8Schs metaslab_group_histogram_add(mg, msp);
237893f3d2b8Schs metaslab_group_histogram_verify(mg);
237993f3d2b8Schs metaslab_class_histogram_verify(mg->mg_class);
238093f3d2b8Schs
238193f3d2b8Schs /*
238293f3d2b8Schs * For sync pass 1, we avoid traversing this txg's free range tree
238393f3d2b8Schs * and instead will just swap the pointers for freetree and
238493f3d2b8Schs * freed_tree. We can safely do this since the freed_tree is
238593f3d2b8Schs * guaranteed to be empty on the initial pass.
238693f3d2b8Schs */
238793f3d2b8Schs if (spa_sync_pass(spa) == 1) {
238893f3d2b8Schs range_tree_swap(freetree, freed_tree);
238993f3d2b8Schs } else {
239093f3d2b8Schs range_tree_vacate(*freetree, range_tree_add, *freed_tree);
239193f3d2b8Schs }
239293f3d2b8Schs range_tree_vacate(alloctree, NULL, NULL);
239393f3d2b8Schs
239493f3d2b8Schs ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
239593f3d2b8Schs ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
239693f3d2b8Schs ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2397c1cb2cd8Shaad
2398c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
2399c1cb2cd8Shaad
240093f3d2b8Schs if (object != space_map_object(msp->ms_sm)) {
240193f3d2b8Schs object = space_map_object(msp->ms_sm);
240293f3d2b8Schs dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
240393f3d2b8Schs msp->ms_id, sizeof (uint64_t), &object, tx);
240493f3d2b8Schs }
2405c1cb2cd8Shaad dmu_tx_commit(tx);
2406c1cb2cd8Shaad }
2407c1cb2cd8Shaad
2408c1cb2cd8Shaad /*
2409c1cb2cd8Shaad * Called after a transaction group has completely synced to mark
2410c1cb2cd8Shaad * all of the metaslab's free space as usable.
2411c1cb2cd8Shaad */
2412c1cb2cd8Shaad void
2413c1cb2cd8Shaad metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2414c1cb2cd8Shaad {
2415c1cb2cd8Shaad metaslab_group_t *mg = msp->ms_group;
2416c1cb2cd8Shaad vdev_t *vd = mg->mg_vd;
241793f3d2b8Schs spa_t *spa = vd->vdev_spa;
241893f3d2b8Schs range_tree_t **freed_tree;
241993f3d2b8Schs range_tree_t **defer_tree;
2420f59c7639Shaad int64_t alloc_delta, defer_delta;
242193f3d2b8Schs boolean_t defer_allowed = B_TRUE;
2422f59c7639Shaad
2423f59c7639Shaad ASSERT(!vd->vdev_ishole);
2424c1cb2cd8Shaad
2425c1cb2cd8Shaad mutex_enter(&msp->ms_lock);
2426c1cb2cd8Shaad
2427c1cb2cd8Shaad /*
2428c1cb2cd8Shaad * If this metaslab is just becoming available, initialize its
242993f3d2b8Schs * alloctrees, freetrees, and defertree and add its capacity to
243093f3d2b8Schs * the vdev.
2431c1cb2cd8Shaad */
243293f3d2b8Schs if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
2433f59c7639Shaad for (int t = 0; t < TXG_SIZE; t++) {
243493f3d2b8Schs ASSERT(msp->ms_alloctree[t] == NULL);
243593f3d2b8Schs ASSERT(msp->ms_freetree[t] == NULL);
243693f3d2b8Schs
243793f3d2b8Schs msp->ms_alloctree[t] = range_tree_create(NULL, msp,
243893f3d2b8Schs &msp->ms_lock);
243993f3d2b8Schs msp->ms_freetree[t] = range_tree_create(NULL, msp,
244093f3d2b8Schs &msp->ms_lock);
2441c1cb2cd8Shaad }
2442f59c7639Shaad
244393f3d2b8Schs for (int t = 0; t < TXG_DEFER_SIZE; t++) {
244493f3d2b8Schs ASSERT(msp->ms_defertree[t] == NULL);
2445f59c7639Shaad
244693f3d2b8Schs msp->ms_defertree[t] = range_tree_create(NULL, msp,
244793f3d2b8Schs &msp->ms_lock);
2448c1cb2cd8Shaad }
2449c1cb2cd8Shaad
245093f3d2b8Schs vdev_space_update(vd, 0, 0, msp->ms_size);
245193f3d2b8Schs }
245293f3d2b8Schs
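/*
 * freed_tree holds the blocks that were freed while syncing this txg;
 * defer_tree is the defer bucket whose contents are now old enough to
 * be returned to the allocatable range tree below.
 */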
245393f3d2b8Schs freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
245493f3d2b8Schs defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
245593f3d2b8Schs
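/*
 * Determine whether frees may be deferred this txg.  If free space in
 * the normal class has dropped to the slop reserve or below, skip the
 * deferral so the freed space becomes allocatable right away.
 */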
245693f3d2b8Schs uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
245793f3d2b8Schs metaslab_class_get_alloc(spa_normal_class(spa));
245893f3d2b8Schs if (free_space <= spa_get_slop_space(spa)) {
245993f3d2b8Schs defer_allowed = B_FALSE;
246093f3d2b8Schs }
246193f3d2b8Schs
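/*
 * alloc_delta is the net change in allocated space recorded by the
 * space map this txg.  defer_delta tracks the change in deferred
 * space: with deferral, this txg's frees enter the defer bucket while
 * the oldest bucket drains; without it, only the old bucket drains.
 */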
246293f3d2b8Schs defer_delta = 0;
246393f3d2b8Schs alloc_delta = space_map_alloc_delta(msp->ms_sm);
246493f3d2b8Schs if (defer_allowed) {
246593f3d2b8Schs defer_delta = range_tree_space(*freed_tree) -
246693f3d2b8Schs range_tree_space(*defer_tree);
246793f3d2b8Schs } else {
246893f3d2b8Schs defer_delta -= range_tree_space(*defer_tree);
246993f3d2b8Schs }
2470f59c7639Shaad
2471f59c7639Shaad vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2472c1cb2cd8Shaad
247393f3d2b8Schs ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
247493f3d2b8Schs ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2475c1cb2cd8Shaad
2476c1cb2cd8Shaad /*
247793f3d2b8Schs * If there's a metaslab_load() in progress, wait for it to complete
2478c1cb2cd8Shaad * so that we have a consistent view of the in-core space map.
2479c1cb2cd8Shaad */
248093f3d2b8Schs metaslab_load_wait(msp);
2481c1cb2cd8Shaad
248293f3d2b8Schs /*
248393f3d2b8Schs * Move the frees from the defer_tree back to the free
248493f3d2b8Schs * range tree (if it's loaded). Swap the freed_tree and the
248593f3d2b8Schs * defer_tree -- this is safe to do because we've just emptied out
248693f3d2b8Schs * the defer_tree.
248793f3d2b8Schs */
248893f3d2b8Schs range_tree_vacate(*defer_tree,
248993f3d2b8Schs msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
249093f3d2b8Schs if (defer_allowed) {
249193f3d2b8Schs range_tree_swap(freed_tree, defer_tree);
249293f3d2b8Schs } else {
249393f3d2b8Schs range_tree_vacate(*freed_tree,
249493f3d2b8Schs msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
249593f3d2b8Schs }
249693f3d2b8Schs
249793f3d2b8Schs space_map_update(msp->ms_sm);
2498c1cb2cd8Shaad
2499f59c7639Shaad msp->ms_deferspace += defer_delta;
2500f59c7639Shaad ASSERT3S(msp->ms_deferspace, >=, 0);
250193f3d2b8Schs ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2502f59c7639Shaad if (msp->ms_deferspace != 0) {
2503f59c7639Shaad /*
2504f59c7639Shaad * Keep syncing this metaslab until all deferred frees
2505f59c7639Shaad * are back in circulation.
2506f59c7639Shaad */
2507f59c7639Shaad vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2508f59c7639Shaad }
2509f59c7639Shaad
2510c1cb2cd8Shaad /*
251193f3d2b8Schs * Calculate the new weights before unloading any metaslabs.
251293f3d2b8Schs * This will give us the most accurate weighting.
2513c1cb2cd8Shaad */
251493f3d2b8Schs metaslab_group_sort(mg, msp, metaslab_weight(msp));
2515c1cb2cd8Shaad
251693f3d2b8Schs /*
251793f3d2b8Schs * If the metaslab is loaded and we've not tried to load or allocate
251893f3d2b8Schs * from it in 'metaslab_unload_delay' txgs, then unload it.
251993f3d2b8Schs */
252093f3d2b8Schs if (msp->ms_loaded &&
252193f3d2b8Schs msp->ms_selected_txg + metaslab_unload_delay < txg) {
252293f3d2b8Schs for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
252393f3d2b8Schs VERIFY0(range_tree_space(
252493f3d2b8Schs msp->ms_alloctree[(txg + t) & TXG_MASK]));
2525c1cb2cd8Shaad }
2526c1cb2cd8Shaad
252793f3d2b8Schs if (!metaslab_debug_unload)
252893f3d2b8Schs metaslab_unload(msp);
252993f3d2b8Schs }
2530c1cb2cd8Shaad
2531c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
2532c1cb2cd8Shaad }
2533c1cb2cd8Shaad
2534f59c7639Shaad void
2535f59c7639Shaad metaslab_sync_reassess(metaslab_group_t *mg)
2536f59c7639Shaad {
253793f3d2b8Schs metaslab_group_alloc_update(mg);
253893f3d2b8Schs mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2539f59c7639Shaad
2540f59c7639Shaad /*
254193f3d2b8Schs * Preload the next potential metaslabs
2542f59c7639Shaad */
254393f3d2b8Schs metaslab_group_preload(mg);
2544f59c7639Shaad }
2545f59c7639Shaad
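/*
 * Return the distance, in bytes, between the given metaslab and the
 * DVA's offset on the same top-level vdev.  A DVA on a different vdev
 * is treated as maximally distant (1ULL << 63).
 */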
2546c1cb2cd8Shaad static uint64_t
2547c1cb2cd8Shaad metaslab_distance(metaslab_t *msp, dva_t *dva)
2548c1cb2cd8Shaad {
2549c1cb2cd8Shaad uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2550c1cb2cd8Shaad uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
255193f3d2b8Schs uint64_t start = msp->ms_id;
2552c1cb2cd8Shaad
2553c1cb2cd8Shaad if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2554c1cb2cd8Shaad return (1ULL << 63);
2555c1cb2cd8Shaad
2556c1cb2cd8Shaad if (offset < start)
2557c1cb2cd8Shaad return ((start - offset) << ms_shift);
2558c1cb2cd8Shaad if (offset > start)
2559c1cb2cd8Shaad return ((offset - start) << ms_shift);
2560c1cb2cd8Shaad return (0);
2561c1cb2cd8Shaad }
2562c1cb2cd8Shaad
256393f3d2b8Schs /*
256493f3d2b8Schs * ==========================================================================
256593f3d2b8Schs * Metaslab allocation tracing facility
256693f3d2b8Schs * ==========================================================================
256793f3d2b8Schs */
256893f3d2b8Schs kstat_t *metaslab_trace_ksp;
256993f3d2b8Schs kstat_named_t metaslab_trace_over_limit;
257093f3d2b8Schs
257193f3d2b8Schs void
257293f3d2b8Schs metaslab_alloc_trace_init(void)
257393f3d2b8Schs {
257493f3d2b8Schs ASSERT(metaslab_alloc_trace_cache == NULL);
257593f3d2b8Schs metaslab_alloc_trace_cache = kmem_cache_create(
257693f3d2b8Schs "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
257793f3d2b8Schs 0, NULL, NULL, NULL, NULL, NULL, 0);
257893f3d2b8Schs metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
257993f3d2b8Schs "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
258093f3d2b8Schs if (metaslab_trace_ksp != NULL) {
258193f3d2b8Schs metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
258293f3d2b8Schs kstat_named_init(&metaslab_trace_over_limit,
258393f3d2b8Schs "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
258493f3d2b8Schs kstat_install(metaslab_trace_ksp);
258593f3d2b8Schs }
258693f3d2b8Schs }
258793f3d2b8Schs
258893f3d2b8Schs void
258993f3d2b8Schs metaslab_alloc_trace_fini(void)
259093f3d2b8Schs {
259193f3d2b8Schs if (metaslab_trace_ksp != NULL) {
259293f3d2b8Schs kstat_delete(metaslab_trace_ksp);
259393f3d2b8Schs metaslab_trace_ksp = NULL;
259493f3d2b8Schs }
259593f3d2b8Schs kmem_cache_destroy(metaslab_alloc_trace_cache);
259693f3d2b8Schs metaslab_alloc_trace_cache = NULL;
259793f3d2b8Schs }
259893f3d2b8Schs
259993f3d2b8Schs /*
260093f3d2b8Schs * Add an allocation trace element to the allocation tracing list.
260193f3d2b8Schs */
260293f3d2b8Schs static void
260393f3d2b8Schs metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
260493f3d2b8Schs metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
260593f3d2b8Schs {
260693f3d2b8Schs if (!metaslab_trace_enabled)
260793f3d2b8Schs return;
260893f3d2b8Schs
260993f3d2b8Schs /*
261093f3d2b8Schs * When the tracing list reaches its maximum we remove
261193f3d2b8Schs * the second element in the list before adding a new one.
261293f3d2b8Schs * By removing the second element we preserve the original
261393f3d2b8Schs * entry as a clue to what allocation steps have already been
261493f3d2b8Schs * performed.
261593f3d2b8Schs */
261693f3d2b8Schs if (zal->zal_size == metaslab_trace_max_entries) {
261793f3d2b8Schs metaslab_alloc_trace_t *mat_next;
261893f3d2b8Schs #ifdef DEBUG
261993f3d2b8Schs panic("too many entries in allocation list");
262093f3d2b8Schs #endif
262193f3d2b8Schs atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
262293f3d2b8Schs zal->zal_size--;
262393f3d2b8Schs mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
262493f3d2b8Schs list_remove(&zal->zal_list, mat_next);
262593f3d2b8Schs kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
262693f3d2b8Schs }
262793f3d2b8Schs
262893f3d2b8Schs metaslab_alloc_trace_t *mat =
262993f3d2b8Schs kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
263093f3d2b8Schs list_link_init(&mat->mat_list_node);
263193f3d2b8Schs mat->mat_mg = mg;
263293f3d2b8Schs mat->mat_msp = msp;
263393f3d2b8Schs mat->mat_size = psize;
263493f3d2b8Schs mat->mat_dva_id = dva_id;
263593f3d2b8Schs mat->mat_offset = offset;
263693f3d2b8Schs mat->mat_weight = 0;
263793f3d2b8Schs
263893f3d2b8Schs if (msp != NULL)
263993f3d2b8Schs mat->mat_weight = msp->ms_weight;
264093f3d2b8Schs
264193f3d2b8Schs /*
264293f3d2b8Schs * The list is part of the zio so locking is not required. Only
264393f3d2b8Schs * a single thread will perform allocations for a given zio.
264493f3d2b8Schs */
264593f3d2b8Schs list_insert_tail(&zal->zal_list, mat);
264693f3d2b8Schs zal->zal_size++;
264793f3d2b8Schs
264893f3d2b8Schs ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
264993f3d2b8Schs }
265093f3d2b8Schs
265193f3d2b8Schs void
265293f3d2b8Schs metaslab_trace_init(zio_alloc_list_t *zal)
265393f3d2b8Schs {
265493f3d2b8Schs list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
265593f3d2b8Schs offsetof(metaslab_alloc_trace_t, mat_list_node));
265693f3d2b8Schs zal->zal_size = 0;
265793f3d2b8Schs }
265893f3d2b8Schs
265993f3d2b8Schs void
266093f3d2b8Schs metaslab_trace_fini(zio_alloc_list_t *zal)
266193f3d2b8Schs {
266293f3d2b8Schs metaslab_alloc_trace_t *mat;
266393f3d2b8Schs
266493f3d2b8Schs while ((mat = list_remove_head(&zal->zal_list)) != NULL)
266593f3d2b8Schs kmem_cache_free(metaslab_alloc_trace_cache, mat);
266693f3d2b8Schs list_destroy(&zal->zal_list);
266793f3d2b8Schs zal->zal_size = 0;
266893f3d2b8Schs }
266993f3d2b8Schs
267093f3d2b8Schs /*
267193f3d2b8Schs * ==========================================================================
267293f3d2b8Schs * Metaslab block operations
267393f3d2b8Schs * ==========================================================================
267493f3d2b8Schs */
267593f3d2b8Schs
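/*
 * Allocation throttle bookkeeping: for throttled async allocations,
 * track the number of outstanding allocations queued to a metaslab
 * group via the mg_alloc_queue_depth refcount.
 */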
267693f3d2b8Schs static void
267793f3d2b8Schs metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
267893f3d2b8Schs {
267993f3d2b8Schs if (!(flags & METASLAB_ASYNC_ALLOC) ||
268093f3d2b8Schs flags & METASLAB_DONT_THROTTLE)
268193f3d2b8Schs return;
268293f3d2b8Schs
268393f3d2b8Schs metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
268493f3d2b8Schs if (!mg->mg_class->mc_alloc_throttle_enabled)
268593f3d2b8Schs return;
268693f3d2b8Schs
268793f3d2b8Schs (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
268893f3d2b8Schs }
268993f3d2b8Schs
269093f3d2b8Schs void
269193f3d2b8Schs metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
269293f3d2b8Schs {
269393f3d2b8Schs if (!(flags & METASLAB_ASYNC_ALLOC) ||
269493f3d2b8Schs flags & METASLAB_DONT_THROTTLE)
269593f3d2b8Schs return;
269693f3d2b8Schs
269793f3d2b8Schs metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
269893f3d2b8Schs if (!mg->mg_class->mc_alloc_throttle_enabled)
269993f3d2b8Schs return;
270093f3d2b8Schs
270193f3d2b8Schs (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
270293f3d2b8Schs }
270393f3d2b8Schs
270493f3d2b8Schs void
270593f3d2b8Schs metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
270693f3d2b8Schs {
270793f3d2b8Schs #ifdef ZFS_DEBUG
270893f3d2b8Schs const dva_t *dva = bp->blk_dva;
270993f3d2b8Schs int ndvas = BP_GET_NDVAS(bp);
271093f3d2b8Schs
271193f3d2b8Schs for (int d = 0; d < ndvas; d++) {
271293f3d2b8Schs uint64_t vdev = DVA_GET_VDEV(&dva[d]);
271393f3d2b8Schs metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
271493f3d2b8Schs VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
271593f3d2b8Schs }
271693f3d2b8Schs #endif
271793f3d2b8Schs }
271893f3d2b8Schs
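/*
 * Carve 'size' bytes out of this metaslab using the class's allocator
 * op.  On success the range moves from the in-core free tree to this
 * txg's alloctree; -1ULL is returned on failure.  The metaslab's
 * maximum block size is refreshed either way.
 */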
2719c1cb2cd8Shaad static uint64_t
272093f3d2b8Schs metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
272193f3d2b8Schs {
272293f3d2b8Schs uint64_t start;
272393f3d2b8Schs range_tree_t *rt = msp->ms_tree;
272493f3d2b8Schs metaslab_class_t *mc = msp->ms_group->mg_class;
272593f3d2b8Schs
272693f3d2b8Schs VERIFY(!msp->ms_condensing);
272793f3d2b8Schs
272893f3d2b8Schs start = mc->mc_ops->msop_alloc(msp, size);
272993f3d2b8Schs if (start != -1ULL) {
273093f3d2b8Schs metaslab_group_t *mg = msp->ms_group;
273193f3d2b8Schs vdev_t *vd = mg->mg_vd;
273293f3d2b8Schs
273393f3d2b8Schs VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
273493f3d2b8Schs VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
273593f3d2b8Schs VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
273693f3d2b8Schs range_tree_remove(rt, start, size);
273793f3d2b8Schs
273893f3d2b8Schs if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
273993f3d2b8Schs vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
274093f3d2b8Schs
274193f3d2b8Schs range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
274293f3d2b8Schs
274393f3d2b8Schs /* Track the last successful allocation */
274493f3d2b8Schs msp->ms_alloc_txg = txg;
274593f3d2b8Schs metaslab_verify_space(msp, txg);
274693f3d2b8Schs }
274793f3d2b8Schs
274893f3d2b8Schs /*
274993f3d2b8Schs * Now that we've attempted the allocation we need to update the
275093f3d2b8Schs * metaslab's maximum block size since it may have changed.
275193f3d2b8Schs */
275293f3d2b8Schs msp->ms_max_size = metaslab_block_maxsize(msp);
275393f3d2b8Schs return (start);
275493f3d2b8Schs }
275593f3d2b8Schs
275693f3d2b8Schs static uint64_t
275793f3d2b8Schs metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
275893f3d2b8Schs uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2759c1cb2cd8Shaad {
2760c1cb2cd8Shaad metaslab_t *msp = NULL;
2761c1cb2cd8Shaad uint64_t offset = -1ULL;
2762c1cb2cd8Shaad uint64_t activation_weight;
2763c1cb2cd8Shaad uint64_t target_distance;
2764c1cb2cd8Shaad int i;
2765c1cb2cd8Shaad
2766c1cb2cd8Shaad activation_weight = METASLAB_WEIGHT_PRIMARY;
2767f59c7639Shaad for (i = 0; i < d; i++) {
2768f59c7639Shaad if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2769c1cb2cd8Shaad activation_weight = METASLAB_WEIGHT_SECONDARY;
2770f59c7639Shaad break;
2771f59c7639Shaad }
2772f59c7639Shaad }
2773c1cb2cd8Shaad
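/*
 * 'search' is a dummy key for walking the group's metaslab tree from
 * the highest weight downward; after each metaslab we fail to allocate
 * from, it is advanced so the next lookup resumes past that metaslab.
 */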
277493f3d2b8Schs metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
277593f3d2b8Schs search->ms_weight = UINT64_MAX;
277693f3d2b8Schs search->ms_start = 0;
2777c1cb2cd8Shaad for (;;) {
2778f59c7639Shaad boolean_t was_active;
277993f3d2b8Schs avl_tree_t *t = &mg->mg_metaslab_tree;
278093f3d2b8Schs avl_index_t idx;
2781f59c7639Shaad
2782c1cb2cd8Shaad mutex_enter(&mg->mg_lock);
278393f3d2b8Schs
278493f3d2b8Schs /*
278593f3d2b8Schs * Find the metaslab with the highest weight that is less
278693f3d2b8Schs * than what we've already tried. In the common case, this
278793f3d2b8Schs * means that we will examine each metaslab at most once.
278893f3d2b8Schs * Note that concurrent callers could reorder metaslabs
278993f3d2b8Schs * by activation/passivation once we have dropped the mg_lock.
279093f3d2b8Schs * If a metaslab is activated by another thread, and we fail
279193f3d2b8Schs * to allocate from the metaslab we have selected, we may
279293f3d2b8Schs * not try the newly-activated metaslab, and instead activate
279393f3d2b8Schs * another metaslab. This is not optimal, but generally
279493f3d2b8Schs * does not cause any problems (a possible exception being
279593f3d2b8Schs * if every metaslab is completely full except for the
279693f3d2b8Schs * newly-activated metaslab which we fail to examine).
279793f3d2b8Schs */
279893f3d2b8Schs msp = avl_find(t, search, &idx);
279993f3d2b8Schs if (msp == NULL)
280093f3d2b8Schs msp = avl_nearest(t, idx, AVL_AFTER);
280193f3d2b8Schs for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
280293f3d2b8Schs
280393f3d2b8Schs if (!metaslab_should_allocate(msp, asize)) {
280493f3d2b8Schs metaslab_trace_add(zal, mg, msp, asize, d,
280593f3d2b8Schs TRACE_TOO_SMALL);
280693f3d2b8Schs continue;
2807c1cb2cd8Shaad }
2808c1cb2cd8Shaad
280993f3d2b8Schs /*
281093f3d2b8Schs * If the selected metaslab is condensing, skip it.
281193f3d2b8Schs */
281293f3d2b8Schs if (msp->ms_condensing)
281393f3d2b8Schs continue;
281493f3d2b8Schs
2815f59c7639Shaad was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2816c1cb2cd8Shaad if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2817c1cb2cd8Shaad break;
2818c1cb2cd8Shaad
2819c1cb2cd8Shaad target_distance = min_distance +
282093f3d2b8Schs (space_map_allocated(msp->ms_sm) != 0 ? 0 :
282193f3d2b8Schs min_distance >> 1);
2822c1cb2cd8Shaad
282393f3d2b8Schs for (i = 0; i < d; i++) {
2824c1cb2cd8Shaad if (metaslab_distance(msp, &dva[i]) <
2825c1cb2cd8Shaad target_distance)
2826c1cb2cd8Shaad break;
282793f3d2b8Schs }
2828c1cb2cd8Shaad if (i == d)
2829c1cb2cd8Shaad break;
2830c1cb2cd8Shaad }
2831c1cb2cd8Shaad mutex_exit(&mg->mg_lock);
283293f3d2b8Schs if (msp == NULL) {
283393f3d2b8Schs kmem_free(search, sizeof (*search));
2834c1cb2cd8Shaad return (-1ULL);
283593f3d2b8Schs }
283693f3d2b8Schs search->ms_weight = msp->ms_weight;
283793f3d2b8Schs search->ms_start = msp->ms_start + 1;
2838c1cb2cd8Shaad
2839c1cb2cd8Shaad mutex_enter(&msp->ms_lock);
2840c1cb2cd8Shaad
2841c1cb2cd8Shaad /*
2842c1cb2cd8Shaad * Ensure that the metaslab we have selected is still
2843c1cb2cd8Shaad * capable of handling our request. It's possible that
2844c1cb2cd8Shaad * another thread may have changed the weight while we
284593f3d2b8Schs * were blocked on the metaslab lock. We check the
284693f3d2b8Schs * active status first to see if we need to reselect
284793f3d2b8Schs * a new metaslab.
2848c1cb2cd8Shaad */
284993f3d2b8Schs if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2850c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
2851c1cb2cd8Shaad continue;
2852c1cb2cd8Shaad }
2853c1cb2cd8Shaad
2854c1cb2cd8Shaad if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2855c1cb2cd8Shaad activation_weight == METASLAB_WEIGHT_PRIMARY) {
2856c1cb2cd8Shaad metaslab_passivate(msp,
2857c1cb2cd8Shaad msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2858c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
2859c1cb2cd8Shaad continue;
2860c1cb2cd8Shaad }
2861c1cb2cd8Shaad
286293f3d2b8Schs if (metaslab_activate(msp, activation_weight) != 0) {
286393f3d2b8Schs mutex_exit(&msp->ms_lock);
286493f3d2b8Schs continue;
286593f3d2b8Schs }
286693f3d2b8Schs msp->ms_selected_txg = txg;
286793f3d2b8Schs
286893f3d2b8Schs /*
286993f3d2b8Schs * Now that we have the lock, recheck to see if we should
287093f3d2b8Schs * continue to use this metaslab for this allocation. The
287193f3d2b8Schs * metaslab is now loaded so metaslab_should_allocate() can
287293f3d2b8Schs * accurately determine if the allocation attempt should
287393f3d2b8Schs * proceed.
287493f3d2b8Schs */
287593f3d2b8Schs if (!metaslab_should_allocate(msp, asize)) {
287693f3d2b8Schs /* Passivate this metaslab and select a new one. */
287793f3d2b8Schs metaslab_trace_add(zal, mg, msp, asize, d,
287893f3d2b8Schs TRACE_TOO_SMALL);
287993f3d2b8Schs goto next;
288093f3d2b8Schs }
288193f3d2b8Schs
288293f3d2b8Schs /*
288393f3d2b8Schs * If this metaslab is currently condensing then pick again as
288493f3d2b8Schs * we can't manipulate this metaslab until it's committed
288593f3d2b8Schs * to disk.
288693f3d2b8Schs */
288793f3d2b8Schs if (msp->ms_condensing) {
288893f3d2b8Schs metaslab_trace_add(zal, mg, msp, asize, d,
288993f3d2b8Schs TRACE_CONDENSING);
2890c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
2891c1cb2cd8Shaad continue;
2892c1cb2cd8Shaad }
2893c1cb2cd8Shaad
289493f3d2b8Schs offset = metaslab_block_alloc(msp, asize, txg);
289593f3d2b8Schs metaslab_trace_add(zal, mg, msp, asize, d, offset);
289693f3d2b8Schs
289793f3d2b8Schs if (offset != -1ULL) {
289893f3d2b8Schs /* Proactively passivate the metaslab, if needed */
289993f3d2b8Schs metaslab_segment_may_passivate(msp);
2900c1cb2cd8Shaad break;
290193f3d2b8Schs }
290293f3d2b8Schs next:
290393f3d2b8Schs ASSERT(msp->ms_loaded);
2904c1cb2cd8Shaad
290593f3d2b8Schs /*
290693f3d2b8Schs * We were unable to allocate from this metaslab so determine
290793f3d2b8Schs * a new weight for this metaslab. Now that we have loaded
290893f3d2b8Schs * the metaslab we can provide a better hint to the metaslab
290993f3d2b8Schs * selector.
291093f3d2b8Schs *
291193f3d2b8Schs * For space-based metaslabs, we use the maximum block size.
291293f3d2b8Schs * This information is only available when the metaslab
291393f3d2b8Schs * is loaded and is more accurate than the generic free
291493f3d2b8Schs * space weight that was calculated by metaslab_weight().
291593f3d2b8Schs * This information allows us to quickly compare the maximum
291693f3d2b8Schs * available allocation in the metaslab to the allocation
291793f3d2b8Schs * size being requested.
291893f3d2b8Schs *
291993f3d2b8Schs * For segment-based metaslabs, determine the new weight
292093f3d2b8Schs * based on the highest bucket in the range tree. We
292193f3d2b8Schs * explicitly use the loaded segment weight (i.e. the range
292293f3d2b8Schs * tree histogram) since it contains the space that is
292393f3d2b8Schs * currently available for allocation and is accurate
292493f3d2b8Schs * even within a sync pass.
292593f3d2b8Schs */
292693f3d2b8Schs if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
292793f3d2b8Schs uint64_t weight = metaslab_block_maxsize(msp);
292893f3d2b8Schs WEIGHT_SET_SPACEBASED(weight);
292993f3d2b8Schs metaslab_passivate(msp, weight);
293093f3d2b8Schs } else {
293193f3d2b8Schs metaslab_passivate(msp,
293293f3d2b8Schs metaslab_weight_from_range_tree(msp));
2933c1cb2cd8Shaad }
2934c1cb2cd8Shaad
293593f3d2b8Schs /*
293693f3d2b8Schs * We have just failed an allocation attempt, check
293793f3d2b8Schs * that metaslab_should_allocate() agrees. Otherwise,
293893f3d2b8Schs * we may end up in an infinite loop retrying the same
293993f3d2b8Schs * metaslab.
294093f3d2b8Schs */
294193f3d2b8Schs ASSERT(!metaslab_should_allocate(msp, asize));
2942c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
294393f3d2b8Schs }
294493f3d2b8Schs mutex_exit(&msp->ms_lock);
294593f3d2b8Schs kmem_free(search, sizeof (*search));
2946c1cb2cd8Shaad return (offset);
2947c1cb2cd8Shaad }
2948c1cb2cd8Shaad
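/*
 * Wrapper around metaslab_group_alloc_normal() that records per-group
 * allocation statistics and, if even a minimum-sized gang allocation
 * fails, marks the group as out of space for the allocation throttle.
 */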
294993f3d2b8Schs static uint64_t
295093f3d2b8Schs metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
295193f3d2b8Schs uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
295293f3d2b8Schs {
295393f3d2b8Schs uint64_t offset;
295493f3d2b8Schs ASSERT(mg->mg_initialized);
295593f3d2b8Schs
295693f3d2b8Schs offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
295793f3d2b8Schs min_distance, dva, d);
295893f3d2b8Schs
295993f3d2b8Schs mutex_enter(&mg->mg_lock);
296093f3d2b8Schs if (offset == -1ULL) {
296193f3d2b8Schs mg->mg_failed_allocations++;
296293f3d2b8Schs metaslab_trace_add(zal, mg, NULL, asize, d,
296393f3d2b8Schs TRACE_GROUP_FAILURE);
296493f3d2b8Schs if (asize == SPA_GANGBLOCKSIZE) {
296593f3d2b8Schs /*
296693f3d2b8Schs * This metaslab group was unable to allocate
296793f3d2b8Schs * the minimum gang block size so it must be out of
296893f3d2b8Schs * space. We must notify the allocation throttle
296993f3d2b8Schs * to start skipping allocation attempts to this
297093f3d2b8Schs * metaslab group until more space becomes available.
297193f3d2b8Schs * Note: this failure cannot be caused by the
297293f3d2b8Schs * allocation throttle since the allocation throttle
297393f3d2b8Schs * is only responsible for skipping devices and
297493f3d2b8Schs * not failing block allocations.
297593f3d2b8Schs */
297693f3d2b8Schs mg->mg_no_free_space = B_TRUE;
297793f3d2b8Schs }
297893f3d2b8Schs }
297993f3d2b8Schs mg->mg_allocations++;
298093f3d2b8Schs mutex_exit(&mg->mg_lock);
298193f3d2b8Schs return (offset);
298293f3d2b8Schs }
298393f3d2b8Schs
298493f3d2b8Schs /*
298593f3d2b8Schs * If we have to write a ditto block (i.e. more than one DVA for a given BP)
298693f3d2b8Schs * on the same vdev as an existing DVA of this BP, then try to allocate it
298793f3d2b8Schs * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
298893f3d2b8Schs * existing DVAs.
298993f3d2b8Schs */
299093f3d2b8Schs int ditto_same_vdev_distance_shift = 3;
299193f3d2b8Schs
2992c1cb2cd8Shaad /*
2993c1cb2cd8Shaad * Allocate a block for the specified i/o.
2994c1cb2cd8Shaad */
2995c1cb2cd8Shaad static int
2996c1cb2cd8Shaad metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
299793f3d2b8Schs dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
299893f3d2b8Schs zio_alloc_list_t *zal)
2999c1cb2cd8Shaad {
3000c1cb2cd8Shaad metaslab_group_t *mg, *rotor;
3001c1cb2cd8Shaad vdev_t *vd;
300293f3d2b8Schs boolean_t try_hard = B_FALSE;
3003c1cb2cd8Shaad
3004c1cb2cd8Shaad ASSERT(!DVA_IS_VALID(&dva[d]));
3005c1cb2cd8Shaad
3006c1cb2cd8Shaad /*
3007c1cb2cd8Shaad * For testing, make some blocks above a certain size be gang blocks.
3008c1cb2cd8Shaad */
300993f3d2b8Schs if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
301093f3d2b8Schs metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
301193f3d2b8Schs return (SET_ERROR(ENOSPC));
301293f3d2b8Schs }
3013c1cb2cd8Shaad
3014c1cb2cd8Shaad /*
3015c1cb2cd8Shaad * Start at the rotor and loop through all mgs until we find something.
3016f59c7639Shaad * Note that there's no locking on mc_rotor or mc_aliquot because
3017c1cb2cd8Shaad * nothing actually breaks if we miss a few updates -- we just won't
3018c1cb2cd8Shaad * allocate quite as evenly. It all balances out over time.
3019c1cb2cd8Shaad *
3020c1cb2cd8Shaad * If we are doing ditto or log blocks, try to spread them across
3021c1cb2cd8Shaad * consecutive vdevs. If we're forced to reuse a vdev before we've
3022c1cb2cd8Shaad * allocated all of our ditto blocks, then try and spread them out on
3023c1cb2cd8Shaad * that vdev as much as possible. If it turns out to not be possible,
3024c1cb2cd8Shaad * gradually lower our standards until anything becomes acceptable.
3025c1cb2cd8Shaad * Also, allocating on consecutive vdevs (as opposed to random vdevs)
3026c1cb2cd8Shaad * gives us hope of containing our fault domains to something we're
3027c1cb2cd8Shaad * able to reason about. Otherwise, any two top-level vdev failures
3028c1cb2cd8Shaad * will guarantee the loss of data. With consecutive allocation,
3029c1cb2cd8Shaad * only two adjacent top-level vdev failures will result in data loss.
3030c1cb2cd8Shaad *
3031c1cb2cd8Shaad * If we are doing gang blocks (hintdva is non-NULL), try to keep
3032c1cb2cd8Shaad * ourselves on the same vdev as our gang block header. That
3033c1cb2cd8Shaad * way, we can hope for locality in vdev_cache, plus it makes our
3034c1cb2cd8Shaad * fault domains something tractable.
3035c1cb2cd8Shaad */
3036c1cb2cd8Shaad if (hintdva) {
3037c1cb2cd8Shaad vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
3038f59c7639Shaad
3039f59c7639Shaad /*
3040f59c7639Shaad * It's possible the vdev we're using as the hint no
3041f59c7639Shaad * longer exists (i.e. removed). Consult the rotor when
3042f59c7639Shaad * all else fails.
3043f59c7639Shaad */
3044f59c7639Shaad if (vd != NULL) {
3045c1cb2cd8Shaad mg = vd->vdev_mg;
3046f59c7639Shaad
3047f59c7639Shaad if (flags & METASLAB_HINTBP_AVOID &&
3048f59c7639Shaad mg->mg_next != NULL)
3049f59c7639Shaad mg = mg->mg_next;
3050f59c7639Shaad } else {
3051f59c7639Shaad mg = mc->mc_rotor;
3052f59c7639Shaad }
3053c1cb2cd8Shaad } else if (d != 0) {
3054c1cb2cd8Shaad vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3055c1cb2cd8Shaad mg = vd->vdev_mg->mg_next;
3056c1cb2cd8Shaad } else {
3057c1cb2cd8Shaad mg = mc->mc_rotor;
3058c1cb2cd8Shaad }
3059c1cb2cd8Shaad
3060c1cb2cd8Shaad /*
3061f59c7639Shaad * If the hint put us into the wrong metaslab class, or into a
3062f59c7639Shaad * metaslab group that has been passivated, just follow the rotor.
3063c1cb2cd8Shaad */
3064f59c7639Shaad if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3065c1cb2cd8Shaad mg = mc->mc_rotor;
3066c1cb2cd8Shaad
3067c1cb2cd8Shaad rotor = mg;
3068c1cb2cd8Shaad top:
3069c1cb2cd8Shaad do {
307093f3d2b8Schs boolean_t allocatable;
3071f59c7639Shaad
307293f3d2b8Schs ASSERT(mg->mg_activation_count == 1);
3073c1cb2cd8Shaad vd = mg->mg_vd;
3074f59c7639Shaad
3075c1cb2cd8Shaad /*
3076c1cb2cd8Shaad * Don't allocate from faulted devices.
3077c1cb2cd8Shaad */
307893f3d2b8Schs if (try_hard) {
3079f59c7639Shaad spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3080f59c7639Shaad allocatable = vdev_allocatable(vd);
3081f59c7639Shaad spa_config_exit(spa, SCL_ZIO, FTAG);
3082f59c7639Shaad } else {
3083f59c7639Shaad allocatable = vdev_allocatable(vd);
3084f59c7639Shaad }
3085f59c7639Shaad
3086c1cb2cd8Shaad /*
308793f3d2b8Schs * Determine if the selected metaslab group is eligible
308893f3d2b8Schs * for allocations. If we're ganging then don't allow
308993f3d2b8Schs * this metaslab group to skip allocations since that would
309093f3d2b8Schs * inadvertently return ENOSPC and suspend the pool
309193f3d2b8Schs * even though space is still available.
309293f3d2b8Schs */
309393f3d2b8Schs if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
309493f3d2b8Schs allocatable = metaslab_group_allocatable(mg, rotor,
309593f3d2b8Schs psize);
309693f3d2b8Schs }
309793f3d2b8Schs
309893f3d2b8Schs if (!allocatable) {
309993f3d2b8Schs metaslab_trace_add(zal, mg, NULL, psize, d,
310093f3d2b8Schs TRACE_NOT_ALLOCATABLE);
310193f3d2b8Schs goto next;
310293f3d2b8Schs }
310393f3d2b8Schs
310493f3d2b8Schs ASSERT(mg->mg_initialized);
310593f3d2b8Schs
310693f3d2b8Schs /*
310793f3d2b8Schs * Avoid writing single-copy data to a failing,
310893f3d2b8Schs * non-redundant vdev, unless we've already tried all
310993f3d2b8Schs * other vdevs.
3110c1cb2cd8Shaad */
3111c1cb2cd8Shaad if ((vd->vdev_stat.vs_write_errors > 0 ||
3112c1cb2cd8Shaad vd->vdev_state < VDEV_STATE_HEALTHY) &&
311393f3d2b8Schs d == 0 && !try_hard && vd->vdev_children == 0) {
311493f3d2b8Schs metaslab_trace_add(zal, mg, NULL, psize, d,
311593f3d2b8Schs TRACE_VDEV_ERROR);
3116c1cb2cd8Shaad goto next;
3117c1cb2cd8Shaad }
3118c1cb2cd8Shaad
3119c1cb2cd8Shaad ASSERT(mg->mg_class == mc);
3120c1cb2cd8Shaad
312193f3d2b8Schs /*
312293f3d2b8Schs * If we don't need to try hard, then require that the
312393f3d2b8Schs * block be 1/8th of the device away from any other DVAs
312493f3d2b8Schs * in this BP. If we are trying hard, allow any offset
312593f3d2b8Schs * to be used (distance=0).
312693f3d2b8Schs */
312793f3d2b8Schs uint64_t distance = 0;
312893f3d2b8Schs if (!try_hard) {
312993f3d2b8Schs distance = vd->vdev_asize >>
313093f3d2b8Schs ditto_same_vdev_distance_shift;
3131c1cb2cd8Shaad if (distance <= (1ULL << vd->vdev_ms_shift))
3132c1cb2cd8Shaad distance = 0;
313393f3d2b8Schs }
3134c1cb2cd8Shaad
313593f3d2b8Schs uint64_t asize = vdev_psize_to_asize(vd, psize);
3136c1cb2cd8Shaad ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3137c1cb2cd8Shaad
313893f3d2b8Schs uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
313993f3d2b8Schs distance, dva, d);
314093f3d2b8Schs
3141c1cb2cd8Shaad if (offset != -1ULL) {
3142c1cb2cd8Shaad /*
3143c1cb2cd8Shaad * If we've just selected this metaslab group,
3144c1cb2cd8Shaad * figure out whether the corresponding vdev is
3145c1cb2cd8Shaad * over- or under-used relative to the pool,
3146c1cb2cd8Shaad * and set an allocation bias to even it out.
3147c1cb2cd8Shaad */
314893f3d2b8Schs if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3149c1cb2cd8Shaad vdev_stat_t *vs = &vd->vdev_stat;
3150f59c7639Shaad int64_t vu, cu;
3151c1cb2cd8Shaad
315293f3d2b8Schs vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
315393f3d2b8Schs cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3154c1cb2cd8Shaad
3155c1cb2cd8Shaad /*
315693f3d2b8Schs * Calculate how much more or less we should
315793f3d2b8Schs * try to allocate from this device during
315893f3d2b8Schs * this iteration around the rotor.
315993f3d2b8Schs * For example, if a device is 80% full
316093f3d2b8Schs * and the pool is 20% full then we should
316193f3d2b8Schs * reduce allocations by 60% on this device.
316293f3d2b8Schs *
316393f3d2b8Schs * mg_bias = (20 - 80) * 512K / 100 = -307K
316493f3d2b8Schs *
316593f3d2b8Schs * This reduces allocations by 307K for this
316693f3d2b8Schs * iteration.
3167c1cb2cd8Shaad */
3168f59c7639Shaad mg->mg_bias = ((cu - vu) *
316993f3d2b8Schs (int64_t)mg->mg_aliquot) / 100;
317093f3d2b8Schs } else if (!metaslab_bias_enabled) {
317193f3d2b8Schs mg->mg_bias = 0;
3172c1cb2cd8Shaad }
3173c1cb2cd8Shaad
3174f59c7639Shaad if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3175c1cb2cd8Shaad mg->mg_aliquot + mg->mg_bias) {
3176c1cb2cd8Shaad mc->mc_rotor = mg->mg_next;
3177f59c7639Shaad mc->mc_aliquot = 0;
3178c1cb2cd8Shaad }
3179c1cb2cd8Shaad
3180c1cb2cd8Shaad DVA_SET_VDEV(&dva[d], vd->vdev_id);
3181c1cb2cd8Shaad DVA_SET_OFFSET(&dva[d], offset);
3182c1cb2cd8Shaad DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3183c1cb2cd8Shaad DVA_SET_ASIZE(&dva[d], asize);
3184c1cb2cd8Shaad
3185c1cb2cd8Shaad return (0);
3186c1cb2cd8Shaad }
3187c1cb2cd8Shaad next:
3188c1cb2cd8Shaad mc->mc_rotor = mg->mg_next;
3189f59c7639Shaad mc->mc_aliquot = 0;
3190c1cb2cd8Shaad } while ((mg = mg->mg_next) != rotor);
3191c1cb2cd8Shaad
319293f3d2b8Schs /*
319393f3d2b8Schs * If we haven't tried hard, do so now.
319493f3d2b8Schs */
319593f3d2b8Schs if (!try_hard) {
319693f3d2b8Schs try_hard = B_TRUE;
3197f59c7639Shaad goto top;
3198f59c7639Shaad }
3199f59c7639Shaad
3200c1cb2cd8Shaad bzero(&dva[d], sizeof (dva_t));
3201c1cb2cd8Shaad
320293f3d2b8Schs metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
320393f3d2b8Schs return (SET_ERROR(ENOSPC));
3204c1cb2cd8Shaad }
3205c1cb2cd8Shaad
3206c1cb2cd8Shaad /*
3207c1cb2cd8Shaad * Free the block represented by DVA in the context of the specified
3208c1cb2cd8Shaad * transaction group.
3209c1cb2cd8Shaad */
3210c1cb2cd8Shaad static void
3211c1cb2cd8Shaad metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
3212c1cb2cd8Shaad {
3213c1cb2cd8Shaad uint64_t vdev = DVA_GET_VDEV(dva);
3214c1cb2cd8Shaad uint64_t offset = DVA_GET_OFFSET(dva);
3215c1cb2cd8Shaad uint64_t size = DVA_GET_ASIZE(dva);
3216c1cb2cd8Shaad vdev_t *vd;
3217c1cb2cd8Shaad metaslab_t *msp;
3218c1cb2cd8Shaad
3219c1cb2cd8Shaad ASSERT(DVA_IS_VALID(dva));
3220c1cb2cd8Shaad
3221c1cb2cd8Shaad if (txg > spa_freeze_txg(spa))
3222c1cb2cd8Shaad return;
3223c1cb2cd8Shaad
3224c1cb2cd8Shaad if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3225c1cb2cd8Shaad (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3226c1cb2cd8Shaad cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3227c1cb2cd8Shaad (u_longlong_t)vdev, (u_longlong_t)offset);
3228c1cb2cd8Shaad ASSERT(0);
3229c1cb2cd8Shaad return;
3230c1cb2cd8Shaad }
3231c1cb2cd8Shaad
3232c1cb2cd8Shaad msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3233c1cb2cd8Shaad
3234c1cb2cd8Shaad if (DVA_GET_GANG(dva))
3235c1cb2cd8Shaad size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3236c1cb2cd8Shaad
3237c1cb2cd8Shaad mutex_enter(&msp->ms_lock);
3238c1cb2cd8Shaad
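/*
 * 'now' means the free takes effect immediately: undo this txg's
 * allocation and return the range to the in-core free tree.  Otherwise
 * queue the range on this txg's freetree for metaslab_sync() to
 * process.
 */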
3239c1cb2cd8Shaad if (now) {
324093f3d2b8Schs range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3241c1cb2cd8Shaad offset, size);
324293f3d2b8Schs
324393f3d2b8Schs VERIFY(!msp->ms_condensing);
324493f3d2b8Schs VERIFY3U(offset, >=, msp->ms_start);
324593f3d2b8Schs VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
324693f3d2b8Schs VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
324793f3d2b8Schs msp->ms_size);
324893f3d2b8Schs VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
324993f3d2b8Schs VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
325093f3d2b8Schs range_tree_add(msp->ms_tree, offset, size);
325193f3d2b8Schs msp->ms_max_size = metaslab_block_maxsize(msp);
3252c1cb2cd8Shaad } else {
325393f3d2b8Schs if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
3254c1cb2cd8Shaad vdev_dirty(vd, VDD_METASLAB, msp, txg);
325593f3d2b8Schs range_tree_add(msp->ms_freetree[txg & TXG_MASK],
325693f3d2b8Schs offset, size);
3257c1cb2cd8Shaad }
3258c1cb2cd8Shaad
3259c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
3260c1cb2cd8Shaad }
3261c1cb2cd8Shaad
3262c1cb2cd8Shaad /*
3263c1cb2cd8Shaad * Intent log support: upon opening the pool after a crash, notify the SPA
3264c1cb2cd8Shaad * of blocks that the intent log has allocated for immediate write, but
3265c1cb2cd8Shaad * which are still considered free by the SPA because the last transaction
3266c1cb2cd8Shaad * group didn't commit yet.
3267c1cb2cd8Shaad */
3268c1cb2cd8Shaad static int
3269c1cb2cd8Shaad metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3270c1cb2cd8Shaad {
3271c1cb2cd8Shaad uint64_t vdev = DVA_GET_VDEV(dva);
3272c1cb2cd8Shaad uint64_t offset = DVA_GET_OFFSET(dva);
3273c1cb2cd8Shaad uint64_t size = DVA_GET_ASIZE(dva);
3274c1cb2cd8Shaad vdev_t *vd;
3275c1cb2cd8Shaad metaslab_t *msp;
3276f59c7639Shaad int error = 0;
3277c1cb2cd8Shaad
3278c1cb2cd8Shaad ASSERT(DVA_IS_VALID(dva));
3279c1cb2cd8Shaad
3280c1cb2cd8Shaad if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3281c1cb2cd8Shaad (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
328293f3d2b8Schs return (SET_ERROR(ENXIO));
3283c1cb2cd8Shaad
3284c1cb2cd8Shaad msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3285c1cb2cd8Shaad
3286c1cb2cd8Shaad if (DVA_GET_GANG(dva))
3287c1cb2cd8Shaad size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3288c1cb2cd8Shaad
3289c1cb2cd8Shaad mutex_enter(&msp->ms_lock);
3290c1cb2cd8Shaad
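/*
 * Activate the metaslab when this is a real claim on a writable pool
 * or when it isn't loaded yet, then verify the claimed range is
 * actually free before removing it from the tree.
 */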
329193f3d2b8Schs if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
329293f3d2b8Schs error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3293f59c7639Shaad
329493f3d2b8Schs if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
329593f3d2b8Schs error = SET_ERROR(ENOENT);
3296f59c7639Shaad
3297c1cb2cd8Shaad if (error || txg == 0) { /* txg == 0 indicates dry run */
3298c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
3299c1cb2cd8Shaad return (error);
3300c1cb2cd8Shaad }
3301c1cb2cd8Shaad
330293f3d2b8Schs VERIFY(!msp->ms_condensing);
330393f3d2b8Schs VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
330493f3d2b8Schs VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
330593f3d2b8Schs VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
330693f3d2b8Schs range_tree_remove(msp->ms_tree, offset, size);
3307c1cb2cd8Shaad
3308f59c7639Shaad if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
330993f3d2b8Schs if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3310c1cb2cd8Shaad vdev_dirty(vd, VDD_METASLAB, msp, txg);
331193f3d2b8Schs range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3312c1cb2cd8Shaad }
3313c1cb2cd8Shaad
3314c1cb2cd8Shaad mutex_exit(&msp->ms_lock);
3315c1cb2cd8Shaad
3316c1cb2cd8Shaad return (0);
3317c1cb2cd8Shaad }
3318c1cb2cd8Shaad
331993f3d2b8Schs /*
332093f3d2b8Schs * Reserve some allocation slots. The reservation system must be called
332193f3d2b8Schs * before we call into the allocator. If there aren't any available slots
332293f3d2b8Schs * then the I/O will be throttled until an I/O completes and its slots are
332393f3d2b8Schs * freed up. The function returns true if it was successful in placing
332493f3d2b8Schs * the reservation.
332593f3d2b8Schs */
332693f3d2b8Schs boolean_t
332793f3d2b8Schs metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
332893f3d2b8Schs int flags)
332993f3d2b8Schs {
333093f3d2b8Schs uint64_t available_slots = 0;
333193f3d2b8Schs boolean_t slot_reserved = B_FALSE;
333293f3d2b8Schs
333393f3d2b8Schs ASSERT(mc->mc_alloc_throttle_enabled);
333493f3d2b8Schs mutex_enter(&mc->mc_lock);
333593f3d2b8Schs
333693f3d2b8Schs uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
333793f3d2b8Schs if (reserved_slots < mc->mc_alloc_max_slots)
333893f3d2b8Schs available_slots = mc->mc_alloc_max_slots - reserved_slots;
333993f3d2b8Schs
334093f3d2b8Schs if (slots <= available_slots || GANG_ALLOCATION(flags)) {
334193f3d2b8Schs /*
334293f3d2b8Schs * We reserve the slots individually so that we can unreserve
334393f3d2b8Schs * them individually when an I/O completes.
334493f3d2b8Schs */
334593f3d2b8Schs for (int d = 0; d < slots; d++) {
334693f3d2b8Schs reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
334793f3d2b8Schs }
334893f3d2b8Schs zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
334993f3d2b8Schs slot_reserved = B_TRUE;
335093f3d2b8Schs }
335193f3d2b8Schs
335293f3d2b8Schs mutex_exit(&mc->mc_lock);
335393f3d2b8Schs return (slot_reserved);
335493f3d2b8Schs }
335593f3d2b8Schs
335693f3d2b8Schs void
335793f3d2b8Schs metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
335893f3d2b8Schs {
335993f3d2b8Schs ASSERT(mc->mc_alloc_throttle_enabled);
336093f3d2b8Schs mutex_enter(&mc->mc_lock);
336193f3d2b8Schs for (int d = 0; d < slots; d++) {
336293f3d2b8Schs (void) refcount_remove(&mc->mc_alloc_slots, zio);
336393f3d2b8Schs }
336493f3d2b8Schs mutex_exit(&mc->mc_lock);
336593f3d2b8Schs }
336693f3d2b8Schs
3367c1cb2cd8Shaad int
3368c1cb2cd8Shaad metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
336993f3d2b8Schs int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
337093f3d2b8Schs zio_alloc_list_t *zal, zio_t *zio)
3371c1cb2cd8Shaad {
3372c1cb2cd8Shaad dva_t *dva = bp->blk_dva;
3373*03f30658Sfox dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
3374c1cb2cd8Shaad int error = 0;
3375c1cb2cd8Shaad
3376c1cb2cd8Shaad ASSERT(bp->blk_birth == 0);
3377f59c7639Shaad ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3378c1cb2cd8Shaad
3379c1cb2cd8Shaad spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3380c1cb2cd8Shaad
3381c1cb2cd8Shaad if (mc->mc_rotor == NULL) { /* no vdevs in this class */
3382c1cb2cd8Shaad spa_config_exit(spa, SCL_ALLOC, FTAG);
338393f3d2b8Schs return (SET_ERROR(ENOSPC));
3384c1cb2cd8Shaad }
3385c1cb2cd8Shaad
3386c1cb2cd8Shaad ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3387c1cb2cd8Shaad ASSERT(BP_GET_NDVAS(bp) == 0);
3388c1cb2cd8Shaad ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
338993f3d2b8Schs ASSERT3P(zal, !=, NULL);
3390c1cb2cd8Shaad
3391c1cb2cd8Shaad for (int d = 0; d < ndvas; d++) {
3392c1cb2cd8Shaad error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
339393f3d2b8Schs txg, flags, zal);
339493f3d2b8Schs if (error != 0) {
3395c1cb2cd8Shaad for (d--; d >= 0; d--) {
3396c1cb2cd8Shaad metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
339793f3d2b8Schs metaslab_group_alloc_decrement(spa,
339893f3d2b8Schs DVA_GET_VDEV(&dva[d]), zio, flags);
3399c1cb2cd8Shaad bzero(&dva[d], sizeof (dva_t));
3400c1cb2cd8Shaad }
3401c1cb2cd8Shaad spa_config_exit(spa, SCL_ALLOC, FTAG);
3402c1cb2cd8Shaad return (error);
340393f3d2b8Schs } else {
340493f3d2b8Schs /*
340593f3d2b8Schs * Update the metaslab group's queue depth
340693f3d2b8Schs * based on the newly allocated dva.
340793f3d2b8Schs */
340893f3d2b8Schs metaslab_group_alloc_increment(spa,
340993f3d2b8Schs DVA_GET_VDEV(&dva[d]), zio, flags);
3410c1cb2cd8Shaad }
341193f3d2b8Schs
3412c1cb2cd8Shaad }
3413c1cb2cd8Shaad ASSERT(error == 0);
3414c1cb2cd8Shaad ASSERT(BP_GET_NDVAS(bp) == ndvas);
3415c1cb2cd8Shaad
3416c1cb2cd8Shaad spa_config_exit(spa, SCL_ALLOC, FTAG);
3417c1cb2cd8Shaad
3418f59c7639Shaad BP_SET_BIRTH(bp, txg, txg);
3419c1cb2cd8Shaad
3420c1cb2cd8Shaad return (0);
3421c1cb2cd8Shaad }
3422c1cb2cd8Shaad
3423c1cb2cd8Shaad void
3424c1cb2cd8Shaad metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3425c1cb2cd8Shaad {
3426c1cb2cd8Shaad const dva_t *dva = bp->blk_dva;
3427c1cb2cd8Shaad int ndvas = BP_GET_NDVAS(bp);
3428c1cb2cd8Shaad
3429c1cb2cd8Shaad ASSERT(!BP_IS_HOLE(bp));
3430f59c7639Shaad ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3431c1cb2cd8Shaad
3432c1cb2cd8Shaad spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3433c1cb2cd8Shaad
3434c1cb2cd8Shaad for (int d = 0; d < ndvas; d++)
3435c1cb2cd8Shaad metaslab_free_dva(spa, &dva[d], txg, now);
3436c1cb2cd8Shaad
3437c1cb2cd8Shaad spa_config_exit(spa, SCL_FREE, FTAG);
3438c1cb2cd8Shaad }
3439c1cb2cd8Shaad
3440c1cb2cd8Shaad int
3441c1cb2cd8Shaad metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3442c1cb2cd8Shaad {
3443c1cb2cd8Shaad const dva_t *dva = bp->blk_dva;
3444c1cb2cd8Shaad int ndvas = BP_GET_NDVAS(bp);
3445c1cb2cd8Shaad int error = 0;
3446c1cb2cd8Shaad
3447c1cb2cd8Shaad ASSERT(!BP_IS_HOLE(bp));
3448c1cb2cd8Shaad
3449c1cb2cd8Shaad if (txg != 0) {
3450c1cb2cd8Shaad /*
3451c1cb2cd8Shaad * First do a dry run to make sure all DVAs are claimable,
3452c1cb2cd8Shaad * so we don't have to unwind from partial failures below.
3453c1cb2cd8Shaad */
3454c1cb2cd8Shaad if ((error = metaslab_claim(spa, bp, 0)) != 0)
3455c1cb2cd8Shaad return (error);
3456c1cb2cd8Shaad }
3457c1cb2cd8Shaad
3458c1cb2cd8Shaad spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3459c1cb2cd8Shaad
3460c1cb2cd8Shaad for (int d = 0; d < ndvas; d++)
3461c1cb2cd8Shaad if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3462c1cb2cd8Shaad break;
3463c1cb2cd8Shaad
3464c1cb2cd8Shaad spa_config_exit(spa, SCL_ALLOC, FTAG);
3465c1cb2cd8Shaad
3466c1cb2cd8Shaad ASSERT(error == 0 || txg == 0);
3467c1cb2cd8Shaad
3468c1cb2cd8Shaad return (error);
3469c1cb2cd8Shaad }
347093f3d2b8Schs
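/*
 * Debugging aid (ZFS_DEBUG_ZIO_FREE): verify that a block about to be
 * freed is not already present in the in-core free tree or in any
 * per-txg freetree or defertree, which would indicate a double free.
 */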
347193f3d2b8Schs void
347293f3d2b8Schs metaslab_check_free(spa_t *spa, const blkptr_t *bp)
347393f3d2b8Schs {
347493f3d2b8Schs if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
347593f3d2b8Schs return;
347693f3d2b8Schs
347793f3d2b8Schs spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
347893f3d2b8Schs for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
347993f3d2b8Schs uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
348093f3d2b8Schs vdev_t *vd = vdev_lookup_top(spa, vdev);
348193f3d2b8Schs uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
348293f3d2b8Schs uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
348393f3d2b8Schs metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
348493f3d2b8Schs
348593f3d2b8Schs if (msp->ms_loaded)
348693f3d2b8Schs range_tree_verify(msp->ms_tree, offset, size);
348793f3d2b8Schs
348893f3d2b8Schs for (int j = 0; j < TXG_SIZE; j++)
348993f3d2b8Schs range_tree_verify(msp->ms_freetree[j], offset, size);
349093f3d2b8Schs for (int j = 0; j < TXG_DEFER_SIZE; j++)
349193f3d2b8Schs range_tree_verify(msp->ms_defertree[j], offset, size);
349293f3d2b8Schs }
349393f3d2b8Schs spa_config_exit(spa, SCL_VDEV, FTAG);
349493f3d2b8Schs }
3495