1c1cb2cd8Shaad /*
2c1cb2cd8Shaad  * CDDL HEADER START
3c1cb2cd8Shaad  *
4c1cb2cd8Shaad  * The contents of this file are subject to the terms of the
5c1cb2cd8Shaad  * Common Development and Distribution License (the "License").
6c1cb2cd8Shaad  * You may not use this file except in compliance with the License.
7c1cb2cd8Shaad  *
8c1cb2cd8Shaad  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9c1cb2cd8Shaad  * or http://www.opensolaris.org/os/licensing.
10c1cb2cd8Shaad  * See the License for the specific language governing permissions
11c1cb2cd8Shaad  * and limitations under the License.
12c1cb2cd8Shaad  *
13c1cb2cd8Shaad  * When distributing Covered Code, include this CDDL HEADER in each
14c1cb2cd8Shaad  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15c1cb2cd8Shaad  * If applicable, add the following below this CDDL HEADER, with the
16c1cb2cd8Shaad  * fields enclosed by brackets "[]" replaced with your own identifying
17c1cb2cd8Shaad  * information: Portions Copyright [yyyy] [name of copyright owner]
18c1cb2cd8Shaad  *
19c1cb2cd8Shaad  * CDDL HEADER END
20c1cb2cd8Shaad  */
21c1cb2cd8Shaad /*
2293f3d2b8Schs  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2393f3d2b8Schs  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
2493f3d2b8Schs  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
2593f3d2b8Schs  * Copyright (c) 2014 Integros [integros.com]
26c1cb2cd8Shaad  */
27c1cb2cd8Shaad 
28c1cb2cd8Shaad #include <sys/zfs_context.h>
29c1cb2cd8Shaad #include <sys/dmu.h>
30c1cb2cd8Shaad #include <sys/dmu_tx.h>
31c1cb2cd8Shaad #include <sys/space_map.h>
32c1cb2cd8Shaad #include <sys/metaslab_impl.h>
33c1cb2cd8Shaad #include <sys/vdev_impl.h>
34c1cb2cd8Shaad #include <sys/zio.h>
3593f3d2b8Schs #include <sys/spa_impl.h>
3693f3d2b8Schs #include <sys/zfeature.h>
3793f3d2b8Schs 
3893f3d2b8Schs SYSCTL_DECL(_vfs_zfs);
3993f3d2b8Schs SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
4093f3d2b8Schs 
4193f3d2b8Schs #define	GANG_ALLOCATION(flags) \
4293f3d2b8Schs 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
43c1cb2cd8Shaad 
44c1cb2cd8Shaad uint64_t metaslab_aliquot = 512ULL << 10;
45c1cb2cd8Shaad uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
4693f3d2b8Schs SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
4793f3d2b8Schs     &metaslab_gang_bang, 0,
4893f3d2b8Schs     "Force gang block allocation for blocks larger than or equal to this value");
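/*
 * Illustrative usage (hypothetical tunable value, not from the original
 * source): the default of SPA_MAXBLOCKSIZE + 1 means no block ever
 * qualifies, so gang allocation is never forced.  Lowering the tunable,
 * e.g. setting vfs.zfs.metaslab.gang_bang to 65536, would force
 * allocations of 64K and larger down the gang-block code path, which is
 * useful only for exercising that code during testing.
 */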
49c1cb2cd8Shaad 
50c1cb2cd8Shaad /*
5193f3d2b8Schs  * The in-core space map representation is more compact than its on-disk form.
5293f3d2b8Schs  * The zfs_condense_pct determines how much more compact the in-core
5393f3d2b8Schs  * space map representation must be before we compact it on-disk.
5493f3d2b8Schs  * Values should be greater than or equal to 100.
55f59c7639Shaad  */
5693f3d2b8Schs int zfs_condense_pct = 200;
5793f3d2b8Schs SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
5893f3d2b8Schs     &zfs_condense_pct, 0,
5993f3d2b8Schs     "Condense on-disk spacemap when it is larger than this percentage"
6093f3d2b8Schs     " of its in-memory counterpart");
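/*
 * Worked example (illustrative numbers): with the default zfs_condense_pct
 * of 200, a space map whose on-disk representation has grown to 300KB while
 * its in-core range tree would condense down to 100KB is a candidate for
 * condensing, since 300KB exceeds 200% of 100KB.
 */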
6193f3d2b8Schs 
6293f3d2b8Schs /*
6393f3d2b8Schs  * Condensing a metaslab is not guaranteed to actually reduce the amount of
6493f3d2b8Schs  * space used on disk. In particular, a space map uses data in increments of
6593f3d2b8Schs  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
6693f3d2b8Schs  * same number of blocks after condensing. Since the goal of condensing is to
6793f3d2b8Schs  * reduce the number of IOPs required to read the space map, we only want to
6893f3d2b8Schs  * condense when we can be sure we will reduce the number of blocks used by the
6993f3d2b8Schs  * space map. Unfortunately, we cannot precisely compute whether or not this is
7093f3d2b8Schs  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
7193f3d2b8Schs  * we apply the following heuristic: do not condense a spacemap unless the
7293f3d2b8Schs  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
7393f3d2b8Schs  * blocks.
7493f3d2b8Schs  */
7593f3d2b8Schs int zfs_metaslab_condense_block_threshold = 4;
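/*
 * Worked example (illustrative numbers): assuming a 4KB space map block
 * size, the default threshold of 4 means a space map occupying 3 blocks
 * (12KB) is left alone even if it is badly inflated, because condensing it
 * could not meaningfully reduce the number of blocks (and hence IOPs)
 * needed to read it; a map spanning dozens of blocks remains a candidate.
 */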
7693f3d2b8Schs 
7793f3d2b8Schs /*
7893f3d2b8Schs  * The zfs_mg_noalloc_threshold defines which metaslab groups should
7993f3d2b8Schs  * be eligible for allocation. The value is defined as a percentage of
8093f3d2b8Schs  * free space. Metaslab groups that have more free space than
8193f3d2b8Schs  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
8293f3d2b8Schs  * a metaslab group's free space is less than or equal to the
8393f3d2b8Schs  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
8493f3d2b8Schs  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
8593f3d2b8Schs  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
8693f3d2b8Schs  * groups are allowed to accept allocations. Gang blocks are always
8793f3d2b8Schs  * eligible to allocate on any metaslab group. The default value of 0 means
8893f3d2b8Schs  * no metaslab group will be excluded based on this criterion.
8993f3d2b8Schs  */
9093f3d2b8Schs int zfs_mg_noalloc_threshold = 0;
9193f3d2b8Schs SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
9293f3d2b8Schs     &zfs_mg_noalloc_threshold, 0,
9393f3d2b8Schs     "Percentage of metaslab group size that should be free"
9493f3d2b8Schs     " to make it eligible for allocation");
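/*
 * Worked example (illustrative numbers): if zfs_mg_noalloc_threshold were
 * set to 10, a metaslab group with only 8% free space would be skipped for
 * new allocations as long as some other group in the pool still had more
 * than 10% free; once every group dropped to 10% or below, all of them
 * would accept allocations again.
 */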
9593f3d2b8Schs 
9693f3d2b8Schs /*
9793f3d2b8Schs  * Metaslab groups are considered eligible for allocations if their
9893f3d2b8Schs  * fragmentation metric (measured as a percentage) is less than or equal to
9993f3d2b8Schs  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
10093f3d2b8Schs  * then it will be skipped unless all metaslab groups within the metaslab
10193f3d2b8Schs  * class have also crossed this threshold.
10293f3d2b8Schs  */
10393f3d2b8Schs int zfs_mg_fragmentation_threshold = 85;
10493f3d2b8Schs SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
10593f3d2b8Schs     &zfs_mg_fragmentation_threshold, 0,
10693f3d2b8Schs     "Maximum fragmentation percentage at which a metaslab group is still "
10793f3d2b8Schs     "considered eligible for allocations, unless all metaslab groups within "
10893f3d2b8Schs     "the metaslab class have also crossed this threshold");
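/*
 * Worked example: with the default of 85, a metaslab group whose
 * fragmentation metric is 90% is skipped unless every other group in the
 * class has also exceeded 85%.
 */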
10993f3d2b8Schs 
11093f3d2b8Schs /*
11193f3d2b8Schs  * Allow metaslabs to keep their active state as long as their fragmentation
11293f3d2b8Schs  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
11393f3d2b8Schs  * active metaslab that exceeds this threshold will no longer keep its active
11493f3d2b8Schs  * status allowing better metaslabs to be selected.
11593f3d2b8Schs  */
11693f3d2b8Schs int zfs_metaslab_fragmentation_threshold = 70;
11793f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
11893f3d2b8Schs     &zfs_metaslab_fragmentation_threshold, 0,
11993f3d2b8Schs     "Maximum fragmentation percentage at which a metaslab keeps its active state");
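/*
 * Worked example: with the default of 70, an active metaslab whose
 * fragmentation climbs to 75% will not retain its active status, allowing
 * a less fragmented metaslab to be selected instead.
 */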
12093f3d2b8Schs 
12193f3d2b8Schs /*
12293f3d2b8Schs  * When set will load all metaslabs when pool is first opened.
12393f3d2b8Schs  */
12493f3d2b8Schs int metaslab_debug_load = 0;
12593f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
12693f3d2b8Schs     &metaslab_debug_load, 0,
12793f3d2b8Schs     "Load all metaslabs when pool is first opened");
12893f3d2b8Schs 
12993f3d2b8Schs /*
13093f3d2b8Schs  * When set will prevent metaslabs from being unloaded.
13193f3d2b8Schs  */
13293f3d2b8Schs int metaslab_debug_unload = 0;
13393f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
13493f3d2b8Schs     &metaslab_debug_unload, 0,
13593f3d2b8Schs     "Prevent metaslabs from being unloaded");
136f59c7639Shaad 
137f59c7639Shaad /*
138f59c7639Shaad  * Minimum size which forces the dynamic allocator to change
139f59c7639Shaad  * its allocation strategy.  Once the space map cannot satisfy
140f59c7639Shaad  * an allocation of this size then it switches to using a more
141f59c7639Shaad  * aggressive strategy (i.e. search by size rather than by offset).
142f59c7639Shaad  */
14393f3d2b8Schs uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
14493f3d2b8Schs SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
14593f3d2b8Schs     &metaslab_df_alloc_threshold, 0,
14693f3d2b8Schs     "Minimum size which forces the dynamic allocator to change its allocation strategy");
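/*
 * Worked example: with the default of SPA_OLD_MAXBLOCKSIZE (128KB), once a
 * metaslab can no longer satisfy a 128KB contiguous allocation, the dynamic
 * allocator stops walking the space map by offset and starts searching by
 * segment size instead.
 */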
147f59c7639Shaad 
148f59c7639Shaad /*
149f59c7639Shaad  * The minimum free space, in percent, which must be available
150f59c7639Shaad  * in a space map to continue allocations in a first-fit fashion.
15193f3d2b8Schs  * Once the space map's free space drops below this level we dynamically
152f59c7639Shaad  * switch to using best-fit allocations.
153f59c7639Shaad  */
154f59c7639Shaad int metaslab_df_free_pct = 4;
15593f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
15693f3d2b8Schs     &metaslab_df_free_pct, 0,
15793f3d2b8Schs     "The minimum free space, in percent, which must be available in a "
15893f3d2b8Schs     "space map to continue allocations in a first-fit fashion");
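/*
 * Worked example (illustrative numbers): with the default of 4, a 10GB
 * metaslab keeps allocating first-fit until less than roughly 400MB (4%)
 * remains free, at which point it switches to best-fit allocations.
 */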
159f59c7639Shaad 
160f59c7639Shaad /*
161f59c7639Shaad  * A metaslab is considered "free" if it contains a contiguous
162f59c7639Shaad  * segment which is greater than metaslab_min_alloc_size.
163f59c7639Shaad  */
164f59c7639Shaad uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
16593f3d2b8Schs SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
16693f3d2b8Schs     &metaslab_min_alloc_size, 0,
16793f3d2b8Schs     "A metaslab is considered \"free\" if it contains a contiguous "
16893f3d2b8Schs     "segment which is greater than vfs.zfs.metaslab.min_alloc_size");
169f59c7639Shaad 
170f59c7639Shaad /*
17193f3d2b8Schs  * Percentage of all cpus that can be used by the metaslab taskq.
172f59c7639Shaad  */
17393f3d2b8Schs int metaslab_load_pct = 50;
17493f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
17593f3d2b8Schs     &metaslab_load_pct, 0,
17693f3d2b8Schs     "Percentage of cpus that can be used by the metaslab taskq");
177f59c7639Shaad 
178f59c7639Shaad /*
17993f3d2b8Schs  * Determines how many txgs a metaslab may remain loaded without having any
18093f3d2b8Schs  * allocations from it. As long as a metaslab continues to be used we will
18193f3d2b8Schs  * keep it loaded.
182f59c7639Shaad  */
18393f3d2b8Schs int metaslab_unload_delay = TXG_SIZE * 2;
18493f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
18593f3d2b8Schs     &metaslab_unload_delay, 0,
18693f3d2b8Schs     "Number of TXGs that an unused metaslab can be kept in memory");
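/*
 * Worked example: TXG_SIZE is 4, so the default delay is 8 txgs; a loaded
 * metaslab that sees no allocations for 8 consecutive syncs becomes
 * eligible to be unloaded.
 */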
18793f3d2b8Schs 
18893f3d2b8Schs /*
18993f3d2b8Schs  * Max number of metaslabs per group to preload.
19093f3d2b8Schs  */
19193f3d2b8Schs int metaslab_preload_limit = SPA_DVAS_PER_BP;
19293f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
19393f3d2b8Schs     &metaslab_preload_limit, 0,
19493f3d2b8Schs     "Max number of metaslabs per group to preload");
19593f3d2b8Schs 
19693f3d2b8Schs /*
19793f3d2b8Schs  * Enable/disable preloading of metaslabs.
19893f3d2b8Schs  */
19993f3d2b8Schs boolean_t metaslab_preload_enabled = B_TRUE;
20093f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
20193f3d2b8Schs     &metaslab_preload_enabled, 0,
20293f3d2b8Schs     "Enable/disable preloading of metaslabs");
20393f3d2b8Schs 
20493f3d2b8Schs /*
20593f3d2b8Schs  * Enable/disable fragmentation weighting on metaslabs.
20693f3d2b8Schs  */
20793f3d2b8Schs boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
20893f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
20993f3d2b8Schs     &metaslab_fragmentation_factor_enabled, 0,
21093f3d2b8Schs     "Enable fragmentation weighting on metaslabs");
21193f3d2b8Schs 
21293f3d2b8Schs /*
21393f3d2b8Schs  * Enable/disable lba weighting (i.e. outer tracks are given preference).
21493f3d2b8Schs  */
21593f3d2b8Schs boolean_t metaslab_lba_weighting_enabled = B_TRUE;
21693f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
21793f3d2b8Schs     &metaslab_lba_weighting_enabled, 0,
21893f3d2b8Schs     "Enable LBA weighting (i.e. outer tracks are given preference)");
21993f3d2b8Schs 
22093f3d2b8Schs /*
22193f3d2b8Schs  * Enable/disable metaslab group biasing.
22293f3d2b8Schs  */
22393f3d2b8Schs boolean_t metaslab_bias_enabled = B_TRUE;
22493f3d2b8Schs SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
22593f3d2b8Schs     &metaslab_bias_enabled, 0,
22693f3d2b8Schs     "Enable metaslab group biasing");
22793f3d2b8Schs 
22893f3d2b8Schs /*
22993f3d2b8Schs  * Enable/disable segment-based metaslab selection.
23093f3d2b8Schs  */
23193f3d2b8Schs boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
23293f3d2b8Schs 
23393f3d2b8Schs /*
23493f3d2b8Schs  * When using segment-based metaslab selection, we will continue
23593f3d2b8Schs  * allocating from the active metaslab until we have exhausted
23693f3d2b8Schs  * zfs_metaslab_switch_threshold of its buckets.
23793f3d2b8Schs  */
23893f3d2b8Schs int zfs_metaslab_switch_threshold = 2;
23993f3d2b8Schs 
24093f3d2b8Schs /*
24193f3d2b8Schs  * Internal switch to enable/disable the metaslab allocation tracing
24293f3d2b8Schs  * facility.
24393f3d2b8Schs  */
24493f3d2b8Schs boolean_t metaslab_trace_enabled = B_TRUE;
24593f3d2b8Schs 
24693f3d2b8Schs /*
24793f3d2b8Schs  * Maximum entries that the metaslab allocation tracing facility will keep
24893f3d2b8Schs  * in a given list when running in non-debug mode. We limit the number
24993f3d2b8Schs  * of entries in non-debug mode to prevent us from using up too much memory.
25093f3d2b8Schs  * The limit should be sufficiently large that we don't expect any allocation
25193f3d2b8Schs  * to ever exceed this value. In debug mode, the system will panic if this
25293f3d2b8Schs  * limit is ever reached, allowing for further investigation.
25393f3d2b8Schs  */
25493f3d2b8Schs uint64_t metaslab_trace_max_entries = 5000;
25593f3d2b8Schs 
25693f3d2b8Schs static uint64_t metaslab_weight(metaslab_t *);
25793f3d2b8Schs static void metaslab_set_fragmentation(metaslab_t *);
25893f3d2b8Schs 
25993f3d2b8Schs kmem_cache_t *metaslab_alloc_trace_cache;
260f59c7639Shaad 
261f59c7639Shaad /*
262c1cb2cd8Shaad  * ==========================================================================
263c1cb2cd8Shaad  * Metaslab classes
264c1cb2cd8Shaad  * ==========================================================================
265c1cb2cd8Shaad  */
266c1cb2cd8Shaad metaslab_class_t *
26793f3d2b8Schs metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
268c1cb2cd8Shaad {
269c1cb2cd8Shaad 	metaslab_class_t *mc;
270c1cb2cd8Shaad 
271c1cb2cd8Shaad 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
272c1cb2cd8Shaad 
273f59c7639Shaad 	mc->mc_spa = spa;
274c1cb2cd8Shaad 	mc->mc_rotor = NULL;
275f59c7639Shaad 	mc->mc_ops = ops;
27693f3d2b8Schs 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
27793f3d2b8Schs 	refcount_create_tracked(&mc->mc_alloc_slots);
278c1cb2cd8Shaad 
279c1cb2cd8Shaad 	return (mc);
280c1cb2cd8Shaad }
281c1cb2cd8Shaad 
282c1cb2cd8Shaad void
283c1cb2cd8Shaad metaslab_class_destroy(metaslab_class_t *mc)
284c1cb2cd8Shaad {
285f59c7639Shaad 	ASSERT(mc->mc_rotor == NULL);
286f59c7639Shaad 	ASSERT(mc->mc_alloc == 0);
287f59c7639Shaad 	ASSERT(mc->mc_deferred == 0);
288f59c7639Shaad 	ASSERT(mc->mc_space == 0);
289f59c7639Shaad 	ASSERT(mc->mc_dspace == 0);
290c1cb2cd8Shaad 
29193f3d2b8Schs 	refcount_destroy(&mc->mc_alloc_slots);
29293f3d2b8Schs 	mutex_destroy(&mc->mc_lock);
293c1cb2cd8Shaad 	kmem_free(mc, sizeof (metaslab_class_t));
294c1cb2cd8Shaad }
295c1cb2cd8Shaad 
296f59c7639Shaad int
297f59c7639Shaad metaslab_class_validate(metaslab_class_t *mc)
298c1cb2cd8Shaad {
299f59c7639Shaad 	metaslab_group_t *mg;
300f59c7639Shaad 	vdev_t *vd;
301c1cb2cd8Shaad 
302f59c7639Shaad 	/*
303f59c7639Shaad 	 * Must hold one of the spa_config locks.
304f59c7639Shaad 	 */
305f59c7639Shaad 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
306f59c7639Shaad 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
307c1cb2cd8Shaad 
308f59c7639Shaad 	if ((mg = mc->mc_rotor) == NULL)
309f59c7639Shaad 		return (0);
310f59c7639Shaad 
311f59c7639Shaad 	do {
312f59c7639Shaad 		vd = mg->mg_vd;
313f59c7639Shaad 		ASSERT(vd->vdev_mg != NULL);
314f59c7639Shaad 		ASSERT3P(vd->vdev_top, ==, vd);
315f59c7639Shaad 		ASSERT3P(mg->mg_class, ==, mc);
316f59c7639Shaad 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
317f59c7639Shaad 	} while ((mg = mg->mg_next) != mc->mc_rotor);
318f59c7639Shaad 
319f59c7639Shaad 	return (0);
320c1cb2cd8Shaad }
321c1cb2cd8Shaad 
322c1cb2cd8Shaad void
323f59c7639Shaad metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
324f59c7639Shaad     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
325c1cb2cd8Shaad {
326f59c7639Shaad 	atomic_add_64(&mc->mc_alloc, alloc_delta);
327f59c7639Shaad 	atomic_add_64(&mc->mc_deferred, defer_delta);
328f59c7639Shaad 	atomic_add_64(&mc->mc_space, space_delta);
329f59c7639Shaad 	atomic_add_64(&mc->mc_dspace, dspace_delta);
330c1cb2cd8Shaad }
331c1cb2cd8Shaad 
33293f3d2b8Schs void
33393f3d2b8Schs metaslab_class_minblocksize_update(metaslab_class_t *mc)
33493f3d2b8Schs {
33593f3d2b8Schs 	metaslab_group_t *mg;
33693f3d2b8Schs 	vdev_t *vd;
33793f3d2b8Schs 	uint64_t minashift = UINT64_MAX;
33893f3d2b8Schs 
33993f3d2b8Schs 	if ((mg = mc->mc_rotor) == NULL) {
34093f3d2b8Schs 		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
34193f3d2b8Schs 		return;
34293f3d2b8Schs 	}
34393f3d2b8Schs 
34493f3d2b8Schs 	do {
34593f3d2b8Schs 		vd = mg->mg_vd;
34693f3d2b8Schs 		if (vd->vdev_ashift < minashift)
34793f3d2b8Schs 			minashift = vd->vdev_ashift;
34893f3d2b8Schs 	} while ((mg = mg->mg_next) != mc->mc_rotor);
34993f3d2b8Schs 
35093f3d2b8Schs 	mc->mc_minblocksize = 1ULL << minashift;
35193f3d2b8Schs }
35293f3d2b8Schs 
353f59c7639Shaad uint64_t
354f59c7639Shaad metaslab_class_get_alloc(metaslab_class_t *mc)
355f59c7639Shaad {
356f59c7639Shaad 	return (mc->mc_alloc);
357f59c7639Shaad }
358f59c7639Shaad 
359f59c7639Shaad uint64_t
360f59c7639Shaad metaslab_class_get_deferred(metaslab_class_t *mc)
361f59c7639Shaad {
362f59c7639Shaad 	return (mc->mc_deferred);
363f59c7639Shaad }
364f59c7639Shaad 
365f59c7639Shaad uint64_t
366f59c7639Shaad metaslab_class_get_space(metaslab_class_t *mc)
367f59c7639Shaad {
368f59c7639Shaad 	return (mc->mc_space);
369f59c7639Shaad }
370f59c7639Shaad 
371f59c7639Shaad uint64_t
372f59c7639Shaad metaslab_class_get_dspace(metaslab_class_t *mc)
373f59c7639Shaad {
374f59c7639Shaad 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
375c1cb2cd8Shaad }
376c1cb2cd8Shaad 
37793f3d2b8Schs uint64_t
37893f3d2b8Schs metaslab_class_get_minblocksize(metaslab_class_t *mc)
37993f3d2b8Schs {
38093f3d2b8Schs 	return (mc->mc_minblocksize);
38193f3d2b8Schs }
38293f3d2b8Schs 
38393f3d2b8Schs void
38493f3d2b8Schs metaslab_class_histogram_verify(metaslab_class_t *mc)
38593f3d2b8Schs {
38693f3d2b8Schs 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
38793f3d2b8Schs 	uint64_t *mc_hist;
38893f3d2b8Schs 	int i;
38993f3d2b8Schs 
39093f3d2b8Schs 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
39193f3d2b8Schs 		return;
39293f3d2b8Schs 
39393f3d2b8Schs 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
39493f3d2b8Schs 	    KM_SLEEP);
39593f3d2b8Schs 
39693f3d2b8Schs 	for (int c = 0; c < rvd->vdev_children; c++) {
39793f3d2b8Schs 		vdev_t *tvd = rvd->vdev_child[c];
39893f3d2b8Schs 		metaslab_group_t *mg = tvd->vdev_mg;
39993f3d2b8Schs 
400c1cb2cd8Shaad 		/*
40193f3d2b8Schs 		 * Skip any holes, uninitialized top-levels, or
40293f3d2b8Schs 		 * vdevs that are not in this metaslab class.
403c1cb2cd8Shaad 		 */
40493f3d2b8Schs 		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
40593f3d2b8Schs 		    mg->mg_class != mc) {
40693f3d2b8Schs 			continue;
40793f3d2b8Schs 		}
40893f3d2b8Schs 
40993f3d2b8Schs 		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
41093f3d2b8Schs 			mc_hist[i] += mg->mg_histogram[i];
41193f3d2b8Schs 	}
41293f3d2b8Schs 
41393f3d2b8Schs 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
41493f3d2b8Schs 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
41593f3d2b8Schs 
41693f3d2b8Schs 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
41793f3d2b8Schs }
41893f3d2b8Schs 
41993f3d2b8Schs /*
42093f3d2b8Schs  * Calculate the metaslab class's fragmentation metric. The metric
42193f3d2b8Schs  * is weighted based on the space contribution of each metaslab group.
42293f3d2b8Schs  * The return value will be a number between 0 and 100 (inclusive), or
42393f3d2b8Schs  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
42493f3d2b8Schs  * zfs_frag_table for more information about the metric.
42593f3d2b8Schs  */
42693f3d2b8Schs uint64_t
42793f3d2b8Schs metaslab_class_fragmentation(metaslab_class_t *mc)
42893f3d2b8Schs {
42993f3d2b8Schs 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
43093f3d2b8Schs 	uint64_t fragmentation = 0;
43193f3d2b8Schs 
43293f3d2b8Schs 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
43393f3d2b8Schs 
43493f3d2b8Schs 	for (int c = 0; c < rvd->vdev_children; c++) {
43593f3d2b8Schs 		vdev_t *tvd = rvd->vdev_child[c];
43693f3d2b8Schs 		metaslab_group_t *mg = tvd->vdev_mg;
43793f3d2b8Schs 
43893f3d2b8Schs 		/*
43993f3d2b8Schs 		 * Skip any holes, uninitialized top-levels, or
44093f3d2b8Schs 		 * vdevs that are not in this metaslab class.
44193f3d2b8Schs 		 */
44293f3d2b8Schs 		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
44393f3d2b8Schs 		    mg->mg_class != mc) {
44493f3d2b8Schs 			continue;
44593f3d2b8Schs 		}
44693f3d2b8Schs 
44793f3d2b8Schs 		/*
44893f3d2b8Schs 		 * If a metaslab group does not contain a fragmentation
44993f3d2b8Schs 		 * metric then just bail out.
45093f3d2b8Schs 		 */
45193f3d2b8Schs 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
45293f3d2b8Schs 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
45393f3d2b8Schs 			return (ZFS_FRAG_INVALID);
45493f3d2b8Schs 		}
45593f3d2b8Schs 
45693f3d2b8Schs 		/*
45793f3d2b8Schs 		 * Determine how much this metaslab_group is contributing
45893f3d2b8Schs 		 * to the overall pool fragmentation metric.
45993f3d2b8Schs 		 */
46093f3d2b8Schs 		fragmentation += mg->mg_fragmentation *
46193f3d2b8Schs 		    metaslab_group_get_space(mg);
46293f3d2b8Schs 	}
46393f3d2b8Schs 	fragmentation /= metaslab_class_get_space(mc);
46493f3d2b8Schs 
46593f3d2b8Schs 	ASSERT3U(fragmentation, <=, 100);
46693f3d2b8Schs 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
46793f3d2b8Schs 	return (fragmentation);
46893f3d2b8Schs }
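/*
 * Worked example of the space weighting above (illustrative numbers): a
 * class made up of a 10TB group at 20% fragmentation and a 30TB group at
 * 60% fragmentation reports (20 * 10 + 60 * 30) / 40 = 50.
 */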
46993f3d2b8Schs 
47093f3d2b8Schs /*
47193f3d2b8Schs  * Calculate the amount of expandable space that is available in
47293f3d2b8Schs  * this metaslab class. If a device is expanded then its expandable
47393f3d2b8Schs  * space will be the amount of allocatable space that is currently not
47493f3d2b8Schs  * part of this metaslab class.
47593f3d2b8Schs  */
47693f3d2b8Schs uint64_t
47793f3d2b8Schs metaslab_class_expandable_space(metaslab_class_t *mc)
47893f3d2b8Schs {
47993f3d2b8Schs 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
48093f3d2b8Schs 	uint64_t space = 0;
48193f3d2b8Schs 
48293f3d2b8Schs 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
48393f3d2b8Schs 	for (int c = 0; c < rvd->vdev_children; c++) {
48493f3d2b8Schs 		vdev_t *tvd = rvd->vdev_child[c];
48593f3d2b8Schs 		metaslab_group_t *mg = tvd->vdev_mg;
48693f3d2b8Schs 
48793f3d2b8Schs 		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
48893f3d2b8Schs 		    mg->mg_class != mc) {
48993f3d2b8Schs 			continue;
49093f3d2b8Schs 		}
49193f3d2b8Schs 
49293f3d2b8Schs 		/*
49393f3d2b8Schs 		 * Calculate if we have enough space to add additional
49493f3d2b8Schs 		 * metaslabs. We report the expandable space in terms
49593f3d2b8Schs 		 * of the metaslab size since that's the unit of expansion.
49693f3d2b8Schs 		 */
49793f3d2b8Schs 		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
49893f3d2b8Schs 		    1ULL << tvd->vdev_ms_shift);
49993f3d2b8Schs 	}
50093f3d2b8Schs 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
50193f3d2b8Schs 	return (space);
50293f3d2b8Schs }
50393f3d2b8Schs 
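/*
 * Comparison function for the metaslab group's AVL tree of metaslabs.
 * Metaslabs are ordered by descending weight so that the heaviest (most
 * desirable) metaslab sorts first; ties are broken by starting offset to
 * keep the ordering total.
 */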
504c1cb2cd8Shaad static int
505c1cb2cd8Shaad metaslab_compare(const void *x1, const void *x2)
506c1cb2cd8Shaad {
507c1cb2cd8Shaad 	const metaslab_t *m1 = x1;
508c1cb2cd8Shaad 	const metaslab_t *m2 = x2;
509c1cb2cd8Shaad 
510c1cb2cd8Shaad 	if (m1->ms_weight < m2->ms_weight)
511c1cb2cd8Shaad 		return (1);
512c1cb2cd8Shaad 	if (m1->ms_weight > m2->ms_weight)
513c1cb2cd8Shaad 		return (-1);
514c1cb2cd8Shaad 
515c1cb2cd8Shaad 	/*
516c1cb2cd8Shaad 	 * If the weights are identical, use the offset to force uniqueness.
517c1cb2cd8Shaad 	 */
51893f3d2b8Schs 	if (m1->ms_start < m2->ms_start)
519c1cb2cd8Shaad 		return (-1);
52093f3d2b8Schs 	if (m1->ms_start > m2->ms_start)
521c1cb2cd8Shaad 		return (1);
522c1cb2cd8Shaad 
523c1cb2cd8Shaad 	ASSERT3P(m1, ==, m2);
524c1cb2cd8Shaad 
525c1cb2cd8Shaad 	return (0);
526c1cb2cd8Shaad }
527c1cb2cd8Shaad 
52893f3d2b8Schs /*
52993f3d2b8Schs  * Verify that the space accounting on disk matches the in-core range_trees.
53093f3d2b8Schs  */
53193f3d2b8Schs void
53293f3d2b8Schs metaslab_verify_space(metaslab_t *msp, uint64_t txg)
53393f3d2b8Schs {
53493f3d2b8Schs 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
53593f3d2b8Schs 	uint64_t allocated = 0;
53693f3d2b8Schs 	uint64_t freed = 0;
53793f3d2b8Schs 	uint64_t sm_free_space, msp_free_space;
53893f3d2b8Schs 
53993f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
54093f3d2b8Schs 
54193f3d2b8Schs 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
54293f3d2b8Schs 		return;
54393f3d2b8Schs 
54493f3d2b8Schs 	/*
54593f3d2b8Schs 	 * We can only verify the metaslab space when we're called
54693f3d2b8Schs 	 * from syncing context with a loaded metaslab that has an allocated
54793f3d2b8Schs 	 * space map. Calling this in non-syncing context does not
54893f3d2b8Schs 	 * provide a consistent view of the metaslab since we're performing
54993f3d2b8Schs 	 * allocations in the future.
55093f3d2b8Schs 	 */
55193f3d2b8Schs 	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
55293f3d2b8Schs 	    !msp->ms_loaded)
55393f3d2b8Schs 		return;
55493f3d2b8Schs 
55593f3d2b8Schs 	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
55693f3d2b8Schs 	    space_map_alloc_delta(msp->ms_sm);
55793f3d2b8Schs 
55893f3d2b8Schs 	/*
55993f3d2b8Schs 	 * Account for future allocations since we would have already
56093f3d2b8Schs 	 * deducted that space from the ms_freetree.
56193f3d2b8Schs 	 */
56293f3d2b8Schs 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
56393f3d2b8Schs 		allocated +=
56493f3d2b8Schs 		    range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
56593f3d2b8Schs 	}
56693f3d2b8Schs 	freed = range_tree_space(msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]);
56793f3d2b8Schs 
56893f3d2b8Schs 	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
56993f3d2b8Schs 	    msp->ms_deferspace + freed;
57093f3d2b8Schs 
57193f3d2b8Schs 	VERIFY3U(sm_free_space, ==, msp_free_space);
57293f3d2b8Schs }
57393f3d2b8Schs 
57493f3d2b8Schs /*
57593f3d2b8Schs  * ==========================================================================
57693f3d2b8Schs  * Metaslab groups
57793f3d2b8Schs  * ==========================================================================
57893f3d2b8Schs  */
57993f3d2b8Schs /*
58093f3d2b8Schs  * Update the allocatable flag and the metaslab group's capacity.
58193f3d2b8Schs  * The allocatable flag is set to true if the capacity is below
58293f3d2b8Schs  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
58393f3d2b8Schs  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
58493f3d2b8Schs  * transitions from allocatable to non-allocatable or vice versa then the
58593f3d2b8Schs  * metaslab group's class is updated to reflect the transition.
58693f3d2b8Schs  */
58793f3d2b8Schs static void
58893f3d2b8Schs metaslab_group_alloc_update(metaslab_group_t *mg)
58993f3d2b8Schs {
59093f3d2b8Schs 	vdev_t *vd = mg->mg_vd;
59193f3d2b8Schs 	metaslab_class_t *mc = mg->mg_class;
59293f3d2b8Schs 	vdev_stat_t *vs = &vd->vdev_stat;
59393f3d2b8Schs 	boolean_t was_allocatable;
59493f3d2b8Schs 	boolean_t was_initialized;
59593f3d2b8Schs 
59693f3d2b8Schs 	ASSERT(vd == vd->vdev_top);
59793f3d2b8Schs 
59893f3d2b8Schs 	mutex_enter(&mg->mg_lock);
59993f3d2b8Schs 	was_allocatable = mg->mg_allocatable;
60093f3d2b8Schs 	was_initialized = mg->mg_initialized;
60193f3d2b8Schs 
60293f3d2b8Schs 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
60393f3d2b8Schs 	    (vs->vs_space + 1);
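	/*
	 * Note on the expression above: the "+ 1" in the divisor appears to
	 * be there only to avoid a division by zero when vs_space is 0
	 * (e.g. a top-level vdev that has not been initialized yet); for any
	 * real vdev size it has no measurable effect on the percentage.
	 */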
60493f3d2b8Schs 
60593f3d2b8Schs 	mutex_enter(&mc->mc_lock);
60693f3d2b8Schs 
60793f3d2b8Schs 	/*
60893f3d2b8Schs 	 * If the metaslab group was just added then it won't
60993f3d2b8Schs 	 * have any space until we finish syncing out this txg.
61093f3d2b8Schs 	 * At that point we will consider it initialized and available
61193f3d2b8Schs 	 * for allocations.  We also don't consider non-activated
61293f3d2b8Schs 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
61393f3d2b8Schs 	 * to be initialized, because they can't be used for allocation.
61493f3d2b8Schs 	 */
61593f3d2b8Schs 	mg->mg_initialized = metaslab_group_initialized(mg);
61693f3d2b8Schs 	if (!was_initialized && mg->mg_initialized) {
61793f3d2b8Schs 		mc->mc_groups++;
61893f3d2b8Schs 	} else if (was_initialized && !mg->mg_initialized) {
61993f3d2b8Schs 		ASSERT3U(mc->mc_groups, >, 0);
62093f3d2b8Schs 		mc->mc_groups--;
62193f3d2b8Schs 	}
62293f3d2b8Schs 	if (mg->mg_initialized)
62393f3d2b8Schs 		mg->mg_no_free_space = B_FALSE;
62493f3d2b8Schs 
62593f3d2b8Schs 	/*
62693f3d2b8Schs 	 * A metaslab group is considered allocatable if it has plenty
62793f3d2b8Schs 	 * of free space or is not heavily fragmented. We only take
62893f3d2b8Schs 	 * fragmentation into account if the metaslab group has a valid
62993f3d2b8Schs 	 * fragmentation metric (i.e. a value between 0 and 100).
63093f3d2b8Schs 	 */
63193f3d2b8Schs 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
63293f3d2b8Schs 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
63393f3d2b8Schs 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
63493f3d2b8Schs 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
63593f3d2b8Schs 
63693f3d2b8Schs 	/*
63793f3d2b8Schs 	 * The mc_alloc_groups maintains a count of the number of
63893f3d2b8Schs 	 * groups in this metaslab class that are still above the
63993f3d2b8Schs 	 * zfs_mg_noalloc_threshold. This is used by the allocating
64093f3d2b8Schs 	 * threads to determine if they should avoid allocations to
64193f3d2b8Schs 	 * a given group. The allocator will avoid allocations to a group
64293f3d2b8Schs 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
64393f3d2b8Schs 	 * and there are still other groups that are above the threshold.
64493f3d2b8Schs 	 * When a group transitions from allocatable to non-allocatable or
64593f3d2b8Schs 	 * vice versa we update the metaslab class to reflect that change.
64693f3d2b8Schs 	 * When the mc_alloc_groups value drops to 0 that means that all
64793f3d2b8Schs 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
64893f3d2b8Schs 	 * eligible for allocations. This effectively means that all devices
64993f3d2b8Schs 	 * are balanced again.
65093f3d2b8Schs 	 */
65193f3d2b8Schs 	if (was_allocatable && !mg->mg_allocatable)
65293f3d2b8Schs 		mc->mc_alloc_groups--;
65393f3d2b8Schs 	else if (!was_allocatable && mg->mg_allocatable)
65493f3d2b8Schs 		mc->mc_alloc_groups++;
65593f3d2b8Schs 	mutex_exit(&mc->mc_lock);
65693f3d2b8Schs 
65793f3d2b8Schs 	mutex_exit(&mg->mg_lock);
65893f3d2b8Schs }
65993f3d2b8Schs 
660c1cb2cd8Shaad metaslab_group_t *
661c1cb2cd8Shaad metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
662c1cb2cd8Shaad {
663c1cb2cd8Shaad 	metaslab_group_t *mg;
664c1cb2cd8Shaad 
665c1cb2cd8Shaad 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
666c1cb2cd8Shaad 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
667c1cb2cd8Shaad 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
668c1cb2cd8Shaad 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
669c1cb2cd8Shaad 	mg->mg_vd = vd;
670f59c7639Shaad 	mg->mg_class = mc;
671f59c7639Shaad 	mg->mg_activation_count = 0;
67293f3d2b8Schs 	mg->mg_initialized = B_FALSE;
67393f3d2b8Schs 	mg->mg_no_free_space = B_TRUE;
67493f3d2b8Schs 	refcount_create_tracked(&mg->mg_alloc_queue_depth);
67593f3d2b8Schs 
67693f3d2b8Schs 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
67793f3d2b8Schs 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
678c1cb2cd8Shaad 
679c1cb2cd8Shaad 	return (mg);
680c1cb2cd8Shaad }
681c1cb2cd8Shaad 
682c1cb2cd8Shaad void
683c1cb2cd8Shaad metaslab_group_destroy(metaslab_group_t *mg)
684c1cb2cd8Shaad {
685f59c7639Shaad 	ASSERT(mg->mg_prev == NULL);
686f59c7639Shaad 	ASSERT(mg->mg_next == NULL);
687f59c7639Shaad 	/*
688f59c7639Shaad 	 * We may have gone below zero with the activation count
689f59c7639Shaad 	 * either because we never activated in the first place or
690f59c7639Shaad 	 * because we're done, and possibly removing the vdev.
691f59c7639Shaad 	 */
692f59c7639Shaad 	ASSERT(mg->mg_activation_count <= 0);
693f59c7639Shaad 
69493f3d2b8Schs 	taskq_destroy(mg->mg_taskq);
695c1cb2cd8Shaad 	avl_destroy(&mg->mg_metaslab_tree);
696c1cb2cd8Shaad 	mutex_destroy(&mg->mg_lock);
69793f3d2b8Schs 	refcount_destroy(&mg->mg_alloc_queue_depth);
698c1cb2cd8Shaad 	kmem_free(mg, sizeof (metaslab_group_t));
699c1cb2cd8Shaad }
700c1cb2cd8Shaad 
701f59c7639Shaad void
702f59c7639Shaad metaslab_group_activate(metaslab_group_t *mg)
703f59c7639Shaad {
704f59c7639Shaad 	metaslab_class_t *mc = mg->mg_class;
705f59c7639Shaad 	metaslab_group_t *mgprev, *mgnext;
706f59c7639Shaad 
707f59c7639Shaad 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
708f59c7639Shaad 
709f59c7639Shaad 	ASSERT(mc->mc_rotor != mg);
710f59c7639Shaad 	ASSERT(mg->mg_prev == NULL);
711f59c7639Shaad 	ASSERT(mg->mg_next == NULL);
712f59c7639Shaad 	ASSERT(mg->mg_activation_count <= 0);
713f59c7639Shaad 
714f59c7639Shaad 	if (++mg->mg_activation_count <= 0)
715f59c7639Shaad 		return;
716f59c7639Shaad 
717f59c7639Shaad 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
71893f3d2b8Schs 	metaslab_group_alloc_update(mg);
719f59c7639Shaad 
720f59c7639Shaad 	if ((mgprev = mc->mc_rotor) == NULL) {
721f59c7639Shaad 		mg->mg_prev = mg;
722f59c7639Shaad 		mg->mg_next = mg;
723f59c7639Shaad 	} else {
724f59c7639Shaad 		mgnext = mgprev->mg_next;
725f59c7639Shaad 		mg->mg_prev = mgprev;
726f59c7639Shaad 		mg->mg_next = mgnext;
727f59c7639Shaad 		mgprev->mg_next = mg;
728f59c7639Shaad 		mgnext->mg_prev = mg;
729f59c7639Shaad 	}
730f59c7639Shaad 	mc->mc_rotor = mg;
73193f3d2b8Schs 	metaslab_class_minblocksize_update(mc);
732f59c7639Shaad }
733f59c7639Shaad 
734f59c7639Shaad void
735f59c7639Shaad metaslab_group_passivate(metaslab_group_t *mg)
736f59c7639Shaad {
737f59c7639Shaad 	metaslab_class_t *mc = mg->mg_class;
738f59c7639Shaad 	metaslab_group_t *mgprev, *mgnext;
739f59c7639Shaad 
740f59c7639Shaad 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
741f59c7639Shaad 
742f59c7639Shaad 	if (--mg->mg_activation_count != 0) {
743f59c7639Shaad 		ASSERT(mc->mc_rotor != mg);
744f59c7639Shaad 		ASSERT(mg->mg_prev == NULL);
745f59c7639Shaad 		ASSERT(mg->mg_next == NULL);
746f59c7639Shaad 		ASSERT(mg->mg_activation_count < 0);
747f59c7639Shaad 		return;
748f59c7639Shaad 	}
749f59c7639Shaad 
75093f3d2b8Schs 	taskq_wait(mg->mg_taskq);
75193f3d2b8Schs 	metaslab_group_alloc_update(mg);
75293f3d2b8Schs 
753f59c7639Shaad 	mgprev = mg->mg_prev;
754f59c7639Shaad 	mgnext = mg->mg_next;
755f59c7639Shaad 
756f59c7639Shaad 	if (mg == mgnext) {
757f59c7639Shaad 		mc->mc_rotor = NULL;
758f59c7639Shaad 	} else {
759f59c7639Shaad 		mc->mc_rotor = mgnext;
760f59c7639Shaad 		mgprev->mg_next = mgnext;
761f59c7639Shaad 		mgnext->mg_prev = mgprev;
762f59c7639Shaad 	}
763f59c7639Shaad 
764f59c7639Shaad 	mg->mg_prev = NULL;
765f59c7639Shaad 	mg->mg_next = NULL;
76693f3d2b8Schs 	metaslab_class_minblocksize_update(mc);
76793f3d2b8Schs }
76893f3d2b8Schs 
76993f3d2b8Schs boolean_t
77093f3d2b8Schs metaslab_group_initialized(metaslab_group_t *mg)
77193f3d2b8Schs {
77293f3d2b8Schs 	vdev_t *vd = mg->mg_vd;
77393f3d2b8Schs 	vdev_stat_t *vs = &vd->vdev_stat;
77493f3d2b8Schs 
77593f3d2b8Schs 	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
77693f3d2b8Schs }
77793f3d2b8Schs 
77893f3d2b8Schs uint64_t
77993f3d2b8Schs metaslab_group_get_space(metaslab_group_t *mg)
78093f3d2b8Schs {
78193f3d2b8Schs 	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
78293f3d2b8Schs }
78393f3d2b8Schs 
78493f3d2b8Schs void
78593f3d2b8Schs metaslab_group_histogram_verify(metaslab_group_t *mg)
78693f3d2b8Schs {
78793f3d2b8Schs 	uint64_t *mg_hist;
78893f3d2b8Schs 	vdev_t *vd = mg->mg_vd;
78993f3d2b8Schs 	uint64_t ashift = vd->vdev_ashift;
79093f3d2b8Schs 	int i;
79193f3d2b8Schs 
79293f3d2b8Schs 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
79393f3d2b8Schs 		return;
79493f3d2b8Schs 
79593f3d2b8Schs 	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
79693f3d2b8Schs 	    KM_SLEEP);
79793f3d2b8Schs 
79893f3d2b8Schs 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
79993f3d2b8Schs 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
80093f3d2b8Schs 
80193f3d2b8Schs 	for (int m = 0; m < vd->vdev_ms_count; m++) {
80293f3d2b8Schs 		metaslab_t *msp = vd->vdev_ms[m];
80393f3d2b8Schs 
80493f3d2b8Schs 		if (msp->ms_sm == NULL)
80593f3d2b8Schs 			continue;
80693f3d2b8Schs 
80793f3d2b8Schs 		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
80893f3d2b8Schs 			mg_hist[i + ashift] +=
80993f3d2b8Schs 			    msp->ms_sm->sm_phys->smp_histogram[i];
81093f3d2b8Schs 	}
81193f3d2b8Schs 
81293f3d2b8Schs 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
81393f3d2b8Schs 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
81493f3d2b8Schs 
81593f3d2b8Schs 	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
81693f3d2b8Schs }
81793f3d2b8Schs 
81893f3d2b8Schs static void
81993f3d2b8Schs metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
82093f3d2b8Schs {
82193f3d2b8Schs 	metaslab_class_t *mc = mg->mg_class;
82293f3d2b8Schs 	uint64_t ashift = mg->mg_vd->vdev_ashift;
82393f3d2b8Schs 
82493f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
82593f3d2b8Schs 	if (msp->ms_sm == NULL)
82693f3d2b8Schs 		return;
82793f3d2b8Schs 
82893f3d2b8Schs 	mutex_enter(&mg->mg_lock);
82993f3d2b8Schs 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
83093f3d2b8Schs 		mg->mg_histogram[i + ashift] +=
83193f3d2b8Schs 		    msp->ms_sm->sm_phys->smp_histogram[i];
83293f3d2b8Schs 		mc->mc_histogram[i + ashift] +=
83393f3d2b8Schs 		    msp->ms_sm->sm_phys->smp_histogram[i];
83493f3d2b8Schs 	}
83593f3d2b8Schs 	mutex_exit(&mg->mg_lock);
83693f3d2b8Schs }
83793f3d2b8Schs 
83893f3d2b8Schs void
83993f3d2b8Schs metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
84093f3d2b8Schs {
84193f3d2b8Schs 	metaslab_class_t *mc = mg->mg_class;
84293f3d2b8Schs 	uint64_t ashift = mg->mg_vd->vdev_ashift;
84393f3d2b8Schs 
84493f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
84593f3d2b8Schs 	if (msp->ms_sm == NULL)
84693f3d2b8Schs 		return;
84793f3d2b8Schs 
84893f3d2b8Schs 	mutex_enter(&mg->mg_lock);
84993f3d2b8Schs 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
85093f3d2b8Schs 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
85193f3d2b8Schs 		    msp->ms_sm->sm_phys->smp_histogram[i]);
85293f3d2b8Schs 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
85393f3d2b8Schs 		    msp->ms_sm->sm_phys->smp_histogram[i]);
85493f3d2b8Schs 
85593f3d2b8Schs 		mg->mg_histogram[i + ashift] -=
85693f3d2b8Schs 		    msp->ms_sm->sm_phys->smp_histogram[i];
85793f3d2b8Schs 		mc->mc_histogram[i + ashift] -=
85893f3d2b8Schs 		    msp->ms_sm->sm_phys->smp_histogram[i];
85993f3d2b8Schs 	}
86093f3d2b8Schs 	mutex_exit(&mg->mg_lock);
861f59c7639Shaad }
862f59c7639Shaad 
863c1cb2cd8Shaad static void
864c1cb2cd8Shaad metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
865c1cb2cd8Shaad {
866c1cb2cd8Shaad 	ASSERT(msp->ms_group == NULL);
86793f3d2b8Schs 	mutex_enter(&mg->mg_lock);
868c1cb2cd8Shaad 	msp->ms_group = mg;
869c1cb2cd8Shaad 	msp->ms_weight = 0;
870c1cb2cd8Shaad 	avl_add(&mg->mg_metaslab_tree, msp);
871c1cb2cd8Shaad 	mutex_exit(&mg->mg_lock);
87293f3d2b8Schs 
87393f3d2b8Schs 	mutex_enter(&msp->ms_lock);
87493f3d2b8Schs 	metaslab_group_histogram_add(mg, msp);
87593f3d2b8Schs 	mutex_exit(&msp->ms_lock);
876c1cb2cd8Shaad }
877c1cb2cd8Shaad 
878c1cb2cd8Shaad static void
879c1cb2cd8Shaad metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
880c1cb2cd8Shaad {
88193f3d2b8Schs 	mutex_enter(&msp->ms_lock);
88293f3d2b8Schs 	metaslab_group_histogram_remove(mg, msp);
88393f3d2b8Schs 	mutex_exit(&msp->ms_lock);
88493f3d2b8Schs 
885c1cb2cd8Shaad 	mutex_enter(&mg->mg_lock);
886c1cb2cd8Shaad 	ASSERT(msp->ms_group == mg);
887c1cb2cd8Shaad 	avl_remove(&mg->mg_metaslab_tree, msp);
888c1cb2cd8Shaad 	msp->ms_group = NULL;
889c1cb2cd8Shaad 	mutex_exit(&mg->mg_lock);
890c1cb2cd8Shaad }
891c1cb2cd8Shaad 
892c1cb2cd8Shaad static void
893c1cb2cd8Shaad metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
894c1cb2cd8Shaad {
895c1cb2cd8Shaad 	/*
896c1cb2cd8Shaad 	 * Although in principle the weight can be any value, in
89793f3d2b8Schs 	 * practice we do not use values in the range [1, 511].
898c1cb2cd8Shaad 	 */
89993f3d2b8Schs 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
900c1cb2cd8Shaad 	ASSERT(MUTEX_HELD(&msp->ms_lock));
901c1cb2cd8Shaad 
902c1cb2cd8Shaad 	mutex_enter(&mg->mg_lock);
903c1cb2cd8Shaad 	ASSERT(msp->ms_group == mg);
904c1cb2cd8Shaad 	avl_remove(&mg->mg_metaslab_tree, msp);
905c1cb2cd8Shaad 	msp->ms_weight = weight;
906c1cb2cd8Shaad 	avl_add(&mg->mg_metaslab_tree, msp);
907c1cb2cd8Shaad 	mutex_exit(&mg->mg_lock);
908c1cb2cd8Shaad }
909c1cb2cd8Shaad 
910c1cb2cd8Shaad /*
91193f3d2b8Schs  * Calculate the fragmentation for a given metaslab group. We can use
91293f3d2b8Schs  * a simple average here since all metaslabs within the group must have
91393f3d2b8Schs  * the same size. The return value will be a value between 0 and 100
91493f3d2b8Schs  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
91593f3d2b8Schs  * group have a fragmentation metric.
91693f3d2b8Schs  */
91793f3d2b8Schs uint64_t
91893f3d2b8Schs metaslab_group_fragmentation(metaslab_group_t *mg)
91993f3d2b8Schs {
92093f3d2b8Schs 	vdev_t *vd = mg->mg_vd;
92193f3d2b8Schs 	uint64_t fragmentation = 0;
92293f3d2b8Schs 	uint64_t valid_ms = 0;
92393f3d2b8Schs 
92493f3d2b8Schs 	for (int m = 0; m < vd->vdev_ms_count; m++) {
92593f3d2b8Schs 		metaslab_t *msp = vd->vdev_ms[m];
92693f3d2b8Schs 
92793f3d2b8Schs 		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
92893f3d2b8Schs 			continue;
92993f3d2b8Schs 
93093f3d2b8Schs 		valid_ms++;
93193f3d2b8Schs 		fragmentation += msp->ms_fragmentation;
93293f3d2b8Schs 	}
93393f3d2b8Schs 
93493f3d2b8Schs 	if (valid_ms <= vd->vdev_ms_count / 2)
93593f3d2b8Schs 		return (ZFS_FRAG_INVALID);
93693f3d2b8Schs 
93793f3d2b8Schs 	fragmentation /= valid_ms;
93893f3d2b8Schs 	ASSERT3U(fragmentation, <=, 100);
93993f3d2b8Schs 	return (fragmentation);
94093f3d2b8Schs }
94193f3d2b8Schs 
94293f3d2b8Schs /*
94393f3d2b8Schs  * Determine if a given metaslab group should skip allocations. A metaslab
94493f3d2b8Schs  * group should avoid allocations if its free capacity is less than the
94593f3d2b8Schs  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
94693f3d2b8Schs  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
94793f3d2b8Schs  * that can still handle allocations. If the allocation throttle is enabled
94893f3d2b8Schs  * then we skip allocations to devices that have reached their maximum
94993f3d2b8Schs  * allocation queue depth unless the selected metaslab group is the only
95093f3d2b8Schs  * eligible group remaining.
95193f3d2b8Schs  */
95293f3d2b8Schs static boolean_t
95393f3d2b8Schs metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
95493f3d2b8Schs     uint64_t psize)
95593f3d2b8Schs {
95693f3d2b8Schs 	spa_t *spa = mg->mg_vd->vdev_spa;
95793f3d2b8Schs 	metaslab_class_t *mc = mg->mg_class;
95893f3d2b8Schs 
95993f3d2b8Schs 	/*
96093f3d2b8Schs 	 * We can only consider skipping this metaslab group if it's
96193f3d2b8Schs 	 * in the normal metaslab class and there are other metaslab
96293f3d2b8Schs 	 * groups to select from. Otherwise, we always consider it eligible
96393f3d2b8Schs 	 * for allocations.
96493f3d2b8Schs 	 */
96593f3d2b8Schs 	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
96693f3d2b8Schs 		return (B_TRUE);
96793f3d2b8Schs 
96893f3d2b8Schs 	/*
96993f3d2b8Schs 	 * If the metaslab group's mg_allocatable flag is set (see comments
97093f3d2b8Schs 	 * in metaslab_group_alloc_update() for more information) and
97193f3d2b8Schs 	 * the allocation throttle is disabled then allow allocations to this
97293f3d2b8Schs 	 * device. However, if the allocation throttle is enabled then
97393f3d2b8Schs 	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
97493f3d2b8Schs 	 * to determine if we should allow allocations to this metaslab group.
97593f3d2b8Schs 	 * If all metaslab groups are no longer considered allocatable
97693f3d2b8Schs 	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
97793f3d2b8Schs 	 * gang block size then we allow allocations on this metaslab group
97893f3d2b8Schs 	 * regardless of the mg_allocatable or throttle settings.
97993f3d2b8Schs 	 */
98093f3d2b8Schs 	if (mg->mg_allocatable) {
98193f3d2b8Schs 		metaslab_group_t *mgp;
98293f3d2b8Schs 		int64_t qdepth;
98393f3d2b8Schs 		uint64_t qmax = mg->mg_max_alloc_queue_depth;
98493f3d2b8Schs 
98593f3d2b8Schs 		if (!mc->mc_alloc_throttle_enabled)
98693f3d2b8Schs 			return (B_TRUE);
98793f3d2b8Schs 
98893f3d2b8Schs 		/*
98993f3d2b8Schs 		 * If this metaslab group does not have any free space, then
99093f3d2b8Schs 		 * there is no point in looking further.
99193f3d2b8Schs 		 */
99293f3d2b8Schs 		if (mg->mg_no_free_space)
99393f3d2b8Schs 			return (B_FALSE);
99493f3d2b8Schs 
99593f3d2b8Schs 		qdepth = refcount_count(&mg->mg_alloc_queue_depth);
99693f3d2b8Schs 
99793f3d2b8Schs 		/*
99893f3d2b8Schs 		 * If this metaslab group is below its qmax or it's
99993f3d2b8Schs 		 * the only allocatable metaslab group, then attempt
100093f3d2b8Schs 		 * to allocate from it.
100193f3d2b8Schs 		 */
100293f3d2b8Schs 		if (qdepth < qmax || mc->mc_alloc_groups == 1)
100393f3d2b8Schs 			return (B_TRUE);
100493f3d2b8Schs 		ASSERT3U(mc->mc_alloc_groups, >, 1);
100593f3d2b8Schs 
100693f3d2b8Schs 		/*
100793f3d2b8Schs 		 * Since this metaslab group is at or over its qmax, we
100893f3d2b8Schs 		 * need to determine if there are metaslab groups after this
100993f3d2b8Schs 		 * one that might be able to handle this allocation. This is
101093f3d2b8Schs 		 * racy since we can't hold the locks for all metaslab
101193f3d2b8Schs 		 * groups at the same time when we make this check.
101293f3d2b8Schs 		 */
101393f3d2b8Schs 		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
101493f3d2b8Schs 			qmax = mgp->mg_max_alloc_queue_depth;
101593f3d2b8Schs 
101693f3d2b8Schs 			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
101793f3d2b8Schs 
101893f3d2b8Schs 			/*
101993f3d2b8Schs 			 * If there is another metaslab group that
102093f3d2b8Schs 			 * might be able to handle the allocation, then
102193f3d2b8Schs 			 * we return false so that we skip this group.
102293f3d2b8Schs 			 */
102393f3d2b8Schs 			if (qdepth < qmax && !mgp->mg_no_free_space)
102493f3d2b8Schs 				return (B_FALSE);
102593f3d2b8Schs 		}
102693f3d2b8Schs 
102793f3d2b8Schs 		/*
102893f3d2b8Schs 		 * We didn't find another group to handle the allocation
102993f3d2b8Schs 		 * so we can't skip this metaslab group even though
103093f3d2b8Schs 		 * we are at or over our qmax.
103193f3d2b8Schs 		 */
103293f3d2b8Schs 		return (B_TRUE);
103393f3d2b8Schs 
103493f3d2b8Schs 	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
103593f3d2b8Schs 		return (B_TRUE);
103693f3d2b8Schs 	}
103793f3d2b8Schs 	return (B_FALSE);
103893f3d2b8Schs }
103993f3d2b8Schs 
104093f3d2b8Schs /*
104193f3d2b8Schs  * ==========================================================================
104293f3d2b8Schs  * Range tree callbacks
104393f3d2b8Schs  * ==========================================================================
104493f3d2b8Schs  */
104593f3d2b8Schs 
104693f3d2b8Schs /*
104793f3d2b8Schs  * Comparison function for the private size-ordered tree. Tree is sorted
104893f3d2b8Schs  * by size, larger sizes at the end of the tree.
104993f3d2b8Schs  */
105093f3d2b8Schs static int
105193f3d2b8Schs metaslab_rangesize_compare(const void *x1, const void *x2)
105293f3d2b8Schs {
105393f3d2b8Schs 	const range_seg_t *r1 = x1;
105493f3d2b8Schs 	const range_seg_t *r2 = x2;
105593f3d2b8Schs 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
105693f3d2b8Schs 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
105793f3d2b8Schs 
105893f3d2b8Schs 	if (rs_size1 < rs_size2)
105993f3d2b8Schs 		return (-1);
106093f3d2b8Schs 	if (rs_size1 > rs_size2)
106193f3d2b8Schs 		return (1);
106293f3d2b8Schs 
106393f3d2b8Schs 	if (r1->rs_start < r2->rs_start)
106493f3d2b8Schs 		return (-1);
106593f3d2b8Schs 
106693f3d2b8Schs 	if (r1->rs_start > r2->rs_start)
106793f3d2b8Schs 		return (1);
106893f3d2b8Schs 
106993f3d2b8Schs 	return (0);
107093f3d2b8Schs }
107193f3d2b8Schs 
107293f3d2b8Schs /*
107393f3d2b8Schs  * Create any block allocator specific components. The current allocators
107493f3d2b8Schs  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
107593f3d2b8Schs  */
107693f3d2b8Schs static void
107793f3d2b8Schs metaslab_rt_create(range_tree_t *rt, void *arg)
107893f3d2b8Schs {
107993f3d2b8Schs 	metaslab_t *msp = arg;
108093f3d2b8Schs 
108193f3d2b8Schs 	ASSERT3P(rt->rt_arg, ==, msp);
108293f3d2b8Schs 	ASSERT(msp->ms_tree == NULL);
108393f3d2b8Schs 
108493f3d2b8Schs 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
108593f3d2b8Schs 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
108693f3d2b8Schs }
108793f3d2b8Schs 
108893f3d2b8Schs /*
108993f3d2b8Schs  * Destroy the block allocator specific components.
109093f3d2b8Schs  */
109193f3d2b8Schs static void
109293f3d2b8Schs metaslab_rt_destroy(range_tree_t *rt, void *arg)
109393f3d2b8Schs {
109493f3d2b8Schs 	metaslab_t *msp = arg;
109593f3d2b8Schs 
109693f3d2b8Schs 	ASSERT3P(rt->rt_arg, ==, msp);
109793f3d2b8Schs 	ASSERT3P(msp->ms_tree, ==, rt);
109893f3d2b8Schs 	ASSERT0(avl_numnodes(&msp->ms_size_tree));
109993f3d2b8Schs 
110093f3d2b8Schs 	avl_destroy(&msp->ms_size_tree);
110193f3d2b8Schs }
110293f3d2b8Schs 
110393f3d2b8Schs static void
110493f3d2b8Schs metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
110593f3d2b8Schs {
110693f3d2b8Schs 	metaslab_t *msp = arg;
110793f3d2b8Schs 
110893f3d2b8Schs 	ASSERT3P(rt->rt_arg, ==, msp);
110993f3d2b8Schs 	ASSERT3P(msp->ms_tree, ==, rt);
111093f3d2b8Schs 	VERIFY(!msp->ms_condensing);
111193f3d2b8Schs 	avl_add(&msp->ms_size_tree, rs);
111293f3d2b8Schs }
111393f3d2b8Schs 
111493f3d2b8Schs static void
111593f3d2b8Schs metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
111693f3d2b8Schs {
111793f3d2b8Schs 	metaslab_t *msp = arg;
111893f3d2b8Schs 
111993f3d2b8Schs 	ASSERT3P(rt->rt_arg, ==, msp);
112093f3d2b8Schs 	ASSERT3P(msp->ms_tree, ==, rt);
112193f3d2b8Schs 	VERIFY(!msp->ms_condensing);
112293f3d2b8Schs 	avl_remove(&msp->ms_size_tree, rs);
112393f3d2b8Schs }
112493f3d2b8Schs 
112593f3d2b8Schs static void
112693f3d2b8Schs metaslab_rt_vacate(range_tree_t *rt, void *arg)
112793f3d2b8Schs {
112893f3d2b8Schs 	metaslab_t *msp = arg;
112993f3d2b8Schs 
113093f3d2b8Schs 	ASSERT3P(rt->rt_arg, ==, msp);
113193f3d2b8Schs 	ASSERT3P(msp->ms_tree, ==, rt);
113293f3d2b8Schs 
113393f3d2b8Schs 	/*
113493f3d2b8Schs 	 * Normally one would walk the tree freeing nodes along the way.
113593f3d2b8Schs 	 * Since the nodes are shared with the range trees we can avoid
113693f3d2b8Schs 	 * walking all nodes and just reinitialize the avl tree. The nodes
113793f3d2b8Schs 	 * will be freed by the range tree, so we don't want to free them here.
113893f3d2b8Schs 	 */
113993f3d2b8Schs 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
114093f3d2b8Schs 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
114193f3d2b8Schs }
114293f3d2b8Schs 
114393f3d2b8Schs static range_tree_ops_t metaslab_rt_ops = {
114493f3d2b8Schs 	metaslab_rt_create,
114593f3d2b8Schs 	metaslab_rt_destroy,
114693f3d2b8Schs 	metaslab_rt_add,
114793f3d2b8Schs 	metaslab_rt_remove,
114893f3d2b8Schs 	metaslab_rt_vacate
114993f3d2b8Schs };
115093f3d2b8Schs 
115193f3d2b8Schs /*
1152c1cb2cd8Shaad  * ==========================================================================
1153f59c7639Shaad  * Common allocator routines
1154c1cb2cd8Shaad  * ==========================================================================
1155c1cb2cd8Shaad  */
115693f3d2b8Schs 
115793f3d2b8Schs /*
115893f3d2b8Schs  * Return the maximum contiguous segment within the metaslab.
115993f3d2b8Schs  */
116093f3d2b8Schs uint64_t
116193f3d2b8Schs metaslab_block_maxsize(metaslab_t *msp)
1162c1cb2cd8Shaad {
116393f3d2b8Schs 	avl_tree_t *t = &msp->ms_size_tree;
116493f3d2b8Schs 	range_seg_t *rs;
1165f59c7639Shaad 
116693f3d2b8Schs 	if (t == NULL || (rs = avl_last(t)) == NULL)
116793f3d2b8Schs 		return (0ULL);
1168f59c7639Shaad 
116993f3d2b8Schs 	return (rs->rs_end - rs->rs_start);
117093f3d2b8Schs }
1171f59c7639Shaad 
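/*
 * Locate a free segment suitable for an allocation of "size" bytes starting
 * the search at "start": avl_find() returns a segment matching the search
 * range, and if none matches, avl_nearest(AVL_AFTER) falls back to the next
 * segment at a higher offset (or NULL when the tree is exhausted).
 */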
117293f3d2b8Schs static range_seg_t *
117393f3d2b8Schs metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
117493f3d2b8Schs {
117593f3d2b8Schs 	range_seg_t *rs, rsearch;
117693f3d2b8Schs 	avl_index_t where;
117793f3d2b8Schs 
117893f3d2b8Schs 	rsearch.rs_start = start;
117993f3d2b8Schs 	rsearch.rs_end = start + size;
118093f3d2b8Schs 
118193f3d2b8Schs 	rs = avl_find(t, &rsearch, &where);
118293f3d2b8Schs 	if (rs == NULL) {
118393f3d2b8Schs 		rs = avl_nearest(t, where, AVL_AFTER);
118493f3d2b8Schs 	}
118593f3d2b8Schs 
118693f3d2b8Schs 	return (rs);
1187c1cb2cd8Shaad }
1188c1cb2cd8Shaad 
1189f59c7639Shaad /*
1190f59c7639Shaad  * This is a helper function that can be used by the allocator to find
1191f59c7639Shaad  * a suitable block to allocate. This will search the specified AVL
1192f59c7639Shaad  * tree looking for a block that matches the specified criteria.
1193f59c7639Shaad  */
1194c1cb2cd8Shaad static uint64_t
1195f59c7639Shaad metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1196f59c7639Shaad     uint64_t align)
1197c1cb2cd8Shaad {
119893f3d2b8Schs 	range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1199c1cb2cd8Shaad 
120093f3d2b8Schs 	while (rs != NULL) {
120193f3d2b8Schs 		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1202c1cb2cd8Shaad 
120393f3d2b8Schs 		if (offset + size <= rs->rs_end) {
1204c1cb2cd8Shaad 			*cursor = offset + size;
1205c1cb2cd8Shaad 			return (offset);
1206c1cb2cd8Shaad 		}
120793f3d2b8Schs 		rs = AVL_NEXT(t, rs);
1208c1cb2cd8Shaad 	}
1209c1cb2cd8Shaad 
1210c1cb2cd8Shaad 	/*
1211c1cb2cd8Shaad 	 * If we know we've searched the whole map (*cursor == 0), give up.
1212c1cb2cd8Shaad 	 * Otherwise, reset the cursor to the beginning and try again.
1213c1cb2cd8Shaad 	 */
1214c1cb2cd8Shaad 	if (*cursor == 0)
1215c1cb2cd8Shaad 		return (-1ULL);
1216c1cb2cd8Shaad 
1217c1cb2cd8Shaad 	*cursor = 0;
1218f59c7639Shaad 	return (metaslab_block_picker(t, cursor, size, align));
1219f59c7639Shaad }
1220f59c7639Shaad 
1221f59c7639Shaad /*
1222f59c7639Shaad  * ==========================================================================
1223f59c7639Shaad  * The first-fit block allocator
1224f59c7639Shaad  * ==========================================================================
1225f59c7639Shaad  */
1226f59c7639Shaad static uint64_t
122793f3d2b8Schs metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1228f59c7639Shaad {
122993f3d2b8Schs 	/*
123093f3d2b8Schs 	 * Find the largest power of 2 block size that evenly divides the
123193f3d2b8Schs 	 * requested size. This is used to try to allocate blocks with similar
123293f3d2b8Schs 	 * alignment from the same area of the metaslab (i.e. same cursor
123393f3d2b8Schs 	 * bucket), but it does not guarantee that other allocation sizes
123493f3d2b8Schs 	 * will not occur in the same region.
123593f3d2b8Schs 	 */
1236f59c7639Shaad 	uint64_t align = size & -size;
123793f3d2b8Schs 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
123893f3d2b8Schs 	avl_tree_t *t = &msp->ms_tree->rt_root;
1239f59c7639Shaad 
1240f59c7639Shaad 	return (metaslab_block_picker(t, cursor, size, align));
1241f59c7639Shaad }
1242f59c7639Shaad 
124393f3d2b8Schs static metaslab_ops_t metaslab_ff_ops = {
124493f3d2b8Schs 	metaslab_ff_alloc
1245c1cb2cd8Shaad };
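/*
 * Illustrative sketch only (not compiled): how the first-fit cursor bucket
 * above is derived from an allocation size. The helper name
 * metaslab_ff_cursor_bucket() is hypothetical; it simply restates the
 * "size & -size" / highbit64() arithmetic. For example, a 12K request has
 * a largest power-of-two divisor of 4K and therefore shares the cursor in
 * ms_lbas[12] with every other 4K-aligned request size.
 */
#if 0
static int
metaslab_ff_cursor_bucket(uint64_t size)
{
	/* largest power-of-2 block size that evenly divides the request */
	uint64_t align = size & -size;

	/* size = 12K (0x3000) -> align = 4K -> bucket index 12 */
	return (highbit64(align) - 1);
}
#endif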
1246c1cb2cd8Shaad 
1247c1cb2cd8Shaad /*
1248c1cb2cd8Shaad  * ==========================================================================
1249f59c7639Shaad  * Dynamic block allocator -
1250f59c7639Shaad  * Uses the first fit allocation scheme until space gets low and then
1251f59c7639Shaad  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1252f59c7639Shaad  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1253f59c7639Shaad  * ==========================================================================
1254f59c7639Shaad  */
1255f59c7639Shaad static uint64_t
125693f3d2b8Schs metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1257f59c7639Shaad {
125893f3d2b8Schs 	/*
125993f3d2b8Schs 	 * Find the largest power of 2 block size that evenly divides the
126093f3d2b8Schs 	 * requested size. This is used to try to allocate blocks with similar
126193f3d2b8Schs 	 * alignment from the same area of the metaslab (i.e. same cursor
126293f3d2b8Schs 	 * bucket), but it does not guarantee that other allocation sizes
126393f3d2b8Schs 	 * will not occur in the same region.
126493f3d2b8Schs 	 */
1265f59c7639Shaad 	uint64_t align = size & -size;
126693f3d2b8Schs 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
126793f3d2b8Schs 	range_tree_t *rt = msp->ms_tree;
126893f3d2b8Schs 	avl_tree_t *t = &rt->rt_root;
126993f3d2b8Schs 	uint64_t max_size = metaslab_block_maxsize(msp);
127093f3d2b8Schs 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1271f59c7639Shaad 
127293f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
127393f3d2b8Schs 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1274f59c7639Shaad 
1275f59c7639Shaad 	if (max_size < size)
1276f59c7639Shaad 		return (-1ULL);
1277f59c7639Shaad 
1278f59c7639Shaad 	/*
1279f59c7639Shaad 	 * If we're running low on space switch to using the size
1280f59c7639Shaad 	 * sorted AVL tree (best-fit).
1281f59c7639Shaad 	 */
1282f59c7639Shaad 	if (max_size < metaslab_df_alloc_threshold ||
1283f59c7639Shaad 	    free_pct < metaslab_df_free_pct) {
128493f3d2b8Schs 		t = &msp->ms_size_tree;
1285f59c7639Shaad 		*cursor = 0;
1286f59c7639Shaad 	}
1287f59c7639Shaad 
1288f59c7639Shaad 	return (metaslab_block_picker(t, cursor, size, 1ULL));
1289f59c7639Shaad }
1290f59c7639Shaad 
129193f3d2b8Schs static metaslab_ops_t metaslab_df_ops = {
129293f3d2b8Schs 	metaslab_df_alloc
1293f59c7639Shaad };
1294f59c7639Shaad 
1295f59c7639Shaad /*
1296f59c7639Shaad  * ==========================================================================
129793f3d2b8Schs  * Cursor fit block allocator -
129893f3d2b8Schs  * Select the largest region in the metaslab, set the cursor to the beginning
129993f3d2b8Schs  * of the range and the cursor_end to the end of the range. As allocations
130093f3d2b8Schs  * are made advance the cursor. Continue allocating from the cursor until
130193f3d2b8Schs  * the range is exhausted and then find a new range.
1302f59c7639Shaad  * ==========================================================================
1303f59c7639Shaad  */
1304f59c7639Shaad static uint64_t
130593f3d2b8Schs metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1306f59c7639Shaad {
130793f3d2b8Schs 	range_tree_t *rt = msp->ms_tree;
130893f3d2b8Schs 	avl_tree_t *t = &msp->ms_size_tree;
130993f3d2b8Schs 	uint64_t *cursor = &msp->ms_lbas[0];
131093f3d2b8Schs 	uint64_t *cursor_end = &msp->ms_lbas[1];
1311f59c7639Shaad 	uint64_t offset = 0;
1312f59c7639Shaad 
131393f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
131493f3d2b8Schs 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1315f59c7639Shaad 
131693f3d2b8Schs 	ASSERT3U(*cursor_end, >=, *cursor);
131793f3d2b8Schs 
131893f3d2b8Schs 	if ((*cursor + size) > *cursor_end) {
131993f3d2b8Schs 		range_seg_t *rs;
132093f3d2b8Schs 
132193f3d2b8Schs 		rs = avl_last(&msp->ms_size_tree);
132293f3d2b8Schs 		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1323f59c7639Shaad 			return (-1ULL);
1324f59c7639Shaad 
132593f3d2b8Schs 		*cursor = rs->rs_start;
132693f3d2b8Schs 		*cursor_end = rs->rs_end;
1327f59c7639Shaad 	}
132893f3d2b8Schs 
132993f3d2b8Schs 	offset = *cursor;
133093f3d2b8Schs 	*cursor += size;
133193f3d2b8Schs 
1332f59c7639Shaad 	return (offset);
1333f59c7639Shaad }
1334f59c7639Shaad 
133593f3d2b8Schs static metaslab_ops_t metaslab_cf_ops = {
133693f3d2b8Schs 	metaslab_cf_alloc
1337f59c7639Shaad };
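/*
 * Illustrative sketch only (not compiled): a hypothetical walk-through of
 * the cursor bookkeeping above. Allocating three 128K blocks from a
 * largest free segment of [1M, 2M) returns 1M, 1M + 128K and 1M + 256K,
 * advancing the cursor each time; a new "largest segment" is only selected
 * from ms_size_tree once a request no longer fits before cursor_end. The
 * helper name is hypothetical.
 */
#if 0
static uint64_t
metaslab_cf_example(void)
{
	uint64_t cursor = 1ULL << 20;		/* rs_start of largest seg */
	uint64_t cursor_end = 2ULL << 20;	/* rs_end of largest seg */
	uint64_t size = 128ULL << 10;
	uint64_t offset = 0;

	for (int i = 0; i < 3; i++) {
		ASSERT3U(cursor + size, <=, cursor_end);
		offset = cursor;		/* offset handed out */
		cursor += size;
	}
	return (offset);			/* 1M + 256K */
}
#endif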
1338f59c7639Shaad 
133993f3d2b8Schs /*
134093f3d2b8Schs  * ==========================================================================
134193f3d2b8Schs  * New dynamic fit allocator -
134293f3d2b8Schs  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
134393f3d2b8Schs  * contiguous blocks. If no region is found then just use the largest segment
134493f3d2b8Schs  * that remains.
134593f3d2b8Schs  * ==========================================================================
134693f3d2b8Schs  */
1347f59c7639Shaad 
134893f3d2b8Schs /*
134993f3d2b8Schs  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
135093f3d2b8Schs  * to request from the allocator.
135193f3d2b8Schs  */
135293f3d2b8Schs uint64_t metaslab_ndf_clump_shift = 4;
135393f3d2b8Schs 
135493f3d2b8Schs static uint64_t
135593f3d2b8Schs metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
135693f3d2b8Schs {
135793f3d2b8Schs 	avl_tree_t *t = &msp->ms_tree->rt_root;
135893f3d2b8Schs 	avl_index_t where;
135993f3d2b8Schs 	range_seg_t *rs, rsearch;
136093f3d2b8Schs 	uint64_t hbit = highbit64(size);
136193f3d2b8Schs 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
136293f3d2b8Schs 	uint64_t max_size = metaslab_block_maxsize(msp);
136393f3d2b8Schs 
136493f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
136593f3d2b8Schs 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1366f59c7639Shaad 
1367f59c7639Shaad 	if (max_size < size)
1368f59c7639Shaad 		return (-1ULL);
1369f59c7639Shaad 
137093f3d2b8Schs 	rsearch.rs_start = *cursor;
137193f3d2b8Schs 	rsearch.rs_end = *cursor + size;
1372f59c7639Shaad 
137393f3d2b8Schs 	rs = avl_find(t, &rsearch, &where);
137493f3d2b8Schs 	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
137593f3d2b8Schs 		t = &msp->ms_size_tree;
1376f59c7639Shaad 
137793f3d2b8Schs 		rsearch.rs_start = 0;
137893f3d2b8Schs 		rsearch.rs_end = MIN(max_size,
137993f3d2b8Schs 		    1ULL << (hbit + metaslab_ndf_clump_shift));
138093f3d2b8Schs 		rs = avl_find(t, &rsearch, &where);
138193f3d2b8Schs 		if (rs == NULL)
138293f3d2b8Schs 			rs = avl_nearest(t, where, AVL_AFTER);
138393f3d2b8Schs 		ASSERT(rs != NULL);
1384f59c7639Shaad 	}
1385f59c7639Shaad 
138693f3d2b8Schs 	if ((rs->rs_end - rs->rs_start) >= size) {
138793f3d2b8Schs 		*cursor = rs->rs_start + size;
138893f3d2b8Schs 		return (rs->rs_start);
1389f59c7639Shaad 	}
1390f59c7639Shaad 	return (-1ULL);
1391f59c7639Shaad }
1392f59c7639Shaad 
139393f3d2b8Schs static metaslab_ops_t metaslab_ndf_ops = {
139493f3d2b8Schs 	metaslab_ndf_alloc
1395f59c7639Shaad };
1396f59c7639Shaad 
139793f3d2b8Schs metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1398f59c7639Shaad 
1399f59c7639Shaad /*
1400f59c7639Shaad  * ==========================================================================
1401c1cb2cd8Shaad  * Metaslabs
1402c1cb2cd8Shaad  * ==========================================================================
1403c1cb2cd8Shaad  */
1404c1cb2cd8Shaad 
1405c1cb2cd8Shaad /*
140693f3d2b8Schs  * Wait for any in-progress metaslab loads to complete.
140793f3d2b8Schs  */
140893f3d2b8Schs void
140993f3d2b8Schs metaslab_load_wait(metaslab_t *msp)
141093f3d2b8Schs {
141193f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
141293f3d2b8Schs 
141393f3d2b8Schs 	while (msp->ms_loading) {
141493f3d2b8Schs 		ASSERT(!msp->ms_loaded);
141593f3d2b8Schs 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
141693f3d2b8Schs 	}
141793f3d2b8Schs }
141893f3d2b8Schs 
141993f3d2b8Schs int
142093f3d2b8Schs metaslab_load(metaslab_t *msp)
142193f3d2b8Schs {
142293f3d2b8Schs 	int error = 0;
142393f3d2b8Schs 	boolean_t success = B_FALSE;
142493f3d2b8Schs 
142593f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
142693f3d2b8Schs 	ASSERT(!msp->ms_loaded);
142793f3d2b8Schs 	ASSERT(!msp->ms_loading);
142893f3d2b8Schs 
142993f3d2b8Schs 	msp->ms_loading = B_TRUE;
143093f3d2b8Schs 
143193f3d2b8Schs 	/*
143293f3d2b8Schs 	 * If the space map has not been allocated yet, then treat
143393f3d2b8Schs 	 * all the space in the metaslab as free and add it to the
143493f3d2b8Schs 	 * ms_tree.
143593f3d2b8Schs 	 */
143693f3d2b8Schs 	if (msp->ms_sm != NULL)
143793f3d2b8Schs 		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
143893f3d2b8Schs 	else
143993f3d2b8Schs 		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
144093f3d2b8Schs 
144193f3d2b8Schs 	success = (error == 0);
144293f3d2b8Schs 	msp->ms_loading = B_FALSE;
144393f3d2b8Schs 
144493f3d2b8Schs 	if (success) {
144593f3d2b8Schs 		ASSERT3P(msp->ms_group, !=, NULL);
144693f3d2b8Schs 		msp->ms_loaded = B_TRUE;
144793f3d2b8Schs 
144893f3d2b8Schs 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
144993f3d2b8Schs 			range_tree_walk(msp->ms_defertree[t],
145093f3d2b8Schs 			    range_tree_remove, msp->ms_tree);
145193f3d2b8Schs 		}
145293f3d2b8Schs 		msp->ms_max_size = metaslab_block_maxsize(msp);
145393f3d2b8Schs 	}
145493f3d2b8Schs 	cv_broadcast(&msp->ms_load_cv);
145593f3d2b8Schs 	return (error);
145693f3d2b8Schs }
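/*
 * Illustrative sketch only (not compiled): the usual caller protocol for
 * the two functions above, mirroring what metaslab_activate() and
 * metaslab_preload() do later in this file. The helper name
 * metaslab_ensure_loaded() is hypothetical.
 */
#if 0
static int
metaslab_ensure_loaded(metaslab_t *msp)
{
	int error = 0;

	mutex_enter(&msp->ms_lock);
	metaslab_load_wait(msp);		/* wait out an in-flight load */
	if (!msp->ms_loaded)
		error = metaslab_load(msp);
	mutex_exit(&msp->ms_lock);
	return (error);
}
#endif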
145793f3d2b8Schs 
145893f3d2b8Schs void
145993f3d2b8Schs metaslab_unload(metaslab_t *msp)
146093f3d2b8Schs {
146193f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
146293f3d2b8Schs 	range_tree_vacate(msp->ms_tree, NULL, NULL);
146393f3d2b8Schs 	msp->ms_loaded = B_FALSE;
146493f3d2b8Schs 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
146593f3d2b8Schs 	msp->ms_max_size = 0;
146693f3d2b8Schs }
146793f3d2b8Schs 
146893f3d2b8Schs int
146993f3d2b8Schs metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
147093f3d2b8Schs     metaslab_t **msp)
147193f3d2b8Schs {
147293f3d2b8Schs 	vdev_t *vd = mg->mg_vd;
147393f3d2b8Schs 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
147493f3d2b8Schs 	metaslab_t *ms;
147593f3d2b8Schs 	int error;
147693f3d2b8Schs 
147793f3d2b8Schs 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
147893f3d2b8Schs 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
147993f3d2b8Schs 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
148093f3d2b8Schs 	ms->ms_id = id;
148193f3d2b8Schs 	ms->ms_start = id << vd->vdev_ms_shift;
148293f3d2b8Schs 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
148393f3d2b8Schs 
148493f3d2b8Schs 	/*
148593f3d2b8Schs 	 * We only open space map objects that already exist. All others
148693f3d2b8Schs 	 * will be opened when we finally allocate an object for it.
148793f3d2b8Schs 	 */
148893f3d2b8Schs 	if (object != 0) {
148993f3d2b8Schs 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
149093f3d2b8Schs 		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
149193f3d2b8Schs 
149293f3d2b8Schs 		if (error != 0) {
149393f3d2b8Schs 			kmem_free(ms, sizeof (metaslab_t));
149493f3d2b8Schs 			return (error);
149593f3d2b8Schs 		}
149693f3d2b8Schs 
149793f3d2b8Schs 		ASSERT(ms->ms_sm != NULL);
149893f3d2b8Schs 	}
149993f3d2b8Schs 
150093f3d2b8Schs 	/*
150193f3d2b8Schs 	 * We create the main range tree here, but we don't create the
150293f3d2b8Schs 	 * alloctree and freetree until metaslab_sync_done().  This serves
1503c1cb2cd8Shaad 	 * two purposes: it allows metaslab_sync_done() to detect the
1504c1cb2cd8Shaad 	 * addition of new space; and for debugging, it ensures that we'd
1505c1cb2cd8Shaad 	 * data fault on any attempt to use this metaslab before it's ready.
1506c1cb2cd8Shaad 	 */
150793f3d2b8Schs 	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
150893f3d2b8Schs 	metaslab_group_add(mg, ms);
1509c1cb2cd8Shaad 
151093f3d2b8Schs 	metaslab_set_fragmentation(ms);
1511f59c7639Shaad 
1512c1cb2cd8Shaad 	/*
1513c1cb2cd8Shaad 	 * If we're opening an existing pool (txg == 0) or creating
1514c1cb2cd8Shaad 	 * a new one (txg == TXG_INITIAL), all space is available now.
1515c1cb2cd8Shaad 	 * If we're adding space to an existing pool, the new space
1516c1cb2cd8Shaad 	 * does not become available until after this txg has synced.
151793f3d2b8Schs 	 * The metaslab's weight will also be initialized when we sync
151893f3d2b8Schs 	 * out this txg. This ensures that we don't attempt to allocate
151993f3d2b8Schs 	 * from it before we have initialized it completely.
1520c1cb2cd8Shaad 	 */
1521c1cb2cd8Shaad 	if (txg <= TXG_INITIAL)
152293f3d2b8Schs 		metaslab_sync_done(ms, 0);
152393f3d2b8Schs 
152493f3d2b8Schs 	/*
152593f3d2b8Schs 	 * If metaslab_debug_load is set and we're initializing a metaslab
152693f3d2b8Schs 	 * that has an allocated space map object, then load its space
152793f3d2b8Schs 	 * map so that we can verify frees.
152893f3d2b8Schs 	 */
152993f3d2b8Schs 	if (metaslab_debug_load && ms->ms_sm != NULL) {
153093f3d2b8Schs 		mutex_enter(&ms->ms_lock);
153193f3d2b8Schs 		VERIFY0(metaslab_load(ms));
153293f3d2b8Schs 		mutex_exit(&ms->ms_lock);
153393f3d2b8Schs 	}
1534c1cb2cd8Shaad 
1535c1cb2cd8Shaad 	if (txg != 0) {
1536c1cb2cd8Shaad 		vdev_dirty(vd, 0, NULL, txg);
153793f3d2b8Schs 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
1538c1cb2cd8Shaad 	}
1539c1cb2cd8Shaad 
154093f3d2b8Schs 	*msp = ms;
154193f3d2b8Schs 
154293f3d2b8Schs 	return (0);
1543c1cb2cd8Shaad }
1544c1cb2cd8Shaad 
1545c1cb2cd8Shaad void
1546c1cb2cd8Shaad metaslab_fini(metaslab_t *msp)
1547c1cb2cd8Shaad {
1548c1cb2cd8Shaad 	metaslab_group_t *mg = msp->ms_group;
1549c1cb2cd8Shaad 
1550c1cb2cd8Shaad 	metaslab_group_remove(mg, msp);
1551c1cb2cd8Shaad 
1552c1cb2cd8Shaad 	mutex_enter(&msp->ms_lock);
155393f3d2b8Schs 	VERIFY(msp->ms_group == NULL);
155493f3d2b8Schs 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
155593f3d2b8Schs 	    0, -msp->ms_size);
155693f3d2b8Schs 	space_map_close(msp->ms_sm);
1557c1cb2cd8Shaad 
155893f3d2b8Schs 	metaslab_unload(msp);
155993f3d2b8Schs 	range_tree_destroy(msp->ms_tree);
1560c1cb2cd8Shaad 
1561f59c7639Shaad 	for (int t = 0; t < TXG_SIZE; t++) {
156293f3d2b8Schs 		range_tree_destroy(msp->ms_alloctree[t]);
156393f3d2b8Schs 		range_tree_destroy(msp->ms_freetree[t]);
1564c1cb2cd8Shaad 	}
1565c1cb2cd8Shaad 
156693f3d2b8Schs 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
156793f3d2b8Schs 		range_tree_destroy(msp->ms_defertree[t]);
156893f3d2b8Schs 	}
1569f59c7639Shaad 
157093f3d2b8Schs 	ASSERT0(msp->ms_deferspace);
1571f59c7639Shaad 
1572c1cb2cd8Shaad 	mutex_exit(&msp->ms_lock);
157393f3d2b8Schs 	cv_destroy(&msp->ms_load_cv);
1574c1cb2cd8Shaad 	mutex_destroy(&msp->ms_lock);
1575c1cb2cd8Shaad 
1576c1cb2cd8Shaad 	kmem_free(msp, sizeof (metaslab_t));
1577c1cb2cd8Shaad }
1578c1cb2cd8Shaad 
157993f3d2b8Schs #define	FRAGMENTATION_TABLE_SIZE	17
1580c1cb2cd8Shaad 
158193f3d2b8Schs /*
158293f3d2b8Schs  * This table defines a segment size based fragmentation metric that will
158393f3d2b8Schs  * allow each metaslab to derive its own fragmentation value. This is done
158493f3d2b8Schs  * by calculating the space in each bucket of the spacemap histogram and
158593f3d2b8Schs  * multiplying that by the fragmentation metric in this table. Doing
158693f3d2b8Schs  * this for all buckets and dividing it by the total amount of free
158793f3d2b8Schs  * space in this metaslab (i.e. the total free space in all buckets) gives
158893f3d2b8Schs  * us the fragmentation metric. This means that a high fragmentation metric
158993f3d2b8Schs  * equates to most of the free space being comprised of small segments.
159093f3d2b8Schs  * Conversely, if the metric is low, then most of the free space is in
159193f3d2b8Schs  * large segments. A 10% change in fragmentation equates to approximately
159293f3d2b8Schs  * double the number of segments.
159393f3d2b8Schs  *
159493f3d2b8Schs  * This table defines 0% fragmented space using 16MB segments. Testing has
159593f3d2b8Schs  * shown that segments that are greater than or equal to 16MB do not suffer
159693f3d2b8Schs  * from drastic performance problems. Using this value, we derive the rest
159793f3d2b8Schs  * of the table. Since the fragmentation value is never stored on disk, it
159893f3d2b8Schs  * is possible to change these calculations in the future.
159993f3d2b8Schs  */
160093f3d2b8Schs int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
160193f3d2b8Schs 	100,	/* 512B	*/
160293f3d2b8Schs 	100,	/* 1K	*/
160393f3d2b8Schs 	98,	/* 2K	*/
160493f3d2b8Schs 	95,	/* 4K	*/
160593f3d2b8Schs 	90,	/* 8K	*/
160693f3d2b8Schs 	80,	/* 16K	*/
160793f3d2b8Schs 	70,	/* 32K	*/
160893f3d2b8Schs 	60,	/* 64K	*/
160993f3d2b8Schs 	50,	/* 128K	*/
161093f3d2b8Schs 	40,	/* 256K	*/
161193f3d2b8Schs 	30,	/* 512K	*/
161293f3d2b8Schs 	20,	/* 1M	*/
161393f3d2b8Schs 	15,	/* 2M	*/
161493f3d2b8Schs 	10,	/* 4M	*/
161593f3d2b8Schs 	5,	/* 8M	*/
161693f3d2b8Schs 	0	/* 16M	*/
161793f3d2b8Schs };
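/*
 * Illustrative sketch only (not compiled): a worked example of the metric
 * described above, assuming sm_shift == 9 so that histogram bucket i holds
 * segments of roughly 2^(i + 9) bytes. With half of the free space in 512K
 * segments (bucket 10, factor 30) and half in 8M segments (bucket 14,
 * factor 5), the space-weighted average is (30 + 5) / 2 == 17 in integer
 * math, i.e. 17% fragmented. The helper name is hypothetical.
 */
#if 0
static uint64_t
zfs_frag_example(void)
{
	uint64_t half = 1ULL << 30;	/* 1G of free space per bucket */
	uint64_t total = 2 * half;

	return ((half * zfs_frag_table[10] +
	    half * zfs_frag_table[14]) / total);	/* 17 */
}
#endif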
161893f3d2b8Schs 
161993f3d2b8Schs /*
162093f3d2b8Schs  * Calculate the metaslab's fragmentation metric and store it in
162193f3d2b8Schs  * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
162293f3d2b8Schs  * has not been upgraded and does not support this metric. Otherwise,
162393f3d2b8Schs  * the value should be in the range [0, 100].
162493f3d2b8Schs  */
162593f3d2b8Schs static void
162693f3d2b8Schs metaslab_set_fragmentation(metaslab_t *msp)
162793f3d2b8Schs {
162893f3d2b8Schs 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
162993f3d2b8Schs 	uint64_t fragmentation = 0;
163093f3d2b8Schs 	uint64_t total = 0;
163193f3d2b8Schs 	boolean_t feature_enabled = spa_feature_is_enabled(spa,
163293f3d2b8Schs 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
163393f3d2b8Schs 
163493f3d2b8Schs 	if (!feature_enabled) {
163593f3d2b8Schs 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
163693f3d2b8Schs 		return;
163793f3d2b8Schs 	}
163893f3d2b8Schs 
163993f3d2b8Schs 	/*
164093f3d2b8Schs 	 * A null space map means that the entire metaslab is free
164193f3d2b8Schs 	 * and thus is not fragmented.
164293f3d2b8Schs 	 */
164393f3d2b8Schs 	if (msp->ms_sm == NULL) {
164493f3d2b8Schs 		msp->ms_fragmentation = 0;
164593f3d2b8Schs 		return;
164693f3d2b8Schs 	}
164793f3d2b8Schs 
164893f3d2b8Schs 	/*
164993f3d2b8Schs 	 * If this metaslab's space map has not been upgraded, flag it
165093f3d2b8Schs 	 * so that we upgrade next time we encounter it.
165193f3d2b8Schs 	 */
165293f3d2b8Schs 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
165393f3d2b8Schs 		uint64_t txg = spa_syncing_txg(spa);
165493f3d2b8Schs 		vdev_t *vd = msp->ms_group->mg_vd;
165593f3d2b8Schs 
165693f3d2b8Schs 		if (spa_writeable(spa)) {
165793f3d2b8Schs 			msp->ms_condense_wanted = B_TRUE;
165893f3d2b8Schs 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
165993f3d2b8Schs 			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
166093f3d2b8Schs 			    "msp %p, vd %p", txg, msp, vd);
166193f3d2b8Schs 		}
166293f3d2b8Schs 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
166393f3d2b8Schs 		return;
166493f3d2b8Schs 	}
166593f3d2b8Schs 
166693f3d2b8Schs 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
166793f3d2b8Schs 		uint64_t space = 0;
166893f3d2b8Schs 		uint8_t shift = msp->ms_sm->sm_shift;
166993f3d2b8Schs 
167093f3d2b8Schs 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
167193f3d2b8Schs 		    FRAGMENTATION_TABLE_SIZE - 1);
167293f3d2b8Schs 
167393f3d2b8Schs 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
167493f3d2b8Schs 			continue;
167593f3d2b8Schs 
167693f3d2b8Schs 		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
167793f3d2b8Schs 		total += space;
167893f3d2b8Schs 
167993f3d2b8Schs 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
168093f3d2b8Schs 		fragmentation += space * zfs_frag_table[idx];
168193f3d2b8Schs 	}
168293f3d2b8Schs 
168393f3d2b8Schs 	if (total > 0)
168493f3d2b8Schs 		fragmentation /= total;
168593f3d2b8Schs 	ASSERT3U(fragmentation, <=, 100);
168693f3d2b8Schs 
168793f3d2b8Schs 	msp->ms_fragmentation = fragmentation;
168893f3d2b8Schs }
168993f3d2b8Schs 
169093f3d2b8Schs /*
169193f3d2b8Schs  * Compute a weight -- a selection preference value -- for the given metaslab.
169293f3d2b8Schs  * This is based on the amount of free space, the level of fragmentation,
169393f3d2b8Schs  * the LBA range, and whether the metaslab is loaded.
169493f3d2b8Schs  */
1695c1cb2cd8Shaad static uint64_t
169693f3d2b8Schs metaslab_space_weight(metaslab_t *msp)
1697c1cb2cd8Shaad {
1698c1cb2cd8Shaad 	metaslab_group_t *mg = msp->ms_group;
1699c1cb2cd8Shaad 	vdev_t *vd = mg->mg_vd;
1700c1cb2cd8Shaad 	uint64_t weight, space;
1701c1cb2cd8Shaad 
1702c1cb2cd8Shaad 	ASSERT(MUTEX_HELD(&msp->ms_lock));
170393f3d2b8Schs 	ASSERT(!vd->vdev_removing);
1704c1cb2cd8Shaad 
1705c1cb2cd8Shaad 	/*
1706c1cb2cd8Shaad 	 * The baseline weight is the metaslab's free space.
1707c1cb2cd8Shaad 	 */
170893f3d2b8Schs 	space = msp->ms_size - space_map_allocated(msp->ms_sm);
170993f3d2b8Schs 
171093f3d2b8Schs 	if (metaslab_fragmentation_factor_enabled &&
171193f3d2b8Schs 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
171293f3d2b8Schs 		/*
171393f3d2b8Schs 		 * Use the fragmentation information to inversely scale
171493f3d2b8Schs 		 * down the baseline weight. We need to ensure that we
171593f3d2b8Schs 		 * don't exclude this metaslab completely when it's 100%
171693f3d2b8Schs 		 * fragmented. To avoid this we reduce the fragmented value
171793f3d2b8Schs 		 * by 1.
171893f3d2b8Schs 		 */
171993f3d2b8Schs 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
172093f3d2b8Schs 
172193f3d2b8Schs 		/*
172293f3d2b8Schs 		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
172393f3d2b8Schs 		 * this metaslab again. The fragmentation metric may have
172493f3d2b8Schs 		 * decreased the space to something smaller than
172593f3d2b8Schs 		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
172693f3d2b8Schs 		 * so that we can consume any remaining space.
172793f3d2b8Schs 		 */
172893f3d2b8Schs 		if (space > 0 && space < SPA_MINBLOCKSIZE)
172993f3d2b8Schs 			space = SPA_MINBLOCKSIZE;
173093f3d2b8Schs 	}
1731c1cb2cd8Shaad 	weight = space;
1732c1cb2cd8Shaad 
1733c1cb2cd8Shaad 	/*
1734c1cb2cd8Shaad 	 * Modern disks have uniform bit density and constant angular velocity.
1735c1cb2cd8Shaad 	 * Therefore, the outer recording zones are faster (higher bandwidth)
1736c1cb2cd8Shaad 	 * than the inner zones by the ratio of outer to inner track diameter,
1737c1cb2cd8Shaad 	 * which is typically around 2:1.  We account for this by assigning
1738c1cb2cd8Shaad 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1739c1cb2cd8Shaad 	 * In effect, this means that we'll select the metaslab with the most
1740c1cb2cd8Shaad 	 * free bandwidth rather than simply the one with the most free space.
1741c1cb2cd8Shaad 	 */
174293f3d2b8Schs 	if (metaslab_lba_weighting_enabled) {
174393f3d2b8Schs 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1744c1cb2cd8Shaad 		ASSERT(weight >= space && weight <= 2 * space);
174593f3d2b8Schs 	}
1746c1cb2cd8Shaad 
1747c1cb2cd8Shaad 	/*
1748f59c7639Shaad 	 * If this metaslab is one we're actively using, adjust its
1749f59c7639Shaad 	 * weight to make it preferable to any inactive metaslab so
175093f3d2b8Schs 	 * we'll polish it off. If the fragmentation on this metaslab
175193f3d2b8Schs 	 * has exceeded our threshold, then don't mark it active.
1752c1cb2cd8Shaad 	 */
175393f3d2b8Schs 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
175493f3d2b8Schs 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1755c1cb2cd8Shaad 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1756f59c7639Shaad 	}
175793f3d2b8Schs 
175893f3d2b8Schs 	WEIGHT_SET_SPACEBASED(weight);
175993f3d2b8Schs 	return (weight);
176093f3d2b8Schs }
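/*
 * Illustrative sketch only (not compiled): the arithmetic above with
 * hypothetical numbers. With 10G free and 50% fragmentation the baseline
 * is scaled to 10G * (100 - 49) / 100 = 5.1G, and a metaslab halfway down
 * the vdev then gets an LBA multiplier of 1.5x, for a final space-based
 * weight of roughly 7.65G. The helper omits the SPA_MINBLOCKSIZE clamp and
 * the active/type bits; its name is hypothetical.
 */
#if 0
static uint64_t
metaslab_space_weight_example(uint64_t space, uint64_t frag,
    uint64_t ms_id, uint64_t ms_count)
{
	uint64_t weight;

	weight = (space * (100 - (frag - 1))) / 100;		/* fragmentation */
	weight = 2 * weight - (ms_id * weight) / ms_count;	/* LBA */
	return (weight);
}
#endif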
176193f3d2b8Schs 
176293f3d2b8Schs /*
176393f3d2b8Schs  * Return the weight of the specified metaslab, according to the segment-based
176493f3d2b8Schs  * weighting algorithm. The metaslab must be loaded. This function can
176593f3d2b8Schs  * be called within a sync pass since it relies only on the metaslab's
176693f3d2b8Schs  * range tree which is always accurate when the metaslab is loaded.
176793f3d2b8Schs  */
176893f3d2b8Schs static uint64_t
176993f3d2b8Schs metaslab_weight_from_range_tree(metaslab_t *msp)
177093f3d2b8Schs {
177193f3d2b8Schs 	uint64_t weight = 0;
177293f3d2b8Schs 	uint32_t segments = 0;
177393f3d2b8Schs 
177493f3d2b8Schs 	ASSERT(msp->ms_loaded);
177593f3d2b8Schs 
177693f3d2b8Schs 	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
177793f3d2b8Schs 	    i--) {
177893f3d2b8Schs 		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
177993f3d2b8Schs 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
178093f3d2b8Schs 
178193f3d2b8Schs 		segments <<= 1;
178293f3d2b8Schs 		segments += msp->ms_tree->rt_histogram[i];
178393f3d2b8Schs 
178493f3d2b8Schs 		/*
178593f3d2b8Schs 		 * The range tree provides more precision than the space map
178693f3d2b8Schs 		 * and must be downgraded so that all values fit within the
178793f3d2b8Schs 		 * space map's histogram. This allows us to compare loaded
178893f3d2b8Schs 		 * vs. unloaded metaslabs to determine which metaslab is
178993f3d2b8Schs 		 * considered "best".
179093f3d2b8Schs 		 */
179193f3d2b8Schs 		if (i > max_idx)
179293f3d2b8Schs 			continue;
179393f3d2b8Schs 
179493f3d2b8Schs 		if (segments != 0) {
179593f3d2b8Schs 			WEIGHT_SET_COUNT(weight, segments);
179693f3d2b8Schs 			WEIGHT_SET_INDEX(weight, i);
179793f3d2b8Schs 			WEIGHT_SET_ACTIVE(weight, 0);
179893f3d2b8Schs 			break;
179993f3d2b8Schs 		}
180093f3d2b8Schs 	}
1801c1cb2cd8Shaad 	return (weight);
1802c1cb2cd8Shaad }
1803c1cb2cd8Shaad 
180493f3d2b8Schs /*
180593f3d2b8Schs  * Calculate the weight based on the on-disk histogram. This should only
180693f3d2b8Schs  * be called after a sync pass has completely finished since the on-disk
180793f3d2b8Schs  * information is updated in metaslab_sync().
180893f3d2b8Schs  */
180993f3d2b8Schs static uint64_t
181093f3d2b8Schs metaslab_weight_from_spacemap(metaslab_t *msp)
1811c1cb2cd8Shaad {
181293f3d2b8Schs 	uint64_t weight = 0;
1813f59c7639Shaad 
181493f3d2b8Schs 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
181593f3d2b8Schs 		if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
181693f3d2b8Schs 			WEIGHT_SET_COUNT(weight,
181793f3d2b8Schs 			    msp->ms_sm->sm_phys->smp_histogram[i]);
181893f3d2b8Schs 			WEIGHT_SET_INDEX(weight, i +
181993f3d2b8Schs 			    msp->ms_sm->sm_shift);
182093f3d2b8Schs 			WEIGHT_SET_ACTIVE(weight, 0);
182193f3d2b8Schs 			break;
182293f3d2b8Schs 		}
182393f3d2b8Schs 	}
182493f3d2b8Schs 	return (weight);
182593f3d2b8Schs }
1826f59c7639Shaad 
1827f59c7639Shaad /*
182893f3d2b8Schs  * Compute a segment-based weight for the specified metaslab. The weight
182993f3d2b8Schs  * is determined by highest bucket in the histogram. The information
183093f3d2b8Schs  * for the highest bucket is encoded into the weight value.
1831f59c7639Shaad  */
183293f3d2b8Schs static uint64_t
183393f3d2b8Schs metaslab_segment_weight(metaslab_t *msp)
1834f59c7639Shaad {
1835f59c7639Shaad 	metaslab_group_t *mg = msp->ms_group;
183693f3d2b8Schs 	uint64_t weight = 0;
183793f3d2b8Schs 	uint8_t shift = mg->mg_vd->vdev_ashift;
1838c1cb2cd8Shaad 
1839c1cb2cd8Shaad 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1840c1cb2cd8Shaad 
184193f3d2b8Schs 	/*
184293f3d2b8Schs 	 * The metaslab is completely free.
184393f3d2b8Schs 	 */
184493f3d2b8Schs 	if (space_map_allocated(msp->ms_sm) == 0) {
184593f3d2b8Schs 		int idx = highbit64(msp->ms_size) - 1;
184693f3d2b8Schs 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
184793f3d2b8Schs 
184893f3d2b8Schs 		if (idx < max_idx) {
184993f3d2b8Schs 			WEIGHT_SET_COUNT(weight, 1ULL);
185093f3d2b8Schs 			WEIGHT_SET_INDEX(weight, idx);
185193f3d2b8Schs 		} else {
185293f3d2b8Schs 			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
185393f3d2b8Schs 			WEIGHT_SET_INDEX(weight, max_idx);
185493f3d2b8Schs 		}
185593f3d2b8Schs 		WEIGHT_SET_ACTIVE(weight, 0);
185693f3d2b8Schs 		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
185793f3d2b8Schs 
185893f3d2b8Schs 		return (weight);
185993f3d2b8Schs 	}
186093f3d2b8Schs 
186193f3d2b8Schs 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
186293f3d2b8Schs 
186393f3d2b8Schs 	/*
186493f3d2b8Schs 	 * If the metaslab is fully allocated then just make the weight 0.
186593f3d2b8Schs 	 */
186693f3d2b8Schs 	if (space_map_allocated(msp->ms_sm) == msp->ms_size)
186793f3d2b8Schs 		return (0);
186893f3d2b8Schs 	/*
186993f3d2b8Schs 	 * If the metaslab is already loaded, then use the range tree to
187093f3d2b8Schs 	 * determine the weight. Otherwise, we rely on the space map information
187193f3d2b8Schs 	 * to generate the weight.
187293f3d2b8Schs 	 */
187393f3d2b8Schs 	if (msp->ms_loaded) {
187493f3d2b8Schs 		weight = metaslab_weight_from_range_tree(msp);
187593f3d2b8Schs 	} else {
187693f3d2b8Schs 		weight = metaslab_weight_from_spacemap(msp);
187793f3d2b8Schs 	}
187893f3d2b8Schs 
187993f3d2b8Schs 	/*
188093f3d2b8Schs 	 * If the metaslab was active the last time we calculated its weight
188193f3d2b8Schs 	 * then keep it active. We want to consume the entire region that
188293f3d2b8Schs 	 * is associated with this weight.
188393f3d2b8Schs 	 */
188493f3d2b8Schs 	if (msp->ms_activation_weight != 0 && weight != 0)
188593f3d2b8Schs 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
188693f3d2b8Schs 	return (weight);
188793f3d2b8Schs }
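/*
 * Illustrative sketch only (not compiled): the completely-free case above,
 * assuming a 32-bucket space map histogram and an ashift of 9. An 8G
 * metaslab has idx = highbit64(8G) - 1 = 33, which is below
 * max_idx = 32 + 9 - 1 = 40, so the weight encodes a single segment at
 * index 33 (one 8G region). The helper name is hypothetical.
 */
#if 0
static int
metaslab_free_weight_index(uint64_t ms_size, uint8_t shift)
{
	int idx = highbit64(ms_size) - 1;
	int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

	return (MIN(idx, max_idx));
}
#endif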
188893f3d2b8Schs 
188993f3d2b8Schs /*
189093f3d2b8Schs  * Determine if we should attempt to allocate from this metaslab. If the
189193f3d2b8Schs  * metaslab has a maximum size then we can quickly determine if the desired
189293f3d2b8Schs  * allocation size can be satisfied. Otherwise, if we're using segment-based
189393f3d2b8Schs  * weighting then we can determine the maximum allocation that this metaslab
189493f3d2b8Schs  * can accommodate based on the index encoded in the weight. If we're using
189593f3d2b8Schs  * space-based weights then rely on the entire weight (excluding the weight
189693f3d2b8Schs  * type bit).
189793f3d2b8Schs  */
189893f3d2b8Schs boolean_t
189993f3d2b8Schs metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
190093f3d2b8Schs {
190193f3d2b8Schs 	boolean_t should_allocate;
190293f3d2b8Schs 
190393f3d2b8Schs 	if (msp->ms_max_size != 0)
190493f3d2b8Schs 		return (msp->ms_max_size >= asize);
190593f3d2b8Schs 
190693f3d2b8Schs 	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
190793f3d2b8Schs 		/*
190893f3d2b8Schs 		 * The metaslab segment weight indicates segments in the
190993f3d2b8Schs 		 * range [2^i, 2^(i+1)), where i is the index in the weight.
191093f3d2b8Schs 		 * Since the asize might be in the middle of the range, we
191193f3d2b8Schs 		 * should attempt the allocation if asize < 2^(i+1).
191293f3d2b8Schs 		 */
191393f3d2b8Schs 		should_allocate = (asize <
191493f3d2b8Schs 		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
191593f3d2b8Schs 	} else {
191693f3d2b8Schs 		should_allocate = (asize <=
191793f3d2b8Schs 		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
191893f3d2b8Schs 	}
191993f3d2b8Schs 	return (should_allocate);
192093f3d2b8Schs }
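/*
 * Illustrative sketch only (not compiled): the segment-based branch above.
 * A weight whose index is 17 advertises free segments in [128K, 256K), so
 * a 192K request is attempted while a 256K request is not. The helper name
 * is hypothetical.
 */
#if 0
static boolean_t
metaslab_segment_index_allows(int idx, uint64_t asize)
{
	return (asize < (1ULL << (idx + 1)));
}
#endif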
192193f3d2b8Schs 
192293f3d2b8Schs static uint64_t
192393f3d2b8Schs metaslab_weight(metaslab_t *msp)
192493f3d2b8Schs {
192593f3d2b8Schs 	vdev_t *vd = msp->ms_group->mg_vd;
192693f3d2b8Schs 	spa_t *spa = vd->vdev_spa;
192793f3d2b8Schs 	uint64_t weight;
192893f3d2b8Schs 
192993f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
193093f3d2b8Schs 
193193f3d2b8Schs 	/*
193293f3d2b8Schs 	 * This vdev is in the process of being removed so there is nothing
193393f3d2b8Schs 	 * for us to do here.
193493f3d2b8Schs 	 */
193593f3d2b8Schs 	if (vd->vdev_removing) {
193693f3d2b8Schs 		ASSERT0(space_map_allocated(msp->ms_sm));
193793f3d2b8Schs 		ASSERT0(vd->vdev_ms_shift);
193893f3d2b8Schs 		return (0);
193993f3d2b8Schs 	}
194093f3d2b8Schs 
194193f3d2b8Schs 	metaslab_set_fragmentation(msp);
194293f3d2b8Schs 
194393f3d2b8Schs 	/*
194493f3d2b8Schs 	 * Update the maximum size if the metaslab is loaded. This will
194593f3d2b8Schs 	 * ensure that we get an accurate maximum size if newly freed space
194693f3d2b8Schs 	 * has been added back into the free tree.
194793f3d2b8Schs 	 */
194893f3d2b8Schs 	if (msp->ms_loaded)
194993f3d2b8Schs 		msp->ms_max_size = metaslab_block_maxsize(msp);
195093f3d2b8Schs 
195193f3d2b8Schs 	/*
195293f3d2b8Schs 	 * Segment-based weighting requires space map histogram support.
195393f3d2b8Schs 	 */
195493f3d2b8Schs 	if (zfs_metaslab_segment_weight_enabled &&
195593f3d2b8Schs 	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
195693f3d2b8Schs 	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
195793f3d2b8Schs 	    sizeof (space_map_phys_t))) {
195893f3d2b8Schs 		weight = metaslab_segment_weight(msp);
195993f3d2b8Schs 	} else {
196093f3d2b8Schs 		weight = metaslab_space_weight(msp);
196193f3d2b8Schs 	}
196293f3d2b8Schs 	return (weight);
196393f3d2b8Schs }
196493f3d2b8Schs 
196593f3d2b8Schs static int
196693f3d2b8Schs metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
196793f3d2b8Schs {
196893f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
196993f3d2b8Schs 
1970c1cb2cd8Shaad 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
197193f3d2b8Schs 		metaslab_load_wait(msp);
197293f3d2b8Schs 		if (!msp->ms_loaded) {
197393f3d2b8Schs 			int error = metaslab_load(msp);
1974c1cb2cd8Shaad 			if (error) {
1975c1cb2cd8Shaad 				metaslab_group_sort(msp->ms_group, msp, 0);
1976c1cb2cd8Shaad 				return (error);
1977c1cb2cd8Shaad 			}
1978f59c7639Shaad 		}
1979f59c7639Shaad 
198093f3d2b8Schs 		msp->ms_activation_weight = msp->ms_weight;
1981c1cb2cd8Shaad 		metaslab_group_sort(msp->ms_group, msp,
1982c1cb2cd8Shaad 		    msp->ms_weight | activation_weight);
1983c1cb2cd8Shaad 	}
198493f3d2b8Schs 	ASSERT(msp->ms_loaded);
1985c1cb2cd8Shaad 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1986c1cb2cd8Shaad 
1987c1cb2cd8Shaad 	return (0);
1988c1cb2cd8Shaad }
1989c1cb2cd8Shaad 
1990c1cb2cd8Shaad static void
199193f3d2b8Schs metaslab_passivate(metaslab_t *msp, uint64_t weight)
1992c1cb2cd8Shaad {
199393f3d2b8Schs 	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
199493f3d2b8Schs 
1995c1cb2cd8Shaad 	/*
1996c1cb2cd8Shaad 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1997c1cb2cd8Shaad 	 * this metaslab again.  In that case, it had better be empty,
1998c1cb2cd8Shaad 	 * or we would be leaving space on the table.
1999c1cb2cd8Shaad 	 */
200093f3d2b8Schs 	ASSERT(size >= SPA_MINBLOCKSIZE ||
200193f3d2b8Schs 	    range_tree_space(msp->ms_tree) == 0);
200293f3d2b8Schs 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
200393f3d2b8Schs 
200493f3d2b8Schs 	msp->ms_activation_weight = 0;
200593f3d2b8Schs 	metaslab_group_sort(msp->ms_group, msp, weight);
2006c1cb2cd8Shaad 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2007c1cb2cd8Shaad }
2008c1cb2cd8Shaad 
2009c1cb2cd8Shaad /*
201093f3d2b8Schs  * Segment-based metaslabs are activated once and remain active until
201193f3d2b8Schs  * we either fail an allocation attempt (similar to space-based metaslabs)
201293f3d2b8Schs  * or have exhausted the free space in zfs_metaslab_switch_threshold
201393f3d2b8Schs  * buckets since the metaslab was activated. This function checks to see
201493f3d2b8Schs  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
201593f3d2b8Schs  * metaslab and passivates it proactively. This will allow us to select
201693f3d2b8Schs  * metaslabs with larger contiguous regions, if any remain within this
201793f3d2b8Schs  * metaslab group. If we're in sync pass > 1, then we continue using this
201893f3d2b8Schs  * metaslab so that we don't dirty more blocks and cause more sync passes.
201993f3d2b8Schs  */
202093f3d2b8Schs void
202193f3d2b8Schs metaslab_segment_may_passivate(metaslab_t *msp)
202293f3d2b8Schs {
202393f3d2b8Schs 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
202493f3d2b8Schs 
202593f3d2b8Schs 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
202693f3d2b8Schs 		return;
202793f3d2b8Schs 
202893f3d2b8Schs 	/*
202993f3d2b8Schs 	 * Since we are in the middle of a sync pass, the most accurate
203093f3d2b8Schs 	 * information that is accessible to us is the in-core range tree
203193f3d2b8Schs 	 * histogram; calculate the new weight based on that information.
203293f3d2b8Schs 	 */
203393f3d2b8Schs 	uint64_t weight = metaslab_weight_from_range_tree(msp);
203493f3d2b8Schs 	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
203593f3d2b8Schs 	int current_idx = WEIGHT_GET_INDEX(weight);
203693f3d2b8Schs 
203793f3d2b8Schs 	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
203893f3d2b8Schs 		metaslab_passivate(msp, weight);
203993f3d2b8Schs }
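/*
 * Illustrative sketch only (not compiled): the check above with
 * hypothetical numbers. A metaslab activated at segment index 20 (1M-2M
 * free segments) and a switch threshold of 2 is proactively passivated
 * once its current index falls to 18 or below. The helper name is
 * hypothetical.
 */
#if 0
static boolean_t
metaslab_switch_example(int activation_idx, int current_idx, int threshold)
{
	return (current_idx <= activation_idx - threshold);
}
#endif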
204093f3d2b8Schs 
204193f3d2b8Schs static void
204293f3d2b8Schs metaslab_preload(void *arg)
204393f3d2b8Schs {
204493f3d2b8Schs 	metaslab_t *msp = arg;
204593f3d2b8Schs 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
204693f3d2b8Schs 
204793f3d2b8Schs 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
204893f3d2b8Schs 
204993f3d2b8Schs 	mutex_enter(&msp->ms_lock);
205093f3d2b8Schs 	metaslab_load_wait(msp);
205193f3d2b8Schs 	if (!msp->ms_loaded)
205293f3d2b8Schs 		(void) metaslab_load(msp);
205393f3d2b8Schs 	msp->ms_selected_txg = spa_syncing_txg(spa);
205493f3d2b8Schs 	mutex_exit(&msp->ms_lock);
205593f3d2b8Schs }
205693f3d2b8Schs 
205793f3d2b8Schs static void
205893f3d2b8Schs metaslab_group_preload(metaslab_group_t *mg)
205993f3d2b8Schs {
206093f3d2b8Schs 	spa_t *spa = mg->mg_vd->vdev_spa;
206193f3d2b8Schs 	metaslab_t *msp;
206293f3d2b8Schs 	avl_tree_t *t = &mg->mg_metaslab_tree;
206393f3d2b8Schs 	int m = 0;
206493f3d2b8Schs 
206593f3d2b8Schs 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
206693f3d2b8Schs 		taskq_wait(mg->mg_taskq);
206793f3d2b8Schs 		return;
206893f3d2b8Schs 	}
206993f3d2b8Schs 
207093f3d2b8Schs 	mutex_enter(&mg->mg_lock);
207193f3d2b8Schs 	/*
207293f3d2b8Schs 	 * Load the next potential metaslabs
207393f3d2b8Schs 	 */
207493f3d2b8Schs 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
207593f3d2b8Schs 		/*
207693f3d2b8Schs 		 * We preload only the maximum number of metaslabs specified
207793f3d2b8Schs 		 * by metaslab_preload_limit. If a metaslab is being forced
207893f3d2b8Schs 		 * to condense then we preload it too. This will ensure
207993f3d2b8Schs 		 * that force condensing happens in the next txg.
208093f3d2b8Schs 		 */
208193f3d2b8Schs 		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
208293f3d2b8Schs 			continue;
208393f3d2b8Schs 		}
208493f3d2b8Schs 
208593f3d2b8Schs 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
208693f3d2b8Schs 		    msp, TQ_SLEEP) != 0);
208793f3d2b8Schs 	}
208893f3d2b8Schs 	mutex_exit(&mg->mg_lock);
208993f3d2b8Schs }
209093f3d2b8Schs 
209193f3d2b8Schs /*
209293f3d2b8Schs  * Determine if the space map's on-disk footprint is past our tolerance
209393f3d2b8Schs  * for inefficiency. We would like to use the following criteria to make
209493f3d2b8Schs  * our decision:
209593f3d2b8Schs  *
209693f3d2b8Schs  * 1. The size of the space map object should not dramatically increase as a
209793f3d2b8Schs  * result of writing out the free space range tree.
209893f3d2b8Schs  *
209993f3d2b8Schs  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
210093f3d2b8Schs  * times the size of the free space range tree representation
210193f3d2b8Schs  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
210293f3d2b8Schs  *
210393f3d2b8Schs  * 3. The on-disk size of the space map should actually decrease.
210493f3d2b8Schs  *
210593f3d2b8Schs  * Checking the first condition is tricky since we don't want to walk
210693f3d2b8Schs  * the entire AVL tree calculating the estimated on-disk size. Instead we
210793f3d2b8Schs  * use the size-ordered range tree in the metaslab and calculate the
210893f3d2b8Schs  * size required to write out the largest segment in our free tree. If the
210993f3d2b8Schs  * size required to represent that segment on disk is larger than the space
211093f3d2b8Schs  * map object then we avoid condensing this map.
211193f3d2b8Schs  *
211293f3d2b8Schs  * To determine the second criterion we use a best-case estimate and assume
211393f3d2b8Schs  * each segment can be represented on-disk as a single 64-bit entry. We refer
211493f3d2b8Schs  * to this best-case estimate as the space map's minimal form.
211593f3d2b8Schs  *
211693f3d2b8Schs  * Unfortunately, we cannot compute the on-disk size of the space map in this
211793f3d2b8Schs  * context because we cannot accurately compute the effects of compression, etc.
211893f3d2b8Schs  * Instead, we apply the heuristic described in the block comment for
211993f3d2b8Schs  * zfs_metaslab_condense_block_threshold - we only condense if the space used
212093f3d2b8Schs  * is greater than a threshold number of blocks.
212193f3d2b8Schs  */
212293f3d2b8Schs static boolean_t
212393f3d2b8Schs metaslab_should_condense(metaslab_t *msp)
212493f3d2b8Schs {
212593f3d2b8Schs 	space_map_t *sm = msp->ms_sm;
212693f3d2b8Schs 	range_seg_t *rs;
212793f3d2b8Schs 	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
212893f3d2b8Schs 	dmu_object_info_t doi;
212993f3d2b8Schs 	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
213093f3d2b8Schs 
213193f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
213293f3d2b8Schs 	ASSERT(msp->ms_loaded);
213393f3d2b8Schs 
213493f3d2b8Schs 	/*
213593f3d2b8Schs 	 * Use the ms_size_tree range tree, which is ordered by size, to
213693f3d2b8Schs 	 * obtain the largest segment in the free tree. We always condense
213793f3d2b8Schs 	 * metaslabs that are empty and metaslabs for which a condense
213893f3d2b8Schs 	 * request has been made.
213993f3d2b8Schs 	 */
214093f3d2b8Schs 	rs = avl_last(&msp->ms_size_tree);
214193f3d2b8Schs 	if (rs == NULL || msp->ms_condense_wanted)
214293f3d2b8Schs 		return (B_TRUE);
214393f3d2b8Schs 
214493f3d2b8Schs 	/*
214593f3d2b8Schs 	 * Calculate the number of 64-bit entries this segment would
214693f3d2b8Schs 	 * require when written to disk. If this single segment would be
214793f3d2b8Schs 	 * larger on-disk than the entire current on-disk structure, then
214893f3d2b8Schs 	 * clearly condensing will increase the on-disk structure size.
214993f3d2b8Schs 	 */
215093f3d2b8Schs 	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
215193f3d2b8Schs 	entries = size / (MIN(size, SM_RUN_MAX));
215293f3d2b8Schs 	segsz = entries * sizeof (uint64_t);
215393f3d2b8Schs 
215493f3d2b8Schs 	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
215593f3d2b8Schs 	object_size = space_map_length(msp->ms_sm);
215693f3d2b8Schs 
215793f3d2b8Schs 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
215893f3d2b8Schs 	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
215993f3d2b8Schs 
216093f3d2b8Schs 	return (segsz <= object_size &&
216193f3d2b8Schs 	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
216293f3d2b8Schs 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
216393f3d2b8Schs }
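/*
 * Illustrative sketch only (not compiled): hypothetical numbers for the
 * test above. With 1,000 segments in the in-core tree the minimal on-disk
 * form is 8,000 bytes, so with e.g. zfs_condense_pct == 200 the current
 * space map must already be at least 16,000 bytes, larger than the largest
 * segment's on-disk cost, and larger than the block threshold before
 * condensing is considered worthwhile. The helper name and its
 * threshold_bytes parameter are hypothetical.
 */
#if 0
static boolean_t
metaslab_condense_worthwhile(uint64_t segsz, uint64_t object_size,
    uint64_t optimal_size, uint64_t threshold_bytes)
{
	return (segsz <= object_size &&
	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
	    object_size > threshold_bytes);
}
#endif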
216493f3d2b8Schs 
216593f3d2b8Schs /*
216693f3d2b8Schs  * Condense the on-disk space map representation to its minimized form.
216793f3d2b8Schs  * The minimized form consists of a small number of allocations followed by
216893f3d2b8Schs  * the entries of the free range tree.
216993f3d2b8Schs  */
217093f3d2b8Schs static void
217193f3d2b8Schs metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
217293f3d2b8Schs {
217393f3d2b8Schs 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
217493f3d2b8Schs 	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
217593f3d2b8Schs 	range_tree_t *condense_tree;
217693f3d2b8Schs 	space_map_t *sm = msp->ms_sm;
217793f3d2b8Schs 
217893f3d2b8Schs 	ASSERT(MUTEX_HELD(&msp->ms_lock));
217993f3d2b8Schs 	ASSERT3U(spa_sync_pass(spa), ==, 1);
218093f3d2b8Schs 	ASSERT(msp->ms_loaded);
218193f3d2b8Schs 
218293f3d2b8Schs 
218393f3d2b8Schs 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
218493f3d2b8Schs 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
218593f3d2b8Schs 	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
218693f3d2b8Schs 	    msp->ms_group->mg_vd->vdev_spa->spa_name,
218793f3d2b8Schs 	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
218893f3d2b8Schs 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
218993f3d2b8Schs 
219093f3d2b8Schs 	msp->ms_condense_wanted = B_FALSE;
219193f3d2b8Schs 
219293f3d2b8Schs 	/*
219393f3d2b8Schs 	 * Create a range tree that is 100% allocated. We remove segments
219493f3d2b8Schs 	 * that have been freed in this txg, any deferred frees that exist,
219593f3d2b8Schs 	 * and any allocation in the future. Removing segments should be
219693f3d2b8Schs 	 * a relatively inexpensive operation since we expect these trees to
219793f3d2b8Schs 	 * have a small number of nodes.
219893f3d2b8Schs 	 */
219993f3d2b8Schs 	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
220093f3d2b8Schs 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
220193f3d2b8Schs 
220293f3d2b8Schs 	/*
220393f3d2b8Schs 	 * Remove what's been freed in this txg from the condense_tree.
220493f3d2b8Schs 	 * Since we're in sync_pass 1, we know that all the frees from
220593f3d2b8Schs 	 * this txg are in the freetree.
220693f3d2b8Schs 	 */
220793f3d2b8Schs 	range_tree_walk(freetree, range_tree_remove, condense_tree);
220893f3d2b8Schs 
220993f3d2b8Schs 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
221093f3d2b8Schs 		range_tree_walk(msp->ms_defertree[t],
221193f3d2b8Schs 		    range_tree_remove, condense_tree);
221293f3d2b8Schs 	}
221393f3d2b8Schs 
221493f3d2b8Schs 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
221593f3d2b8Schs 		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
221693f3d2b8Schs 		    range_tree_remove, condense_tree);
221793f3d2b8Schs 	}
221893f3d2b8Schs 
221993f3d2b8Schs 	/*
222093f3d2b8Schs 	 * We're about to drop the metaslab's lock thus allowing
222193f3d2b8Schs 	 * other consumers to change its content. Set the
222293f3d2b8Schs 	 * metaslab's ms_condensing flag to ensure that
222393f3d2b8Schs 	 * allocations on this metaslab do not occur while we're
222493f3d2b8Schs 	 * in the middle of committing it to disk. This is only critical
222593f3d2b8Schs 	 * for the ms_tree as all other range trees use per txg
222693f3d2b8Schs 	 * views of their content.
222793f3d2b8Schs 	 */
222893f3d2b8Schs 	msp->ms_condensing = B_TRUE;
222993f3d2b8Schs 
223093f3d2b8Schs 	mutex_exit(&msp->ms_lock);
223193f3d2b8Schs 	space_map_truncate(sm, tx);
223293f3d2b8Schs 	mutex_enter(&msp->ms_lock);
223393f3d2b8Schs 
223493f3d2b8Schs 	/*
223593f3d2b8Schs 	 * While we would ideally like to create a space map representation
223693f3d2b8Schs 	 * that consists only of allocation records, doing so can be
223793f3d2b8Schs 	 * prohibitively expensive because the in-core free tree can be
223893f3d2b8Schs 	 * large, and therefore computationally expensive to subtract
223993f3d2b8Schs 	 * from the condense_tree. Instead we sync out two trees, a cheap
224093f3d2b8Schs 	 * allocation only tree followed by the in-core free tree. While not
224193f3d2b8Schs 	 * optimal, this is typically close to optimal, and much cheaper to
224293f3d2b8Schs 	 * compute.
224393f3d2b8Schs 	 */
224493f3d2b8Schs 	space_map_write(sm, condense_tree, SM_ALLOC, tx);
224593f3d2b8Schs 	range_tree_vacate(condense_tree, NULL, NULL);
224693f3d2b8Schs 	range_tree_destroy(condense_tree);
224793f3d2b8Schs 
224893f3d2b8Schs 	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
224993f3d2b8Schs 	msp->ms_condensing = B_FALSE;
225093f3d2b8Schs }
225193f3d2b8Schs 
225293f3d2b8Schs /*
2253c1cb2cd8Shaad  * Write a metaslab to disk in the context of the specified transaction group.
2254c1cb2cd8Shaad  */
2255c1cb2cd8Shaad void
2256c1cb2cd8Shaad metaslab_sync(metaslab_t *msp, uint64_t txg)
2257c1cb2cd8Shaad {
225893f3d2b8Schs 	metaslab_group_t *mg = msp->ms_group;
225993f3d2b8Schs 	vdev_t *vd = mg->mg_vd;
2260c1cb2cd8Shaad 	spa_t *spa = vd->vdev_spa;
2261f59c7639Shaad 	objset_t *mos = spa_meta_objset(spa);
226293f3d2b8Schs 	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
226393f3d2b8Schs 	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
226493f3d2b8Schs 	range_tree_t **freed_tree =
226593f3d2b8Schs 	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2266c1cb2cd8Shaad 	dmu_tx_t *tx;
226793f3d2b8Schs 	uint64_t object = space_map_object(msp->ms_sm);
2268c1cb2cd8Shaad 
2269f59c7639Shaad 	ASSERT(!vd->vdev_ishole);
2270f59c7639Shaad 
227193f3d2b8Schs 	/*
227293f3d2b8Schs 	 * This metaslab has just been added so there's no work to do now.
227393f3d2b8Schs 	 */
227493f3d2b8Schs 	if (*freetree == NULL) {
227593f3d2b8Schs 		ASSERT3P(alloctree, ==, NULL);
227693f3d2b8Schs 		return;
227793f3d2b8Schs 	}
227893f3d2b8Schs 
227993f3d2b8Schs 	ASSERT3P(alloctree, !=, NULL);
228093f3d2b8Schs 	ASSERT3P(*freetree, !=, NULL);
228193f3d2b8Schs 	ASSERT3P(*freed_tree, !=, NULL);
228293f3d2b8Schs 
228393f3d2b8Schs 	/*
228493f3d2b8Schs 	 * Normally, we don't want to process a metaslab if there
228593f3d2b8Schs 	 * are no allocations or frees to perform. However, if the metaslab
228693f3d2b8Schs 	 * is being forced to condense we need to let it through.
228793f3d2b8Schs 	 */
228893f3d2b8Schs 	if (range_tree_space(alloctree) == 0 &&
228993f3d2b8Schs 	    range_tree_space(*freetree) == 0 &&
229093f3d2b8Schs 	    !msp->ms_condense_wanted)
2291f59c7639Shaad 		return;
2292c1cb2cd8Shaad 
2293c1cb2cd8Shaad 	/*
2294c1cb2cd8Shaad 	 * The only state that can actually be changing concurrently with
229593f3d2b8Schs 	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
229693f3d2b8Schs 	 * be modifying this txg's alloctree, freetree, freed_tree, or
229793f3d2b8Schs 	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
229893f3d2b8Schs 	 * space map ASSERTs. We drop it whenever we call into the DMU,
229993f3d2b8Schs 	 * because the DMU can call down to us (e.g. via zio_free()) at
230093f3d2b8Schs 	 * any time.
2301c1cb2cd8Shaad 	 */
2302f59c7639Shaad 
2303f59c7639Shaad 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2304c1cb2cd8Shaad 
230593f3d2b8Schs 	if (msp->ms_sm == NULL) {
230693f3d2b8Schs 		uint64_t new_object;
230793f3d2b8Schs 
230893f3d2b8Schs 		new_object = space_map_alloc(mos, tx);
230993f3d2b8Schs 		VERIFY3U(new_object, !=, 0);
231093f3d2b8Schs 
231193f3d2b8Schs 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
231293f3d2b8Schs 		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
231393f3d2b8Schs 		    &msp->ms_lock));
231493f3d2b8Schs 		ASSERT(msp->ms_sm != NULL);
2315c1cb2cd8Shaad 	}
2316c1cb2cd8Shaad 
2317f59c7639Shaad 	mutex_enter(&msp->ms_lock);
2318f59c7639Shaad 
2319c1cb2cd8Shaad 	/*
232093f3d2b8Schs 	 * Note: metaslab_condense() clears the space map's histogram.
232193f3d2b8Schs 	 * Therefore we must verify and remove this histogram before
232293f3d2b8Schs 	 * condensing.
2323c1cb2cd8Shaad 	 */
232493f3d2b8Schs 	metaslab_group_histogram_verify(mg);
232593f3d2b8Schs 	metaslab_class_histogram_verify(mg->mg_class);
232693f3d2b8Schs 	metaslab_group_histogram_remove(mg, msp);
2327c1cb2cd8Shaad 
232893f3d2b8Schs 	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
232993f3d2b8Schs 	    metaslab_should_condense(msp)) {
233093f3d2b8Schs 		metaslab_condense(msp, txg, tx);
233193f3d2b8Schs 	} else {
233293f3d2b8Schs 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
233393f3d2b8Schs 		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
2334c1cb2cd8Shaad 	}
2335c1cb2cd8Shaad 
233693f3d2b8Schs 	if (msp->ms_loaded) {
233793f3d2b8Schs 		/*
233893f3d2b8Schs 		 * When the space map is loaded, we have an accurate
233993f3d2b8Schs 		 * histogram in the range tree. This gives us an opportunity
234093f3d2b8Schs 		 * to bring the space map's histogram up-to-date so we clear
234193f3d2b8Schs 		 * it first before updating it.
234293f3d2b8Schs 		 */
234393f3d2b8Schs 		space_map_histogram_clear(msp->ms_sm);
234493f3d2b8Schs 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
234593f3d2b8Schs 
234693f3d2b8Schs 		/*
234793f3d2b8Schs 		 * Since we've cleared the histogram we need to add back
234893f3d2b8Schs 		 * any free space that has already been processed, plus
234993f3d2b8Schs 		 * any deferred space. This allows the on-disk histogram
235093f3d2b8Schs 		 * to accurately reflect all free space even if some space
235193f3d2b8Schs 		 * is not yet available for allocation (i.e. deferred).
235293f3d2b8Schs 		 */
235393f3d2b8Schs 		space_map_histogram_add(msp->ms_sm, *freed_tree, tx);
235493f3d2b8Schs 
235593f3d2b8Schs 		/*
235693f3d2b8Schs 		 * Add back any deferred free space that has not been
235793f3d2b8Schs 		 * added back into the in-core free tree yet. This will
235893f3d2b8Schs 		 * ensure that we don't end up with a space map histogram
235993f3d2b8Schs 		 * that is completely empty unless the metaslab is fully
236093f3d2b8Schs 		 * allocated.
236193f3d2b8Schs 		 */
236293f3d2b8Schs 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
236393f3d2b8Schs 			space_map_histogram_add(msp->ms_sm,
236493f3d2b8Schs 			    msp->ms_defertree[t], tx);
236593f3d2b8Schs 		}
236693f3d2b8Schs 	}
236793f3d2b8Schs 
236893f3d2b8Schs 	/*
236993f3d2b8Schs 	 * Always add the free space from this sync pass to the space
237093f3d2b8Schs 	 * map histogram. We want to make sure that the on-disk histogram
237193f3d2b8Schs 	 * accounts for all free space. If the space map is not loaded,
237293f3d2b8Schs 	 * then we will lose some accuracy but will correct it the next
237393f3d2b8Schs 	 * time we load the space map.
237493f3d2b8Schs 	 */
237593f3d2b8Schs 	space_map_histogram_add(msp->ms_sm, *freetree, tx);
237693f3d2b8Schs 
237793f3d2b8Schs 	metaslab_group_histogram_add(mg, msp);
237893f3d2b8Schs 	metaslab_group_histogram_verify(mg);
237993f3d2b8Schs 	metaslab_class_histogram_verify(mg->mg_class);
238093f3d2b8Schs 
238193f3d2b8Schs 	/*
238293f3d2b8Schs 	 * For sync pass 1, we avoid traversing this txg's free range tree
238393f3d2b8Schs 	 * and instead will just swap the pointers for freetree and
238493f3d2b8Schs 	 * freed_tree. We can safely do this since the freed_tree is
238593f3d2b8Schs 	 * guaranteed to be empty on the initial pass.
238693f3d2b8Schs 	 */
238793f3d2b8Schs 	if (spa_sync_pass(spa) == 1) {
238893f3d2b8Schs 		range_tree_swap(freetree, freed_tree);
238993f3d2b8Schs 	} else {
239093f3d2b8Schs 		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
239193f3d2b8Schs 	}
239293f3d2b8Schs 	range_tree_vacate(alloctree, NULL, NULL);
239393f3d2b8Schs 
239493f3d2b8Schs 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
239593f3d2b8Schs 	ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
239693f3d2b8Schs 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2397c1cb2cd8Shaad 
2398c1cb2cd8Shaad 	mutex_exit(&msp->ms_lock);
2399c1cb2cd8Shaad 
240093f3d2b8Schs 	if (object != space_map_object(msp->ms_sm)) {
240193f3d2b8Schs 		object = space_map_object(msp->ms_sm);
240293f3d2b8Schs 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
240393f3d2b8Schs 		    msp->ms_id, sizeof (uint64_t), &object, tx);
240493f3d2b8Schs 	}
2405c1cb2cd8Shaad 	dmu_tx_commit(tx);
2406c1cb2cd8Shaad }
2407c1cb2cd8Shaad 
2408c1cb2cd8Shaad /*
2409c1cb2cd8Shaad  * Called after a transaction group has completely synced to mark
2410c1cb2cd8Shaad  * all of the metaslab's free space as usable.
2411c1cb2cd8Shaad  */
2412c1cb2cd8Shaad void
2413c1cb2cd8Shaad metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2414c1cb2cd8Shaad {
2415c1cb2cd8Shaad 	metaslab_group_t *mg = msp->ms_group;
2416c1cb2cd8Shaad 	vdev_t *vd = mg->mg_vd;
241793f3d2b8Schs 	spa_t *spa = vd->vdev_spa;
241893f3d2b8Schs 	range_tree_t **freed_tree;
241993f3d2b8Schs 	range_tree_t **defer_tree;
2420f59c7639Shaad 	int64_t alloc_delta, defer_delta;
242193f3d2b8Schs 	boolean_t defer_allowed = B_TRUE;
2422f59c7639Shaad 
2423f59c7639Shaad 	ASSERT(!vd->vdev_ishole);
2424c1cb2cd8Shaad 
2425c1cb2cd8Shaad 	mutex_enter(&msp->ms_lock);
2426c1cb2cd8Shaad 
2427c1cb2cd8Shaad 	/*
2428c1cb2cd8Shaad 	 * If this metaslab is just becoming available, initialize its
242993f3d2b8Schs 	 * alloctrees, freetrees, and defertrees and add its capacity to
243093f3d2b8Schs 	 * the vdev.
2431c1cb2cd8Shaad 	 */
243293f3d2b8Schs 	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
2433f59c7639Shaad 		for (int t = 0; t < TXG_SIZE; t++) {
243493f3d2b8Schs 			ASSERT(msp->ms_alloctree[t] == NULL);
243593f3d2b8Schs 			ASSERT(msp->ms_freetree[t] == NULL);
243693f3d2b8Schs 
243793f3d2b8Schs 			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
243893f3d2b8Schs 			    &msp->ms_lock);
243993f3d2b8Schs 			msp->ms_freetree[t] = range_tree_create(NULL, msp,
244093f3d2b8Schs 			    &msp->ms_lock);
2441c1cb2cd8Shaad 		}
2442f59c7639Shaad 
244393f3d2b8Schs 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
244493f3d2b8Schs 			ASSERT(msp->ms_defertree[t] == NULL);
2445f59c7639Shaad 
244693f3d2b8Schs 			msp->ms_defertree[t] = range_tree_create(NULL, msp,
244793f3d2b8Schs 			    &msp->ms_lock);
2448c1cb2cd8Shaad 		}
2449c1cb2cd8Shaad 
245093f3d2b8Schs 		vdev_space_update(vd, 0, 0, msp->ms_size);
245193f3d2b8Schs 	}
245293f3d2b8Schs 
245393f3d2b8Schs 	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
245493f3d2b8Schs 	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
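	/*
	 * Worked example (assuming TXG_DEFER_SIZE == 2): frees synced in
	 * txg N are parked in ms_defertree[N % 2] below (when deferral is
	 * allowed) and are only returned to ms_tree when
	 * metaslab_sync_done() runs for txg N + TXG_DEFER_SIZE, i.e. two
	 * txgs later.
	 */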
245593f3d2b8Schs 
245693f3d2b8Schs 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
245793f3d2b8Schs 	    metaslab_class_get_alloc(spa_normal_class(spa));
245893f3d2b8Schs 	if (free_space <= spa_get_slop_space(spa)) {
245993f3d2b8Schs 		defer_allowed = B_FALSE;
246093f3d2b8Schs 	}
246193f3d2b8Schs 
246293f3d2b8Schs 	defer_delta = 0;
246393f3d2b8Schs 	alloc_delta = space_map_alloc_delta(msp->ms_sm);
246493f3d2b8Schs 	if (defer_allowed) {
246593f3d2b8Schs 		defer_delta = range_tree_space(*freed_tree) -
246693f3d2b8Schs 		    range_tree_space(*defer_tree);
246793f3d2b8Schs 	} else {
246893f3d2b8Schs 		defer_delta -= range_tree_space(*defer_tree);
246993f3d2b8Schs 	}
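
	/*
	 * Worked example (illustrative only): if this txg allocated 10M
	 * and freed 6M in this metaslab, and the defer tree being returned
	 * below holds 4M, then alloc_delta = 10M - 6M = 4M and
	 * defer_delta = 6M - 4M = 2M.  The vdev's allocated space moves by
	 * 4M + 2M = 6M (10M newly allocated minus the 4M of previously
	 * deferred frees that become usable) and ms_deferspace grows by 2M.
	 */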
2470f59c7639Shaad 
2471f59c7639Shaad 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2472c1cb2cd8Shaad 
247393f3d2b8Schs 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
247493f3d2b8Schs 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2475c1cb2cd8Shaad 
2476c1cb2cd8Shaad 	/*
247793f3d2b8Schs 	 * If there's a metaslab_load() in progress, wait for it to complete
2478c1cb2cd8Shaad 	 * so that we have a consistent view of the in-core space map.
2479c1cb2cd8Shaad 	 */
248093f3d2b8Schs 	metaslab_load_wait(msp);
2481c1cb2cd8Shaad 
248293f3d2b8Schs 	/*
248393f3d2b8Schs 	 * Move the frees from the defer_tree back to the free
248493f3d2b8Schs 	 * range tree (if it's loaded). Swap the freed_tree and the
248593f3d2b8Schs 	 * defer_tree -- this is safe to do because we've just emptied out
248693f3d2b8Schs 	 * the defer_tree.
248793f3d2b8Schs 	 */
248893f3d2b8Schs 	range_tree_vacate(*defer_tree,
248993f3d2b8Schs 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
249093f3d2b8Schs 	if (defer_allowed) {
249193f3d2b8Schs 		range_tree_swap(freed_tree, defer_tree);
249293f3d2b8Schs 	} else {
249393f3d2b8Schs 		range_tree_vacate(*freed_tree,
249493f3d2b8Schs 		    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
249593f3d2b8Schs 	}
249693f3d2b8Schs 
249793f3d2b8Schs 	space_map_update(msp->ms_sm);
2498c1cb2cd8Shaad 
2499f59c7639Shaad 	msp->ms_deferspace += defer_delta;
2500f59c7639Shaad 	ASSERT3S(msp->ms_deferspace, >=, 0);
250193f3d2b8Schs 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2502f59c7639Shaad 	if (msp->ms_deferspace != 0) {
2503f59c7639Shaad 		/*
2504f59c7639Shaad 		 * Keep syncing this metaslab until all deferred frees
2505f59c7639Shaad 		 * are back in circulation.
2506f59c7639Shaad 		 */
2507f59c7639Shaad 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2508f59c7639Shaad 	}
2509f59c7639Shaad 
2510c1cb2cd8Shaad 	/*
251193f3d2b8Schs 	 * Calculate the new weights before unloading any metaslabs.
251293f3d2b8Schs 	 * This will give us the most accurate weighting.
2513c1cb2cd8Shaad 	 */
251493f3d2b8Schs 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
2515c1cb2cd8Shaad 
251693f3d2b8Schs 	/*
251793f3d2b8Schs 	 * If the metaslab is loaded and we've not tried to load or allocate
251893f3d2b8Schs 	 * from it in 'metaslab_unload_delay' txgs, then unload it.
251993f3d2b8Schs 	 */
252093f3d2b8Schs 	if (msp->ms_loaded &&
252193f3d2b8Schs 	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
252293f3d2b8Schs 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
252393f3d2b8Schs 			VERIFY0(range_tree_space(
252493f3d2b8Schs 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
2525c1cb2cd8Shaad 		}
2526c1cb2cd8Shaad 
252793f3d2b8Schs 		if (!metaslab_debug_unload)
252893f3d2b8Schs 			metaslab_unload(msp);
252993f3d2b8Schs 	}
2530c1cb2cd8Shaad 
2531c1cb2cd8Shaad 	mutex_exit(&msp->ms_lock);
2532c1cb2cd8Shaad }
2533c1cb2cd8Shaad 
2534f59c7639Shaad void
2535f59c7639Shaad metaslab_sync_reassess(metaslab_group_t *mg)
2536f59c7639Shaad {
253793f3d2b8Schs 	metaslab_group_alloc_update(mg);
253893f3d2b8Schs 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2539f59c7639Shaad 
2540f59c7639Shaad 	/*
254193f3d2b8Schs 	 * Preload the next potential metaslabs.
2542f59c7639Shaad 	 */
254393f3d2b8Schs 	metaslab_group_preload(mg);
2544f59c7639Shaad }
2545f59c7639Shaad 
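/*
 * Worked example for metaslab_distance() below (illustrative only): with
 * 1GB metaslabs (vdev_ms_shift == 30), a metaslab whose ms_id is 10 and a
 * DVA that lands in metaslab 14 of the same vdev are (14 - 10) << 30 = 4GB
 * apart; a DVA on a different vdev is treated as infinitely far away
 * (1ULL << 63).
 */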
2546c1cb2cd8Shaad static uint64_t
2547c1cb2cd8Shaad metaslab_distance(metaslab_t *msp, dva_t *dva)
2548c1cb2cd8Shaad {
2549c1cb2cd8Shaad 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2550c1cb2cd8Shaad 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
255193f3d2b8Schs 	uint64_t start = msp->ms_id;
2552c1cb2cd8Shaad 
2553c1cb2cd8Shaad 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2554c1cb2cd8Shaad 		return (1ULL << 63);
2555c1cb2cd8Shaad 
2556c1cb2cd8Shaad 	if (offset < start)
2557c1cb2cd8Shaad 		return ((start - offset) << ms_shift);
2558c1cb2cd8Shaad 	if (offset > start)
2559c1cb2cd8Shaad 		return ((offset - start) << ms_shift);
2560c1cb2cd8Shaad 	return (0);
2561c1cb2cd8Shaad }
2562c1cb2cd8Shaad 
256393f3d2b8Schs /*
256493f3d2b8Schs  * ==========================================================================
256593f3d2b8Schs  * Metaslab allocation tracing facility
256693f3d2b8Schs  * ==========================================================================
256793f3d2b8Schs  */
256893f3d2b8Schs kstat_t *metaslab_trace_ksp;
256993f3d2b8Schs kstat_named_t metaslab_trace_over_limit;
257093f3d2b8Schs 
257193f3d2b8Schs void
257293f3d2b8Schs metaslab_alloc_trace_init(void)
257393f3d2b8Schs {
257493f3d2b8Schs 	ASSERT(metaslab_alloc_trace_cache == NULL);
257593f3d2b8Schs 	metaslab_alloc_trace_cache = kmem_cache_create(
257693f3d2b8Schs 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
257793f3d2b8Schs 	    0, NULL, NULL, NULL, NULL, NULL, 0);
257893f3d2b8Schs 	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
257993f3d2b8Schs 	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
258093f3d2b8Schs 	if (metaslab_trace_ksp != NULL) {
258193f3d2b8Schs 		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
258293f3d2b8Schs 		kstat_named_init(&metaslab_trace_over_limit,
258393f3d2b8Schs 		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
258493f3d2b8Schs 		kstat_install(metaslab_trace_ksp);
258593f3d2b8Schs 	}
258693f3d2b8Schs }
258793f3d2b8Schs 
258893f3d2b8Schs void
258993f3d2b8Schs metaslab_alloc_trace_fini(void)
259093f3d2b8Schs {
259193f3d2b8Schs 	if (metaslab_trace_ksp != NULL) {
259293f3d2b8Schs 		kstat_delete(metaslab_trace_ksp);
259393f3d2b8Schs 		metaslab_trace_ksp = NULL;
259493f3d2b8Schs 	}
259593f3d2b8Schs 	kmem_cache_destroy(metaslab_alloc_trace_cache);
259693f3d2b8Schs 	metaslab_alloc_trace_cache = NULL;
259793f3d2b8Schs }
259893f3d2b8Schs 
259993f3d2b8Schs /*
260093f3d2b8Schs  * Add an allocation trace element to the allocation tracing list.
260193f3d2b8Schs  */
260293f3d2b8Schs static void
260393f3d2b8Schs metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
260493f3d2b8Schs     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
260593f3d2b8Schs {
260693f3d2b8Schs 	if (!metaslab_trace_enabled)
260793f3d2b8Schs 		return;
260893f3d2b8Schs 
260993f3d2b8Schs 	/*
261093f3d2b8Schs 	 * When the tracing list reaches its maximum we remove
261193f3d2b8Schs 	 * the second element in the list before adding a new one.
261293f3d2b8Schs 	 * By removing the second element we preserve the original
261393f3d2b8Schs 	 * entry as a clue to what allocation steps have already been
261493f3d2b8Schs 	 * performed.
261593f3d2b8Schs 	 */
261693f3d2b8Schs 	if (zal->zal_size == metaslab_trace_max_entries) {
261793f3d2b8Schs 		metaslab_alloc_trace_t *mat_next;
261893f3d2b8Schs #ifdef DEBUG
261993f3d2b8Schs 		panic("too many entries in allocation list");
262093f3d2b8Schs #endif
262193f3d2b8Schs 		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
262293f3d2b8Schs 		zal->zal_size--;
262393f3d2b8Schs 		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
262493f3d2b8Schs 		list_remove(&zal->zal_list, mat_next);
262593f3d2b8Schs 		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
262693f3d2b8Schs 	}
262793f3d2b8Schs 
262893f3d2b8Schs 	metaslab_alloc_trace_t *mat =
262993f3d2b8Schs 	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
263093f3d2b8Schs 	list_link_init(&mat->mat_list_node);
263193f3d2b8Schs 	mat->mat_mg = mg;
263293f3d2b8Schs 	mat->mat_msp = msp;
263393f3d2b8Schs 	mat->mat_size = psize;
263493f3d2b8Schs 	mat->mat_dva_id = dva_id;
263593f3d2b8Schs 	mat->mat_offset = offset;
263693f3d2b8Schs 	mat->mat_weight = 0;
263793f3d2b8Schs 
263893f3d2b8Schs 	if (msp != NULL)
263993f3d2b8Schs 		mat->mat_weight = msp->ms_weight;
264093f3d2b8Schs 
264193f3d2b8Schs 	/*
264293f3d2b8Schs 	 * The list is part of the zio so locking is not required. Only
264393f3d2b8Schs 	 * a single thread will perform allocations for a given zio.
264493f3d2b8Schs 	 */
264593f3d2b8Schs 	list_insert_tail(&zal->zal_list, mat);
264693f3d2b8Schs 	zal->zal_size++;
264793f3d2b8Schs 
264893f3d2b8Schs 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
264993f3d2b8Schs }
265093f3d2b8Schs 
265193f3d2b8Schs void
265293f3d2b8Schs metaslab_trace_init(zio_alloc_list_t *zal)
265393f3d2b8Schs {
265493f3d2b8Schs 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
265593f3d2b8Schs 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
265693f3d2b8Schs 	zal->zal_size = 0;
265793f3d2b8Schs }
265893f3d2b8Schs 
265993f3d2b8Schs void
266093f3d2b8Schs metaslab_trace_fini(zio_alloc_list_t *zal)
266193f3d2b8Schs {
266293f3d2b8Schs 	metaslab_alloc_trace_t *mat;
266393f3d2b8Schs 
266493f3d2b8Schs 	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
266593f3d2b8Schs 		kmem_cache_free(metaslab_alloc_trace_cache, mat);
266693f3d2b8Schs 	list_destroy(&zal->zal_list);
266793f3d2b8Schs 	zal->zal_size = 0;
266893f3d2b8Schs }
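
/*
 * Minimal usage sketch for the tracing interfaces above (illustrative
 * only; the real consumer is the zio pipeline, which is an assumption
 * here):
 *
 *	zio_alloc_list_t zal;
 *
 *	metaslab_trace_init(&zal);
 *	error = metaslab_alloc(spa, mc, psize, bp, ndvas, txg,
 *	    NULL, flags, &zal, zio);
 *	metaslab_trace_fini(&zal);
 *
 * Each allocation step taken on behalf of the caller appends one
 * metaslab_alloc_trace_t to zal_list, capped at metaslab_trace_max_entries.
 */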
266993f3d2b8Schs 
267093f3d2b8Schs /*
267193f3d2b8Schs  * ==========================================================================
267293f3d2b8Schs  * Metaslab block operations
267393f3d2b8Schs  * ==========================================================================
267493f3d2b8Schs  */
267593f3d2b8Schs 
267693f3d2b8Schs static void
267793f3d2b8Schs metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
267893f3d2b8Schs {
267993f3d2b8Schs 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
268093f3d2b8Schs 	    flags & METASLAB_DONT_THROTTLE)
268193f3d2b8Schs 		return;
268293f3d2b8Schs 
268393f3d2b8Schs 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
268493f3d2b8Schs 	if (!mg->mg_class->mc_alloc_throttle_enabled)
268593f3d2b8Schs 		return;
268693f3d2b8Schs 
268793f3d2b8Schs 	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
268893f3d2b8Schs }
268993f3d2b8Schs 
269093f3d2b8Schs void
269193f3d2b8Schs metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
269293f3d2b8Schs {
269393f3d2b8Schs 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
269493f3d2b8Schs 	    flags & METASLAB_DONT_THROTTLE)
269593f3d2b8Schs 		return;
269693f3d2b8Schs 
269793f3d2b8Schs 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
269893f3d2b8Schs 	if (!mg->mg_class->mc_alloc_throttle_enabled)
269993f3d2b8Schs 		return;
270093f3d2b8Schs 
270193f3d2b8Schs 	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
270293f3d2b8Schs }
270393f3d2b8Schs 
270493f3d2b8Schs void
270593f3d2b8Schs metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
270693f3d2b8Schs {
270793f3d2b8Schs #ifdef ZFS_DEBUG
270893f3d2b8Schs 	const dva_t *dva = bp->blk_dva;
270993f3d2b8Schs 	int ndvas = BP_GET_NDVAS(bp);
271093f3d2b8Schs 
271193f3d2b8Schs 	for (int d = 0; d < ndvas; d++) {
271293f3d2b8Schs 		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
271393f3d2b8Schs 		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
271493f3d2b8Schs 		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
271593f3d2b8Schs 	}
271693f3d2b8Schs #endif
271793f3d2b8Schs }
271893f3d2b8Schs 
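/*
 * Small worked example for the alignment checks in metaslab_block_alloc()
 * below (illustrative only): with vdev_ashift == 12 every allocation must
 * start and end on a 4K boundary, so P2PHASE(start, 1ULL << 12) and
 * P2PHASE(size, 1ULL << 12) must both be zero; e.g. start = 0x3000,
 * size = 0x2000 passes, while start = 0x3200 would trip the VERIFY.
 */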
2719c1cb2cd8Shaad static uint64_t
272093f3d2b8Schs metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
272193f3d2b8Schs {
272293f3d2b8Schs 	uint64_t start;
272393f3d2b8Schs 	range_tree_t *rt = msp->ms_tree;
272493f3d2b8Schs 	metaslab_class_t *mc = msp->ms_group->mg_class;
272593f3d2b8Schs 
272693f3d2b8Schs 	VERIFY(!msp->ms_condensing);
272793f3d2b8Schs 
272893f3d2b8Schs 	start = mc->mc_ops->msop_alloc(msp, size);
272993f3d2b8Schs 	if (start != -1ULL) {
273093f3d2b8Schs 		metaslab_group_t *mg = msp->ms_group;
273193f3d2b8Schs 		vdev_t *vd = mg->mg_vd;
273293f3d2b8Schs 
273393f3d2b8Schs 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
273493f3d2b8Schs 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
273593f3d2b8Schs 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
273693f3d2b8Schs 		range_tree_remove(rt, start, size);
273793f3d2b8Schs 
273893f3d2b8Schs 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
273993f3d2b8Schs 			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
274093f3d2b8Schs 
274193f3d2b8Schs 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
274293f3d2b8Schs 
274393f3d2b8Schs 		/* Track the last successful allocation */
274493f3d2b8Schs 		msp->ms_alloc_txg = txg;
274593f3d2b8Schs 		metaslab_verify_space(msp, txg);
274693f3d2b8Schs 	}
274793f3d2b8Schs 
274893f3d2b8Schs 	/*
274993f3d2b8Schs 	 * Now that we've attempted the allocation we need to update the
275093f3d2b8Schs 	 * metaslab's maximum block size since it may have changed.
275193f3d2b8Schs 	 */
275293f3d2b8Schs 	msp->ms_max_size = metaslab_block_maxsize(msp);
275393f3d2b8Schs 	return (start);
275493f3d2b8Schs }
275593f3d2b8Schs 
275693f3d2b8Schs static uint64_t
275793f3d2b8Schs metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
275893f3d2b8Schs     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2759c1cb2cd8Shaad {
2760c1cb2cd8Shaad 	metaslab_t *msp = NULL;
2761c1cb2cd8Shaad 	uint64_t offset = -1ULL;
2762c1cb2cd8Shaad 	uint64_t activation_weight;
2763c1cb2cd8Shaad 	uint64_t target_distance;
2764c1cb2cd8Shaad 	int i;
2765c1cb2cd8Shaad 
2766c1cb2cd8Shaad 	activation_weight = METASLAB_WEIGHT_PRIMARY;
2767f59c7639Shaad 	for (i = 0; i < d; i++) {
2768f59c7639Shaad 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2769c1cb2cd8Shaad 			activation_weight = METASLAB_WEIGHT_SECONDARY;
2770f59c7639Shaad 			break;
2771f59c7639Shaad 		}
2772f59c7639Shaad 	}
2773c1cb2cd8Shaad 
277493f3d2b8Schs 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
277593f3d2b8Schs 	search->ms_weight = UINT64_MAX;
277693f3d2b8Schs 	search->ms_start = 0;
2777c1cb2cd8Shaad 	for (;;) {
2778f59c7639Shaad 		boolean_t was_active;
277993f3d2b8Schs 		avl_tree_t *t = &mg->mg_metaslab_tree;
278093f3d2b8Schs 		avl_index_t idx;
2781f59c7639Shaad 
2782c1cb2cd8Shaad 		mutex_enter(&mg->mg_lock);
278393f3d2b8Schs 
278493f3d2b8Schs 		/*
278593f3d2b8Schs 		 * Find the metaslab with the highest weight that is less
278693f3d2b8Schs 		 * than what we've already tried.  In the common case, this
278793f3d2b8Schs 		 * means that we will examine each metaslab at most once.
278893f3d2b8Schs 		 * Note that concurrent callers could reorder metaslabs
278993f3d2b8Schs 		 * by activation/passivation once we have dropped the mg_lock.
279093f3d2b8Schs 		 * If a metaslab is activated by another thread, and we fail
279193f3d2b8Schs 		 * to allocate from the metaslab we have selected, we may
279293f3d2b8Schs 		 * not try the newly-activated metaslab, and instead activate
279393f3d2b8Schs 		 * another metaslab.  This is not optimal, but generally
279493f3d2b8Schs 		 * does not cause any problems (a possible exception being
279593f3d2b8Schs 		 * if every metaslab is completely full except for the
279693f3d2b8Schs 		 * newly-activated metaslab which we fail to examine).
279793f3d2b8Schs 		 */
279893f3d2b8Schs 		msp = avl_find(t, search, &idx);
279993f3d2b8Schs 		if (msp == NULL)
280093f3d2b8Schs 			msp = avl_nearest(t, idx, AVL_AFTER);
280193f3d2b8Schs 		for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
280293f3d2b8Schs 
280393f3d2b8Schs 			if (!metaslab_should_allocate(msp, asize)) {
280493f3d2b8Schs 				metaslab_trace_add(zal, mg, msp, asize, d,
280593f3d2b8Schs 				    TRACE_TOO_SMALL);
280693f3d2b8Schs 				continue;
2807c1cb2cd8Shaad 			}
2808c1cb2cd8Shaad 
280993f3d2b8Schs 			/*
281093f3d2b8Schs 			 * If the selected metaslab is condensing, skip it.
281193f3d2b8Schs 			 */
281293f3d2b8Schs 			if (msp->ms_condensing)
281393f3d2b8Schs 				continue;
281493f3d2b8Schs 
2815f59c7639Shaad 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2816c1cb2cd8Shaad 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2817c1cb2cd8Shaad 				break;
2818c1cb2cd8Shaad 
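			/*
			 * E.g. with min_distance == 128MB, a metaslab that
			 * has never had anything allocated from it
			 * (space_map_allocated() == 0) must be at least
			 * 192MB from every DVA already placed, while a
			 * partially used one only needs 128MB.
			 */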
2819c1cb2cd8Shaad 			target_distance = min_distance +
282093f3d2b8Schs 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
282193f3d2b8Schs 			    min_distance >> 1);
2822c1cb2cd8Shaad 
282393f3d2b8Schs 			for (i = 0; i < d; i++) {
2824c1cb2cd8Shaad 				if (metaslab_distance(msp, &dva[i]) <
2825c1cb2cd8Shaad 				    target_distance)
2826c1cb2cd8Shaad 					break;
282793f3d2b8Schs 			}
2828c1cb2cd8Shaad 			if (i == d)
2829c1cb2cd8Shaad 				break;
2830c1cb2cd8Shaad 		}
2831c1cb2cd8Shaad 		mutex_exit(&mg->mg_lock);
283293f3d2b8Schs 		if (msp == NULL) {
283393f3d2b8Schs 			kmem_free(search, sizeof (*search));
2834c1cb2cd8Shaad 			return (-1ULL);
283593f3d2b8Schs 		}
283693f3d2b8Schs 		search->ms_weight = msp->ms_weight;
283793f3d2b8Schs 		search->ms_start = msp->ms_start + 1;
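		/*
		 * (The bumped search key makes the next avl_find() in this
		 * loop resume strictly after the metaslab we just tried,
		 * assuming the tree is ordered by weight and then by start,
		 * so no metaslab is revisited unless its weight changes.)
		 */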
2838c1cb2cd8Shaad 
2839c1cb2cd8Shaad 		mutex_enter(&msp->ms_lock);
2840c1cb2cd8Shaad 
2841c1cb2cd8Shaad 		/*
2842c1cb2cd8Shaad 		 * Ensure that the metaslab we have selected is still
2843c1cb2cd8Shaad 		 * capable of handling our request. It's possible that
2844c1cb2cd8Shaad 		 * another thread may have changed the weight while we
284593f3d2b8Schs 		 * were blocked on the metaslab lock. We check the
284693f3d2b8Schs 		 * active status first to see if we need to reselect
284793f3d2b8Schs 		 * a new metaslab.
2848c1cb2cd8Shaad 		 */
284993f3d2b8Schs 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2850c1cb2cd8Shaad 			mutex_exit(&msp->ms_lock);
2851c1cb2cd8Shaad 			continue;
2852c1cb2cd8Shaad 		}
2853c1cb2cd8Shaad 
2854c1cb2cd8Shaad 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2855c1cb2cd8Shaad 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
2856c1cb2cd8Shaad 			metaslab_passivate(msp,
2857c1cb2cd8Shaad 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2858c1cb2cd8Shaad 			mutex_exit(&msp->ms_lock);
2859c1cb2cd8Shaad 			continue;
2860c1cb2cd8Shaad 		}
2861c1cb2cd8Shaad 
286293f3d2b8Schs 		if (metaslab_activate(msp, activation_weight) != 0) {
286393f3d2b8Schs 			mutex_exit(&msp->ms_lock);
286493f3d2b8Schs 			continue;
286593f3d2b8Schs 		}
286693f3d2b8Schs 		msp->ms_selected_txg = txg;
286793f3d2b8Schs 
286893f3d2b8Schs 		/*
286993f3d2b8Schs 		 * Now that we have the lock, recheck to see if we should
287093f3d2b8Schs 		 * continue to use this metaslab for this allocation. The
287193f3d2b8Schs 		 * metaslab is now loaded so metaslab_should_allocate() can
287293f3d2b8Schs 		 * accurately determine if the allocation attempt should
287393f3d2b8Schs 		 * proceed.
287493f3d2b8Schs 		 */
287593f3d2b8Schs 		if (!metaslab_should_allocate(msp, asize)) {
287693f3d2b8Schs 			/* Passivate this metaslab and select a new one. */
287793f3d2b8Schs 			metaslab_trace_add(zal, mg, msp, asize, d,
287893f3d2b8Schs 			    TRACE_TOO_SMALL);
287993f3d2b8Schs 			goto next;
288093f3d2b8Schs 		}
288193f3d2b8Schs 
288293f3d2b8Schs 		/*
288393f3d2b8Schs 		 * If this metaslab is currently condensing then pick again as
288493f3d2b8Schs 		 * we can't manipulate this metaslab until it's committed
288593f3d2b8Schs 		 * to disk.
288693f3d2b8Schs 		 */
288793f3d2b8Schs 		if (msp->ms_condensing) {
288893f3d2b8Schs 			metaslab_trace_add(zal, mg, msp, asize, d,
288993f3d2b8Schs 			    TRACE_CONDENSING);
2890c1cb2cd8Shaad 			mutex_exit(&msp->ms_lock);
2891c1cb2cd8Shaad 			continue;
2892c1cb2cd8Shaad 		}
2893c1cb2cd8Shaad 
289493f3d2b8Schs 		offset = metaslab_block_alloc(msp, asize, txg);
289593f3d2b8Schs 		metaslab_trace_add(zal, mg, msp, asize, d, offset);
289693f3d2b8Schs 
289793f3d2b8Schs 		if (offset != -1ULL) {
289893f3d2b8Schs 			/* Proactively passivate the metaslab, if needed */
289993f3d2b8Schs 			metaslab_segment_may_passivate(msp);
2900c1cb2cd8Shaad 			break;
290193f3d2b8Schs 		}
290293f3d2b8Schs next:
290393f3d2b8Schs 		ASSERT(msp->ms_loaded);
2904c1cb2cd8Shaad 
290593f3d2b8Schs 		/*
290693f3d2b8Schs 		 * We were unable to allocate from this metaslab so determine
290793f3d2b8Schs 		 * a new weight for this metaslab. Now that we have loaded
290893f3d2b8Schs 		 * the metaslab we can provide a better hint to the metaslab
290993f3d2b8Schs 		 * selector.
291093f3d2b8Schs 		 *
291193f3d2b8Schs 		 * For space-based metaslabs, we use the maximum block size.
291293f3d2b8Schs 		 * This information is only available when the metaslab
291393f3d2b8Schs 		 * is loaded and is more accurate than the generic free
291493f3d2b8Schs 		 * space weight that was calculated by metaslab_weight().
291593f3d2b8Schs 		 * This information allows us to quickly compare the maximum
291693f3d2b8Schs 		 * available allocation in the metaslab to the allocation
291793f3d2b8Schs 		 * size being requested.
291893f3d2b8Schs 		 *
291993f3d2b8Schs 		 * For segment-based metaslabs, determine the new weight
292093f3d2b8Schs 		 * based on the highest bucket in the range tree. We
292193f3d2b8Schs 		 * explicitly use the loaded segment weight (i.e. the range
292293f3d2b8Schs 		 * tree histogram) since it contains the space that is
292393f3d2b8Schs 		 * currently available for allocation and is accurate
292493f3d2b8Schs 		 * even within a sync pass.
292593f3d2b8Schs 		 */
292693f3d2b8Schs 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
292793f3d2b8Schs 			uint64_t weight = metaslab_block_maxsize(msp);
292893f3d2b8Schs 			WEIGHT_SET_SPACEBASED(weight);
292993f3d2b8Schs 			metaslab_passivate(msp, weight);
293093f3d2b8Schs 		} else {
293193f3d2b8Schs 			metaslab_passivate(msp,
293293f3d2b8Schs 			    metaslab_weight_from_range_tree(msp));
2933c1cb2cd8Shaad 		}
2934c1cb2cd8Shaad 
293593f3d2b8Schs 		/*
293693f3d2b8Schs 		 * We have just failed an allocation attempt, check
293793f3d2b8Schs 		 * that metaslab_should_allocate() agrees. Otherwise,
293893f3d2b8Schs 		 * we may end up in an infinite loop retrying the same
293993f3d2b8Schs 		 * metaslab.
294093f3d2b8Schs 		 */
294193f3d2b8Schs 		ASSERT(!metaslab_should_allocate(msp, asize));
2942c1cb2cd8Shaad 		mutex_exit(&msp->ms_lock);
294393f3d2b8Schs 	}
294493f3d2b8Schs 	mutex_exit(&msp->ms_lock);
294593f3d2b8Schs 	kmem_free(search, sizeof (*search));
2946c1cb2cd8Shaad 	return (offset);
2947c1cb2cd8Shaad }
2948c1cb2cd8Shaad 
294993f3d2b8Schs static uint64_t
295093f3d2b8Schs metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
295193f3d2b8Schs     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
295293f3d2b8Schs {
295393f3d2b8Schs 	uint64_t offset;
295493f3d2b8Schs 	ASSERT(mg->mg_initialized);
295593f3d2b8Schs 
295693f3d2b8Schs 	offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
295793f3d2b8Schs 	    min_distance, dva, d);
295893f3d2b8Schs 
295993f3d2b8Schs 	mutex_enter(&mg->mg_lock);
296093f3d2b8Schs 	if (offset == -1ULL) {
296193f3d2b8Schs 		mg->mg_failed_allocations++;
296293f3d2b8Schs 		metaslab_trace_add(zal, mg, NULL, asize, d,
296393f3d2b8Schs 		    TRACE_GROUP_FAILURE);
296493f3d2b8Schs 		if (asize == SPA_GANGBLOCKSIZE) {
296593f3d2b8Schs 			/*
296693f3d2b8Schs 			 * This metaslab group was unable to allocate
296793f3d2b8Schs 			 * the minimum gang block size so it must be out of
296893f3d2b8Schs 			 * space. We must notify the allocation throttle
296993f3d2b8Schs 			 * to start skipping allocation attempts to this
297093f3d2b8Schs 			 * metaslab group until more space becomes available.
297193f3d2b8Schs 			 * Note: this failure cannot be caused by the
297293f3d2b8Schs 			 * allocation throttle since the allocation throttle
297393f3d2b8Schs 			 * is only responsible for skipping devices and
297493f3d2b8Schs 			 * not failing block allocations.
297593f3d2b8Schs 			 */
297693f3d2b8Schs 			mg->mg_no_free_space = B_TRUE;
297793f3d2b8Schs 		}
297893f3d2b8Schs 	}
297993f3d2b8Schs 	mg->mg_allocations++;
298093f3d2b8Schs 	mutex_exit(&mg->mg_lock);
298193f3d2b8Schs 	return (offset);
298293f3d2b8Schs }
298393f3d2b8Schs 
298493f3d2b8Schs /*
298593f3d2b8Schs  * If we have to write a ditto block (i.e. more than one DVA for a given BP)
298693f3d2b8Schs  * on the same vdev as an existing DVA of this BP, then try to allocate it
298793f3d2b8Schs  * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
298893f3d2b8Schs  * existing DVAs.
298993f3d2b8Schs  */
299093f3d2b8Schs int ditto_same_vdev_distance_shift = 3;
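
/*
 * For example (illustrative only): on a 1TB top-level vdev the default
 * shift of 3 asks for the DVAs of one BP to be at least 1TB / 8 = 128GB
 * apart; metaslab_alloc_dva() drops the requirement entirely when that
 * distance is no larger than a single metaslab.
 */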
299193f3d2b8Schs 
2992c1cb2cd8Shaad /*
2993c1cb2cd8Shaad  * Allocate a block for the specified i/o.
2994c1cb2cd8Shaad  */
2995c1cb2cd8Shaad static int
2996c1cb2cd8Shaad metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
299793f3d2b8Schs     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
299893f3d2b8Schs     zio_alloc_list_t *zal)
2999c1cb2cd8Shaad {
3000c1cb2cd8Shaad 	metaslab_group_t *mg, *rotor;
3001c1cb2cd8Shaad 	vdev_t *vd;
300293f3d2b8Schs 	boolean_t try_hard = B_FALSE;
3003c1cb2cd8Shaad 
3004c1cb2cd8Shaad 	ASSERT(!DVA_IS_VALID(&dva[d]));
3005c1cb2cd8Shaad 
3006c1cb2cd8Shaad 	/*
3007c1cb2cd8Shaad 	 * For testing, make some blocks above a certain size be gang blocks.
3008c1cb2cd8Shaad 	 */
300993f3d2b8Schs 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
301093f3d2b8Schs 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
301193f3d2b8Schs 		return (SET_ERROR(ENOSPC));
301293f3d2b8Schs 	}
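	/*
	 * (The lbolt check above makes the forced failure probabilistic:
	 * roughly one in four calls with a large enough psize returns
	 * ENOSPC here and is therefore ganged.)
	 */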
3013c1cb2cd8Shaad 
3014c1cb2cd8Shaad 	/*
3015c1cb2cd8Shaad 	 * Start at the rotor and loop through all mgs until we find something.
3016f59c7639Shaad 	 * Note that there's no locking on mc_rotor or mc_aliquot because
3017c1cb2cd8Shaad 	 * nothing actually breaks if we miss a few updates -- we just won't
3018c1cb2cd8Shaad 	 * allocate quite as evenly.  It all balances out over time.
3019c1cb2cd8Shaad 	 *
3020c1cb2cd8Shaad 	 * If we are doing ditto or log blocks, try to spread them across
3021c1cb2cd8Shaad 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
3022c1cb2cd8Shaad 	 * allocated all of our ditto blocks, then try and spread them out on
3023c1cb2cd8Shaad 	 * that vdev as much as possible.  If it turns out to not be possible,
3024c1cb2cd8Shaad 	 * gradually lower our standards until anything becomes acceptable.
3025c1cb2cd8Shaad 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
3026c1cb2cd8Shaad 	 * gives us hope of containing our fault domains to something we're
3027c1cb2cd8Shaad 	 * able to reason about.  Otherwise, any two top-level vdev failures
3028c1cb2cd8Shaad 	 * will guarantee the loss of data.  With consecutive allocation,
3029c1cb2cd8Shaad 	 * only two adjacent top-level vdev failures will result in data loss.
3030c1cb2cd8Shaad 	 *
3031c1cb2cd8Shaad 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
3032c1cb2cd8Shaad 	 * ourselves on the same vdev as our gang block header.  That
3033c1cb2cd8Shaad 	 * way, we can hope for locality in vdev_cache, plus it makes our
3034c1cb2cd8Shaad 	 * fault domains something tractable.
3035c1cb2cd8Shaad 	 */
3036c1cb2cd8Shaad 	if (hintdva) {
3037c1cb2cd8Shaad 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
3038f59c7639Shaad 
3039f59c7639Shaad 		/*
3040f59c7639Shaad 		 * It's possible the vdev we're using as the hint no
3041f59c7639Shaad 		 * longer exists (i.e. removed). Consult the rotor when
3042f59c7639Shaad 		 * all else fails.
3043f59c7639Shaad 		 */
3044f59c7639Shaad 		if (vd != NULL) {
3045c1cb2cd8Shaad 			mg = vd->vdev_mg;
3046f59c7639Shaad 
3047f59c7639Shaad 			if (flags & METASLAB_HINTBP_AVOID &&
3048f59c7639Shaad 			    mg->mg_next != NULL)
3049f59c7639Shaad 				mg = mg->mg_next;
3050f59c7639Shaad 		} else {
3051f59c7639Shaad 			mg = mc->mc_rotor;
3052f59c7639Shaad 		}
3053c1cb2cd8Shaad 	} else if (d != 0) {
3054c1cb2cd8Shaad 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3055c1cb2cd8Shaad 		mg = vd->vdev_mg->mg_next;
3056c1cb2cd8Shaad 	} else {
3057c1cb2cd8Shaad 		mg = mc->mc_rotor;
3058c1cb2cd8Shaad 	}
3059c1cb2cd8Shaad 
3060c1cb2cd8Shaad 	/*
3061f59c7639Shaad 	 * If the hint put us into the wrong metaslab class, or into a
3062f59c7639Shaad 	 * metaslab group that has been passivated, just follow the rotor.
3063c1cb2cd8Shaad 	 */
3064f59c7639Shaad 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3065c1cb2cd8Shaad 		mg = mc->mc_rotor;
3066c1cb2cd8Shaad 
3067c1cb2cd8Shaad 	rotor = mg;
3068c1cb2cd8Shaad top:
3069c1cb2cd8Shaad 	do {
307093f3d2b8Schs 		boolean_t allocatable;
3071f59c7639Shaad 
307293f3d2b8Schs 		ASSERT(mg->mg_activation_count == 1);
3073c1cb2cd8Shaad 		vd = mg->mg_vd;
3074f59c7639Shaad 
3075c1cb2cd8Shaad 		/*
3076c1cb2cd8Shaad 		 * Don't allocate from faulted devices.
3077c1cb2cd8Shaad 		 */
307893f3d2b8Schs 		if (try_hard) {
3079f59c7639Shaad 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3080f59c7639Shaad 			allocatable = vdev_allocatable(vd);
3081f59c7639Shaad 			spa_config_exit(spa, SCL_ZIO, FTAG);
3082f59c7639Shaad 		} else {
3083f59c7639Shaad 			allocatable = vdev_allocatable(vd);
3084f59c7639Shaad 		}
3085f59c7639Shaad 
3086c1cb2cd8Shaad 		/*
308793f3d2b8Schs 		 * Determine if the selected metaslab group is eligible
308893f3d2b8Schs 		 * for allocations. If we're ganging then don't allow
308993f3d2b8Schs 		 * this metaslab group to skip allocations since that would
309093f3d2b8Schs 		 * inadvertently return ENOSPC and suspend the pool
309193f3d2b8Schs 		 * even though space is still available.
309293f3d2b8Schs 		 */
309393f3d2b8Schs 		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
309493f3d2b8Schs 			allocatable = metaslab_group_allocatable(mg, rotor,
309593f3d2b8Schs 			    psize);
309693f3d2b8Schs 		}
309793f3d2b8Schs 
309893f3d2b8Schs 		if (!allocatable) {
309993f3d2b8Schs 			metaslab_trace_add(zal, mg, NULL, psize, d,
310093f3d2b8Schs 			    TRACE_NOT_ALLOCATABLE);
310193f3d2b8Schs 			goto next;
310293f3d2b8Schs 		}
310393f3d2b8Schs 
310493f3d2b8Schs 		ASSERT(mg->mg_initialized);
310593f3d2b8Schs 
310693f3d2b8Schs 		/*
310793f3d2b8Schs 		 * Avoid writing single-copy data to a failing,
310893f3d2b8Schs 		 * non-redundant vdev, unless we've already tried all
310993f3d2b8Schs 		 * other vdevs.
3110c1cb2cd8Shaad 		 */
3111c1cb2cd8Shaad 		if ((vd->vdev_stat.vs_write_errors > 0 ||
3112c1cb2cd8Shaad 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
311393f3d2b8Schs 		    d == 0 && !try_hard && vd->vdev_children == 0) {
311493f3d2b8Schs 			metaslab_trace_add(zal, mg, NULL, psize, d,
311593f3d2b8Schs 			    TRACE_VDEV_ERROR);
3116c1cb2cd8Shaad 			goto next;
3117c1cb2cd8Shaad 		}
3118c1cb2cd8Shaad 
3119c1cb2cd8Shaad 		ASSERT(mg->mg_class == mc);
3120c1cb2cd8Shaad 
312193f3d2b8Schs 		/*
312293f3d2b8Schs 		 * If we don't need to try hard, then require that the
312393f3d2b8Schs 		 * block be 1/8th of the device away from any other DVAs
312493f3d2b8Schs 		 * in this BP.  If we are trying hard, allow any offset
312593f3d2b8Schs 		 * to be used (distance=0).
312693f3d2b8Schs 		 */
312793f3d2b8Schs 		uint64_t distance = 0;
312893f3d2b8Schs 		if (!try_hard) {
312993f3d2b8Schs 			distance = vd->vdev_asize >>
313093f3d2b8Schs 			    ditto_same_vdev_distance_shift;
3131c1cb2cd8Shaad 			if (distance <= (1ULL << vd->vdev_ms_shift))
3132c1cb2cd8Shaad 				distance = 0;
313393f3d2b8Schs 		}
3134c1cb2cd8Shaad 
313593f3d2b8Schs 		uint64_t asize = vdev_psize_to_asize(vd, psize);
3136c1cb2cd8Shaad 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3137c1cb2cd8Shaad 
313893f3d2b8Schs 		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
313993f3d2b8Schs 		    distance, dva, d);
314093f3d2b8Schs 
3141c1cb2cd8Shaad 		if (offset != -1ULL) {
3142c1cb2cd8Shaad 			/*
3143c1cb2cd8Shaad 			 * If we've just selected this metaslab group,
3144c1cb2cd8Shaad 			 * figure out whether the corresponding vdev is
3145c1cb2cd8Shaad 			 * over- or under-used relative to the pool,
3146c1cb2cd8Shaad 			 * and set an allocation bias to even it out.
3147c1cb2cd8Shaad 			 */
314893f3d2b8Schs 			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3149c1cb2cd8Shaad 				vdev_stat_t *vs = &vd->vdev_stat;
3150f59c7639Shaad 				int64_t vu, cu;
3151c1cb2cd8Shaad 
315293f3d2b8Schs 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
315393f3d2b8Schs 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3154c1cb2cd8Shaad 
3155c1cb2cd8Shaad 				/*
315693f3d2b8Schs 				 * Calculate how much more or less we should
315793f3d2b8Schs 				 * try to allocate from this device during
315893f3d2b8Schs 				 * this iteration around the rotor.
315993f3d2b8Schs 				 * For example, if a device is 80% full
316093f3d2b8Schs 				 * and the pool is 20% full then we should
316193f3d2b8Schs 				 * reduce allocations by 60% on this device.
316293f3d2b8Schs 				 *
316393f3d2b8Schs 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
316493f3d2b8Schs 				 *
316593f3d2b8Schs 				 * This reduces allocations by 307K for this
316693f3d2b8Schs 				 * iteration.
3167c1cb2cd8Shaad 				 */
3168f59c7639Shaad 				mg->mg_bias = ((cu - vu) *
316993f3d2b8Schs 				    (int64_t)mg->mg_aliquot) / 100;
317093f3d2b8Schs 			} else if (!metaslab_bias_enabled) {
317193f3d2b8Schs 				mg->mg_bias = 0;
3172c1cb2cd8Shaad 			}
3173c1cb2cd8Shaad 
3174f59c7639Shaad 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3175c1cb2cd8Shaad 			    mg->mg_aliquot + mg->mg_bias) {
3176c1cb2cd8Shaad 				mc->mc_rotor = mg->mg_next;
3177f59c7639Shaad 				mc->mc_aliquot = 0;
3178c1cb2cd8Shaad 			}
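			/*
			 * Continuing the example above (illustrative only):
			 * with mg_aliquot = 512K and mg_bias = -307K the
			 * rotor advances to the next metaslab group once
			 * roughly 205K has been allocated from this group
			 * on this pass around the rotor.
			 */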
3179c1cb2cd8Shaad 
3180c1cb2cd8Shaad 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
3181c1cb2cd8Shaad 			DVA_SET_OFFSET(&dva[d], offset);
3182c1cb2cd8Shaad 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3183c1cb2cd8Shaad 			DVA_SET_ASIZE(&dva[d], asize);
3184c1cb2cd8Shaad 
3185c1cb2cd8Shaad 			return (0);
3186c1cb2cd8Shaad 		}
3187c1cb2cd8Shaad next:
3188c1cb2cd8Shaad 		mc->mc_rotor = mg->mg_next;
3189f59c7639Shaad 		mc->mc_aliquot = 0;
3190c1cb2cd8Shaad 	} while ((mg = mg->mg_next) != rotor);
3191c1cb2cd8Shaad 
319293f3d2b8Schs 	/*
319393f3d2b8Schs 	 * If we haven't tried hard, do so now.
319493f3d2b8Schs 	 */
319593f3d2b8Schs 	if (!try_hard) {
319693f3d2b8Schs 		try_hard = B_TRUE;
3197f59c7639Shaad 		goto top;
3198f59c7639Shaad 	}
3199f59c7639Shaad 
3200c1cb2cd8Shaad 	bzero(&dva[d], sizeof (dva_t));
3201c1cb2cd8Shaad 
320293f3d2b8Schs 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
320393f3d2b8Schs 	return (SET_ERROR(ENOSPC));
3204c1cb2cd8Shaad }
3205c1cb2cd8Shaad 
3206c1cb2cd8Shaad /*
3207c1cb2cd8Shaad  * Free the block represented by DVA in the context of the specified
3208c1cb2cd8Shaad  * transaction group.
3209c1cb2cd8Shaad  */
3210c1cb2cd8Shaad static void
3211c1cb2cd8Shaad metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
3212c1cb2cd8Shaad {
3213c1cb2cd8Shaad 	uint64_t vdev = DVA_GET_VDEV(dva);
3214c1cb2cd8Shaad 	uint64_t offset = DVA_GET_OFFSET(dva);
3215c1cb2cd8Shaad 	uint64_t size = DVA_GET_ASIZE(dva);
3216c1cb2cd8Shaad 	vdev_t *vd;
3217c1cb2cd8Shaad 	metaslab_t *msp;
3218c1cb2cd8Shaad 
3219c1cb2cd8Shaad 	ASSERT(DVA_IS_VALID(dva));
3220c1cb2cd8Shaad 
3221c1cb2cd8Shaad 	if (txg > spa_freeze_txg(spa))
3222c1cb2cd8Shaad 		return;
3223c1cb2cd8Shaad 
3224c1cb2cd8Shaad 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3225c1cb2cd8Shaad 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3226c1cb2cd8Shaad 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3227c1cb2cd8Shaad 		    (u_longlong_t)vdev, (u_longlong_t)offset);
3228c1cb2cd8Shaad 		ASSERT(0);
3229c1cb2cd8Shaad 		return;
3230c1cb2cd8Shaad 	}
3231c1cb2cd8Shaad 
3232c1cb2cd8Shaad 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3233c1cb2cd8Shaad 
3234c1cb2cd8Shaad 	if (DVA_GET_GANG(dva))
3235c1cb2cd8Shaad 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3236c1cb2cd8Shaad 
3237c1cb2cd8Shaad 	mutex_enter(&msp->ms_lock);
3238c1cb2cd8Shaad 
3239c1cb2cd8Shaad 	if (now) {
324093f3d2b8Schs 		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3241c1cb2cd8Shaad 		    offset, size);
324293f3d2b8Schs 
324393f3d2b8Schs 		VERIFY(!msp->ms_condensing);
324493f3d2b8Schs 		VERIFY3U(offset, >=, msp->ms_start);
324593f3d2b8Schs 		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
324693f3d2b8Schs 		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
324793f3d2b8Schs 		    msp->ms_size);
324893f3d2b8Schs 		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
324993f3d2b8Schs 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
325093f3d2b8Schs 		range_tree_add(msp->ms_tree, offset, size);
325193f3d2b8Schs 		msp->ms_max_size = metaslab_block_maxsize(msp);
3252c1cb2cd8Shaad 	} else {
325393f3d2b8Schs 		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
3254c1cb2cd8Shaad 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
325593f3d2b8Schs 		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
325693f3d2b8Schs 		    offset, size);
3257c1cb2cd8Shaad 	}
3258c1cb2cd8Shaad 
3259c1cb2cd8Shaad 	mutex_exit(&msp->ms_lock);
3260c1cb2cd8Shaad }
3261c1cb2cd8Shaad 
3262c1cb2cd8Shaad /*
3263c1cb2cd8Shaad  * Intent log support: upon opening the pool after a crash, notify the SPA
3264c1cb2cd8Shaad  * of blocks that the intent log has allocated for immediate write, but
3265c1cb2cd8Shaad  * which are still considered free by the SPA because the last transaction
3266c1cb2cd8Shaad  * group didn't commit yet.
3267c1cb2cd8Shaad  */
3268c1cb2cd8Shaad static int
3269c1cb2cd8Shaad metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3270c1cb2cd8Shaad {
3271c1cb2cd8Shaad 	uint64_t vdev = DVA_GET_VDEV(dva);
3272c1cb2cd8Shaad 	uint64_t offset = DVA_GET_OFFSET(dva);
3273c1cb2cd8Shaad 	uint64_t size = DVA_GET_ASIZE(dva);
3274c1cb2cd8Shaad 	vdev_t *vd;
3275c1cb2cd8Shaad 	metaslab_t *msp;
3276f59c7639Shaad 	int error = 0;
3277c1cb2cd8Shaad 
3278c1cb2cd8Shaad 	ASSERT(DVA_IS_VALID(dva));
3279c1cb2cd8Shaad 
3280c1cb2cd8Shaad 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3281c1cb2cd8Shaad 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
328293f3d2b8Schs 		return (SET_ERROR(ENXIO));
3283c1cb2cd8Shaad 
3284c1cb2cd8Shaad 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3285c1cb2cd8Shaad 
3286c1cb2cd8Shaad 	if (DVA_GET_GANG(dva))
3287c1cb2cd8Shaad 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3288c1cb2cd8Shaad 
3289c1cb2cd8Shaad 	mutex_enter(&msp->ms_lock);
3290c1cb2cd8Shaad 
329193f3d2b8Schs 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
329293f3d2b8Schs 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3293f59c7639Shaad 
329493f3d2b8Schs 	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
329593f3d2b8Schs 		error = SET_ERROR(ENOENT);
3296f59c7639Shaad 
3297c1cb2cd8Shaad 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
3298c1cb2cd8Shaad 		mutex_exit(&msp->ms_lock);
3299c1cb2cd8Shaad 		return (error);
3300c1cb2cd8Shaad 	}
3301c1cb2cd8Shaad 
330293f3d2b8Schs 	VERIFY(!msp->ms_condensing);
330393f3d2b8Schs 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
330493f3d2b8Schs 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
330593f3d2b8Schs 	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
330693f3d2b8Schs 	range_tree_remove(msp->ms_tree, offset, size);
3307c1cb2cd8Shaad 
3308f59c7639Shaad 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
330993f3d2b8Schs 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3310c1cb2cd8Shaad 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
331193f3d2b8Schs 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3312c1cb2cd8Shaad 	}
3313c1cb2cd8Shaad 
3314c1cb2cd8Shaad 	mutex_exit(&msp->ms_lock);
3315c1cb2cd8Shaad 
3316c1cb2cd8Shaad 	return (0);
3317c1cb2cd8Shaad }
3318c1cb2cd8Shaad 
331993f3d2b8Schs /*
332093f3d2b8Schs  * Reserve some allocation slots. The reservation system must be called
332193f3d2b8Schs  * before we call into the allocator. If there aren't any available slots
332293f3d2b8Schs  * then the I/O will be throttled until an I/O completes and its slots are
332393f3d2b8Schs  * freed up. The function returns true if it was successful in placing
332493f3d2b8Schs  * the reservation.
332593f3d2b8Schs  */
332693f3d2b8Schs boolean_t
332793f3d2b8Schs metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
332893f3d2b8Schs     int flags)
332993f3d2b8Schs {
333093f3d2b8Schs 	uint64_t available_slots = 0;
333193f3d2b8Schs 	boolean_t slot_reserved = B_FALSE;
333293f3d2b8Schs 
333393f3d2b8Schs 	ASSERT(mc->mc_alloc_throttle_enabled);
333493f3d2b8Schs 	mutex_enter(&mc->mc_lock);
333593f3d2b8Schs 
333693f3d2b8Schs 	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
333793f3d2b8Schs 	if (reserved_slots < mc->mc_alloc_max_slots)
333893f3d2b8Schs 		available_slots = mc->mc_alloc_max_slots - reserved_slots;
333993f3d2b8Schs 
334093f3d2b8Schs 	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
334193f3d2b8Schs 		/*
334293f3d2b8Schs 		 * We reserve the slots individually so that we can unreserve
334393f3d2b8Schs 		 * them individually when an I/O completes.
334493f3d2b8Schs 		 */
334593f3d2b8Schs 		for (int d = 0; d < slots; d++) {
334693f3d2b8Schs 			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
334793f3d2b8Schs 		}
334893f3d2b8Schs 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
334993f3d2b8Schs 		slot_reserved = B_TRUE;
335093f3d2b8Schs 	}
335193f3d2b8Schs 
335293f3d2b8Schs 	mutex_exit(&mc->mc_lock);
335393f3d2b8Schs 	return (slot_reserved);
335493f3d2b8Schs }
335593f3d2b8Schs 
335693f3d2b8Schs void
335793f3d2b8Schs metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
335893f3d2b8Schs {
335993f3d2b8Schs 	ASSERT(mc->mc_alloc_throttle_enabled);
336093f3d2b8Schs 	mutex_enter(&mc->mc_lock);
336193f3d2b8Schs 	for (int d = 0; d < slots; d++) {
336293f3d2b8Schs 		(void) refcount_remove(&mc->mc_alloc_slots, zio);
336393f3d2b8Schs 	}
336493f3d2b8Schs 	mutex_exit(&mc->mc_lock);
336593f3d2b8Schs }
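
/*
 * Sketch of how the reserve/unreserve pair above is meant to be used
 * (simplified and hedged; the actual call sites live in the zio pipeline
 * and the surrounding names are assumptions):
 *
 *	if (metaslab_class_throttle_reserve(mc, ndvas, zio, flags)) {
 *		error = metaslab_alloc(spa, mc, psize, bp, ndvas, txg,
 *		    hintbp, flags, &zal, zio);
 *		...
 *		metaslab_class_throttle_unreserve(mc, ndvas, zio);
 *	} else {
 *		... throttle: requeue the zio until slots free up ...
 *	}
 */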
336693f3d2b8Schs 
3367c1cb2cd8Shaad int
3368c1cb2cd8Shaad metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
336993f3d2b8Schs     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
337093f3d2b8Schs     zio_alloc_list_t *zal, zio_t *zio)
3371c1cb2cd8Shaad {
3372c1cb2cd8Shaad 	dva_t *dva = bp->blk_dva;
3373*03f30658Sfox 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
3374c1cb2cd8Shaad 	int error = 0;
3375c1cb2cd8Shaad 
3376c1cb2cd8Shaad 	ASSERT(bp->blk_birth == 0);
3377f59c7639Shaad 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3378c1cb2cd8Shaad 
3379c1cb2cd8Shaad 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3380c1cb2cd8Shaad 
3381c1cb2cd8Shaad 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
3382c1cb2cd8Shaad 		spa_config_exit(spa, SCL_ALLOC, FTAG);
338393f3d2b8Schs 		return (SET_ERROR(ENOSPC));
3384c1cb2cd8Shaad 	}
3385c1cb2cd8Shaad 
3386c1cb2cd8Shaad 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3387c1cb2cd8Shaad 	ASSERT(BP_GET_NDVAS(bp) == 0);
3388c1cb2cd8Shaad 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
338993f3d2b8Schs 	ASSERT3P(zal, !=, NULL);
3390c1cb2cd8Shaad 
3391c1cb2cd8Shaad 	for (int d = 0; d < ndvas; d++) {
3392c1cb2cd8Shaad 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
339393f3d2b8Schs 		    txg, flags, zal);
339493f3d2b8Schs 		if (error != 0) {
3395c1cb2cd8Shaad 			for (d--; d >= 0; d--) {
3396c1cb2cd8Shaad 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
339793f3d2b8Schs 				metaslab_group_alloc_decrement(spa,
339893f3d2b8Schs 				    DVA_GET_VDEV(&dva[d]), zio, flags);
3399c1cb2cd8Shaad 				bzero(&dva[d], sizeof (dva_t));
3400c1cb2cd8Shaad 			}
3401c1cb2cd8Shaad 			spa_config_exit(spa, SCL_ALLOC, FTAG);
3402c1cb2cd8Shaad 			return (error);
340393f3d2b8Schs 		} else {
340493f3d2b8Schs 			/*
340593f3d2b8Schs 			 * Update the metaslab group's queue depth
340693f3d2b8Schs 			 * based on the newly allocated dva.
340793f3d2b8Schs 			 */
340893f3d2b8Schs 			metaslab_group_alloc_increment(spa,
340993f3d2b8Schs 			    DVA_GET_VDEV(&dva[d]), zio, flags);
3410c1cb2cd8Shaad 		}
341193f3d2b8Schs 
3412c1cb2cd8Shaad 	}
3413c1cb2cd8Shaad 	ASSERT(error == 0);
3414c1cb2cd8Shaad 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
3415c1cb2cd8Shaad 
3416c1cb2cd8Shaad 	spa_config_exit(spa, SCL_ALLOC, FTAG);
3417c1cb2cd8Shaad 
3418f59c7639Shaad 	BP_SET_BIRTH(bp, txg, txg);
3419c1cb2cd8Shaad 
3420c1cb2cd8Shaad 	return (0);
3421c1cb2cd8Shaad }
3422c1cb2cd8Shaad 
3423c1cb2cd8Shaad void
3424c1cb2cd8Shaad metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3425c1cb2cd8Shaad {
3426c1cb2cd8Shaad 	const dva_t *dva = bp->blk_dva;
3427c1cb2cd8Shaad 	int ndvas = BP_GET_NDVAS(bp);
3428c1cb2cd8Shaad 
3429c1cb2cd8Shaad 	ASSERT(!BP_IS_HOLE(bp));
3430f59c7639Shaad 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3431c1cb2cd8Shaad 
3432c1cb2cd8Shaad 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3433c1cb2cd8Shaad 
3434c1cb2cd8Shaad 	for (int d = 0; d < ndvas; d++)
3435c1cb2cd8Shaad 		metaslab_free_dva(spa, &dva[d], txg, now);
3436c1cb2cd8Shaad 
3437c1cb2cd8Shaad 	spa_config_exit(spa, SCL_FREE, FTAG);
3438c1cb2cd8Shaad }
3439c1cb2cd8Shaad 
3440c1cb2cd8Shaad int
3441c1cb2cd8Shaad metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3442c1cb2cd8Shaad {
3443c1cb2cd8Shaad 	const dva_t *dva = bp->blk_dva;
3444c1cb2cd8Shaad 	int ndvas = BP_GET_NDVAS(bp);
3445c1cb2cd8Shaad 	int error = 0;
3446c1cb2cd8Shaad 
3447c1cb2cd8Shaad 	ASSERT(!BP_IS_HOLE(bp));
3448c1cb2cd8Shaad 
3449c1cb2cd8Shaad 	if (txg != 0) {
3450c1cb2cd8Shaad 		/*
3451c1cb2cd8Shaad 		 * First do a dry run to make sure all DVAs are claimable,
3452c1cb2cd8Shaad 		 * so we don't have to unwind from partial failures below.
3453c1cb2cd8Shaad 		 */
3454c1cb2cd8Shaad 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
3455c1cb2cd8Shaad 			return (error);
3456c1cb2cd8Shaad 	}
3457c1cb2cd8Shaad 
3458c1cb2cd8Shaad 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3459c1cb2cd8Shaad 
3460c1cb2cd8Shaad 	for (int d = 0; d < ndvas; d++)
3461c1cb2cd8Shaad 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3462c1cb2cd8Shaad 			break;
3463c1cb2cd8Shaad 
3464c1cb2cd8Shaad 	spa_config_exit(spa, SCL_ALLOC, FTAG);
3465c1cb2cd8Shaad 
3466c1cb2cd8Shaad 	ASSERT(error == 0 || txg == 0);
3467c1cb2cd8Shaad 
3468c1cb2cd8Shaad 	return (error);
3469c1cb2cd8Shaad }
347093f3d2b8Schs 
347193f3d2b8Schs void
347293f3d2b8Schs metaslab_check_free(spa_t *spa, const blkptr_t *bp)
347393f3d2b8Schs {
347493f3d2b8Schs 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
347593f3d2b8Schs 		return;
347693f3d2b8Schs 
347793f3d2b8Schs 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
347893f3d2b8Schs 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
347993f3d2b8Schs 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
348093f3d2b8Schs 		vdev_t *vd = vdev_lookup_top(spa, vdev);
348193f3d2b8Schs 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
348293f3d2b8Schs 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
348393f3d2b8Schs 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
348493f3d2b8Schs 
348593f3d2b8Schs 		if (msp->ms_loaded)
348693f3d2b8Schs 			range_tree_verify(msp->ms_tree, offset, size);
348793f3d2b8Schs 
348893f3d2b8Schs 		for (int j = 0; j < TXG_SIZE; j++)
348993f3d2b8Schs 			range_tree_verify(msp->ms_freetree[j], offset, size);
349093f3d2b8Schs 		for (int j = 0; j < TXG_DEFER_SIZE; j++)
349193f3d2b8Schs 			range_tree_verify(msp->ms_defertree[j], offset, size);
349293f3d2b8Schs 	}
349393f3d2b8Schs 	spa_config_exit(spa, SCL_VDEV, FTAG);
349493f3d2b8Schs }
3495