1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/dmu.h>
32 #include <sys/dnode.h>
33 #include <sys/zio.h>
34 #include <sys/range_tree.h>
35 
36 kmem_cache_t *range_seg_cache;
37 
38 void
range_tree_init(void)39 range_tree_init(void)
40 {
41 	ASSERT(range_seg_cache == NULL);
42 	range_seg_cache = kmem_cache_create("range_seg_cache",
43 	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
44 }
45 
46 void
range_tree_fini(void)47 range_tree_fini(void)
48 {
49 	kmem_cache_destroy(range_seg_cache);
50 	range_seg_cache = NULL;
51 }
52 
53 void
range_tree_stat_verify(range_tree_t * rt)54 range_tree_stat_verify(range_tree_t *rt)
55 {
56 	range_seg_t *rs;
57 	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
58 	int i;
59 
60 	for (rs = avl_first(&rt->rt_root); rs != NULL;
61 	    rs = AVL_NEXT(&rt->rt_root, rs)) {
62 		uint64_t size = rs->rs_end - rs->rs_start;
63 		int idx	= highbit64(size) - 1;
64 
65 		hist[idx]++;
66 		ASSERT3U(hist[idx], !=, 0);
67 	}
68 
69 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
70 		if (hist[i] != rt->rt_histogram[i]) {
71 			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
72 			    i, hist, hist[i], rt->rt_histogram[i]);
73 		}
74 		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
75 	}
76 }
77 
78 static void
range_tree_stat_incr(range_tree_t * rt,range_seg_t * rs)79 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
80 {
81 	uint64_t size = rs->rs_end - rs->rs_start;
82 	int idx = highbit64(size) - 1;
83 
84 	ASSERT(size != 0);
85 	ASSERT3U(idx, <,
86 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
87 
88 	ASSERT(MUTEX_HELD(rt->rt_lock));
89 	rt->rt_histogram[idx]++;
90 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
91 }
92 
93 static void
range_tree_stat_decr(range_tree_t * rt,range_seg_t * rs)94 range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
95 {
96 	uint64_t size = rs->rs_end - rs->rs_start;
97 	int idx = highbit64(size) - 1;
98 
99 	ASSERT(size != 0);
100 	ASSERT3U(idx, <,
101 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
102 
103 	ASSERT(MUTEX_HELD(rt->rt_lock));
104 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
105 	rt->rt_histogram[idx]--;
106 }
107 
108 /*
109  * NOTE: caller is responsible for all locking.
110  */
111 static int
range_tree_seg_compare(const void * x1,const void * x2)112 range_tree_seg_compare(const void *x1, const void *x2)
113 {
114 	const range_seg_t *r1 = x1;
115 	const range_seg_t *r2 = x2;
116 
117 	if (r1->rs_start < r2->rs_start) {
118 		if (r1->rs_end > r2->rs_start)
119 			return (0);
120 		return (-1);
121 	}
122 	if (r1->rs_start > r2->rs_start) {
123 		if (r1->rs_start < r2->rs_end)
124 			return (0);
125 		return (1);
126 	}
127 	return (0);
128 }
129 
130 range_tree_t *
range_tree_create(range_tree_ops_t * ops,void * arg,kmutex_t * lp)131 range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
132 {
133 	range_tree_t *rt;
134 
135 	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
136 
137 	avl_create(&rt->rt_root, range_tree_seg_compare,
138 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
139 
140 	rt->rt_lock = lp;
141 	rt->rt_ops = ops;
142 	rt->rt_arg = arg;
143 
144 	if (rt->rt_ops != NULL)
145 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
146 
147 	return (rt);
148 }
149 
150 void
range_tree_destroy(range_tree_t * rt)151 range_tree_destroy(range_tree_t *rt)
152 {
153 	VERIFY0(rt->rt_space);
154 
155 	if (rt->rt_ops != NULL)
156 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
157 
158 	avl_destroy(&rt->rt_root);
159 	kmem_free(rt, sizeof (*rt));
160 }
161 
162 void
range_tree_add(void * arg,uint64_t start,uint64_t size)163 range_tree_add(void *arg, uint64_t start, uint64_t size)
164 {
165 	range_tree_t *rt = arg;
166 	avl_index_t where;
167 	range_seg_t rsearch, *rs_before, *rs_after, *rs;
168 	uint64_t end = start + size;
169 	boolean_t merge_before, merge_after;
170 
171 	ASSERT(MUTEX_HELD(rt->rt_lock));
172 	VERIFY(size != 0);
173 
174 	rsearch.rs_start = start;
175 	rsearch.rs_end = end;
176 	rs = avl_find(&rt->rt_root, &rsearch, &where);
177 
178 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
179 		zfs_panic_recover("zfs: allocating allocated segment"
180 		    "(offset=%llu size=%llu)\n",
181 		    (longlong_t)start, (longlong_t)size);
182 		return;
183 	}
184 
185 	/* Make sure we don't overlap with either of our neighbors */
186 	VERIFY(rs == NULL);
187 
188 	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
189 	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
190 
191 	merge_before = (rs_before != NULL && rs_before->rs_end == start);
192 	merge_after = (rs_after != NULL && rs_after->rs_start == end);
193 
194 	if (merge_before && merge_after) {
195 		avl_remove(&rt->rt_root, rs_before);
196 		if (rt->rt_ops != NULL) {
197 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
198 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
199 		}
200 
201 		range_tree_stat_decr(rt, rs_before);
202 		range_tree_stat_decr(rt, rs_after);
203 
204 		rs_after->rs_start = rs_before->rs_start;
205 		kmem_cache_free(range_seg_cache, rs_before);
206 		rs = rs_after;
207 	} else if (merge_before) {
208 		if (rt->rt_ops != NULL)
209 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
210 
211 		range_tree_stat_decr(rt, rs_before);
212 
213 		rs_before->rs_end = end;
214 		rs = rs_before;
215 	} else if (merge_after) {
216 		if (rt->rt_ops != NULL)
217 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
218 
219 		range_tree_stat_decr(rt, rs_after);
220 
221 		rs_after->rs_start = start;
222 		rs = rs_after;
223 	} else {
224 		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
225 		rs->rs_start = start;
226 		rs->rs_end = end;
227 		avl_insert(&rt->rt_root, rs, where);
228 	}
229 
230 	if (rt->rt_ops != NULL)
231 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
232 
233 	range_tree_stat_incr(rt, rs);
234 	rt->rt_space += size;
235 }
236 
237 void
range_tree_remove(void * arg,uint64_t start,uint64_t size)238 range_tree_remove(void *arg, uint64_t start, uint64_t size)
239 {
240 	range_tree_t *rt = arg;
241 	avl_index_t where;
242 	range_seg_t rsearch, *rs, *newseg;
243 	uint64_t end = start + size;
244 	boolean_t left_over, right_over;
245 
246 	ASSERT(MUTEX_HELD(rt->rt_lock));
247 	VERIFY3U(size, !=, 0);
248 	VERIFY3U(size, <=, rt->rt_space);
249 
250 	rsearch.rs_start = start;
251 	rsearch.rs_end = end;
252 	rs = avl_find(&rt->rt_root, &rsearch, &where);
253 
254 	/* Make sure we completely overlap with someone */
255 	if (rs == NULL) {
256 		zfs_panic_recover("zfs: freeing free segment "
257 		    "(offset=%llu size=%llu)",
258 		    (longlong_t)start, (longlong_t)size);
259 		return;
260 	}
261 	VERIFY3U(rs->rs_start, <=, start);
262 	VERIFY3U(rs->rs_end, >=, end);
263 
264 	left_over = (rs->rs_start != start);
265 	right_over = (rs->rs_end != end);
266 
267 	range_tree_stat_decr(rt, rs);
268 
269 	if (rt->rt_ops != NULL)
270 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
271 
272 	if (left_over && right_over) {
273 		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
274 		newseg->rs_start = end;
275 		newseg->rs_end = rs->rs_end;
276 		range_tree_stat_incr(rt, newseg);
277 
278 		rs->rs_end = start;
279 
280 		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
281 		if (rt->rt_ops != NULL)
282 			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
283 	} else if (left_over) {
284 		rs->rs_end = start;
285 	} else if (right_over) {
286 		rs->rs_start = end;
287 	} else {
288 		avl_remove(&rt->rt_root, rs);
289 		kmem_cache_free(range_seg_cache, rs);
290 		rs = NULL;
291 	}
292 
293 	if (rs != NULL) {
294 		range_tree_stat_incr(rt, rs);
295 
296 		if (rt->rt_ops != NULL)
297 			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
298 	}
299 
300 	rt->rt_space -= size;
301 }
302 
303 static range_seg_t *
range_tree_find_impl(range_tree_t * rt,uint64_t start,uint64_t size)304 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
305 {
306 	avl_index_t where;
307 	range_seg_t rsearch;
308 	uint64_t end = start + size;
309 
310 	ASSERT(MUTEX_HELD(rt->rt_lock));
311 	VERIFY(size != 0);
312 
313 	rsearch.rs_start = start;
314 	rsearch.rs_end = end;
315 	return (avl_find(&rt->rt_root, &rsearch, &where));
316 }
317 
318 static range_seg_t *
range_tree_find(range_tree_t * rt,uint64_t start,uint64_t size)319 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
320 {
321 	range_seg_t *rs = range_tree_find_impl(rt, start, size);
322 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
323 		return (rs);
324 	return (NULL);
325 }
326 
327 void
range_tree_verify(range_tree_t * rt,uint64_t off,uint64_t size)328 range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
329 {
330 	range_seg_t *rs;
331 
332 	mutex_enter(rt->rt_lock);
333 	rs = range_tree_find(rt, off, size);
334 	if (rs != NULL)
335 		panic("freeing free block; rs=%p", (void *)rs);
336 	mutex_exit(rt->rt_lock);
337 }
338 
339 boolean_t
range_tree_contains(range_tree_t * rt,uint64_t start,uint64_t size)340 range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
341 {
342 	return (range_tree_find(rt, start, size) != NULL);
343 }
344 
345 /*
346  * Ensure that this range is not in the tree, regardless of whether
347  * it is currently in the tree.
348  */
349 void
range_tree_clear(range_tree_t * rt,uint64_t start,uint64_t size)350 range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
351 {
352 	range_seg_t *rs;
353 
354 	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
355 		uint64_t free_start = MAX(rs->rs_start, start);
356 		uint64_t free_end = MIN(rs->rs_end, start + size);
357 		range_tree_remove(rt, free_start, free_end - free_start);
358 	}
359 }
360 
361 void
range_tree_swap(range_tree_t ** rtsrc,range_tree_t ** rtdst)362 range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
363 {
364 	range_tree_t *rt;
365 
366 	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
367 	ASSERT0(range_tree_space(*rtdst));
368 	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
369 
370 	rt = *rtsrc;
371 	*rtsrc = *rtdst;
372 	*rtdst = rt;
373 }
374 
375 void
range_tree_vacate(range_tree_t * rt,range_tree_func_t * func,void * arg)376 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
377 {
378 	range_seg_t *rs;
379 	void *cookie = NULL;
380 
381 	ASSERT(MUTEX_HELD(rt->rt_lock));
382 
383 	if (rt->rt_ops != NULL)
384 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
385 
386 	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
387 		if (func != NULL)
388 			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
389 		kmem_cache_free(range_seg_cache, rs);
390 	}
391 
392 	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
393 	rt->rt_space = 0;
394 }
395 
396 void
range_tree_walk(range_tree_t * rt,range_tree_func_t * func,void * arg)397 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
398 {
399 	range_seg_t *rs;
400 
401 	ASSERT(MUTEX_HELD(rt->rt_lock));
402 
403 	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
404 		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
405 }
406 
407 uint64_t
range_tree_space(range_tree_t * rt)408 range_tree_space(range_tree_t *rt)
409 {
410 	return (rt->rt_space);
411 }
412