xref: /linux/fs/bcachefs/btree_cache.c (revision 5ae67abc)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bbpos.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_io.h"
8 #include "btree_iter.h"
9 #include "btree_locking.h"
10 #include "debug.h"
11 #include "errcode.h"
12 #include "error.h"
13 #include "journal.h"
14 #include "trace.h"
15 
16 #include <linux/prefetch.h>
17 #include <linux/sched/mm.h>
18 
19 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
20 do {						 \
21 	if (shrinker_counter)			 \
22 		bc->not_freed_##counter++;	 \
23 } while (0)
24 
25 const char * const bch2_btree_node_flags[] = {
26 #define x(f)	#f,
27 	BTREE_FLAGS()
28 #undef x
29 	NULL
30 };
31 
32 void bch2_recalc_btree_reserve(struct bch_fs *c)
33 {
34 	unsigned i, reserve = 16;
35 
36 	if (!c->btree_roots_known[0].b)
37 		reserve += 8;
38 
39 	for (i = 0; i < btree_id_nr_alive(c); i++) {
40 		struct btree_root *r = bch2_btree_id_root(c, i);
41 
42 		if (r->b)
43 			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
44 	}
45 
46 	c->btree_cache.reserve = reserve;
47 }
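/*
 * Worked example for the calculation above (illustrative numbers only): with
 * btree_roots_known[0].b set and three btree roots at level >= 1, the reserve
 * comes out to 16 + 3 * 8 = 40 nodes; if no roots are known yet, the extra
 * "+ 8" applies instead.
 */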
48 
49 static inline unsigned btree_cache_can_free(struct btree_cache *bc)
50 {
51 	return max_t(int, 0, bc->used - bc->reserve);
52 }
53 
54 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
55 {
56 	if (b->c.lock.readers)
57 		list_move(&b->list, &bc->freed_pcpu);
58 	else
59 		list_move(&b->list, &bc->freed_nonpcpu);
60 }
61 
62 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
63 {
64 	struct btree_cache *bc = &c->btree_cache;
65 
66 	EBUG_ON(btree_node_write_in_flight(b));
67 
68 	clear_btree_node_just_written(b);
69 
70 	kvfree(b->data);
71 	b->data = NULL;
72 #ifdef __KERNEL__
73 	kvfree(b->aux_data);
74 #else
75 	munmap(b->aux_data, btree_aux_data_bytes(b));
76 #endif
77 	b->aux_data = NULL;
78 
79 	bc->used--;
80 
81 	btree_node_to_freedlist(bc, b);
82 }
83 
84 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
85 				   const void *obj)
86 {
87 	const struct btree *b = obj;
88 	const u64 *v = arg->key;
89 
90 	return b->hash_val == *v ? 0 : 1;
91 }
92 
93 static const struct rhashtable_params bch_btree_cache_params = {
94 	.head_offset		= offsetof(struct btree, hash),
95 	.key_offset		= offsetof(struct btree, hash_val),
96 	.key_len		= sizeof(u64),
97 	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
98 	.automatic_shrinking	= true,
99 };
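/*
 * Note (descriptive only): with these parameters the hash table is keyed on
 * the node's 64-bit hash_val, so lookups pass a plain u64 to
 * rhashtable_lookup_fast() - see btree_cache_find() below.
 */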
100 
101 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
102 {
103 	BUG_ON(b->data || b->aux_data);
104 
105 	b->data = kvmalloc(btree_buf_bytes(b), gfp);
106 	if (!b->data)
107 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
108 #ifdef __KERNEL__
109 	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
110 #else
111 	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
112 			   PROT_READ|PROT_WRITE|PROT_EXEC,
113 			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
114 	if (b->aux_data == MAP_FAILED)
115 		b->aux_data = NULL;
116 #endif
117 	if (!b->aux_data) {
118 		kvfree(b->data);
119 		b->data = NULL;
120 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
121 	}
122 
123 	return 0;
124 }
125 
126 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
127 {
128 	struct btree *b;
129 
130 	b = kzalloc(sizeof(struct btree), gfp);
131 	if (!b)
132 		return NULL;
133 
134 	bkey_btree_ptr_init(&b->key);
135 	INIT_LIST_HEAD(&b->list);
136 	INIT_LIST_HEAD(&b->write_blocked);
137 	b->byte_order = ilog2(c->opts.btree_node_size);
138 	return b;
139 }
140 
141 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
142 {
143 	struct btree_cache *bc = &c->btree_cache;
144 	struct btree *b;
145 
146 	b = __btree_node_mem_alloc(c, GFP_KERNEL);
147 	if (!b)
148 		return NULL;
149 
150 	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
151 		kfree(b);
152 		return NULL;
153 	}
154 
155 	bch2_btree_lock_init(&b->c, 0);
156 
157 	bc->used++;
158 	list_add(&b->list, &bc->freeable);
159 	return b;
160 }
161 
162 /* Btree in memory cache - hash table */
163 
164 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
165 {
166 	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
167 
168 	BUG_ON(ret);
169 
170 	/* Cause future lookups for this node to fail: */
171 	b->hash_val = 0;
172 
173 	if (b->c.btree_id < BTREE_ID_NR)
174 		--bc->used_by_btree[b->c.btree_id];
175 }
176 
177 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
178 {
179 	BUG_ON(b->hash_val);
180 	b->hash_val = btree_ptr_hash_val(&b->key);
181 
182 	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
183 						bch_btree_cache_params);
184 	if (!ret && b->c.btree_id < BTREE_ID_NR)
185 		bc->used_by_btree[b->c.btree_id]++;
186 	return ret;
187 }
188 
189 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
190 				unsigned level, enum btree_id id)
191 {
192 	int ret;
193 
194 	b->c.level	= level;
195 	b->c.btree_id	= id;
196 
197 	mutex_lock(&bc->lock);
198 	ret = __bch2_btree_node_hash_insert(bc, b);
199 	if (!ret)
200 		list_add_tail(&b->list, &bc->live);
201 	mutex_unlock(&bc->lock);
202 
203 	return ret;
204 }
205 
206 void bch2_btree_node_update_key_early(struct btree_trans *trans,
207 				      enum btree_id btree, unsigned level,
208 				      struct bkey_s_c old, struct bkey_i *new)
209 {
210 	struct bch_fs *c = trans->c;
211 	struct btree *b;
212 	struct bkey_buf tmp;
213 	int ret;
214 
215 	bch2_bkey_buf_init(&tmp);
216 	bch2_bkey_buf_reassemble(&tmp, c, old);
217 
218 	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
219 	if (!IS_ERR_OR_NULL(b)) {
220 		mutex_lock(&c->btree_cache.lock);
221 
222 		bch2_btree_node_hash_remove(&c->btree_cache, b);
223 
224 		bkey_copy(&b->key, new);
225 		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
226 		BUG_ON(ret);
227 
228 		mutex_unlock(&c->btree_cache.lock);
229 		six_unlock_read(&b->c.lock);
230 	}
231 
232 	bch2_bkey_buf_exit(&tmp, c);
233 }
234 
235 __flatten
236 static inline struct btree *btree_cache_find(struct btree_cache *bc,
237 				     const struct bkey_i *k)
238 {
239 	u64 v = btree_ptr_hash_val(k);
240 
241 	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
242 }
243 
244 /*
245  * this version is for btree nodes that have already been freed (we're not
246  * reaping a real btree node)
247  */
248 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
249 {
250 	struct btree_cache *bc = &c->btree_cache;
251 	int ret = 0;
252 
253 	lockdep_assert_held(&bc->lock);
254 
255 	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
256 
257 	u64 mask = b->c.level
258 		? bc->pinned_nodes_interior_mask
259 		: bc->pinned_nodes_leaf_mask;
260 
261 	if ((mask & BIT_ULL(b->c.btree_id)) &&
262 	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
263 	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
264 		return -BCH_ERR_ENOMEM_btree_node_reclaim;
265 
266 wait_on_io:
267 	if (b->flags & ((1U << BTREE_NODE_dirty)|
268 			(1U << BTREE_NODE_read_in_flight)|
269 			(1U << BTREE_NODE_write_in_flight))) {
270 		if (!flush) {
271 			if (btree_node_dirty(b))
272 				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
273 			else if (btree_node_read_in_flight(b))
274 				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
275 			else if (btree_node_write_in_flight(b))
276 				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
277 			return -BCH_ERR_ENOMEM_btree_node_reclaim;
278 		}
279 
280 		/* XXX: waiting on IO with btree cache lock held */
281 		bch2_btree_node_wait_on_read(b);
282 		bch2_btree_node_wait_on_write(b);
283 	}
284 
285 	if (!six_trylock_intent(&b->c.lock)) {
286 		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
287 		return -BCH_ERR_ENOMEM_btree_node_reclaim;
288 	}
289 
290 	if (!six_trylock_write(&b->c.lock)) {
291 		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
292 		goto out_unlock_intent;
293 	}
294 
295 	/* recheck under lock */
296 	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
297 			(1U << BTREE_NODE_write_in_flight))) {
298 		if (!flush) {
299 			if (btree_node_read_in_flight(b))
300 				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
301 			else if (btree_node_write_in_flight(b))
302 				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
303 			goto out_unlock;
304 		}
305 		six_unlock_write(&b->c.lock);
306 		six_unlock_intent(&b->c.lock);
307 		goto wait_on_io;
308 	}
309 
310 	if (btree_node_noevict(b)) {
311 		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
312 		goto out_unlock;
313 	}
314 	if (btree_node_write_blocked(b)) {
315 		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
316 		goto out_unlock;
317 	}
318 	if (btree_node_will_make_reachable(b)) {
319 		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
320 		goto out_unlock;
321 	}
322 
323 	if (btree_node_dirty(b)) {
324 		if (!flush) {
325 			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
326 			goto out_unlock;
327 		}
328 		/*
329 		 * Using the underscore version because we don't want to compact
330 		 * bsets after the write, since this node is about to be evicted
331 		 * - unless btree verify mode is enabled, since it runs out of
332 		 * the post write cleanup:
333 		 */
334 		if (bch2_verify_btree_ondisk)
335 			bch2_btree_node_write(c, b, SIX_LOCK_intent,
336 					      BTREE_WRITE_cache_reclaim);
337 		else
338 			__bch2_btree_node_write(c, b,
339 						BTREE_WRITE_cache_reclaim);
340 
341 		six_unlock_write(&b->c.lock);
342 		six_unlock_intent(&b->c.lock);
343 		goto wait_on_io;
344 	}
345 out:
346 	if (b->hash_val && !ret)
347 		trace_and_count(c, btree_cache_reap, c, b);
348 	return ret;
349 out_unlock:
350 	six_unlock_write(&b->c.lock);
351 out_unlock_intent:
352 	six_unlock_intent(&b->c.lock);
353 	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
354 	goto out;
355 }
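/*
 * Summary of the checks above (descriptive only): pinned nodes are never
 * reclaimed; nodes with dirty/read/write-in-flight flags are skipped unless
 * @flush is set, in which case we wait on the IO; we then need both the intent
 * and write trylocks; noevict, write_blocked and will_make_reachable nodes are
 * skipped; dirty nodes are written out and retried when flushing.
 */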
356 
357 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
358 {
359 	return __btree_node_reclaim(c, b, false, shrinker_counter);
360 }
361 
362 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
363 {
364 	return __btree_node_reclaim(c, b, true, false);
365 }
366 
367 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
368 					   struct shrink_control *sc)
369 {
370 	struct bch_fs *c = shrink->private_data;
371 	struct btree_cache *bc = &c->btree_cache;
372 	struct btree *b, *t;
373 	unsigned long nr = sc->nr_to_scan;
374 	unsigned long can_free = 0;
375 	unsigned long freed = 0;
376 	unsigned long touched = 0;
377 	unsigned i, flags;
378 	unsigned long ret = SHRINK_STOP;
379 	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
380 		bc->used * 3 / 4;
381 
382 	if (bch2_btree_shrinker_disabled)
383 		return SHRINK_STOP;
384 
385 	mutex_lock(&bc->lock);
386 	flags = memalloc_nofs_save();
387 
388 	/*
389 	 * It's _really_ critical that we don't free too many btree nodes - we
390 	 * have to always leave ourselves a reserve. The reserve is how we
391 	 * guarantee that allocating memory for a new btree node can always
392 	 * succeed, so that inserting keys into the btree can always succeed and
393 	 * IO can always make forward progress:
394 	 */
395 	can_free = btree_cache_can_free(bc);
396 	nr = min_t(unsigned long, nr, can_free);
397 
398 	i = 0;
399 	list_for_each_entry_safe(b, t, &bc->freeable, list) {
400 		/*
401 		 * Leave a few nodes on the freeable list, so that a btree split
402 		 * won't have to hit the system allocator:
403 		 */
404 		if (++i <= 3)
405 			continue;
406 
407 		touched++;
408 
409 		if (touched >= nr)
410 			goto out;
411 
412 		if (!btree_node_reclaim(c, b, true)) {
413 			btree_node_data_free(c, b);
414 			six_unlock_write(&b->c.lock);
415 			six_unlock_intent(&b->c.lock);
416 			freed++;
417 			bc->freed++;
418 		}
419 	}
420 restart:
421 	list_for_each_entry_safe(b, t, &bc->live, list) {
422 		touched++;
423 
424 		if (btree_node_accessed(b)) {
425 			clear_btree_node_accessed(b);
426 			bc->not_freed_access_bit++;
427 		} else if (!btree_node_reclaim(c, b, true)) {
428 			freed++;
429 			btree_node_data_free(c, b);
430 			bc->freed++;
431 
432 			bch2_btree_node_hash_remove(bc, b);
433 			six_unlock_write(&b->c.lock);
434 			six_unlock_intent(&b->c.lock);
435 
436 			if (freed == nr)
437 				goto out_rotate;
438 		} else if (trigger_writes &&
439 			   btree_node_dirty(b) &&
440 			   !btree_node_will_make_reachable(b) &&
441 			   !btree_node_write_blocked(b) &&
442 			   six_trylock_read(&b->c.lock)) {
443 			list_move(&bc->live, &b->list);
444 			mutex_unlock(&bc->lock);
445 			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
446 			six_unlock_read(&b->c.lock);
447 			if (touched >= nr)
448 				goto out_nounlock;
449 			mutex_lock(&bc->lock);
450 			goto restart;
451 		}
452 
453 		if (touched >= nr)
454 			break;
455 	}
456 out_rotate:
457 	if (&t->list != &bc->live)
458 		list_move_tail(&bc->live, &t->list);
459 out:
460 	mutex_unlock(&bc->lock);
461 out_nounlock:
462 	ret = freed;
463 	memalloc_nofs_restore(flags);
464 	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
465 	return ret;
466 }
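/*
 * The scan above works in two passes (descriptive only): first over
 * bc->freeable, keeping a few nodes back so a btree split doesn't have to hit
 * the allocator, then over bc->live, where recently accessed nodes get a
 * second chance via the accessed bit and, when enough of the cache is dirty,
 * writes are kicked off so nodes become reclaimable on a later pass.
 */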
467 
468 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
469 					    struct shrink_control *sc)
470 {
471 	struct bch_fs *c = shrink->private_data;
472 	struct btree_cache *bc = &c->btree_cache;
473 
474 	if (bch2_btree_shrinker_disabled)
475 		return 0;
476 
477 	return btree_cache_can_free(bc);
478 }
479 
480 void bch2_fs_btree_cache_exit(struct bch_fs *c)
481 {
482 	struct btree_cache *bc = &c->btree_cache;
483 	struct btree *b;
484 	unsigned i, flags;
485 
486 	shrinker_free(bc->shrink);
487 
488 	/* vfree() can allocate memory: */
489 	flags = memalloc_nofs_save();
490 	mutex_lock(&bc->lock);
491 
492 	if (c->verify_data)
493 		list_move(&c->verify_data->list, &bc->live);
494 
495 	kvfree(c->verify_ondisk);
496 
497 	for (i = 0; i < btree_id_nr_alive(c); i++) {
498 		struct btree_root *r = bch2_btree_id_root(c, i);
499 
500 		if (r->b)
501 			list_add(&r->b->list, &bc->live);
502 	}
503 
504 	list_splice(&bc->freeable, &bc->live);
505 
506 	while (!list_empty(&bc->live)) {
507 		b = list_first_entry(&bc->live, struct btree, list);
508 
509 		BUG_ON(btree_node_read_in_flight(b) ||
510 		       btree_node_write_in_flight(b));
511 
512 		btree_node_data_free(c, b);
513 	}
514 
515 	BUG_ON(!bch2_journal_error(&c->journal) &&
516 	       atomic_read(&c->btree_cache.dirty));
517 
518 	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
519 
520 	while (!list_empty(&bc->freed_nonpcpu)) {
521 		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
522 		list_del(&b->list);
523 		six_lock_exit(&b->c.lock);
524 		kfree(b);
525 	}
526 
527 	mutex_unlock(&bc->lock);
528 	memalloc_nofs_restore(flags);
529 
530 	if (bc->table_init_done)
531 		rhashtable_destroy(&bc->table);
532 }
533 
534 int bch2_fs_btree_cache_init(struct bch_fs *c)
535 {
536 	struct btree_cache *bc = &c->btree_cache;
537 	struct shrinker *shrink;
538 	unsigned i;
539 	int ret = 0;
540 
541 	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
542 	if (ret)
543 		goto err;
544 
545 	bc->table_init_done = true;
546 
547 	bch2_recalc_btree_reserve(c);
548 
549 	for (i = 0; i < bc->reserve; i++)
550 		if (!__bch2_btree_node_mem_alloc(c))
551 			goto err;
552 
553 	list_splice_init(&bc->live, &bc->freeable);
554 
555 	mutex_init(&c->verify_lock);
556 
557 	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
558 	if (!shrink)
559 		goto err;
560 	bc->shrink = shrink;
561 	shrink->count_objects	= bch2_btree_cache_count;
562 	shrink->scan_objects	= bch2_btree_cache_scan;
563 	shrink->seeks		= 4;
564 	shrink->private_data	= c;
565 	shrinker_register(shrink);
566 
567 	return 0;
568 err:
569 	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
570 }
571 
572 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
573 {
574 	mutex_init(&bc->lock);
575 	INIT_LIST_HEAD(&bc->live);
576 	INIT_LIST_HEAD(&bc->freeable);
577 	INIT_LIST_HEAD(&bc->freed_pcpu);
578 	INIT_LIST_HEAD(&bc->freed_nonpcpu);
579 }
580 
581 /*
582  * We can only have one thread cannibalizing other cached btree nodes at a time,
583  * or we'll deadlock. We use an open coded mutex to ensure that, which
584  * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock the root of
585  * the btree, we need to release this lock if we have it held.
586  */
587 void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
588 {
589 	struct bch_fs *c = trans->c;
590 	struct btree_cache *bc = &c->btree_cache;
591 
592 	if (bc->alloc_lock == current) {
593 		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
594 		bc->alloc_lock = NULL;
595 		closure_wake_up(&bc->alloc_wait);
596 	}
597 }
598 
599 int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
600 {
601 	struct bch_fs *c = trans->c;
602 	struct btree_cache *bc = &c->btree_cache;
603 	struct task_struct *old;
604 
605 	old = cmpxchg(&bc->alloc_lock, NULL, current);
606 	if (old == NULL || old == current)
607 		goto success;
608 
609 	if (!cl) {
610 		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
611 		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
612 	}
613 
614 	closure_wait(&bc->alloc_wait, cl);
615 
616 	/* Try again, after adding ourselves to waitlist */
617 	old = cmpxchg(&bc->alloc_lock, NULL, current);
618 	if (old == NULL || old == current) {
619 		/* We raced */
620 		closure_wake_up(&bc->alloc_wait);
621 		goto success;
622 	}
623 
624 	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
625 	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
626 
627 success:
628 	trace_and_count(c, btree_cache_cannibalize_lock, trans);
629 	return 0;
630 }
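/*
 * Typical caller pattern, sketched from bch2_btree_node_get_noiter() below
 * (illustrative only, surrounding error handling trimmed):
 *
 *	if (IS_ERR(b) &&
 *	    !bch2_btree_cache_cannibalize_lock(trans, NULL))
 *		goto retry;
 *	...
 *	bch2_btree_cache_cannibalize_unlock(trans);
 */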
631 
632 static struct btree *btree_node_cannibalize(struct bch_fs *c)
633 {
634 	struct btree_cache *bc = &c->btree_cache;
635 	struct btree *b;
636 
637 	list_for_each_entry_reverse(b, &bc->live, list)
638 		if (!btree_node_reclaim(c, b, false))
639 			return b;
640 
641 	while (1) {
642 		list_for_each_entry_reverse(b, &bc->live, list)
643 			if (!btree_node_write_and_reclaim(c, b))
644 				return b;
645 
646 		/*
647 		 * Rare case: all nodes were intent-locked.
648 		 * Just busy-wait.
649 		 */
650 		WARN_ONCE(1, "btree cache cannibalize failed\n");
651 		cond_resched();
652 	}
653 }
654 
655 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
656 {
657 	struct bch_fs *c = trans->c;
658 	struct btree_cache *bc = &c->btree_cache;
659 	struct list_head *freed = pcpu_read_locks
660 		? &bc->freed_pcpu
661 		: &bc->freed_nonpcpu;
662 	struct btree *b, *b2;
663 	u64 start_time = local_clock();
664 	unsigned flags;
665 
666 	flags = memalloc_nofs_save();
667 	mutex_lock(&bc->lock);
668 
669 	/*
670 	 * We never free struct btree itself, just the memory that holds the on
671 	 * disk node. Check the freed list before allocating a new one:
672 	 */
673 	list_for_each_entry(b, freed, list)
674 		if (!btree_node_reclaim(c, b, false)) {
675 			list_del_init(&b->list);
676 			goto got_node;
677 		}
678 
679 	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
680 	if (!b) {
681 		mutex_unlock(&bc->lock);
682 		bch2_trans_unlock(trans);
683 		b = __btree_node_mem_alloc(c, GFP_KERNEL);
684 		if (!b)
685 			goto err;
686 		mutex_lock(&bc->lock);
687 	}
688 
689 	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
690 
691 	BUG_ON(!six_trylock_intent(&b->c.lock));
692 	BUG_ON(!six_trylock_write(&b->c.lock));
693 got_node:
694 
695 	/*
696 	 * btree_free() doesn't free memory; it sticks the node on the end of
697 	 * the list. Check if there's any freed nodes there:
698 	 */
699 	list_for_each_entry(b2, &bc->freeable, list)
700 		if (!btree_node_reclaim(c, b2, false)) {
701 			swap(b->data, b2->data);
702 			swap(b->aux_data, b2->aux_data);
703 			btree_node_to_freedlist(bc, b2);
704 			six_unlock_write(&b2->c.lock);
705 			six_unlock_intent(&b2->c.lock);
706 			goto got_mem;
707 		}
708 
709 	mutex_unlock(&bc->lock);
710 
711 	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
712 		bch2_trans_unlock(trans);
713 		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
714 			goto err;
715 	}
716 
717 	mutex_lock(&bc->lock);
718 	bc->used++;
719 got_mem:
720 	mutex_unlock(&bc->lock);
721 
722 	BUG_ON(btree_node_hashed(b));
723 	BUG_ON(btree_node_dirty(b));
724 	BUG_ON(btree_node_write_in_flight(b));
725 out:
726 	b->flags		= 0;
727 	b->written		= 0;
728 	b->nsets		= 0;
729 	b->sib_u64s[0]		= 0;
730 	b->sib_u64s[1]		= 0;
731 	b->whiteout_u64s	= 0;
732 	bch2_btree_keys_init(b);
733 	set_btree_node_accessed(b);
734 
735 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
736 			       start_time);
737 
738 	memalloc_nofs_restore(flags);
739 	return b;
740 err:
741 	mutex_lock(&bc->lock);
742 
743 	/* Try to cannibalize another cached btree node: */
744 	if (bc->alloc_lock == current) {
745 		b2 = btree_node_cannibalize(c);
746 		clear_btree_node_just_written(b2);
747 		bch2_btree_node_hash_remove(bc, b2);
748 
749 		if (b) {
750 			swap(b->data, b2->data);
751 			swap(b->aux_data, b2->aux_data);
752 			btree_node_to_freedlist(bc, b2);
753 			six_unlock_write(&b2->c.lock);
754 			six_unlock_intent(&b2->c.lock);
755 		} else {
756 			b = b2;
757 			list_del_init(&b->list);
758 		}
759 
760 		mutex_unlock(&bc->lock);
761 
762 		trace_and_count(c, btree_cache_cannibalize, trans);
763 		goto out;
764 	}
765 
766 	mutex_unlock(&bc->lock);
767 	memalloc_nofs_restore(flags);
768 	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
769 }
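/*
 * Allocation order in bch2_btree_node_mem_alloc() above (descriptive only):
 * reuse a struct btree from the freed list if one can be reclaimed, else
 * allocate a new one (GFP_NOWAIT first, then GFP_KERNEL with the transaction
 * unlocked); take data buffers from a reclaimable node on the freeable list if
 * possible, else allocate them; on failure, cannibalize another cached node if
 * we hold bc->alloc_lock.
 */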
770 
771 /* Slowpath, don't want it inlined into btree_iter_traverse() */
772 static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
773 				struct btree_path *path,
774 				const struct bkey_i *k,
775 				enum btree_id btree_id,
776 				unsigned level,
777 				enum six_lock_type lock_type,
778 				bool sync)
779 {
780 	struct bch_fs *c = trans->c;
781 	struct btree_cache *bc = &c->btree_cache;
782 	struct btree *b;
783 
784 	if (unlikely(level >= BTREE_MAX_DEPTH)) {
785 		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
786 						 level, BTREE_MAX_DEPTH);
787 		return ERR_PTR(ret);
788 	}
789 
790 	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
791 		struct printbuf buf = PRINTBUF;
792 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
793 
794 		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
795 		printbuf_exit(&buf);
796 		return ERR_PTR(ret);
797 	}
798 
799 	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
800 		struct printbuf buf = PRINTBUF;
801 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
802 
803 		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
804 		printbuf_exit(&buf);
805 		return ERR_PTR(ret);
806 	}
807 
808 	/*
809 	 * Parent node must be locked, else we could read in a btree node that's
810 	 * been freed:
811 	 */
812 	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
813 		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
814 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
815 	}
816 
817 	b = bch2_btree_node_mem_alloc(trans, level != 0);
818 
819 	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
820 		if (!path)
821 			return b;
822 
823 		trans->memory_allocation_failure = true;
824 		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
825 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
826 	}
827 
828 	if (IS_ERR(b))
829 		return b;
830 
831 	bkey_copy(&b->key, k);
832 	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
833 		/* raced with another fill: */
834 
835 		/* mark as unhashed... */
836 		b->hash_val = 0;
837 
838 		mutex_lock(&bc->lock);
839 		list_add(&b->list, &bc->freeable);
840 		mutex_unlock(&bc->lock);
841 
842 		six_unlock_write(&b->c.lock);
843 		six_unlock_intent(&b->c.lock);
844 		return NULL;
845 	}
846 
847 	set_btree_node_read_in_flight(b);
848 	six_unlock_write(&b->c.lock);
849 
850 	if (path) {
851 		u32 seq = six_lock_seq(&b->c.lock);
852 
853 		/* Unlock before doing IO: */
854 		six_unlock_intent(&b->c.lock);
855 		bch2_trans_unlock_noassert(trans);
856 
857 		bch2_btree_node_read(trans, b, sync);
858 
859 		if (!sync)
860 			return NULL;
861 
862 		if (!six_relock_type(&b->c.lock, lock_type, seq))
863 			b = NULL;
864 	} else {
865 		bch2_btree_node_read(trans, b, sync);
866 		if (lock_type == SIX_LOCK_read)
867 			six_lock_downgrade(&b->c.lock);
868 	}
869 
870 	return b;
871 }
872 
873 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
874 {
875 	struct printbuf buf = PRINTBUF;
876 
877 	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
878 		return;
879 
880 	prt_printf(&buf,
881 	       "btree node header doesn't match ptr\n"
882 	       "btree %s level %u\n"
883 	       "ptr: ",
884 	       bch2_btree_id_str(b->c.btree_id), b->c.level);
885 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
886 
887 	prt_printf(&buf, "\nheader: btree %s level %llu\n"
888 	       "min ",
889 	       bch2_btree_id_str(BTREE_NODE_ID(b->data)),
890 	       BTREE_NODE_LEVEL(b->data));
891 	bch2_bpos_to_text(&buf, b->data->min_key);
892 
893 	prt_printf(&buf, "\nmax ");
894 	bch2_bpos_to_text(&buf, b->data->max_key);
895 
896 	bch2_fs_topology_error(c, "%s", buf.buf);
897 
898 	printbuf_exit(&buf);
899 }
900 
901 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
902 {
903 	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
904 	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
905 	    !bpos_eq(b->data->max_key, b->key.k.p) ||
906 	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
907 	     !bpos_eq(b->data->min_key,
908 		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
909 		btree_bad_header(c, b);
910 }
911 
912 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
913 					   const struct bkey_i *k, unsigned level,
914 					   enum six_lock_type lock_type,
915 					   unsigned long trace_ip)
916 {
917 	struct bch_fs *c = trans->c;
918 	struct btree_cache *bc = &c->btree_cache;
919 	struct btree *b;
920 	bool need_relock = false;
921 	int ret;
922 
923 	EBUG_ON(level >= BTREE_MAX_DEPTH);
924 retry:
925 	b = btree_cache_find(bc, k);
926 	if (unlikely(!b)) {
927 		/*
928 		 * We must have the parent locked to call bch2_btree_node_fill(),
929 		 * else we could read in a btree node from disk that's been
930 		 * freed:
931 		 */
932 		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
933 					 level, lock_type, true);
934 		need_relock = true;
935 
936 		/* We raced and found the btree node in the cache */
937 		if (!b)
938 			goto retry;
939 
940 		if (IS_ERR(b))
941 			return b;
942 	} else {
943 		if (btree_node_read_locked(path, level + 1))
944 			btree_node_unlock(trans, path, level + 1);
945 
946 		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
947 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
948 			return ERR_PTR(ret);
949 
950 		BUG_ON(ret);
951 
952 		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
953 			     b->c.level != level ||
954 			     race_fault())) {
955 			six_unlock_type(&b->c.lock, lock_type);
956 			if (bch2_btree_node_relock(trans, path, level + 1))
957 				goto retry;
958 
959 			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
960 			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
961 		}
962 
963 		/* avoid atomic set bit if it's not needed: */
964 		if (!btree_node_accessed(b))
965 			set_btree_node_accessed(b);
966 	}
967 
968 	if (unlikely(btree_node_read_in_flight(b))) {
969 		u32 seq = six_lock_seq(&b->c.lock);
970 
971 		six_unlock_type(&b->c.lock, lock_type);
972 		bch2_trans_unlock(trans);
973 		need_relock = true;
974 
975 		bch2_btree_node_wait_on_read(b);
976 
977 		/*
978 		 * should_be_locked is not set on this path yet, so we need to
979 		 * relock it specifically:
980 		 */
981 		if (!six_relock_type(&b->c.lock, lock_type, seq))
982 			goto retry;
983 	}
984 
985 	if (unlikely(need_relock)) {
986 		ret = bch2_trans_relock(trans) ?:
987 			bch2_btree_path_relock_intent(trans, path);
988 		if (ret) {
989 			six_unlock_type(&b->c.lock, lock_type);
990 			return ERR_PTR(ret);
991 		}
992 	}
993 
994 	prefetch(b->aux_data);
995 
996 	for_each_bset(b, t) {
997 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
998 
999 		prefetch(p + L1_CACHE_BYTES * 0);
1000 		prefetch(p + L1_CACHE_BYTES * 1);
1001 		prefetch(p + L1_CACHE_BYTES * 2);
1002 	}
1003 
1004 	if (unlikely(btree_node_read_error(b))) {
1005 		six_unlock_type(&b->c.lock, lock_type);
1006 		return ERR_PTR(-BCH_ERR_btree_node_read_error);
1007 	}
1008 
1009 	EBUG_ON(b->c.btree_id != path->btree_id);
1010 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1011 	btree_check_header(c, b);
1012 
1013 	return b;
1014 }
1015 
1016 /**
1017  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
1018  * in from disk if necessary.
1019  *
1020  * @trans:	btree transaction object
1021  * @path:	btree_path being traversed
1022  * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
1023  * @level:	level of btree node being looked up (0 == leaf node)
1024  * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
1025  * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
1026  *
1027  * The btree node will have either a read or an intent lock held, depending
1028  * on @lock_type.
1029  *
1030  * Returns: btree node or ERR_PTR()
1031  */
1032 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
1033 				  const struct bkey_i *k, unsigned level,
1034 				  enum six_lock_type lock_type,
1035 				  unsigned long trace_ip)
1036 {
1037 	struct bch_fs *c = trans->c;
1038 	struct btree *b;
1039 	int ret;
1040 
1041 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1042 
1043 	b = btree_node_mem_ptr(k);
1044 
1045 	/*
1046 	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
1047 	 * be the node we want anymore, and trying to lock the wrong node could
1048 	 * cause an unnecessary transaction restart:
1049 	 */
1050 	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
1051 		     !b ||
1052 		     b->hash_val != btree_ptr_hash_val(k)))
1053 		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1054 
1055 	if (btree_node_read_locked(path, level + 1))
1056 		btree_node_unlock(trans, path, level + 1);
1057 
1058 	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
1059 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1060 		return ERR_PTR(ret);
1061 
1062 	BUG_ON(ret);
1063 
1064 	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1065 		     b->c.level != level ||
1066 		     race_fault())) {
1067 		six_unlock_type(&b->c.lock, lock_type);
1068 		if (bch2_btree_node_relock(trans, path, level + 1))
1069 			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1070 
1071 		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1072 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1073 	}
1074 
1075 	if (unlikely(btree_node_read_in_flight(b))) {
1076 		six_unlock_type(&b->c.lock, lock_type);
1077 		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1078 	}
1079 
1080 	prefetch(b->aux_data);
1081 
1082 	for_each_bset(b, t) {
1083 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1084 
1085 		prefetch(p + L1_CACHE_BYTES * 0);
1086 		prefetch(p + L1_CACHE_BYTES * 1);
1087 		prefetch(p + L1_CACHE_BYTES * 2);
1088 	}
1089 
1090 	/* avoid atomic set bit if it's not needed: */
1091 	if (!btree_node_accessed(b))
1092 		set_btree_node_accessed(b);
1093 
1094 	if (unlikely(btree_node_read_error(b))) {
1095 		six_unlock_type(&b->c.lock, lock_type);
1096 		return ERR_PTR(-BCH_ERR_btree_node_read_error);
1097 	}
1098 
1099 	EBUG_ON(b->c.btree_id != path->btree_id);
1100 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1101 	btree_check_header(c, b);
1102 
1103 	return b;
1104 }
1105 
1106 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1107 					 const struct bkey_i *k,
1108 					 enum btree_id btree_id,
1109 					 unsigned level,
1110 					 bool nofill)
1111 {
1112 	struct bch_fs *c = trans->c;
1113 	struct btree_cache *bc = &c->btree_cache;
1114 	struct btree *b;
1115 	int ret;
1116 
1117 	EBUG_ON(level >= BTREE_MAX_DEPTH);
1118 
1119 	if (c->opts.btree_node_mem_ptr_optimization) {
1120 		b = btree_node_mem_ptr(k);
1121 		if (b)
1122 			goto lock_node;
1123 	}
1124 retry:
1125 	b = btree_cache_find(bc, k);
1126 	if (unlikely(!b)) {
1127 		if (nofill)
1128 			goto out;
1129 
1130 		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
1131 					 level, SIX_LOCK_read, true);
1132 
1133 		/* We raced and found the btree node in the cache */
1134 		if (!b)
1135 			goto retry;
1136 
1137 		if (IS_ERR(b) &&
1138 		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
1139 			goto retry;
1140 
1141 		if (IS_ERR(b))
1142 			goto out;
1143 	} else {
1144 lock_node:
1145 		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
1146 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1147 			return ERR_PTR(ret);
1148 
1149 		BUG_ON(ret);
1150 
1151 		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1152 			     b->c.btree_id != btree_id ||
1153 			     b->c.level != level)) {
1154 			six_unlock_read(&b->c.lock);
1155 			goto retry;
1156 		}
1157 	}
1158 
1159 	/* XXX: waiting on IO with btree locks held: */
1160 	__bch2_btree_node_wait_on_read(b);
1161 
1162 	prefetch(b->aux_data);
1163 
1164 	for_each_bset(b, t) {
1165 		void *p = (u64 *) b->aux_data + t->aux_data_offset;
1166 
1167 		prefetch(p + L1_CACHE_BYTES * 0);
1168 		prefetch(p + L1_CACHE_BYTES * 1);
1169 		prefetch(p + L1_CACHE_BYTES * 2);
1170 	}
1171 
1172 	/* avoid atomic set bit if it's not needed: */
1173 	if (!btree_node_accessed(b))
1174 		set_btree_node_accessed(b);
1175 
1176 	if (unlikely(btree_node_read_error(b))) {
1177 		six_unlock_read(&b->c.lock);
1178 		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
1179 		goto out;
1180 	}
1181 
1182 	EBUG_ON(b->c.btree_id != btree_id);
1183 	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1184 	btree_check_header(c, b);
1185 out:
1186 	bch2_btree_cache_cannibalize_unlock(trans);
1187 	return b;
1188 }
1189 
1190 int bch2_btree_node_prefetch(struct btree_trans *trans,
1191 			     struct btree_path *path,
1192 			     const struct bkey_i *k,
1193 			     enum btree_id btree_id, unsigned level)
1194 {
1195 	struct bch_fs *c = trans->c;
1196 	struct btree_cache *bc = &c->btree_cache;
1197 
1198 	BUG_ON(path && !btree_node_locked(path, level + 1));
1199 	BUG_ON(level >= BTREE_MAX_DEPTH);
1200 
1201 	struct btree *b = btree_cache_find(bc, k);
1202 	if (b)
1203 		return 0;
1204 
1205 	b = bch2_btree_node_fill(trans, path, k, btree_id,
1206 				 level, SIX_LOCK_read, false);
1207 	if (!IS_ERR_OR_NULL(b))
1208 		six_unlock_read(&b->c.lock);
1209 	return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
1210 }
1211 
1212 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1213 {
1214 	struct bch_fs *c = trans->c;
1215 	struct btree_cache *bc = &c->btree_cache;
1216 	struct btree *b;
1217 
1218 	b = btree_cache_find(bc, k);
1219 	if (!b)
1220 		return;
1221 
1222 	BUG_ON(b == btree_node_root(trans->c, b));
1223 wait_on_io:
1224 	/* not allowed to wait on io with btree locks held: */
1225 
1226 	/* XXX we're called from btree_gc which will be holding other btree
1227 	 * nodes locked
1228 	 */
1229 	__bch2_btree_node_wait_on_read(b);
1230 	__bch2_btree_node_wait_on_write(b);
1231 
1232 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1233 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1234 	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
1235 		goto out;
1236 
1237 	if (btree_node_dirty(b)) {
1238 		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1239 		six_unlock_write(&b->c.lock);
1240 		six_unlock_intent(&b->c.lock);
1241 		goto wait_on_io;
1242 	}
1243 
1244 	BUG_ON(btree_node_dirty(b));
1245 
1246 	mutex_lock(&bc->lock);
1247 	btree_node_data_free(c, b);
1248 	bch2_btree_node_hash_remove(bc, b);
1249 	mutex_unlock(&bc->lock);
1250 out:
1251 	six_unlock_write(&b->c.lock);
1252 	six_unlock_intent(&b->c.lock);
1253 }
1254 
1255 const char *bch2_btree_id_str(enum btree_id btree)
1256 {
1257 	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
1258 }
1259 
1260 void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
1261 {
1262 	prt_printf(out, "%s level %u/%u\n  ",
1263 	       bch2_btree_id_str(b->c.btree_id),
1264 	       b->c.level,
1265 	       bch2_btree_id_root(c, b->c.btree_id)->level);
1266 	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1267 }
1268 
1269 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
1270 {
1271 	struct bset_stats stats;
1272 
1273 	memset(&stats, 0, sizeof(stats));
1274 
1275 	bch2_btree_keys_stats(b, &stats);
1276 
1277 	prt_printf(out, "l %u ", b->c.level);
1278 	bch2_bpos_to_text(out, b->data->min_key);
1279 	prt_printf(out, " - ");
1280 	bch2_bpos_to_text(out, b->data->max_key);
1281 	prt_printf(out, ":\n"
1282 	       "    ptrs: ");
1283 	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1284 	prt_newline(out);
1285 
1286 	prt_printf(out,
1287 	       "    format: ");
1288 	bch2_bkey_format_to_text(out, &b->format);
1289 
1290 	prt_printf(out,
1291 	       "    unpack fn len: %u\n"
1292 	       "    bytes used %zu/%zu (%zu%% full)\n"
1293 	       "    sib u64s: %u, %u (merge threshold %u)\n"
1294 	       "    nr packed keys %u\n"
1295 	       "    nr unpacked keys %u\n"
1296 	       "    floats %zu\n"
1297 	       "    failed unpacked %zu\n",
1298 	       b->unpack_fn_len,
1299 	       b->nr.live_u64s * sizeof(u64),
1300 	       btree_buf_bytes(b) - sizeof(struct btree_node),
1301 	       b->nr.live_u64s * 100 / btree_max_u64s(c),
1302 	       b->sib_u64s[0],
1303 	       b->sib_u64s[1],
1304 	       c->btree_foreground_merge_threshold,
1305 	       b->nr.packed_keys,
1306 	       b->nr.unpacked_keys,
1307 	       stats.floats,
1308 	       stats.failed);
1309 }
1310 
1311 static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
1312 				 const char *label, unsigned nr)
1313 {
1314 	prt_printf(out, "%s\t", label);
1315 	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
1316 	prt_printf(out, " (%u)\n", nr);
1317 }
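/*
 * Each line printed by the helper above is "<label>\t<nr * btree_node_size,
 * human readable> (<nr>)". Hypothetical example, assuming a 256k
 * btree_node_size and nr == 4 (exact formatting approximate):
 *
 *	total:	1.0M (4)
 */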
1318 
1319 void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
1320 {
1321 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
1322 
1323 	if (!out->nr_tabstops)
1324 		printbuf_tabstop_push(out, 32);
1325 
1326 	prt_btree_cache_line(out, c, "total:",		bc->used);
1327 	prt_btree_cache_line(out, c, "nr dirty:",	atomic_read(&bc->dirty));
1328 	prt_printf(out, "cannibalize lock:\t%p\n",	bc->alloc_lock);
1329 	prt_newline(out);
1330 
1331 	for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
1332 		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
1333 
1334 	prt_newline(out);
1335 	prt_printf(out, "freed:\t%u\n", bc->freed);
1336 	prt_printf(out, "not freed:\n");
1337 	prt_printf(out, "  dirty\t%u\n", bc->not_freed_dirty);
1338 	prt_printf(out, "  write in flight\t%u\n", bc->not_freed_write_in_flight);
1339 	prt_printf(out, "  read in flight\t%u\n", bc->not_freed_read_in_flight);
1340 	prt_printf(out, "  lock intent failed\t%u\n", bc->not_freed_lock_intent);
1341 	prt_printf(out, "  lock write failed\t%u\n", bc->not_freed_lock_write);
1342 	prt_printf(out, "  access bit\t%u\n", bc->not_freed_access_bit);
1343 	prt_printf(out, "  no evict failed\t%u\n", bc->not_freed_noevict);
1344 	prt_printf(out, "  write blocked\t%u\n", bc->not_freed_write_blocked);
1345 	prt_printf(out, "  will make reachable\t%u\n", bc->not_freed_will_make_reachable);
1346 }
1347