xref: /linux/fs/bcachefs/alloc_background.c (revision d642ef71)
1 // SPDX-License-Identifier: GPL-2.0
2 #include "bcachefs.h"
3 #include "alloc_background.h"
4 #include "alloc_foreground.h"
5 #include "backpointers.h"
6 #include "btree_cache.h"
7 #include "btree_io.h"
8 #include "btree_key_cache.h"
9 #include "btree_update.h"
10 #include "btree_update_interior.h"
11 #include "btree_gc.h"
12 #include "btree_write_buffer.h"
13 #include "buckets.h"
14 #include "buckets_waiting_for_journal.h"
15 #include "clock.h"
16 #include "debug.h"
17 #include "ec.h"
18 #include "error.h"
19 #include "lru.h"
20 #include "recovery.h"
21 #include "trace.h"
22 #include "varint.h"
23 
24 #include <linux/kthread.h>
25 #include <linux/math64.h>
26 #include <linux/random.h>
27 #include <linux/rculist.h>
28 #include <linux/rcupdate.h>
29 #include <linux/sched/task.h>
30 #include <linux/sort.h>
31 
32 /* Persistent alloc info: */
33 
34 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
35 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
36 	BCH_ALLOC_FIELDS_V1()
37 #undef x
38 };
39 
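/*
 * In-memory form used when decoding the older on-disk alloc key versions
 * (KEY_TYPE_alloc, _v2, _v3); bch2_alloc_unpack() below fills it in and
 * __bch2_alloc_to_v4() converts it to the current bch_alloc_v4 layout.
 */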
40 struct bkey_alloc_unpacked {
41 	u64		journal_seq;
42 	u8		gen;
43 	u8		oldest_gen;
44 	u8		data_type;
45 	bool		need_discard:1;
46 	bool		need_inc_gen:1;
47 #define x(_name, _bits)	u##_bits _name;
48 	BCH_ALLOC_FIELDS_V2()
49 #undef  x
50 };
51 
52 static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
53 				     const void **p, unsigned field)
54 {
55 	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
56 	u64 v;
57 
58 	if (!(a->fields & (1 << field)))
59 		return 0;
60 
61 	switch (bytes) {
62 	case 1:
63 		v = *((const u8 *) *p);
64 		break;
65 	case 2:
66 		v = le16_to_cpup(*p);
67 		break;
68 	case 4:
69 		v = le32_to_cpup(*p);
70 		break;
71 	case 8:
72 		v = le64_to_cpup(*p);
73 		break;
74 	default:
75 		BUG();
76 	}
77 
78 	*p += bytes;
79 	return v;
80 }
81 
82 static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
83 				 struct bkey_s_c k)
84 {
85 	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
86 	const void *d = in->data;
87 	unsigned idx = 0;
88 
89 	out->gen = in->gen;
90 
91 #define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
92 	BCH_ALLOC_FIELDS_V1()
93 #undef  x
94 }
95 
96 static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
97 				struct bkey_s_c k)
98 {
99 	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
100 	const u8 *in = a.v->data;
101 	const u8 *end = bkey_val_end(a);
102 	unsigned fieldnr = 0;
103 	int ret;
104 	u64 v;
105 
106 	out->gen	= a.v->gen;
107 	out->oldest_gen	= a.v->oldest_gen;
108 	out->data_type	= a.v->data_type;
109 
110 #define x(_name, _bits)							\
111 	if (fieldnr < a.v->nr_fields) {					\
112 		ret = bch2_varint_decode_fast(in, end, &v);		\
113 		if (ret < 0)						\
114 			return ret;					\
115 		in += ret;						\
116 	} else {							\
117 		v = 0;							\
118 	}								\
119 	out->_name = v;							\
120 	if (v != out->_name)						\
121 		return -1;						\
122 	fieldnr++;
123 
124 	BCH_ALLOC_FIELDS_V2()
125 #undef  x
126 	return 0;
127 }
128 
129 static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
130 				struct bkey_s_c k)
131 {
132 	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
133 	const u8 *in = a.v->data;
134 	const u8 *end = bkey_val_end(a);
135 	unsigned fieldnr = 0;
136 	int ret;
137 	u64 v;
138 
139 	out->gen	= a.v->gen;
140 	out->oldest_gen	= a.v->oldest_gen;
141 	out->data_type	= a.v->data_type;
142 	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
143 	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
144 	out->journal_seq = le64_to_cpu(a.v->journal_seq);
145 
146 #define x(_name, _bits)							\
147 	if (fieldnr < a.v->nr_fields) {					\
148 		ret = bch2_varint_decode_fast(in, end, &v);		\
149 		if (ret < 0)						\
150 			return ret;					\
151 		in += ret;						\
152 	} else {							\
153 		v = 0;							\
154 	}								\
155 	out->_name = v;							\
156 	if (v != out->_name)						\
157 		return -1;						\
158 	fieldnr++;
159 
160 	BCH_ALLOC_FIELDS_V2()
161 #undef  x
162 	return 0;
163 }
164 
165 static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
166 {
167 	struct bkey_alloc_unpacked ret = { .gen	= 0 };
168 
169 	switch (k.k->type) {
170 	case KEY_TYPE_alloc:
171 		bch2_alloc_unpack_v1(&ret, k);
172 		break;
173 	case KEY_TYPE_alloc_v2:
174 		bch2_alloc_unpack_v2(&ret, k);
175 		break;
176 	case KEY_TYPE_alloc_v3:
177 		bch2_alloc_unpack_v3(&ret, k);
178 		break;
179 	}
180 
181 	return ret;
182 }
183 
184 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
185 {
186 	unsigned i, bytes = offsetof(struct bch_alloc, data);
187 
188 	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
189 		if (a->fields & (1 << i))
190 			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
191 
192 	return DIV_ROUND_UP(bytes, sizeof(u64));
193 }
194 
195 int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
196 			  enum bkey_invalid_flags flags,
197 			  struct printbuf *err)
198 {
199 	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
200 	int ret = 0;
201 
202 	/* allow for unknown fields */
203 	bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
204 			 alloc_v1_val_size_bad,
205 			 "incorrect value size (%zu < %u)",
206 			 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
207 fsck_err:
208 	return ret;
209 }
210 
211 int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
212 			  enum bkey_invalid_flags flags,
213 			  struct printbuf *err)
214 {
215 	struct bkey_alloc_unpacked u;
216 	int ret = 0;
217 
218 	bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
219 			 alloc_v2_unpack_error,
220 			 "unpack error");
221 fsck_err:
222 	return ret;
223 }
224 
225 int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
226 			  enum bkey_invalid_flags flags,
227 			  struct printbuf *err)
228 {
229 	struct bkey_alloc_unpacked u;
230 	int ret = 0;
231 
232 	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
233 			 alloc_v2_unpack_error,
234 			 "unpack error");
235 fsck_err:
236 	return ret;
237 }
238 
239 int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
240 			  enum bkey_invalid_flags flags, struct printbuf *err)
241 {
242 	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
243 	int ret = 0;
244 
245 	bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
246 			 alloc_v4_val_size_bad,
247 			 "bad val size (%u > %zu)",
248 			 alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
249 
250 	bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
251 			 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
252 			 alloc_v4_backpointers_start_bad,
253 			 "invalid backpointers_start");
254 
255 	bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
256 			 alloc_key_data_type_bad,
257 			 "invalid data type (got %u should be %u)",
258 			 a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
259 
260 	switch (a.v->data_type) {
261 	case BCH_DATA_free:
262 	case BCH_DATA_need_gc_gens:
263 	case BCH_DATA_need_discard:
264 		bkey_fsck_err_on(a.v->dirty_sectors ||
265 				 a.v->cached_sectors ||
266 				 a.v->stripe, c, err,
267 				 alloc_key_empty_but_have_data,
268 				 "empty data type free but have data");
269 		break;
270 	case BCH_DATA_sb:
271 	case BCH_DATA_journal:
272 	case BCH_DATA_btree:
273 	case BCH_DATA_user:
274 	case BCH_DATA_parity:
275 		bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
276 				 alloc_key_dirty_sectors_0,
277 				 "data_type %s but dirty_sectors==0",
278 				 bch2_data_types[a.v->data_type]);
279 		break;
280 	case BCH_DATA_cached:
281 		bkey_fsck_err_on(!a.v->cached_sectors ||
282 				 a.v->dirty_sectors ||
283 				 a.v->stripe, c, err,
284 				 alloc_key_cached_inconsistency,
285 				 "data type inconsistency");
286 
287 		bkey_fsck_err_on(!a.v->io_time[READ] &&
288 				 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
289 				 c, err,
290 				 alloc_key_cached_but_read_time_zero,
291 				 "cached bucket with read_time == 0");
292 		break;
293 	case BCH_DATA_stripe:
294 		break;
295 	}
296 fsck_err:
297 	return ret;
298 }
299 
300 void bch2_alloc_v4_swab(struct bkey_s k)
301 {
302 	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
303 	struct bch_backpointer *bp, *bps;
304 
305 	a->journal_seq		= swab64(a->journal_seq);
306 	a->flags		= swab32(a->flags);
307 	a->dirty_sectors	= swab32(a->dirty_sectors);
308 	a->cached_sectors	= swab32(a->cached_sectors);
309 	a->io_time[0]		= swab64(a->io_time[0]);
310 	a->io_time[1]		= swab64(a->io_time[1]);
311 	a->stripe		= swab32(a->stripe);
312 	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
313 	a->fragmentation_lru	= swab64(a->fragmentation_lru);
314 
315 	bps = alloc_v4_backpointers(a);
316 	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
317 		bp->bucket_offset	= swab40(bp->bucket_offset);
318 		bp->bucket_len		= swab32(bp->bucket_len);
319 		bch2_bpos_swab(&bp->pos);
320 	}
321 }
322 
323 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
324 {
325 	struct bch_alloc_v4 _a;
326 	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
327 	unsigned i;
328 
329 	prt_newline(out);
330 	printbuf_indent_add(out, 2);
331 
332 	prt_printf(out, "gen %u oldest_gen %u data_type %s",
333 	       a->gen, a->oldest_gen,
334 	       a->data_type < BCH_DATA_NR
335 	       ? bch2_data_types[a->data_type]
336 	       : "(invalid data type)");
337 	prt_newline(out);
338 	prt_printf(out, "journal_seq       %llu",	a->journal_seq);
339 	prt_newline(out);
340 	prt_printf(out, "need_discard      %llu",	BCH_ALLOC_V4_NEED_DISCARD(a));
341 	prt_newline(out);
342 	prt_printf(out, "need_inc_gen      %llu",	BCH_ALLOC_V4_NEED_INC_GEN(a));
343 	prt_newline(out);
344 	prt_printf(out, "dirty_sectors     %u",	a->dirty_sectors);
345 	prt_newline(out);
346 	prt_printf(out, "cached_sectors    %u",	a->cached_sectors);
347 	prt_newline(out);
348 	prt_printf(out, "stripe            %u",	a->stripe);
349 	prt_newline(out);
350 	prt_printf(out, "stripe_redundancy %u",	a->stripe_redundancy);
351 	prt_newline(out);
352 	prt_printf(out, "io_time[READ]     %llu",	a->io_time[READ]);
353 	prt_newline(out);
354 	prt_printf(out, "io_time[WRITE]    %llu",	a->io_time[WRITE]);
355 	prt_newline(out);
356 	prt_printf(out, "fragmentation     %llu",	a->fragmentation_lru);
357 	prt_newline(out);
358 	prt_printf(out, "bp_start          %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
359 	prt_newline(out);
360 
361 	if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
362 		struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
363 		const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
364 
365 		prt_printf(out, "backpointers:     %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
366 		printbuf_indent_add(out, 2);
367 
368 		for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
369 			prt_newline(out);
370 			bch2_backpointer_to_text(out, &bps[i]);
371 		}
372 
373 		printbuf_indent_sub(out, 2);
374 	}
375 
376 	printbuf_indent_sub(out, 2);
377 }
378 
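/*
 * Convert any alloc key version to the current bch_alloc_v4 representation;
 * for keys that are already v4 this also normalizes backpointers_start and
 * clears any backpointers stored inline in the key, since the copy in @out
 * doesn't carry them.
 */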
379 void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
380 {
381 	if (k.k->type == KEY_TYPE_alloc_v4) {
382 		void *src, *dst;
383 
384 		*out = *bkey_s_c_to_alloc_v4(k).v;
385 
386 		src = alloc_v4_backpointers(out);
387 		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
388 		dst = alloc_v4_backpointers(out);
389 
390 		if (src < dst)
391 			memset(src, 0, dst - src);
392 
393 		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
394 	} else {
395 		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
396 
397 		*out = (struct bch_alloc_v4) {
398 			.journal_seq		= u.journal_seq,
399 			.flags			= u.need_discard,
400 			.gen			= u.gen,
401 			.oldest_gen		= u.oldest_gen,
402 			.data_type		= u.data_type,
403 			.stripe_redundancy	= u.stripe_redundancy,
404 			.dirty_sectors		= u.dirty_sectors,
405 			.cached_sectors		= u.cached_sectors,
406 			.io_time[READ]		= u.read_time,
407 			.io_time[WRITE]		= u.write_time,
408 			.stripe			= u.stripe,
409 		};
410 
411 		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
412 	}
413 }
414 
415 static noinline struct bkey_i_alloc_v4 *
416 __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
417 {
418 	struct bkey_i_alloc_v4 *ret;
419 
420 	ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
421 	if (IS_ERR(ret))
422 		return ret;
423 
424 	if (k.k->type == KEY_TYPE_alloc_v4) {
425 		void *src, *dst;
426 
427 		bkey_reassemble(&ret->k_i, k);
428 
429 		src = alloc_v4_backpointers(&ret->v);
430 		SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
431 		dst = alloc_v4_backpointers(&ret->v);
432 
433 		if (src < dst)
434 			memset(src, 0, dst - src);
435 
436 		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
437 		set_alloc_v4_u64s(ret);
438 	} else {
439 		bkey_alloc_v4_init(&ret->k_i);
440 		ret->k.p = k.k->p;
441 		bch2_alloc_to_v4(k, &ret->v);
442 	}
443 	return ret;
444 }
445 
446 static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
447 {
448 	struct bkey_s_c_alloc_v4 a;
449 
450 	if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
451 	    ((a = bkey_s_c_to_alloc_v4(k), true) &&
452 	     BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
453 		return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
454 
455 	return __bch2_alloc_to_v4_mut(trans, k);
456 }
457 
458 struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
459 {
460 	return bch2_alloc_to_v4_mut_inlined(trans, k);
461 }
462 
463 struct bkey_i_alloc_v4 *
464 bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
465 			      struct bpos pos)
466 {
467 	struct bkey_s_c k;
468 	struct bkey_i_alloc_v4 *a;
469 	int ret;
470 
471 	k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
472 			     BTREE_ITER_WITH_UPDATES|
473 			     BTREE_ITER_CACHED|
474 			     BTREE_ITER_INTENT);
475 	ret = bkey_err(k);
476 	if (unlikely(ret))
477 		return ERR_PTR(ret);
478 
479 	a = bch2_alloc_to_v4_mut_inlined(trans, k);
480 	ret = PTR_ERR_OR_ZERO(a);
481 	if (unlikely(ret))
482 		goto err;
483 	return a;
484 err:
485 	bch2_trans_iter_exit(trans, iter);
486 	return ERR_PTR(ret);
487 }
488 
489 static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
490 {
491 	*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
492 
493 	pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
494 	return pos;
495 }
496 
497 static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
498 {
499 	pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
500 	pos.offset += offset;
501 	return pos;
502 }
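/*
 * Worked example of the mapping above, assuming KEY_TYPE_BUCKET_GENS_BITS is
 * 8 (i.e. 256 gens packed per bucket_gens key): the alloc key for device 2,
 * bucket 0x1234 maps to bucket_gens pos 2:0x12 with offset 0x34, and
 * bucket_gens_pos_to_alloc(POS(2, 0x12), 0x34) maps it back to 2:0x1234.
 */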
503 
504 static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
505 {
506 	return k.k->type == KEY_TYPE_bucket_gens
507 		? bkey_s_c_to_bucket_gens(k).v->gens[offset]
508 		: 0;
509 }
510 
511 int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
512 			     enum bkey_invalid_flags flags,
513 			     struct printbuf *err)
514 {
515 	int ret = 0;
516 
517 	bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
518 			 bucket_gens_val_size_bad,
519 			 "bad val size (%zu != %zu)",
520 			 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
521 fsck_err:
522 	return ret;
523 }
524 
525 void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
526 {
527 	struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
528 	unsigned i;
529 
530 	for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
531 		if (i)
532 			prt_char(out, ' ');
533 		prt_printf(out, "%u", g.v->gens[i]);
534 	}
535 }
536 
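/*
 * Walk the alloc btree and (re)build the bucket_gens btree from it: gens for
 * runs of consecutive buckets (KEY_TYPE_BUCKET_GENS_NR per key, 256 assuming
 * the 8 bit packing above) are accumulated into a single bucket_gens key,
 * which is committed whenever the target key position changes and once more
 * at the end.
 */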
537 int bch2_bucket_gens_init(struct bch_fs *c)
538 {
539 	struct btree_trans *trans = bch2_trans_get(c);
540 	struct btree_iter iter;
541 	struct bkey_s_c k;
542 	struct bch_alloc_v4 a;
543 	struct bkey_i_bucket_gens g;
544 	bool have_bucket_gens_key = false;
545 	unsigned offset;
546 	struct bpos pos;
547 	u8 gen;
548 	int ret;
549 
550 	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
551 			   BTREE_ITER_PREFETCH, k, ret) {
552 		/*
553 		 * Not a fsck error because this is checked/repaired by
554 		 * bch2_check_alloc_key() which runs later:
555 		 */
556 		if (!bch2_dev_bucket_exists(c, k.k->p))
557 			continue;
558 
559 		gen = bch2_alloc_to_v4(k, &a)->gen;
560 		pos = alloc_gens_pos(iter.pos, &offset);
561 
562 		if (have_bucket_gens_key && bkey_cmp(g.k.p, pos)) {
563 			ret = commit_do(trans, NULL, NULL,
564 					BTREE_INSERT_NOFAIL|
565 					BTREE_INSERT_LAZY_RW,
566 				bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
567 			if (ret)
568 				break;
569 			have_bucket_gens_key = false;
570 		}
571 
572 		if (!have_bucket_gens_key) {
573 			bkey_bucket_gens_init(&g.k_i);
574 			g.k.p = pos;
575 			have_bucket_gens_key = true;
576 		}
577 
578 		g.v.gens[offset] = gen;
579 	}
580 	bch2_trans_iter_exit(trans, &iter);
581 
582 	if (have_bucket_gens_key && !ret)
583 		ret = commit_do(trans, NULL, NULL,
584 				BTREE_INSERT_NOFAIL|
585 				BTREE_INSERT_LAZY_RW,
586 			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
587 
588 	bch2_trans_put(trans);
589 
590 	if (ret)
591 		bch_err_fn(c, ret);
592 	return ret;
593 }
594 
595 int bch2_alloc_read(struct bch_fs *c)
596 {
597 	struct btree_trans *trans = bch2_trans_get(c);
598 	struct btree_iter iter;
599 	struct bkey_s_c k;
600 	struct bch_dev *ca;
601 	int ret;
602 
603 	down_read(&c->gc_lock);
604 
605 	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
606 		const struct bch_bucket_gens *g;
607 		u64 b;
608 
609 		for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
610 				   BTREE_ITER_PREFETCH, k, ret) {
611 			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
612 			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
613 
614 			if (k.k->type != KEY_TYPE_bucket_gens)
615 				continue;
616 
617 			g = bkey_s_c_to_bucket_gens(k).v;
618 
619 			/*
620 			 * Not a fsck error because this is checked/repaired by
621 			 * bch2_check_alloc_key() which runs later:
622 			 */
623 			if (!bch2_dev_exists2(c, k.k->p.inode))
624 				continue;
625 
626 			ca = bch_dev_bkey_exists(c, k.k->p.inode);
627 
628 			for (b = max_t(u64, ca->mi.first_bucket, start);
629 			     b < min_t(u64, ca->mi.nbuckets, end);
630 			     b++)
631 				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
632 		}
633 		bch2_trans_iter_exit(trans, &iter);
634 	} else {
635 		struct bch_alloc_v4 a;
636 
637 		for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
638 				   BTREE_ITER_PREFETCH, k, ret) {
639 			/*
640 			 * Not a fsck error because this is checked/repaired by
641 			 * bch2_check_alloc_key() which runs later:
642 			 */
643 			if (!bch2_dev_bucket_exists(c, k.k->p))
644 				continue;
645 
646 			ca = bch_dev_bkey_exists(c, k.k->p.inode);
647 
648 			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
649 		}
650 		bch2_trans_iter_exit(trans, &iter);
651 	}
652 
653 	bch2_trans_put(trans);
654 	up_read(&c->gc_lock);
655 
656 	if (ret)
657 		bch_err_fn(c, ret);
658 
659 	return ret;
660 }
661 
662 /* Free space/discard btree: */
663 
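/*
 * Maintain the freespace/need_discard btrees as indexes on the alloc btree:
 * freespace keys are positioned by alloc_freespace_pos() (bucket plus
 * freespace genbits in the high bits of the offset) and sized to one bucket,
 * while need_discard keys sit directly at the bucket's alloc position.
 * @set adds the index entry, !@set removes it.
 */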
664 static int bch2_bucket_do_index(struct btree_trans *trans,
665 				struct bkey_s_c alloc_k,
666 				const struct bch_alloc_v4 *a,
667 				bool set)
668 {
669 	struct bch_fs *c = trans->c;
670 	struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
671 	struct btree_iter iter;
672 	struct bkey_s_c old;
673 	struct bkey_i *k;
674 	enum btree_id btree;
675 	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
676 	enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
677 	struct printbuf buf = PRINTBUF;
678 	int ret;
679 
680 	if (a->data_type != BCH_DATA_free &&
681 	    a->data_type != BCH_DATA_need_discard)
682 		return 0;
683 
684 	k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
685 	if (IS_ERR(k))
686 		return PTR_ERR(k);
687 
688 	bkey_init(&k->k);
689 	k->k.type = new_type;
690 
691 	switch (a->data_type) {
692 	case BCH_DATA_free:
693 		btree = BTREE_ID_freespace;
694 		k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
695 		bch2_key_resize(&k->k, 1);
696 		break;
697 	case BCH_DATA_need_discard:
698 		btree = BTREE_ID_need_discard;
699 		k->k.p = alloc_k.k->p;
700 		break;
701 	default:
702 		return 0;
703 	}
704 
705 	old = bch2_bkey_get_iter(trans, &iter, btree,
706 			     bkey_start_pos(&k->k),
707 			     BTREE_ITER_INTENT);
708 	ret = bkey_err(old);
709 	if (ret)
710 		return ret;
711 
712 	if (ca->mi.freespace_initialized &&
713 	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
714 	    bch2_trans_inconsistent_on(old.k->type != old_type, trans,
715 			"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
716 			"  for %s",
717 			set ? "setting" : "clearing",
718 			bch2_btree_id_str(btree),
719 			iter.pos.inode,
720 			iter.pos.offset,
721 			bch2_bkey_types[old.k->type],
722 			bch2_bkey_types[old_type],
723 			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
724 		ret = -EIO;
725 		goto err;
726 	}
727 
728 	ret = bch2_trans_update(trans, &iter, k, 0);
729 err:
730 	bch2_trans_iter_exit(trans, &iter);
731 	printbuf_exit(&buf);
732 	return ret;
733 }
734 
735 static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
736 					   struct bpos bucket, u8 gen)
737 {
738 	struct btree_iter iter;
739 	unsigned offset;
740 	struct bpos pos = alloc_gens_pos(bucket, &offset);
741 	struct bkey_i_bucket_gens *g;
742 	struct bkey_s_c k;
743 	int ret;
744 
745 	g = bch2_trans_kmalloc(trans, sizeof(*g));
746 	ret = PTR_ERR_OR_ZERO(g);
747 	if (ret)
748 		return ret;
749 
750 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
751 			       BTREE_ITER_INTENT|
752 			       BTREE_ITER_WITH_UPDATES);
753 	ret = bkey_err(k);
754 	if (ret)
755 		return ret;
756 
757 	if (k.k->type != KEY_TYPE_bucket_gens) {
758 		bkey_bucket_gens_init(&g->k_i);
759 		g->k.p = iter.pos;
760 	} else {
761 		bkey_reassemble(&g->k_i, k);
762 	}
763 
764 	g->v.gens[offset] = gen;
765 
766 	ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
767 	bch2_trans_iter_exit(trans, &iter);
768 	return ret;
769 }
770 
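/*
 * Transactional trigger for alloc keys: keeps the freespace/need_discard
 * indexes, the read and fragmentation LRUs, and the bucket_gens btree in sync
 * with changes to a bucket's alloc state.
 */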
771 int bch2_trans_mark_alloc(struct btree_trans *trans,
772 			  enum btree_id btree_id, unsigned level,
773 			  struct bkey_s_c old, struct bkey_i *new,
774 			  unsigned flags)
775 {
776 	struct bch_fs *c = trans->c;
777 	struct bch_alloc_v4 old_a_convert, *new_a;
778 	const struct bch_alloc_v4 *old_a;
779 	u64 old_lru, new_lru;
780 	int ret = 0;
781 
782 	/*
783 	 * Deletion only happens in the device removal path, with
784 	 * BTREE_TRIGGER_NORUN:
785 	 */
786 	BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
787 
788 	old_a = bch2_alloc_to_v4(old, &old_a_convert);
789 	new_a = &bkey_i_to_alloc_v4(new)->v;
790 
791 	new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
792 
793 	if (new_a->dirty_sectors > old_a->dirty_sectors ||
794 	    new_a->cached_sectors > old_a->cached_sectors) {
795 		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
796 		new_a->io_time[WRITE] = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
797 		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
798 		SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
799 	}
800 
801 	if (data_type_is_empty(new_a->data_type) &&
802 	    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
803 	    !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
804 		new_a->gen++;
805 		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
806 	}
807 
808 	if (old_a->data_type != new_a->data_type ||
809 	    (new_a->data_type == BCH_DATA_free &&
810 	     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
811 		ret =   bch2_bucket_do_index(trans, old, old_a, false) ?:
812 			bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
813 		if (ret)
814 			return ret;
815 	}
816 
817 	if (new_a->data_type == BCH_DATA_cached &&
818 	    !new_a->io_time[READ])
819 		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
820 
821 	old_lru = alloc_lru_idx_read(*old_a);
822 	new_lru = alloc_lru_idx_read(*new_a);
823 
824 	if (old_lru != new_lru) {
825 		ret = bch2_lru_change(trans, new->k.p.inode,
826 				      bucket_to_u64(new->k.p),
827 				      old_lru, new_lru);
828 		if (ret)
829 			return ret;
830 	}
831 
832 	new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
833 					bch_dev_bkey_exists(c, new->k.p.inode));
834 
835 	if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
836 		ret = bch2_lru_change(trans,
837 				BCH_LRU_FRAGMENTATION_START,
838 				bucket_to_u64(new->k.p),
839 				old_a->fragmentation_lru, new_a->fragmentation_lru);
840 		if (ret)
841 			return ret;
842 	}
843 
844 	if (old_a->gen != new_a->gen) {
845 		ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
846 		if (ret)
847 			return ret;
848 	}
849 
850 	return 0;
851 }
852 
853 /*
854  * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
855  * extents style btrees, but works on non-extents btrees:
856  */
857 static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
858 {
859 	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
860 
861 	if (bkey_err(k))
862 		return k;
863 
864 	if (k.k->type) {
865 		return k;
866 	} else {
867 		struct btree_iter iter2;
868 		struct bpos next;
869 
870 		bch2_trans_copy_iter(&iter2, iter);
871 
872 		if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
873 			end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
874 
875 		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
876 
877 		/*
878 		 * btree node min/max is a closed interval; peek_upto() takes a
879 		 * half open interval:
880 		 */
881 		k = bch2_btree_iter_peek_upto(&iter2, end);
882 		next = iter2.pos;
883 		bch2_trans_iter_exit(iter->trans, &iter2);
884 
885 		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
886 
887 		if (bkey_err(k))
888 			return k;
889 
890 		bkey_init(hole);
891 		hole->p = iter->pos;
892 
893 		bch2_key_resize(hole, next.offset - iter->pos.offset);
894 		return (struct bkey_s_c) { hole, NULL };
895 	}
896 }
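/*
 * A rough illustration of the hole synthesis above: if the iterator sits on
 * an empty slot at 1:100 and the next live key starts at 1:164, the returned
 * (deleted) hole key covers buckets 100..163 of device 1, clamped so it never
 * extends past the current btree node or beyond U32_MAX - 1 buckets.
 */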
897 
898 static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
899 {
900 	struct bch_dev *ca;
901 	unsigned iter;
902 
903 	if (bch2_dev_bucket_exists(c, *bucket))
904 		return true;
905 
906 	if (bch2_dev_exists2(c, bucket->inode)) {
907 		ca = bch_dev_bkey_exists(c, bucket->inode);
908 
909 		if (bucket->offset < ca->mi.first_bucket) {
910 			bucket->offset = ca->mi.first_bucket;
911 			return true;
912 		}
913 
914 		bucket->inode++;
915 		bucket->offset = 0;
916 	}
917 
918 	rcu_read_lock();
919 	iter = bucket->inode;
920 	ca = __bch2_next_dev(c, &iter, NULL);
921 	if (ca)
922 		*bucket = POS(ca->dev_idx, ca->mi.first_bucket);
923 	rcu_read_unlock();
924 
925 	return ca != NULL;
926 }
927 
928 static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
929 {
930 	struct bch_fs *c = iter->trans->c;
931 	struct bkey_s_c k;
932 again:
933 	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
934 	if (bkey_err(k))
935 		return k;
936 
937 	if (!k.k->type) {
938 		struct bpos bucket = bkey_start_pos(k.k);
939 
940 		if (!bch2_dev_bucket_exists(c, bucket)) {
941 			if (!next_bucket(c, &bucket))
942 				return bkey_s_c_null;
943 
944 			bch2_btree_iter_set_pos(iter, bucket);
945 			goto again;
946 		}
947 
948 		if (!bch2_dev_bucket_exists(c, k.k->p)) {
949 			struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
950 
951 			bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
952 		}
953 	}
954 
955 	return k;
956 }
957 
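/*
 * For a single alloc key, verify (and repair) the matching entries in the
 * need_discard, freespace and bucket_gens btrees: the need_discard/freespace
 * keys must be KEY_TYPE_set iff the bucket's data type is need_discard/free,
 * and the gen stored in bucket_gens must match the alloc key's gen.
 */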
958 static noinline_for_stack
959 int bch2_check_alloc_key(struct btree_trans *trans,
960 			 struct bkey_s_c alloc_k,
961 			 struct btree_iter *alloc_iter,
962 			 struct btree_iter *discard_iter,
963 			 struct btree_iter *freespace_iter,
964 			 struct btree_iter *bucket_gens_iter)
965 {
966 	struct bch_fs *c = trans->c;
967 	struct bch_dev *ca;
968 	struct bch_alloc_v4 a_convert;
969 	const struct bch_alloc_v4 *a;
970 	unsigned discard_key_type, freespace_key_type;
971 	unsigned gens_offset;
972 	struct bkey_s_c k;
973 	struct printbuf buf = PRINTBUF;
974 	int ret;
975 
976 	if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
977 			alloc_key_to_missing_dev_bucket,
978 			"alloc key for invalid device:bucket %llu:%llu",
979 			alloc_k.k->p.inode, alloc_k.k->p.offset))
980 		return bch2_btree_delete_at(trans, alloc_iter, 0);
981 
982 	ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
983 	if (!ca->mi.freespace_initialized)
984 		return 0;
985 
986 	a = bch2_alloc_to_v4(alloc_k, &a_convert);
987 
988 	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
989 	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
990 	k = bch2_btree_iter_peek_slot(discard_iter);
991 	ret = bkey_err(k);
992 	if (ret)
993 		goto err;
994 
995 	if (k.k->type != discard_key_type &&
996 	    (c->opts.reconstruct_alloc ||
997 	     fsck_err(c, need_discard_key_wrong,
998 		      "incorrect key in need_discard btree (got %s should be %s)\n"
999 		      "  %s",
1000 		      bch2_bkey_types[k.k->type],
1001 		      bch2_bkey_types[discard_key_type],
1002 		      (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
1003 		struct bkey_i *update =
1004 			bch2_trans_kmalloc(trans, sizeof(*update));
1005 
1006 		ret = PTR_ERR_OR_ZERO(update);
1007 		if (ret)
1008 			goto err;
1009 
1010 		bkey_init(&update->k);
1011 		update->k.type	= discard_key_type;
1012 		update->k.p	= discard_iter->pos;
1013 
1014 		ret = bch2_trans_update(trans, discard_iter, update, 0);
1015 		if (ret)
1016 			goto err;
1017 	}
1018 
1019 	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
1020 	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
1021 	k = bch2_btree_iter_peek_slot(freespace_iter);
1022 	ret = bkey_err(k);
1023 	if (ret)
1024 		goto err;
1025 
1026 	if (k.k->type != freespace_key_type &&
1027 	    (c->opts.reconstruct_alloc ||
1028 	     fsck_err(c, freespace_key_wrong,
1029 		      "incorrect key in freespace btree (got %s should be %s)\n"
1030 		      "  %s",
1031 		      bch2_bkey_types[k.k->type],
1032 		      bch2_bkey_types[freespace_key_type],
1033 		      (printbuf_reset(&buf),
1034 		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
1035 		struct bkey_i *update =
1036 			bch2_trans_kmalloc(trans, sizeof(*update));
1037 
1038 		ret = PTR_ERR_OR_ZERO(update);
1039 		if (ret)
1040 			goto err;
1041 
1042 		bkey_init(&update->k);
1043 		update->k.type	= freespace_key_type;
1044 		update->k.p	= freespace_iter->pos;
1045 		bch2_key_resize(&update->k, 1);
1046 
1047 		ret = bch2_trans_update(trans, freespace_iter, update, 0);
1048 		if (ret)
1049 			goto err;
1050 	}
1051 
1052 	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
1053 	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1054 	ret = bkey_err(k);
1055 	if (ret)
1056 		goto err;
1057 
1058 	if (a->gen != alloc_gen(k, gens_offset) &&
1059 	    (c->opts.reconstruct_alloc ||
1060 	     fsck_err(c, bucket_gens_key_wrong,
1061 		      "incorrect gen in bucket_gens btree (got %u should be %u)\n"
1062 		      "  %s",
1063 		      alloc_gen(k, gens_offset), a->gen,
1064 		      (printbuf_reset(&buf),
1065 		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
1066 		struct bkey_i_bucket_gens *g =
1067 			bch2_trans_kmalloc(trans, sizeof(*g));
1068 
1069 		ret = PTR_ERR_OR_ZERO(g);
1070 		if (ret)
1071 			goto err;
1072 
1073 		if (k.k->type == KEY_TYPE_bucket_gens) {
1074 			bkey_reassemble(&g->k_i, k);
1075 		} else {
1076 			bkey_bucket_gens_init(&g->k_i);
1077 			g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
1078 		}
1079 
1080 		g->v.gens[gens_offset] = a->gen;
1081 
1082 		ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
1083 		if (ret)
1084 			goto err;
1085 	}
1086 err:
1087 fsck_err:
1088 	printbuf_exit(&buf);
1089 	return ret;
1090 }
1091 
1092 static noinline_for_stack
1093 int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
1094 				    struct bpos start,
1095 				    struct bpos *end,
1096 				    struct btree_iter *freespace_iter)
1097 {
1098 	struct bch_fs *c = trans->c;
1099 	struct bch_dev *ca;
1100 	struct bkey_s_c k;
1101 	struct printbuf buf = PRINTBUF;
1102 	int ret;
1103 
1104 	ca = bch_dev_bkey_exists(c, start.inode);
1105 	if (!ca->mi.freespace_initialized)
1106 		return 0;
1107 
1108 	bch2_btree_iter_set_pos(freespace_iter, start);
1109 
1110 	k = bch2_btree_iter_peek_slot(freespace_iter);
1111 	ret = bkey_err(k);
1112 	if (ret)
1113 		goto err;
1114 
1115 	*end = bkey_min(k.k->p, *end);
1116 
1117 	if (k.k->type != KEY_TYPE_set &&
1118 	    (c->opts.reconstruct_alloc ||
1119 	     fsck_err(c, freespace_hole_missing,
1120 		      "hole in alloc btree missing in freespace btree\n"
1121 		      "  device %llu buckets %llu-%llu",
1122 		      freespace_iter->pos.inode,
1123 		      freespace_iter->pos.offset,
1124 		      end->offset))) {
1125 		struct bkey_i *update =
1126 			bch2_trans_kmalloc(trans, sizeof(*update));
1127 
1128 		ret = PTR_ERR_OR_ZERO(update);
1129 		if (ret)
1130 			goto err;
1131 
1132 		bkey_init(&update->k);
1133 		update->k.type	= KEY_TYPE_set;
1134 		update->k.p	= freespace_iter->pos;
1135 		bch2_key_resize(&update->k,
1136 				min_t(u64, U32_MAX, end->offset -
1137 				      freespace_iter->pos.offset));
1138 
1139 		ret = bch2_trans_update(trans, freespace_iter, update, 0);
1140 		if (ret)
1141 			goto err;
1142 	}
1143 err:
1144 fsck_err:
1145 	printbuf_exit(&buf);
1146 	return ret;
1147 }
1148 
1149 static noinline_for_stack
1150 int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
1151 				      struct bpos start,
1152 				      struct bpos *end,
1153 				      struct btree_iter *bucket_gens_iter)
1154 {
1155 	struct bch_fs *c = trans->c;
1156 	struct bkey_s_c k;
1157 	struct printbuf buf = PRINTBUF;
1158 	unsigned i, gens_offset, gens_end_offset;
1159 	int ret;
1160 
1161 	if (c->sb.version < bcachefs_metadata_version_bucket_gens)
1162 		return 0;
1163 
1164 	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
1165 
1166 	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1167 	ret = bkey_err(k);
1168 	if (ret)
1169 		goto err;
1170 
1171 	if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
1172 		     alloc_gens_pos(*end,  &gens_end_offset)))
1173 		gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
1174 
1175 	if (k.k->type == KEY_TYPE_bucket_gens) {
1176 		struct bkey_i_bucket_gens g;
1177 		bool need_update = false;
1178 
1179 		bkey_reassemble(&g.k_i, k);
1180 
1181 		for (i = gens_offset; i < gens_end_offset; i++) {
1182 			if (fsck_err_on(g.v.gens[i], c,
1183 					bucket_gens_hole_wrong,
1184 					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
1185 					bucket_gens_pos_to_alloc(k.k->p, i).inode,
1186 					bucket_gens_pos_to_alloc(k.k->p, i).offset,
1187 					g.v.gens[i])) {
1188 				g.v.gens[i] = 0;
1189 				need_update = true;
1190 			}
1191 		}
1192 
1193 		if (need_update) {
1194 			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1195 
1196 			ret = PTR_ERR_OR_ZERO(u);
1197 			if (ret)
1198 				goto err;
1199 
1200 			memcpy(u, &g, sizeof(g));
1201 
1202 			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
1203 			if (ret)
1204 				goto err;
1205 		}
1206 	}
1207 
1208 	*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
1209 err:
1210 fsck_err:
1211 	printbuf_exit(&buf);
1212 	return ret;
1213 }
1214 
1215 static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
1216 					      struct btree_iter *iter)
1217 {
1218 	struct bch_fs *c = trans->c;
1219 	struct btree_iter alloc_iter;
1220 	struct bkey_s_c alloc_k;
1221 	struct bch_alloc_v4 a_convert;
1222 	const struct bch_alloc_v4 *a;
1223 	u64 genbits;
1224 	struct bpos pos;
1225 	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
1226 		? BCH_DATA_need_discard
1227 		: BCH_DATA_free;
1228 	struct printbuf buf = PRINTBUF;
1229 	int ret;
1230 
1231 	pos = iter->pos;
1232 	pos.offset &= ~(~0ULL << 56);
1233 	genbits = iter->pos.offset & (~0ULL << 56);
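	/*
	 * Freespace keys encode the freespace genbits in the top 8 bits of the
	 * offset (see alloc_freespace_pos()); split them back out so we can
	 * look up the alloc key and cross-check the stored genbits. For the
	 * need_discard btree the offset is just the bucket, so genbits is 0.
	 */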
1234 
1235 	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
1236 	ret = bkey_err(alloc_k);
1237 	if (ret)
1238 		return ret;
1239 
1240 	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
1241 			need_discard_freespace_key_to_invalid_dev_bucket,
1242 			"entry in %s btree for nonexistent dev:bucket %llu:%llu",
1243 			bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
1244 		goto delete;
1245 
1246 	a = bch2_alloc_to_v4(alloc_k, &a_convert);
1247 
1248 	if (fsck_err_on(a->data_type != state ||
1249 			(state == BCH_DATA_free &&
1250 			 genbits != alloc_freespace_genbits(*a)), c,
1251 			need_discard_freespace_key_bad,
1252 			"%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
1253 			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
1254 			bch2_btree_id_str(iter->btree_id),
1255 			iter->pos.inode,
1256 			iter->pos.offset,
1257 			a->data_type == state,
1258 			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
1259 		goto delete;
1260 out:
1261 fsck_err:
1262 	set_btree_iter_dontneed(&alloc_iter);
1263 	bch2_trans_iter_exit(trans, &alloc_iter);
1264 	printbuf_exit(&buf);
1265 	return ret;
1266 delete:
1267 	ret =   bch2_btree_delete_extent_at(trans, iter,
1268 			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
1269 		bch2_trans_commit(trans, NULL, NULL,
1270 			BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
1271 	goto out;
1272 }
1273 
1274 static int bch2_check_discard_freespace_key(struct btree_trans *trans,
1275 					    struct btree_iter *iter,
1276 					    struct bpos end)
1277 {
1278 	if (!btree_id_is_extents(iter->btree_id)) {
1279 		return __bch2_check_discard_freespace_key(trans, iter);
1280 	} else {
1281 		int ret = 0;
1282 
1283 		while (!bkey_eq(iter->pos, end) &&
1284 		       !(ret = btree_trans_too_many_iters(trans) ?:
1285 			       __bch2_check_discard_freespace_key(trans, iter)))
1286 			bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
1287 
1288 		return ret;
1289 	}
1290 }
1291 
1292 /*
1293  * We've already checked that generation numbers in the bucket_gens btree are
1294  * valid for buckets that exist; this just checks for keys for nonexistent
1295  * buckets.
1296  */
1297 static noinline_for_stack
1298 int bch2_check_bucket_gens_key(struct btree_trans *trans,
1299 			       struct btree_iter *iter,
1300 			       struct bkey_s_c k)
1301 {
1302 	struct bch_fs *c = trans->c;
1303 	struct bkey_i_bucket_gens g;
1304 	struct bch_dev *ca;
1305 	u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
1306 	u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
1307 	u64 b;
1308 	bool need_update = false, dev_exists;
1309 	struct printbuf buf = PRINTBUF;
1310 	int ret = 0;
1311 
1312 	BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
1313 	bkey_reassemble(&g.k_i, k);
1314 
1315 	/* if no bch_dev, skip out whether we repair or not */
1316 	dev_exists = bch2_dev_exists2(c, k.k->p.inode);
1317 	if (!dev_exists) {
1318 		if (fsck_err_on(!dev_exists, c,
1319 				bucket_gens_to_invalid_dev,
1320 				"bucket_gens key for invalid device:\n  %s",
1321 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1322 			ret = bch2_btree_delete_at(trans, iter, 0);
1323 		}
1324 		goto out;
1325 	}
1326 
1327 	ca = bch_dev_bkey_exists(c, k.k->p.inode);
1328 	if (fsck_err_on(end <= ca->mi.first_bucket ||
1329 			start >= ca->mi.nbuckets, c,
1330 			bucket_gens_to_invalid_buckets,
1331 			"bucket_gens key for invalid buckets:\n  %s",
1332 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1333 		ret = bch2_btree_delete_at(trans, iter, 0);
1334 		goto out;
1335 	}
1336 
1337 	for (b = start; b < ca->mi.first_bucket; b++)
1338 		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
1339 				bucket_gens_nonzero_for_invalid_buckets,
1340 				"bucket_gens key has nonzero gen for invalid bucket")) {
1341 			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1342 			need_update = true;
1343 		}
1344 
1345 	for (b = ca->mi.nbuckets; b < end; b++)
1346 		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
1347 				bucket_gens_nonzero_for_invalid_buckets,
1348 				"bucket_gens key has nonzero gen for invalid bucket")) {
1349 			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1350 			need_update = true;
1351 		}
1352 
1353 	if (need_update) {
1354 		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1355 
1356 		ret = PTR_ERR_OR_ZERO(u);
1357 		if (ret)
1358 			goto out;
1359 
1360 		memcpy(u, &g, sizeof(g));
1361 		ret = bch2_trans_update(trans, iter, u, 0);
1362 	}
1363 out:
1364 fsck_err:
1365 	printbuf_exit(&buf);
1366 	return ret;
1367 }
1368 
1369 int bch2_check_alloc_info(struct bch_fs *c)
1370 {
1371 	struct btree_trans *trans = bch2_trans_get(c);
1372 	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
1373 	struct bkey hole;
1374 	struct bkey_s_c k;
1375 	int ret = 0;
1376 
1377 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
1378 			     BTREE_ITER_PREFETCH);
1379 	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
1380 			     BTREE_ITER_PREFETCH);
1381 	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
1382 			     BTREE_ITER_PREFETCH);
1383 	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
1384 			     BTREE_ITER_PREFETCH);
1385 
1386 	while (1) {
1387 		struct bpos next;
1388 
1389 		bch2_trans_begin(trans);
1390 
1391 		k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
1392 		ret = bkey_err(k);
1393 		if (ret)
1394 			goto bkey_err;
1395 
1396 		if (!k.k)
1397 			break;
1398 
1399 		if (k.k->type) {
1400 			next = bpos_nosnap_successor(k.k->p);
1401 
1402 			ret = bch2_check_alloc_key(trans,
1403 						   k, &iter,
1404 						   &discard_iter,
1405 						   &freespace_iter,
1406 						   &bucket_gens_iter);
1407 			if (ret)
1408 				goto bkey_err;
1409 		} else {
1410 			next = k.k->p;
1411 
1412 			ret = bch2_check_alloc_hole_freespace(trans,
1413 						    bkey_start_pos(k.k),
1414 						    &next,
1415 						    &freespace_iter) ?:
1416 				bch2_check_alloc_hole_bucket_gens(trans,
1417 						    bkey_start_pos(k.k),
1418 						    &next,
1419 						    &bucket_gens_iter);
1420 			if (ret)
1421 				goto bkey_err;
1422 		}
1423 
1424 		ret = bch2_trans_commit(trans, NULL, NULL,
1425 					BTREE_INSERT_NOFAIL|
1426 					BTREE_INSERT_LAZY_RW);
1427 		if (ret)
1428 			goto bkey_err;
1429 
1430 		bch2_btree_iter_set_pos(&iter, next);
1431 bkey_err:
1432 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1433 			continue;
1434 		if (ret)
1435 			break;
1436 	}
1437 	bch2_trans_iter_exit(trans, &bucket_gens_iter);
1438 	bch2_trans_iter_exit(trans, &freespace_iter);
1439 	bch2_trans_iter_exit(trans, &discard_iter);
1440 	bch2_trans_iter_exit(trans, &iter);
1441 
1442 	if (ret < 0)
1443 		goto err;
1444 
1445 	ret = for_each_btree_key2(trans, iter,
1446 			BTREE_ID_need_discard, POS_MIN,
1447 			BTREE_ITER_PREFETCH, k,
1448 		bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
1449 	      for_each_btree_key2(trans, iter,
1450 			BTREE_ID_freespace, POS_MIN,
1451 			BTREE_ITER_PREFETCH, k,
1452 		bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
1453 	      for_each_btree_key_commit(trans, iter,
1454 			BTREE_ID_bucket_gens, POS_MIN,
1455 			BTREE_ITER_PREFETCH, k,
1456 			NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
1457 		bch2_check_bucket_gens_key(trans, &iter, k));
1458 err:
1459 	bch2_trans_put(trans);
1460 	if (ret)
1461 		bch_err_fn(c, ret);
1462 	return ret;
1463 }
1464 
1465 static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
1466 				       struct btree_iter *alloc_iter)
1467 {
1468 	struct bch_fs *c = trans->c;
1469 	struct btree_iter lru_iter;
1470 	struct bch_alloc_v4 a_convert;
1471 	const struct bch_alloc_v4 *a;
1472 	struct bkey_s_c alloc_k, lru_k;
1473 	struct printbuf buf = PRINTBUF;
1474 	int ret;
1475 
1476 	alloc_k = bch2_btree_iter_peek(alloc_iter);
1477 	if (!alloc_k.k)
1478 		return 0;
1479 
1480 	ret = bkey_err(alloc_k);
1481 	if (ret)
1482 		return ret;
1483 
1484 	a = bch2_alloc_to_v4(alloc_k, &a_convert);
1485 
1486 	if (a->data_type != BCH_DATA_cached)
1487 		return 0;
1488 
1489 	lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
1490 			     lru_pos(alloc_k.k->p.inode,
1491 				     bucket_to_u64(alloc_k.k->p),
1492 				     a->io_time[READ]), 0);
1493 	ret = bkey_err(lru_k);
1494 	if (ret)
1495 		return ret;
1496 
1497 	if (fsck_err_on(!a->io_time[READ], c,
1498 			alloc_key_cached_but_read_time_zero,
1499 			"cached bucket with read_time 0\n"
1500 			"  %s",
1501 		(printbuf_reset(&buf),
1502 		 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
1503 	    fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
1504 			alloc_key_to_missing_lru_entry,
1505 			"missing lru entry\n"
1506 			"  %s",
1507 			(printbuf_reset(&buf),
1508 			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1509 		u64 read_time = a->io_time[READ] ?:
1510 			atomic64_read(&c->io_clock[READ].now);
1511 
1512 		ret = bch2_lru_set(trans,
1513 				   alloc_k.k->p.inode,
1514 				   bucket_to_u64(alloc_k.k->p),
1515 				   read_time);
1516 		if (ret)
1517 			goto err;
1518 
1519 		if (a->io_time[READ] != read_time) {
1520 			struct bkey_i_alloc_v4 *a_mut =
1521 				bch2_alloc_to_v4_mut(trans, alloc_k);
1522 			ret = PTR_ERR_OR_ZERO(a_mut);
1523 			if (ret)
1524 				goto err;
1525 
1526 			a_mut->v.io_time[READ] = read_time;
1527 			ret = bch2_trans_update(trans, alloc_iter,
1528 						&a_mut->k_i, BTREE_TRIGGER_NORUN);
1529 			if (ret)
1530 				goto err;
1531 		}
1532 	}
1533 err:
1534 fsck_err:
1535 	bch2_trans_iter_exit(trans, &lru_iter);
1536 	printbuf_exit(&buf);
1537 	return ret;
1538 }
1539 
1540 int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
1541 {
1542 	struct btree_iter iter;
1543 	struct bkey_s_c k;
1544 	int ret = 0;
1545 
1546 	ret = bch2_trans_run(c,
1547 		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1548 				POS_MIN, BTREE_ITER_PREFETCH, k,
1549 				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
1550 			bch2_check_alloc_to_lru_ref(trans, &iter)));
1551 	if (ret)
1552 		bch_err_fn(c, ret);
1553 	return ret;
1554 }
1555 
1556 static int bch2_discard_one_bucket(struct btree_trans *trans,
1557 				   struct btree_iter *need_discard_iter,
1558 				   struct bpos *discard_pos_done,
1559 				   u64 *seen,
1560 				   u64 *open,
1561 				   u64 *need_journal_commit,
1562 				   u64 *discarded)
1563 {
1564 	struct bch_fs *c = trans->c;
1565 	struct bpos pos = need_discard_iter->pos;
1566 	struct btree_iter iter = { NULL };
1567 	struct bkey_s_c k;
1568 	struct bch_dev *ca;
1569 	struct bkey_i_alloc_v4 *a;
1570 	struct printbuf buf = PRINTBUF;
1571 	int ret = 0;
1572 
1573 	ca = bch_dev_bkey_exists(c, pos.inode);
1574 	if (!percpu_ref_tryget(&ca->io_ref)) {
1575 		bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
1576 		return 0;
1577 	}
1578 
1579 	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
1580 		(*open)++;
1581 		goto out;
1582 	}
1583 
1584 	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
1585 			c->journal.flushed_seq_ondisk,
1586 			pos.inode, pos.offset)) {
1587 		(*need_journal_commit)++;
1588 		goto out;
1589 	}
1590 
1591 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
1592 			       need_discard_iter->pos,
1593 			       BTREE_ITER_CACHED);
1594 	ret = bkey_err(k);
1595 	if (ret)
1596 		goto out;
1597 
1598 	a = bch2_alloc_to_v4_mut(trans, k);
1599 	ret = PTR_ERR_OR_ZERO(a);
1600 	if (ret)
1601 		goto out;
1602 
1603 	if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
1604 		a->v.gen++;
1605 		SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1606 		goto write;
1607 	}
1608 
1609 	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
1610 		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
1611 			bch2_trans_inconsistent(trans,
1612 				"clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
1613 				"%s",
1614 				a->v.journal_seq,
1615 				c->journal.flushed_seq_ondisk,
1616 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
1617 			ret = -EIO;
1618 		}
1619 		goto out;
1620 	}
1621 
1622 	if (a->v.data_type != BCH_DATA_need_discard) {
1623 		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
1624 			bch2_trans_inconsistent(trans,
1625 				"bucket incorrectly set in need_discard btree\n"
1626 				"%s",
1627 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
1628 			ret = -EIO;
1629 		}
1630 
1631 		goto out;
1632 	}
1633 
1634 	if (!bkey_eq(*discard_pos_done, iter.pos) &&
1635 	    ca->mi.discard && !c->opts.nochanges) {
1636 		/*
1637 		 * This works without any other locks because this is the only
1638 		 * thread that removes items from the need_discard tree
1639 		 */
1640 		bch2_trans_unlock(trans);
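		/*
		 * Assuming bucket_size is in 512 byte sectors, as elsewhere in
		 * bcachefs, these offsets are already in the sector units that
		 * blkdev_issue_discard() expects:
		 */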
1641 		blkdev_issue_discard(ca->disk_sb.bdev,
1642 				     k.k->p.offset * ca->mi.bucket_size,
1643 				     ca->mi.bucket_size,
1644 				     GFP_KERNEL);
1645 		*discard_pos_done = iter.pos;
1646 
1647 		ret = bch2_trans_relock_notrace(trans);
1648 		if (ret)
1649 			goto out;
1650 	}
1651 
1652 	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1653 	a->v.data_type = alloc_data_type(a->v, a->v.data_type);
1654 write:
1655 	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
1656 		bch2_trans_commit(trans, NULL, NULL,
1657 				  BCH_WATERMARK_btree|
1658 				  BTREE_INSERT_NOFAIL);
1659 	if (ret)
1660 		goto out;
1661 
1662 	this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
1663 	(*discarded)++;
1664 out:
1665 	(*seen)++;
1666 	bch2_trans_iter_exit(trans, &iter);
1667 	percpu_ref_put(&ca->io_ref);
1668 	printbuf_exit(&buf);
1669 	return ret;
1670 }
1671 
1672 static void bch2_do_discards_work(struct work_struct *work)
1673 {
1674 	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
1675 	struct btree_iter iter;
1676 	struct bkey_s_c k;
1677 	u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
1678 	struct bpos discard_pos_done = POS_MAX;
1679 	int ret;
1680 
1681 	/*
1682 	 * We're doing the commit in bch2_discard_one_bucket instead of using
1683 	 * for_each_btree_key_commit() so that we can increment counters after
1684 	 * successful commit:
1685 	 */
1686 	ret = bch2_trans_run(c,
1687 		for_each_btree_key2(trans, iter,
1688 				BTREE_ID_need_discard, POS_MIN, 0, k,
1689 			bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
1690 						&seen,
1691 						&open,
1692 						&need_journal_commit,
1693 						&discarded)));
1694 
1695 	if (need_journal_commit * 2 > seen)
1696 		bch2_journal_flush_async(&c->journal, NULL);
1697 
1698 	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1699 
1700 	trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
1701 			      bch2_err_str(ret));
1702 }
1703 
1704 void bch2_do_discards(struct bch_fs *c)
1705 {
1706 	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
1707 	    !queue_work(c->write_ref_wq, &c->discard_work))
1708 		bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1709 }
1710 
1711 static int invalidate_one_bucket(struct btree_trans *trans,
1712 				 struct btree_iter *lru_iter,
1713 				 struct bkey_s_c lru_k,
1714 				 s64 *nr_to_invalidate)
1715 {
1716 	struct bch_fs *c = trans->c;
1717 	struct btree_iter alloc_iter = { NULL };
1718 	struct bkey_i_alloc_v4 *a = NULL;
1719 	struct printbuf buf = PRINTBUF;
1720 	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
1721 	unsigned cached_sectors;
1722 	int ret = 0;
1723 
1724 	if (*nr_to_invalidate <= 0)
1725 		return 1;
1726 
1727 	if (!bch2_dev_bucket_exists(c, bucket)) {
1728 		prt_str(&buf, "lru entry points to invalid bucket");
1729 		goto err;
1730 	}
1731 
1732 	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
1733 		return 0;
1734 
1735 	a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
1736 	ret = PTR_ERR_OR_ZERO(a);
1737 	if (ret)
1738 		goto out;
1739 
1740 	/* We expect harmless races here due to the btree write buffer: */
1741 	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
1742 		goto out;
1743 
1744 	BUG_ON(a->v.data_type != BCH_DATA_cached);
1745 
1746 	if (!a->v.cached_sectors)
1747 		bch_err(c, "invalidating empty bucket, confused");
1748 
1749 	cached_sectors = a->v.cached_sectors;
1750 
1751 	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1752 	a->v.gen++;
1753 	a->v.data_type		= 0;
1754 	a->v.dirty_sectors	= 0;
1755 	a->v.cached_sectors	= 0;
1756 	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
1757 	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);
1758 
1759 	ret =   bch2_trans_update(trans, &alloc_iter, &a->k_i,
1760 				BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
1761 		bch2_trans_commit(trans, NULL, NULL,
1762 				  BCH_WATERMARK_btree|
1763 				  BTREE_INSERT_NOFAIL);
1764 	if (ret)
1765 		goto out;
1766 
1767 	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
1768 	--*nr_to_invalidate;
1769 out:
1770 	bch2_trans_iter_exit(trans, &alloc_iter);
1771 	printbuf_exit(&buf);
1772 	return ret;
1773 err:
1774 	prt_str(&buf, "\n  lru key: ");
1775 	bch2_bkey_val_to_text(&buf, c, lru_k);
1776 
1777 	prt_str(&buf, "\n  lru entry: ");
1778 	bch2_lru_pos_to_text(&buf, lru_iter->pos);
1779 
1780 	prt_str(&buf, "\n  alloc key: ");
1781 	if (!a)
1782 		bch2_bpos_to_text(&buf, bucket);
1783 	else
1784 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
1785 
1786 	bch_err(c, "%s", buf.buf);
1787 	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
1788 		bch2_inconsistent_error(c);
1789 		ret = -EINVAL;
1790 	}
1791 
1792 	goto out;
1793 }
1794 
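/*
 * Walk each device's LRU btree and invalidate cached-data buckets, oldest
 * first, until should_invalidate_buckets() is satisfied: invalidating bumps
 * the bucket gen and clears its counters so the allocator can reuse it.
 * The LRU btree is updated via the btree write buffer, hence the flush below
 * before we walk it.
 */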
1795 static void bch2_do_invalidates_work(struct work_struct *work)
1796 {
1797 	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
1798 	struct bch_dev *ca;
1799 	struct btree_trans *trans = bch2_trans_get(c);
1800 	struct btree_iter iter;
1801 	struct bkey_s_c k;
1802 	unsigned i;
1803 	int ret = 0;
1804 
1805 	ret = bch2_btree_write_buffer_flush(trans);
1806 	if (ret)
1807 		goto err;
1808 
1809 	for_each_member_device(ca, c, i) {
1810 		s64 nr_to_invalidate =
1811 			should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
1812 
1813 		ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
1814 				lru_pos(ca->dev_idx, 0, 0),
1815 				lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
1816 				BTREE_ITER_INTENT, k,
1817 			invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
1818 
1819 		if (ret < 0) {
1820 			percpu_ref_put(&ca->ref);
1821 			break;
1822 		}
1823 	}
1824 err:
1825 	bch2_trans_put(trans);
1826 	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
1827 }
1828 
1829 void bch2_do_invalidates(struct bch_fs *c)
1830 {
1831 	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
1832 	    !queue_work(c->write_ref_wq, &c->invalidate_work))
1833 		bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
1834 }
1835 
1836 int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
1837 			    u64 bucket_start, u64 bucket_end)
1838 {
1839 	struct btree_trans *trans = bch2_trans_get(c);
1840 	struct btree_iter iter;
1841 	struct bkey_s_c k;
1842 	struct bkey hole;
1843 	struct bpos end = POS(ca->dev_idx, bucket_end);
1844 	struct bch_member *m;
1845 	unsigned long last_updated = jiffies;
1846 	int ret;
1847 
1848 	BUG_ON(bucket_start > bucket_end);
1849 	BUG_ON(bucket_end > ca->mi.nbuckets);
1850 
1851 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
1852 		POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
1853 		BTREE_ITER_PREFETCH);
1854 	/*
1855 	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
1856 	 * freespace/need_discard/need_gc_gens btrees as needed:
1857 	 */
1858 	while (1) {
1859 		if (time_after(jiffies, last_updated + HZ * 10)) {
1860 			bch_info(ca, "%s: currently at %llu/%llu",
1861 				 __func__, iter.pos.offset, ca->mi.nbuckets);
1862 			last_updated = jiffies;
1863 		}
1864 
1865 		bch2_trans_begin(trans);
1866 
1867 		if (bkey_ge(iter.pos, end)) {
1868 			ret = 0;
1869 			break;
1870 		}
1871 
1872 		k = bch2_get_key_or_hole(&iter, end, &hole);
1873 		ret = bkey_err(k);
1874 		if (ret)
1875 			goto bkey_err;
1876 
1877 		if (k.k->type) {
1878 			/*
1879 			 * We process live keys in the alloc btree one at a
1880 			 * time:
1881 			 */
1882 			struct bch_alloc_v4 a_convert;
1883 			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
1884 
1885 			ret =   bch2_bucket_do_index(trans, k, a, true) ?:
1886 				bch2_trans_commit(trans, NULL, NULL,
1887 						  BTREE_INSERT_LAZY_RW|
1888 						  BTREE_INSERT_NOFAIL);
1889 			if (ret)
1890 				goto bkey_err;
1891 
1892 			bch2_btree_iter_advance(&iter);
1893 		} else {
1894 			struct bkey_i *freespace;
1895 
1896 			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
1897 			ret = PTR_ERR_OR_ZERO(freespace);
1898 			if (ret)
1899 				goto bkey_err;
1900 
1901 			bkey_init(&freespace->k);
1902 			freespace->k.type	= KEY_TYPE_set;
1903 			freespace->k.p		= k.k->p;
1904 			freespace->k.size	= k.k->size;
1905 
1906 			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
1907 				bch2_trans_commit(trans, NULL, NULL,
1908 						  BTREE_INSERT_LAZY_RW|
1909 						  BTREE_INSERT_NOFAIL);
1910 			if (ret)
1911 				goto bkey_err;
1912 
1913 			bch2_btree_iter_set_pos(&iter, k.k->p);
1914 		}
1915 bkey_err:
1916 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1917 			continue;
1918 		if (ret)
1919 			break;
1920 	}
1921 
1922 	bch2_trans_iter_exit(trans, &iter);
1923 	bch2_trans_put(trans);
1924 
1925 	if (ret < 0) {
1926 		bch_err_msg(ca, ret, "initializing free space");
1927 		return ret;
1928 	}
1929 
1930 	mutex_lock(&c->sb_lock);
1931 	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
1932 	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
1933 	mutex_unlock(&c->sb_lock);
1934 
1935 	return 0;
1936 }
1937 
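/*
 * Called on every mount: initialize the freespace btree for any member device
 * that doesn't have it yet (e.g. if we crashed partway through device add),
 * then write out the superblock.
 */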
1938 int bch2_fs_freespace_init(struct bch_fs *c)
1939 {
1940 	struct bch_dev *ca;
1941 	unsigned i;
1942 	int ret = 0;
1943 	bool doing_init = false;
1944 
1945 	/*
1946 	 * We can crash during the device add path, so we need to check this on
1947 	 * every mount:
1948 	 */
1949 
1950 	for_each_member_device(ca, c, i) {
1951 		if (ca->mi.freespace_initialized)
1952 			continue;
1953 
1954 		if (!doing_init) {
1955 			bch_info(c, "initializing freespace");
1956 			doing_init = true;
1957 		}
1958 
1959 		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
1960 		if (ret) {
1961 			percpu_ref_put(&ca->ref);
1962 			bch_err_fn(c, ret);
1963 			return ret;
1964 		}
1965 	}
1966 
1967 	if (doing_init) {
1968 		mutex_lock(&c->sb_lock);
1969 		bch2_write_super(c);
1970 		mutex_unlock(&c->sb_lock);
1971 		bch_verbose(c, "done initializing freespace");
1972 	}
1973 
1974 	return 0;
1975 }
1976 
1977 /* Bucket IO clocks: */
1978 
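/*
 * Bump a bucket's last read/write time to the current IO clock; only commits
 * an alloc key update if the stored time actually changed.
 */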
1979 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
1980 			      size_t bucket_nr, int rw)
1981 {
1982 	struct bch_fs *c = trans->c;
1983 	struct btree_iter iter;
1984 	struct bkey_i_alloc_v4 *a;
1985 	u64 now;
1986 	int ret = 0;
1987 
1988 	a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
1989 	ret = PTR_ERR_OR_ZERO(a);
1990 	if (ret)
1991 		return ret;
1992 
1993 	now = atomic64_read(&c->io_clock[rw].now);
1994 	if (a->v.io_time[rw] == now)
1995 		goto out;
1996 
1997 	a->v.io_time[rw] = now;
1998 
1999 	ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
2000 		bch2_trans_commit(trans, NULL, NULL, 0);
2001 out:
2002 	bch2_trans_iter_exit(trans, &iter);
2003 	return ret;
2004 }
2005 
2006 /* Startup/shutdown (ro/rw): */
2007 
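/*
 * Recompute c->capacity from the rw member devices: sum up each device's
 * usable sectors, then subtract the larger of the per-device allocation
 * reserves and the gc reserve. Also updates readahead (from all online
 * members) and bucket_size_max, and wakes anyone waiting on the freelist.
 */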
2008 void bch2_recalc_capacity(struct bch_fs *c)
2009 {
2010 	struct bch_dev *ca;
2011 	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
2012 	unsigned bucket_size_max = 0;
2013 	unsigned long ra_pages = 0;
2014 	unsigned i;
2015 
2016 	lockdep_assert_held(&c->state_lock);
2017 
2018 	for_each_online_member(ca, c, i) {
2019 		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
2020 
2021 		ra_pages += bdi->ra_pages;
2022 	}
2023 
2024 	bch2_set_ra_pages(c, ra_pages);
2025 
2026 	for_each_rw_member(ca, c, i) {
2027 		u64 dev_reserve = 0;
2028 
2029 		/*
2030 		 * We need to reserve buckets (from the number
2031 		 * of currently available buckets) against
2032 		 * foreground writes, mainly so that copygc can
2033 		 * make forward progress.
2034 		 *
2035 		 * We need enough to refill the various reserves
2036 		 * from scratch - copygc will use its entire
2037 		 * reserve all at once, then run again once its
2038 		 * reserve has been refilled (from the formerly
2039 		 * available buckets).
2040 		 *
2041 		 * This reserve is only used when deciding whether
2042 		 * allocations for foreground writes must wait -
2043 		 * it doesn't factor into -ENOSPC calculations.
2044 		 */
2045 
2046 		dev_reserve += ca->nr_btree_reserve * 2;
2047 		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
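		/* e.g. 2^20 buckets -> 2^20 >> 6 = 16384 copygc reserve buckets */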
2048 
2049 		dev_reserve += 1;	/* btree write point */
2050 		dev_reserve += 1;	/* copygc write point */
2051 		dev_reserve += 1;	/* rebalance write point */
2052 
2053 		dev_reserve *= ca->mi.bucket_size;
2054 
2055 		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
2056 					     ca->mi.first_bucket);
2057 
2058 		reserved_sectors += dev_reserve * 2;
2059 
2060 		bucket_size_max = max_t(unsigned, bucket_size_max,
2061 					ca->mi.bucket_size);
2062 	}
2063 
2064 	gc_reserve = c->opts.gc_reserve_bytes
2065 		? c->opts.gc_reserve_bytes >> 9
2066 		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
2067 
2068 	reserved_sectors = max(gc_reserve, reserved_sectors);
2069 
2070 	reserved_sectors = min(reserved_sectors, capacity);
2071 
2072 	c->capacity = capacity - reserved_sectors;
2073 
2074 	c->bucket_size_max = bucket_size_max;
2075 
2076 	/* Wake up in case someone was waiting for buckets */
2077 	closure_wake_up(&c->freelist_wait);
2078 }
2079 
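/* Size, in sectors, of the smallest currently-rw member device: */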
2080 u64 bch2_min_rw_member_capacity(struct bch_fs *c)
2081 {
2082 	struct bch_dev *ca;
2083 	unsigned i;
2084 	u64 ret = U64_MAX;
2085 
2086 	for_each_rw_member(ca, c, i)
2087 		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
2088 	return ret;
2089 }
2090 
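/* True if any valid open bucket not on a partial list still points at @ca: */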
2091 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
2092 {
2093 	struct open_bucket *ob;
2094 	bool ret = false;
2095 
2096 	for (ob = c->open_buckets;
2097 	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
2098 	     ob++) {
2099 		spin_lock(&ob->lock);
2100 		if (ob->valid && !ob->on_partial_list &&
2101 		    ob->dev == ca->dev_idx)
2102 			ret = true;
2103 		spin_unlock(&ob->lock);
2104 	}
2105 
2106 	return ret;
2107 }
2108 
2109 /* device goes ro: */
2110 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
2111 {
2112 	unsigned i;
2113 
2114 	/* First, remove device from allocation groups: */
2115 
2116 	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2117 		clear_bit(ca->dev_idx, c->rw_devs[i].d);
2118 
2119 	/*
2120 	 * Capacity is calculated based on the devices in allocation groups:
2121 	 */
2122 	bch2_recalc_capacity(c);
2123 
2124 	bch2_open_buckets_stop(c, ca, false);
2125 
2126 	/*
2127 	 * Wake up threads that were blocked on allocation, so they can notice
2128 	 * the device is no longer available and the capacity has changed:
2129 	 */
2130 	closure_wake_up(&c->freelist_wait);
2131 
2132 	/*
2133 	 * journal_res_get() can block waiting for free space in the journal -
2134 	 * it needs to notice there may not be devices to allocate from anymore:
2135 	 */
2136 	wake_up(&c->journal.wait);
2137 
2138 	/* Now wait for any in flight writes: */
2139 
2140 	closure_wait_event(&c->open_buckets_wait,
2141 			   !bch2_dev_has_open_write_point(c, ca));
2142 }
2143 
2144 /* device goes rw: */
2145 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
2146 {
2147 	unsigned i;
2148 
2149 	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2150 		if (ca->mi.data_allowed & (1 << i))
2151 			set_bit(ca->dev_idx, c->rw_devs[i].d);
2152 }
2153 
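/* Set up the freelist lock and the discard/invalidate work items: */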
2154 void bch2_fs_allocator_background_init(struct bch_fs *c)
2155 {
2156 	spin_lock_init(&c->freelist_lock);
2157 	INIT_WORK(&c->discard_work, bch2_do_discards_work);
2158 	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
2159 }
2160