// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "trace.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>

static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);

/* Persistent alloc info: */

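/*
 * Byte width of each optional v1 alloc field, generated from the
 * BCH_ALLOC_FIELDS_V1() x-macro; alloc_field_v1_get() below walks a key's
 * packed fields with this table (e.g. a 16 bit field occupies 2 bytes and is
 * read with le16_to_cpup()).
 */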
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
	BCH_ALLOC_FIELDS_V1()
#undef x
};

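/*
 * Common in-memory, unpacked representation of the v1/v2/v3 on-disk alloc
 * formats; the per-field members are expanded from the BCH_ALLOC_FIELDS_V2()
 * x-macro.
 */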
struct bkey_alloc_unpacked {
	u64		journal_seq;
	u8		gen;
	u8		oldest_gen;
	u8		data_type;
	bool		need_discard:1;
	bool		need_inc_gen:1;
#define x(_name, _bits)	u##_bits _name;
	BCH_ALLOC_FIELDS_V2()
#undef x
};

static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
				     const void **p, unsigned field)
{
	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
	u64 v;

	if (!(a->fields & (1 << field)))
		return 0;

	switch (bytes) {
	case 1:
		v = *((const u8 *) *p);
		break;
	case 2:
		v = le16_to_cpup(*p);
		break;
	case 4:
		v = le32_to_cpup(*p);
		break;
	case 8:
		v = le64_to_cpup(*p);
		break;
	default:
		BUG();
	}

	*p += bytes;
	return v;
}

static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
				 struct bkey_s_c k)
{
	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
	const void *d = in->data;
	unsigned idx = 0;

	out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
	BCH_ALLOC_FIELDS_V1()
#undef x
}

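/*
 * v2 and v3 pack their fields as varints; the decode loop below also checks
 * for truncation: if a decoded u64 doesn't survive the round trip through the
 * (possibly narrower) unpacked struct member, the key is corrupt and we fail
 * the unpack rather than silently dropping high bits.
 */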
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen = a.v->gen;
	out->oldest_gen = a.v->oldest_gen;
	out->data_type = a.v->data_type;

#define x(_name, _bits)						\
	if (fieldnr < a.v->nr_fields) {				\
		ret = bch2_varint_decode_fast(in, end, &v);	\
		if (ret < 0)					\
			return ret;				\
		in += ret;					\
	} else {						\
		v = 0;						\
	}							\
	out->_name = v;						\
	if (v != out->_name)					\
		return -1;					\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef x
	return 0;
}

static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen = a.v->gen;
	out->oldest_gen = a.v->oldest_gen;
	out->data_type = a.v->data_type;
	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
	out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)						\
	if (fieldnr < a.v->nr_fields) {				\
		ret = bch2_varint_decode_fast(in, end, &v);	\
		if (ret < 0)					\
			return ret;				\
		in += ret;					\
	} else {						\
		v = 0;						\
	}							\
	out->_name = v;						\
	if (v != out->_name)					\
		return -1;					\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef x
	return 0;
}

static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
	struct bkey_alloc_unpacked ret = { .gen = 0 };

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		bch2_alloc_unpack_v1(&ret, k);
		break;
	case KEY_TYPE_alloc_v2:
		bch2_alloc_unpack_v2(&ret, k);
		break;
	case KEY_TYPE_alloc_v3:
		bch2_alloc_unpack_v3(&ret, k);
		break;
	}

	return ret;
}

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
	unsigned i, bytes = offsetof(struct bch_alloc, data);

	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
		if (a->fields & (1 << i))
			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

	return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bch_validate_flags flags,
			  struct printbuf *err)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
	int ret = 0;

	/* allow for unknown fields */
	bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
			 alloc_v1_val_size_bad,
			 "incorrect value size (%zu < %u)",
			 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
fsck_err:
	return ret;
}

int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bch_validate_flags flags,
			  struct printbuf *err)
{
	struct bkey_alloc_unpacked u;
	int ret = 0;

	bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
			 alloc_v2_unpack_error,
			 "unpack error");
fsck_err:
	return ret;
}

int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bch_validate_flags flags,
			  struct printbuf *err)
{
	struct bkey_alloc_unpacked u;
	int ret = 0;

	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
			 alloc_v2_unpack_error,
			 "unpack error");
fsck_err:
	return ret;
}

int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bch_validate_flags flags, struct printbuf *err)
{
	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
	int ret = 0;

	bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
			 alloc_v4_val_size_bad,
			 "bad val size (%u > %zu)",
			 alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));

	bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
			 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
			 alloc_v4_backpointers_start_bad,
			 "invalid backpointers_start");

	bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
			 alloc_key_data_type_bad,
			 "invalid data type (got %u should be %u)",
			 a.v->data_type, alloc_data_type(*a.v, a.v->data_type));

	switch (a.v->data_type) {
	case BCH_DATA_free:
	case BCH_DATA_need_gc_gens:
	case BCH_DATA_need_discard:
		bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
				 c, err, alloc_key_empty_but_have_data,
				 "empty data type free but have data");
		break;
	case BCH_DATA_sb:
	case BCH_DATA_journal:
	case BCH_DATA_btree:
	case BCH_DATA_user:
	case BCH_DATA_parity:
		bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
				 c, err, alloc_key_dirty_sectors_0,
				 "data_type %s but dirty_sectors==0",
				 bch2_data_type_str(a.v->data_type));
		break;
	case BCH_DATA_cached:
		bkey_fsck_err_on(!a.v->cached_sectors ||
				 bch2_bucket_sectors_dirty(*a.v) ||
				 a.v->stripe,
				 c, err, alloc_key_cached_inconsistency,
				 "data type inconsistency");

		bkey_fsck_err_on(!a.v->io_time[READ] &&
				 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
				 c, err, alloc_key_cached_but_read_time_zero,
				 "cached bucket with read_time == 0");
		break;
	case BCH_DATA_stripe:
		break;
	}
fsck_err:
	return ret;
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
	struct bch_backpointer *bp, *bps;

	a->journal_seq = swab64(a->journal_seq);
	a->flags = swab32(a->flags);
	a->dirty_sectors = swab32(a->dirty_sectors);
	a->cached_sectors = swab32(a->cached_sectors);
	a->io_time[0] = swab64(a->io_time[0]);
	a->io_time[1] = swab64(a->io_time[1]);
	a->stripe = swab32(a->stripe);
	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
	a->fragmentation_lru = swab64(a->fragmentation_lru);

	bps = alloc_v4_backpointers(a);
	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
		bp->bucket_offset = swab40(bp->bucket_offset);
		bp->bucket_len = swab32(bp->bucket_len);
		bch2_bpos_swab(&bp->pos);
	}
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_alloc_v4 _a;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);

	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
	bch2_prt_data_type(out, a->data_type);
	prt_newline(out);
	prt_printf(out, "journal_seq %llu\n", a->journal_seq);
	prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
	prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
	prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
	prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
	prt_printf(out, "stripe %u\n", a->stripe);
	prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
	prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
	prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
	prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru);
	prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
	printbuf_indent_sub(out, 2);
}

void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
	if (k.k->type == KEY_TYPE_alloc_v4) {
		void *src, *dst;

		*out = *bkey_s_c_to_alloc_v4(k).v;

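		/*
		 * Normalize the backpointers region: pull BACKPOINTERS_START
		 * down to BCH_ALLOC_V4_U64s, zeroing any gap left by a value
		 * written with a smaller bch_alloc_v4, and drop the
		 * backpointers themselves, since the unpacked form doesn't
		 * carry them.
		 */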
		src = alloc_v4_backpointers(out);
		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
		dst = alloc_v4_backpointers(out);

		if (src < dst)
			memset(src, 0, dst - src);

		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
	} else {
		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

		*out = (struct bch_alloc_v4) {
			.journal_seq = u.journal_seq,
			.flags = u.need_discard,
			.gen = u.gen,
			.oldest_gen = u.oldest_gen,
			.data_type = u.data_type,
			.stripe_redundancy = u.stripe_redundancy,
			.dirty_sectors = u.dirty_sectors,
			.cached_sectors = u.cached_sectors,
			.io_time[READ] = u.read_time,
			.io_time[WRITE] = u.write_time,
			.stripe = u.stripe,
		};

		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
	}
}

static noinline struct bkey_i_alloc_v4 *
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_i_alloc_v4 *ret;

	ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
	if (IS_ERR(ret))
		return ret;

	if (k.k->type == KEY_TYPE_alloc_v4) {
		void *src, *dst;

		bkey_reassemble(&ret->k_i, k);

		src = alloc_v4_backpointers(&ret->v);
		SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
		dst = alloc_v4_backpointers(&ret->v);

		if (src < dst)
			memset(src, 0, dst - src);

		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
		set_alloc_v4_u64s(ret);
	} else {
		bkey_alloc_v4_init(&ret->k_i);
		ret->k.p = k.k->p;
		bch2_alloc_to_v4(k, &ret->v);
	}
	return ret;
}

static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v4 a;

	if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
	    ((a = bkey_s_c_to_alloc_v4(k), true) &&
	     BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
		return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);

	return __bch2_alloc_to_v4_mut(trans, k);
}

struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	return bch2_alloc_to_v4_mut_inlined(trans, k);
}

struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
				       struct bpos pos)
{
	struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
					       BTREE_ITER_with_updates|
					       BTREE_ITER_cached|
					       BTREE_ITER_intent);
	int ret = bkey_err(k);
	if (unlikely(ret))
		return ERR_PTR(ret);

	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (unlikely(ret))
		goto err;
	return a;
err:
	bch2_trans_iter_exit(trans, iter);
	return ERR_PTR(ret);
}

__flatten
struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
	int ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ERR_PTR(ret);

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
	bch2_trans_iter_exit(trans, &iter);
	return unlikely(ret) ? ERR_PTR(ret) : a;
}
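
/*
 * Usage sketch (hypothetical caller, with dev_idx/bucket_nr as placeholder
 * names; assumes the usual transaction restart/commit loop around it, as in
 * invalidate_one_bucket() below):
 *
 *	struct bkey_i_alloc_v4 *a =
 *		bch2_trans_start_alloc_update(trans, POS(dev_idx, bucket_nr));
 *	int ret = PTR_ERR_OR_ZERO(a);
 *	if (!ret)
 *		a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
 *
 * The returned key has already been handed to bch2_trans_update(), so edits
 * made to *a before the commit are picked up automatically.
 */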

static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
	*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;

	pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
	return pos;
}

static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
{
	pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
	pos.offset += offset;
	return pos;
}
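
/*
 * Worked example (assuming KEY_TYPE_BUCKET_GENS_BITS == 8, i.e. 256 gens per
 * bucket_gens key): alloc key 0:1234 maps to bucket_gens key 0:4, offset 210,
 * since 1234 == 4 * 256 + 210; bucket_gens_pos_to_alloc() is the inverse.
 */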

static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
{
	return k.k->type == KEY_TYPE_bucket_gens
		? bkey_s_c_to_bucket_gens(k).v->gens[offset]
		: 0;
}

int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
			     enum bch_validate_flags flags,
			     struct printbuf *err)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
			 bucket_gens_val_size_bad,
			 "bad val size (%zu != %zu)",
			 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
fsck_err:
	return ret;
}

void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
		if (i)
			prt_char(out, ' ');
		prt_printf(out, "%u", g.v->gens[i]);
	}
}

int bch2_bucket_gens_init(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bkey_i_bucket_gens g;
	bool have_bucket_gens_key = false;
	int ret;

	ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
				 BTREE_ITER_prefetch, k, ({
		/*
		 * Not a fsck error because this is checked/repaired by
		 * bch2_check_alloc_key() which runs later:
		 */
		if (!bch2_dev_bucket_exists(c, k.k->p))
			continue;

		struct bch_alloc_v4 a;
		u8 gen = bch2_alloc_to_v4(k, &a)->gen;
		unsigned offset;
		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
		int ret2 = 0;

		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
			ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
			if (ret2)
				goto iter_err;
			have_bucket_gens_key = false;
		}

		if (!have_bucket_gens_key) {
			bkey_bucket_gens_init(&g.k_i);
			g.k.p = pos;
			have_bucket_gens_key = true;
		}

		g.v.gens[offset] = gen;
iter_err:
		ret2;
	}));

	if (have_bucket_gens_key && !ret)
		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc,
			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));

	bch2_trans_put(trans);

	bch_err_fn(c, ret);
	return ret;
}

int bch2_alloc_read(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_dev *ca = NULL;
	int ret;

	down_read(&c->gc_lock);

	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
		ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
					 BTREE_ITER_prefetch, k, ({
			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;

			if (k.k->type != KEY_TYPE_bucket_gens)
				continue;

			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			/*
			 * Not a fsck error because this is checked/repaired by
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

			const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;

			for (u64 b = max_t(u64, ca->mi.first_bucket, start);
			     b < min_t(u64, ca->mi.nbuckets, end);
			     b++)
				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
			0;
		}));
	} else {
		ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
					 BTREE_ITER_prefetch, k, ({
			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			/*
			 * Not a fsck error because this is checked/repaired by
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

			struct bch_alloc_v4 a;
			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
			0;
		}));
	}

	bch2_dev_put(ca);
	bch2_trans_put(trans);
	up_read(&c->gc_lock);

	bch_err_fn(c, ret);
	return ret;
}

/* Free space/discard btree: */

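/*
 * Buckets in state BCH_DATA_need_discard are mirrored by a KEY_TYPE_set key
 * in the need_discard btree, and BCH_DATA_free buckets by one in the
 * freespace btree, keyed by alloc_freespace_pos() (which folds the bucket's
 * generation bits into the position). bch2_bucket_do_index() adds or removes
 * the index entry for a single alloc key:
 */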
static int bch2_bucket_do_index(struct btree_trans *trans,
				struct bch_dev *ca,
				struct bkey_s_c alloc_k,
				const struct bch_alloc_v4 *a,
				bool set)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c old;
	struct bkey_i *k;
	enum btree_id btree;
	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
	enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (a->data_type != BCH_DATA_free &&
	    a->data_type != BCH_DATA_need_discard)
		return 0;

	k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
	if (IS_ERR(k))
		return PTR_ERR(k);

	bkey_init(&k->k);
	k->k.type = new_type;

	switch (a->data_type) {
	case BCH_DATA_free:
		btree = BTREE_ID_freespace;
		k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
		bch2_key_resize(&k->k, 1);
		break;
	case BCH_DATA_need_discard:
		btree = BTREE_ID_need_discard;
		k->k.p = alloc_k.k->p;
		break;
	default:
		return 0;
	}

	old = bch2_bkey_get_iter(trans, &iter, btree,
				 bkey_start_pos(&k->k),
				 BTREE_ITER_intent);
	ret = bkey_err(old);
	if (ret)
		return ret;

	if (ca->mi.freespace_initialized &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
	    bch2_trans_inconsistent_on(old.k->type != old_type, trans,
			"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
			"  for %s",
			set ? "setting" : "clearing",
			bch2_btree_id_str(btree),
			iter.pos.inode,
			iter.pos.offset,
			bch2_bkey_types[old.k->type],
			bch2_bkey_types[old_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		ret = -EIO;
		goto err;
	}

	ret = bch2_trans_update(trans, &iter, k, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}

static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
					   struct bpos bucket, u8 gen)
{
	struct btree_iter iter;
	unsigned offset;
	struct bpos pos = alloc_gens_pos(bucket, &offset);
	struct bkey_i_bucket_gens *g;
	struct bkey_s_c k;
	int ret;

	g = bch2_trans_kmalloc(trans, sizeof(*g));
	ret = PTR_ERR_OR_ZERO(g);
	if (ret)
		return ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
			       BTREE_ITER_intent|
			       BTREE_ITER_with_updates);
	ret = bkey_err(k);
	if (ret)
		return ret;

	if (k.k->type != KEY_TYPE_bucket_gens) {
		bkey_bucket_gens_init(&g->k_i);
		g->k.p = iter.pos;
	} else {
		bkey_reassemble(&g->k_i, k);
	}

	g->v.gens[offset] = gen;

	ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_trigger_alloc(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
	if (!ca)
		return -EIO;

	struct bch_alloc_v4 old_a_convert;
	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);

	if (flags & BTREE_TRIGGER_transactional) {
		struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;

		alloc_data_type_set(new_a, new_a->data_type);

		if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
			new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
			new_a->io_time[WRITE] = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
			SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
		}

		if (data_type_is_empty(new_a->data_type) &&
		    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
			new_a->gen++;
			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
		}

		if (old_a->data_type != new_a->data_type ||
		    (new_a->data_type == BCH_DATA_free &&
		     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
			ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
				bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
			if (ret)
				goto err;
		}

		if (new_a->data_type == BCH_DATA_cached &&
		    !new_a->io_time[READ])
			new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

		u64 old_lru = alloc_lru_idx_read(*old_a);
		u64 new_lru = alloc_lru_idx_read(*new_a);
		if (old_lru != new_lru) {
			ret = bch2_lru_change(trans, new.k->p.inode,
					      bucket_to_u64(new.k->p),
					      old_lru, new_lru);
			if (ret)
				goto err;
		}

		new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
		if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
			ret = bch2_lru_change(trans,
					      BCH_LRU_FRAGMENTATION_START,
					      bucket_to_u64(new.k->p),
					      old_a->fragmentation_lru, new_a->fragmentation_lru);
			if (ret)
				goto err;
		}

		if (old_a->gen != new_a->gen) {
			ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
			if (ret)
				goto err;
		}

		/*
		 * need to know if we're getting called from the invalidate
		 * path or not:
		 */

		if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
		    old_a->cached_sectors) {
			ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
							      -((s64) old_a->cached_sectors));
			if (ret)
				goto err;
		}
	}

	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
		struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
		u64 journal_seq = trans->journal_res.seq;
		u64 bucket_journal_seq = new_a->journal_seq;

		if ((flags & BTREE_TRIGGER_insert) &&
		    data_type_is_empty(old_a->data_type) !=
		    data_type_is_empty(new_a->data_type) &&
		    new.k->type == KEY_TYPE_alloc_v4) {
			struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;

			/*
			 * If the btree updates referring to a bucket weren't
			 * flushed before the bucket became empty again, then
			 * we don't have to wait on a journal flush before we
			 * can reuse the bucket:
			 */
			v->journal_seq = bucket_journal_seq =
				data_type_is_empty(new_a->data_type) &&
				(journal_seq == v->journal_seq ||
				 bch2_journal_noflush_seq(&c->journal, v->journal_seq))
				? 0 : journal_seq;
		}

		if (!data_type_is_empty(old_a->data_type) &&
		    data_type_is_empty(new_a->data_type) &&
		    bucket_journal_seq) {
			ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
					c->journal.flushed_seq_ondisk,
					new.k->p.inode, new.k->p.offset,
					bucket_journal_seq);
			if (ret) {
				bch2_fs_fatal_error(c,
					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
				goto err;
			}
		}

		percpu_down_read(&c->mark_lock);
		if (new_a->gen != old_a->gen) {
			u8 *gen = bucket_gen(ca, new.k->p.offset);
			if (unlikely(!gen)) {
				percpu_up_read(&c->mark_lock);
				goto invalid_bucket;
			}
			*gen = new_a->gen;
		}

		bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
		percpu_up_read(&c->mark_lock);

#define eval_state(_a, expr)	({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr)	!eval_state(old_a, expr) && eval_state(new_a, expr)
#define bucket_flushed(a)	(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
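
		/*
		 * statechange(expr) is true iff expr went from false in old_a
		 * to true in new_a, e.g. statechange(a->data_type ==
		 * BCH_DATA_free) fires exactly when a bucket becomes free;
		 * bucket_flushed() checks that any journal entry referencing
		 * the bucket has made it to disk.
		 */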

		if (statechange(a->data_type == BCH_DATA_free) &&
		    bucket_flushed(new_a))
			closure_wake_up(&c->freelist_wait);

		if (statechange(a->data_type == BCH_DATA_need_discard) &&
		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
		    bucket_flushed(new_a))
			bch2_discard_one_bucket_fast(c, new.k->p);

		if (statechange(a->data_type == BCH_DATA_cached) &&
		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
		    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
			bch2_do_invalidates(c);

		if (statechange(a->data_type == BCH_DATA_need_gc_gens))
			bch2_gc_gens_async(c);
	}

	if ((flags & BTREE_TRIGGER_gc) &&
	    (flags & BTREE_TRIGGER_bucket_invalidate)) {
		struct bch_alloc_v4 new_a_convert;
		const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);

		percpu_down_read(&c->mark_lock);
		struct bucket *g = gc_bucket(ca, new.k->p.offset);
		if (unlikely(!g)) {
			percpu_up_read(&c->mark_lock);
			goto invalid_bucket;
		}
		g->gen_valid = 1;

		bucket_lock(g);

		g->gen_valid = 1;
		g->gen = new_a->gen;
		g->data_type = new_a->data_type;
		g->stripe = new_a->stripe;
		g->stripe_redundancy = new_a->stripe_redundancy;
		g->dirty_sectors = new_a->dirty_sectors;
		g->cached_sectors = new_a->cached_sectors;

		bucket_unlock(g);
		percpu_up_read(&c->mark_lock);
	}
err:
	printbuf_exit(&buf);
	bch2_dev_put(ca);
	return ret;
invalid_bucket:
	bch2_fs_inconsistent(c, "reference to invalid bucket\n  %s",
			     (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
	ret = -EIO;
	goto err;
}

/*
 * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
 * extents style btrees, but works on non-extents btrees:
 */
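/*
 * Sketch of the behavior: if the btree has keys at offsets 10 and 20 and the
 * iterator is at 11, this returns a zero-type key spanning [11, 20) through
 * *hole instead of jumping ahead to 20. The hole is clamped to the current
 * btree node, and so that its size fits in the 32 bits bch2_key_resize()
 * takes.
 */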
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
{
	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);

	if (bkey_err(k))
		return k;

	if (k.k->type) {
		return k;
	} else {
		struct btree_iter iter2;
		struct bpos next;

		bch2_trans_copy_iter(&iter2, iter);

		struct btree_path *path = btree_iter_path(iter->trans, iter);
		if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
			end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));

		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));

		/*
		 * btree node min/max is a closed interval, upto takes a half
		 * open interval:
		 */
		k = bch2_btree_iter_peek_upto(&iter2, end);
		next = iter2.pos;
		bch2_trans_iter_exit(iter->trans, &iter2);

		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);

		if (bkey_err(k))
			return k;

		bkey_init(hole);
		hole->p = iter->pos;

		bch2_key_resize(hole, next.offset - iter->pos.offset);
		return (struct bkey_s_c) { hole, NULL };
	}
}

static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
{
	if (*ca) {
		if (bucket->offset < (*ca)->mi.first_bucket)
			bucket->offset = (*ca)->mi.first_bucket;

		if (bucket->offset < (*ca)->mi.nbuckets)
			return true;

		bch2_dev_put(*ca);
		*ca = NULL;
		bucket->inode++;
		bucket->offset = 0;
	}

	rcu_read_lock();
	*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
	if (*ca) {
		*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
		bch2_dev_get(*ca);
	}
	rcu_read_unlock();

	return *ca != NULL;
}

static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
							struct bch_dev **ca, struct bkey *hole)
{
	struct bch_fs *c = iter->trans->c;
	struct bkey_s_c k;
again:
	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
	if (bkey_err(k))
		return k;

	*ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);

	if (!k.k->type) {
		struct bpos hole_start = bkey_start_pos(k.k);

		if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
			if (!next_bucket(c, ca, &hole_start))
				return bkey_s_c_null;

			bch2_btree_iter_set_pos(iter, hole_start);
			goto again;
		}

		if (k.k->p.offset > (*ca)->mi.nbuckets)
			bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
	}

	return k;
}

static noinline_for_stack
int bch2_check_alloc_key(struct btree_trans *trans,
			 struct bkey_s_c alloc_k,
			 struct btree_iter *alloc_iter,
			 struct btree_iter *discard_iter,
			 struct btree_iter *freespace_iter,
			 struct btree_iter *bucket_gens_iter)
{
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	unsigned discard_key_type, freespace_key_type;
	unsigned gens_offset;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
	if (fsck_err_on(!ca,
			c, alloc_key_to_missing_dev_bucket,
			"alloc key for invalid device:bucket %llu:%llu",
			alloc_k.k->p.inode, alloc_k.k->p.offset))
		ret = bch2_btree_delete_at(trans, alloc_iter, 0);
	if (!ca)
		return ret;

	if (!ca->mi.freespace_initialized)
		goto out;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
	k = bch2_btree_iter_peek_slot(discard_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != discard_key_type,
			c, need_discard_key_wrong,
			"incorrect key in need_discard btree (got %s should be %s)\n"
			"  %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[discard_key_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type = discard_key_type;
		update->k.p = discard_iter->pos;

		ret = bch2_trans_update(trans, discard_iter, update, 0);
		if (ret)
			goto err;
	}

	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != freespace_key_type,
			c, freespace_key_wrong,
			"incorrect key in freespace btree (got %s should be %s)\n"
			"  %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[freespace_key_type],
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type = freespace_key_type;
		update->k.p = freespace_iter->pos;
		bch2_key_resize(&update->k, 1);

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
		if (ret)
			goto err;
	}

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
			c, bucket_gens_key_wrong,
			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
			"  %s",
			alloc_gen(k, gens_offset), a->gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i_bucket_gens *g =
			bch2_trans_kmalloc(trans, sizeof(*g));

		ret = PTR_ERR_OR_ZERO(g);
		if (ret)
			goto err;

		if (k.k->type == KEY_TYPE_bucket_gens) {
			bkey_reassemble(&g->k_i, k);
		} else {
			bkey_bucket_gens_init(&g->k_i);
			g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
		}

		g->v.gens[gens_offset] = a->gen;

		ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
				    struct bch_dev *ca,
				    struct bpos start,
				    struct bpos *end,
				    struct btree_iter *freespace_iter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (!ca->mi.freespace_initialized)
		return 0;

	bch2_btree_iter_set_pos(freespace_iter, start);

	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	*end = bkey_min(k.k->p, *end);

	if (fsck_err_on(k.k->type != KEY_TYPE_set,
			c, freespace_hole_missing,
			"hole in alloc btree missing in freespace btree\n"
			"  device %llu buckets %llu-%llu",
			freespace_iter->pos.inode,
			freespace_iter->pos.offset,
			end->offset)) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type = KEY_TYPE_set;
		update->k.p = freespace_iter->pos;
		bch2_key_resize(&update->k,
				min_t(u64, U32_MAX, end->offset -
				      freespace_iter->pos.offset));

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
		if (ret)
			goto err;
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
				      struct bpos start,
				      struct bpos *end,
				      struct btree_iter *bucket_gens_iter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	unsigned i, gens_offset, gens_end_offset;
	int ret;

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));

	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
		     alloc_gens_pos(*end, &gens_end_offset)))
		gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;

	if (k.k->type == KEY_TYPE_bucket_gens) {
		struct bkey_i_bucket_gens g;
		bool need_update = false;

		bkey_reassemble(&g.k_i, k);

		for (i = gens_offset; i < gens_end_offset; i++) {
			if (fsck_err_on(g.v.gens[i], c,
					bucket_gens_hole_wrong,
					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
					bucket_gens_pos_to_alloc(k.k->p, i).inode,
					bucket_gens_pos_to_alloc(k.k->p, i).offset,
					g.v.gens[i])) {
				g.v.gens[i] = 0;
				need_update = true;
			}
		}

		if (need_update) {
			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));

			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			memcpy(u, &g, sizeof(g));

			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
			if (ret)
				goto err;
		}
	}

	*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
								struct btree_iter *iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter alloc_iter;
	struct bkey_s_c alloc_k;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	u64 genbits;
	struct bpos pos;
	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
		? BCH_DATA_need_discard
		: BCH_DATA_free;
	struct printbuf buf = PRINTBUF;
	int ret;

	pos = iter->pos;
	pos.offset &= ~(~0ULL << 56);
	genbits = iter->pos.offset & (~0ULL << 56);
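
	/*
	 * Freespace keys encode the bucket in the low 56 bits of the offset
	 * and the bucket's generation bits in the high 8 (see
	 * alloc_freespace_pos()); the masking above recovers the plain bucket
	 * position and the genbits. need_discard keys carry no genbits.
	 */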

	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
			need_discard_freespace_key_to_invalid_dev_bucket,
1316 "entry in %s btree for nonexistant dev:bucket %llu:%llu",
			bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
		goto delete;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (fsck_err_on(a->data_type != state ||
			(state == BCH_DATA_free &&
			 genbits != alloc_freespace_genbits(*a)), c,
			need_discard_freespace_key_bad,
			"%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			bch2_btree_id_str(iter->btree_id),
			iter->pos.inode,
			iter->pos.offset,
			a->data_type == state,
			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
		goto delete;
out:
fsck_err:
	bch2_set_btree_iter_dontneed(&alloc_iter);
	bch2_trans_iter_exit(trans, &alloc_iter);
	printbuf_exit(&buf);
	return ret;
delete:
	ret = bch2_btree_delete_extent_at(trans, iter,
			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
	goto out;
}

/*
 * We've already checked that generation numbers in the bucket_gens btree are
 * valid for buckets that exist; this just checks for keys for nonexistent
 * buckets.
 */
static noinline_for_stack
int bch2_check_bucket_gens_key(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_bucket_gens g;
	u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
	u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
	u64 b;
	bool need_update = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
	bkey_reassemble(&g.k_i, k);

	struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
	if (!ca) {
		if (fsck_err(c, bucket_gens_to_invalid_dev,
			     "bucket_gens key for invalid device:\n  %s",
			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	if (fsck_err_on(end <= ca->mi.first_bucket ||
			start >= ca->mi.nbuckets, c,
			bucket_gens_to_invalid_buckets,
			"bucket_gens key for invalid buckets:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	for (b = start; b < ca->mi.first_bucket; b++)
		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
				bucket_gens_nonzero_for_invalid_buckets,
				"bucket_gens key has nonzero gen for invalid bucket")) {
			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
			need_update = true;
		}

	for (b = ca->mi.nbuckets; b < end; b++)
		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
				bucket_gens_nonzero_for_invalid_buckets,
				"bucket_gens key has nonzero gen for invalid bucket")) {
			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
			need_update = true;
		}

	if (need_update) {
		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));

		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto out;

		memcpy(u, &g, sizeof(g));
		ret = bch2_trans_update(trans, iter, u, 0);
	}
out:
fsck_err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_info(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
	struct bch_dev *ca = NULL;
	struct bkey hole;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
			     BTREE_ITER_prefetch);

	while (1) {
		struct bpos next;

		bch2_trans_begin(trans);

		k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (!k.k)
			break;

		if (k.k->type) {
			next = bpos_nosnap_successor(k.k->p);

			ret = bch2_check_alloc_key(trans,
						   k, &iter,
						   &discard_iter,
						   &freespace_iter,
						   &bucket_gens_iter);
			if (ret)
				goto bkey_err;
		} else {
			next = k.k->p;

			ret = bch2_check_alloc_hole_freespace(trans, ca,
							      bkey_start_pos(k.k),
							      &next,
							      &freespace_iter) ?:
				bch2_check_alloc_hole_bucket_gens(trans,
								  bkey_start_pos(k.k),
								  &next,
								  &bucket_gens_iter);
			if (ret)
				goto bkey_err;
		}

		ret = bch2_trans_commit(trans, NULL, NULL,
					BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto bkey_err;

		bch2_btree_iter_set_pos(&iter, next);
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &bucket_gens_iter);
	bch2_trans_iter_exit(trans, &freespace_iter);
	bch2_trans_iter_exit(trans, &discard_iter);
	bch2_trans_iter_exit(trans, &iter);
	bch2_dev_put(ca);
	ca = NULL;

	if (ret < 0)
		goto err;

	ret = for_each_btree_key(trans, iter,
				 BTREE_ID_need_discard, POS_MIN,
				 BTREE_ITER_prefetch, k,
		bch2_check_discard_freespace_key(trans, &iter));
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
			     BTREE_ITER_prefetch);
	while (1) {
		bch2_trans_begin(trans);
		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k) ?:
			bch2_check_discard_freespace_key(trans, &iter);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			ret = 0;
			continue;
		}
		if (ret) {
			struct printbuf buf = PRINTBUF;
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err(c, "while checking %s", buf.buf);
			printbuf_exit(&buf);
			break;
		}

		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
		goto err;

	ret = for_each_btree_key_commit(trans, iter,
					BTREE_ID_bucket_gens, POS_MIN,
					BTREE_ITER_prefetch, k,
					NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_check_bucket_gens_key(trans, &iter, k));
err:
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
				       struct btree_iter *alloc_iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter lru_iter;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c alloc_k, lru_k;
	struct printbuf buf = PRINTBUF;
	int ret;

	alloc_k = bch2_btree_iter_peek(alloc_iter);
	if (!alloc_k.k)
		return 0;

	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (a->data_type != BCH_DATA_cached)
		return 0;

	if (fsck_err_on(!a->io_time[READ], c,
			alloc_key_cached_but_read_time_zero,
			"cached bucket with read_time 0\n"
			"  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i_alloc_v4 *a_mut =
			bch2_alloc_to_v4_mut(trans, alloc_k);
		ret = PTR_ERR_OR_ZERO(a_mut);
		if (ret)
			goto err;

		a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
		ret = bch2_trans_update(trans, alloc_iter,
					&a_mut->k_i, BTREE_TRIGGER_norun);
		if (ret)
			goto err;

		a = &a_mut->v;
	}

	lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
				   lru_pos(alloc_k.k->p.inode,
					   bucket_to_u64(alloc_k.k->p),
					   a->io_time[READ]), 0);
	ret = bkey_err(lru_k);
	if (ret)
		return ret;

	if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
			alloc_key_to_missing_lru_entry,
			"missing lru entry\n"
			"  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		ret = bch2_lru_set(trans,
				   alloc_k.k->p.inode,
				   bucket_to_u64(alloc_k.k->p),
				   a->io_time[READ]);
		if (ret)
			goto err;
	}
err:
fsck_err:
	bch2_trans_iter_exit(trans, &lru_iter);
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_check_alloc_to_lru_ref(trans, &iter)));
	bch_err_fn(c, ret);
	return ret;
}

static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
{
	int ret;

	mutex_lock(&c->discard_buckets_in_flight_lock);
	darray_for_each(c->discard_buckets_in_flight, i)
		if (bkey_eq(*i, bucket)) {
			ret = -EEXIST;
			goto out;
		}

	ret = darray_push(&c->discard_buckets_in_flight, bucket);
out:
	mutex_unlock(&c->discard_buckets_in_flight_lock);
	return ret;
}

static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
{
	mutex_lock(&c->discard_buckets_in_flight_lock);
	darray_for_each(c->discard_buckets_in_flight, i)
		if (bkey_eq(*i, bucket)) {
			darray_remove_item(&c->discard_buckets_in_flight, i);
			goto found;
		}
	BUG();
found:
	mutex_unlock(&c->discard_buckets_in_flight_lock);
}

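/*
 * Per-run accounting for the discard worker: buckets seen, skipped because
 * they were still open, skipped because they were still waiting on a journal
 * commit, and buckets actually discarded. ca/need_journal_commit_this_dev
 * track the device currently being processed, so discard_buckets_next_dev()
 * can decide whether to kick off an async journal flush when moving on.
 */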
1660 struct discard_buckets_state {
1661 u64 seen;
1662 u64 open;
1663 u64 need_journal_commit;
1664 u64 discarded;
1665 struct bch_dev *ca;
1666 u64 need_journal_commit_this_dev;
1667 };
1668
discard_buckets_next_dev(struct bch_fs * c,struct discard_buckets_state * s,struct bch_dev * ca)1669 static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
1670 {
1671 if (s->ca == ca)
1672 return;
1673
1674 if (s->ca && s->need_journal_commit_this_dev >
1675 bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
1676 bch2_journal_flush_async(&c->journal, NULL);
1677
1678 if (s->ca)
1679 percpu_ref_put(&s->ca->io_ref);
1680 s->ca = ca;
1681 s->need_journal_commit_this_dev = 0;
1682 }
1683
bch2_discard_one_bucket(struct btree_trans * trans,struct btree_iter * need_discard_iter,struct bpos * discard_pos_done,struct discard_buckets_state * s)1684 static int bch2_discard_one_bucket(struct btree_trans *trans,
1685 struct btree_iter *need_discard_iter,
1686 struct bpos *discard_pos_done,
1687 struct discard_buckets_state *s)
1688 {
1689 struct bch_fs *c = trans->c;
1690 struct bpos pos = need_discard_iter->pos;
1691 struct btree_iter iter = { NULL };
1692 struct bkey_s_c k;
1693 struct bkey_i_alloc_v4 *a;
1694 struct printbuf buf = PRINTBUF;
1695 bool discard_locked = false;
1696 int ret = 0;
1697
1698 struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
1699 ? s->ca
1700 : bch2_dev_get_ioref(c, pos.inode, WRITE);
1701 if (!ca) {
1702 bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
1703 return 0;
1704 }
1705
1706 discard_buckets_next_dev(c, s, ca);
1707
1708 if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
1709 s->open++;
1710 goto out;
1711 }
1712
1713 if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
1714 c->journal.flushed_seq_ondisk,
1715 pos.inode, pos.offset)) {
1716 s->need_journal_commit++;
1717 s->need_journal_commit_this_dev++;
1718 goto out;
1719 }
1720
1721 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
1722 need_discard_iter->pos,
1723 BTREE_ITER_cached);
1724 ret = bkey_err(k);
1725 if (ret)
1726 goto out;
1727
1728 a = bch2_alloc_to_v4_mut(trans, k);
1729 ret = PTR_ERR_OR_ZERO(a);
1730 if (ret)
1731 goto out;
1732
1733 if (bch2_bucket_sectors_total(a->v)) {
1734 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1735 trans, "attempting to discard bucket with dirty data\n%s",
1736 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1737 ret = -EIO;
1738 goto out;
1739 }
1740
1741 if (a->v.data_type != BCH_DATA_need_discard) {
1742 if (data_type_is_empty(a->v.data_type) &&
1743 BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
1744 a->v.gen++;
1745 SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1746 goto write;
1747 }
1748
1749 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1750 trans, "bucket incorrectly set in need_discard btree\n"
1751 "%s",
1752 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1753 ret = -EIO;
1754 goto out;
1755 }
1756
1757 if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
1758 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1759 trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
1760 a->v.journal_seq,
1761 c->journal.flushed_seq_ondisk,
1762 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1763 ret = -EIO;
1764 goto out;
1765 }
1766
1767 if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
1768 goto out;
1769
1770 discard_locked = true;
1771
1772 if (!bkey_eq(*discard_pos_done, iter.pos) &&
1773 ca->mi.discard && !c->opts.nochanges) {
1774 /*
1775 * This works without any other locks because this is the only
1776 * thread that removes items from the need_discard tree
1777 */
1778 bch2_trans_unlock_long(trans);
1779 blkdev_issue_discard(ca->disk_sb.bdev,
1780 k.k->p.offset * ca->mi.bucket_size,
1781 ca->mi.bucket_size,
1782 GFP_KERNEL);
1783 *discard_pos_done = iter.pos;
1784
1785 ret = bch2_trans_relock_notrace(trans);
1786 if (ret)
1787 goto out;
1788 }
1789
1790 SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1791 alloc_data_type_set(&a->v, a->v.data_type);
1792 write:
1793 ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
1794 bch2_trans_commit(trans, NULL, NULL,
1795 BCH_WATERMARK_btree|
1796 BCH_TRANS_COMMIT_no_enospc);
1797 if (ret)
1798 goto out;
1799
1800 count_event(c, bucket_discard);
1801 s->discarded++;
1802 out:
1803 if (discard_locked)
1804 discard_in_flight_remove(c, iter.pos);
1805 s->seen++;
1806 bch2_trans_iter_exit(trans, &iter);
1807 printbuf_exit(&buf);
1808 return ret;
1809 }
1810
bch2_do_discards_work(struct work_struct * work)1811 static void bch2_do_discards_work(struct work_struct *work)
1812 {
1813 struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
1814 struct discard_buckets_state s = {};
1815 struct bpos discard_pos_done = POS_MAX;
1816 int ret;
1817
1818 /*
1819 * We're doing the commit in bch2_discard_one_bucket instead of using
1820 * for_each_btree_key_commit() so that we can increment counters after
1821 * successful commit:
1822 */
1823 ret = bch2_trans_run(c,
1824 for_each_btree_key(trans, iter,
1825 BTREE_ID_need_discard, POS_MIN, 0, k,
1826 bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
1827
1828 discard_buckets_next_dev(c, &s, NULL);
1829
1830 trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
1831 bch2_err_str(ret));
1832
1833 bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1834 }
1835
bch2_do_discards(struct bch_fs * c)1836 void bch2_do_discards(struct bch_fs *c)
1837 {
1838 if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
1839 !queue_work(c->write_ref_wq, &c->discard_work))
1840 bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1841 }
1842
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
{
	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
	int ret = bkey_err(k);
	if (ret)
		goto err;

	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto err;

	BUG_ON(a->v.dirty_sectors);
	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
	alloc_data_type_set(&a->v, a->v.data_type);

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

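/*
 * Fast path discard worker: process buckets queued on
 * c->discard_buckets_in_flight, issuing the block layer discard and then
 * clearing need_discard in the alloc btree. The snapshot field of each
 * entry doubles as an "in progress" flag, so a bucket is only picked up
 * once:
 */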
static void bch2_do_discards_fast_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);

	while (1) {
		bool got_bucket = false;
		struct bpos bucket;
		struct bch_dev *ca;

		mutex_lock(&c->discard_buckets_in_flight_lock);
		darray_for_each(c->discard_buckets_in_flight, i) {
			if (i->snapshot)
				continue;

			ca = bch2_dev_get_ioref(c, i->inode, WRITE);
			if (!ca) {
				darray_remove_item(&c->discard_buckets_in_flight, i);
				continue;
			}

			got_bucket = true;
			bucket = *i;
			i->snapshot = true;
			break;
		}
		mutex_unlock(&c->discard_buckets_in_flight_lock);

		if (!got_bucket)
			break;

		if (ca->mi.discard && !c->opts.nochanges)
			blkdev_issue_discard(ca->disk_sb.bdev,
					     bucket.offset * ca->mi.bucket_size,
					     ca->mi.bucket_size,
					     GFP_KERNEL);

		int ret = bch2_trans_do(c, NULL, NULL,
					BCH_WATERMARK_btree|
					BCH_TRANS_COMMIT_no_enospc,
					bch2_clear_bucket_needs_discard(trans, bucket));
		bch_err_fn(c, ret);

		percpu_ref_put(&ca->io_ref);
		discard_in_flight_remove(c, bucket);

		if (ret)
			break;
	}

	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}

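/*
 * Queue a single bucket for the fast path discard worker, unless the
 * device is dying or the bucket is already in flight:
 */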
static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
{
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
	bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
	rcu_read_unlock();

	if (!dead &&
	    !discard_in_flight_add(c, bucket) &&
	    bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
	    !queue_work(c->write_ref_wq, &c->discard_fast_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}

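/*
 * Invalidate a cached data bucket found via the LRU btree: bump the
 * bucket's generation number and zero out its counters so it can be
 * reused. Returns >0 once *nr_to_invalidate is exhausted, which
 * terminates the btree iteration:
 */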
static int invalidate_one_bucket(struct btree_trans *trans,
				 struct btree_iter *lru_iter,
				 struct bkey_s_c lru_k,
				 s64 *nr_to_invalidate)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_alloc_v4 *a = NULL;
	struct printbuf buf = PRINTBUF;
	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
	unsigned cached_sectors;
	int ret = 0;

	if (*nr_to_invalidate <= 0)
		return 1;

	if (!bch2_dev_bucket_exists(c, bucket)) {
		prt_str(&buf, "lru entry points to invalid bucket");
		goto err;
	}

	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
		return 0;

	a = bch2_trans_start_alloc_update(trans, bucket);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	/* We expect harmless races here due to the btree write buffer: */
	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
		goto out;

	BUG_ON(a->v.data_type != BCH_DATA_cached);
	BUG_ON(a->v.dirty_sectors);

	if (!a->v.cached_sectors)
		bch_err(c, "invalidating empty bucket, confused");

	cached_sectors = a->v.cached_sectors;

	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
	a->v.gen++;
	a->v.data_type		= 0;
	a->v.dirty_sectors	= 0;
	a->v.cached_sectors	= 0;
	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);

	ret = bch2_trans_commit(trans, NULL, NULL,
				BCH_WATERMARK_btree|
				BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto out;

	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
	--*nr_to_invalidate;
out:
	printbuf_exit(&buf);
	return ret;
err:
	prt_str(&buf, "\n lru key: ");
	bch2_bkey_val_to_text(&buf, c, lru_k);

	prt_str(&buf, "\n lru entry: ");
	bch2_lru_pos_to_text(&buf, lru_iter->pos);

	prt_str(&buf, "\n alloc key: ");
	if (!a)
		bch2_bpos_to_text(&buf, bucket);
	else
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));

	bch_err(c, "%s", buf.buf);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
		bch2_inconsistent_error(c);
		ret = -EINVAL;
	}

	goto out;
}

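/*
 * Invalidate worker: for each member device, walk its LRU in order and
 * invalidate as many cached buckets as should_invalidate_buckets() asks
 * for. The btree write buffer is flushed first so that the LRU btree is
 * up to date:
 */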
static void bch2_do_invalidates_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;

	ret = bch2_btree_write_buffer_tryflush(trans);
	if (ret)
		goto err;

	for_each_member_device(c, ca) {
		s64 nr_to_invalidate =
			should_invalidate_buckets(ca, bch2_dev_usage_read(ca));

		ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
				lru_pos(ca->dev_idx, 0, 0),
				lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
				BTREE_ITER_intent, k,
			invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));

		if (ret < 0) {
			bch2_dev_put(ca);
			break;
		}
	}
err:
	bch2_trans_put(trans);
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

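/* Kick off the invalidate worker; same write ref pattern as bch2_do_discards(): */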
void bch2_do_invalidates(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
	    !queue_work(c->write_ref_wq, &c->invalidate_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

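/*
 * Scan the alloc btree for buckets [bucket_start, bucket_end) on @ca and
 * populate the freespace/need_discard/need_gc_gens btrees; ranges with
 * no alloc key present go straight into the freespace btree as
 * KEY_TYPE_set extents:
 */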
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			    u64 bucket_start, u64 bucket_end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey hole;
	struct bpos end = POS(ca->dev_idx, bucket_end);
	struct bch_member *m;
	unsigned long last_updated = jiffies;
	int ret;

	BUG_ON(bucket_start > bucket_end);
	BUG_ON(bucket_end > ca->mi.nbuckets);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
		POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
		BTREE_ITER_prefetch);
	/*
	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
	 * freespace/need_discard/need_gc_gens btrees as needed:
	 */
	while (1) {
		if (last_updated + HZ * 10 < jiffies) {
			bch_info(ca, "%s: currently at %llu/%llu",
				 __func__, iter.pos.offset, ca->mi.nbuckets);
			last_updated = jiffies;
		}

		bch2_trans_begin(trans);

		if (bkey_ge(iter.pos, end)) {
			ret = 0;
			break;
		}

		k = bch2_get_key_or_hole(&iter, end, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (k.k->type) {
			/*
			 * We process live keys in the alloc btree one at a
			 * time:
			 */
			struct bch_alloc_v4 a_convert;
			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);

			ret =   bch2_bucket_do_index(trans, ca, k, a, true) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_advance(&iter);
		} else {
			struct bkey_i *freespace;

			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
			ret = PTR_ERR_OR_ZERO(freespace);
			if (ret)
				goto bkey_err;

			bkey_init(&freespace->k);
			freespace->k.type	= KEY_TYPE_set;
			freespace->k.p		= k.k->p;
			freespace->k.size	= k.k->size;

			ret =   bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_set_pos(&iter, k.k->p);
		}
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);

	if (ret < 0) {
		bch_err_msg(ca, ret, "initializing free space");
		return ret;
	}

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
	mutex_unlock(&c->sb_lock);

	return 0;
}

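/*
 * Run freespace init for any device that hasn't completed it, then
 * persist the superblock so it isn't redone on the next mount:
 */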
int bch2_fs_freespace_init(struct bch_fs *c)
{
	int ret = 0;
	bool doing_init = false;

	/*
	 * We can crash during the device add path, so we need to check this on
	 * every mount:
	 */

	for_each_member_device(c, ca) {
		if (ca->mi.freespace_initialized)
			continue;

		if (!doing_init) {
			bch_info(c, "initializing freespace");
			doing_init = true;
		}

		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		if (ret) {
			bch2_dev_put(ca);
			bch_err_fn(c, ret);
			return ret;
		}
	}

	if (doing_init) {
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
		bch_verbose(c, "done initializing freespace");
	}

	return 0;
}

/* Bucket IO clocks: */

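/*
 * Note a read or write to a bucket by setting its io_time[rw] to the
 * current IO clock; no-op if it's already current:
 */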
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	u64 now;
	int ret = 0;

	if (bch2_trans_relock(trans))
		bch2_trans_begin(trans);

	a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	now = atomic64_read(&c->io_clock[rw].now);
	if (a->v.io_time[rw] == now)
		goto out;

	a->v.io_time[rw] = now;

	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Startup/shutdown (ro/rw): */

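/*
 * Recompute c->capacity, the number of sectors available to foreground
 * writes: the summed capacity of all rw members, less per-device
 * reserves for copygc, the btree and internal write points, and at least
 * the configured gc reserve:
 */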
void bch2_recalc_capacity(struct bch_fs *c)
{
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(c, ca) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(c, ca) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run again when
		 * its reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is just used when considering if
		 * allocations for foreground writes must wait -
		 * not -ENOSPC calculations.
		 */

		dev_reserve += ca->nr_btree_reserve * 2;
		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}

	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	/* Wake up in case someone was waiting for buckets */
	closure_wake_up(&c->freelist_wait);
}

u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
	u64 ret = U64_MAX;

	for_each_rw_member(c, ca)
		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
	return ret;
}

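/*
 * Check whether any open bucket (write point) on @ca is still live, so
 * that going read-only can wait for in flight writes to drain:
 */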
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	/*
	 * Capacity is calculated based on devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	bch2_open_buckets_stop(c, ca, false);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be removed and the capacity has changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_fs_allocator_background_exit(struct bch_fs *c)
{
	darray_exit(&c->discard_buckets_in_flight);
}

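/* Initialize the locks and workers used by the discard and invalidate paths: */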
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
	mutex_init(&c->discard_buckets_in_flight_lock);
	INIT_WORK(&c->discard_work, bch2_do_discards_work);
	INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}