// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "trace.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/jiffies.h>

static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);

/* Persistent alloc info: */

static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
	BCH_ALLOC_FIELDS_V1()
#undef x
};

struct bkey_alloc_unpacked {
	u64		journal_seq;
	u8		gen;
	u8		oldest_gen;
	u8		data_type;
	bool		need_discard:1;
	bool		need_inc_gen:1;
#define x(_name, _bits)	u##_bits _name;
	BCH_ALLOC_FIELDS_V2()
#undef x
};

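/*
 * v1 alloc keys pack their fields as fixed-width little-endian integers, with
 * a bitmask in bch_alloc.fields indicating which fields are present; each
 * decode advances *p past the field that was just read.
 */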
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
				     const void **p, unsigned field)
{
	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
	u64 v;

	if (!(a->fields & (1 << field)))
		return 0;

	switch (bytes) {
	case 1:
		v = *((const u8 *) *p);
		break;
	case 2:
		v = le16_to_cpup(*p);
		break;
	case 4:
		v = le32_to_cpup(*p);
		break;
	case 8:
		v = le64_to_cpup(*p);
		break;
	default:
		BUG();
	}

	*p += bytes;
	return v;
}

static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
				 struct bkey_s_c k)
{
	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
	const void *d = in->data;
	unsigned idx = 0;

	out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
	BCH_ALLOC_FIELDS_V1()
#undef x
}

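/*
 * v2/v3 alloc keys switched to varint-encoded fields: nr_fields says how many
 * fields follow, missing trailing fields decode as zero, and the store-back
 * check (v != out->_name) catches values too large for the unpacked type.
 */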
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen	= a.v->gen;
	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;

#define x(_name, _bits)						\
	if (fieldnr < a.v->nr_fields) {				\
		ret = bch2_varint_decode_fast(in, end, &v);	\
		if (ret < 0)					\
			return ret;				\
		in += ret;					\
	} else {						\
		v = 0;						\
	}							\
	out->_name = v;						\
	if (v != out->_name)					\
		return -1;					\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef x
	return 0;
}

static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen	= a.v->gen;
	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;
	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
	out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)						\
	if (fieldnr < a.v->nr_fields) {				\
		ret = bch2_varint_decode_fast(in, end, &v);	\
		if (ret < 0)					\
			return ret;				\
		in += ret;					\
	} else {						\
		v = 0;						\
	}							\
	out->_name = v;						\
	if (v != out->_name)					\
		return -1;					\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef x
	return 0;
}

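/*
 * Unpack any pre-v4 alloc key into the common in-memory representation;
 * fields a given version doesn't carry are left zeroed.
 */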
static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
	struct bkey_alloc_unpacked ret = { .gen = 0 };

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		bch2_alloc_unpack_v1(&ret, k);
		break;
	case KEY_TYPE_alloc_v2:
		bch2_alloc_unpack_v2(&ret, k);
		break;
	case KEY_TYPE_alloc_v3:
		bch2_alloc_unpack_v3(&ret, k);
		break;
	}

	return ret;
}

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
	unsigned i, bytes = offsetof(struct bch_alloc, data);

	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
		if (a->fields & (1 << i))
			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

	return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
	int ret = 0;

	/* allow for unknown fields */
	bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
			 c, alloc_v1_val_size_bad,
			 "incorrect value size (%zu < %u)",
			 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
fsck_err:
	return ret;
}

int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
{
	struct bkey_alloc_unpacked u;
	int ret = 0;

	bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
			 c, alloc_v2_unpack_error,
			 "unpack error");
fsck_err:
	return ret;
}

int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
{
	struct bkey_alloc_unpacked u;
	int ret = 0;

	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
			 c, alloc_v2_unpack_error,
			 "unpack error");
fsck_err:
	return ret;
}

int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
{
	struct bch_alloc_v4 a;
	int ret = 0;

	bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));

	bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
			 c, alloc_v4_val_size_bad,
			 "bad val size (%u > %zu)",
			 alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));

	bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
			 BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
			 c, alloc_v4_backpointers_start_bad,
			 "invalid backpointers_start");

	bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
			 c, alloc_key_data_type_bad,
			 "invalid data type (got %u should be %u)",
			 a.data_type, alloc_data_type(a, a.data_type));

	for (unsigned i = 0; i < 2; i++)
		bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
				 c, alloc_key_io_time_bad,
				 "invalid io_time[%s]: %llu, max %llu",
				 i == READ ? "read" : "write",
				 a.io_time[i], LRU_TIME_MAX);

	unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
		offsetof(struct bch_alloc_v4, stripe_sectors)
		? a.stripe_sectors
		: 0;

	switch (a.data_type) {
	case BCH_DATA_free:
	case BCH_DATA_need_gc_gens:
	case BCH_DATA_need_discard:
		bkey_fsck_err_on(stripe_sectors ||
				 a.dirty_sectors ||
				 a.cached_sectors ||
				 a.stripe,
				 c, alloc_key_empty_but_have_data,
				 "empty data type free but have data %u.%u.%u %u",
				 stripe_sectors,
				 a.dirty_sectors,
				 a.cached_sectors,
				 a.stripe);
		break;
	case BCH_DATA_sb:
	case BCH_DATA_journal:
	case BCH_DATA_btree:
	case BCH_DATA_user:
	case BCH_DATA_parity:
		bkey_fsck_err_on(!a.dirty_sectors &&
				 !stripe_sectors,
				 c, alloc_key_dirty_sectors_0,
				 "data_type %s but dirty_sectors==0",
				 bch2_data_type_str(a.data_type));
		break;
	case BCH_DATA_cached:
		bkey_fsck_err_on(!a.cached_sectors ||
				 a.dirty_sectors ||
				 stripe_sectors ||
				 a.stripe,
				 c, alloc_key_cached_inconsistency,
				 "data type inconsistency");

		bkey_fsck_err_on(!a.io_time[READ] &&
				 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
				 c, alloc_key_cached_but_read_time_zero,
				 "cached bucket with read_time == 0");
		break;
	case BCH_DATA_stripe:
		break;
	}
fsck_err:
	return ret;
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
	struct bch_backpointer *bp, *bps;

	a->journal_seq		= swab64(a->journal_seq);
	a->flags		= swab32(a->flags);
	a->dirty_sectors	= swab32(a->dirty_sectors);
	a->cached_sectors	= swab32(a->cached_sectors);
	a->io_time[0]		= swab64(a->io_time[0]);
	a->io_time[1]		= swab64(a->io_time[1]);
	a->stripe		= swab32(a->stripe);
	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
	a->fragmentation_lru	= swab64(a->fragmentation_lru);
	a->stripe_sectors	= swab32(a->stripe_sectors);

	bps = alloc_v4_backpointers(a);
	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
		bp->bucket_offset	= swab40(bp->bucket_offset);
		bp->bucket_len		= swab32(bp->bucket_len);
		bch2_bpos_swab(&bp->pos);
	}
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_alloc_v4 _a;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);

	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
	bch2_prt_data_type(out, a->data_type);
	prt_newline(out);
	prt_printf(out, "journal_seq %llu\n", a->journal_seq);
	prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
	prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
	prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
	prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
	prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
	prt_printf(out, "stripe %u\n", a->stripe);
	prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
	prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
	prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
	prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru);
	prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
	printbuf_indent_sub(out, 2);
}

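/*
 * Convert any alloc key version to v4: for v4 keys this normalizes
 * backpointers_start, zeroing the gap and dropping any inline backpointers;
 * older versions are unpacked field by field into the v4 layout.
 */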
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
	if (k.k->type == KEY_TYPE_alloc_v4) {
		void *src, *dst;

		*out = *bkey_s_c_to_alloc_v4(k).v;

		src = alloc_v4_backpointers(out);
		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
		dst = alloc_v4_backpointers(out);

		if (src < dst)
			memset(src, 0, dst - src);

		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
	} else {
		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

		*out = (struct bch_alloc_v4) {
			.journal_seq		= u.journal_seq,
			.flags			= u.need_discard,
			.gen			= u.gen,
			.oldest_gen		= u.oldest_gen,
			.data_type		= u.data_type,
			.stripe_redundancy	= u.stripe_redundancy,
			.dirty_sectors		= u.dirty_sectors,
			.cached_sectors		= u.cached_sectors,
			.io_time[READ]		= u.read_time,
			.io_time[WRITE]		= u.write_time,
			.stripe			= u.stripe,
		};

		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
	}
}

static noinline struct bkey_i_alloc_v4 *
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_i_alloc_v4 *ret;

	ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
	if (IS_ERR(ret))
		return ret;

	if (k.k->type == KEY_TYPE_alloc_v4) {
		void *src, *dst;

		bkey_reassemble(&ret->k_i, k);

		src = alloc_v4_backpointers(&ret->v);
		SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
		dst = alloc_v4_backpointers(&ret->v);

		if (src < dst)
			memset(src, 0, dst - src);

		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
		set_alloc_v4_u64s(ret);
	} else {
		bkey_alloc_v4_init(&ret->k_i);
		ret->k.p = k.k->p;
		bch2_alloc_to_v4(k, &ret->v);
	}
	return ret;
}

static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v4 a;

	if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
	    ((a = bkey_s_c_to_alloc_v4(k), true) &&
	     BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
		return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);

	return __bch2_alloc_to_v4_mut(trans, k);
}

struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	return bch2_alloc_to_v4_mut_inlined(trans, k);
}

struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
				       struct bpos pos)
{
	struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
					       BTREE_ITER_with_updates|
					       BTREE_ITER_cached|
					       BTREE_ITER_intent);
	int ret = bkey_err(k);
	if (unlikely(ret))
		return ERR_PTR(ret);

	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (unlikely(ret))
		goto err;
	return a;
err:
	bch2_trans_iter_exit(trans, iter);
	return ERR_PTR(ret);
}

__flatten
struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
						      enum btree_iter_update_trigger_flags flags)
{
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
	int ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ERR_PTR(ret);

	ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
	bch2_trans_iter_exit(trans, &iter);
	return unlikely(ret) ? ERR_PTR(ret) : a;
}

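/*
 * The bucket_gens btree packs many bucket generation numbers into each key;
 * alloc_gens_pos()/bucket_gens_pos_to_alloc() translate between alloc btree
 * positions and (bucket_gens position, offset within key) pairs.
 */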
static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
	*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;

	pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
	return pos;
}

static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
{
	pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
	pos.offset += offset;
	return pos;
}

static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
{
	return k.k->type == KEY_TYPE_bucket_gens
		? bkey_s_c_to_bucket_gens(k).v->gens[offset]
		: 0;
}

int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
			      enum bch_validate_flags flags)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
			 c, bucket_gens_val_size_bad,
			 "bad val size (%zu != %zu)",
			 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
fsck_err:
	return ret;
}

void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
		if (i)
			prt_char(out, ' ');
		prt_printf(out, "%u", g.v->gens[i]);
	}
}

int bch2_bucket_gens_init(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bkey_i_bucket_gens g;
	bool have_bucket_gens_key = false;
	int ret;

	ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
				 BTREE_ITER_prefetch, k, ({
		/*
		 * Not a fsck error because this is checked/repaired by
		 * bch2_check_alloc_key() which runs later:
		 */
		if (!bch2_dev_bucket_exists(c, k.k->p))
			continue;

		struct bch_alloc_v4 a;
		u8 gen = bch2_alloc_to_v4(k, &a)->gen;
		unsigned offset;
		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
		int ret2 = 0;

		if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
			ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
			if (ret2)
				goto iter_err;
			have_bucket_gens_key = false;
		}

		if (!have_bucket_gens_key) {
			bkey_bucket_gens_init(&g.k_i);
			g.k.p = pos;
			have_bucket_gens_key = true;
		}

		g.v.gens[offset] = gen;
iter_err:
		ret2;
	}));

	if (have_bucket_gens_key && !ret)
		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc,
			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));

	bch2_trans_put(trans);

	bch_err_fn(c, ret);
	return ret;
}

int bch2_alloc_read(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_dev *ca = NULL;
	int ret;

	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
		ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
					 BTREE_ITER_prefetch, k, ({
			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;

			if (k.k->type != KEY_TYPE_bucket_gens)
				continue;

			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			/*
			 * Not a fsck error because this is checked/repaired by
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

			const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;

			for (u64 b = max_t(u64, ca->mi.first_bucket, start);
			     b < min_t(u64, ca->mi.nbuckets, end);
			     b++)
				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
			0;
		}));
	} else {
		ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
					 BTREE_ITER_prefetch, k, ({
			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			/*
			 * Not a fsck error because this is checked/repaired by
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

			struct bch_alloc_v4 a;
			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
			0;
		}));
	}

	bch2_dev_put(ca);
	bch2_trans_put(trans);

	bch_err_fn(c, ret);
	return ret;
}

/* Free space/discard btree: */

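/*
 * Maintain the freespace and need_discard btrees as indexes into the alloc
 * btree: a bucket in one of those states gets a KEY_TYPE_set key (whose
 * position, for freespace, also encodes the generation bits), cleared again
 * when the bucket leaves that state.
 */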
static int bch2_bucket_do_index(struct btree_trans *trans,
				struct bch_dev *ca,
				struct bkey_s_c alloc_k,
				const struct bch_alloc_v4 *a,
				bool set)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c old;
	struct bkey_i *k;
	enum btree_id btree;
	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
	enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (a->data_type != BCH_DATA_free &&
	    a->data_type != BCH_DATA_need_discard)
		return 0;

	k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
	if (IS_ERR(k))
		return PTR_ERR(k);

	bkey_init(&k->k);
	k->k.type = new_type;

	switch (a->data_type) {
	case BCH_DATA_free:
		btree = BTREE_ID_freespace;
		k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
		bch2_key_resize(&k->k, 1);
		break;
	case BCH_DATA_need_discard:
		btree = BTREE_ID_need_discard;
		k->k.p = alloc_k.k->p;
		break;
	default:
		return 0;
	}

	old = bch2_bkey_get_iter(trans, &iter, btree,
				 bkey_start_pos(&k->k),
				 BTREE_ITER_intent);
	ret = bkey_err(old);
	if (ret)
		return ret;

	if (ca->mi.freespace_initialized &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
	    bch2_trans_inconsistent_on(old.k->type != old_type, trans,
			"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
			" for %s",
			set ? "setting" : "clearing",
			bch2_btree_id_str(btree),
			iter.pos.inode,
			iter.pos.offset,
			bch2_bkey_types[old.k->type],
			bch2_bkey_types[old_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		ret = -EIO;
		goto err;
	}

	ret = bch2_trans_update(trans, &iter, k, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}

static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
					   struct bpos bucket, u8 gen)
{
	struct btree_iter iter;
	unsigned offset;
	struct bpos pos = alloc_gens_pos(bucket, &offset);
	struct bkey_i_bucket_gens *g;
	struct bkey_s_c k;
	int ret;

	g = bch2_trans_kmalloc(trans, sizeof(*g));
	ret = PTR_ERR_OR_ZERO(g);
	if (ret)
		return ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
			       BTREE_ITER_intent|
			       BTREE_ITER_with_updates);
	ret = bkey_err(k);
	if (ret)
		return ret;

	if (k.k->type != KEY_TYPE_bucket_gens) {
		bkey_bucket_gens_init(&g->k_i);
		g->k.p = iter.pos;
	} else {
		bkey_reassemble(&g->k_i, k);
	}

	g->v.gens[offset] = gen;

	ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
						    enum bch_data_type data_type,
						    s64 delta_buckets,
						    s64 delta_sectors,
						    s64 delta_fragmented, unsigned flags)
{
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_dev_data_type,
		.dev_data_type.dev = ca->dev_idx,
		.dev_data_type.data_type = data_type,
	};
	s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };

	return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
}

int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
				   const struct bch_alloc_v4 *old,
				   const struct bch_alloc_v4 *new,
				   unsigned flags)
{
	s64 old_sectors = bch2_bucket_sectors(*old);
	s64 new_sectors = bch2_bucket_sectors(*new);
	if (old->data_type != new->data_type) {
		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
				 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
			bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
				-1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
		if (ret)
			return ret;
	} else if (old_sectors != new_sectors) {
		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
				0,
				new_sectors - old_sectors,
				bch2_bucket_sectors_fragmented(ca, *new) -
				bch2_bucket_sectors_fragmented(ca, *old), flags);
		if (ret)
			return ret;
	}

	s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
	s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
	if (old_unstriped != new_unstriped) {
		int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
				!!new_unstriped - !!old_unstriped,
				new_unstriped - old_unstriped,
				0,
				flags);
		if (ret)
			return ret;
	}

	return 0;
}

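/*
 * The alloc key trigger: in the transactional phase it keeps the freespace,
 * need_discard, bucket_gens and LRU btrees and the per-device accounting in
 * sync with the new alloc state; in the atomic phase it updates in-memory
 * state and kicks off discards, invalidates and gc_gens as buckets change
 * state.
 */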
int bch2_trigger_alloc(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
	if (!ca)
		return -EIO;

	struct bch_alloc_v4 old_a_convert;
	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);

	struct bch_alloc_v4 *new_a;
	if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
		new_a = bkey_s_to_alloc_v4(new).v;
	} else {
		BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));

		struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
		ret = PTR_ERR_OR_ZERO(new_ka);
		if (unlikely(ret))
			goto err;
		new_a = &new_ka->v;
	}

	if (flags & BTREE_TRIGGER_transactional) {
		alloc_data_type_set(new_a, new_a->data_type);

		if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
			new_a->io_time[READ] = bch2_current_io_time(c, READ);
			new_a->io_time[WRITE] = bch2_current_io_time(c, WRITE);
			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
			SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
		}

		if (data_type_is_empty(new_a->data_type) &&
		    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
			new_a->gen++;
			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
			alloc_data_type_set(new_a, new_a->data_type);
		}

		if (old_a->data_type != new_a->data_type ||
		    (new_a->data_type == BCH_DATA_free &&
		     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
			ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
				bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
			if (ret)
				goto err;
		}

		if (new_a->data_type == BCH_DATA_cached &&
		    !new_a->io_time[READ])
			new_a->io_time[READ] = bch2_current_io_time(c, READ);

		u64 old_lru = alloc_lru_idx_read(*old_a);
		u64 new_lru = alloc_lru_idx_read(*new_a);
		if (old_lru != new_lru) {
			ret = bch2_lru_change(trans, new.k->p.inode,
					      bucket_to_u64(new.k->p),
					      old_lru, new_lru);
			if (ret)
				goto err;
		}

		new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
		if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
			ret = bch2_lru_change(trans,
					      BCH_LRU_FRAGMENTATION_START,
					      bucket_to_u64(new.k->p),
					      old_a->fragmentation_lru, new_a->fragmentation_lru);
			if (ret)
				goto err;
		}

		if (old_a->gen != new_a->gen) {
			ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
			if (ret)
				goto err;
		}

		if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
		    old_a->cached_sectors) {
			ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
							  -((s64) old_a->cached_sectors),
							  flags & BTREE_TRIGGER_gc);
			if (ret)
				goto err;
		}

		ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
		if (ret)
			goto err;
	}

	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
		u64 journal_seq = trans->journal_res.seq;
		u64 bucket_journal_seq = new_a->journal_seq;

		if ((flags & BTREE_TRIGGER_insert) &&
		    data_type_is_empty(old_a->data_type) !=
		    data_type_is_empty(new_a->data_type) &&
		    new.k->type == KEY_TYPE_alloc_v4) {
			struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;

			/*
			 * If the btree updates referring to a bucket weren't flushed
			 * before the bucket became empty again, then we don't have
			 * to wait on a journal flush before we can reuse the bucket:
			 */
			v->journal_seq = bucket_journal_seq =
				data_type_is_empty(new_a->data_type) &&
				(journal_seq == v->journal_seq ||
				 bch2_journal_noflush_seq(&c->journal, v->journal_seq))
				? 0 : journal_seq;
		}

		if (!data_type_is_empty(old_a->data_type) &&
		    data_type_is_empty(new_a->data_type) &&
		    bucket_journal_seq) {
			ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
					c->journal.flushed_seq_ondisk,
					new.k->p.inode, new.k->p.offset,
					bucket_journal_seq);
			if (bch2_fs_fatal_err_on(ret, c,
					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
				goto err;
		}

		if (new_a->gen != old_a->gen) {
			rcu_read_lock();
			u8 *gen = bucket_gen(ca, new.k->p.offset);
			if (unlikely(!gen)) {
				rcu_read_unlock();
				goto invalid_bucket;
			}
			*gen = new_a->gen;
			rcu_read_unlock();
		}

#define eval_state(_a, expr)	({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr)	!eval_state(old_a, expr) && eval_state(new_a, expr)
#define bucket_flushed(a)	(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)

		if (statechange(a->data_type == BCH_DATA_free) &&
		    bucket_flushed(new_a))
			closure_wake_up(&c->freelist_wait);

		if (statechange(a->data_type == BCH_DATA_need_discard) &&
		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
		    bucket_flushed(new_a))
			bch2_discard_one_bucket_fast(ca, new.k->p.offset);

		if (statechange(a->data_type == BCH_DATA_cached) &&
		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
		    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
			bch2_dev_do_invalidates(ca);

		if (statechange(a->data_type == BCH_DATA_need_gc_gens))
			bch2_gc_gens_async(c);
	}

	if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
		rcu_read_lock();
		struct bucket *g = gc_bucket(ca, new.k->p.offset);
		if (unlikely(!g)) {
			rcu_read_unlock();
			goto invalid_bucket;
		}
		g->gen_valid = 1;
		g->gen = new_a->gen;
		rcu_read_unlock();
	}
err:
	printbuf_exit(&buf);
	bch2_dev_put(ca);
	return ret;
invalid_bucket:
	bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
			     (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
	ret = -EIO;
	goto err;
}

/*
 * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
 * extents style btrees, but works on non-extents btrees:
 */
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
{
	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);

	if (bkey_err(k))
		return k;

	if (k.k->type) {
		return k;
	} else {
		struct btree_iter iter2;
		struct bpos next;

		bch2_trans_copy_iter(&iter2, iter);

		struct btree_path *path = btree_iter_path(iter->trans, iter);
		if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
			end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));

		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));

		/*
		 * btree node min/max is a closed interval, upto takes a half
		 * open interval:
		 */
		k = bch2_btree_iter_peek_upto(&iter2, end);
		next = iter2.pos;
		bch2_trans_iter_exit(iter->trans, &iter2);

		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);

		if (bkey_err(k))
			return k;

		bkey_init(hole);
		hole->p = iter->pos;

		bch2_key_resize(hole, next.offset - iter->pos.offset);
		return (struct bkey_s_c) { hole, NULL };
	}
}

static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
{
	if (*ca) {
		if (bucket->offset < (*ca)->mi.first_bucket)
			bucket->offset = (*ca)->mi.first_bucket;

		if (bucket->offset < (*ca)->mi.nbuckets)
			return true;

		bch2_dev_put(*ca);
		*ca = NULL;
		bucket->inode++;
		bucket->offset = 0;
	}

	rcu_read_lock();
	*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
	if (*ca) {
		*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
		bch2_dev_get(*ca);
	}
	rcu_read_unlock();

	return *ca != NULL;
}

static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
							struct bch_dev **ca, struct bkey *hole)
{
	struct bch_fs *c = iter->trans->c;
	struct bkey_s_c k;
again:
	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
	if (bkey_err(k))
		return k;

	*ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);

	if (!k.k->type) {
		struct bpos hole_start = bkey_start_pos(k.k);

		if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
			if (!next_bucket(c, ca, &hole_start))
				return bkey_s_c_null;

			bch2_btree_iter_set_pos(iter, hole_start);
			goto again;
		}

		if (k.k->p.offset > (*ca)->mi.nbuckets)
			bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
	}

	return k;
}

static noinline_for_stack
int bch2_check_alloc_key(struct btree_trans *trans,
			 struct bkey_s_c alloc_k,
			 struct btree_iter *alloc_iter,
			 struct btree_iter *discard_iter,
			 struct btree_iter *freespace_iter,
			 struct btree_iter *bucket_gens_iter)
{
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	unsigned discard_key_type, freespace_key_type;
	unsigned gens_offset;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
	if (fsck_err_on(!ca,
			trans, alloc_key_to_missing_dev_bucket,
			"alloc key for invalid device:bucket %llu:%llu",
			alloc_k.k->p.inode, alloc_k.k->p.offset))
		ret = bch2_btree_delete_at(trans, alloc_iter, 0);
	if (!ca)
		return ret;

	if (!ca->mi.freespace_initialized)
		goto out;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
	k = bch2_btree_iter_peek_slot(discard_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != discard_key_type,
			trans, need_discard_key_wrong,
			"incorrect key in need_discard btree (got %s should be %s)\n"
			" %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[discard_key_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type = discard_key_type;
		update->k.p = discard_iter->pos;

		ret = bch2_trans_update(trans, discard_iter, update, 0);
		if (ret)
			goto err;
	}

	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != freespace_key_type,
			trans, freespace_key_wrong,
			"incorrect key in freespace btree (got %s should be %s)\n"
			" %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[freespace_key_type],
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type = freespace_key_type;
		update->k.p = freespace_iter->pos;
		bch2_key_resize(&update->k, 1);

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
		if (ret)
			goto err;
	}

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
			trans, bucket_gens_key_wrong,
			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
			" %s",
			alloc_gen(k, gens_offset), a->gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i_bucket_gens *g =
			bch2_trans_kmalloc(trans, sizeof(*g));

		ret = PTR_ERR_OR_ZERO(g);
		if (ret)
			goto err;

		if (k.k->type == KEY_TYPE_bucket_gens) {
			bkey_reassemble(&g->k_i, k);
		} else {
			bkey_bucket_gens_init(&g->k_i);
			g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
		}

		g->v.gens[gens_offset] = a->gen;

		ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
				    struct bch_dev *ca,
				    struct bpos start,
				    struct bpos *end,
				    struct btree_iter *freespace_iter)
{
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (!ca->mi.freespace_initialized)
		return 0;

	bch2_btree_iter_set_pos(freespace_iter, start);

	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	*end = bkey_min(k.k->p, *end);

	if (fsck_err_on(k.k->type != KEY_TYPE_set,
			trans, freespace_hole_missing,
			"hole in alloc btree missing in freespace btree\n"
			" device %llu buckets %llu-%llu",
			freespace_iter->pos.inode,
			freespace_iter->pos.offset,
			end->offset)) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type = KEY_TYPE_set;
		update->k.p = freespace_iter->pos;
		bch2_key_resize(&update->k,
				min_t(u64, U32_MAX, end->offset -
				      freespace_iter->pos.offset));

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
		if (ret)
			goto err;
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
				      struct bpos start,
				      struct bpos *end,
				      struct btree_iter *bucket_gens_iter)
{
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	unsigned i, gens_offset, gens_end_offset;
	int ret;

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));

	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
		     alloc_gens_pos(*end, &gens_end_offset)))
		gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;

	if (k.k->type == KEY_TYPE_bucket_gens) {
		struct bkey_i_bucket_gens g;
		bool need_update = false;

		bkey_reassemble(&g.k_i, k);

		for (i = gens_offset; i < gens_end_offset; i++) {
			if (fsck_err_on(g.v.gens[i], trans,
					bucket_gens_hole_wrong,
					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
					bucket_gens_pos_to_alloc(k.k->p, i).inode,
					bucket_gens_pos_to_alloc(k.k->p, i).offset,
					g.v.gens[i])) {
				g.v.gens[i] = 0;
				need_update = true;
			}
		}

		if (need_update) {
			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));

			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			memcpy(u, &g, sizeof(g));

			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
			if (ret)
				goto err;
		}
	}

	*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

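/*
 * Freespace btree keys encode the bucket in the low 56 bits of the offset and
 * the generation bits above them; check that each need_discard/freespace key
 * points at a bucket that exists and is actually in that state.
 */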
static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
							       struct btree_iter *iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter alloc_iter;
	struct bkey_s_c alloc_k;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	u64 genbits;
	struct bpos pos;
	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
		? BCH_DATA_need_discard
		: BCH_DATA_free;
	struct printbuf buf = PRINTBUF;
	int ret;

	pos = iter->pos;
	pos.offset &= ~(~0ULL << 56);
	genbits = iter->pos.offset & (~0ULL << 56);

	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos),
			trans, need_discard_freespace_key_to_invalid_dev_bucket,
			"entry in %s btree for nonexistent dev:bucket %llu:%llu",
			bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
		goto delete;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (fsck_err_on(a->data_type != state ||
			(state == BCH_DATA_free &&
			 genbits != alloc_freespace_genbits(*a)),
			trans, need_discard_freespace_key_bad,
			"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			bch2_btree_id_str(iter->btree_id),
			iter->pos.inode,
			iter->pos.offset,
			a->data_type == state,
			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
		goto delete;
out:
fsck_err:
	bch2_set_btree_iter_dontneed(&alloc_iter);
	bch2_trans_iter_exit(trans, &alloc_iter);
	printbuf_exit(&buf);
	return ret;
delete:
	ret = bch2_btree_delete_extent_at(trans, iter,
			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
	goto out;
}

/*
 * We've already checked that generation numbers in the bucket_gens btree are
 * valid for buckets that exist; this just checks for keys for nonexistent
 * buckets.
 */
static noinline_for_stack
int bch2_check_bucket_gens_key(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_bucket_gens g;
	u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
	u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
	u64 b;
	bool need_update = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
	bkey_reassemble(&g.k_i, k);

	struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
	if (!ca) {
		if (fsck_err(trans, bucket_gens_to_invalid_dev,
			     "bucket_gens key for invalid device:\n %s",
			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	if (fsck_err_on(end <= ca->mi.first_bucket ||
			start >= ca->mi.nbuckets,
			trans, bucket_gens_to_invalid_buckets,
			"bucket_gens key for invalid buckets:\n %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	for (b = start; b < ca->mi.first_bucket; b++)
		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
				trans, bucket_gens_nonzero_for_invalid_buckets,
				"bucket_gens key has nonzero gen for invalid bucket")) {
			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
			need_update = true;
		}

	for (b = ca->mi.nbuckets; b < end; b++)
		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
				trans, bucket_gens_nonzero_for_invalid_buckets,
				"bucket_gens key has nonzero gen for invalid bucket")) {
			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
			need_update = true;
		}

	if (need_update) {
		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));

		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto out;

		memcpy(u, &g, sizeof(g));
		ret = bch2_trans_update(trans, iter, u, 0);
	}
out:
fsck_err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_info(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
	struct bch_dev *ca = NULL;
	struct bkey hole;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
			     BTREE_ITER_prefetch);

	while (1) {
		struct bpos next;

		bch2_trans_begin(trans);

		k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (!k.k)
			break;

		if (k.k->type) {
			next = bpos_nosnap_successor(k.k->p);

			ret = bch2_check_alloc_key(trans,
						   k, &iter,
						   &discard_iter,
						   &freespace_iter,
						   &bucket_gens_iter);
			if (ret)
				goto bkey_err;
		} else {
			next = k.k->p;

			ret = bch2_check_alloc_hole_freespace(trans, ca,
						bkey_start_pos(k.k),
						&next,
						&freespace_iter) ?:
				bch2_check_alloc_hole_bucket_gens(trans,
						bkey_start_pos(k.k),
						&next,
						&bucket_gens_iter);
			if (ret)
				goto bkey_err;
		}

		ret = bch2_trans_commit(trans, NULL, NULL,
					BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto bkey_err;

		bch2_btree_iter_set_pos(&iter, next);
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &bucket_gens_iter);
	bch2_trans_iter_exit(trans, &freespace_iter);
	bch2_trans_iter_exit(trans, &discard_iter);
	bch2_trans_iter_exit(trans, &iter);
	bch2_dev_put(ca);
	ca = NULL;

	if (ret < 0)
		goto err;

	ret = for_each_btree_key(trans, iter,
				 BTREE_ID_need_discard, POS_MIN,
				 BTREE_ITER_prefetch, k,
		bch2_check_discard_freespace_key(trans, &iter));
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
			     BTREE_ITER_prefetch);
	while (1) {
		bch2_trans_begin(trans);
		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k) ?:
			bch2_check_discard_freespace_key(trans, &iter);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			ret = 0;
			continue;
		}
		if (ret) {
			struct printbuf buf = PRINTBUF;
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err(c, "while checking %s", buf.buf);
			printbuf_exit(&buf);
			break;
		}

		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
		goto err;

	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_bucket_gens, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_check_bucket_gens_key(trans, &iter, k));
err:
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
				       struct btree_iter *alloc_iter,
				       struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c alloc_k;
	struct printbuf buf = PRINTBUF;
	int ret;

	alloc_k = bch2_btree_iter_peek(alloc_iter);
	if (!alloc_k.k)
		return 0;

	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (a->fragmentation_lru) {
		ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
					 a->fragmentation_lru,
					 alloc_k, last_flushed);
		if (ret)
			return ret;
	}

	if (a->data_type != BCH_DATA_cached)
		return 0;

	if (fsck_err_on(!a->io_time[READ],
			trans, alloc_key_cached_but_read_time_zero,
			"cached bucket with read_time 0\n"
			" %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i_alloc_v4 *a_mut =
			bch2_alloc_to_v4_mut(trans, alloc_k);
		ret = PTR_ERR_OR_ZERO(a_mut);
		if (ret)
			goto err;

		a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
		ret = bch2_trans_update(trans, alloc_iter,
					&a_mut->k_i, BTREE_TRIGGER_norun);
		if (ret)
			goto err;

		a = &a_mut->v;
	}

	ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
				 alloc_k, last_flushed);
	if (ret)
		goto err;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
	struct bkey_buf last_flushed;

	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));

	bch2_bkey_buf_exit(&last_flushed, c);
	bch_err_fn(c, ret);
	return ret;
}

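/*
 * Per-device tracking of buckets with discards in flight, so the fastpath
 * (bch2_discard_one_bucket_fast) and the main discard worker never process
 * the same bucket twice concurrently.
 */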
static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
	int ret;

	mutex_lock(&ca->discard_buckets_in_flight_lock);
	darray_for_each(ca->discard_buckets_in_flight, i)
		if (i->bucket == bucket) {
			ret = -BCH_ERR_EEXIST_discard_in_flight_add;
			goto out;
		}

	ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
			   .in_progress = in_progress,
			   .bucket	= bucket,
	}));
out:
	mutex_unlock(&ca->discard_buckets_in_flight_lock);
	return ret;
}

static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
	mutex_lock(&ca->discard_buckets_in_flight_lock);
	darray_for_each(ca->discard_buckets_in_flight, i)
		if (i->bucket == bucket) {
			BUG_ON(!i->in_progress);
			darray_remove_item(&ca->discard_buckets_in_flight, i);
			goto found;
		}
	BUG();
found:
	mutex_unlock(&ca->discard_buckets_in_flight_lock);
}

struct discard_buckets_state {
	u64		seen;
	u64		open;
	u64		need_journal_commit;
	u64		discarded;
	u64		need_journal_commit_this_dev;
};

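/*
 * Process one entry from the need_discard btree: skip buckets that are still
 * open or awaiting a journal flush, issue the discard with the transaction
 * unlocked, then clear the need_discard flag and commit.
 */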
static int bch2_discard_one_bucket(struct btree_trans *trans,
				   struct bch_dev *ca,
				   struct btree_iter *need_discard_iter,
				   struct bpos *discard_pos_done,
				   struct discard_buckets_state *s)
{
	struct bch_fs *c = trans->c;
	struct bpos pos = need_discard_iter->pos;
	struct btree_iter iter = { NULL };
	struct bkey_s_c k;
	struct bkey_i_alloc_v4 *a;
	struct printbuf buf = PRINTBUF;
	bool discard_locked = false;
	int ret = 0;

	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
		s->open++;
		goto out;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
					     c->journal.flushed_seq_ondisk,
					     pos.inode, pos.offset)) {
		s->need_journal_commit++;
		s->need_journal_commit_this_dev++;
		goto out;
	}

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
			       need_discard_iter->pos,
			       BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret)
		goto out;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	if (bch2_bucket_sectors_total(a->v)) {
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "attempting to discard bucket with dirty data\n%s",
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (a->v.data_type != BCH_DATA_need_discard) {
		if (data_type_is_empty(a->v.data_type) &&
		    BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
			a->v.gen++;
			SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
			goto write;
		}

		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "bucket incorrectly set in need_discard btree\n"
					       "%s",
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
					       a->v.journal_seq,
					       c->journal.flushed_seq_ondisk,
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (discard_in_flight_add(ca, iter.pos.offset, true))
		goto out;

	discard_locked = true;

	if (!bkey_eq(*discard_pos_done, iter.pos) &&
	    ca->mi.discard && !c->opts.nochanges) {
		/*
		 * This works without any other locks because this is the only
		 * thread that removes items from the need_discard tree
		 */
		bch2_trans_unlock_long(trans);
		blkdev_issue_discard(ca->disk_sb.bdev,
				     k.k->p.offset * ca->mi.bucket_size,
				     ca->mi.bucket_size,
				     GFP_KERNEL);
		*discard_pos_done = iter.pos;

		ret = bch2_trans_relock_notrace(trans);
		if (ret)
			goto out;
	}

	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
write:
	alloc_data_type_set(&a->v, a->v.data_type);

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_WATERMARK_btree|
				  BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto out;

	count_event(c, bucket_discard);
	s->discarded++;
out:
	if (discard_locked)
		discard_in_flight_remove(ca, iter.pos.offset);
	s->seen++;
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}

static void bch2_do_discards_work(struct work_struct *work)
{
	struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
	struct bch_fs *c = ca->fs;
	struct discard_buckets_state s = {};
	struct bpos discard_pos_done = POS_MAX;
	int ret;

	/*
	 * We're doing the commit in bch2_discard_one_bucket instead of using
	 * for_each_btree_key_commit() so that we can increment counters after
	 * successful commit:
	 */
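	/*
	 * (A rough sketch of the pattern being avoided, for comparison -
	 * for_each_btree_key_commit() commits on our behalf after each
	 * iteration, before we can tell whether the commit stuck:
	 *
	 *	for_each_btree_key_commit(trans, iter, BTREE_ID_need_discard,
	 *			POS(ca->dev_idx, 0), 0, k,
	 *			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
	 *			bch2_discard_one_bucket(...))
	 *
	 * Here, by contrast, s->discarded is only bumped once the commit has
	 * succeeded.)
	 */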
	ret = bch2_trans_run(c,
		for_each_btree_key_upto(trans, iter,
				BTREE_ID_need_discard,
				POS(ca->dev_idx, 0),
				POS(ca->dev_idx, U64_MAX), 0, k,
			bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));

	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
			      bch2_err_str(ret));

	percpu_ref_put(&ca->io_ref);
	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}

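/*
 * Queueing discard work requires holding both a filesystem write ref and the
 * device's io ref; each acquisition is unwound if the next step fails, and
 * the work item itself drops both refs when it finishes.
 */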
void bch2_dev_do_discards(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
		return;

	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
		goto put_write_ref;

	if (queue_work(c->write_ref_wq, &ca->discard_work))
		return;

	percpu_ref_put(&ca->io_ref);
put_write_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}

void bch2_do_discards(struct bch_fs *c)
{
	for_each_member_device(c, ca)
		bch2_dev_do_discards(ca);
}

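/*
 * Fast-path helper for bch2_do_discards_fast_work(): clears need_discard on a
 * bucket that was discarded directly, without walking the need_discard btree.
 */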
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
{
	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
	int ret = bkey_err(k);
	if (ret)
		goto err;

	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto err;

	BUG_ON(a->v.dirty_sectors);
	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
	alloc_data_type_set(&a->v, a->v.data_type);

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

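/*
 * Drain ca->discard_buckets_in_flight: repeatedly claim the first entry not
 * already in progress, issue the discard outside the lock, then clear the
 * bucket's need_discard flag and remove the entry from the list.
 */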
static void bch2_do_discards_fast_work(struct work_struct *work)
{
	struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
	struct bch_fs *c = ca->fs;

	while (1) {
		bool got_bucket = false;
		u64 bucket;

		mutex_lock(&ca->discard_buckets_in_flight_lock);
		darray_for_each(ca->discard_buckets_in_flight, i) {
			if (i->in_progress)
				continue;

			got_bucket = true;
			bucket = i->bucket;
			i->in_progress = true;
			break;
		}
		mutex_unlock(&ca->discard_buckets_in_flight_lock);

		if (!got_bucket)
			break;

		if (ca->mi.discard && !c->opts.nochanges)
			blkdev_issue_discard(ca->disk_sb.bdev,
					     bucket_to_sector(ca, bucket),
					     ca->mi.bucket_size,
					     GFP_KERNEL);

		int ret = bch2_trans_do(c, NULL, NULL,
			BCH_WATERMARK_btree|
			BCH_TRANS_COMMIT_no_enospc,
			bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
		bch_err_fn(c, ret);

		discard_in_flight_remove(ca, bucket);

		if (ret)
			break;
	}

	percpu_ref_put(&ca->io_ref);
	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}

static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
{
	struct bch_fs *c = ca->fs;

	if (discard_in_flight_add(ca, bucket, false))
		return;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
		return;

	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
		goto put_ref;

	if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
		return;

	percpu_ref_put(&ca->io_ref);
put_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}

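/*
 * Invalidate a cached-data bucket found via the LRU btree: bump the gen (so
 * existing cached pointers to it no longer match) and zero out the bucket's
 * counters, making it available for reuse.
 */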
static int invalidate_one_bucket(struct btree_trans *trans,
				 struct btree_iter *lru_iter,
				 struct bkey_s_c lru_k,
				 s64 *nr_to_invalidate)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_alloc_v4 *a = NULL;
	struct printbuf buf = PRINTBUF;
	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
	unsigned cached_sectors;
	int ret = 0;

	if (*nr_to_invalidate <= 0)
		return 1;

	if (!bch2_dev_bucket_exists(c, bucket)) {
		prt_str(&buf, "lru entry points to invalid bucket");
		goto err;
	}

	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
		return 0;

	a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	/* We expect harmless races here due to the btree write buffer: */
	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
		goto out;

	BUG_ON(a->v.data_type != BCH_DATA_cached);
	BUG_ON(a->v.dirty_sectors);

	if (!a->v.cached_sectors)
		bch_err(c, "invalidating empty bucket, confused");

	cached_sectors = a->v.cached_sectors;

	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
	a->v.gen++;
	a->v.data_type		= 0;
	a->v.dirty_sectors	= 0;
	a->v.stripe_sectors	= 0;
	a->v.cached_sectors	= 0;
	a->v.io_time[READ]	= bch2_current_io_time(c, READ);
	a->v.io_time[WRITE]	= bch2_current_io_time(c, WRITE);

	ret = bch2_trans_commit(trans, NULL, NULL,
				BCH_WATERMARK_btree|
				BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto out;

	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
	--*nr_to_invalidate;
out:
	printbuf_exit(&buf);
	return ret;
err:
	prt_str(&buf, "\n  lru key: ");
	bch2_bkey_val_to_text(&buf, c, lru_k);

	prt_str(&buf, "\n  lru entry: ");
	bch2_lru_pos_to_text(&buf, lru_iter->pos);

	prt_str(&buf, "\n  alloc key: ");
	if (!a)
		bch2_bpos_to_text(&buf, bucket);
	else
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));

	bch_err(c, "%s", buf.buf);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
		bch2_inconsistent_error(c);
		ret = -EINVAL;
	}

	goto out;
}

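/*
 * Peek the next LRU entry for this device, wrapping around to the start of
 * the device's LRU keyspace once, so that a single pass visits every entry
 * regardless of where the scan began.
 */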
static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
				    struct bch_dev *ca, bool *wrapped)
{
	struct bkey_s_c k;
again:
	k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
	if (!k.k && !*wrapped) {
		bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
		*wrapped = true;
		goto again;
	}

	return k;
}

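/*
 * LRU updates go through the btree write buffer, so flush it before scanning;
 * otherwise we would be walking stale LRU entries (see the harmless-race
 * check in invalidate_one_bucket()).
 */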
static void bch2_do_invalidates_work(struct work_struct *work)
{
	struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
	struct bch_fs *c = ca->fs;
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;

	ret = bch2_btree_write_buffer_tryflush(trans);
	if (ret)
		goto err;

	s64 nr_to_invalidate =
		should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
	struct btree_iter iter;
	bool wrapped = false;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
			     lru_pos(ca->dev_idx, 0,
				     ((bch2_current_io_time(c, READ) + U32_MAX) &
				      LRU_TIME_MAX)), 0);

	while (true) {
		bch2_trans_begin(trans);

		struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
		if (!k.k)
			break;

		ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
		if (ret)
			break;

		bch2_btree_iter_advance(&iter);
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	bch2_trans_put(trans);
	percpu_ref_put(&ca->io_ref);
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

void bch2_dev_do_invalidates(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
		return;

	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
		goto put_ref;

	if (queue_work(c->write_ref_wq, &ca->invalidate_work))
		return;

	percpu_ref_put(&ca->io_ref);
put_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

void bch2_do_invalidates(struct bch_fs *c)
{
	for_each_member_device(c, ca)
		bch2_dev_do_invalidates(ca);
}

int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			    u64 bucket_start, u64 bucket_end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey hole;
	struct bpos end = POS(ca->dev_idx, bucket_end);
	struct bch_member *m;
	unsigned long last_updated = jiffies;
	int ret;

	BUG_ON(bucket_start > bucket_end);
	BUG_ON(bucket_end > ca->mi.nbuckets);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
			     BTREE_ITER_prefetch);
	/*
	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
	 * freespace/need_discard/need_gc_gens btrees as needed:
	 */
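	/*
	 * Ranges with no alloc keys are covered by a single KEY_TYPE_set
	 * extent in the freespace btree below, whose size spans the whole run
	 * of empty buckets - so a hole doesn't cost one insert per bucket.
	 */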
	while (1) {
		if (time_after(jiffies, last_updated + HZ * 10)) {
			bch_info(ca, "%s: currently at %llu/%llu",
				 __func__, iter.pos.offset, ca->mi.nbuckets);
			last_updated = jiffies;
		}

		bch2_trans_begin(trans);

		if (bkey_ge(iter.pos, end)) {
			ret = 0;
			break;
		}

		k = bch2_get_key_or_hole(&iter, end, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (k.k->type) {
			/*
			 * We process live keys in the alloc btree one at a
			 * time:
			 */
			struct bch_alloc_v4 a_convert;
			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);

			ret =   bch2_bucket_do_index(trans, ca, k, a, true) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_advance(&iter);
		} else {
			struct bkey_i *freespace;

			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
			ret = PTR_ERR_OR_ZERO(freespace);
			if (ret)
				goto bkey_err;

			bkey_init(&freespace->k);
			freespace->k.type	= KEY_TYPE_set;
			freespace->k.p		= k.k->p;
			freespace->k.size	= k.k->size;

			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_set_pos(&iter, k.k->p);
		}
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);

	if (ret < 0) {
		bch_err_msg(ca, ret, "initializing free space");
		return ret;
	}

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_fs_freespace_init(struct bch_fs *c)
{
	int ret = 0;
	bool doing_init = false;

	/*
	 * We can crash during the device add path, so we need to check this on
	 * every mount:
	 */

	for_each_member_device(c, ca) {
		if (ca->mi.freespace_initialized)
			continue;

		if (!doing_init) {
			bch_info(c, "initializing freespace");
			doing_init = true;
		}

		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		if (ret) {
			bch2_dev_put(ca);
			bch_err_fn(c, ret);
			return ret;
		}
	}

	if (doing_init) {
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
		bch_verbose(c, "done initializing freespace");
	}

	return 0;
}

/* device removal */

int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	struct bpos start	= POS(ca->dev_idx, 0);
	struct bpos end		= POS(ca->dev_idx, U64_MAX);
	int ret;

	/*
	 * We clear the LRU and need_discard btrees first so that we don't race
	 * with bch2_do_invalidates() and bch2_do_discards()
	 */
	ret =   bch2_dev_remove_stripes(c, ca->dev_idx) ?:
		bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_dev_usage_remove(c, ca->dev_idx);
	bch_err_msg(ca, ret, "removing dev alloc info");
	return ret;
}

/* Bucket IO clocks: */

int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	u64 now;
	int ret = 0;

	if (bch2_trans_relock(trans))
		bch2_trans_begin(trans);

	a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	now = bch2_current_io_time(c, rw);
	if (a->v.io_time[rw] == now)
		goto out;

	a->v.io_time[rw] = now;

	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Startup/shutdown (ro/rw): */

void bch2_recalc_capacity(struct bch_fs *c)
{
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(c, ca) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(c, ca) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run again when
		 * its reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is only used when considering if
		 * allocations for foreground writes must wait -
		 * not for -ENOSPC calculations.
		 */

		dev_reserve += ca->nr_btree_reserve * 2;
		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}
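
	/*
	 * So each rw device contributes
	 * (2 * nr_btree_reserve + nbuckets / 64 + 3) * bucket_size sectors,
	 * doubled, to reserved_sectors; e.g. (illustrative numbers only) a
	 * device with 2^20 buckets reserves at least 2^14 buckets for copygc
	 * alone.
	 */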

	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->reserved = reserved_sectors;
	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	/* Wake up in case someone was waiting for buckets */
	closure_wake_up(&c->freelist_wait);
}

u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
	u64 ret = U64_MAX;

	for_each_rw_member(c, ca)
		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
	return ret;
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	/* First, remove device from allocation groups: */

	for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	c->rw_devs_change_count++;

	/*
	 * Capacity is calculated based on the devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	bch2_open_buckets_stop(c, ca, false);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be removed and the capacity has changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);

	c->rw_devs_change_count++;
}

void bch2_dev_allocator_background_exit(struct bch_dev *ca)
{
	darray_exit(&ca->discard_buckets_in_flight);
}

void bch2_dev_allocator_background_init(struct bch_dev *ca)
{
	mutex_init(&ca->discard_buckets_in_flight_lock);
	INIT_WORK(&ca->discard_work, bch2_do_discards_work);
	INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
	INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
}
