// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

#include <linux/sort.h>

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
	size_t size = (size_t) priv;
	return memcmp(l, r, size);
}
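
/*
 * Example (from the callers below): bch2_memcmp() is handed to sort_r() and
 * eytzinger0_sort_r() as the comparison callback, with the element size
 * smuggled through the priv pointer:
 *
 *	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
 *			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
 */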

/* Replicas tracking - in memory: */

static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	BUG_ON(e->data_type >= BCH_DATA_NR);
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

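	/* devs[] must be sorted, with no duplicate devices: */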
	for (unsigned i = 0; i + 1 < e->nr_devs; i++)
		BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

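/*
 * The in-memory replicas table is kept in eytzinger order, so that
 * __replicas_entry_idx() below can look entries up with eytzinger0_find():
 */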
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
					   struct bch_replicas_entry_v0 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u [", e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

void bch2_replicas_entry_to_text(struct printbuf *out,
				 struct bch_replicas_entry_v1 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
				 struct bch_sb *sb,
				 struct printbuf *err)
{
	if (!r->nr_devs) {
		prt_printf(err, "no devices in entry ");
		goto bad;
	}

	if (r->nr_required > 1 &&
	    r->nr_required >= r->nr_devs) {
		prt_printf(err, "bad nr_required in entry ");
		goto bad;
	}

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (!bch2_member_exists(sb, r->devs[i])) {
			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
			goto bad;
		}

	return 0;
bad:
	bch2_replicas_entry_to_text(err, r);
	return -BCH_ERR_invalid_replicas_entry;
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
			       struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_cpu_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
}

static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	r->nr_required = 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (!p.has_ec)
			r->devs[r->nr_devs++] = p.ptr.dev;
		else
			r->nr_required = 0;
	}
}

static void stripe_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	const struct bch_extent_ptr *ptr;

	r->nr_required = s.v->nr_blocks - s.v->nr_redundant;

	for (ptr = s.v->ptrs;
	     ptr < s.v->ptrs + s.v->nr_blocks;
	     ptr++)
		r->devs[r->nr_devs++] = ptr->dev;
}

void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
			   struct bkey_s_c k)
{
	e->nr_devs = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
		e->data_type = BCH_DATA_btree;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		e->data_type = BCH_DATA_user;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_stripe:
		e->data_type = BCH_DATA_parity;
		stripe_to_replicas(k, e);
		break;
	}

	bch2_replicas_entry_sort(e);
}
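
/*
 * Example (a sketch; assumes struct bch_replicas_padded from
 * replicas_types.h as scratch space): computing the replicas entry for a
 * key and making sure it's tracked:
 *
 *	struct bch_replicas_padded r;
 *
 *	bch2_bkey_to_replicas(&r.e, k);
 *	ret = bch2_mark_replicas(c, &r.e);
 */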

void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
			      enum bch_data_type data_type,
			      struct bch_devs_list devs)
{
	BUG_ON(!data_type ||
	       data_type == BCH_DATA_sb ||
	       data_type >= BCH_DATA_NR);

	e->data_type	= data_type;
	e->nr_devs	= 0;
	e->nr_required	= 1;

	darray_for_each(devs, i)
		e->devs[e->nr_devs++] = *i;

	bch2_replicas_entry_sort(e);
}

static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
		       struct bch_replicas_cpu *old,
		       struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new = {
		.nr		= old->nr + 1,
		.entry_size	= max_t(unsigned, old->entry_size,
					replicas_entry_bytes(new_entry)),
	};

	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
	if (!new.entries)
		return new;

	for (unsigned i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(&new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(&new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(&new);
	return new;
}

static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
				       struct bch_replicas_entry_v1 *search)
{
	int idx, entry_size = replicas_entry_bytes(search);

	if (unlikely(entry_size > r->entry_size))
		return -1;

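	/*
	 * Comparing only replicas_entry_bytes(search) bytes is safe even
	 * though table entries may be bigger: entries are zero padded out to
	 * entry_size, and nr_devs lives in the compared prefix, so a longer
	 * entry can never compare equal to a shorter search key.
	 */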
#define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp

	return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *search)
{
	bch2_replicas_entry_sort(search);

	return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry_v1 *search)
{
	return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked(struct bch_fs *c,
			  struct bch_replicas_entry_v1 *search)
{
	bool marked;

	if (!search->nr_devs)
		return true;

	verify_replicas_entry(search);

	percpu_down_read(&c->mark_lock);
	marked = __replicas_has_entry(&c->replicas, search) &&
		(likely(!c->replicas_gc.entries) ||
		 __replicas_has_entry(&c->replicas_gc, search));
	percpu_up_read(&c->mark_lock);

	return marked;
}

static void __replicas_table_update(struct bch_fs_usage *dst,
				    struct bch_replicas_cpu *dst_r,
				    struct bch_fs_usage *src,
				    struct bch_replicas_cpu *src_r)
{
	int src_idx, dst_idx;

	*dst = *src;

	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
		if (!src->replicas[src_idx])
			continue;

		dst_idx = __replicas_entry_idx(dst_r,
				cpu_replicas_entry(src_r, src_idx));
		BUG_ON(dst_idx < 0);

		dst->replicas[dst_idx] = src->replicas[src_idx];
	}
}

static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
					 struct bch_replicas_cpu *dst_r,
					 struct bch_fs_usage __percpu *src_p,
					 struct bch_replicas_cpu *src_r)
{
	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
	struct bch_fs_usage *dst, *src = (void *)
		bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);

	preempt_disable();
	dst = this_cpu_ptr(dst_p);
	preempt_enable();

	__replicas_table_update(dst, dst_r, src, src_r);
}

/*
 * Resize filesystem accounting: reallocate each of the usage arrays for the
 * new number of replicas entries, then migrate the old counters over by
 * looking each old entry up in the new table:
 */
static int replicas_table_update(struct bch_fs *c,
				 struct bch_replicas_cpu *new_r)
{
	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
	struct bch_fs_usage_online *new_scratch = NULL;
	struct bch_fs_usage __percpu *new_gc = NULL;
	struct bch_fs_usage *new_base = NULL;
	unsigned i, bytes = sizeof(struct bch_fs_usage) +
		sizeof(u64) * new_r->nr;
	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
		sizeof(u64) * new_r->nr;
	int ret = 0;

	memset(new_usage, 0, sizeof(new_usage));

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
					sizeof(u64), GFP_KERNEL)))
			goto err;

	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
	    !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
	    (c->usage_gc &&
	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
		goto err;

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		if (c->usage[i])
			__replicas_table_update_pcpu(new_usage[i], new_r,
						     c->usage[i], &c->replicas);
	if (c->usage_base)
		__replicas_table_update(new_base, new_r,
					c->usage_base, &c->replicas);
	if (c->usage_gc)
		__replicas_table_update_pcpu(new_gc, new_r,
					     c->usage_gc, &c->replicas);

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		swap(c->usage[i], new_usage[i]);
	swap(c->usage_base,	new_base);
	swap(c->usage_scratch,	new_scratch);
	swap(c->usage_gc,	new_gc);
	swap(c->replicas,	*new_r);
out:
	free_percpu(new_gc);
	kfree(new_scratch);
	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		free_percpu(new_usage[i]);
	kfree(new_base);
	return ret;
err:
	bch_err(c, "error updating replicas table: memory allocation failure");
	ret = -BCH_ERR_ENOMEM_replicas_table;
	goto out;
}

static unsigned reserve_journal_replicas(struct bch_fs *c,
					 struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	unsigned journal_res_u64s = 0;

	/* nr_inodes: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* key_version: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* persistent_reserved: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
		BCH_REPLICAS_MAX;

	for_each_cpu_replicas_entry(r, e)
		journal_res_u64s +=
			DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
				     e->nr_devs, sizeof(u64));
	return journal_res_u64s;
}

noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				       struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new_r, new_gc;
	int ret = 0;

	verify_replicas_entry(new_entry);

	memset(&new_r, 0, sizeof(new_r));
	memset(&new_gc, 0, sizeof(new_gc));

	mutex_lock(&c->sb_lock);

	if (c->replicas_gc.entries &&
	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
		if (!new_gc.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}
	}

	if (!__replicas_has_entry(&c->replicas, new_entry)) {
		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
		if (!new_r.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}

		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
		if (ret)
			goto err;

		bch2_journal_entry_res_resize(&c->journal,
				&c->replicas_journal_res,
				reserve_journal_replicas(c, &new_r));
	}

	if (!new_r.entries &&
	    !new_gc.entries)
		goto out;

	/* allocations done, now commit: */

	if (new_r.entries)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	percpu_down_write(&c->mark_lock);
	if (new_r.entries)
		ret = replicas_table_update(c, &new_r);
	if (new_gc.entries)
		swap(new_gc, c->replicas_gc);
	percpu_up_write(&c->mark_lock);
out:
	mutex_unlock(&c->sb_lock);

	kfree(new_r.entries);
	kfree(new_gc.entries);

	return ret;
err:
	bch_err_msg(c, ret, "adding replicas entry");
	goto out;
}

int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
	return likely(bch2_replicas_marked(c, r))
		? 0 : bch2_mark_replicas_slowpath(c, r);
}

/* replicas delta list: */

int bch2_replicas_delta_list_mark(struct bch_fs *c,
				  struct replicas_delta_list *r)
{
	struct replicas_delta *d = r->d;
	struct replicas_delta *top = (void *) r->d + r->used;
	int ret = 0;

	for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
		ret = bch2_mark_replicas(c, &d->r);
	return ret;
}

/*
 * Old replicas_gc mechanism: only used for journal replicas entries now, should
 * die at some point:
 */
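
/*
 * Typical usage (a sketch of the pattern the journal code uses, not a
 * verbatim caller):
 *
 *	mutex_lock(&c->replicas_gc_lock);
 *	ret = bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
 *
 *	(re-mark each journal replicas entry still in use:)
 *	ret = ret ?: bch2_mark_replicas(c, e);
 *
 *	ret = bch2_replicas_gc_end(c, ret);
 *	mutex_unlock(&c->replicas_gc_lock);
 */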

int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	ret =	ret ?:
		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
		replicas_table_update(c, &c->replicas_gc);

	kfree(c->replicas_gc.entries);
	c->replicas_gc.entries = NULL;

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_entry_v1 *e;
	unsigned i = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc.entries);

	c->replicas_gc.nr		= 0;
	c->replicas_gc.entry_size	= 0;

	for_each_cpu_replicas_entry(&c->replicas, e) {
		/* Preserve unknown data types */
		if (e->data_type >= BCH_DATA_NR ||
		    !((1 << e->data_type) & typemask)) {
			c->replicas_gc.nr++;
			c->replicas_gc.entry_size =
				max_t(unsigned, c->replicas_gc.entry_size,
				      replicas_entry_bytes(e));
		}
	}

	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
					 c->replicas_gc.entry_size,
					 GFP_KERNEL);
	if (!c->replicas_gc.entries) {
		mutex_unlock(&c->sb_lock);
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (e->data_type >= BCH_DATA_NR ||
		    !((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
			       e, c->replicas_gc.entry_size);

	bch2_cpu_replicas_sort(&c->replicas_gc);
	mutex_unlock(&c->sb_lock);

	return 0;
}

/*
 * New much simpler mechanism for clearing out unneeded replicas entries - drop
 * replicas entries that have 0 sectors used.
 *
 * However, we don't track sector counts for journal usage, so this doesn't drop
 * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
 * is retained for that.
 */
int bch2_replicas_gc2(struct bch_fs *c)
{
	struct bch_replicas_cpu new = { 0 };
	unsigned i, nr;
	int ret = 0;

	bch2_journal_meta(&c->journal);
retry:
	nr		= READ_ONCE(c->replicas.nr);
	new.entry_size	= READ_ONCE(c->replicas.entry_size);
	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
	if (!new.entries) {
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

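	/*
	 * The allocation above was done without sb_lock or mark_lock held;
	 * if the replicas table was resized in the meantime, back off and
	 * retry:
	 */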
	if (nr			!= c->replicas.nr ||
	    new.entry_size	!= c->replicas.entry_size) {
		percpu_up_write(&c->mark_lock);
		mutex_unlock(&c->sb_lock);
		kfree(new.entries);
		goto retry;
	}

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(&c->replicas, i);

		if (e->data_type == BCH_DATA_journal ||
		    c->usage_base->replicas[i] ||
		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
		    percpu_u64_get(&c->usage[1]->replicas[i]) ||
		    percpu_u64_get(&c->usage[2]->replicas[i]) ||
		    percpu_u64_get(&c->usage[3]->replicas[i]))
			memcpy(cpu_replicas_entry(&new, new.nr++),
			       e, new.entry_size);
	}

	bch2_cpu_replicas_sort(&new);

	ret =	bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
		replicas_table_update(c, &new);

	kfree(new.entries);

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_set_usage(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *r,
			    u64 sectors)
{
	int ret, idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0) {
		struct bch_replicas_cpu n;

		n = cpu_replicas_add_entry(c, &c->replicas, r);
		if (!n.entries)
			return -BCH_ERR_ENOMEM_cpu_replicas;

		ret = replicas_table_update(c, &n);
		if (ret)
			return ret;

		kfree(n.entries);

		idx = bch2_replicas_entry_idx(c, r);
		BUG_ON(idx < 0);
	}

	c->usage_base->replicas[idx] = sectors;

	return 0;
}

/* Replicas tracking - superblock: */

static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
				   struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v1 *e, *dst;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		dst = cpu_replicas_entry(cpu_r, idx++);
		memcpy(dst, e, replicas_entry_bytes(e));
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
				      struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v0 *e;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	entry_size += sizeof(struct bch_replicas_entry_v1) -
		sizeof(struct bch_replicas_entry_v0);

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		struct bch_replicas_entry_v1 *dst =
			cpu_replicas_entry(cpu_r, idx++);

		dst->data_type	= e->data_type;
		dst->nr_devs	= e->nr_devs;
		dst->nr_required = 1;
		memcpy(dst->devs, e->devs, e->nr_devs);
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_v1;
	struct bch_sb_field_replicas_v0 *sb_v0;
	struct bch_replicas_cpu new_r = { 0, 0, NULL };
	int ret = 0;

	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
	if (ret)
		return ret;

	bch2_cpu_replicas_sort(&new_r);

	percpu_down_write(&c->mark_lock);

	ret = replicas_table_update(c, &new_r);
	percpu_up_write(&c->mark_lock);

	kfree(new_r.entries);

	return ret;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
					       struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas_v0 *sb_r;
	struct bch_replicas_entry_v0 *dst;
	struct bch_replicas_entry_v1 *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src) - 1;

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		dst->data_type	= src->data_type;
		dst->nr_devs	= src->nr_devs;
		memcpy(dst->devs, src->devs, src->nr_devs);

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry_v1 *dst, *src;
	bool need_v1 = false;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src) {
		bytes += replicas_entry_bytes(src);
		if (src->nr_required != 1)
			need_v1 = true;
	}

	if (!need_v1)
		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
				      struct bch_sb *sb,
				      struct printbuf *err)
{
	unsigned i;

	sort_r(cpu_r->entries,
	       cpu_r->nr,
	       cpu_r->entry_size,
	       bch2_memcmp, NULL,
	       (void *)(size_t)cpu_r->entry_size);

	for (i = 0; i < cpu_r->nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(cpu_r, i);

		int ret = bch2_replicas_entry_validate(e, sb, err);
		if (ret)
			return ret;

		if (i + 1 < cpu_r->nr) {
			struct bch_replicas_entry_v1 *n =
				cpu_replicas_entry(cpu_r, i + 1);

			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);

			if (!memcmp(e, n, cpu_r->entry_size)) {
				prt_printf(err, "duplicate replicas entry ");
				bch2_replicas_entry_to_text(err, e);
				return -BCH_ERR_invalid_sb_replicas;
			}
		}
	}

	return 0;
}

static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
				     enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
				     struct bch_sb *sb,
				     struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_replicas_validate,
	.to_text	= bch2_sb_replicas_to_text,
};

static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
					enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
					struct bch_sb *sb,
					struct bch_sb_field *f)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_entry_v0 *e;
	bool first = true;

	for_each_replicas_entry(sb_r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_v0_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
	.validate	= bch2_sb_replicas_v0_validate,
	.to_text	= bch2_sb_replicas_v0_to_text,
};

/* Query replicas: */
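
/*
 * Example caller (a sketch only; bch2_online_devs() and the BCH_FORCE_IF_*
 * flags are assumed from elsewhere in the tree): refusing to start unless
 * every replicas entry is sufficiently readable, given the force flags:
 *
 *	if (!bch2_have_enough_devs(c, bch2_online_devs(c), flags, true))
 *		return -EINVAL;
 */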

bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
			   unsigned flags, bool print)
{
	struct bch_replicas_entry_v1 *e;
	bool ret = true;

	percpu_down_read(&c->mark_lock);
	for_each_cpu_replicas_entry(&c->replicas, e) {
		unsigned nr_online = 0, nr_failed = 0, dflags = 0;
		bool metadata = e->data_type < BCH_DATA_user;

		if (e->data_type == BCH_DATA_cached)
			continue;

		rcu_read_lock();
		for (unsigned i = 0; i < e->nr_devs; i++) {
			nr_online += test_bit(e->devs[i], devs.d);

			struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
			nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed;
		}
		rcu_read_unlock();

		if (nr_failed == e->nr_devs)
			continue;

		if (nr_online < e->nr_required)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_LOST
				: BCH_FORCE_IF_DATA_LOST;

		if (nr_online < e->nr_devs)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_DEGRADED
				: BCH_FORCE_IF_DATA_DEGRADED;

		if (dflags & ~flags) {
			if (print) {
				struct printbuf buf = PRINTBUF;

				bch2_replicas_entry_to_text(&buf, e);
				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
					nr_online, buf.buf);
				printbuf_exit(&buf);
			}
			ret = false;
			break;
		}
	}
	percpu_up_read(&c->mark_lock);

	return ret;
}

unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
	struct bch_sb_field_replicas *replicas;
	struct bch_sb_field_replicas_v0 *replicas_v0;
	unsigned data_has = 0;

	replicas	= bch2_sb_field_get(sb, replicas);
	replicas_v0	= bch2_sb_field_get(sb, replicas_v0);

	if (replicas) {
		struct bch_replicas_entry_v1 *r;

		for_each_replicas_entry(replicas, r) {
			if (r->data_type >= sizeof(data_has) * 8)
				continue;

			for (unsigned i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
		}

	} else if (replicas_v0) {
		struct bch_replicas_entry_v0 *r;

		for_each_replicas_entry_v0(replicas_v0, r) {
			if (r->data_type >= sizeof(data_has) * 8)
				continue;

			for (unsigned i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
		}
	}

	return data_has;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned ret;

	mutex_lock(&c->sb_lock);
	ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
	mutex_unlock(&c->sb_lock);

	return ret;
}

void bch2_fs_replicas_exit(struct bch_fs *c)
{
	unsigned i;

	kfree(c->usage_scratch);
	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		free_percpu(c->usage[i]);
	kfree(c->usage_base);
	kfree(c->replicas.entries);
	kfree(c->replicas_gc.entries);

	mempool_exit(&c->replicas_delta_pool);
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
	bch2_journal_entry_res_resize(&c->journal,
			&c->replicas_journal_res,
			reserve_journal_replicas(c, &c->replicas));

	return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
					 REPLICAS_DELTA_LIST_MAX) ?:
		replicas_table_update(c, &c->replicas);
}