xref: /linux/fs/bcachefs/replicas.c (revision 1e525507)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "buckets.h"
5 #include "journal.h"
6 #include "replicas.h"
7 #include "super-io.h"
8 
9 #include <linux/sort.h>
10 
/* Forward declaration: used by the update paths before its definition. */
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r);
13 
/*
 * memcmp() adapter for the three-argument comparison callbacks taken by the
 * sort helpers used in this file: the number of bytes to compare is carried
 * in @priv as an integer cast to a pointer.
 *
 * Exists because some (buggy!) compilers don't allow memcmp itself to be
 * passed as a function pointer.
 */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
	return memcmp(l, r, (size_t) priv);
}
20 
21 /* Replicas tracking - in memory: */
22 
/*
 * Debug-only sanity checks on a replicas entry; compiles to nothing unless
 * CONFIG_BCACHEFS_DEBUG is defined.
 *
 * Asserts that the data type is in range, that the entry names at least one
 * device, that nr_required is either <= 1 or strictly less than nr_devs, and
 * that the device list is strictly increasing (sorted, no duplicates).
 */
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	BUG_ON(e->data_type >= BCH_DATA_NR);
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

	/* compare each device against its predecessor */
	for (unsigned idx = 1; idx < e->nr_devs; idx++)
		BUG_ON(e->devs[idx - 1] >= e->devs[idx]);
#endif
}
37 
38 void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
39 {
40 	bubble_sort(e->devs, e->nr_devs, u8_cmp);
41 }
42 
43 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
44 {
45 	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
46 			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
47 }
48 
49 static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
50 					   struct bch_replicas_entry_v0 *e)
51 {
52 	bch2_prt_data_type(out, e->data_type);
53 
54 	prt_printf(out, ": %u [", e->nr_devs);
55 	for (unsigned i = 0; i < e->nr_devs; i++)
56 		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
57 	prt_printf(out, "]");
58 }
59 
60 void bch2_replicas_entry_to_text(struct printbuf *out,
61 				 struct bch_replicas_entry_v1 *e)
62 {
63 	bch2_prt_data_type(out, e->data_type);
64 
65 	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
66 	for (unsigned i = 0; i < e->nr_devs; i++)
67 		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
68 	prt_printf(out, "]");
69 }
70 
71 int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
72 				 struct bch_sb *sb,
73 				 struct printbuf *err)
74 {
75 	if (!r->nr_devs) {
76 		prt_printf(err, "no devices in entry ");
77 		goto bad;
78 	}
79 
80 	if (r->nr_required > 1 &&
81 	    r->nr_required >= r->nr_devs) {
82 		prt_printf(err, "bad nr_required in entry ");
83 		goto bad;
84 	}
85 
86 	for (unsigned i = 0; i < r->nr_devs; i++)
87 		if (!bch2_dev_exists(sb, r->devs[i])) {
88 			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
89 			goto bad;
90 		}
91 
92 	return 0;
93 bad:
94 	bch2_replicas_entry_to_text(err, r);
95 	return -BCH_ERR_invalid_replicas_entry;
96 }
97 
98 void bch2_cpu_replicas_to_text(struct printbuf *out,
99 			       struct bch_replicas_cpu *r)
100 {
101 	struct bch_replicas_entry_v1 *e;
102 	bool first = true;
103 
104 	for_each_cpu_replicas_entry(r, e) {
105 		if (!first)
106 			prt_printf(out, " ");
107 		first = false;
108 
109 		bch2_replicas_entry_to_text(out, e);
110 	}
111 }
112 
113 static void extent_to_replicas(struct bkey_s_c k,
114 			       struct bch_replicas_entry_v1 *r)
115 {
116 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
117 	const union bch_extent_entry *entry;
118 	struct extent_ptr_decoded p;
119 
120 	r->nr_required	= 1;
121 
122 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
123 		if (p.ptr.cached)
124 			continue;
125 
126 		if (!p.has_ec)
127 			r->devs[r->nr_devs++] = p.ptr.dev;
128 		else
129 			r->nr_required = 0;
130 	}
131 }
132 
133 static void stripe_to_replicas(struct bkey_s_c k,
134 			       struct bch_replicas_entry_v1 *r)
135 {
136 	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
137 	const struct bch_extent_ptr *ptr;
138 
139 	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;
140 
141 	for (ptr = s.v->ptrs;
142 	     ptr < s.v->ptrs + s.v->nr_blocks;
143 	     ptr++)
144 		r->devs[r->nr_devs++] = ptr->dev;
145 }
146 
147 void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
148 			   struct bkey_s_c k)
149 {
150 	e->nr_devs = 0;
151 
152 	switch (k.k->type) {
153 	case KEY_TYPE_btree_ptr:
154 	case KEY_TYPE_btree_ptr_v2:
155 		e->data_type = BCH_DATA_btree;
156 		extent_to_replicas(k, e);
157 		break;
158 	case KEY_TYPE_extent:
159 	case KEY_TYPE_reflink_v:
160 		e->data_type = BCH_DATA_user;
161 		extent_to_replicas(k, e);
162 		break;
163 	case KEY_TYPE_stripe:
164 		e->data_type = BCH_DATA_parity;
165 		stripe_to_replicas(k, e);
166 		break;
167 	}
168 
169 	bch2_replicas_entry_sort(e);
170 }
171 
172 void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
173 			      enum bch_data_type data_type,
174 			      struct bch_devs_list devs)
175 {
176 	BUG_ON(!data_type ||
177 	       data_type == BCH_DATA_sb ||
178 	       data_type >= BCH_DATA_NR);
179 
180 	e->data_type	= data_type;
181 	e->nr_devs	= 0;
182 	e->nr_required	= 1;
183 
184 	darray_for_each(devs, i)
185 		e->devs[e->nr_devs++] = *i;
186 
187 	bch2_replicas_entry_sort(e);
188 }
189 
190 static struct bch_replicas_cpu
191 cpu_replicas_add_entry(struct bch_fs *c,
192 		       struct bch_replicas_cpu *old,
193 		       struct bch_replicas_entry_v1 *new_entry)
194 {
195 	unsigned i;
196 	struct bch_replicas_cpu new = {
197 		.nr		= old->nr + 1,
198 		.entry_size	= max_t(unsigned, old->entry_size,
199 					replicas_entry_bytes(new_entry)),
200 	};
201 
202 	for (i = 0; i < new_entry->nr_devs; i++)
203 		BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));
204 
205 	BUG_ON(!new_entry->data_type);
206 	verify_replicas_entry(new_entry);
207 
208 	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
209 	if (!new.entries)
210 		return new;
211 
212 	for (i = 0; i < old->nr; i++)
213 		memcpy(cpu_replicas_entry(&new, i),
214 		       cpu_replicas_entry(old, i),
215 		       old->entry_size);
216 
217 	memcpy(cpu_replicas_entry(&new, old->nr),
218 	       new_entry,
219 	       replicas_entry_bytes(new_entry));
220 
221 	bch2_cpu_replicas_sort(&new);
222 	return new;
223 }
224 
225 static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
226 				       struct bch_replicas_entry_v1 *search)
227 {
228 	int idx, entry_size = replicas_entry_bytes(search);
229 
230 	if (unlikely(entry_size > r->entry_size))
231 		return -1;
232 
233 	verify_replicas_entry(search);
234 
235 #define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
236 	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
237 			      entry_cmp, search);
238 #undef entry_cmp
239 
240 	return idx < r->nr ? idx : -1;
241 }
242 
243 int bch2_replicas_entry_idx(struct bch_fs *c,
244 			    struct bch_replicas_entry_v1 *search)
245 {
246 	bch2_replicas_entry_sort(search);
247 
248 	return __replicas_entry_idx(&c->replicas, search);
249 }
250 
251 static bool __replicas_has_entry(struct bch_replicas_cpu *r,
252 				 struct bch_replicas_entry_v1 *search)
253 {
254 	return __replicas_entry_idx(r, search) >= 0;
255 }
256 
257 bool bch2_replicas_marked(struct bch_fs *c,
258 			  struct bch_replicas_entry_v1 *search)
259 {
260 	bool marked;
261 
262 	if (!search->nr_devs)
263 		return true;
264 
265 	verify_replicas_entry(search);
266 
267 	percpu_down_read(&c->mark_lock);
268 	marked = __replicas_has_entry(&c->replicas, search) &&
269 		(likely((!c->replicas_gc.entries)) ||
270 		 __replicas_has_entry(&c->replicas_gc, search));
271 	percpu_up_read(&c->mark_lock);
272 
273 	return marked;
274 }
275 
276 static void __replicas_table_update(struct bch_fs_usage *dst,
277 				    struct bch_replicas_cpu *dst_r,
278 				    struct bch_fs_usage *src,
279 				    struct bch_replicas_cpu *src_r)
280 {
281 	int src_idx, dst_idx;
282 
283 	*dst = *src;
284 
285 	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
286 		if (!src->replicas[src_idx])
287 			continue;
288 
289 		dst_idx = __replicas_entry_idx(dst_r,
290 				cpu_replicas_entry(src_r, src_idx));
291 		BUG_ON(dst_idx < 0);
292 
293 		dst->replicas[dst_idx] = src->replicas[src_idx];
294 	}
295 }
296 
297 static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
298 				    struct bch_replicas_cpu *dst_r,
299 				    struct bch_fs_usage __percpu *src_p,
300 				    struct bch_replicas_cpu *src_r)
301 {
302 	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
303 	struct bch_fs_usage *dst, *src = (void *)
304 		bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);
305 
306 	preempt_disable();
307 	dst = this_cpu_ptr(dst_p);
308 	preempt_enable();
309 
310 	__replicas_table_update(dst, dst_r, src, src_r);
311 }
312 
313 /*
314  * Resize filesystem accounting:
315  */
316 static int replicas_table_update(struct bch_fs *c,
317 				 struct bch_replicas_cpu *new_r)
318 {
319 	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
320 	struct bch_fs_usage_online *new_scratch = NULL;
321 	struct bch_fs_usage __percpu *new_gc = NULL;
322 	struct bch_fs_usage *new_base = NULL;
323 	unsigned i, bytes = sizeof(struct bch_fs_usage) +
324 		sizeof(u64) * new_r->nr;
325 	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
326 		sizeof(u64) * new_r->nr;
327 	int ret = 0;
328 
329 	memset(new_usage, 0, sizeof(new_usage));
330 
331 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
332 		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
333 					sizeof(u64), GFP_KERNEL)))
334 			goto err;
335 
336 	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
337 	    !(new_scratch  = kmalloc(scratch_bytes, GFP_KERNEL)) ||
338 	    (c->usage_gc &&
339 	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
340 		goto err;
341 
342 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
343 		if (c->usage[i])
344 			__replicas_table_update_pcpu(new_usage[i], new_r,
345 						     c->usage[i], &c->replicas);
346 	if (c->usage_base)
347 		__replicas_table_update(new_base,		new_r,
348 					c->usage_base,		&c->replicas);
349 	if (c->usage_gc)
350 		__replicas_table_update_pcpu(new_gc,		new_r,
351 					     c->usage_gc,	&c->replicas);
352 
353 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
354 		swap(c->usage[i],	new_usage[i]);
355 	swap(c->usage_base,	new_base);
356 	swap(c->usage_scratch,	new_scratch);
357 	swap(c->usage_gc,	new_gc);
358 	swap(c->replicas,	*new_r);
359 out:
360 	free_percpu(new_gc);
361 	kfree(new_scratch);
362 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
363 		free_percpu(new_usage[i]);
364 	kfree(new_base);
365 	return ret;
366 err:
367 	bch_err(c, "error updating replicas table: memory allocation failure");
368 	ret = -BCH_ERR_ENOMEM_replicas_table;
369 	goto out;
370 }
371 
372 static unsigned reserve_journal_replicas(struct bch_fs *c,
373 				     struct bch_replicas_cpu *r)
374 {
375 	struct bch_replicas_entry_v1 *e;
376 	unsigned journal_res_u64s = 0;
377 
378 	/* nr_inodes: */
379 	journal_res_u64s +=
380 		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
381 
382 	/* key_version: */
383 	journal_res_u64s +=
384 		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
385 
386 	/* persistent_reserved: */
387 	journal_res_u64s +=
388 		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
389 		BCH_REPLICAS_MAX;
390 
391 	for_each_cpu_replicas_entry(r, e)
392 		journal_res_u64s +=
393 			DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
394 				     e->nr_devs, sizeof(u64));
395 	return journal_res_u64s;
396 }
397 
398 noinline
399 static int bch2_mark_replicas_slowpath(struct bch_fs *c,
400 				struct bch_replicas_entry_v1 *new_entry)
401 {
402 	struct bch_replicas_cpu new_r, new_gc;
403 	int ret = 0;
404 
405 	verify_replicas_entry(new_entry);
406 
407 	memset(&new_r, 0, sizeof(new_r));
408 	memset(&new_gc, 0, sizeof(new_gc));
409 
410 	mutex_lock(&c->sb_lock);
411 
412 	if (c->replicas_gc.entries &&
413 	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
414 		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
415 		if (!new_gc.entries) {
416 			ret = -BCH_ERR_ENOMEM_cpu_replicas;
417 			goto err;
418 		}
419 	}
420 
421 	if (!__replicas_has_entry(&c->replicas, new_entry)) {
422 		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
423 		if (!new_r.entries) {
424 			ret = -BCH_ERR_ENOMEM_cpu_replicas;
425 			goto err;
426 		}
427 
428 		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
429 		if (ret)
430 			goto err;
431 
432 		bch2_journal_entry_res_resize(&c->journal,
433 				&c->replicas_journal_res,
434 				reserve_journal_replicas(c, &new_r));
435 	}
436 
437 	if (!new_r.entries &&
438 	    !new_gc.entries)
439 		goto out;
440 
441 	/* allocations done, now commit: */
442 
443 	if (new_r.entries)
444 		bch2_write_super(c);
445 
446 	/* don't update in memory replicas until changes are persistent */
447 	percpu_down_write(&c->mark_lock);
448 	if (new_r.entries)
449 		ret = replicas_table_update(c, &new_r);
450 	if (new_gc.entries)
451 		swap(new_gc, c->replicas_gc);
452 	percpu_up_write(&c->mark_lock);
453 out:
454 	mutex_unlock(&c->sb_lock);
455 
456 	kfree(new_r.entries);
457 	kfree(new_gc.entries);
458 
459 	return ret;
460 err:
461 	bch_err_msg(c, ret, "adding replicas entry");
462 	goto out;
463 }
464 
/*
 * Ensure replicas entry @r is recorded. The common case - the entry is
 * already marked - only takes the read-side check; otherwise fall back to the
 * slow path that adds it.
 *
 * Returns 0 on success or a negative error code from the slow path.
 */
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
	if (likely(bch2_replicas_marked(c, r)))
		return 0;

	return bch2_mark_replicas_slowpath(c, r);
}
470 
471 /* replicas delta list: */
472 
473 int bch2_replicas_delta_list_mark(struct bch_fs *c,
474 				  struct replicas_delta_list *r)
475 {
476 	struct replicas_delta *d = r->d;
477 	struct replicas_delta *top = (void *) r->d + r->used;
478 	int ret = 0;
479 
480 	for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
481 		ret = bch2_mark_replicas(c, &d->r);
482 	return ret;
483 }
484 
485 /*
486  * Old replicas_gc mechanism: only used for journal replicas entries now, should
487  * die at some point:
488  */
489 
490 int bch2_replicas_gc_end(struct bch_fs *c, int ret)
491 {
492 	lockdep_assert_held(&c->replicas_gc_lock);
493 
494 	mutex_lock(&c->sb_lock);
495 	percpu_down_write(&c->mark_lock);
496 
497 	ret =   ret ?:
498 		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
499 		replicas_table_update(c, &c->replicas_gc);
500 
501 	kfree(c->replicas_gc.entries);
502 	c->replicas_gc.entries = NULL;
503 
504 	percpu_up_write(&c->mark_lock);
505 
506 	if (!ret)
507 		bch2_write_super(c);
508 
509 	mutex_unlock(&c->sb_lock);
510 
511 	return ret;
512 }
513 
514 int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
515 {
516 	struct bch_replicas_entry_v1 *e;
517 	unsigned i = 0;
518 
519 	lockdep_assert_held(&c->replicas_gc_lock);
520 
521 	mutex_lock(&c->sb_lock);
522 	BUG_ON(c->replicas_gc.entries);
523 
524 	c->replicas_gc.nr		= 0;
525 	c->replicas_gc.entry_size	= 0;
526 
527 	for_each_cpu_replicas_entry(&c->replicas, e)
528 		if (!((1 << e->data_type) & typemask)) {
529 			c->replicas_gc.nr++;
530 			c->replicas_gc.entry_size =
531 				max_t(unsigned, c->replicas_gc.entry_size,
532 				      replicas_entry_bytes(e));
533 		}
534 
535 	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
536 					 c->replicas_gc.entry_size,
537 					 GFP_KERNEL);
538 	if (!c->replicas_gc.entries) {
539 		mutex_unlock(&c->sb_lock);
540 		bch_err(c, "error allocating c->replicas_gc");
541 		return -BCH_ERR_ENOMEM_replicas_gc;
542 	}
543 
544 	for_each_cpu_replicas_entry(&c->replicas, e)
545 		if (!((1 << e->data_type) & typemask))
546 			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
547 			       e, c->replicas_gc.entry_size);
548 
549 	bch2_cpu_replicas_sort(&c->replicas_gc);
550 	mutex_unlock(&c->sb_lock);
551 
552 	return 0;
553 }
554 
555 /*
556  * New much simpler mechanism for clearing out unneeded replicas entries - drop
557  * replicas entries that have 0 sectors used.
558  *
559  * However, we don't track sector counts for journal usage, so this doesn't drop
560  * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
561  * is retained for that.
562  */
563 int bch2_replicas_gc2(struct bch_fs *c)
564 {
565 	struct bch_replicas_cpu new = { 0 };
566 	unsigned i, nr;
567 	int ret = 0;
568 
569 	bch2_journal_meta(&c->journal);
570 retry:
571 	nr		= READ_ONCE(c->replicas.nr);
572 	new.entry_size	= READ_ONCE(c->replicas.entry_size);
573 	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
574 	if (!new.entries) {
575 		bch_err(c, "error allocating c->replicas_gc");
576 		return -BCH_ERR_ENOMEM_replicas_gc;
577 	}
578 
579 	mutex_lock(&c->sb_lock);
580 	percpu_down_write(&c->mark_lock);
581 
582 	if (nr			!= c->replicas.nr ||
583 	    new.entry_size	!= c->replicas.entry_size) {
584 		percpu_up_write(&c->mark_lock);
585 		mutex_unlock(&c->sb_lock);
586 		kfree(new.entries);
587 		goto retry;
588 	}
589 
590 	for (i = 0; i < c->replicas.nr; i++) {
591 		struct bch_replicas_entry_v1 *e =
592 			cpu_replicas_entry(&c->replicas, i);
593 
594 		if (e->data_type == BCH_DATA_journal ||
595 		    c->usage_base->replicas[i] ||
596 		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
597 		    percpu_u64_get(&c->usage[1]->replicas[i]) ||
598 		    percpu_u64_get(&c->usage[2]->replicas[i]) ||
599 		    percpu_u64_get(&c->usage[3]->replicas[i]))
600 			memcpy(cpu_replicas_entry(&new, new.nr++),
601 			       e, new.entry_size);
602 	}
603 
604 	bch2_cpu_replicas_sort(&new);
605 
606 	ret =   bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
607 		replicas_table_update(c, &new);
608 
609 	kfree(new.entries);
610 
611 	percpu_up_write(&c->mark_lock);
612 
613 	if (!ret)
614 		bch2_write_super(c);
615 
616 	mutex_unlock(&c->sb_lock);
617 
618 	return ret;
619 }
620 
621 int bch2_replicas_set_usage(struct bch_fs *c,
622 			    struct bch_replicas_entry_v1 *r,
623 			    u64 sectors)
624 {
625 	int ret, idx = bch2_replicas_entry_idx(c, r);
626 
627 	if (idx < 0) {
628 		struct bch_replicas_cpu n;
629 
630 		n = cpu_replicas_add_entry(c, &c->replicas, r);
631 		if (!n.entries)
632 			return -BCH_ERR_ENOMEM_cpu_replicas;
633 
634 		ret = replicas_table_update(c, &n);
635 		if (ret)
636 			return ret;
637 
638 		kfree(n.entries);
639 
640 		idx = bch2_replicas_entry_idx(c, r);
641 		BUG_ON(ret < 0);
642 	}
643 
644 	c->usage_base->replicas[idx] = sectors;
645 
646 	return 0;
647 }
648 
649 /* Replicas tracking - superblock: */
650 
651 static int
652 __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
653 				   struct bch_replicas_cpu *cpu_r)
654 {
655 	struct bch_replicas_entry_v1 *e, *dst;
656 	unsigned nr = 0, entry_size = 0, idx = 0;
657 
658 	for_each_replicas_entry(sb_r, e) {
659 		entry_size = max_t(unsigned, entry_size,
660 				   replicas_entry_bytes(e));
661 		nr++;
662 	}
663 
664 	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
665 	if (!cpu_r->entries)
666 		return -BCH_ERR_ENOMEM_cpu_replicas;
667 
668 	cpu_r->nr		= nr;
669 	cpu_r->entry_size	= entry_size;
670 
671 	for_each_replicas_entry(sb_r, e) {
672 		dst = cpu_replicas_entry(cpu_r, idx++);
673 		memcpy(dst, e, replicas_entry_bytes(e));
674 		bch2_replicas_entry_sort(dst);
675 	}
676 
677 	return 0;
678 }
679 
680 static int
681 __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
682 				      struct bch_replicas_cpu *cpu_r)
683 {
684 	struct bch_replicas_entry_v0 *e;
685 	unsigned nr = 0, entry_size = 0, idx = 0;
686 
687 	for_each_replicas_entry(sb_r, e) {
688 		entry_size = max_t(unsigned, entry_size,
689 				   replicas_entry_bytes(e));
690 		nr++;
691 	}
692 
693 	entry_size += sizeof(struct bch_replicas_entry_v1) -
694 		sizeof(struct bch_replicas_entry_v0);
695 
696 	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
697 	if (!cpu_r->entries)
698 		return -BCH_ERR_ENOMEM_cpu_replicas;
699 
700 	cpu_r->nr		= nr;
701 	cpu_r->entry_size	= entry_size;
702 
703 	for_each_replicas_entry(sb_r, e) {
704 		struct bch_replicas_entry_v1 *dst =
705 			cpu_replicas_entry(cpu_r, idx++);
706 
707 		dst->data_type	= e->data_type;
708 		dst->nr_devs	= e->nr_devs;
709 		dst->nr_required = 1;
710 		memcpy(dst->devs, e->devs, e->nr_devs);
711 		bch2_replicas_entry_sort(dst);
712 	}
713 
714 	return 0;
715 }
716 
717 int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
718 {
719 	struct bch_sb_field_replicas *sb_v1;
720 	struct bch_sb_field_replicas_v0 *sb_v0;
721 	struct bch_replicas_cpu new_r = { 0, 0, NULL };
722 	int ret = 0;
723 
724 	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
725 		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
726 	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
727 		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
728 	if (ret)
729 		return ret;
730 
731 	bch2_cpu_replicas_sort(&new_r);
732 
733 	percpu_down_write(&c->mark_lock);
734 
735 	ret = replicas_table_update(c, &new_r);
736 	percpu_up_write(&c->mark_lock);
737 
738 	kfree(new_r.entries);
739 
740 	return 0;
741 }
742 
743 static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
744 					       struct bch_replicas_cpu *r)
745 {
746 	struct bch_sb_field_replicas_v0 *sb_r;
747 	struct bch_replicas_entry_v0 *dst;
748 	struct bch_replicas_entry_v1 *src;
749 	size_t bytes;
750 
751 	bytes = sizeof(struct bch_sb_field_replicas);
752 
753 	for_each_cpu_replicas_entry(r, src)
754 		bytes += replicas_entry_bytes(src) - 1;
755 
756 	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
757 			DIV_ROUND_UP(bytes, sizeof(u64)));
758 	if (!sb_r)
759 		return -BCH_ERR_ENOSPC_sb_replicas;
760 
761 	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
762 	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
763 
764 	memset(&sb_r->entries, 0,
765 	       vstruct_end(&sb_r->field) -
766 	       (void *) &sb_r->entries);
767 
768 	dst = sb_r->entries;
769 	for_each_cpu_replicas_entry(r, src) {
770 		dst->data_type	= src->data_type;
771 		dst->nr_devs	= src->nr_devs;
772 		memcpy(dst->devs, src->devs, src->nr_devs);
773 
774 		dst = replicas_entry_next(dst);
775 
776 		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
777 	}
778 
779 	return 0;
780 }
781 
782 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
783 					    struct bch_replicas_cpu *r)
784 {
785 	struct bch_sb_field_replicas *sb_r;
786 	struct bch_replicas_entry_v1 *dst, *src;
787 	bool need_v1 = false;
788 	size_t bytes;
789 
790 	bytes = sizeof(struct bch_sb_field_replicas);
791 
792 	for_each_cpu_replicas_entry(r, src) {
793 		bytes += replicas_entry_bytes(src);
794 		if (src->nr_required != 1)
795 			need_v1 = true;
796 	}
797 
798 	if (!need_v1)
799 		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
800 
801 	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
802 			DIV_ROUND_UP(bytes, sizeof(u64)));
803 	if (!sb_r)
804 		return -BCH_ERR_ENOSPC_sb_replicas;
805 
806 	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
807 	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
808 
809 	memset(&sb_r->entries, 0,
810 	       vstruct_end(&sb_r->field) -
811 	       (void *) &sb_r->entries);
812 
813 	dst = sb_r->entries;
814 	for_each_cpu_replicas_entry(r, src) {
815 		memcpy(dst, src, replicas_entry_bytes(src));
816 
817 		dst = replicas_entry_next(dst);
818 
819 		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
820 	}
821 
822 	return 0;
823 }
824 
825 static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
826 				      struct bch_sb *sb,
827 				      struct printbuf *err)
828 {
829 	unsigned i;
830 
831 	sort_r(cpu_r->entries,
832 	       cpu_r->nr,
833 	       cpu_r->entry_size,
834 	       bch2_memcmp, NULL,
835 	       (void *)(size_t)cpu_r->entry_size);
836 
837 	for (i = 0; i < cpu_r->nr; i++) {
838 		struct bch_replicas_entry_v1 *e =
839 			cpu_replicas_entry(cpu_r, i);
840 
841 		int ret = bch2_replicas_entry_validate(e, sb, err);
842 		if (ret)
843 			return ret;
844 
845 		if (i + 1 < cpu_r->nr) {
846 			struct bch_replicas_entry_v1 *n =
847 				cpu_replicas_entry(cpu_r, i + 1);
848 
849 			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
850 
851 			if (!memcmp(e, n, cpu_r->entry_size)) {
852 				prt_printf(err, "duplicate replicas entry ");
853 				bch2_replicas_entry_to_text(err, e);
854 				return -BCH_ERR_invalid_sb_replicas;
855 			}
856 		}
857 	}
858 
859 	return 0;
860 }
861 
862 static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
863 				     struct printbuf *err)
864 {
865 	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
866 	struct bch_replicas_cpu cpu_r;
867 	int ret;
868 
869 	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
870 	if (ret)
871 		return ret;
872 
873 	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
874 	kfree(cpu_r.entries);
875 	return ret;
876 }
877 
878 static void bch2_sb_replicas_to_text(struct printbuf *out,
879 				     struct bch_sb *sb,
880 				     struct bch_sb_field *f)
881 {
882 	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
883 	struct bch_replicas_entry_v1 *e;
884 	bool first = true;
885 
886 	for_each_replicas_entry(r, e) {
887 		if (!first)
888 			prt_printf(out, " ");
889 		first = false;
890 
891 		bch2_replicas_entry_to_text(out, e);
892 	}
893 	prt_newline(out);
894 }
895 
896 const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
897 	.validate	= bch2_sb_replicas_validate,
898 	.to_text	= bch2_sb_replicas_to_text,
899 };
900 
901 static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
902 					struct printbuf *err)
903 {
904 	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
905 	struct bch_replicas_cpu cpu_r;
906 	int ret;
907 
908 	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
909 	if (ret)
910 		return ret;
911 
912 	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
913 	kfree(cpu_r.entries);
914 	return ret;
915 }
916 
917 static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
918 					struct bch_sb *sb,
919 					struct bch_sb_field *f)
920 {
921 	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
922 	struct bch_replicas_entry_v0 *e;
923 	bool first = true;
924 
925 	for_each_replicas_entry(sb_r, e) {
926 		if (!first)
927 			prt_printf(out, " ");
928 		first = false;
929 
930 		bch2_replicas_entry_v0_to_text(out, e);
931 	}
932 	prt_newline(out);
933 }
934 
935 const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
936 	.validate	= bch2_sb_replicas_v0_validate,
937 	.to_text	= bch2_sb_replicas_v0_to_text,
938 };
939 
940 /* Query replicas: */
941 
942 bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
943 			   unsigned flags, bool print)
944 {
945 	struct bch_replicas_entry_v1 *e;
946 	bool ret = true;
947 
948 	percpu_down_read(&c->mark_lock);
949 	for_each_cpu_replicas_entry(&c->replicas, e) {
950 		unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
951 		bool metadata = e->data_type < BCH_DATA_user;
952 
953 		if (e->data_type == BCH_DATA_cached)
954 			continue;
955 
956 		for (i = 0; i < e->nr_devs; i++) {
957 			struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
958 
959 			nr_online += test_bit(e->devs[i], devs.d);
960 			nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
961 		}
962 
963 		if (nr_failed == e->nr_devs)
964 			continue;
965 
966 		if (nr_online < e->nr_required)
967 			dflags |= metadata
968 				? BCH_FORCE_IF_METADATA_LOST
969 				: BCH_FORCE_IF_DATA_LOST;
970 
971 		if (nr_online < e->nr_devs)
972 			dflags |= metadata
973 				? BCH_FORCE_IF_METADATA_DEGRADED
974 				: BCH_FORCE_IF_DATA_DEGRADED;
975 
976 		if (dflags & ~flags) {
977 			if (print) {
978 				struct printbuf buf = PRINTBUF;
979 
980 				bch2_replicas_entry_to_text(&buf, e);
981 				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
982 					nr_online, buf.buf);
983 				printbuf_exit(&buf);
984 			}
985 			ret = false;
986 			break;
987 		}
988 
989 	}
990 	percpu_up_read(&c->mark_lock);
991 
992 	return ret;
993 }
994 
995 unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
996 {
997 	struct bch_sb_field_replicas *replicas;
998 	struct bch_sb_field_replicas_v0 *replicas_v0;
999 	unsigned i, data_has = 0;
1000 
1001 	replicas = bch2_sb_field_get(sb, replicas);
1002 	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
1003 
1004 	if (replicas) {
1005 		struct bch_replicas_entry_v1 *r;
1006 
1007 		for_each_replicas_entry(replicas, r)
1008 			for (i = 0; i < r->nr_devs; i++)
1009 				if (r->devs[i] == dev)
1010 					data_has |= 1 << r->data_type;
1011 	} else if (replicas_v0) {
1012 		struct bch_replicas_entry_v0 *r;
1013 
1014 		for_each_replicas_entry_v0(replicas_v0, r)
1015 			for (i = 0; i < r->nr_devs; i++)
1016 				if (r->devs[i] == dev)
1017 					data_has |= 1 << r->data_type;
1018 	}
1019 
1020 
1021 	return data_has;
1022 }
1023 
1024 unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
1025 {
1026 	unsigned ret;
1027 
1028 	mutex_lock(&c->sb_lock);
1029 	ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
1030 	mutex_unlock(&c->sb_lock);
1031 
1032 	return ret;
1033 }
1034 
1035 void bch2_fs_replicas_exit(struct bch_fs *c)
1036 {
1037 	unsigned i;
1038 
1039 	kfree(c->usage_scratch);
1040 	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
1041 		free_percpu(c->usage[i]);
1042 	kfree(c->usage_base);
1043 	kfree(c->replicas.entries);
1044 	kfree(c->replicas_gc.entries);
1045 
1046 	mempool_exit(&c->replicas_delta_pool);
1047 }
1048 
1049 int bch2_fs_replicas_init(struct bch_fs *c)
1050 {
1051 	bch2_journal_entry_res_resize(&c->journal,
1052 			&c->replicas_journal_res,
1053 			reserve_journal_replicas(c, &c->replicas));
1054 
1055 	return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
1056 					 REPLICAS_DELTA_LIST_MAX) ?:
1057 		replicas_table_update(c, &c->replicas);
1058 }
1059