// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "disk_accounting.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

#include <linux/sort.h>

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
	size_t size = (size_t) priv;
	return memcmp(l, r, size);
}

/* Replicas tracking - in memory: */
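
/*
 * The in-memory table (struct bch_replicas_cpu) is a flat array of fixed-size
 * entries kept sorted in eytzinger order, so lookups are plain memcmp-based
 * searches; entry_size is the size of the largest entry, and shorter entries
 * are zero padded up to it (the array comes from kcalloc()).
 */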

static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

	for (unsigned i = 0; i + 1 < e->nr_devs; i++)
		BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
					   struct bch_replicas_entry_v0 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u [", e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

void bch2_replicas_entry_to_text(struct printbuf *out,
				 struct bch_replicas_entry_v1 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
					   struct bch_sb *sb,
					   struct printbuf *err)
{
	if (!r->nr_devs) {
		prt_printf(err, "no devices in entry ");
		goto bad;
	}

	if (r->nr_required > 1 &&
	    r->nr_required >= r->nr_devs) {
		prt_printf(err, "bad nr_required in entry ");
		goto bad;
	}

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
		    !bch2_member_exists(sb, r->devs[i])) {
			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
			goto bad;
		}

	return 0;
bad:
	bch2_replicas_entry_to_text(err, r);
	return -BCH_ERR_invalid_replicas_entry;
}

int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
				 struct bch_fs *c,
				 struct printbuf *err)
{
	if (!r->nr_devs) {
		prt_printf(err, "no devices in entry ");
		goto bad;
	}

	if (r->nr_required > 1 &&
	    r->nr_required >= r->nr_devs) {
		prt_printf(err, "bad nr_required in entry ");
		goto bad;
	}

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
		    !bch2_dev_exists(c, r->devs[i])) {
			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
			goto bad;
		}

	return 0;
bad:
	bch2_replicas_entry_to_text(err, r);
	return -BCH_ERR_invalid_replicas_entry;
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
			       struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_cpu_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
}

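/*
 * Build a replicas entry from an extent: cached pointers don't count towards
 * durability, and if any pointer is part of an erasure coded stripe,
 * nr_required is set to 0 - redundancy for that data comes from the stripe.
 */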
static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	r->nr_required	= 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (!p.has_ec)
			replicas_entry_add_dev(r, p.ptr.dev);
		else
			r->nr_required = 0;
	}
}

static void stripe_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	const struct bch_extent_ptr *ptr;

	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;

	for (ptr = s.v->ptrs;
	     ptr < s.v->ptrs + s.v->nr_blocks;
	     ptr++)
		replicas_entry_add_dev(r, ptr->dev);
}

void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
			   struct bkey_s_c k)
{
	e->nr_devs = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
		e->data_type = BCH_DATA_btree;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		e->data_type = BCH_DATA_user;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_stripe:
		e->data_type = BCH_DATA_parity;
		stripe_to_replicas(k, e);
		break;
	}

	bch2_replicas_entry_sort(e);
}

void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
			      enum bch_data_type data_type,
			      struct bch_devs_list devs)
{
	BUG_ON(!data_type ||
	       data_type == BCH_DATA_sb ||
	       data_type >= BCH_DATA_NR);

	e->data_type	= data_type;
	e->nr_devs	= 0;
	e->nr_required	= 1;

	darray_for_each(devs, i)
		replicas_entry_add_dev(e, *i);

	bch2_replicas_entry_sort(e);
}

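/*
 * Return a copy of @old with @new_entry appended and re-sorted, widening
 * entry_size if necessary; on allocation failure the returned table has
 * entries == NULL.
 */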
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
		       struct bch_replicas_cpu *old,
		       struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new = {
		.nr		= old->nr + 1,
		.entry_size	= max_t(unsigned, old->entry_size,
					replicas_entry_bytes(new_entry)),
	};

	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
	if (!new.entries)
		return new;

	for (unsigned i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(&new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(&new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(&new);
	return new;
}

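/*
 * Eytzinger search over the sorted replicas table; @search must already have
 * been sorted with bch2_replicas_entry_sort(). Returns the entry's index, or
 * -1 if it isn't present.
 */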
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
				       struct bch_replicas_entry_v1 *search)
{
	int idx, entry_size = replicas_entry_bytes(search);

	if (unlikely(entry_size > r->entry_size))
		return -1;

#define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp

	return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *search)
{
	bch2_replicas_entry_sort(search);

	return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry_v1 *search)
{
	return __replicas_entry_idx(r, search) >= 0;
}

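/*
 * An entry is "marked" if it's present in c->replicas - and, while the old
 * replicas_gc mechanism is running, in c->replicas_gc as well. Entries with
 * no devices are trivially marked. Callers of the _locked variant hold
 * mark_lock for read.
 */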
bool bch2_replicas_marked_locked(struct bch_fs *c,
			  struct bch_replicas_entry_v1 *search)
{
	verify_replicas_entry(search);

	return !search->nr_devs ||
		(__replicas_has_entry(&c->replicas, search) &&
		 (likely((!c->replicas_gc.entries)) ||
		  __replicas_has_entry(&c->replicas_gc, search)));
}

bool bch2_replicas_marked(struct bch_fs *c,
			  struct bch_replicas_entry_v1 *search)
{
	percpu_down_read(&c->mark_lock);
	bool ret = bch2_replicas_marked_locked(c, search);
	percpu_up_read(&c->mark_lock);

	return ret;
}

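/*
 * Slowpath for marking an entry that isn't in the table(s) yet: add it to the
 * superblock replicas section (and to replicas_gc, if active), write the
 * superblock, and only then swap the new in-memory tables in - the on-disk
 * replicas section must not lag behind what in-memory state references.
 */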
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new_r, new_gc;
	int ret = 0;

	verify_replicas_entry(new_entry);

	memset(&new_r, 0, sizeof(new_r));
	memset(&new_gc, 0, sizeof(new_gc));

	mutex_lock(&c->sb_lock);

	if (c->replicas_gc.entries &&
	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
		if (!new_gc.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}
	}

	if (!__replicas_has_entry(&c->replicas, new_entry)) {
		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
		if (!new_r.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}

		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
		if (ret)
			goto err;
	}

	if (!new_r.entries &&
	    !new_gc.entries)
		goto out;

	/* allocations done, now commit: */

	if (new_r.entries)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	percpu_down_write(&c->mark_lock);
	if (new_r.entries)
		swap(c->replicas, new_r);
	if (new_gc.entries)
		swap(new_gc, c->replicas_gc);
	percpu_up_write(&c->mark_lock);
out:
	mutex_unlock(&c->sb_lock);

	kfree(new_r.entries);
	kfree(new_gc.entries);

	return ret;
err:
	bch_err_msg(c, ret, "adding replicas entry");
	goto out;
}

int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
	return likely(bch2_replicas_marked(c, r))
		? 0 : bch2_mark_replicas_slowpath(c, r);
}

/*
 * Old replicas_gc mechanism: only used for journal replicas entries now, should
 * die at some point:
 */
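/*
 * Expected usage (as implemented here): with c->replicas_gc_lock held, call
 * bch2_replicas_gc_start() with a mask of the data types being GCed -
 * c->replicas_gc is seeded with everything else. While it's active,
 * bch2_mark_replicas() re-adds entries that are still in use, and
 * bch2_replicas_gc_end() then persists the pruned table and swaps it in.
 */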

int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	ret =   ret ?:
		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
	if (!ret)
		swap(c->replicas, c->replicas_gc);

	kfree(c->replicas_gc.entries);
	c->replicas_gc.entries = NULL;

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_entry_v1 *e;
	unsigned i = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc.entries);

	c->replicas_gc.nr		= 0;
	c->replicas_gc.entry_size	= 0;

	for_each_cpu_replicas_entry(&c->replicas, e) {
		/* Preserve unknown data types */
		if (e->data_type >= BCH_DATA_NR ||
		    !((1 << e->data_type) & typemask)) {
			c->replicas_gc.nr++;
			c->replicas_gc.entry_size =
				max_t(unsigned, c->replicas_gc.entry_size,
				      replicas_entry_bytes(e));
		}
	}

	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
					 c->replicas_gc.entry_size,
					 GFP_KERNEL);
	if (!c->replicas_gc.entries) {
		mutex_unlock(&c->sb_lock);
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (e->data_type >= BCH_DATA_NR ||
		    !((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
			       e, c->replicas_gc.entry_size);

	bch2_cpu_replicas_sort(&c->replicas_gc);
	mutex_unlock(&c->sb_lock);

	return 0;
}

/*
 * New much simpler mechanism for clearing out unneeded replicas entries - drop
 * replicas entries that have 0 sectors used.
 *
 * However, we don't track sector counts for journal usage, so this doesn't drop
 * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
 * is retained for that.
 */
int bch2_replicas_gc2(struct bch_fs *c)
{
	struct bch_replicas_cpu new = { 0 };
	unsigned nr;
	int ret = 0;

	bch2_accounting_mem_gc(c);
retry:
	nr		= READ_ONCE(c->replicas.nr);
	new.entry_size	= READ_ONCE(c->replicas.entry_size);
	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
	if (!new.entries) {
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	if (nr			!= c->replicas.nr ||
	    new.entry_size	!= c->replicas.entry_size) {
		percpu_up_write(&c->mark_lock);
		mutex_unlock(&c->sb_lock);
		kfree(new.entries);
		goto retry;
	}

	for (unsigned i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(&c->replicas, i);

		struct disk_accounting_pos k = {
			.type = BCH_DISK_ACCOUNTING_replicas,
		};

		unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
			      "embedded variable length struct");

		struct bpos p = disk_accounting_pos_to_bpos(&k);

		struct bch_accounting_mem *acc = &c->accounting;
		bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
					    accounting_pos_cmp, &p) >= acc->k.nr;

		if (e->data_type == BCH_DATA_journal || !kill)
			memcpy(cpu_replicas_entry(&new, new.nr++),
			       e, new.entry_size);
	}

	bch2_cpu_replicas_sort(&new);

	ret = bch2_cpu_replicas_to_sb_replicas(c, &new);

	if (!ret)
		swap(c->replicas, new);

	kfree(new.entries);

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

/* Replicas tracking - superblock: */

static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
				   struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v1 *e, *dst;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		dst = cpu_replicas_entry(cpu_r, idx++);
		memcpy(dst, e, replicas_entry_bytes(e));
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
				      struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v0 *e;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	entry_size += sizeof(struct bch_replicas_entry_v1) -
		sizeof(struct bch_replicas_entry_v0);

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		struct bch_replicas_entry_v1 *dst =
			cpu_replicas_entry(cpu_r, idx++);

		dst->data_type	= e->data_type;
		dst->nr_devs	= e->nr_devs;
		dst->nr_required = 1;
		memcpy(dst->devs, e->devs, e->nr_devs);
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

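/*
 * Build c->replicas from whichever replicas section the superblock carries,
 * preferring the current format and falling back to replicas_v0.
 */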
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_v1;
	struct bch_sb_field_replicas_v0 *sb_v0;
	struct bch_replicas_cpu new_r = { 0, 0, NULL };
	int ret = 0;

	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
	if (ret)
		return ret;

	bch2_cpu_replicas_sort(&new_r);

	percpu_down_write(&c->mark_lock);
	swap(c->replicas, new_r);
	percpu_up_write(&c->mark_lock);

	kfree(new_r.entries);

	return 0;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
					       struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas_v0 *sb_r;
	struct bch_replicas_entry_v0 *dst;
	struct bch_replicas_entry_v1 *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src) - 1;

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		dst->data_type	= src->data_type;
		dst->nr_devs	= src->nr_devs;
		memcpy(dst->devs, src->devs, src->nr_devs);

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

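/*
 * Write the in-memory replicas table to the superblock: if every entry has
 * nr_required == 1 the more compact v0 section format is used, otherwise the
 * current format; whichever format isn't used is deleted so only one section
 * is ever present.
 */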
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry_v1 *dst, *src;
	bool need_v1 = false;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src) {
		bytes += replicas_entry_bytes(src);
		if (src->nr_required != 1)
			need_v1 = true;
	}

	if (!need_v1)
		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
				      struct bch_sb *sb,
				      struct printbuf *err)
{
	unsigned i;

	sort_r(cpu_r->entries,
	       cpu_r->nr,
	       cpu_r->entry_size,
	       bch2_memcmp, NULL,
	       (void *)(size_t)cpu_r->entry_size);

	for (i = 0; i < cpu_r->nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(cpu_r, i);

		int ret = bch2_replicas_entry_sb_validate(e, sb, err);
		if (ret)
			return ret;

		if (i + 1 < cpu_r->nr) {
			struct bch_replicas_entry_v1 *n =
				cpu_replicas_entry(cpu_r, i + 1);

			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);

			if (!memcmp(e, n, cpu_r->entry_size)) {
				prt_printf(err, "duplicate replicas entry ");
				bch2_replicas_entry_to_text(err, e);
				return -BCH_ERR_invalid_sb_replicas;
			}
		}
	}

	return 0;
}

static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
				     enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
				     struct bch_sb *sb,
				     struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_replicas_validate,
	.to_text	= bch2_sb_replicas_to_text,
};

static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
					enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
					struct bch_sb *sb,
					struct bch_sb_field *f)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_entry_v0 *e;
	bool first = true;

	for_each_replicas_entry(sb_r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_v0_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
	.validate	= bch2_sb_replicas_v0_validate,
	.to_text	= bch2_sb_replicas_v0_to_text,
};

/* Query replicas: */

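/*
 * Check every replicas entry against @devs, the mask of online devices:
 * returns false if some entry would be degraded or lost beyond what the
 * BCH_FORCE_IF_* bits in @flags allow, optionally logging the offending
 * entry.
 */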
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
			   unsigned flags, bool print)
{
	struct bch_replicas_entry_v1 *e;
	bool ret = true;

	percpu_down_read(&c->mark_lock);
	for_each_cpu_replicas_entry(&c->replicas, e) {
		unsigned nr_online = 0, nr_failed = 0, dflags = 0;
		bool metadata = e->data_type < BCH_DATA_user;

		if (e->data_type == BCH_DATA_cached)
			continue;

		rcu_read_lock();
		for (unsigned i = 0; i < e->nr_devs; i++) {
			if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
				nr_failed++;
				continue;
			}

			nr_online += test_bit(e->devs[i], devs.d);

			struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
			nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
		}
		rcu_read_unlock();

		if (nr_online + nr_failed == e->nr_devs)
			continue;

		if (nr_online < e->nr_required)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_LOST
				: BCH_FORCE_IF_DATA_LOST;

		if (nr_online < e->nr_devs)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_DEGRADED
				: BCH_FORCE_IF_DATA_DEGRADED;

		if (dflags & ~flags) {
			if (print) {
				struct printbuf buf = PRINTBUF;

				bch2_replicas_entry_to_text(&buf, e);
				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
					nr_online, buf.buf);
				printbuf_exit(&buf);
			}
			ret = false;
			break;
		}
	}
	percpu_up_read(&c->mark_lock);

	return ret;
}

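/*
 * Return a bitmask of data types (1 << BCH_DATA_*) that the superblock
 * replicas sections say device index @dev holds.
 */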
unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
	struct bch_sb_field_replicas *replicas;
	struct bch_sb_field_replicas_v0 *replicas_v0;
	unsigned data_has = 0;

	replicas = bch2_sb_field_get(sb, replicas);
	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);

	if (replicas) {
		struct bch_replicas_entry_v1 *r;

		for_each_replicas_entry(replicas, r) {
			if (r->data_type >= sizeof(data_has) * 8)
				continue;

			for (unsigned i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
		}
	} else if (replicas_v0) {
		struct bch_replicas_entry_v0 *r;

		for_each_replicas_entry_v0(replicas_v0, r) {
			if (r->data_type >= sizeof(data_has) * 8)
				continue;

			for (unsigned i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
		}
	}

	return data_has;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	mutex_lock(&c->sb_lock);
	unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
	mutex_unlock(&c->sb_lock);

	return ret;
}

void bch2_fs_replicas_exit(struct bch_fs *c)
{
	kfree(c->replicas.entries);
	kfree(c->replicas_gc.entries);
}