1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_journal_iter.h"
9 #include "btree_key_cache.h"
10 #include "btree_locking.h"
11 #include "btree_update.h"
12 #include "debug.h"
13 #include "error.h"
14 #include "extents.h"
15 #include "journal.h"
16 #include "journal_io.h"
17 #include "replicas.h"
18 #include "snapshot.h"
19 #include "trace.h"
20
21 #include <linux/random.h>
22 #include <linux/prefetch.h>
23
24 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
25 static inline void btree_path_list_add(struct btree_trans *,
26 btree_path_idx_t, btree_path_idx_t);
27
static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
29 {
30 #ifdef TRACK_PATH_ALLOCATED
31 return iter->ip_allocated;
32 #else
33 return 0;
34 #endif
35 }
36
37 static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
38 static void bch2_trans_srcu_lock(struct btree_trans *);
39
static inline int __btree_path_cmp(const struct btree_path *l,
41 enum btree_id r_btree_id,
42 bool r_cached,
43 struct bpos r_pos,
44 unsigned r_level)
45 {
46 /*
47 * Must match lock ordering as defined by __bch2_btree_node_lock:
48 */
49 return cmp_int(l->btree_id, r_btree_id) ?:
50 cmp_int((int) l->cached, (int) r_cached) ?:
51 bpos_cmp(l->pos, r_pos) ?:
52 -cmp_int(l->level, r_level);
53 }
54
static inline int btree_path_cmp(const struct btree_path *l,
56 const struct btree_path *r)
57 {
58 return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
59 }
60
static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
62 {
63 /* Are we iterating over keys in all snapshots? */
64 if (iter->flags & BTREE_ITER_all_snapshots) {
65 p = bpos_successor(p);
66 } else {
67 p = bpos_nosnap_successor(p);
68 p.snapshot = iter->snapshot;
69 }
70
71 return p;
72 }
73
static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
75 {
76 /* Are we iterating over keys in all snapshots? */
77 if (iter->flags & BTREE_ITER_all_snapshots) {
78 p = bpos_predecessor(p);
79 } else {
80 p = bpos_nosnap_predecessor(p);
81 p.snapshot = iter->snapshot;
82 }
83
84 return p;
85 }
86
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
88 {
89 struct bpos pos = iter->pos;
90
91 if ((iter->flags & BTREE_ITER_is_extents) &&
92 !bkey_eq(pos, POS_MAX))
93 pos = bkey_successor(iter, pos);
94 return pos;
95 }
96
static inline bool btree_path_pos_before_node(struct btree_path *path,
98 struct btree *b)
99 {
100 return bpos_lt(path->pos, b->data->min_key);
101 }
102
static inline bool btree_path_pos_after_node(struct btree_path *path,
104 struct btree *b)
105 {
106 return bpos_gt(path->pos, b->key.k.p);
107 }
108
static inline bool btree_path_pos_in_node(struct btree_path *path,
110 struct btree *b)
111 {
112 return path->btree_id == b->c.btree_id &&
113 !btree_path_pos_before_node(path, b) &&
114 !btree_path_pos_after_node(path, b);
115 }
116
117 /* Btree iterator: */
118
119 #ifdef CONFIG_BCACHEFS_DEBUG
120
static void bch2_btree_path_verify_cached(struct btree_trans *trans,
122 struct btree_path *path)
123 {
124 struct bkey_cached *ck;
125 bool locked = btree_node_locked(path, 0);
126
127 if (!bch2_btree_node_relock(trans, path, 0))
128 return;
129
130 ck = (void *) path->l[0].b;
131 BUG_ON(ck->key.btree_id != path->btree_id ||
132 !bkey_eq(ck->key.pos, path->pos));
133
134 if (!locked)
135 btree_node_unlock(trans, path, 0);
136 }
137
static void bch2_btree_path_verify_level(struct btree_trans *trans,
139 struct btree_path *path, unsigned level)
140 {
141 struct btree_path_level *l;
142 struct btree_node_iter tmp;
143 bool locked;
144 struct bkey_packed *p, *k;
145 struct printbuf buf1 = PRINTBUF;
146 struct printbuf buf2 = PRINTBUF;
147 struct printbuf buf3 = PRINTBUF;
148 const char *msg;
149
150 if (!bch2_debug_check_iterators)
151 return;
152
153 l = &path->l[level];
154 tmp = l->iter;
155 locked = btree_node_locked(path, level);
156
157 if (path->cached) {
158 if (!level)
159 bch2_btree_path_verify_cached(trans, path);
160 return;
161 }
162
163 if (!btree_path_node(path, level))
164 return;
165
166 if (!bch2_btree_node_relock_notrace(trans, path, level))
167 return;
168
169 BUG_ON(!btree_path_pos_in_node(path, l->b));
170
171 bch2_btree_node_iter_verify(&l->iter, l->b);
172
173 /*
174 * For interior nodes, the iterator will have skipped past deleted keys:
175 */
176 p = level
177 ? bch2_btree_node_iter_prev(&tmp, l->b)
178 : bch2_btree_node_iter_prev_all(&tmp, l->b);
179 k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
180
181 if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
182 msg = "before";
183 goto err;
184 }
185
186 if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
187 msg = "after";
188 goto err;
189 }
190
191 if (!locked)
192 btree_node_unlock(trans, path, level);
193 return;
194 err:
195 bch2_bpos_to_text(&buf1, path->pos);
196
197 if (p) {
198 struct bkey uk = bkey_unpack_key(l->b, p);
199
200 bch2_bkey_to_text(&buf2, &uk);
201 } else {
202 prt_printf(&buf2, "(none)");
203 }
204
205 if (k) {
206 struct bkey uk = bkey_unpack_key(l->b, k);
207
208 bch2_bkey_to_text(&buf3, &uk);
209 } else {
210 prt_printf(&buf3, "(none)");
211 }
212
213 panic("path should be %s key at level %u:\n"
214 "path pos %s\n"
215 "prev key %s\n"
216 "cur key %s\n",
217 msg, level, buf1.buf, buf2.buf, buf3.buf);
218 }
219
static void bch2_btree_path_verify(struct btree_trans *trans,
221 struct btree_path *path)
222 {
223 struct bch_fs *c = trans->c;
224 unsigned i;
225
226 EBUG_ON(path->btree_id >= BTREE_ID_NR);
227
228 for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
229 if (!path->l[i].b) {
230 BUG_ON(!path->cached &&
231 bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
232 break;
233 }
234
235 bch2_btree_path_verify_level(trans, path, i);
236 }
237
238 bch2_btree_path_verify_locks(path);
239 }
240
void bch2_trans_verify_paths(struct btree_trans *trans)
242 {
243 struct btree_path *path;
244 unsigned iter;
245
246 trans_for_each_path(trans, path, iter)
247 bch2_btree_path_verify(trans, path);
248 }
249
static void bch2_btree_iter_verify(struct btree_iter *iter)
251 {
252 struct btree_trans *trans = iter->trans;
253
254 BUG_ON(iter->btree_id >= BTREE_ID_NR);
255
256 BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
257
258 BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
259 (iter->flags & BTREE_ITER_all_snapshots));
260
261 BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
262 (iter->flags & BTREE_ITER_all_snapshots) &&
263 !btree_type_has_snapshot_field(iter->btree_id));
264
265 if (iter->update_path)
266 bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
267 bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
268 }
269
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
271 {
272 BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
273 !iter->pos.snapshot);
274
275 BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
276 iter->pos.snapshot != iter->snapshot);
277
278 BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
279 bkey_gt(iter->pos, iter->k.p));
280 }
281
static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
283 {
284 struct btree_trans *trans = iter->trans;
285 struct btree_iter copy;
286 struct bkey_s_c prev;
287 int ret = 0;
288
289 if (!bch2_debug_check_iterators)
290 return 0;
291
292 if (!(iter->flags & BTREE_ITER_filter_snapshots))
293 return 0;
294
295 if (bkey_err(k) || !k.k)
296 return 0;
297
298 BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
299 iter->snapshot,
300 k.k->p.snapshot));
301
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
		     BTREE_ITER_nopreserve|
		     BTREE_ITER_all_snapshots);
prev = bch2_btree_iter_prev(&copy);
306 if (!prev.k)
307 goto out;
308
309 ret = bkey_err(prev);
310 if (ret)
311 goto out;
312
313 if (bkey_eq(prev.k->p, k.k->p) &&
314 bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
315 prev.k->p.snapshot) > 0) {
316 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
317
318 bch2_bkey_to_text(&buf1, k.k);
319 bch2_bkey_to_text(&buf2, prev.k);
320
321 panic("iter snap %u\n"
322 "k %s\n"
323 "prev %s\n",
324 iter->snapshot,
325 buf1.buf, buf2.buf);
326 }
327 out:
bch2_trans_iter_exit(trans, &copy);
329 return ret;
330 }
331
void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
333 struct bpos pos, bool key_cache)
334 {
335 bch2_trans_verify_not_unlocked(trans);
336
337 struct btree_path *path;
338 struct trans_for_each_path_inorder_iter iter;
339 struct printbuf buf = PRINTBUF;
340
341 btree_trans_sort_paths(trans);
342
343 trans_for_each_path_inorder(trans, path, iter) {
344 int cmp = cmp_int(path->btree_id, id) ?:
345 cmp_int(path->cached, key_cache);
346
347 if (cmp > 0)
348 break;
349 if (cmp < 0)
350 continue;
351
352 if (!btree_node_locked(path, 0) ||
353 !path->should_be_locked)
354 continue;
355
356 if (!key_cache) {
357 if (bkey_ge(pos, path->l[0].b->data->min_key) &&
358 bkey_le(pos, path->l[0].b->key.k.p))
359 return;
360 } else {
361 if (bkey_eq(pos, path->pos))
362 return;
363 }
364 }
365
366 bch2_dump_trans_paths_updates(trans);
367 bch2_bpos_to_text(&buf, pos);
368
369 panic("not locked: %s %s%s\n",
370 bch2_btree_id_str(id), buf.buf,
371 key_cache ? " cached" : "");
372 }
373
374 #else
375
static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
						struct btree_path *path, unsigned l) {}
static inline void bch2_btree_path_verify(struct btree_trans *trans,
					  struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
383
384 #endif
385
386 /* Btree path: fixups after btree updates */
387
static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
389 struct btree *b,
390 struct bset_tree *t,
391 struct bkey_packed *k)
392 {
393 struct btree_node_iter_set *set;
394
395 btree_node_iter_for_each(iter, set)
396 if (set->end == t->end_offset) {
397 set->k = __btree_node_key_to_offset(b, k);
398 bch2_btree_node_iter_sort(iter, b);
399 return;
400 }
401
402 bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
403 }
404
static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
406 struct btree *b,
407 struct bkey_packed *where)
408 {
409 struct btree_path_level *l = &path->l[b->c.level];
410
411 if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
412 return;
413
414 if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
415 bch2_btree_node_iter_advance(&l->iter, l->b);
416 }
417
void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
419 struct btree *b,
420 struct bkey_packed *where)
421 {
422 struct btree_path *path;
423 unsigned i;
424
425 trans_for_each_path_with_node(trans, b, path, i) {
426 __bch2_btree_path_fix_key_modified(path, b, where);
427 bch2_btree_path_verify_level(trans, path, b->c.level);
428 }
429 }
430
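/*
 * A key at @where in bset @t had @clobber_u64s worth of u64s overwritten by
 * @new_u64s: adjust @node_iter's offset into that bset so it still points at
 * the correct position, re-adding or dropping the bset from the iterator as
 * needed.
 */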
static void __bch2_btree_node_iter_fix(struct btree_path *path,
432 struct btree *b,
433 struct btree_node_iter *node_iter,
434 struct bset_tree *t,
435 struct bkey_packed *where,
436 unsigned clobber_u64s,
437 unsigned new_u64s)
438 {
439 const struct bkey_packed *end = btree_bkey_last(b, t);
440 struct btree_node_iter_set *set;
441 unsigned offset = __btree_node_key_to_offset(b, where);
442 int shift = new_u64s - clobber_u64s;
443 unsigned old_end = t->end_offset - shift;
444 unsigned orig_iter_pos = node_iter->data[0].k;
445 bool iter_current_key_modified =
446 orig_iter_pos >= offset &&
447 orig_iter_pos <= offset + clobber_u64s;
448
449 btree_node_iter_for_each(node_iter, set)
450 if (set->end == old_end)
451 goto found;
452
/* didn't find the bset in the iterator - might have to re-add it: */
454 if (new_u64s &&
455 bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
456 bch2_btree_node_iter_push(node_iter, b, where, end);
457 goto fixup_done;
458 } else {
459 /* Iterator is after key that changed */
460 return;
461 }
462 found:
463 set->end = t->end_offset;
464
465 /* Iterator hasn't gotten to the key that changed yet: */
466 if (set->k < offset)
467 return;
468
469 if (new_u64s &&
470 bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
471 set->k = offset;
472 } else if (set->k < offset + clobber_u64s) {
473 set->k = offset + new_u64s;
474 if (set->k == set->end)
475 bch2_btree_node_iter_set_drop(node_iter, set);
476 } else {
477 /* Iterator is after key that changed */
478 set->k = (int) set->k + shift;
479 return;
480 }
481
482 bch2_btree_node_iter_sort(node_iter, b);
483 fixup_done:
484 if (node_iter->data[0].k != orig_iter_pos)
485 iter_current_key_modified = true;
486
487 /*
488 * When a new key is added, and the node iterator now points to that
489 * key, the iterator might have skipped past deleted keys that should
490 * come after the key the iterator now points to. We have to rewind to
491 * before those deleted keys - otherwise
492 * bch2_btree_node_iter_prev_all() breaks:
493 */
494 if (!bch2_btree_node_iter_end(node_iter) &&
495 iter_current_key_modified &&
496 b->c.level) {
497 struct bkey_packed *k, *k2, *p;
498
499 k = bch2_btree_node_iter_peek_all(node_iter, b);
500
501 for_each_bset(b, t) {
502 bool set_pos = false;
503
504 if (node_iter->data[0].end == t->end_offset)
505 continue;
506
507 k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
508
509 while ((p = bch2_bkey_prev_all(b, t, k2)) &&
510 bkey_iter_cmp(b, k, p) < 0) {
511 k2 = p;
512 set_pos = true;
513 }
514
515 if (set_pos)
516 btree_node_iter_set_set_pos(node_iter,
517 b, t, k2);
518 }
519 }
520 }
521
void bch2_btree_node_iter_fix(struct btree_trans *trans,
523 struct btree_path *path,
524 struct btree *b,
525 struct btree_node_iter *node_iter,
526 struct bkey_packed *where,
527 unsigned clobber_u64s,
528 unsigned new_u64s)
529 {
530 struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
531 struct btree_path *linked;
532 unsigned i;
533
534 if (node_iter != &path->l[b->c.level].iter) {
535 __bch2_btree_node_iter_fix(path, b, node_iter, t,
536 where, clobber_u64s, new_u64s);
537
538 if (bch2_debug_check_iterators)
539 bch2_btree_node_iter_verify(node_iter, b);
540 }
541
542 trans_for_each_path_with_node(trans, b, linked, i) {
543 __bch2_btree_node_iter_fix(linked, b,
544 &linked->l[b->c.level].iter, t,
545 where, clobber_u64s, new_u64s);
546 bch2_btree_path_verify_level(trans, linked, b->c.level);
547 }
548 }
549
550 /* Btree path level: pointer to a particular btree node and node iter */
551
static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
553 struct btree_path_level *l,
554 struct bkey *u,
555 struct bkey_packed *k)
556 {
557 if (unlikely(!k)) {
558 /*
559 * signal to bch2_btree_iter_peek_slot() that we're currently at
560 * a hole
561 */
562 u->type = KEY_TYPE_deleted;
563 return bkey_s_c_null;
564 }
565
566 return bkey_disassemble(l->b, k, u);
567 }
568
static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
570 struct btree_path_level *l,
571 struct bkey *u)
572 {
573 return __btree_iter_unpack(c, l, u,
574 bch2_btree_node_iter_peek_all(&l->iter, l->b));
575 }
576
static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
578 struct btree_path *path,
579 struct btree_path_level *l,
580 struct bkey *u)
581 {
582 struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
583 bch2_btree_node_iter_peek(&l->iter, l->b));
584
585 path->pos = k.k ? k.k->p : l->b->key.k.p;
586 trans->paths_sorted = false;
587 bch2_btree_path_verify_level(trans, path, l - path->l);
588 return k;
589 }
590
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
592 struct btree_path *path,
593 struct btree_path_level *l,
594 struct bkey *u)
595 {
596 struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
597 bch2_btree_node_iter_prev(&l->iter, l->b));
598
599 path->pos = k.k ? k.k->p : l->b->data->min_key;
600 trans->paths_sorted = false;
601 bch2_btree_path_verify_level(trans, path, l - path->l);
602 return k;
603 }
604
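/*
 * Advance the node iterator until it points at or past path->pos; gives up
 * and returns false after @max_advance steps so the caller can reinitialize
 * the iterator instead, which is cheaper when many keys have to be skipped.
 */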
static inline bool btree_path_advance_to_pos(struct btree_path *path,
606 struct btree_path_level *l,
607 int max_advance)
608 {
609 struct bkey_packed *k;
610 int nr_advanced = 0;
611
612 while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
613 bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
614 if (max_advance > 0 && nr_advanced >= max_advance)
615 return false;
616
617 bch2_btree_node_iter_advance(&l->iter, l->b);
618 nr_advanced++;
619 }
620
621 return true;
622 }
623
static inline void __btree_path_level_init(struct btree_path *path,
625 unsigned level)
626 {
627 struct btree_path_level *l = &path->l[level];
628
629 bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
630
631 /*
632 * Iterators to interior nodes should always be pointed at the first non
633 * whiteout:
634 */
635 if (level)
636 bch2_btree_node_iter_peek(&l->iter, l->b);
637 }
638
void bch2_btree_path_level_init(struct btree_trans *trans,
640 struct btree_path *path,
641 struct btree *b)
642 {
643 BUG_ON(path->cached);
644
645 EBUG_ON(!btree_path_pos_in_node(path, b));
646
647 path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
648 path->l[b->c.level].b = b;
649 __btree_path_level_init(path, b->c.level);
650 }
651
652 /* Btree path: fixups after btree node updates: */
653
static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
655 {
656 struct bch_fs *c = trans->c;
657
658 trans_for_each_update(trans, i)
659 if (!i->cached &&
660 i->level == b->c.level &&
661 i->btree_id == b->c.btree_id &&
662 bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
663 bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
664 i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
665
666 if (unlikely(trans->journal_replay_not_finished)) {
667 struct bkey_i *j_k =
668 bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
669 i->k->k.p);
670
671 if (j_k) {
672 i->old_k = j_k->k;
673 i->old_v = &j_k->v;
674 }
675 }
676 }
677 }
678
679 /*
680 * A btree node is being replaced - update the iterator to point to the new
681 * node:
682 */
void bch2_trans_node_add(struct btree_trans *trans,
684 struct btree_path *path,
685 struct btree *b)
686 {
687 struct btree_path *prev;
688
689 BUG_ON(!btree_path_pos_in_node(path, b));
690
691 while ((prev = prev_btree_path(trans, path)) &&
692 btree_path_pos_in_node(prev, b))
693 path = prev;
694
695 for (;
696 path && btree_path_pos_in_node(path, b);
697 path = next_btree_path(trans, path))
698 if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
699 enum btree_node_locked_type t =
700 btree_lock_want(path, b->c.level);
701
702 if (t != BTREE_NODE_UNLOCKED) {
703 btree_node_unlock(trans, path, b->c.level);
704 six_lock_increment(&b->c.lock, (enum six_lock_type) t);
705 mark_btree_node_locked(trans, path, b->c.level, t);
706 }
707
708 bch2_btree_path_level_init(trans, path, b);
709 }
710
711 bch2_trans_revalidate_updates_in_node(trans, b);
712 }
713
714 /*
715 * A btree node has been modified in such a way as to invalidate iterators - fix
716 * them:
717 */
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
719 {
720 struct btree_path *path;
721 unsigned i;
722
723 trans_for_each_path_with_node(trans, b, path, i)
724 __btree_path_level_init(path, b->c.level);
725
726 bch2_trans_revalidate_updates_in_node(trans, b);
727 }
728
729 /* Btree path: traverse, set_pos: */
730
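/*
 * Lock the current btree root for @path, retrying if the root changed between
 * reading the root pointer and acquiring the lock; returns 1 if the btree is
 * shallower than @depth_want, i.e. there are no nodes at the requested level.
 */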
static inline int btree_path_lock_root(struct btree_trans *trans,
732 struct btree_path *path,
733 unsigned depth_want,
734 unsigned long trace_ip)
735 {
736 struct bch_fs *c = trans->c;
737 struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
738 enum six_lock_type lock_type;
739 unsigned i;
740 int ret;
741
742 EBUG_ON(path->nodes_locked);
743
744 while (1) {
745 b = READ_ONCE(*rootp);
746 path->level = READ_ONCE(b->c.level);
747
748 if (unlikely(path->level < depth_want)) {
749 /*
750 * the root is at a lower depth than the depth we want:
751 * got to the end of the btree, or we're walking nodes
752 * greater than some depth and there are no nodes >=
753 * that depth
754 */
755 path->level = depth_want;
756 for (i = path->level; i < BTREE_MAX_DEPTH; i++)
757 path->l[i].b = NULL;
758 return 1;
759 }
760
761 lock_type = __btree_lock_want(path, path->level);
762 ret = btree_node_lock(trans, path, &b->c,
763 path->level, lock_type, trace_ip);
764 if (unlikely(ret)) {
765 if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
766 continue;
767 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
768 return ret;
769 BUG();
770 }
771
772 if (likely(b == READ_ONCE(*rootp) &&
773 b->c.level == path->level &&
774 !race_fault())) {
775 for (i = 0; i < path->level; i++)
776 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
777 path->l[path->level].b = b;
778 for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
779 path->l[i].b = NULL;
780
781 mark_btree_node_locked(trans, path, path->level,
782 (enum btree_node_locked_type) lock_type);
783 bch2_btree_path_level_init(trans, path, b);
784 return 0;
785 }
786
787 six_unlock_type(&b->c.lock, lock_type);
788 }
789 }
790
791 noinline
static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
793 {
794 struct bch_fs *c = trans->c;
795 struct btree_path_level *l = path_l(path);
796 struct btree_node_iter node_iter = l->iter;
797 struct bkey_packed *k;
798 struct bkey_buf tmp;
799 unsigned nr = test_bit(BCH_FS_started, &c->flags)
800 ? (path->level > 1 ? 0 : 2)
801 : (path->level > 1 ? 1 : 16);
802 bool was_locked = btree_node_locked(path, path->level);
803 int ret = 0;
804
805 bch2_bkey_buf_init(&tmp);
806
807 while (nr-- && !ret) {
808 if (!bch2_btree_node_relock(trans, path, path->level))
809 break;
810
811 bch2_btree_node_iter_advance(&node_iter, l->b);
812 k = bch2_btree_node_iter_peek(&node_iter, l->b);
813 if (!k)
814 break;
815
816 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
817 ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
818 path->level - 1);
819 }
820
821 if (!was_locked)
822 btree_node_unlock(trans, path, path->level);
823
824 bch2_bkey_buf_exit(&tmp, c);
825 return ret;
826 }
827
static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
829 struct btree_and_journal_iter *jiter)
830 {
831 struct bch_fs *c = trans->c;
832 struct bkey_s_c k;
833 struct bkey_buf tmp;
834 unsigned nr = test_bit(BCH_FS_started, &c->flags)
835 ? (path->level > 1 ? 0 : 2)
836 : (path->level > 1 ? 1 : 16);
837 bool was_locked = btree_node_locked(path, path->level);
838 int ret = 0;
839
840 bch2_bkey_buf_init(&tmp);
841
842 while (nr-- && !ret) {
843 if (!bch2_btree_node_relock(trans, path, path->level))
844 break;
845
846 bch2_btree_and_journal_iter_advance(jiter);
847 k = bch2_btree_and_journal_iter_peek(jiter);
848 if (!k.k)
849 break;
850
851 bch2_bkey_buf_reassemble(&tmp, c, k);
852 ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
853 path->level - 1);
854 }
855
856 if (!was_locked)
857 btree_node_unlock(trans, path, path->level);
858
859 bch2_bkey_buf_exit(&tmp, c);
860 return ret;
861 }
862
static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
864 struct btree_path *path,
865 unsigned plevel, struct btree *b)
866 {
867 struct btree_path_level *l = &path->l[plevel];
868 bool locked = btree_node_locked(path, plevel);
869 struct bkey_packed *k;
870 struct bch_btree_ptr_v2 *bp;
871
872 if (!bch2_btree_node_relock(trans, path, plevel))
873 return;
874
875 k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
876 BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
877
878 bp = (void *) bkeyp_val(&l->b->format, k);
879 bp->mem_ptr = (unsigned long)b;
880
881 if (!locked)
882 btree_node_unlock(trans, path, plevel);
883 }
884
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
886 struct btree_path *path,
887 unsigned flags,
888 struct bkey_buf *out)
889 {
890 struct bch_fs *c = trans->c;
891 struct btree_path_level *l = path_l(path);
892 struct btree_and_journal_iter jiter;
893 struct bkey_s_c k;
894 int ret = 0;
895
896 __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
897
898 k = bch2_btree_and_journal_iter_peek(&jiter);
899
900 bch2_bkey_buf_reassemble(out, c, k);
901
902 if ((flags & BTREE_ITER_prefetch) &&
903 c->opts.btree_node_prefetch)
904 ret = btree_path_prefetch_j(trans, path, &jiter);
905
906 bch2_btree_and_journal_iter_exit(&jiter);
907 return ret;
908 }
909
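/*
 * Descend one level: peek the child pointer from the node iterator at the
 * current level (consulting the journal keys if replay hasn't finished), read
 * and lock the child node, then point the path at it.
 */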
static __always_inline int btree_path_down(struct btree_trans *trans,
911 struct btree_path *path,
912 unsigned flags,
913 unsigned long trace_ip)
914 {
915 struct bch_fs *c = trans->c;
916 struct btree_path_level *l = path_l(path);
917 struct btree *b;
918 unsigned level = path->level - 1;
919 enum six_lock_type lock_type = __btree_lock_want(path, level);
920 struct bkey_buf tmp;
921 int ret;
922
923 EBUG_ON(!btree_node_locked(path, path->level));
924
925 bch2_bkey_buf_init(&tmp);
926
927 if (unlikely(trans->journal_replay_not_finished)) {
928 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
929 if (ret)
930 goto err;
931 } else {
932 struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
933 if (!k) {
934 struct printbuf buf = PRINTBUF;
935
936 prt_str(&buf, "node not found at pos ");
937 bch2_bpos_to_text(&buf, path->pos);
938 prt_str(&buf, " within parent node ");
939 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
940
941 bch2_fs_fatal_error(c, "%s", buf.buf);
942 printbuf_exit(&buf);
943 ret = -BCH_ERR_btree_need_topology_repair;
944 goto err;
945 }
946
947 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
948
949 if ((flags & BTREE_ITER_prefetch) &&
950 c->opts.btree_node_prefetch) {
951 ret = btree_path_prefetch(trans, path);
952 if (ret)
953 goto err;
954 }
955 }
956
957 b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
958 ret = PTR_ERR_OR_ZERO(b);
959 if (unlikely(ret))
960 goto err;
961
962 if (likely(!trans->journal_replay_not_finished &&
963 tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
964 unlikely(b != btree_node_mem_ptr(tmp.k)))
965 btree_node_mem_ptr_set(trans, path, level + 1, b);
966
967 if (btree_node_read_locked(path, level + 1))
968 btree_node_unlock(trans, path, level + 1);
969
970 mark_btree_node_locked(trans, path, level,
971 (enum btree_node_locked_type) lock_type);
972 path->level = level;
973 bch2_btree_path_level_init(trans, path, b);
974
975 bch2_btree_path_verify_locks(path);
976 err:
977 bch2_bkey_buf_exit(&tmp, c);
978 return ret;
979 }
980
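/*
 * Re-traverse every path in the transaction in sorted order - used after
 * certain transaction restarts so that node locks are reacquired in a
 * consistent order.
 */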
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
982 {
983 struct bch_fs *c = trans->c;
984 struct btree_path *path;
985 unsigned long trace_ip = _RET_IP_;
986 unsigned i;
987 int ret = 0;
988
989 if (trans->in_traverse_all)
990 return -BCH_ERR_transaction_restart_in_traverse_all;
991
992 trans->in_traverse_all = true;
993 retry_all:
994 trans->restarted = 0;
995 trans->last_restarted_ip = 0;
996
997 trans_for_each_path(trans, path, i)
998 path->should_be_locked = false;
999
1000 btree_trans_sort_paths(trans);
1001
1002 bch2_trans_unlock(trans);
1003 cond_resched();
1004 trans->locked = true;
1005
1006 if (unlikely(trans->memory_allocation_failure)) {
1007 struct closure cl;
1008
1009 closure_init_stack(&cl);
1010
1011 do {
1012 ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
1013 closure_sync(&cl);
1014 } while (ret);
1015 }
1016
1017 /* Now, redo traversals in correct order: */
1018 i = 0;
1019 while (i < trans->nr_sorted) {
1020 btree_path_idx_t idx = trans->sorted[i];
1021
1022 /*
1023 * Traversing a path can cause another path to be added at about
1024 * the same position:
1025 */
1026 if (trans->paths[idx].uptodate) {
1027 __btree_path_get(&trans->paths[idx], false);
1028 ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
1029 __btree_path_put(&trans->paths[idx], false);
1030
1031 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1032 bch2_err_matches(ret, ENOMEM))
1033 goto retry_all;
1034 if (ret)
1035 goto err;
1036 } else {
1037 i++;
1038 }
1039 }
1040
1041 /*
1042 * We used to assert that all paths had been traversed here
1043 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
1044 * path->should_be_locked is not set yet, we might have unlocked and
1045 * then failed to relock a path - that's fine.
1046 */
1047 err:
1048 bch2_btree_cache_cannibalize_unlock(trans);
1049
1050 trans->in_traverse_all = false;
1051
1052 trace_and_count(c, trans_traverse_all, trans, trace_ip);
1053 return ret;
1054 }
1055
static inline bool btree_path_check_pos_in_node(struct btree_path *path,
1057 unsigned l, int check_pos)
1058 {
1059 if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1060 return false;
1061 if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1062 return false;
1063 return true;
1064 }
1065
static inline bool btree_path_good_node(struct btree_trans *trans,
1067 struct btree_path *path,
1068 unsigned l, int check_pos)
1069 {
1070 return is_btree_node(path, l) &&
1071 bch2_btree_node_relock(trans, path, l) &&
1072 btree_path_check_pos_in_node(path, l, check_pos);
1073 }
1074
static void btree_path_set_level_down(struct btree_trans *trans,
1076 struct btree_path *path,
1077 unsigned new_level)
1078 {
1079 unsigned l;
1080
1081 path->level = new_level;
1082
1083 for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1084 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1085 btree_node_unlock(trans, path, l);
1086
1087 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1088 bch2_btree_path_verify(trans, path);
1089 }
1090
static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
1092 struct btree_path *path,
1093 int check_pos)
1094 {
1095 unsigned i, l = path->level;
1096 again:
1097 while (btree_path_node(path, l) &&
1098 !btree_path_good_node(trans, path, l, check_pos))
1099 __btree_path_set_level_up(trans, path, l++);
1100
1101 /* If we need intent locks, take them too: */
1102 for (i = l + 1;
1103 i < path->locks_want && btree_path_node(path, i);
1104 i++)
1105 if (!bch2_btree_node_relock(trans, path, i)) {
1106 while (l <= i)
1107 __btree_path_set_level_up(trans, path, l++);
1108 goto again;
1109 }
1110
1111 return l;
1112 }
1113
static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1115 struct btree_path *path,
1116 int check_pos)
1117 {
1118 return likely(btree_node_locked(path, path->level) &&
1119 btree_path_check_pos_in_node(path, path->level, check_pos))
1120 ? path->level
1121 : __btree_path_up_until_good_node(trans, path, check_pos);
1122 }
1123
1124 /*
1125 * This is the main state machine for walking down the btree - walks down to a
1126 * specified depth
1127 *
1128 * Returns 0 on success, -EIO on error (error reading in a btree node).
1129 *
1130 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1131 * stashed in the iterator and returned from bch2_trans_exit().
1132 */
int bch2_btree_path_traverse_one(struct btree_trans *trans,
1134 btree_path_idx_t path_idx,
1135 unsigned flags,
1136 unsigned long trace_ip)
1137 {
1138 struct btree_path *path = &trans->paths[path_idx];
1139 unsigned depth_want = path->level;
1140 int ret = -((int) trans->restarted);
1141
1142 if (unlikely(ret))
1143 goto out;
1144
1145 if (unlikely(!trans->srcu_held))
1146 bch2_trans_srcu_lock(trans);
1147
1148 /*
1149 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1150 * and re-traverse the path without a transaction restart:
1151 */
1152 if (path->should_be_locked) {
1153 ret = bch2_btree_path_relock(trans, path, trace_ip);
1154 goto out;
1155 }
1156
1157 if (path->cached) {
1158 ret = bch2_btree_path_traverse_cached(trans, path, flags);
1159 goto out;
1160 }
1161
1162 path = &trans->paths[path_idx];
1163
1164 if (unlikely(path->level >= BTREE_MAX_DEPTH))
1165 goto out_uptodate;
1166
1167 path->level = btree_path_up_until_good_node(trans, path, 0);
1168 unsigned max_level = path->level;
1169
1170 EBUG_ON(btree_path_node(path, path->level) &&
1171 !btree_node_locked(path, path->level));
1172
1173 /*
* Note: path->l[path->level].b may be temporarily NULL here - that
1175 * would indicate to other code that we got to the end of the btree,
1176 * here it indicates that relocking the root failed - it's critical that
1177 * btree_path_lock_root() comes next and that it can't fail
1178 */
1179 while (path->level > depth_want) {
1180 ret = btree_path_node(path, path->level)
1181 ? btree_path_down(trans, path, flags, trace_ip)
1182 : btree_path_lock_root(trans, path, depth_want, trace_ip);
1183 if (unlikely(ret)) {
1184 if (ret == 1) {
1185 /*
1186 * No nodes at this level - got to the end of
1187 * the btree:
1188 */
1189 ret = 0;
1190 goto out;
1191 }
1192
1193 __bch2_btree_path_unlock(trans, path);
1194 path->level = depth_want;
1195 path->l[path->level].b = ERR_PTR(ret);
1196 goto out;
1197 }
1198 }
1199
1200 if (unlikely(max_level > path->level)) {
1201 struct btree_path *linked;
1202 unsigned iter;
1203
1204 trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
1205 for (unsigned j = path->level + 1; j < max_level; j++)
1206 linked->l[j] = path->l[j];
1207 }
1208
1209 out_uptodate:
1210 path->uptodate = BTREE_ITER_UPTODATE;
1211 out:
1212 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
1213 panic("ret %s (%i) trans->restarted %s (%i)\n",
1214 bch2_err_str(ret), ret,
1215 bch2_err_str(trans->restarted), trans->restarted);
1216 bch2_btree_path_verify(trans, path);
1217 return ret;
1218 }
1219
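/*
 * Copy everything from the pos field onwards from @src to @dst, taking an
 * extra reference on every node lock @src holds so both paths can drop their
 * locks independently.
 */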
static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1221 struct btree_path *src)
1222 {
1223 unsigned i, offset = offsetof(struct btree_path, pos);
1224
1225 memcpy((void *) dst + offset,
1226 (void *) src + offset,
1227 sizeof(struct btree_path) - offset);
1228
1229 for (i = 0; i < BTREE_MAX_DEPTH; i++) {
1230 unsigned t = btree_node_locked_type(dst, i);
1231
1232 if (t != BTREE_NODE_UNLOCKED)
1233 six_lock_increment(&dst->l[i].b->c.lock, t);
1234 }
1235 }
1236
static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
1238 bool intent, unsigned long ip)
1239 {
1240 btree_path_idx_t new = btree_path_alloc(trans, src);
1241 btree_path_copy(trans, trans->paths + new, trans->paths + src);
1242 __btree_path_get(trans->paths + new, intent);
1243 #ifdef TRACK_PATH_ALLOCATED
1244 trans->paths[new].ip_allocated = ip;
1245 #endif
1246 return new;
1247 }
1248
1249 __flatten
btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
1251 btree_path_idx_t path, bool intent, unsigned long ip)
1252 {
1253 __btree_path_put(trans->paths + path, intent);
1254 path = btree_path_clone(trans, path, intent, ip);
1255 trans->paths[path].preserve = false;
1256 return path;
1257 }
1258
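/*
 * Move a path to a new position: the path is first made private to this
 * reference (cloned if shared), then walked up to the closest node covering
 * the new position and its node iterators readjusted.
 */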
1259 btree_path_idx_t __must_check
__bch2_btree_path_set_pos(struct btree_trans *trans,
1261 btree_path_idx_t path_idx, struct bpos new_pos,
1262 bool intent, unsigned long ip)
1263 {
1264 int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
1265
1266 bch2_trans_verify_not_in_restart(trans);
1267 EBUG_ON(!trans->paths[path_idx].ref);
1268
1269 path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
1270
1271 struct btree_path *path = trans->paths + path_idx;
1272 path->pos = new_pos;
1273 trans->paths_sorted = false;
1274
1275 if (unlikely(path->cached)) {
1276 btree_node_unlock(trans, path, 0);
1277 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1278 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1279 goto out;
1280 }
1281
1282 unsigned level = btree_path_up_until_good_node(trans, path, cmp);
1283
1284 if (btree_path_node(path, level)) {
1285 struct btree_path_level *l = &path->l[level];
1286
1287 BUG_ON(!btree_node_locked(path, level));
1288 /*
1289 * We might have to skip over many keys, or just a few: try
1290 * advancing the node iterator, and if we have to skip over too
1291 * many keys just reinit it (or if we're rewinding, since that
1292 * is expensive).
1293 */
1294 if (cmp < 0 ||
1295 !btree_path_advance_to_pos(path, l, 8))
1296 bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
1297
1298 /*
1299 * Iterators to interior nodes should always be pointed at the first non
1300 * whiteout:
1301 */
1302 if (unlikely(level))
1303 bch2_btree_node_iter_peek(&l->iter, l->b);
1304 }
1305
1306 if (unlikely(level != path->level)) {
1307 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1308 __bch2_btree_path_unlock(trans, path);
1309 }
1310 out:
1311 bch2_btree_path_verify(trans, path);
1312 return path_idx;
1313 }
1314
1315 /* Btree path: main interface: */
1316
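/*
 * Helpers for bch2_path_put(): look for another path at the same position, or
 * pointing to the same node, that can take over this path's duties when it's
 * freed.
 */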
static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1318 {
1319 struct btree_path *sib;
1320
1321 sib = prev_btree_path(trans, path);
1322 if (sib && !btree_path_cmp(sib, path))
1323 return sib;
1324
1325 sib = next_btree_path(trans, path);
1326 if (sib && !btree_path_cmp(sib, path))
1327 return sib;
1328
1329 return NULL;
1330 }
1331
static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1333 {
1334 struct btree_path *sib;
1335
1336 sib = prev_btree_path(trans, path);
1337 if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1338 return sib;
1339
1340 sib = next_btree_path(trans, path);
1341 if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1342 return sib;
1343
1344 return NULL;
1345 }
1346
static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
1348 {
1349 __bch2_btree_path_unlock(trans, trans->paths + path);
1350 btree_path_list_remove(trans, trans->paths + path);
1351 __clear_bit(path, trans->paths_allocated);
1352 }
1353
static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
1355 {
1356 unsigned l = path->level;
1357
1358 do {
1359 if (!btree_path_node(path, l))
1360 break;
1361
1362 if (!is_btree_node(path, l))
1363 return false;
1364
1365 if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
1366 return false;
1367
1368 l++;
1369 } while (l < path->locks_want);
1370
1371 return true;
1372 }
1373
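/*
 * Drop a reference to a path: if it was the last reference, the path is freed
 * when a duplicate exists to take over its preserve/should_be_locked state,
 * or when it isn't marked preserve and doesn't point to a valid node.
 */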
void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
1375 {
1376 struct btree_path *path = trans->paths + path_idx, *dup;
1377
1378 if (!__btree_path_put(path, intent))
1379 return;
1380
1381 dup = path->preserve
1382 ? have_path_at_pos(trans, path)
1383 : have_node_at_pos(trans, path);
1384
1385 if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1386 return;
1387
1388 if (path->should_be_locked && !trans->restarted) {
1389 if (!dup)
1390 return;
1391
1392 if (!(trans->locked
1393 ? bch2_btree_path_relock_norestart(trans, dup)
1394 : bch2_btree_path_can_relock(trans, dup)))
1395 return;
1396 }
1397
1398 if (dup) {
1399 dup->preserve |= path->preserve;
1400 dup->should_be_locked |= path->should_be_locked;
1401 }
1402
1403 __bch2_path_free(trans, path_idx);
1404 }
1405
static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
1407 bool intent)
1408 {
1409 if (!__btree_path_put(trans->paths + path, intent))
1410 return;
1411
1412 __bch2_path_free(trans, path);
1413 }
1414
void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
1416 {
1417 panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
1418 trans->restart_count, restart_count,
1419 (void *) trans->last_begin_ip);
1420 }
1421
void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
1423 {
1424 panic("in transaction restart: %s, last restarted by %pS\n",
1425 bch2_err_str(trans->restarted),
1426 (void *) trans->last_restarted_ip);
1427 }
1428
void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
1430 {
1431 panic("trans should be locked, unlocked by %pS\n",
1432 (void *) trans->last_unlock_ip);
1433 }
1434
1435 noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1437 {
1438 prt_printf(buf, "transaction updates for %s journal seq %llu\n",
1439 trans->fn, trans->journal_res.seq);
1440 printbuf_indent_add(buf, 2);
1441
1442 trans_for_each_update(trans, i) {
1443 struct bkey_s_c old = { &i->old_k, i->old_v };
1444
1445 prt_printf(buf, "update: btree=%s cached=%u %pS\n",
1446 bch2_btree_id_str(i->btree_id),
1447 i->cached,
1448 (void *) i->ip_allocated);
1449
1450 prt_printf(buf, " old ");
1451 bch2_bkey_val_to_text(buf, trans->c, old);
1452 prt_newline(buf);
1453
1454 prt_printf(buf, " new ");
1455 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1456 prt_newline(buf);
1457 }
1458
1459 for (struct jset_entry *e = trans->journal_entries;
1460 e != btree_trans_journal_entries_top(trans);
1461 e = vstruct_next(e))
1462 bch2_journal_entry_to_text(buf, trans->c, e);
1463
1464 printbuf_indent_sub(buf, 2);
1465 }
1466
1467 noinline __cold
void bch2_dump_trans_updates(struct btree_trans *trans)
1469 {
1470 struct printbuf buf = PRINTBUF;
1471
1472 bch2_trans_updates_to_text(&buf, trans);
1473 bch2_print_string_as_lines(KERN_ERR, buf.buf);
1474 printbuf_exit(&buf);
1475 }
1476
static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
1478 {
1479 struct btree_path *path = trans->paths + path_idx;
1480
1481 prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ",
1482 path_idx, path->ref, path->intent_ref,
1483 path->preserve ? 'P' : ' ',
1484 path->should_be_locked ? 'S' : ' ',
1485 path->cached ? 'C' : 'B',
1486 bch2_btree_id_str(path->btree_id),
1487 path->level);
1488 bch2_bpos_to_text(out, path->pos);
1489
1490 #ifdef TRACK_PATH_ALLOCATED
1491 prt_printf(out, " %pS", (void *) path->ip_allocated);
1492 #endif
1493 }
1494
static const char *btree_node_locked_str(enum btree_node_locked_type t)
1496 {
1497 switch (t) {
1498 case BTREE_NODE_UNLOCKED:
1499 return "unlocked";
1500 case BTREE_NODE_READ_LOCKED:
1501 return "read";
1502 case BTREE_NODE_INTENT_LOCKED:
1503 return "intent";
1504 case BTREE_NODE_WRITE_LOCKED:
1505 return "write";
1506 default:
1507 return NULL;
1508 }
1509 }
1510
void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
1512 {
1513 bch2_btree_path_to_text_short(out, trans, path_idx);
1514
1515 struct btree_path *path = trans->paths + path_idx;
1516
1517 prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
1518 prt_newline(out);
1519
1520 printbuf_indent_add(out, 2);
1521 for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
1522 prt_printf(out, "l=%u locks %s seq %u node ", l,
1523 btree_node_locked_str(btree_node_locked_type(path, l)),
1524 path->l[l].lock_seq);
1525
1526 int ret = PTR_ERR_OR_ZERO(path->l[l].b);
1527 if (ret)
1528 prt_str(out, bch2_err_str(ret));
1529 else
1530 prt_printf(out, "%px", path->l[l].b);
1531 prt_newline(out);
1532 }
1533 printbuf_indent_sub(out, 2);
1534 }
1535
1536 static noinline __cold
void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
1538 bool nosort)
1539 {
1540 struct trans_for_each_path_inorder_iter iter;
1541
1542 if (!nosort)
1543 btree_trans_sort_paths(trans);
1544
1545 trans_for_each_path_idx_inorder(trans, iter) {
1546 bch2_btree_path_to_text_short(out, trans, iter.path_idx);
1547 prt_newline(out);
1548 }
1549 }
1550
1551 noinline __cold
void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1553 {
1554 __bch2_trans_paths_to_text(out, trans, false);
1555 }
1556
1557 static noinline __cold
void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
1559 {
1560 struct printbuf buf = PRINTBUF;
1561
1562 __bch2_trans_paths_to_text(&buf, trans, nosort);
1563 bch2_trans_updates_to_text(&buf, trans);
1564
1565 bch2_print_string_as_lines(KERN_ERR, buf.buf);
1566 printbuf_exit(&buf);
1567 }
1568
1569 noinline __cold
void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1571 {
1572 __bch2_dump_trans_paths_updates(trans, false);
1573 }
1574
1575 noinline __cold
static void bch2_trans_update_max_paths(struct btree_trans *trans)
1577 {
1578 struct btree_transaction_stats *s = btree_trans_stats(trans);
1579 struct printbuf buf = PRINTBUF;
1580 size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
1581
1582 bch2_trans_paths_to_text(&buf, trans);
1583
1584 if (!buf.allocation_failure) {
1585 mutex_lock(&s->lock);
1586 if (nr > s->nr_max_paths) {
1587 s->nr_max_paths = nr;
1588 swap(s->max_paths_text, buf.buf);
1589 }
1590 mutex_unlock(&s->lock);
1591 }
1592
1593 printbuf_exit(&buf);
1594
1595 trans->nr_paths_max = nr;
1596 }
1597
1598 noinline __cold
int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
1600 {
1601 if (trace_trans_restart_too_many_iters_enabled()) {
1602 struct printbuf buf = PRINTBUF;
1603
1604 bch2_trans_paths_to_text(&buf, trans);
1605 trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
1606 printbuf_exit(&buf);
1607 }
1608
1609 count_event(trans->c, trans_restart_too_many_iters);
1610
1611 return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
1612 }
1613
static noinline void btree_path_overflow(struct btree_trans *trans)
1615 {
1616 bch2_dump_trans_paths_updates(trans);
1617 bch_err(trans->c, "trans path overflow");
1618 }
1619
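/*
 * Grow the transaction's path tables: the allocated bitmap, btree_path array,
 * sorted index array and updates array live in one allocation, which is
 * doubled in size here and swapped in under RCU.
 */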
static noinline void btree_paths_realloc(struct btree_trans *trans)
1621 {
1622 unsigned nr = trans->nr_paths * 2;
1623
1624 void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
1625 sizeof(struct btree_trans_paths) +
1626 nr * sizeof(struct btree_path) +
1627 nr * sizeof(btree_path_idx_t) + 8 +
1628 nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
1629
1630 unsigned long *paths_allocated = p;
1631 memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
1632 p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
1633
1634 p += sizeof(struct btree_trans_paths);
1635 struct btree_path *paths = p;
1636 *trans_paths_nr(paths) = nr;
1637 memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
1638 p += nr * sizeof(struct btree_path);
1639
1640 btree_path_idx_t *sorted = p;
1641 memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
1642 p += nr * sizeof(btree_path_idx_t) + 8;
1643
1644 struct btree_insert_entry *updates = p;
1645 memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
1646
1647 unsigned long *old = trans->paths_allocated;
1648
1649 rcu_assign_pointer(trans->paths_allocated, paths_allocated);
1650 rcu_assign_pointer(trans->paths, paths);
1651 rcu_assign_pointer(trans->sorted, sorted);
1652 rcu_assign_pointer(trans->updates, updates);
1653
1654 trans->nr_paths = nr;
1655
1656 if (old != trans->_paths_allocated)
1657 kfree_rcu_mightsleep(old);
1658 }
1659
static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
1661 btree_path_idx_t pos)
1662 {
1663 btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
1664
1665 if (unlikely(idx == trans->nr_paths)) {
1666 if (trans->nr_paths == BTREE_ITER_MAX) {
1667 btree_path_overflow(trans);
1668 return 0;
1669 }
1670
1671 btree_paths_realloc(trans);
1672 }
1673
1674 /*
1675 * Do this before marking the new path as allocated, since it won't be
1676 * initialized yet:
1677 */
1678 if (unlikely(idx > trans->nr_paths_max))
1679 bch2_trans_update_max_paths(trans);
1680
1681 __set_bit(idx, trans->paths_allocated);
1682
1683 struct btree_path *path = &trans->paths[idx];
1684 path->ref = 0;
1685 path->intent_ref = 0;
1686 path->nodes_locked = 0;
1687
1688 btree_path_list_add(trans, pos, idx);
1689 trans->paths_sorted = false;
1690 return idx;
1691 }
1692
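/*
 * Return (an index to) a path positioned at @pos in @btree_id, reusing an
 * existing path with matching btree/level/cached flags when possible and
 * allocating a new one otherwise; the returned path still needs to be
 * traversed before its nodes can be used.
 */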
btree_path_idx_t bch2_path_get(struct btree_trans *trans,
1694 enum btree_id btree_id, struct bpos pos,
1695 unsigned locks_want, unsigned level,
1696 unsigned flags, unsigned long ip)
1697 {
1698 struct btree_path *path;
1699 bool cached = flags & BTREE_ITER_cached;
1700 bool intent = flags & BTREE_ITER_intent;
1701 struct trans_for_each_path_inorder_iter iter;
1702 btree_path_idx_t path_pos = 0, path_idx;
1703
1704 bch2_trans_verify_not_unlocked(trans);
1705 bch2_trans_verify_not_in_restart(trans);
1706 bch2_trans_verify_locks(trans);
1707
1708 btree_trans_sort_paths(trans);
1709
1710 trans_for_each_path_inorder(trans, path, iter) {
1711 if (__btree_path_cmp(path,
1712 btree_id,
1713 cached,
1714 pos,
1715 level) > 0)
1716 break;
1717
1718 path_pos = iter.path_idx;
1719 }
1720
1721 if (path_pos &&
1722 trans->paths[path_pos].cached == cached &&
1723 trans->paths[path_pos].btree_id == btree_id &&
1724 trans->paths[path_pos].level == level) {
1725 __btree_path_get(trans->paths + path_pos, intent);
1726 path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1727 path = trans->paths + path_idx;
1728 } else {
1729 path_idx = btree_path_alloc(trans, path_pos);
1730 path = trans->paths + path_idx;
1731
1732 __btree_path_get(path, intent);
1733 path->pos = pos;
1734 path->btree_id = btree_id;
1735 path->cached = cached;
1736 path->uptodate = BTREE_ITER_NEED_TRAVERSE;
1737 path->should_be_locked = false;
1738 path->level = level;
1739 path->locks_want = locks_want;
1740 path->nodes_locked = 0;
1741 for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
1742 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
1743 #ifdef TRACK_PATH_ALLOCATED
1744 path->ip_allocated = ip;
1745 #endif
1746 trans->paths_sorted = false;
1747 }
1748
1749 if (!(flags & BTREE_ITER_nopreserve))
1750 path->preserve = true;
1751
1752 if (path->intent_ref)
1753 locks_want = max(locks_want, level + 1);
1754
1755 /*
1756 * If the path has locks_want greater than requested, we don't downgrade
1757 * it here - on transaction restart because btree node split needs to
1758 * upgrade locks, we might be putting/getting the iterator again.
1759 * Downgrading iterators only happens via bch2_trans_downgrade(), after
1760 * a successful transaction commit.
1761 */
1762
1763 locks_want = min(locks_want, BTREE_MAX_DEPTH);
1764 if (locks_want > path->locks_want)
1765 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
1766
1767 return path_idx;
1768 }
1769
1770 btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
1771 enum btree_id btree_id,
1772 unsigned level,
1773 struct bpos pos)
1774 {
1775 btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
1776 BTREE_ITER_nopreserve|
1777 BTREE_ITER_intent, _RET_IP_);
1778 path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
1779
1780 struct btree_path *path = trans->paths + path_idx;
1781 bch2_btree_path_downgrade(trans, path);
1782 __bch2_btree_path_unlock(trans, path);
1783 return path_idx;
1784 }
1785
1786 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1787 {
1788
1789 struct btree_path_level *l = path_l(path);
1790 struct bkey_packed *_k;
1791 struct bkey_s_c k;
1792
1793 if (unlikely(!l->b))
1794 return bkey_s_c_null;
1795
1796 EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1797 EBUG_ON(!btree_node_locked(path, path->level));
1798
1799 if (!path->cached) {
1800 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1801 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1802
1803 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
1804
1805 if (!k.k || !bpos_eq(path->pos, k.k->p))
1806 goto hole;
1807 } else {
1808 struct bkey_cached *ck = (void *) path->l[0].b;
1809
1810 EBUG_ON(ck &&
1811 (path->btree_id != ck->key.btree_id ||
1812 !bkey_eq(path->pos, ck->key.pos)));
1813 if (!ck || !ck->valid)
1814 return bkey_s_c_null;
1815
1816 *u = ck->k->k;
1817 k = bkey_i_to_s_c(ck->k);
1818 }
1819
1820 return k;
1821 hole:
1822 bkey_init(u);
1823 u->p = path->pos;
1824 return (struct bkey_s_c) { u, NULL };
1825 }
1826
1827
1828 void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
1829 {
1830 struct btree_trans *trans = iter->trans;
1831
1832 if (!iter->path || trans->restarted)
1833 return;
1834
1835 struct btree_path *path = btree_iter_path(trans, iter);
1836 path->preserve = false;
1837 if (path->ref == 1)
1838 path->should_be_locked = false;
1839 }
1840 /* Btree iterators: */
1841
1842 int __must_check
1843 __bch2_btree_iter_traverse(struct btree_iter *iter)
1844 {
1845 return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1846 }
1847
1848 int __must_check
1849 bch2_btree_iter_traverse(struct btree_iter *iter)
1850 {
1851 struct btree_trans *trans = iter->trans;
1852 int ret;
1853
1854 bch2_trans_verify_not_unlocked(trans);
1855
1856 iter->path = bch2_btree_path_set_pos(trans, iter->path,
1857 btree_iter_search_key(iter),
1858 iter->flags & BTREE_ITER_intent,
1859 btree_iter_ip_allocated(iter));
1860
1861 ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1862 if (ret)
1863 return ret;
1864
1865 struct btree_path *path = btree_iter_path(trans, iter);
1866 if (btree_path_node(path, path->level))
1867 btree_path_set_should_be_locked(path);
1868 return 0;
1869 }
1870
1871 /* Iterate across nodes (leaf and interior nodes) */
1872
1873 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1874 {
1875 struct btree_trans *trans = iter->trans;
1876 struct btree *b = NULL;
1877 int ret;
1878
1879 EBUG_ON(trans->paths[iter->path].cached);
1880 bch2_btree_iter_verify(iter);
1881
1882 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1883 if (ret)
1884 goto err;
1885
1886 struct btree_path *path = btree_iter_path(trans, iter);
1887 b = btree_path_node(path, path->level);
1888 if (!b)
1889 goto out;
1890
1891 BUG_ON(bpos_lt(b->key.k.p, iter->pos));
1892
1893 bkey_init(&iter->k);
1894 iter->k.p = iter->pos = b->key.k.p;
1895
1896 iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1897 iter->flags & BTREE_ITER_intent,
1898 btree_iter_ip_allocated(iter));
1899 btree_path_set_should_be_locked(btree_iter_path(trans, iter));
1900 out:
1901 bch2_btree_iter_verify_entry_exit(iter);
1902 bch2_btree_iter_verify(iter);
1903
1904 return b;
1905 err:
1906 b = ERR_PTR(ret);
1907 goto out;
1908 }
1909
1910 struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
1911 {
1912 struct btree *b;
1913
1914 while (b = bch2_btree_iter_peek_node(iter),
1915 bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
1916 bch2_trans_begin(iter->trans);
1917
1918 return b;
1919 }
1920
1921 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1922 {
1923 struct btree_trans *trans = iter->trans;
1924 struct btree *b = NULL;
1925 int ret;
1926
1927 EBUG_ON(trans->paths[iter->path].cached);
1928 bch2_trans_verify_not_in_restart(trans);
1929 bch2_btree_iter_verify(iter);
1930
1931 struct btree_path *path = btree_iter_path(trans, iter);
1932
1933 /* already at end? */
1934 if (!btree_path_node(path, path->level))
1935 return NULL;
1936
1937 /* got to end? */
1938 if (!btree_path_node(path, path->level + 1)) {
1939 btree_path_set_level_up(trans, path);
1940 return NULL;
1941 }
1942
1943 if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1944 __bch2_btree_path_unlock(trans, path);
1945 path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1946 path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1947 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1948 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1949 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1950 goto err;
1951 }
1952
1953 b = btree_path_node(path, path->level + 1);
1954
1955 if (bpos_eq(iter->pos, b->key.k.p)) {
1956 __btree_path_set_level_up(trans, path, path->level++);
1957 } else {
1958 if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
1959 btree_node_unlock(trans, path, path->level + 1);
1960
1961 /*
1962 * Haven't gotten to the end of the parent node: go back down to
1963 * the next child node
1964 */
1965 iter->path = bch2_btree_path_set_pos(trans, iter->path,
1966 bpos_successor(iter->pos),
1967 iter->flags & BTREE_ITER_intent,
1968 btree_iter_ip_allocated(iter));
1969
1970 path = btree_iter_path(trans, iter);
1971 btree_path_set_level_down(trans, path, iter->min_depth);
1972
1973 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1974 if (ret)
1975 goto err;
1976
1977 path = btree_iter_path(trans, iter);
1978 b = path->l[path->level].b;
1979 }
1980
1981 bkey_init(&iter->k);
1982 iter->k.p = iter->pos = b->key.k.p;
1983
1984 iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1985 iter->flags & BTREE_ITER_intent,
1986 btree_iter_ip_allocated(iter));
1987 btree_path_set_should_be_locked(btree_iter_path(trans, iter));
1988 EBUG_ON(btree_iter_path(trans, iter)->uptodate);
1989 out:
1990 bch2_btree_iter_verify_entry_exit(iter);
1991 bch2_btree_iter_verify(iter);
1992
1993 return b;
1994 err:
1995 b = ERR_PTR(ret);
1996 goto out;
1997 }
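
/*
 * Illustrative sketch (not part of the original file): callers that walk the
 * nodes of a btree typically pair peek_node() with next_node(), and restart
 * via bch2_btree_iter_peek_node_and_restart() or bch2_trans_begin() when a
 * lock can't be taken.  The loop shape - the body here is only an example -
 * looks like:
 *
 *	for (b = bch2_btree_iter_peek_node(&iter);
 *	     !IS_ERR_OR_NULL(b);
 *	     b = bch2_btree_iter_next_node(&iter)) {
 *		// ... inspect b ...
 *	}
 */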
1998
1999 /* Iterate across keys (in leaf nodes only) */
2000
2001 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
2002 {
2003 struct bpos pos = iter->k.p;
2004 bool ret = !(iter->flags & BTREE_ITER_all_snapshots
2005 ? bpos_eq(pos, SPOS_MAX)
2006 : bkey_eq(pos, SPOS_MAX));
2007
2008 if (ret && !(iter->flags & BTREE_ITER_is_extents))
2009 pos = bkey_successor(iter, pos);
2010 bch2_btree_iter_set_pos(iter, pos);
2011 return ret;
2012 }
2013
2014 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
2015 {
2016 struct bpos pos = bkey_start_pos(&iter->k);
2017 bool ret = !(iter->flags & BTREE_ITER_all_snapshots
2018 ? bpos_eq(pos, POS_MIN)
2019 : bkey_eq(pos, POS_MIN));
2020
2021 if (ret && !(iter->flags & BTREE_ITER_is_extents))
2022 pos = bkey_predecessor(iter, pos);
2023 bch2_btree_iter_set_pos(iter, pos);
2024 return ret;
2025 }
2026
2027 static noinline
2028 void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
2029 struct bkey_s_c *k)
2030 {
2031 struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
2032
2033 trans_for_each_update(trans, i)
2034 if (!i->key_cache_already_flushed &&
2035 i->btree_id == iter->btree_id &&
2036 bpos_le(i->k->k.p, iter->pos) &&
2037 bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
2038 iter->k = i->k->k;
2039 *k = bkey_i_to_s_c(i->k);
2040 }
2041 }
2042
2043 static noinline
2044 void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
2045 struct bkey_s_c *k)
2046 {
2047 struct btree_path *path = btree_iter_path(trans, iter);
2048 struct bpos end = path_l(path)->b->key.k.p;
2049
2050 trans_for_each_update(trans, i)
2051 if (!i->key_cache_already_flushed &&
2052 i->btree_id == iter->btree_id &&
2053 bpos_ge(i->k->k.p, path->pos) &&
2054 bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
2055 iter->k = i->k->k;
2056 *k = bkey_i_to_s_c(i->k);
2057 }
2058 }
2059
2060 static noinline
2061 void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
2062 struct bkey_s_c *k)
2063 {
2064 trans_for_each_update(trans, i)
2065 if (!i->key_cache_already_flushed &&
2066 i->btree_id == iter->btree_id &&
2067 bpos_eq(i->k->k.p, iter->pos)) {
2068 iter->k = i->k->k;
2069 *k = bkey_i_to_s_c(i->k);
2070 }
2071 }
2072
2073 static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
2074 struct btree_iter *iter,
2075 struct bpos end_pos)
2076 {
2077 struct btree_path *path = btree_iter_path(trans, iter);
2078
2079 return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
2080 path->level,
2081 path->pos,
2082 end_pos,
2083 &iter->journal_idx);
2084 }
2085
2086 static noinline
2087 struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
2088 struct btree_iter *iter)
2089 {
2090 struct btree_path *path = btree_iter_path(trans, iter);
2091 struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
2092
2093 if (k) {
2094 iter->k = k->k;
2095 return bkey_i_to_s_c(k);
2096 } else {
2097 return bkey_s_c_null;
2098 }
2099 }
2100
2101 static noinline
2102 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
2103 struct btree_iter *iter,
2104 struct bkey_s_c k)
2105 {
2106 struct btree_path *path = btree_iter_path(trans, iter);
2107 struct bkey_i *next_journal =
2108 bch2_btree_journal_peek(trans, iter,
2109 k.k ? k.k->p : path_l(path)->b->key.k.p);
2110
2111 if (next_journal) {
2112 iter->k = next_journal->k;
2113 k = bkey_i_to_s_c(next_journal);
2114 }
2115
2116 return k;
2117 }
2118
2119 /*
2120 * Checks btree key cache for key at iter->pos and returns it if present, or
2121 * bkey_s_c_null:
2122 */
2123 static noinline
2124 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
2125 {
2126 struct btree_trans *trans = iter->trans;
2127 struct bch_fs *c = trans->c;
2128 struct bkey u;
2129 struct bkey_s_c k;
2130 int ret;
2131
2132 bch2_trans_verify_not_in_restart(trans);
2133 bch2_trans_verify_not_unlocked(trans);
2134
2135 if ((iter->flags & BTREE_ITER_key_cache_fill) &&
2136 bpos_eq(iter->pos, pos))
2137 return bkey_s_c_null;
2138
2139 if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
2140 return bkey_s_c_null;
2141
2142 if (!iter->key_cache_path)
2143 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
2144 iter->flags & BTREE_ITER_intent, 0,
2145 iter->flags|BTREE_ITER_cached|
2146 BTREE_ITER_cached_nofill,
2147 _THIS_IP_);
2148
2149 iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
2150 iter->flags & BTREE_ITER_intent,
2151 btree_iter_ip_allocated(iter));
2152
2153 ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
2154 iter->flags|BTREE_ITER_cached) ?:
2155 bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
2156 if (unlikely(ret))
2157 return bkey_s_c_err(ret);
2158
2159 btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
2160
2161 k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
2162 if (k.k && !bkey_err(k)) {
2163 iter->k = u;
2164 k.k = &iter->k;
2165 }
2166 return k;
2167 }
2168
2169 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
2170 {
2171 struct btree_trans *trans = iter->trans;
2172 struct bkey_s_c k, k2;
2173 int ret;
2174
2175 EBUG_ON(btree_iter_path(trans, iter)->cached);
2176 bch2_btree_iter_verify(iter);
2177
2178 while (1) {
2179 struct btree_path_level *l;
2180
2181 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2182 iter->flags & BTREE_ITER_intent,
2183 btree_iter_ip_allocated(iter));
2184
2185 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2186 if (unlikely(ret)) {
2187 /* ensure that iter->k is consistent with iter->pos: */
2188 bch2_btree_iter_set_pos(iter, iter->pos);
2189 k = bkey_s_c_err(ret);
2190 goto out;
2191 }
2192
2193 struct btree_path *path = btree_iter_path(trans, iter);
2194 l = path_l(path);
2195
2196 if (unlikely(!l->b)) {
2197 /* No btree nodes at requested level: */
2198 bch2_btree_iter_set_pos(iter, SPOS_MAX);
2199 k = bkey_s_c_null;
2200 goto out;
2201 }
2202
2203 btree_path_set_should_be_locked(path);
2204
2205 k = btree_path_level_peek_all(trans->c, l, &iter->k);
2206
2207 if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
2208 k.k &&
2209 (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
2210 k = k2;
2211 ret = bkey_err(k);
2212 if (ret) {
2213 bch2_btree_iter_set_pos(iter, iter->pos);
2214 goto out;
2215 }
2216 }
2217
2218 if (unlikely(iter->flags & BTREE_ITER_with_journal))
2219 k = btree_trans_peek_journal(trans, iter, k);
2220
2221 if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
2222 trans->nr_updates))
2223 bch2_btree_trans_peek_updates(trans, iter, &k);
2224
2225 if (k.k && bkey_deleted(k.k)) {
2226 /*
2227 * If we've got a whiteout, and it's after the search
2228 * key, advance the search key to the whiteout instead
2229 * of just after the whiteout - it might be a btree
2230 * whiteout, with a real key at the same position, since
2231 * in the btree, deleted keys sort before non-deleted keys.
2232 */
2233 search_key = !bpos_eq(search_key, k.k->p)
2234 ? k.k->p
2235 : bpos_successor(k.k->p);
2236 continue;
2237 }
2238
2239 if (likely(k.k)) {
2240 break;
2241 } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
2242 /* Advance to next leaf node: */
2243 search_key = bpos_successor(l->b->key.k.p);
2244 } else {
2245 /* End of btree: */
2246 bch2_btree_iter_set_pos(iter, SPOS_MAX);
2247 k = bkey_s_c_null;
2248 goto out;
2249 }
2250 }
2251 out:
2252 bch2_btree_iter_verify(iter);
2253
2254 return k;
2255 }
2256
2257 /**
2258 * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
2259 * iterator's current position
2260 * @iter: iterator to peek from
2261 * @end: search limit: returns keys less than or equal to @end
2262 *
2263 * Returns: key if found, or an error extractable with bkey_err().
2264 */
2265 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
2266 {
2267 struct btree_trans *trans = iter->trans;
2268 struct bpos search_key = btree_iter_search_key(iter);
2269 struct bkey_s_c k;
2270 struct bpos iter_pos;
2271 int ret;
2272
2273 bch2_trans_verify_not_unlocked(trans);
2274 EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
2275
2276 if (iter->update_path) {
2277 bch2_path_put_nokeep(trans, iter->update_path,
2278 iter->flags & BTREE_ITER_intent);
2279 iter->update_path = 0;
2280 }
2281
2282 bch2_btree_iter_verify_entry_exit(iter);
2283
2284 while (1) {
2285 k = __bch2_btree_iter_peek(iter, search_key);
2286 if (unlikely(!k.k))
2287 goto end;
2288 if (unlikely(bkey_err(k)))
2289 goto out_no_locked;
2290
2291 /*
2292 * We need to check against @end before FILTER_SNAPSHOTS because
2293 * if we get to a different inode than requested we might be
2294 * seeing keys for a different snapshot tree that will all be
2295 * filtered out.
2296 *
2297 * But we can't do the full check here, because bkey_start_pos()
2298 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
2299 * that's what we check against in extents mode:
2300 */
2301 if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
2302 ? bkey_gt(k.k->p, end)
2303 : k.k->p.inode > end.inode))
2304 goto end;
2305
2306 if (iter->update_path &&
2307 !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
2308 bch2_path_put_nokeep(trans, iter->update_path,
2309 iter->flags & BTREE_ITER_intent);
2310 iter->update_path = 0;
2311 }
2312
2313 if ((iter->flags & BTREE_ITER_filter_snapshots) &&
2314 (iter->flags & BTREE_ITER_intent) &&
2315 !(iter->flags & BTREE_ITER_is_extents) &&
2316 !iter->update_path) {
2317 struct bpos pos = k.k->p;
2318
2319 if (pos.snapshot < iter->snapshot) {
2320 search_key = bpos_successor(k.k->p);
2321 continue;
2322 }
2323
2324 pos.snapshot = iter->snapshot;
2325
2326 /*
2327 * advance, same as on exit for iter->path, but only up
2328 * to snapshot
2329 */
2330 __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
2331 iter->update_path = iter->path;
2332
2333 iter->update_path = bch2_btree_path_set_pos(trans,
2334 iter->update_path, pos,
2335 iter->flags & BTREE_ITER_intent,
2336 _THIS_IP_);
2337 ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
2338 if (unlikely(ret)) {
2339 k = bkey_s_c_err(ret);
2340 goto out_no_locked;
2341 }
2342 }
2343
2344 /*
2345 * We can never have a key in a leaf node at POS_MAX, so
2346 * we don't have to check these successor() calls:
2347 */
2348 if ((iter->flags & BTREE_ITER_filter_snapshots) &&
2349 !bch2_snapshot_is_ancestor(trans->c,
2350 iter->snapshot,
2351 k.k->p.snapshot)) {
2352 search_key = bpos_successor(k.k->p);
2353 continue;
2354 }
2355
2356 if (bkey_whiteout(k.k) &&
2357 !(iter->flags & BTREE_ITER_all_snapshots)) {
2358 search_key = bkey_successor(iter, k.k->p);
2359 continue;
2360 }
2361
2362 /*
2363 * iter->pos should be monotonically increasing, and always be
2364 * equal to the key we just returned - except extents can
2365 * straddle iter->pos:
2366 */
2367 if (!(iter->flags & BTREE_ITER_is_extents))
2368 iter_pos = k.k->p;
2369 else
2370 iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
2371
2372 if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
2373 ? bkey_gt(iter_pos, end)
2374 : bkey_ge(iter_pos, end)))
2375 goto end;
2376
2377 break;
2378 }
2379
2380 iter->pos = iter_pos;
2381
2382 iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2383 iter->flags & BTREE_ITER_intent,
2384 btree_iter_ip_allocated(iter));
2385
2386 btree_path_set_should_be_locked(btree_iter_path(trans, iter));
2387 out_no_locked:
2388 if (iter->update_path) {
2389 ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
2390 if (unlikely(ret))
2391 k = bkey_s_c_err(ret);
2392 else
2393 btree_path_set_should_be_locked(trans->paths + iter->update_path);
2394 }
2395
2396 if (!(iter->flags & BTREE_ITER_all_snapshots))
2397 iter->pos.snapshot = iter->snapshot;
2398
2399 ret = bch2_btree_iter_verify_ret(iter, k);
2400 if (unlikely(ret)) {
2401 bch2_btree_iter_set_pos(iter, iter->pos);
2402 k = bkey_s_c_err(ret);
2403 }
2404
2405 bch2_btree_iter_verify_entry_exit(iter);
2406
2407 return k;
2408 end:
2409 bch2_btree_iter_set_pos(iter, end);
2410 k = bkey_s_c_null;
2411 goto out_no_locked;
2412 }
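
/*
 * Illustrative sketch (not from the original source): a caller scanning a key
 * range typically loops on peek_upto()/advance() between iterator init and
 * exit; the btree ID and positions below are only example values:
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, start, 0);
 *
 *	while (1) {
 *		k = bch2_btree_iter_peek_upto(&iter, end);
 *		ret = bkey_err(k);
 *		if (ret || !k.k)
 *			break;
 *
 *		// ... use k ...
 *
 *		bch2_btree_iter_advance(&iter);
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 *
 * A BCH_ERR_transaction_restart error means the whole transaction must be
 * retried from bch2_trans_begin().
 */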
2413
2414 /**
2415 * bch2_btree_iter_next() - returns first key greater than iterator's current
2416 * position
2417 * @iter: iterator to peek from
2418 *
2419 * Returns: key if found, or an error extractable with bkey_err().
2420 */
2421 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2422 {
2423 if (!bch2_btree_iter_advance(iter))
2424 return bkey_s_c_null;
2425
2426 return bch2_btree_iter_peek(iter);
2427 }
2428
2429 /**
2430 * bch2_btree_iter_peek_prev() - returns first key less than or equal to
2431 * iterator's current position
2432 * @iter: iterator to peek from
2433 *
2434 * Returns: key if found, or an error extractable with bkey_err().
2435 */
2436 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2437 {
2438 struct btree_trans *trans = iter->trans;
2439 struct bpos search_key = iter->pos;
2440 struct bkey_s_c k;
2441 struct bkey saved_k;
2442 const struct bch_val *saved_v;
2443 btree_path_idx_t saved_path = 0;
2444 int ret;
2445
2446 bch2_trans_verify_not_unlocked(trans);
2447 EBUG_ON(btree_iter_path(trans, iter)->cached ||
2448 btree_iter_path(trans, iter)->level);
2449
2450 if (iter->flags & BTREE_ITER_with_journal)
2451 return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
2452
2453 bch2_btree_iter_verify(iter);
2454 bch2_btree_iter_verify_entry_exit(iter);
2455
2456 if (iter->flags & BTREE_ITER_filter_snapshots)
2457 search_key.snapshot = U32_MAX;
2458
2459 while (1) {
2460 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2461 iter->flags & BTREE_ITER_intent,
2462 btree_iter_ip_allocated(iter));
2463
2464 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2465 if (unlikely(ret)) {
2466 /* ensure that iter->k is consistent with iter->pos: */
2467 bch2_btree_iter_set_pos(iter, iter->pos);
2468 k = bkey_s_c_err(ret);
2469 goto out_no_locked;
2470 }
2471
2472 struct btree_path *path = btree_iter_path(trans, iter);
2473
2474 k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
2475 if (!k.k ||
2476 ((iter->flags & BTREE_ITER_is_extents)
2477 ? bpos_ge(bkey_start_pos(k.k), search_key)
2478 : bpos_gt(k.k->p, search_key)))
2479 k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
2480
2481 if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
2482 trans->nr_updates))
2483 bch2_btree_trans_peek_prev_updates(trans, iter, &k);
2484
2485 if (likely(k.k)) {
2486 if (iter->flags & BTREE_ITER_filter_snapshots) {
2487 if (k.k->p.snapshot == iter->snapshot)
2488 goto got_key;
2489
2490 /*
2491 * If we have a saved candidate, and we're no
2492 * longer at the same _key_ (not pos), return
2493 * that candidate
2494 */
2495 if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
2496 bch2_path_put_nokeep(trans, iter->path,
2497 iter->flags & BTREE_ITER_intent);
2498 iter->path = saved_path;
2499 saved_path = 0;
2500 iter->k = saved_k;
2501 k.v = saved_v;
2502 goto got_key;
2503 }
2504
2505 if (bch2_snapshot_is_ancestor(trans->c,
2506 iter->snapshot,
2507 k.k->p.snapshot)) {
2508 if (saved_path)
2509 bch2_path_put_nokeep(trans, saved_path,
2510 iter->flags & BTREE_ITER_intent);
2511 saved_path = btree_path_clone(trans, iter->path,
2512 iter->flags & BTREE_ITER_intent,
2513 _THIS_IP_);
2514 path = btree_iter_path(trans, iter);
2515 saved_k = *k.k;
2516 saved_v = k.v;
2517 }
2518
2519 search_key = bpos_predecessor(k.k->p);
2520 continue;
2521 }
2522 got_key:
2523 if (bkey_whiteout(k.k) &&
2524 !(iter->flags & BTREE_ITER_all_snapshots)) {
2525 search_key = bkey_predecessor(iter, k.k->p);
2526 if (iter->flags & BTREE_ITER_filter_snapshots)
2527 search_key.snapshot = U32_MAX;
2528 continue;
2529 }
2530
2531 btree_path_set_should_be_locked(path);
2532 break;
2533 } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
2534 /* Advance to previous leaf node: */
2535 search_key = bpos_predecessor(path->l[0].b->data->min_key);
2536 } else {
2537 /* Start of btree: */
2538 bch2_btree_iter_set_pos(iter, POS_MIN);
2539 k = bkey_s_c_null;
2540 goto out_no_locked;
2541 }
2542 }
2543
2544 EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
2545
2546 /* Extents can straddle iter->pos: */
2547 if (bkey_lt(k.k->p, iter->pos))
2548 iter->pos = k.k->p;
2549
2550 if (iter->flags & BTREE_ITER_filter_snapshots)
2551 iter->pos.snapshot = iter->snapshot;
2552 out_no_locked:
2553 if (saved_path)
2554 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
2555
2556 bch2_btree_iter_verify_entry_exit(iter);
2557 bch2_btree_iter_verify(iter);
2558
2559 return k;
2560 }
2561
2562 /**
2563 * bch2_btree_iter_prev() - returns first key less than iterator's current
2564 * position
2565 * @iter: iterator to peek from
2566 *
2567 * Returns: key if found, or an error extractable with bkey_err().
2568 */
2569 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2570 {
2571 if (!bch2_btree_iter_rewind(iter))
2572 return bkey_s_c_null;
2573
2574 return bch2_btree_iter_peek_prev(iter);
2575 }
2576
2577 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2578 {
2579 struct btree_trans *trans = iter->trans;
2580 struct bpos search_key;
2581 struct bkey_s_c k;
2582 int ret;
2583
2584 bch2_trans_verify_not_unlocked(trans);
2585 bch2_btree_iter_verify(iter);
2586 bch2_btree_iter_verify_entry_exit(iter);
2587 EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
2588
2589 /* extents can't span inode numbers: */
2590 if ((iter->flags & BTREE_ITER_is_extents) &&
2591 unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2592 if (iter->pos.inode == KEY_INODE_MAX)
2593 return bkey_s_c_null;
2594
2595 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2596 }
2597
2598 search_key = btree_iter_search_key(iter);
2599 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2600 iter->flags & BTREE_ITER_intent,
2601 btree_iter_ip_allocated(iter));
2602
2603 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2604 if (unlikely(ret)) {
2605 k = bkey_s_c_err(ret);
2606 goto out_no_locked;
2607 }
2608
2609 if ((iter->flags & BTREE_ITER_cached) ||
2610 !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
2611 k = bkey_s_c_null;
2612
2613 if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
2614 trans->nr_updates)) {
2615 bch2_btree_trans_peek_slot_updates(trans, iter, &k);
2616 if (k.k)
2617 goto out;
2618 }
2619
2620 if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
2621 (k = btree_trans_peek_slot_journal(trans, iter)).k)
2622 goto out;
2623
2624 if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
2625 (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
2626 if (!bkey_err(k))
2627 iter->k = *k.k;
2628 /* We're not returning a key from iter->path: */
2629 goto out_no_locked;
2630 }
2631
2632 k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
2633 if (unlikely(!k.k))
2634 goto out_no_locked;
2635 } else {
2636 struct bpos next;
2637 struct bpos end = iter->pos;
2638
2639 if (iter->flags & BTREE_ITER_is_extents)
2640 end.offset = U64_MAX;
2641
2642 EBUG_ON(btree_iter_path(trans, iter)->level);
2643
2644 if (iter->flags & BTREE_ITER_intent) {
2645 struct btree_iter iter2;
2646
2647 bch2_trans_copy_iter(&iter2, iter);
2648 k = bch2_btree_iter_peek_upto(&iter2, end);
2649
2650 if (k.k && !bkey_err(k)) {
2651 swap(iter->key_cache_path, iter2.key_cache_path);
2652 iter->k = iter2.k;
2653 k.k = &iter->k;
2654 }
2655 bch2_trans_iter_exit(trans, &iter2);
2656 } else {
2657 struct bpos pos = iter->pos;
2658
2659 k = bch2_btree_iter_peek_upto(iter, end);
2660 if (unlikely(bkey_err(k)))
2661 bch2_btree_iter_set_pos(iter, pos);
2662 else
2663 iter->pos = pos;
2664 }
2665
2666 if (unlikely(bkey_err(k)))
2667 goto out_no_locked;
2668
2669 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2670
2671 if (bkey_lt(iter->pos, next)) {
2672 bkey_init(&iter->k);
2673 iter->k.p = iter->pos;
2674
2675 if (iter->flags & BTREE_ITER_is_extents) {
2676 bch2_key_resize(&iter->k,
2677 min_t(u64, KEY_SIZE_MAX,
2678 (next.inode == iter->pos.inode
2679 ? next.offset
2680 : KEY_OFFSET_MAX) -
2681 iter->pos.offset));
2682 EBUG_ON(!iter->k.size);
2683 }
2684
2685 k = (struct bkey_s_c) { &iter->k, NULL };
2686 }
2687 }
2688 out:
2689 btree_path_set_should_be_locked(btree_iter_path(trans, iter));
2690 out_no_locked:
2691 bch2_btree_iter_verify_entry_exit(iter);
2692 bch2_btree_iter_verify(iter);
2693 ret = bch2_btree_iter_verify_ret(iter, k);
2694 if (unlikely(ret))
2695 return bkey_s_c_err(ret);
2696
2697 return k;
2698 }
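
/*
 * Illustrative sketch (assumption, not in the original file): peek_slot()
 * differs from peek() in that it normally returns a key for the current
 * position - either a real key or, as in bch2_btree_path_peek_slot() above, a
 * synthesized deleted key covering a hole - so a point lookup is just:
 *
 *	k = bch2_btree_iter_peek_slot(&iter);
 *	ret = bkey_err(k);
 *	if (!ret && !bkey_deleted(k.k))
 *		// a real key exists at iter->pos
 */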
2699
2700 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2701 {
2702 if (!bch2_btree_iter_advance(iter))
2703 return bkey_s_c_null;
2704
2705 return bch2_btree_iter_peek_slot(iter);
2706 }
2707
2708 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2709 {
2710 if (!bch2_btree_iter_rewind(iter))
2711 return bkey_s_c_null;
2712
2713 return bch2_btree_iter_peek_slot(iter);
2714 }
2715
2716 struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
2717 {
2718 struct bkey_s_c k;
2719
2720 while (btree_trans_too_many_iters(iter->trans) ||
2721 (k = bch2_btree_iter_peek_type(iter, iter->flags),
2722 bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
2723 bch2_trans_begin(iter->trans);
2724
2725 return k;
2726 }
2727
2728 /* new transactional stuff: */
2729
2730 #ifdef CONFIG_BCACHEFS_DEBUG
2731 static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2732 {
2733 struct btree_path *path;
2734 unsigned i;
2735
2736 BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
2737
2738 trans_for_each_path(trans, path, i) {
2739 BUG_ON(path->sorted_idx >= trans->nr_sorted);
2740 BUG_ON(trans->sorted[path->sorted_idx] != i);
2741 }
2742
2743 for (i = 0; i < trans->nr_sorted; i++) {
2744 unsigned idx = trans->sorted[i];
2745
2746 BUG_ON(!test_bit(idx, trans->paths_allocated));
2747 BUG_ON(trans->paths[idx].sorted_idx != i);
2748 }
2749 }
2750
2751 static void btree_trans_verify_sorted(struct btree_trans *trans)
2752 {
2753 struct btree_path *path, *prev = NULL;
2754 struct trans_for_each_path_inorder_iter iter;
2755
2756 if (!bch2_debug_check_iterators)
2757 return;
2758
2759 trans_for_each_path_inorder(trans, path, iter) {
2760 if (prev && btree_path_cmp(prev, path) > 0) {
2761 __bch2_dump_trans_paths_updates(trans, true);
2762 panic("trans paths out of order!\n");
2763 }
2764 prev = path;
2765 }
2766 }
2767 #else
2768 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
2769 static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
2770 #endif
2771
2772 void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
2773 {
2774 int i, l = 0, r = trans->nr_sorted, inc = 1;
2775 bool swapped;
2776
2777 btree_trans_verify_sorted_refs(trans);
2778
2779 if (trans->paths_sorted)
2780 goto out;
2781
2782 /*
2783 * Cocktail shaker sort: this is efficient because iterators will be
2784 * mostly sorted.
2785 */
2786 do {
2787 swapped = false;
2788
2789 for (i = inc > 0 ? l : r - 2;
2790 i + 1 < r && i >= l;
2791 i += inc) {
2792 if (btree_path_cmp(trans->paths + trans->sorted[i],
2793 trans->paths + trans->sorted[i + 1]) > 0) {
2794 swap(trans->sorted[i], trans->sorted[i + 1]);
2795 trans->paths[trans->sorted[i]].sorted_idx = i;
2796 trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
2797 swapped = true;
2798 }
2799 }
2800
2801 if (inc > 0)
2802 --r;
2803 else
2804 l++;
2805 inc = -inc;
2806 } while (swapped);
2807
2808 trans->paths_sorted = true;
2809 out:
2810 btree_trans_verify_sorted(trans);
2811 }
2812
2813 static inline void btree_path_list_remove(struct btree_trans *trans,
2814 struct btree_path *path)
2815 {
2816 EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2817 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2818 trans->nr_sorted--;
2819 memmove_u64s_down_small(trans->sorted + path->sorted_idx,
2820 trans->sorted + path->sorted_idx + 1,
2821 DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
2822 sizeof(u64) / sizeof(btree_path_idx_t)));
2823 #else
2824 array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2825 #endif
2826 for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
2827 trans->paths[trans->sorted[i]].sorted_idx = i;
2828 }
2829
2830 static inline void btree_path_list_add(struct btree_trans *trans,
2831 btree_path_idx_t pos,
2832 btree_path_idx_t path_idx)
2833 {
2834 struct btree_path *path = trans->paths + path_idx;
2835
2836 path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
2837
2838 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2839 memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
2840 trans->sorted + path->sorted_idx,
2841 DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
2842 sizeof(u64) / sizeof(btree_path_idx_t)));
2843 trans->nr_sorted++;
2844 trans->sorted[path->sorted_idx] = path_idx;
2845 #else
2846 array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
2847 #endif
2848
2849 for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
2850 trans->paths[trans->sorted[i]].sorted_idx = i;
2851
2852 btree_trans_verify_sorted_refs(trans);
2853 }
2854
2855 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2856 {
2857 if (iter->update_path)
2858 bch2_path_put_nokeep(trans, iter->update_path,
2859 iter->flags & BTREE_ITER_intent);
2860 if (iter->path)
2861 bch2_path_put(trans, iter->path,
2862 iter->flags & BTREE_ITER_intent);
2863 if (iter->key_cache_path)
2864 bch2_path_put(trans, iter->key_cache_path,
2865 iter->flags & BTREE_ITER_intent);
2866 iter->path = 0;
2867 iter->update_path = 0;
2868 iter->key_cache_path = 0;
2869 iter->trans = NULL;
2870 }
2871
2872 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
2873 struct btree_iter *iter,
2874 enum btree_id btree_id, struct bpos pos,
2875 unsigned flags)
2876 {
2877 bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2878 bch2_btree_iter_flags(trans, btree_id, flags),
2879 _RET_IP_);
2880 }
2881
2882 void bch2_trans_node_iter_init(struct btree_trans *trans,
2883 struct btree_iter *iter,
2884 enum btree_id btree_id,
2885 struct bpos pos,
2886 unsigned locks_want,
2887 unsigned depth,
2888 unsigned flags)
2889 {
2890 flags |= BTREE_ITER_not_extents;
2891 flags |= BTREE_ITER_snapshot_field;
2892 flags |= BTREE_ITER_all_snapshots;
2893
2894 bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
2895 __bch2_btree_iter_flags(trans, btree_id, flags),
2896 _RET_IP_);
2897
2898 iter->min_depth = depth;
2899
2900 struct btree_path *path = btree_iter_path(trans, iter);
2901 BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
2902 BUG_ON(path->level != depth);
2903 BUG_ON(iter->min_depth != depth);
2904 }
2905
2906 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2907 {
2908 struct btree_trans *trans = src->trans;
2909
2910 *dst = *src;
2911 #ifdef TRACK_PATH_ALLOCATED
2912 dst->ip_allocated = _RET_IP_;
2913 #endif
2914 if (src->path)
2915 __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent);
2916 if (src->update_path)
2917 __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
2918 dst->key_cache_path = 0;
2919 }
2920
2921 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2922 {
2923 struct bch_fs *c = trans->c;
2924 unsigned new_top = trans->mem_top + size;
2925 unsigned old_bytes = trans->mem_bytes;
2926 unsigned new_bytes = roundup_pow_of_two(new_top);
2927 int ret;
2928 void *new_mem;
2929 void *p;
2930
2931 WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2932
2933 struct btree_transaction_stats *s = btree_trans_stats(trans);
2934 s->max_mem = max(s->max_mem, new_bytes);
2935
2936 if (trans->used_mempool) {
2937 if (trans->mem_bytes >= new_bytes)
2938 goto out_change_top;
2939
2940 /* Mempool buffer is too small: allocate a new one with kmalloc */
2941 new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
2942 if (unlikely(!new_mem)) {
2943 bch2_trans_unlock(trans);
2944
2945 new_mem = kmalloc(new_bytes, GFP_KERNEL);
2946 if (!new_mem)
2947 return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
2948
2949 ret = bch2_trans_relock(trans);
2950 if (ret) {
2951 kfree(new_mem);
2952 return ERR_PTR(ret);
2953 }
2954 }
2955 memcpy(new_mem, trans->mem, trans->mem_top);
2956 trans->used_mempool = false;
2957 mempool_free(trans->mem, &c->btree_trans_mem_pool);
2958 goto out_new_mem;
2959 }
2960
2961 new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
2962 if (unlikely(!new_mem)) {
2963 bch2_trans_unlock(trans);
2964
2965 new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
2966 if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2967 new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2968 new_bytes = BTREE_TRANS_MEM_MAX;
2969 memcpy(new_mem, trans->mem, trans->mem_top);
2970 trans->used_mempool = true;
2971 kfree(trans->mem);
2972 }
2973
2974 if (!new_mem)
2975 return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
2976
2977 trans->mem = new_mem;
2978 trans->mem_bytes = new_bytes;
2979
2980 ret = bch2_trans_relock(trans);
2981 if (ret)
2982 return ERR_PTR(ret);
2983 }
2984 out_new_mem:
2985 trans->mem = new_mem;
2986 trans->mem_bytes = new_bytes;
2987
2988 if (old_bytes) {
2989 trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2990 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2991 }
2992 out_change_top:
2993 p = trans->mem + trans->mem_top;
2994 trans->mem_top += size;
2995 memset(p, 0, size);
2996 return p;
2997 }
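
/*
 * Illustrative sketch (assumption): callers normally go through the
 * bch2_trans_kmalloc() wrapper and must check for an error pointer - in
 * particular, growing the buffer can return a transaction_restart error that
 * unwinds the whole transaction:
 *
 *	struct bkey_i *new_key = bch2_trans_kmalloc(trans, sizeof(*new_key));
 *	int ret = PTR_ERR_OR_ZERO(new_key);
 *	if (ret)
 *		return ret;
 */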
2998
2999 static inline void check_srcu_held_too_long(struct btree_trans *trans)
3000 {
3001 WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
3002 "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
3003 (jiffies - trans->srcu_lock_time) / HZ);
3004 }
3005
3006 void bch2_trans_srcu_unlock(struct btree_trans *trans)
3007 {
3008 if (trans->srcu_held) {
3009 struct bch_fs *c = trans->c;
3010 struct btree_path *path;
3011 unsigned i;
3012
3013 trans_for_each_path(trans, path, i)
3014 if (path->cached && !btree_node_locked(path, 0))
3015 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
3016
3017 check_srcu_held_too_long(trans);
3018 srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
3019 trans->srcu_held = false;
3020 }
3021 }
3022
3023 static void bch2_trans_srcu_lock(struct btree_trans *trans)
3024 {
3025 if (!trans->srcu_held) {
3026 trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
3027 trans->srcu_lock_time = jiffies;
3028 trans->srcu_held = true;
3029 }
3030 }
3031
3032 /**
3033 * bch2_trans_begin() - reset a transaction after an interrupted attempt
3034 * @trans: transaction to reset
3035 *
3036 * Returns: current restart counter, to be used with trans_was_restarted()
3037 *
3038 * While iterating over nodes or updating nodes, an attempt to lock a btree
3039 * node may return BCH_ERR_transaction_restart when the trylock fails. When
3040 * this occurs, bch2_trans_begin() should be called and the transaction retried.
3041 */
3042 u32 bch2_trans_begin(struct btree_trans *trans)
3043 {
3044 struct btree_path *path;
3045 unsigned i;
3046 u64 now;
3047
3048 bch2_trans_reset_updates(trans);
3049
3050 trans->restart_count++;
3051 trans->mem_top = 0;
3052 trans->journal_entries = NULL;
3053
3054 trans_for_each_path(trans, path, i) {
3055 path->should_be_locked = false;
3056
3057 /*
3058 * If the transaction wasn't restarted, we're presuming to be
3059 * doing something new: don't keep iterators except the ones that
3060 * are in use - except for the subvolumes btree:
3061 */
3062 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
3063 path->preserve = false;
3064
3065 /*
3066 * XXX: we probably shouldn't be doing this if the transaction
3067 * was restarted, but currently we still overflow transaction
3068 * iterators if we do that
3069 */
3070 if (!path->ref && !path->preserve)
3071 __bch2_path_free(trans, i);
3072 else
3073 path->preserve = false;
3074 }
3075
3076 now = local_clock();
3077
3078 if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
3079 time_after64(now, trans->last_begin_time + 10))
3080 __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
3081 trans->last_begin_time, now);
3082
3083 if (!trans->restarted &&
3084 (need_resched() ||
3085 time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
3086 bch2_trans_unlock(trans);
3087 cond_resched();
3088 now = local_clock();
3089 }
3090 trans->last_begin_time = now;
3091
3092 if (unlikely(trans->srcu_held &&
3093 time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
3094 bch2_trans_srcu_unlock(trans);
3095
3096 trans->last_begin_ip = _RET_IP_;
3097 trans->locked = true;
3098
3099 if (trans->restarted) {
3100 bch2_btree_path_traverse_all(trans);
3101 trans->notrace_relock_fail = false;
3102 }
3103
3104 bch2_trans_verify_not_unlocked(trans);
3105 return trans->restart_count;
3106 }
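
/*
 * Illustrative sketch (not from the original file): the retry loop described
 * in the comment above - also packaged by helpers such as lockrestart_do() -
 * typically looks like this, where do_something() is a placeholder for the
 * caller's transaction body:
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = do_something(trans);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 */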
3107
3108 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
3109
3110 unsigned bch2_trans_get_fn_idx(const char *fn)
3111 {
3112 for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
3113 if (!bch2_btree_transaction_fns[i] ||
3114 bch2_btree_transaction_fns[i] == fn) {
3115 bch2_btree_transaction_fns[i] = fn;
3116 return i;
3117 }
3118
3119 pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
3120 return 0;
3121 }
3122
3123 struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
3124 __acquires(&c->btree_trans_barrier)
3125 {
3126 struct btree_trans *trans;
3127
3128 if (IS_ENABLED(__KERNEL__)) {
3129 trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
3130 if (trans) {
3131 memset(trans, 0, offsetof(struct btree_trans, list));
3132 goto got_trans;
3133 }
3134 }
3135
3136 trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
3137 memset(trans, 0, sizeof(*trans));
3138 closure_init_stack(&trans->ref);
3139
3140 seqmutex_lock(&c->btree_trans_lock);
3141 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
3142 struct btree_trans *pos;
3143 pid_t pid = current->pid;
3144
3145 trans->locking_wait.task = current;
3146
3147 list_for_each_entry(pos, &c->btree_trans_list, list) {
3148 struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
3149 /*
3150 * We'd much prefer to be stricter here and completely
3151 * disallow multiple btree_trans in the same thread -
3152 * but the data move path calls bch2_write when we
3153 * already have a btree_trans initialized.
3154 */
3155 BUG_ON(pos_task &&
3156 pid == pos_task->pid &&
3157 pos->locked);
3158
3159 if (pos_task && pid < pos_task->pid) {
3160 list_add_tail(&trans->list, &pos->list);
3161 goto list_add_done;
3162 }
3163 }
3164 }
3165 list_add_tail(&trans->list, &c->btree_trans_list);
3166 list_add_done:
3167 seqmutex_unlock(&c->btree_trans_lock);
3168 got_trans:
3169 trans->c = c;
3170 trans->last_begin_time = local_clock();
3171 trans->fn_idx = fn_idx;
3172 trans->locking_wait.task = current;
3173 trans->locked = true;
3174 trans->journal_replay_not_finished =
3175 unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
3176 atomic_inc_not_zero(&c->journal_keys.ref);
3177 trans->nr_paths = ARRAY_SIZE(trans->_paths);
3178 trans->paths_allocated = trans->_paths_allocated;
3179 trans->sorted = trans->_sorted;
3180 trans->paths = trans->_paths;
3181 trans->updates = trans->_updates;
3182
3183 *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
3184
3185 trans->paths_allocated[0] = 1;
3186
3187 if (fn_idx < BCH_TRANSACTIONS_NR) {
3188 trans->fn = bch2_btree_transaction_fns[fn_idx];
3189
3190 struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
3191
3192 if (s->max_mem) {
3193 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
3194
3195 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
3196 if (likely(trans->mem))
3197 trans->mem_bytes = expected_mem_bytes;
3198 }
3199
3200 trans->nr_paths_max = s->nr_max_paths;
3201 trans->journal_entries_size = s->journal_entries_size;
3202 }
3203
3204 trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
3205 trans->srcu_lock_time = jiffies;
3206 trans->srcu_held = true;
3207 return trans;
3208 }
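
/*
 * Illustrative sketch (assumption): most callers obtain a transaction through
 * the bch2_trans_get() wrapper, which supplies the per-call-site fn index to
 * __bch2_trans_get(), and must balance it with bch2_trans_put():
 *
 *	struct btree_trans *trans = bch2_trans_get(c);
 *
 *	ret = do_stuff(trans);	// do_stuff() is a placeholder
 *
 *	bch2_trans_put(trans);
 */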
3209
3210 static void check_btree_paths_leaked(struct btree_trans *trans)
3211 {
3212 #ifdef CONFIG_BCACHEFS_DEBUG
3213 struct bch_fs *c = trans->c;
3214 struct btree_path *path;
3215 unsigned i;
3216
3217 trans_for_each_path(trans, path, i)
3218 if (path->ref)
3219 goto leaked;
3220 return;
3221 leaked:
3222 bch_err(c, "btree paths leaked from %s!", trans->fn);
3223 trans_for_each_path(trans, path, i)
3224 if (path->ref)
3225 printk(KERN_ERR " btree %s %pS\n",
3226 bch2_btree_id_str(path->btree_id),
3227 (void *) path->ip_allocated);
3228 /* Be noisy about this: */
3229 bch2_fatal_error(c);
3230 #endif
3231 }
3232
3233 void bch2_trans_put(struct btree_trans *trans)
3234 __releases(&c->btree_trans_barrier)
3235 {
3236 struct bch_fs *c = trans->c;
3237
3238 bch2_trans_unlock(trans);
3239
3240 trans_for_each_update(trans, i)
3241 __btree_path_put(trans->paths + i->path, true);
3242 trans->nr_updates = 0;
3243 trans->locking_wait.task = NULL;
3244
3245 check_btree_paths_leaked(trans);
3246
3247 if (trans->srcu_held) {
3248 check_srcu_held_too_long(trans);
3249 srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
3250 }
3251
3252 if (trans->fs_usage_deltas) {
3253 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
3254 REPLICAS_DELTA_LIST_MAX)
3255 mempool_free(trans->fs_usage_deltas,
3256 &c->replicas_delta_pool);
3257 else
3258 kfree(trans->fs_usage_deltas);
3259 }
3260
3261 if (unlikely(trans->journal_replay_not_finished))
3262 bch2_journal_keys_put(c);
3263
3264 unsigned long *paths_allocated = trans->paths_allocated;
3265 trans->paths_allocated = NULL;
3266 trans->paths = NULL;
3267
3268 if (paths_allocated != trans->_paths_allocated)
3269 kvfree_rcu_mightsleep(paths_allocated);
3270
3271 if (trans->used_mempool)
3272 mempool_free(trans->mem, &c->btree_trans_mem_pool);
3273 else
3274 kfree(trans->mem);
3275
3276 /* Userspace doesn't have a real percpu implementation: */
3277 if (IS_ENABLED(__KERNEL__))
3278 trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
3279
3280 if (trans) {
3281 closure_sync(&trans->ref);
3282
3283 seqmutex_lock(&c->btree_trans_lock);
3284 list_del(&trans->list);
3285 seqmutex_unlock(&c->btree_trans_lock);
3286
3287 mempool_free(trans, &c->btree_trans_pool);
3288 }
3289 }
3290
3291 static void __maybe_unused
3292 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
3293 struct btree_bkey_cached_common *b)
3294 {
3295 struct six_lock_count c = six_lock_counts(&b->lock);
3296 struct task_struct *owner;
3297 pid_t pid;
3298
3299 rcu_read_lock();
3300 owner = READ_ONCE(b->lock.owner);
3301 pid = owner ? owner->pid : 0;
3302 rcu_read_unlock();
3303
3304 prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
3305 b->level, bch2_btree_id_str(b->btree_id));
3306 bch2_bpos_to_text(out, btree_node_pos(b));
3307
3308 prt_printf(out, "\t locks %u:%u:%u held by pid %u",
3309 c.n[0], c.n[1], c.n[2], pid);
3310 }
3311
3312 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
3313 {
3314 struct btree_bkey_cached_common *b;
3315 static char lock_types[] = { 'r', 'i', 'w' };
3316 struct task_struct *task = READ_ONCE(trans->locking_wait.task);
3317 unsigned l, idx;
3318
3319 /* before rcu_read_lock(): */
3320 bch2_printbuf_make_room(out, 4096);
3321
3322 if (!out->nr_tabstops) {
3323 printbuf_tabstop_push(out, 16);
3324 printbuf_tabstop_push(out, 32);
3325 }
3326
3327 prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
3328
3329 /* trans->paths is rcu protected vs. freeing */
3330 rcu_read_lock();
3331 out->atomic++;
3332
3333 struct btree_path *paths = rcu_dereference(trans->paths);
3334 if (!paths)
3335 goto out;
3336
3337 unsigned long *paths_allocated = trans_paths_allocated(paths);
3338
3339 trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
3340 struct btree_path *path = paths + idx;
3341 if (!path->nodes_locked)
3342 continue;
3343
3344 prt_printf(out, " path %u %c l=%u %s:",
3345 idx,
3346 path->cached ? 'c' : 'b',
3347 path->level,
3348 bch2_btree_id_str(path->btree_id));
3349 bch2_bpos_to_text(out, path->pos);
3350 prt_newline(out);
3351
3352 for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3353 if (btree_node_locked(path, l) &&
3354 !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3355 prt_printf(out, " %c l=%u ",
3356 lock_types[btree_node_locked_type(path, l)], l);
3357 bch2_btree_bkey_cached_common_to_text(out, b);
3358 prt_newline(out);
3359 }
3360 }
3361 }
3362
3363 b = READ_ONCE(trans->locking);
3364 if (b) {
3365 prt_printf(out, " blocked for %lluus on\n",
3366 div_u64(local_clock() - trans->locking_wait.start_time, 1000));
3367 prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
3368 bch2_btree_bkey_cached_common_to_text(out, b);
3369 prt_newline(out);
3370 }
3371 out:
3372 --out->atomic;
3373 rcu_read_unlock();
3374 }
3375
3376 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3377 {
3378 struct btree_transaction_stats *s;
3379 struct btree_trans *trans;
3380 int cpu;
3381
3382 if (c->btree_trans_bufs)
3383 for_each_possible_cpu(cpu) {
3384 struct btree_trans *trans =
3385 per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
3386
3387 if (trans) {
3388 closure_sync(&trans->ref);
3389
3390 seqmutex_lock(&c->btree_trans_lock);
3391 list_del(&trans->list);
3392 seqmutex_unlock(&c->btree_trans_lock);
3393 }
3394 kfree(trans);
3395 }
3396 free_percpu(c->btree_trans_bufs);
3397
3398 trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
3399 if (trans)
3400 panic("%s leaked btree_trans\n", trans->fn);
3401
3402 for (s = c->btree_transaction_stats;
3403 s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3404 s++) {
3405 kfree(s->max_paths_text);
3406 bch2_time_stats_exit(&s->lock_hold_times);
3407 }
3408
3409 if (c->btree_trans_barrier_initialized)
3410 cleanup_srcu_struct(&c->btree_trans_barrier);
3411 mempool_exit(&c->btree_trans_mem_pool);
3412 mempool_exit(&c->btree_trans_pool);
3413 }
3414
3415 void bch2_fs_btree_iter_init_early(struct bch_fs *c)
3416 {
3417 struct btree_transaction_stats *s;
3418
3419 for (s = c->btree_transaction_stats;
3420 s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3421 s++) {
3422 bch2_time_stats_init(&s->duration);
3423 bch2_time_stats_init(&s->lock_hold_times);
3424 mutex_init(&s->lock);
3425 }
3426
3427 INIT_LIST_HEAD(&c->btree_trans_list);
3428 seqmutex_init(&c->btree_trans_lock);
3429 }
3430
3431 int bch2_fs_btree_iter_init(struct bch_fs *c)
3432 {
3433 int ret;
3434
3435 c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
3436 if (!c->btree_trans_bufs)
3437 return -ENOMEM;
3438
3439 ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
3440 sizeof(struct btree_trans)) ?:
3441 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3442 BTREE_TRANS_MEM_MAX) ?:
3443 init_srcu_struct(&c->btree_trans_barrier);
3444 if (!ret)
3445 c->btree_trans_barrier_initialized = true;
3446 return ret;
3447 }
3448