3 #include "bkey_methods.h"
4 #include "btree_cache.h"
5 #include "btree_iter.h"
6 #include "btree_locking.h"
10 #include <linux/prefetch.h>
11 #include <trace/events/bcachefs.h>
13 static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
14 struct btree_iter_level *,
17 #define BTREE_ITER_NOT_END ((struct btree *) 1)
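/*
 * Sentinel for iter->l[].b: the level is in use, but we don't currently have
 * a valid node pointer for it (locking failed, or the node was dropped) - as
 * opposed to NULL, which means we've walked off the end of the btree at that
 * depth.
 */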
19 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
21 return iter->l[l].b && iter->l[l].b != BTREE_ITER_NOT_END;
24 /* Btree node locking: */
27 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
30 void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
32 struct btree_iter *linked;
34 EBUG_ON(iter->l[b->level].b != b);
35 EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
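/*
 * Taking the write lock bumped the node's lock sequence number by one (see
 * the assertion above), and six_unlock_write() will bump it again - advance
 * the saved sequence numbers by two so bch2_btree_node_relock() can still
 * succeed afterwards.
 */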
37 for_each_linked_btree_node(iter, b, linked)
38 linked->lock_seq[b->level] += 2;
40 iter->lock_seq[b->level] += 2;
42 six_unlock_write(&b->lock);
45 void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
47 struct btree_iter *linked;
50 EBUG_ON(iter->l[b->level].b != b);
51 EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
53 if (six_trylock_write(&b->lock))
56 for_each_linked_btree_iter(iter, linked)
57 if (linked->l[b->level].b == b &&
58 btree_node_read_locked(linked, b->level))
61 if (likely(!readers)) {
62 six_lock_write(&b->lock);
65 * Must drop our read locks before calling six_lock_write() -
66 * six_unlock() won't do wakeups until the reader count
67 * goes to 0, and it's safe because we have the node intent
70 atomic64_sub(__SIX_VAL(read_lock, readers),
71 &b->lock.state.counter);
72 six_lock_write(&b->lock);
73 atomic64_add(__SIX_VAL(read_lock, readers),
74 &b->lock.state.counter);
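/*
 * The atomic dance above temporarily subtracts the read locks held by linked
 * iterators from the six lock's reader count, so that six_lock_write()
 * doesn't block on readers that are really us, then restores them once the
 * write lock is held.
 */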
78 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
80 struct btree_iter *linked;
81 struct btree *b = iter->l[level].b;
82 int want = btree_lock_want(iter, level);
83 int have = btree_node_locked_type(iter, level);
88 if (!is_btree_node(iter, level))
94 if (have != BTREE_NODE_UNLOCKED
95 ? six_trylock_convert(&b->lock, have, want)
96 : six_relock_type(&b->lock, want, iter->lock_seq[level]))
99 for_each_linked_btree_iter(iter, linked)
100 if (linked->l[level].b == b &&
101 btree_node_locked_type(linked, level) == want &&
102 iter->lock_seq[level] == b->lock.state.seq) {
103 btree_node_unlock(iter, level);
104 six_lock_increment(&b->lock, want);
110 mark_btree_node_unlocked(iter, level);
111 mark_btree_node_locked(iter, level, want);
115 bool bch2_btree_iter_relock(struct btree_iter *iter)
119 for (l = iter->level;
120 l < max_t(unsigned, iter->locks_want, 1) && iter->l[l].b;
122 if (!bch2_btree_node_relock(iter, l)) {
123 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
127 if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
128 iter->uptodate = BTREE_ITER_NEED_PEEK;
133 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
135 struct btree_iter *iter,
136 enum six_lock_type type)
138 struct btree_iter *linked;
140 /* Can't have children locked before ancestors: */
141 EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
144 * Can't hold any read locks while we block taking an intent lock - see
145 * below for reasoning, and we should have already dropped any read
146 * locks in the current iterator
148 EBUG_ON(type == SIX_LOCK_intent &&
149 iter->nodes_locked != iter->nodes_intent_locked);
151 for_each_linked_btree_iter(iter, linked)
152 if (linked->l[level].b == b &&
153 btree_node_locked_type(linked, level) == type) {
154 six_lock_increment(&b->lock, type);
159 * Must lock btree nodes in key order - this case happens when locking
160 * the prev sibling in btree node merging:
162 if (iter->nodes_locked &&
163 __ffs(iter->nodes_locked) == level &&
164 __btree_iter_cmp(iter->btree_id, pos, iter))
167 for_each_linked_btree_iter(iter, linked) {
168 if (!linked->nodes_locked)
172 * Can't block taking an intent lock if we have _any_ nodes read
175 * - Our read lock blocks another thread with an intent lock on
176 * the same node from getting a write lock, and thus from
177 * dropping its intent lock
179 * - And the other thread may have multiple nodes intent locked:
180 * both the node we want to intent lock, and the node we
181 * already have read locked - deadlock:
183 if (type == SIX_LOCK_intent &&
184 linked->nodes_locked != linked->nodes_intent_locked) {
185 linked->locks_want = max_t(unsigned,
191 /* We have to lock btree nodes in key order: */
192 if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
196 * Interior nodes must be locked before their descendants: if
197 * another iterator has possibly locked descendants of the node
198 * we're about to lock, it must have the ancestors locked too:
200 if (linked->btree_id == iter->btree_id &&
201 level > __fls(linked->nodes_locked)) {
202 linked->locks_want = max_t(unsigned,
209 six_lock_type(&b->lock, type);
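/*
 * To summarize the rules enforced above: btree nodes are locked in key order,
 * intent locks on interior nodes are taken before locks on their descendants,
 * and we never block taking an intent lock while a linked iterator holds only
 * read locks. When a rule would be violated we bump the offending iterator's
 * locks_want and fail, so the caller can unwind (typically via -EINTR) and
 * redo the traversals in the correct order.
 */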
213 /* Btree iterator locking: */
215 static void btree_iter_drop_extra_locks(struct btree_iter *iter)
219 while (iter->nodes_locked &&
220 (l = __fls(iter->nodes_locked)) > iter->locks_want) {
221 if (l > iter->level) {
222 btree_node_unlock(iter, l);
224 if (btree_node_intent_locked(iter, l)) {
225 six_lock_downgrade(&iter->l[l].b->lock);
226 iter->nodes_intent_locked ^= 1 << l;
233 bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
234 unsigned new_locks_want)
236 struct btree_iter *linked;
238 /* Drop locks we don't want anymore: */
239 if (new_locks_want < iter->locks_want)
240 for_each_linked_btree_iter(iter, linked)
241 if (linked->locks_want > new_locks_want) {
242 linked->locks_want = max_t(unsigned, 1,
244 btree_iter_drop_extra_locks(linked);
247 iter->locks_want = new_locks_want;
248 btree_iter_drop_extra_locks(iter);
250 if (bch2_btree_iter_relock(iter))
254 * Just an optimization: ancestor nodes must be locked before child
255 * nodes, so set locks_want on iterators that might lock ancestors
256 * before us to avoid getting -EINTR later:
258 for_each_linked_btree_iter(iter, linked)
259 if (linked->btree_id == iter->btree_id &&
260 btree_iter_cmp(linked, iter) <= 0)
261 linked->locks_want = max_t(unsigned, linked->locks_want,
266 static void __bch2_btree_iter_unlock(struct btree_iter *iter)
268 btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
270 while (iter->nodes_locked)
271 btree_node_unlock(iter, __ffs(iter->nodes_locked));
274 int bch2_btree_iter_unlock(struct btree_iter *iter)
276 struct btree_iter *linked;
278 for_each_linked_btree_iter(iter, linked)
279 __bch2_btree_iter_unlock(linked);
280 __bch2_btree_iter_unlock(iter);
282 return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
285 /* Btree iterator: */
287 #ifdef CONFIG_BCACHEFS_DEBUG
289 static void __bch2_btree_iter_verify(struct btree_iter *iter,
292 struct btree_iter_level *l = &iter->l[b->level];
293 struct btree_node_iter tmp = l->iter;
294 struct bkey_packed *k;
296 bch2_btree_node_iter_verify(&l->iter, b);
299 * For interior nodes, the iterator will have skipped past
303 ? bch2_btree_node_iter_prev(&tmp, b)
304 : bch2_btree_node_iter_prev_all(&tmp, b);
305 if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
306 iter->flags & BTREE_ITER_IS_EXTENTS)) {
308 struct bkey uk = bkey_unpack_key(b, k);
310 bch2_bkey_to_text(buf, sizeof(buf), &uk);
311 panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
312 buf, iter->pos.inode, iter->pos.offset);
315 k = bch2_btree_node_iter_peek_all(&l->iter, b);
316 if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k,
317 iter->flags & BTREE_ITER_IS_EXTENTS)) {
319 struct bkey uk = bkey_unpack_key(b, k);
321 bch2_bkey_to_text(buf, sizeof(buf), &uk);
322 panic("iter pos should be before next key:\n%llu:%llu\n%s\n",
323 iter->pos.inode, iter->pos.offset, buf);
327 void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
329 struct btree_iter *linked;
331 if (iter->l[b->level].b == b)
332 __bch2_btree_iter_verify(iter, b);
334 for_each_linked_btree_node(iter, b, linked)
335 __bch2_btree_iter_verify(iter, b);
340 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
342 struct btree_node_iter *node_iter,
344 struct bkey_packed *where,
345 unsigned clobber_u64s,
348 const struct bkey_packed *end = btree_bkey_last(b, t);
349 struct btree_node_iter_set *set;
350 unsigned offset = __btree_node_key_to_offset(b, where);
351 int shift = new_u64s - clobber_u64s;
352 unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift;
354 btree_node_iter_for_each(node_iter, set)
355 if (set->end == old_end)
358 /* didn't find the bset in the iterator - might have to re-add it: */
360 btree_iter_pos_cmp_packed(b, &iter->pos, where,
361 iter->flags & BTREE_ITER_IS_EXTENTS)) {
362 bch2_btree_node_iter_push(node_iter, b, where, end);
365 node_iter == &iter->l[0].iter)
367 bch2_btree_node_iter_peek_all(node_iter, b),
372 set->end = (int) set->end + shift;
374 /* Iterator hasn't gotten to the key that changed yet: */
379 btree_iter_pos_cmp_packed(b, &iter->pos, where,
380 iter->flags & BTREE_ITER_IS_EXTENTS)) {
382 } else if (set->k < offset + clobber_u64s) {
383 set->k = offset + new_u64s;
384 if (set->k == set->end)
385 bch2_btree_node_iter_set_drop(node_iter, set);
387 set->k = (int) set->k + shift;
388 goto iter_current_key_not_modified;
391 bch2_btree_node_iter_sort(node_iter, b);
392 if (!b->level && node_iter == &iter->l[0].iter)
393 __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
394 iter_current_key_not_modified:
397 * Interior nodes are special because iterators for interior nodes don't
398 * obey the usual invariants regarding the iterator position:
400 * We may have whiteouts that compare greater than the iterator
401 * position, and logically should be in the iterator, but that we
402 * skipped past to find the first live key greater than the iterator
403 * position. This becomes an issue when we insert a new key that is
404 * greater than the current iterator position, but smaller than the
405 * whiteouts we've already skipped past - this happens in the course of
408 * We have to rewind the iterator to before those whiteouts here,
409 * else bch2_btree_node_iter_prev() is not going to work and who knows what
410 * else would happen. And we have to do it manually, because here we've
411 * already done the insert and the iterator is currently inconsistent:
413 * We've got multiple competing invariants here - we have to be careful
414 * about rewinding iterators for interior nodes, because they should
415 * always point to the key for the child node the btree iterator points
418 if (b->level && new_u64s && !bkey_deleted(where) &&
419 btree_iter_pos_cmp_packed(b, &iter->pos, where,
420 iter->flags & BTREE_ITER_IS_EXTENTS)) {
422 struct bkey_packed *k;
424 for_each_bset(b, t) {
425 if (bch2_bkey_to_bset(b, where) == t)
428 k = bch2_bkey_prev_all(b, t,
429 bch2_btree_node_iter_bset_pos(node_iter, b, t));
431 __btree_node_iter_cmp(node_iter, b,
433 struct btree_node_iter_set *set;
435 __btree_node_key_to_offset(b, bkey_next(k));
437 btree_node_iter_for_each(node_iter, set)
438 if (set->k == offset) {
439 set->k = __btree_node_key_to_offset(b, k);
440 bch2_btree_node_iter_sort(node_iter, b);
444 bch2_btree_node_iter_push(node_iter, b, k,
445 btree_bkey_last(b, t));
453 void bch2_btree_node_iter_fix(struct btree_iter *iter,
455 struct btree_node_iter *node_iter,
457 struct bkey_packed *where,
458 unsigned clobber_u64s,
461 struct btree_iter *linked;
463 if (node_iter != &iter->l[b->level].iter)
464 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
465 where, clobber_u64s, new_u64s);
467 if (iter->l[b->level].b == b)
468 __bch2_btree_node_iter_fix(iter, b,
469 &iter->l[b->level].iter, t,
470 where, clobber_u64s, new_u64s);
472 for_each_linked_btree_node(iter, b, linked)
473 __bch2_btree_node_iter_fix(linked, b,
474 &linked->l[b->level].iter, t,
475 where, clobber_u64s, new_u64s);
477 /* interior node iterators are... special... */
479 bch2_btree_iter_verify(iter, b);
482 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
483 struct btree_iter_level *l,
485 struct bkey_packed *k)
491 * signal to bch2_btree_iter_peek_slot() that we're currently at
494 u->type = KEY_TYPE_DELETED;
495 return bkey_s_c_null;
498 ret = bkey_disassemble(l->b, k, u);
500 if (debug_check_bkeys(iter->c))
501 bch2_bkey_debugcheck(iter->c, l->b, ret);
506 /* peek_all() doesn't skip deleted keys */
507 static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
508 struct btree_iter_level *l,
511 return __btree_iter_unpack(iter, l, u,
512 bch2_btree_node_iter_peek_all(&l->iter, l->b));
515 static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
516 struct btree_iter_level *l)
518 return __btree_iter_unpack(iter, l, &iter->k,
519 bch2_btree_node_iter_peek(&l->iter, l->b));
522 static inline void __btree_iter_advance(struct btree_iter_level *l)
524 bch2_btree_node_iter_advance(&l->iter, l->b);
528 * Verify that iterator for parent node points to child node:
530 static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
532 struct btree_iter_level *l;
535 struct bkey_packed *k;
537 if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
540 plevel = b->level + 1;
541 if (!btree_iter_node(iter, plevel))
544 parent_locked = btree_node_locked(iter, plevel);
546 if (!bch2_btree_node_relock(iter, plevel))
549 l = &iter->l[plevel];
550 k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
553 bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
555 struct bkey uk = bkey_unpack_key(b, k);
557 bch2_bkey_to_text(buf, sizeof(buf), &uk);
558 panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
559 buf, b->key.k.p.inode, b->key.k.p.offset);
563 btree_node_unlock(iter, b->level + 1);
566 /* Returns true if @k is after the iterator's current position */
567 static inline bool btree_iter_pos_cmp(struct btree_iter *iter,
568 const struct bkey *k)
570 int cmp = bkey_cmp(k->p, iter->pos);
574 !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k));
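/*
 * Note that for extents a key ending exactly at the iterator position does
 * not count as being after it: extent keys are indexed by their end position,
 * so such a key lies entirely before pos. An exact match only counts for
 * non-extent btrees, and only if the key isn't a whiteout.
 */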
577 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
580 return !btree_iter_pos_cmp(iter, &b->key.k);
583 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
586 return iter->btree_id == b->btree_id &&
587 bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
588 !btree_iter_pos_after_node(iter, b);
591 static inline void __btree_iter_init(struct btree_iter *iter,
594 struct btree_iter_level *l = &iter->l[b->level];
596 bch2_btree_node_iter_init(&l->iter, b, iter->pos,
597 iter->flags & BTREE_ITER_IS_EXTENTS,
598 btree_node_is_extents(b));
600 /* Skip to first non-whiteout: */
602 bch2_btree_node_iter_peek(&l->iter, b);
605 static inline void btree_iter_node_set(struct btree_iter *iter,
608 btree_iter_verify_new_node(iter, b);
610 EBUG_ON(!btree_iter_pos_in_node(iter, b));
611 EBUG_ON(b->lock.state.seq & 1);
613 iter->lock_seq[b->level] = b->lock.state.seq;
614 iter->l[b->level].b = b;
615 __btree_iter_init(iter, b);
619 * A btree node is being replaced - update the iterator to point to the new
622 bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
624 struct btree_iter *linked;
626 for_each_linked_btree_iter(iter, linked)
627 if (btree_iter_pos_in_node(linked, b)) {
629 * bch2_btree_iter_node_drop() has already been called -
630 * the old node we're replacing has already been
631 * unlocked and the pointer invalidated
633 BUG_ON(btree_node_locked(linked, b->level));
636 * If @linked wants this node read locked, we don't want
637 * to actually take the read lock now because it's not
638 * legal to hold read locks on other nodes while we take
639 * write locks, so the journal can make forward
642 * Instead, btree_iter_node_set() sets things up so
643 * bch2_btree_node_relock() will succeed:
646 if (btree_want_intent(linked, b->level)) {
647 six_lock_increment(&b->lock, SIX_LOCK_intent);
648 mark_btree_node_intent_locked(linked, b->level);
651 btree_iter_node_set(linked, b);
654 if (!btree_iter_pos_in_node(iter, b)) {
655 six_unlock_intent(&b->lock);
659 mark_btree_node_intent_locked(iter, b->level);
660 btree_iter_node_set(iter, b);
664 void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
666 struct btree_iter *linked;
668 for_each_linked_btree_iter(iter, linked)
669 bch2_btree_iter_node_drop(linked, b);
672 void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
674 unsigned level = b->level;
676 if (iter->l[level].b == b) {
677 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
678 btree_node_unlock(iter, level);
679 iter->l[level].b = BTREE_ITER_NOT_END;
684 * A btree node has been modified in such a way as to invalidate iterators - fix
687 void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
689 struct btree_iter *linked;
691 for_each_linked_btree_node(iter, b, linked)
692 __btree_iter_init(linked, b);
693 __btree_iter_init(iter, b);
696 static inline int btree_iter_lock_root(struct btree_iter *iter,
699 struct bch_fs *c = iter->c;
701 enum six_lock_type lock_type;
704 EBUG_ON(iter->nodes_locked);
707 b = READ_ONCE(c->btree_roots[iter->btree_id].b);
708 iter->level = READ_ONCE(b->level);
710 if (unlikely(iter->level < depth_want)) {
712 * the root is at a lower depth than the depth we want:
713 * got to the end of the btree, or we're walking nodes
714 * greater than some depth and there are no nodes >=
717 iter->level = depth_want;
718 iter->l[iter->level].b = NULL;
722 lock_type = btree_lock_want(iter, iter->level);
723 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
727 if (likely(b == c->btree_roots[iter->btree_id].b &&
728 b->level == iter->level &&
730 for (i = 0; i < iter->level; i++)
731 iter->l[i].b = BTREE_ITER_NOT_END;
732 iter->l[iter->level].b = b;
734 mark_btree_node_locked(iter, iter->level, lock_type);
735 btree_iter_node_set(iter, b);
740 six_unlock_type(&b->lock, lock_type);
745 static void btree_iter_prefetch(struct btree_iter *iter)
747 struct btree_iter_level *l = &iter->l[iter->level];
748 struct btree_node_iter node_iter = l->iter;
749 struct bkey_packed *k;
751 unsigned nr = iter->level > 1 ? 1 : 8;
752 bool was_locked = btree_node_locked(iter, iter->level);
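/*
 * We prefetch up to eight child nodes when the children are leaves, but only
 * a single node further up the tree.
 */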
755 if (!bch2_btree_node_relock(iter, iter->level))
758 bch2_btree_node_iter_advance(&node_iter, l->b);
759 k = bch2_btree_node_iter_peek(&node_iter, l->b);
763 bch2_bkey_unpack(l->b, &tmp.k, k);
764 bch2_btree_node_prefetch(iter->c, &tmp.k,
770 btree_node_unlock(iter, iter->level);
773 static inline int btree_iter_down(struct btree_iter *iter)
775 struct btree_iter_level *l = &iter->l[iter->level];
777 unsigned level = iter->level - 1;
778 enum six_lock_type lock_type = btree_lock_want(iter, level);
781 BUG_ON(!btree_node_locked(iter, iter->level));
783 bch2_bkey_unpack(l->b, &tmp.k,
784 bch2_btree_node_iter_peek(&l->iter, l->b));
786 b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type);
787 if (unlikely(IS_ERR(b)))
790 mark_btree_node_locked(iter, level, lock_type);
791 btree_iter_node_set(iter, b);
793 if (iter->flags & BTREE_ITER_PREFETCH)
794 btree_iter_prefetch(iter);
801 static void btree_iter_up(struct btree_iter *iter)
803 btree_node_unlock(iter, iter->level++);
806 int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
808 static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
810 struct bch_fs *c = iter->c;
811 struct btree_iter *linked, *sorted_iters, **i;
813 bch2_btree_iter_unlock(iter);
815 if (ret != -ENOMEM && ret != -EINTR)
818 if (ret == -ENOMEM) {
821 closure_init_stack(&cl);
824 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
830 * Linked iters are normally a circular singly linked list - break the cycle
831 * while we sort them:
839 linked = linked->next;
842 while (*i && btree_iter_cmp(iter, *i) > 0)
849 /* Make list circular again: */
853 iter->next = sorted_iters;
855 /* Now, redo traversals in correct order: */
860 ret = __bch2_btree_iter_traverse(iter);
868 } while (iter != sorted_iters);
870 ret = btree_iter_linked(iter) ? -EINTR : 0;
872 bch2_btree_cache_cannibalize_unlock(c);
877 iter->flags |= BTREE_ITER_ERROR;
878 iter->l[iter->level].b = NULL;
883 * This is the main state machine for walking down the btree - walks down to a
886 * Returns 0 on success, -EIO on error (error reading in a btree node).
888 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
889 * stashed in the iterator and returned from bch2_btree_iter_unlock().
891 int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
893 unsigned depth_want = iter->level;
895 if (unlikely(!iter->l[iter->level].b))
898 iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
900 /* make sure we have all the intent locks we need - ugh */
901 if (unlikely(iter->l[iter->level].b &&
902 iter->level + 1 < iter->locks_want)) {
905 for (i = iter->level + 1;
906 i < iter->locks_want && iter->l[i].b;
908 if (!bch2_btree_node_relock(iter, i)) {
909 while (iter->level < BTREE_MAX_DEPTH &&
910 iter->l[iter->level].b &&
911 iter->level + 1 < iter->locks_want)
918 * If the current node isn't locked, go up until we have a locked node
919 * or run out of nodes:
921 while (btree_iter_node(iter, iter->level) &&
922 !(is_btree_node(iter, iter->level) &&
923 bch2_btree_node_relock(iter, iter->level) &&
926 * XXX: correctly using BTREE_ITER_UPTODATE should make
927 * comparing iter->pos against node's key unnecessary
929 btree_iter_pos_in_node(iter, iter->l[iter->level].b)))
933 * If we've got a btree node locked (i.e. we aren't about to relock the
934 * root) - advance its node iterator if necessary:
936 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
938 if (btree_iter_node(iter, iter->level)) {
939 struct btree_iter_level *l = &iter->l[iter->level];
943 while ((k = __btree_iter_peek_all(iter, l, &u)).k &&
944 !btree_iter_pos_cmp(iter, k.k))
945 __btree_iter_advance(l);
949 * Note: iter->l[iter->level].b may be temporarily NULL here - that
950 * would indicate to other code that we got to the end of the btree,
951 * here it indicates that relocking the root failed - it's critical that
952 * btree_iter_lock_root() comes next and that it can't fail
954 while (iter->level > depth_want) {
955 int ret = btree_iter_node(iter, iter->level)
956 ? btree_iter_down(iter)
957 : btree_iter_lock_root(iter, depth_want);
959 iter->level = depth_want;
960 iter->l[iter->level].b = BTREE_ITER_NOT_END;
965 iter->uptodate = BTREE_ITER_NEED_PEEK;
969 int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
973 if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
976 ret = __bch2_btree_iter_traverse(iter);
978 ret = btree_iter_traverse_error(iter, ret);
983 /* Iterate across nodes (leaf and interior nodes) */
985 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
990 EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
992 ret = bch2_btree_iter_traverse(iter);
996 b = iter->l[iter->level].b;
999 EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
1000 iter->pos = b->key.k.p;
1006 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
1011 EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
1013 btree_iter_up(iter);
1015 if (!btree_iter_node(iter, iter->level))
1018 /* parent node usually won't be locked: redo traversal if necessary */
1019 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1020 ret = bch2_btree_iter_traverse(iter);
1024 b = iter->l[iter->level].b;
1028 if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
1029 /* Haven't gotten to the end of the parent node: */
1032 iter->pos = iter->btree_id == BTREE_ID_INODES
1033 ? btree_type_successor(iter->btree_id, iter->pos)
1034 : bkey_successor(iter->pos);
1035 iter->level = depth;
1037 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1038 ret = bch2_btree_iter_traverse(iter);
1042 b = iter->l[iter->level].b;
1045 iter->pos = b->key.k.p;
1050 /* Iterate across keys (in leaf nodes only) */
1052 void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
1054 struct btree_iter_level *l = &iter->l[0];
1055 struct bkey_packed *k;
1057 EBUG_ON(iter->level != 0);
1058 EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
1059 EBUG_ON(!btree_node_locked(iter, 0));
1060 EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
1062 iter->pos = new_pos;
1063 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1065 while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
1066 !btree_iter_pos_cmp_packed(l->b, &iter->pos, k,
1067 iter->flags & BTREE_ITER_IS_EXTENTS))
1068 __btree_iter_advance(l);
1070 if (!k && btree_iter_pos_after_node(iter, l->b)) {
1071 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1072 iter->flags |= BTREE_ITER_AT_END_OF_LEAF;
1076 void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
1078 EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */
1079 iter->pos = new_pos;
1081 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1084 struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1086 struct btree_iter_level *l = &iter->l[0];
1090 EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
1091 (iter->btree_id == BTREE_ID_EXTENTS));
1092 EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
1094 if (iter->uptodate == BTREE_ITER_UPTODATE) {
1095 struct bkey_packed *k =
1096 __bch2_btree_node_iter_peek_all(&l->iter, l->b);
1097 struct bkey_s_c ret = {
1099 .v = bkeyp_val(&l->b->format, k)
1102 EBUG_ON(!btree_node_locked(iter, 0));
1104 if (debug_check_bkeys(iter->c))
1105 bch2_bkey_debugcheck(iter->c, l->b, ret);
1109 if (iter->uptodate == BTREE_ITER_END)
1110 return bkey_s_c_null;
1113 ret = bch2_btree_iter_traverse(iter);
1115 return bkey_s_c_err(ret);
1117 k = __btree_iter_peek(iter, l);
1121 /* got to the end of the leaf, iterator needs to be traversed: */
1122 iter->pos = l->b->key.k.p;
1123 if (!bkey_cmp(iter->pos, POS_MAX)) {
1124 iter->uptodate = BTREE_ITER_END;
1125 return bkey_s_c_null;
1128 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1129 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1133 * iter->pos should always be equal to the key we just
1134 * returned - except extents can straddle iter->pos:
1136 if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
1137 bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1138 iter->pos = bkey_start_pos(k.k);
1140 iter->uptodate = BTREE_ITER_UPTODATE;
1145 struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
1147 struct btree_iter_level *l = &iter->l[0];
1149 iter->pos = l->b->key.k.p;
1150 if (!bkey_cmp(iter->pos, POS_MAX)) {
1151 iter->uptodate = BTREE_ITER_END;
1152 return bkey_s_c_null;
1155 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1156 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1158 return bch2_btree_iter_peek(iter);
1161 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1163 struct btree_iter_level *l = &iter->l[0];
1164 struct bkey_packed *p;
1167 EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
1168 (iter->btree_id == BTREE_ID_EXTENTS));
1169 EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
1171 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1172 k = bch2_btree_iter_peek(iter);
1173 if (IS_ERR_OR_NULL(k.k))
1178 __btree_iter_advance(l);
1179 p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1181 return bch2_btree_iter_peek_next_leaf(iter);
1182 } while (bkey_deleted(p));
1184 k = __btree_iter_unpack(iter, l, &iter->k, p);
1186 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
1187 iter->pos = bkey_start_pos(k.k);
1191 static inline struct bkey_s_c
1192 __bch2_btree_iter_peek_slot(struct btree_iter *iter)
1194 struct btree_iter_level *l = &iter->l[0];
1200 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1201 bkey_deleted(k.k) &&
1202 bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
1203 __btree_iter_advance(l);
1205 if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
1206 EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
1207 EBUG_ON(bkey_deleted(k.k));
1208 iter->uptodate = BTREE_ITER_UPTODATE;
1213 * If we got to the end of the node, check if we need to traverse to the
1216 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1217 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1218 ret = bch2_btree_iter_traverse(iter);
1220 return bkey_s_c_err(ret);
1229 if (iter->flags & BTREE_ITER_IS_EXTENTS) {
1230 if (n.p.offset == KEY_OFFSET_MAX) {
1231 if (n.p.inode == KEY_INODE_MAX) {
1232 iter->uptodate = BTREE_ITER_END;
1233 return bkey_s_c_null;
1236 iter->pos = bkey_successor(iter->pos);
1244 min_t(u64, KEY_SIZE_MAX,
1245 (k.k->p.inode == n.p.inode
1246 ? bkey_start_offset(k.k)
1254 iter->uptodate = BTREE_ITER_UPTODATE;
1255 return (struct bkey_s_c) { &iter->k, NULL };
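/*
 * The path above synthesizes a "hole": when no live key covers the iterator
 * position, iter->k is filled in as a deleted key spanning the gap and
 * returned with a NULL value pointer, so that BTREE_ITER_SLOTS users see
 * every slot whether or not it has data in it.
 */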
1258 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1260 struct btree_iter_level *l = &iter->l[0];
1263 EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
1264 (iter->btree_id == BTREE_ID_EXTENTS));
1265 EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
1267 if (iter->uptodate == BTREE_ITER_UPTODATE) {
1268 struct bkey_s_c ret = { .k = &iter->k };
1270 if (!bkey_deleted(&iter->k))
1271 ret.v = bkeyp_val(&l->b->format,
1272 __bch2_btree_node_iter_peek_all(&l->iter, l->b));
1274 EBUG_ON(!btree_node_locked(iter, 0));
1276 if (debug_check_bkeys(iter->c))
1277 bch2_bkey_debugcheck(iter->c, l->b, ret);
1281 if (iter->uptodate == BTREE_ITER_END)
1282 return bkey_s_c_null;
1284 ret = bch2_btree_iter_traverse(iter);
1286 return bkey_s_c_err(ret);
1288 return __bch2_btree_iter_peek_slot(iter);
1291 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1293 iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
1295 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1297 * XXX: when we just need to relock we should be able to avoid
1298 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
1301 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1303 return bch2_btree_iter_peek_slot(iter);
1306 if (!bkey_deleted(&iter->k))
1307 __btree_iter_advance(&iter->l[0]);
1309 return __bch2_btree_iter_peek_slot(iter);
1312 void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
1313 enum btree_id btree_id, struct bpos pos,
1314 unsigned locks_want, unsigned depth,
1319 EBUG_ON(depth >= BTREE_MAX_DEPTH);
1320 EBUG_ON(locks_want > BTREE_MAX_DEPTH);
1324 bkey_init(&iter->k);
1326 iter->flags = flags;
1327 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1328 iter->btree_id = btree_id;
1329 iter->level = depth;
1330 iter->locks_want = locks_want;
1331 iter->nodes_locked = 0;
1332 iter->nodes_intent_locked = 0;
1333 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1334 iter->l[i].b = NULL;
1335 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1338 if (unlikely((flags & BTREE_ITER_IS_EXTENTS) &&
1339 !bkey_cmp(pos, POS_MAX)))
1340 iter->uptodate = BTREE_ITER_END;
1342 prefetch(c->btree_roots[btree_id].b);
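/*
 * A minimal usage sketch (illustrative only - the function and btree choice
 * below are hypothetical, real callers live elsewhere): initialize an
 * iterator, walk keys with peek/next, and pick up any error stashed during
 * traversal from bch2_btree_iter_unlock().
 */
#if 0
static int example_walk_extents(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;

	__bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 1, 0,
			       BTREE_ITER_IS_EXTENTS|BTREE_ITER_PREFETCH);

	for (k = bch2_btree_iter_peek(&iter);
	     !IS_ERR_OR_NULL(k.k);
	     k = bch2_btree_iter_next(&iter)) {
		/* use k.k and k.v here */
	}

	return bch2_btree_iter_unlock(&iter);
}
#endif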
1345 void bch2_btree_iter_unlink(struct btree_iter *iter)
1347 struct btree_iter *linked;
1349 __bch2_btree_iter_unlock(iter);
1351 if (!btree_iter_linked(iter))
1354 for_each_linked_btree_iter(iter, linked) {
1356 if (linked->next == iter) {
1357 linked->next = iter->next;
1365 void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
1367 BUG_ON(btree_iter_linked(new));
1369 new->next = iter->next;
1372 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1373 unsigned nr_iters = 1;
1375 for_each_linked_btree_iter(iter, new)
1378 BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
1382 void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
1384 __bch2_btree_iter_unlock(dst);
1385 memcpy(dst, src, offsetof(struct btree_iter, next));
1386 dst->nodes_locked = dst->nodes_intent_locked = 0;
1387 dst->uptodate = BTREE_ITER_NEED_RELOCK;