// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots[0].b)
		reserve += 8;

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			reserve += min_t(unsigned, 1,
					 c->btree_roots[i].b->c.level) * 8;

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

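/*
 * Illustrative arithmetic (hypothetical numbers, not from this file): with
 * the base reserve of 16 from bch2_recalc_btree_reserve(), a cache with
 * used == 100 nodes lets the shrinker free at most 100 - 16 = 84 of them;
 * once used drops to the reserve, btree_cache_can_free() clamps to 0 and
 * the shrinker backs off entirely.
 */
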
static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	EBUG_ON(btree_node_write_in_flight(b));

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
	vfree(b->aux_data);
	b->aux_data = NULL;
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	__btree_node_data_free(c, b);
	bc->used--;
	list_move(&b->list, &bc->freed);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};

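/*
 * Because key_len is sizeof(u64), lookups are keyed on the bare hash value
 * rather than on a full bkey: see btree_cache_find() below, which passes a
 * u64 computed by btree_ptr_hash_val().
 */
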
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -ENOMEM;

	b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
	if (!b->aux_data) {
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -ENOMEM;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	six_lock_init(&b->c.lock);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(btree_bytes(c));
	return b;
}

static struct btree *btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b = __btree_node_mem_alloc(c);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	six_lock_wakeup_all(&b->c.lock);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	if (!six_trylock_intent(&b->c.lock))
		return -ENOMEM;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	if (btree_node_noevict(b))
		goto out_unlock;

	if (!btree_node_may_write(b))
		goto out_unlock;

	if (btree_node_dirty(b) &&
	    test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
		goto out_unlock;

	if (btree_node_dirty(b) ||
	    btree_node_write_in_flight(b) ||
	    btree_node_read_in_flight(b)) {
		if (!flush)
			goto out_unlock;

		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
			       TASK_UNINTERRUPTIBLE);

		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent);
		else
			__bch2_btree_node_write(c, b, SIX_LOCK_read);

		/* wait for any in flight btree write */
		btree_node_wait_on_io(b);
	}
out:
	if (b->hash_val && !ret)
		trace_btree_node_reap(c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -ENOMEM;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

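/*
 * Reclaim contract, as relied on by the callers below: on success (return of
 * 0) the node's intent and write locks are held and the caller must drop
 * them, e.g.:
 *
 *	if (!btree_node_reclaim(c, b)) {
 *		btree_node_data_free(c, b);
 *		six_unlock_write(&b->c.lock);
 *		six_unlock_intent(&b->c.lock);
 *	}
 */
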
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free;
	unsigned long touched = 0;
	unsigned long freed = 0;
	unsigned i, flags;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_FS)
		mutex_lock(&bc->lock);
	else if (!mutex_trylock(&bc->lock))
		return -1;

	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= btree_pages(c);
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		touched++;

		if (freed >= nr)
			break;

		if (++i > 3 &&
		    !btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (freed >= nr) {
			/* Save position */
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);
			break;
		}

		if (!btree_node_accessed(b) &&
		    !btree_node_reclaim(c, b)) {
			/* can't call bch2_btree_node_hash_remove under lock */
			freed++;
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);

			btree_node_data_free(c, b);
			mutex_unlock(&bc->lock);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed >= nr)
				goto out;

			if (sc->gfp_mask & __GFP_FS)
				mutex_lock(&bc->lock);
			else if (!mutex_trylock(&bc->lock))
				goto out;
			goto restart;
		} else
			clear_btree_node_accessed(b);
	}

	mutex_unlock(&bc->lock);
out:
	memalloc_nofs_restore(flags);
	return (unsigned long) freed * btree_pages(c);
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc) * btree_pages(c);
}

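/*
 * Note that both shrinker hooks report in units of pages, not nodes:
 * bch2_btree_cache_count() scales by btree_pages(c), and
 * bch2_btree_cache_scan() converts nr_to_scan back into nodes before walking
 * the lists - consistent with shrink.batch as set in
 * bch2_fs_btree_cache_init().
 */
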
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	if (bc->shrink.list.next)
		unregister_shrinker(&bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

#ifdef CONFIG_BCACHEFS_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, btree_bytes(c));
#endif

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			list_add(&c->btree_roots[i].b->list, &bc->live);

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		if (btree_node_dirty(b))
			bch2_btree_complete_write(c, b, btree_current_write(b));
		clear_btree_node_dirty(c, b);

		btree_node_data_free(c, b);
	}

	BUG_ON(atomic_read(&c->btree_cache.dirty));

	while (!list_empty(&bc->freed)) {
		b = list_first_entry(&bc->freed, struct btree, list);
		list_del(&b->list);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	unsigned i;
	int ret = 0;

	pr_verbose_init(c->opts, "");

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto out;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!btree_node_mem_alloc(c)) {
			ret = -ENOMEM;
			goto out;
		}

	list_splice_init(&bc->live, &bc->freeable);

#ifdef CONFIG_BCACHEFS_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
	if (!c->verify_ondisk) {
		ret = -ENOMEM;
		goto out;
	}

	c->verify_data = btree_node_mem_alloc(c);
	if (!c->verify_data) {
		ret = -ENOMEM;
		goto out;
	}

	list_del_init(&c->verify_data->list);
#endif

	bc->shrink.count_objects	= bch2_btree_cache_count;
	bc->shrink.scan_objects		= bch2_btree_cache_scan;
	bc->shrink.seeks		= 4;
	bc->shrink.batch		= btree_pages(c) * 2;
	ret = register_shrinker(&bc->shrink);
out:
	pr_verbose_init(c->opts, "ret %i", ret);
	return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
 * the root of the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;

success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}

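/*
 * A sketch of the expected caller pattern (assumed from the closure/-EAGAIN
 * protocol above, not reproduced from this file): callers that must not fail
 * wait on the closure and retry until they hold the cannibalize lock:
 *
 *	struct closure cl;
 *
 *	closure_init_stack(&cl);
 *	while (bch2_btree_cache_cannibalize_lock(c, &cl))
 *		closure_sync(&cl);
 *
 *	b = bch2_btree_node_mem_alloc(c);
 *	bch2_btree_cache_cannibalize_unlock(c);
 */
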
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &bc->freeable, list)
		if (!btree_node_reclaim(c, b))
			goto got_node;

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &bc->freed, list)
		if (!btree_node_reclaim(c, b))
			goto got_node;

	b = NULL;
got_node:
	if (b)
		list_del_init(&b->list);
	mutex_unlock(&bc->lock);

	if (!b) {
		b = __btree_node_mem_alloc(c);
		if (!b)
			goto err;

		BUG_ON(!six_trylock_intent(&b->c.lock));
		BUG_ON(!six_trylock_write(&b->c.lock));
	}

	if (!b->data) {
		if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
			goto err;

		mutex_lock(&bc->lock);
		bc->used++;
		mutex_unlock(&bc->lock);
	}

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	if (b) {
		list_add(&b->list, &bc->freed);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
	}

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b = btree_node_cannibalize(c);
		list_del_init(&b->list);
		mutex_unlock(&bc->lock);

		bch2_btree_node_hash_remove(bc, b);

		trace_btree_node_cannibalize(c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
				struct btree_iter *iter,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (iter && !bch2_btree_node_relock(iter, level + 1))
		return ERR_PTR(-EINTR);

	b = bch2_btree_node_mem_alloc(c);
	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	/*
	 * Unlock before doing IO:
	 *
	 * XXX: ideally should be dropping all btree node locks here
	 */
	if (iter && btree_node_read_locked(iter, level + 1))
		btree_node_unlock(iter, level + 1);

	bch2_btree_node_read(c, b, sync);

	six_unlock_write(&b->c.lock);

	if (!sync) {
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	if (lock_type == SIX_LOCK_read)
		six_lock_downgrade(&b->c.lock);

	return b;
}

static int lock_node_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);
	const struct bkey_i *k = p;

	return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
}

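/*
 * The check fn is passed to the six lock code as its should-sleep callback
 * (an inference from how it is used below): if the node we're waiting on is
 * evicted and reused for a different position while we wait, hash_val stops
 * matching, the lock attempt bails out with a nonzero return instead of
 * blocking, and the callers retry the cache lookup.
 */
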
/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 */
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
					 level, lock_type, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
lock_node:
		/*
		 * There's a potential deadlock with splits and insertions into
		 * interior nodes we have to avoid:
		 *
		 * The other thread might be holding an intent lock on the node
		 * we want, and they want to update its parent node so they're
		 * going to upgrade their intent lock on the parent node to a
		 * write lock.
		 *
		 * But if we're holding a read lock on the parent, and we're
		 * trying to get the intent lock they're holding, we deadlock.
		 *
		 * So to avoid this we drop the read locks on parent nodes when
		 * we're starting to take intent locks - and handle the race.
		 *
		 * The race is that they might be about to free the node we
		 * want, and dropping our read lock on the parent node lets them
		 * update the parent marking the node we want as freed, and then
		 * free it.
		 *
		 * To guard against this, btree nodes are evicted from the cache
		 * when they're freed - and b->hash_val is zeroed out, which we
		 * check for after we lock the node.
		 *
		 * Then, bch2_btree_node_relock() on the parent will fail - because
		 * the parent was modified, when the pointer to the node we want
		 * was removed - and we'll bail out:
		 */
		if (btree_node_read_locked(iter, level + 1))
			btree_node_unlock(iter, level + 1);

		if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
				     lock_node_check_fn, (void *) k, trace_ip)) {
			if (b->hash_val != btree_ptr_hash_val(k))
				goto retry;
			return ERR_PTR(-EINTR);
		}

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(iter, level + 1))
				goto retry;

			trace_trans_restart_btree_node_reused(iter->trans->ip);
			return ERR_PTR(-EINTR);
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != iter->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		bkey_cmp(b->data->min_key,
			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));

	return b;
}

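/*
 * A sketch of the expected caller handling (assumed, not from this file):
 * -EINTR indicates a lock-ordering restart that must be propagated so the
 * transaction can be retried from the top:
 *
 *	b = bch2_btree_node_get(c, iter, k, level, SIX_LOCK_read, _THIS_IP_);
 *	if (IS_ERR(b))
 *		return PTR_ERR(b);	// -EINTR: restart the transaction
 */
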
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		b = bch2_btree_node_fill(c, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(c, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
		if (ret)
			goto retry;

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		bkey_cmp(b->data->min_key,
			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}

struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
					  struct btree_iter *iter,
					  struct btree *b,
					  enum btree_node_sibling sib)
{
	struct btree_trans *trans = iter->trans;
	struct btree *parent;
	struct btree_node_iter node_iter;
	struct bkey_packed *k;
	struct bkey_buf tmp;
	struct btree *ret = NULL;
	unsigned level = b->c.level;

	bch2_bkey_buf_init(&tmp);

	parent = btree_iter_node(iter, level + 1);
	if (!parent)
		return NULL;

	/*
	 * There's a corner case where a btree_iter might have a node locked
	 * that is just outside its current pos - when
	 * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node.
	 *
	 * But the lock ordering checks in __bch2_btree_node_lock() go off of
	 * iter->pos, not the node's key: so if the iterator is marked as
	 * needing to be traversed, we risk deadlock if we don't bail out here:
	 */
	if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
		return ERR_PTR(-EINTR);

	if (!bch2_btree_node_relock(iter, level + 1)) {
		ret = ERR_PTR(-EINTR);
		goto out;
	}

	node_iter = iter->l[parent->c.level].iter;

	k = bch2_btree_node_iter_peek_all(&node_iter, parent);
	BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));

	k = sib == btree_prev_sib
		? bch2_btree_node_iter_prev(&node_iter, parent)
		: (bch2_btree_node_iter_advance(&node_iter, parent),
		   bch2_btree_node_iter_peek(&node_iter, parent));
	if (!k)
		goto out;

	bch2_bkey_buf_unpack(&tmp, c, parent, k);

	ret = bch2_btree_node_get(c, iter, tmp.k, level,
				  SIX_LOCK_intent, _THIS_IP_);

	if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
		struct btree_iter *linked;

		if (!bch2_btree_node_relock(iter, level + 1))
			goto out;

		/*
		 * We might have got -EINTR because trylock failed, and we're
		 * holding other locks that would cause us to deadlock:
		 */
		trans_for_each_iter(trans, linked)
			if (btree_iter_lock_cmp(iter, linked) < 0)
				__bch2_btree_iter_unlock(linked);

		if (sib == btree_prev_sib)
			btree_node_unlock(iter, level);

		ret = bch2_btree_node_get(c, iter, tmp.k, level,
					  SIX_LOCK_intent, _THIS_IP_);

		/*
		 * before btree_iter_relock() calls btree_iter_verify_locks():
		 */
		if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(iter, level + 1);

		if (!bch2_btree_node_relock(iter, level)) {
			btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);

			if (!IS_ERR(ret)) {
				six_unlock_intent(&ret->c.lock);
				ret = ERR_PTR(-EINTR);
			}
		}

		bch2_trans_relock(trans);
	}
out:
	if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
		btree_node_unlock(iter, level + 1);

	if (PTR_ERR_OR_ZERO(ret) == -EINTR)
		bch2_btree_iter_upgrade(iter, level + 2);

	BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));

	if (!IS_ERR_OR_NULL(ret)) {
		struct btree *n1 = ret, *n2 = b;

		if (sib != btree_prev_sib)
			swap(n1, n2);

		if (bkey_cmp(bkey_successor(n1->key.k.p),
			     n2->data->min_key)) {
			char buf1[200], buf2[200];

			bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
			bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));

			bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
					     "prev: %s\n"
					     "next: %s\n",
					     bch2_btree_ids[iter->btree_id], level,
					     buf1, buf2);

			six_unlock_intent(&ret->c.lock);
			ret = ERR_PTR(-EIO);
		}
	}

	bch2_btree_trans_verify_locks(trans);

	bch2_bkey_buf_exit(&tmp, c);

	return ret;
}

void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
			      const struct bkey_i *k,
			      enum btree_id btree_id, unsigned level)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(iter && !btree_node_locked(iter, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return;

	bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
}

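/*
 * A sketch of the intended use (hypothetical caller, not part of this file):
 * while descending, issue asynchronous fills for the child nodes about to be
 * visited; since the fill is called with sync = false it drops its locks
 * immediately and never blocks the traversal:
 *
 *	bch2_bkey_buf_unpack(&tmp, c, parent, k);
 *	bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, level - 1);
 */
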
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
			     struct btree *b)
{
	const struct bkey_format *f = &b->format;
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	pr_buf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	pr_buf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	pr_buf(out, ":\n"
	       "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

	pr_buf(out, "\n"
	       "    format: u64s %u fields %u %u %u %u %u\n"
	       "    unpack fn len: %u\n"
	       "    bytes used %zu/%zu (%zu%% full)\n"
	       "    sib u64s: %u, %u (merge threshold %zu)\n"
	       "    nr packed keys %u\n"
	       "    nr unpacked keys %u\n"
	       "    floats %zu\n"
	       "    failed unpacked %zu\n",
	       f->key_u64s,
	       f->bits_per_field[0],
	       f->bits_per_field[1],
	       f->bits_per_field[2],
	       f->bits_per_field[3],
	       f->bits_per_field[4],
	       b->unpack_fn_len,
	       b->nr.live_u64s * sizeof(u64),
	       btree_bytes(c) - sizeof(struct btree_node),
	       b->nr.live_u64s * 100 / btree_max_u64s(c),
	       b->sib_u64s[0],
	       b->sib_u64s[1],
	       BTREE_FOREGROUND_MERGE_THRESHOLD(c),
	       b->nr.packed_keys,
	       b->nr.unpacked_keys,
	       stats.floats,
	       stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
	pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}