// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "error.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

struct lock_class_key bch2_btree_node_lock_key;

const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots[0].b)
		reserve += 8;

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			reserve += min_t(unsigned, 1,
					 c->btree_roots[i].b->c.level) * 8;

	c->btree_cache.reserve = reserve;
}
static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}
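/*
 * Nodes whose six lock has per-cpu reader counts (b->c.lock.readers) are kept
 * on bc->freed_pcpu, the rest on bc->freed_nonpcpu, so that
 * bch2_btree_node_mem_alloc() can reuse a node whose lock type matches what
 * the caller asked for.
 */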
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
#ifdef __KERNEL__
	vfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}
static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};
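/*
 * Nodes are indexed by a 64 bit hash of their pointer (btree_ptr_hash_val());
 * only hash_val is stored as the key, so lookups pass the hash itself and
 * bch2_btree_cache_cmp_fn() above just compares it against b->hash_val.
 */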
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -ENOMEM;
#ifdef __KERNEL__
	b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -ENOMEM;
	}

	return 0;
}
static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	__six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(btree_bytes(c));
	return b;
}
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b = __btree_node_mem_alloc(c);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}
/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	six_lock_wakeup_all(&b->c.lock);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}
/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -ENOMEM;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -ENOMEM;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
		else
			__bch2_btree_node_write(c, b, 0);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_btree_node_reap(c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -ENOMEM;
	goto out;
}
static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}
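/*
 * Memory reclaim: bch2_btree_cache_scan()/bch2_btree_cache_count() below are
 * the shrinker callbacks registered in bch2_fs_btree_cache_init(). The scan
 * walks the freeable list and then the live list, evicting nodes that haven't
 * been marked accessed since the previous scan.
 */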
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long touched = 0;
	unsigned long freed = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_FS)
		mutex_lock(&bc->lock);
	else if (!mutex_trylock(&bc->lock))
		goto out_norestore;

	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			/* can't call bch2_btree_node_hash_remove under lock */
			freed++;
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);

			btree_node_data_free(c, b);
			mutex_unlock(&bc->lock);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out;

			if (sc->gfp_mask & __GFP_FS)
				mutex_lock(&bc->lock);
			else if (!mutex_trylock(&bc->lock))
				goto out;
			goto restart;
		}

		if (touched >= nr) {
			/* Save our place in the lru: */
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);
			break;
		}
	}

	mutex_unlock(&bc->lock);
out:
	ret = freed;
	memalloc_nofs_restore(flags);
out_norestore:
	trace_btree_cache_scan(sc->nr_to_scan, can_free, ret);
	return ret;
}
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
}
static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);

	bch2_btree_cache_to_text(out, c);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	if (bc->shrink.list.next)
		unregister_shrinker(&bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, btree_bytes(c));

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			list_add(&c->btree_roots[i].b->list, &bc->live);

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		if (btree_node_dirty(b))
			bch2_btree_complete_write(c, b, btree_current_write(b));
		clear_btree_node_dirty_acct(c, b);

		btree_node_data_free(c, b);
	}

	BUG_ON(atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_pcpu_free(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	unsigned i;
	int ret = 0;

	pr_verbose_init(c->opts, "");

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto out;

	bc->table_init_done = true;
	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c)) {
			ret = -ENOMEM;
			goto out;
		}

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	bc->shrink.count_objects	= bch2_btree_cache_count;
	bc->shrink.scan_objects		= bch2_btree_cache_scan;
	bc->shrink.to_text		= bch2_btree_cache_shrinker_to_text;
	bc->shrink.seeks		= 4;
	ret = register_shrinker(&bc->shrink);
out:
	pr_verbose_init(c->opts, "ret %i", ret);
	return ret;
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;
success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}
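/*
 * Typical caller pattern (a sketch, not a verbatim copy of any caller): take
 * the cannibalize lock before allocating so that, if the allocation has to
 * steal another cached node, only one thread is doing so at a time, and drop
 * it as soon as the allocation is done:
 *
 *	struct closure cl;
 *
 *	closure_init_stack(&cl);
 *	while (bch2_btree_cache_cannibalize_lock(c, &cl))
 *		closure_sync(&cl);
 *	b = bch2_btree_node_mem_alloc(c, false);
 *	bch2_btree_cache_cannibalize_unlock(c);
 */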
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c);
	if (!b)
		goto err_locked;

	if (pcpu_read_locks)
		six_lock_pcpu_alloc(&b->c.lock);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:
	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
		goto err;

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);
err_locked:
	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_btree_node_cannibalize(c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-ENOMEM);
}
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
				struct btree_trans *trans,
				struct btree_path *path,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_trans_restart_relock_parent_for_fill(trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(c, level != 0);

	if (trans && b == ERR_PTR(-ENOMEM)) {
		trans->memory_allocation_failure = true;
		trace_trans_restart_memory_allocation_failure(trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	six_unlock_write(&b->c.lock);
	seq = b->c.lock.state.seq;
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (trans && sync)
		bch2_trans_unlock(trans);

	bch2_btree_node_read(c, b, sync);

	if (!sync)
		return NULL;

	if (trans) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		if (trans)
			trace_trans_restart_relock_after_fill(trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}
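/*
 * Passed to six_lock_read()/btree_node_lock() as the should-sleep callback:
 * if the node was freed and reused for a different key while we were waiting
 * for the lock, bail out so the caller can retry the lookup instead of
 * blocking on the wrong node.
 */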
static int lock_node_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);
	const struct bkey_i *k = p;

	if (b->hash_val != btree_ptr_hash_val(k))
		return BCH_ERR_lock_fail_node_reused;
	return 0;
}
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
		return;

	prt_printf(&buf,
	       "btree node header doesn't match ptr\n"
	       "btree %s level %u\n"
	       "ptr: ",
	       bch2_btree_ids[b->c.btree_id], b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
	       "min ",
	       bch2_btree_ids[BTREE_NODE_ID(b->data)],
	       BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
}
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    bpos_cmp(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     bpos_cmp(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}
/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * the @lock_type parameter.
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (likely(c->opts.btree_node_mem_ptr_optimization &&
		   b &&
		   b->hash_val == btree_ptr_hash_val(k)))
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
					 level, lock_type, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
lock_node:
		/*
		 * There's a potential deadlock with splits and insertions into
		 * interior nodes we have to avoid:
		 *
		 * The other thread might be holding an intent lock on the node
		 * we want, and they want to update its parent node so they're
		 * going to upgrade their intent lock on the parent node to a
		 * write lock.
		 *
		 * But if we're holding a read lock on the parent, and we're
		 * trying to get the intent lock they're holding, we deadlock.
		 *
		 * So to avoid this we drop the read locks on parent nodes when
		 * we're starting to take intent locks - and handle the race.
		 *
		 * The race is that they might be about to free the node we
		 * want, and dropping our read lock on the parent node lets them
		 * update the parent marking the node we want as freed, and then
		 * free it:
		 *
		 * To guard against this, btree nodes are evicted from the cache
		 * when they're freed - and b->hash_val is zeroed out, which we
		 * check for after we lock the node.
		 *
		 * Then, bch2_btree_node_relock() on the parent will fail - because
		 * the parent was modified, when the pointer to the node we want
		 * was removed - and we'll bail out:
		 */
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type,
				      lock_node_check_fn, (void *) k, trace_ip);
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused))
				goto retry;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				return ERR_PTR(ret);
			BUG();
		}

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_trans_restart_btree_node_reused(trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = b->c.lock.state.seq;

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(c, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
		if (ret)
			goto retry;

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}
int bch2_btree_node_prefetch(struct bch_fs *c,
			     struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(trans && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(c, trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	return PTR_ERR_OR_ZERO(b);
}
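/*
 * Drop a node from the cache given only its key: waits for in-flight reads
 * and writes, writes out any dirty data, then frees the node's memory and
 * removes it from the hash table.
 */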
void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	six_lock_intent(&b->c.lock, NULL, NULL);
	six_lock_write(&b->c.lock, NULL, NULL);

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, 0);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
			     const struct btree *b)
{
	const struct bkey_format *f = &b->format;
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
	       "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

	prt_printf(out, "\n"
	       "    format: u64s %u fields %u %u %u %u %u\n"
	       "    unpack fn len: %u\n"
	       "    bytes used %zu/%zu (%zu%% full)\n"
	       "    sib u64s: %u, %u (merge threshold %u)\n"
	       "    nr packed keys %u\n"
	       "    nr unpacked keys %u\n"
	       "    floats %zu\n"
	       "    failed unpacked %zu\n",
	       f->key_u64s,
	       f->bits_per_field[0],
	       f->bits_per_field[1],
	       f->bits_per_field[2],
	       f->bits_per_field[3],
	       f->bits_per_field[4],
	       b->unpack_fn_len,
	       b->nr.live_u64s * sizeof(u64),
	       btree_bytes(c) - sizeof(struct btree_node),
	       b->nr.live_u64s * 100 / btree_max_u64s(c),
	       b->sib_u64s[0],
	       b->sib_u64s[1],
	       c->btree_foreground_merge_threshold,
	       b->nr.packed_keys,
	       b->nr.unpacked_keys,
	       stats.floats,
	       stats.failed);
}
void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
	prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}