#include "debug.h"
#include "errcode.h"
#include "error.h"
+#include "journal.h"
#include "trace.h"
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
-#include <linux/seq_buf.h>
-
-#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
-do { \
- if (shrinker_counter) \
- bc->not_freed_##counter++; \
-} while (0)
const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush) {
- if (btree_node_dirty(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
- else if (btree_node_read_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
- else if (btree_node_write_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
+ if (!flush)
return -BCH_ERR_ENOMEM_btree_node_reclaim;
- }
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
- if (!six_trylock_intent(&b->c.lock)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
+ if (!six_trylock_intent(&b->c.lock))
return -BCH_ERR_ENOMEM_btree_node_reclaim;
- }
- if (!six_trylock_write(&b->c.lock)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
+ if (!six_trylock_write(&b->c.lock))
goto out_unlock_intent;
- }
/* recheck under lock */
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush) {
- if (btree_node_read_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
- else if (btree_node_write_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
+ if (!flush)
goto out_unlock;
- }
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
}
- if (btree_node_noevict(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
- goto out_unlock;
- }
- if (btree_node_write_blocked(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
+ if (btree_node_noevict(b) ||
+ btree_node_write_blocked(b) ||
+ btree_node_will_make_reachable(b))
goto out_unlock;
- }
- if (btree_node_will_make_reachable(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
- goto out_unlock;
- }
if (btree_node_dirty(b)) {
- if (!flush) {
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+ if (!flush)
goto out_unlock;
- }
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
goto out;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, false, shrinker_counter);
+ return __btree_node_reclaim(c, b, false);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true, false);
+ return __btree_node_reclaim(c, b, true);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
if (touched >= nr)
goto out;
- if (!btree_node_reclaim(c, b, true)) {
+ if (!btree_node_reclaim(c, b)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
- bc->freed++;
}
}
restart:
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
- bc->not_freed_access_bit++;
- } else if (!btree_node_reclaim(c, b, true)) {
+ } else if (!btree_node_reclaim(c, b)) {
freed++;
btree_node_data_free(c, b);
- bc->freed++;
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
if (bch2_btree_shrinker_disabled)
return btree_cache_can_free(bc);
}
-static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
-{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
- char *cbuf;
- size_t buflen = seq_buf_get_buf(s, &cbuf);
- struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
-
- bch2_btree_cache_to_text(&out, &c->btree_cache);
- seq_buf_commit(s, out.pos);
-}
-
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
unsigned i, flags;
- unregister_shrinker(&bc->shrink);
+ shrinker_free(bc->shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
- if (btree_node_dirty(b))
- bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty_acct(c, b);
-
btree_node_data_free(c, b);
}
- BUG_ON(atomic_read(&c->btree_cache.dirty));
+ BUG_ON(!bch2_journal_error(&c->journal) &&
+ atomic_read(&c->btree_cache.dirty));
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
+ struct shrinker *shrink;
unsigned i;
int ret = 0;
- pr_verbose_init(c->opts, "");
-
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
if (ret)
- goto out;
+ goto err;
bc->table_init_done = true;
bch2_recalc_btree_reserve(c);
for (i = 0; i < bc->reserve; i++)
- if (!__bch2_btree_node_mem_alloc(c)) {
- ret = -BCH_ERR_ENOMEM_fs_btree_cache_init;
- goto out;
- }
+ if (!__bch2_btree_node_mem_alloc(c))
+ goto err;
list_splice_init(&bc->live, &bc->freeable);
mutex_init(&c->verify_lock);
- bc->shrink.count_objects = bch2_btree_cache_count;
- bc->shrink.scan_objects = bch2_btree_cache_scan;
- bc->shrink.to_text = bch2_btree_cache_shrinker_to_text;
- bc->shrink.seeks = 4;
- ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
-out:
- pr_verbose_init(c->opts, "ret %i", ret);
- return ret;
+ shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
+ if (!shrink)
+ goto err;
+ bc->shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 4;
+ shrink->private_data = c;
+ shrinker_register(shrink);
+
+ return 0;
+err:
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
- trace_and_count(c, btree_cache_cannibalize_unlock, c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, trans);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
}
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
goto success;
if (!cl) {
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
}
goto success;
}
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
success:
- trace_and_count(c, btree_cache_cannibalize_lock, c);
+ trace_and_count(c, btree_cache_cannibalize_lock, trans);
return 0;
}
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b, false))
+ if (!btree_node_reclaim(c, b))
return b;
while (1) {
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b, false)) {
+ if (!btree_node_reclaim(c, b)) {
list_del_init(&b->list);
goto got_node;
}
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2, false)) {
+ if (!btree_node_reclaim(c, b2)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
btree_node_to_freedlist(bc, b2);
mutex_unlock(&bc->lock);
- trace_and_count(c, btree_cache_cannibalize, c);
+ trace_and_count(c, btree_cache_cannibalize, trans);
goto out;
}
if (IS_ERR(b))
return b;
- /*
- * Btree nodes read in from disk should not have the accessed bit set
- * initially, so that linear scans don't thrash the cache:
- */
- clear_btree_node_accessed(b);
-
bkey_copy(&b->key, k);
if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
six_unlock_intent(&b->c.lock);
/* Unlock before doing IO: */
- if (trans && sync)
+ if (path && sync)
bch2_trans_unlock_noassert(trans);
- bch2_btree_node_read(c, b, sync);
+ bch2_btree_node_read(trans, b, sync);
if (!sync)
return NULL;
{
struct printbuf buf = PRINTBUF;
- if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
return;
prt_printf(&buf,
"btree node header doesn't match ptr\n"
"btree %s level %u\n"
"ptr: ",
- bch2_btree_ids[b->c.btree_id], b->c.level);
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_printf(&buf, "\nheader: btree %s level %llu\n"
"min ",
- bch2_btree_ids[BTREE_NODE_ID(b->data)],
+ bch2_btree_id_str(BTREE_NODE_ID(b->data)),
BTREE_NODE_LEVEL(b->data));
bch2_bpos_to_text(&buf, b->data->min_key);
}
if (unlikely(need_relock)) {
- int ret = bch2_trans_relock(trans) ?:
+ ret = bch2_trans_relock(trans) ?:
bch2_btree_path_relock_intent(trans, path);
if (ret) {
six_unlock_type(&b->c.lock, lock_type);
}
/**
- * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
* in from disk if necessary.
*
+ * @trans: btree transaction object
+ * @path: btree_path being traversed
+ * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level: level of btree node being looked up (0 == leaf node)
+ * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
* The btree node will have either a read or a write lock held, depending on
* the @write parameter.
+ *
+ * Returns: btree node or ERR_PTR()
*/
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
const struct bkey_i *k, unsigned level,
}
if (unlikely(btree_node_read_in_flight(b))) {
- u32 seq = six_lock_seq(&b->c.lock);
-
six_unlock_type(&b->c.lock, lock_type);
- bch2_trans_unlock(trans);
-
- bch2_btree_node_wait_on_read(b);
-
- /*
- * should_be_locked is not set on this path yet, so we need to
- * relock it specifically:
- */
- if (trans) {
- int ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(ret);
- }
- }
-
- if (!six_relock_type(&b->c.lock, lock_type, seq))
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
}
prefetch(b->aux_data);
goto retry;
if (IS_ERR(b) &&
- !bch2_btree_cache_cannibalize_lock(c, NULL))
+ !bch2_btree_cache_cannibalize_lock(trans, NULL))
goto retry;
if (IS_ERR(b))
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
btree_check_header(c, b);
out:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return b;
}
six_unlock_intent(&b->c.lock);
}
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
- const struct btree *b)
+const char *bch2_btree_id_str(enum btree_id btree)
+{
+ return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
+}
+
+void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+ prt_printf(out, "%s level %u/%u\n ",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level,
+ bch2_btree_id_root(c, b->c.btree_id)->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
- const struct bkey_format *f = &b->format;
struct bset_stats stats;
memset(&stats, 0, sizeof(stats));
prt_printf(out, ":\n"
" ptrs: ");
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out,
+ " format: ");
+ bch2_bkey_format_to_text(out, &b->format);
- prt_printf(out, "\n"
- " format: u64s %u fields %u %u %u %u %u\n"
+ prt_printf(out,
" unpack fn len: %u\n"
" bytes used %zu/%zu (%zu%% full)\n"
" sib u64s: %u, %u (merge threshold %u)\n"
" nr unpacked keys %u\n"
" floats %zu\n"
" failed unpacked %zu\n",
- f->key_u64s,
- f->bits_per_field[0],
- f->bits_per_field[1],
- f->bits_per_field[2],
- f->bits_per_field[3],
- f->bits_per_field[4],
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
btree_bytes(c) - sizeof(struct btree_node),
stats.failed);
}
-void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
+void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
{
- prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
- prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
- prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
-
- prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
- prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
- prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
- prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
- prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
- prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
- prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
- prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
- prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
- prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
-
+ prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}