#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
+#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
-#include "recovery.h"
#include "replicas.h"
-#include "subvolume.h"
+#include "snapshot.h"
#include "trace.h"
#include <linux/random.h>
BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
- !btree_type_has_snapshots(iter->btree_id));
+ !btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
bch2_btree_path_verify(trans, iter->update_path);
bch2_bpos_to_text(&buf, pos);
panic("not locked: %s %s%s\n",
- bch2_btree_ids[id], buf.buf,
+ bch2_btree_id_str(id), buf.buf,
key_cache ? " cached" : "");
}
if (!bch2_btree_node_iter_end(node_iter) &&
iter_current_key_modified &&
b->c.level) {
- struct bset_tree *t;
struct bkey_packed *k, *k2, *p;
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (t != BTREE_NODE_UNLOCKED) {
btree_node_unlock(trans, path, b->c.level);
six_lock_increment(&b->c.lock, (enum six_lock_type) t);
- mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t);
+ mark_btree_node_locked(trans, path, b->c.level, t);
}
bch2_btree_path_level_init(trans, path, b);
for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
path->l[i].b = NULL;
- mark_btree_node_locked(trans, path, path->level, lock_type);
+ mark_btree_node_locked(trans, path, path->level,
+ (enum btree_node_locked_type) lock_type);
bch2_btree_path_level_init(trans, path, b);
return 0;
}
if (btree_node_read_locked(path, level + 1))
btree_node_unlock(trans, path, level + 1);
- mark_btree_node_locked(trans, path, level, lock_type);
+ mark_btree_node_locked(trans, path, level,
+ (enum btree_node_locked_type) lock_type);
path->level = level;
bch2_btree_path_level_init(trans, path, b);
/*
* We used to assert that all paths had been traversed here
* (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
- * path->Should_be_locked is not set yet, we we might have unlocked and
+ * path->should_be_locked is not set yet, we might have unlocked and
* then failed to relock a path - that's fine.
*/
err:
if (unlikely(ret))
goto out;
+ if (unlikely(!trans->srcu_held))
+ bch2_trans_srcu_lock(trans);
+
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
struct btree_path *path, struct bpos new_pos,
bool intent, unsigned long ip, int cmp)
{
- unsigned level = path->level;
-
bch2_trans_verify_not_in_restart(trans);
EBUG_ON(!path->ref);
goto out;
}
- level = btree_path_up_until_good_node(trans, path, cmp);
+ unsigned level = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, level)) {
struct btree_path_level *l = &path->l[level];
__bch2_path_free(trans, path);
}
-void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
{
panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
trans->restart_count, restart_count,
(void *) trans->last_begin_ip);
}
-void bch2_trans_in_restart_error(struct btree_trans *trans)
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
panic("in transaction restart: %s, last restarted by %pS\n",
bch2_err_str(trans->restarted),
struct bkey_s_c old = { &i->old_k, i->old_v };
prt_printf(buf, "update: btree=%s cached=%u %pS",
- bch2_btree_ids[i->btree_id],
+ bch2_btree_id_str(i->btree_id),
i->cached,
(void *) i->ip_allocated);
prt_newline(buf);
trans_for_each_wb_update(trans, wb) {
prt_printf(buf, "update: btree=%s wb=1 %pS",
- bch2_btree_ids[wb->btree],
+ bch2_btree_id_str(wb->btree),
(void *) i->ip_allocated);
prt_newline(buf);
path->idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
- bch2_btree_ids[path->btree_id],
+ bch2_btree_id_str(path->btree_id),
path->level);
bch2_bpos_to_text(out, path->pos);
static noinline void btree_path_overflow(struct btree_trans *trans)
{
bch2_dump_trans_paths_updates(trans);
- panic("trans path oveflow\n");
+ panic("trans path overflow\n");
}
static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
+ path->alloc_seq++;
btree_path_list_add(trans, pos, path);
trans->paths_sorted = false;
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > path->locks_want)
- bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
return path;
}
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
- struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
- ? bpos_eq(pos, SPOS_MAX)
- : bkey_eq(pos, SPOS_MAX));
-
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
- } else {
- if (!btree_path_node(iter->path, iter->path->level))
- return true;
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
- iter->advanced = true;
- return false;
- }
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
}
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
}
/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ * @end: search limit: returns keys less than or equal to @end
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
{
struct bpos iter_pos;
int ret;
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
if (iter->update_path) {
}
/**
- * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
- * to iterator's current position, returning keys from every level of the btree.
- * For keys at different levels of the btree that compare equal, the key from
- * the lower level (leaf) is returned first.
- */
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- struct bkey_s_c k;
- int ret;
-
- EBUG_ON(iter->path->cached);
- bch2_btree_iter_verify(iter);
- BUG_ON(iter->path->level < iter->min_depth);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
- EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- /* Already at end? */
- if (!btree_path_node(iter->path, iter->path->level)) {
- k = bkey_s_c_null;
- goto out_no_locked;
- }
-
- k = btree_path_level_peek_all(trans->c,
- &iter->path->l[iter->path->level], &iter->k);
-
- /* Check if we should go up to the parent node: */
- if (!k.k ||
- (iter->advanced &&
- bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
- iter->pos = path_l(iter->path)->b->key.k.p;
- btree_path_set_level_up(trans, iter->path);
- iter->advanced = false;
- continue;
- }
-
- /*
- * Check if we should go back down to a leaf:
- * If we're not in a leaf node, we only return the current key
- * if it exactly matches iter->pos - otherwise we first have to
- * go back to the leaf:
- */
- if (iter->path->level != iter->min_depth &&
- (iter->advanced ||
- !k.k ||
- !bpos_eq(iter->pos, k.k->p))) {
- btree_path_set_level_down(trans, iter->path, iter->min_depth);
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- /* Check if we should go to the next key: */
- if (iter->path->level == iter->min_depth &&
- iter->advanced &&
- k.k &&
- bpos_eq(iter->pos, k.k->p)) {
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- if (iter->advanced &&
- iter->path->level == iter->min_depth &&
- !bpos_eq(k.k->p, iter->pos))
- iter->advanced = false;
-
- BUG_ON(iter->advanced);
- BUG_ON(!k.k);
- break;
- }
-
- iter->pos = k.k->p;
- btree_path_set_should_be_locked(iter->path);
-out_no_locked:
- bch2_btree_iter_verify(iter);
-
- return k;
-}
-
-/**
- * bch2_btree_iter_next: returns first key greater than iterator's current
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
* position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
}
/**
- * bch2_btree_iter_peek_prev: returns first key less than or equal to
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
* iterator's current position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
}
/**
- * bch2_btree_iter_prev: returns first key less than iterator's current
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
* position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
/* extents can't span inode numbers: */
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
+ enum btree_id btree_id, struct bpos pos,
unsigned flags)
{
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
unsigned depth,
unsigned flags)
{
- flags |= BTREE_ITER_NOT_EXTENTS;
- flags |= __BTREE_ITER_ALL_SNAPSHOTS;
- flags |= BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_NOT_EXTENTS;
+ flags |= __BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_ALL_SNAPSHOTS;
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
__bch2_btree_iter_flags(trans, btree_id, flags),
return p;
}
-static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
- struct btree_path *path;
+ WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+ "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+ (jiffies - trans->srcu_lock_time) / HZ);
+}
- trans_for_each_path(trans, path)
- if (path->cached && !btree_node_locked(path, 0))
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+void bch2_trans_srcu_unlock(struct btree_trans *trans)
+{
+ if (trans->srcu_held) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
+ trans_for_each_path(trans, path)
+ if (path->cached && !btree_node_locked(path, 0))
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ trans->srcu_held = false;
+ }
+}
+
+void bch2_trans_srcu_lock(struct btree_trans *trans)
+{
+ if (!trans->srcu_held) {
+ trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+ }
}
/**
* bch2_trans_begin() - reset a transaction after a interrupted attempt
* @trans: transaction to reset
*
+ * Returns: current restart counter, to be used with trans_was_restarted()
+ *
* While iterating over nodes or updating nodes a attempt to lock a btree node
* may return BCH_ERR_transaction_restart when the trylock fails. When this
* occurs bch2_trans_begin() should be called and the transaction retried.
}
trans->last_begin_time = now;
- if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
- bch2_trans_reset_srcu_lock(trans);
+ if (unlikely(trans->srcu_held &&
+ time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+ bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
if (trans->restarted) {
return trans->restart_count;
}
-static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
{
- size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX;
- size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
- void *p = NULL;
+ struct btree_trans *trans;
- BUG_ON(trans->used_mempool);
+ if (IS_ENABLED(__KERNEL__)) {
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+ if (trans)
+ return trans;
+ }
-#ifdef __KERNEL__
- p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
-#endif
- if (!p)
- p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
+ trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
/*
- * paths need to be zeroed, bch2_check_for_deadlock looks at paths in
- * other threads
+ * paths need to be zeroed, bch2_check_for_deadlock looks at
+ * paths in other threads
*/
-
- trans->paths = p; p += paths_bytes;
- trans->updates = p; p += updates_bytes;
+ memset(&trans->paths, 0, sizeof(trans->paths));
+ return trans;
}
const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
return i;
}
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
__acquires(&c->btree_trans_barrier)
{
+ struct btree_trans *trans;
struct btree_transaction_stats *s;
bch2_assert_btree_nodes_not_locked();
+ trans = bch2_trans_alloc(c);
+
memset(trans, 0, sizeof(*trans));
trans->c = c;
trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
closure_init_stack(&trans->ref);
- bch2_trans_alloc_paths(trans, c);
-
s = btree_trans_stats(trans);
if (s && s->max_mem) {
unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
trans->wb_updates_size = s->wb_updates_size;
}
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
struct btree_trans *pos;
list_add_done:
seqmutex_unlock(&c->btree_trans_lock);
}
+
+ return trans;
}
static void check_btree_paths_leaked(struct btree_trans *trans)
trans_for_each_path(trans, path)
if (path->ref)
printk(KERN_ERR " btree %s %pS\n",
- bch2_btree_ids[path->btree_id],
+ bch2_btree_id_str(path->btree_id),
(void *) path->ip_allocated);
/* Be noisy about this: */
bch2_fatal_error(c);
#endif
}
-void bch2_trans_exit(struct btree_trans *trans)
+void bch2_trans_put(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
struct btree_insert_entry *i;
check_btree_paths_leaked(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+ if (trans->srcu_held) {
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ }
kfree(trans->extra_journal_entries.data);
else
kfree(trans->mem);
-#ifdef __KERNEL__
- /*
- * Userspace doesn't have a real percpu implementation:
- */
- trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
-#endif
-
- if (trans->paths)
- mempool_free(trans->paths, &c->btree_paths_pool);
-
- trans->mem = (void *) 0x1;
- trans->paths = (void *) 0x1;
+ /* Userspace doesn't have a real percpu implementation: */
+ if (IS_ENABLED(__KERNEL__))
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+ if (trans)
+ mempool_free(trans, &c->btree_trans_pool);
}
static void __maybe_unused
prt_tab(out);
prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
- b->level, bch2_btree_ids[b->btree_id]);
+ b->level, bch2_btree_id_str(b->btree_id));
bch2_bpos_to_text(out, btree_node_pos(b));
prt_tab(out);
path->idx,
path->cached ? 'c' : 'b',
path->level,
- bch2_btree_ids[path->btree_id]);
+ bch2_btree_id_str(path->btree_id));
bch2_bpos_to_text(out, path->pos);
prt_newline(out);
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
struct btree_transaction_stats *s;
+ struct btree_trans *trans;
+ int cpu;
+
+ trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+ if (trans)
+ panic("%s leaked btree_trans\n", trans->fn);
+
+ if (c->btree_trans_bufs)
+ for_each_possible_cpu(cpu)
+ kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+ free_percpu(c->btree_trans_bufs);
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
if (c->btree_trans_barrier_initialized)
cleanup_srcu_struct(&c->btree_trans_barrier);
mempool_exit(&c->btree_trans_mem_pool);
- mempool_exit(&c->btree_paths_pool);
+ mempool_exit(&c->btree_trans_pool);
}
int bch2_fs_btree_iter_init(struct bch_fs *c)
{
struct btree_transaction_stats *s;
- unsigned nr = BTREE_ITER_MAX;
int ret;
for (s = c->btree_transaction_stats;
INIT_LIST_HEAD(&c->btree_trans_list);
seqmutex_init(&c->btree_trans_lock);
- ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
- sizeof(struct btree_path) * nr +
- sizeof(struct btree_insert_entry) * nr) ?:
+ c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+ if (!c->btree_trans_bufs)
+ return -ENOMEM;
+
+ ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+ sizeof(struct btree_trans)) ?:
mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
BTREE_TRANS_MEM_MAX) ?:
init_srcu_struct(&c->btree_trans_barrier);