#include "bkey_methods.h"
#include "buckets_types.h"
+#include "darray.h"
#include "journal_types.h"
struct open_bucket;
u16 data_offset;
u16 aux_data_offset;
u16 end_offset;
-
- struct bpos max_key;
};
struct btree_write {
struct btree_alloc {
struct open_buckets ob;
- BKEY_PADDED(k);
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
struct btree_bkey_cached_common {
struct six_lock lock;
u8 level;
u8 btree_id;
+ bool cached;
};
struct btree {
u16 written;
u8 nsets;
u8 nr_key_bits;
+ u16 version_ondisk;
struct bkey_format format;
u8 byte_order;
u8 unpack_fn_len;
+ struct btree_write writes[2];
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
/*
* XXX: add a delete sequence number, so when bch2_btree_node_relock()
* fails because the lock sequence number has changed - i.e. the
/* lru list */
struct list_head list;
-
- struct btree_write writes[2];
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- bool *expensive_debug_checks;
-#endif
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
struct btree_cache {
struct mutex lock;
struct list_head live;
struct list_head freeable;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ atomic_t dirty;
struct shrinker shrink;
/*
} data[MAX_BSETS];
};
-enum btree_iter_type {
- BTREE_ITER_KEYS,
- BTREE_ITER_NODES,
- BTREE_ITER_CACHED,
-};
-
-#define BTREE_ITER_TYPE ((1 << 2) - 1)
-
/*
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
-#define BTREE_ITER_SLOTS (1 << 2)
+#define BTREE_ITER_SLOTS (1 << 0)
+#define BTREE_ITER_ALL_LEVELS (1 << 1)
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-#define BTREE_ITER_INTENT (1 << 3)
+#define BTREE_ITER_INTENT (1 << 2)
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-#define BTREE_ITER_PREFETCH (1 << 4)
-/*
- * Indicates that this iterator should not be reused until transaction commit,
- * either because a pending update references it or because the update depends
- * on that particular key being locked (e.g. by the str_hash code, for hash
- * table consistency)
- */
-#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
+#define BTREE_ITER_PREFETCH (1 << 3)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-#define BTREE_ITER_IS_EXTENTS (1 << 6)
-#define BTREE_ITER_ERROR (1 << 7)
-#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8)
-#define BTREE_ITER_CACHED_NOFILL (1 << 9)
-#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
-
-#define BTREE_ITER_USER_FLAGS \
- (BTREE_ITER_SLOTS \
- |BTREE_ITER_INTENT \
- |BTREE_ITER_PREFETCH \
- |BTREE_ITER_CACHED_NOFILL \
- |BTREE_ITER_CACHED_NOCREATE)
-
-enum btree_iter_uptodate {
+#define BTREE_ITER_IS_EXTENTS (1 << 4)
+#define BTREE_ITER_NOT_EXTENTS (1 << 5)
+#define BTREE_ITER_CACHED (1 << 6)
+#define BTREE_ITER_WITH_KEY_CACHE (1 << 7)
+#define BTREE_ITER_WITH_UPDATES (1 << 8)
+#define BTREE_ITER_WITH_JOURNAL (1 << 9)
+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 10)
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 11)
+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12)
+#define BTREE_ITER_NOPRESERVE (1 << 13)
+
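+/* How much work is needed before a path can be used again: */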
+enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
- BTREE_ITER_NEED_PEEK = 1,
- BTREE_ITER_NEED_RELOCK = 2,
- BTREE_ITER_NEED_TRAVERSE = 3,
+ BTREE_ITER_NEED_RELOCK = 1,
+ BTREE_ITER_NEED_TRAVERSE = 2,
};
-#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
-#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
-#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
-#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
-#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
-#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
-#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
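+/*
+ * A path to a btree node: the btree, position and level, plus the node and
+ * node iterator held at each level on the way down. Paths are refcounted so
+ * that iterators and pending updates within a transaction can share them.
+ */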
+struct btree_path {
+ u8 idx;
+ u8 sorted_idx;
+ u8 ref;
+ u8 intent_ref;
+
+	/* btree_path_copy starts here: */
+ struct bpos pos;
+
+ enum btree_id btree_id:4;
+ bool cached:1;
+ bool preserve:1;
+ enum btree_path_uptodate uptodate:2;
+ /*
+ * When true, failing to relock this path will cause the transaction to
+ * restart:
+ */
+ bool should_be_locked:1;
+ unsigned level:3,
+ locks_want:4;
+ u8 nodes_locked;
+
+ struct btree_path_level {
+ struct btree *b;
+ struct btree_node_iter iter;
+ u32 lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ u64 lock_taken_time;
+#endif
+ } l[BTREE_MAX_DEPTH];
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned long ip_allocated;
+#endif
+};
+
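+/* The btree_path_level for the level @path is currently at: */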
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+ return path->l + path->level;
+}
/*
* @pos - iterator's current position
*/
struct btree_iter {
struct btree_trans *trans;
- struct bpos pos;
- struct bpos pos_after_commit;
+ struct btree_path *path;
+ struct btree_path *update_path;
+ struct btree_path *key_cache_path;
+
+ enum btree_id btree_id:4;
+ unsigned min_depth:3;
+ unsigned advanced:1;
+ /* btree_iter_copy starts here: */
u16 flags;
- u8 idx;
- enum btree_id btree_id:4;
- enum btree_iter_uptodate uptodate:4;
- unsigned level:4,
- min_depth:4,
- locks_want:4,
- nodes_locked:4,
- nodes_intent_locked:4;
-
- struct btree_iter_level {
- struct btree *b;
- struct btree_node_iter iter;
- u32 lock_seq;
- } l[BTREE_MAX_DEPTH];
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
+ struct bpos pos;
+ struct bpos pos_after_commit;
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
+
+ /* BTREE_ITER_WITH_JOURNAL: */
+ size_t journal_idx;
+ struct bpos journal_pos;
+#ifdef CONFIG_BCACHEFS_DEBUG
unsigned long ip_allocated;
+#endif
};
-static inline enum btree_iter_type
-btree_iter_type(const struct btree_iter *iter)
-{
- return iter->flags & BTREE_ITER_TYPE;
-}
-
-static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
-{
- return iter->l + iter->level;
-}
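+/* Small per-cpu stash of free bkey_cached objects (see pcpu_freed below): */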
+struct btree_key_cache_freelist {
+ struct bkey_cached *objs[16];
+ unsigned nr;
+};
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
- struct list_head freed;
- struct list_head clean;
+ bool table_init_done;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
+ struct shrinker shrink;
+ unsigned shrink_iter;
+ struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+ atomic_long_t nr_freed;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
};
struct bkey_cached_key {
struct bpos pos;
} __attribute__((packed, aligned(4)));
-#define BKEY_CACHED_DIRTY 0
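+/* Bits in bkey_cached->flags: */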
+#define BKEY_CACHED_ACCESSED 0
+#define BKEY_CACHED_DIRTY 1
struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
- u8 u64s;
+ u16 u64s;
bool valid;
+ u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;
struct bkey_i *k;
};
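+/*
+ * Position of a node in the btree node cache or key cache: the node's max key
+ * for real btree nodes, the cached key's position for key cache entries.
+ */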
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+ return !b->cached
+ ? container_of(b, struct btree, c)->key.k.p
+ : container_of(b, struct bkey_cached, c)->key.pos;
+}
+
struct btree_insert_entry {
- unsigned trigger_flags;
- unsigned trans_triggers_run:1;
+ unsigned flags;
+ u8 bkey_type;
+ enum btree_id btree_id:8;
+ u8 level:4;
+ bool cached:1;
+ bool insert_trigger_run:1;
+ bool overwrite_trigger_run:1;
+ bool key_cache_already_flushed:1;
+ /*
+ * @old_k may be a key from the journal; @old_btree_u64s always refers
+ * to the size of the key being overwritten in the btree:
+ */
+ u8 old_btree_u64s;
struct bkey_i *k;
- struct btree_iter *iter;
+ struct btree_path *path;
+ /* key being overwritten: */
+ struct bkey old_k;
+ const struct bch_val *old_v;
+ unsigned long ip_allocated;
};
#ifndef CONFIG_LOCKDEP
#define BTREE_ITER_MAX 32
#endif
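+/*
+ * Hooks to run when a transaction commits, chained in a singly linked list
+ * off btree_trans::hooks:
+ */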
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+ btree_trans_commit_hook_fn *fn;
+ struct btree_trans_commit_hook *next;
+};
+
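+/* Upper bound on the memory available to a transaction via trans->mem: */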
+#define BTREE_TRANS_MEM_MAX (1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
+
struct btree_trans {
struct bch_fs *c;
-#ifdef CONFIG_BCACHEFS_DEBUG
+ const char *fn;
+ struct closure ref;
struct list_head list;
- struct btree *locking;
- unsigned locking_iter_idx;
- struct bpos locking_pos;
- u8 locking_btree_id;
- u8 locking_level;
- pid_t pid;
-#endif
- unsigned long ip;
+ u64 last_begin_time;
+
+ u8 lock_may_not_fail;
+ u8 lock_must_abort;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
- u64 iters_linked;
- u64 iters_live;
- u64 iters_touched;
+ int srcu_idx;
- u8 nr_iters;
+ u8 fn_idx;
+ u8 nr_sorted;
u8 nr_updates;
- u8 nr_updates2;
- u8 size;
- unsigned used_mempool:1;
- unsigned error:1;
- unsigned nounlock:1;
- unsigned need_reset:1;
- unsigned in_traverse_all:1;
+ u8 traverse_all_idx;
+ bool used_mempool:1;
+ bool in_traverse_all:1;
+ bool memory_allocation_failure:1;
+ bool is_initial_gc:1;
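+	/* Error code for why the transaction was last restarted: */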
+ enum bch_errcode restarted:16;
+ u32 restart_count;
+ unsigned long last_restarted_ip;
+
+ /*
+ * For when bch2_trans_update notices we'll be splitting a compressed
+ * extent:
+ */
+ unsigned extra_journal_res;
+ unsigned nr_max_paths;
+
+ u64 paths_allocated;
unsigned mem_top;
+ unsigned mem_max;
unsigned mem_bytes;
void *mem;
- struct btree_iter *iters;
+ u8 sorted[BTREE_ITER_MAX];
+ struct btree_path *paths;
struct btree_insert_entry *updates;
- struct btree_insert_entry *updates2;
/* update path: */
- struct jset_entry *extra_journal_entries;
- unsigned extra_journal_entry_u64s;
+ struct btree_trans_commit_hook *hooks;
+ DARRAY(u64) extra_journal_entries;
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
unsigned journal_u64s;
unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
+};
+
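+/*
+ * Btree node flags, as an x-macro: expands to enum btree_flags and to the
+ * per-flag btree_node_*() accessors below.
+ */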
+#define BTREE_FLAGS() \
+ x(read_in_flight) \
+ x(read_error) \
+ x(dirty) \
+ x(need_write) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(noevict) \
+ x(write_idx) \
+ x(accessed) \
+ x(write_in_flight) \
+ x(write_in_flight_inner) \
+ x(just_written) \
+ x(dying) \
+ x(fake) \
+ x(need_rewrite) \
+ x(never_write)
- struct btree_iter iters_onstack[2];
- struct btree_insert_entry updates_onstack[2];
- struct btree_insert_entry updates2_onstack[2];
+enum btree_flags {
+#define x(flag) BTREE_NODE_##flag,
+ BTREE_FLAGS()
+#undef x
};
-#define BTREE_FLAG(flag) \
+#define x(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
static inline void clear_btree_node_ ## flag(struct btree *b) \
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-enum btree_flags {
- BTREE_NODE_read_in_flight,
- BTREE_NODE_read_error,
- BTREE_NODE_dirty,
- BTREE_NODE_need_write,
- BTREE_NODE_noevict,
- BTREE_NODE_write_idx,
- BTREE_NODE_accessed,
- BTREE_NODE_write_in_flight,
- BTREE_NODE_just_written,
- BTREE_NODE_dying,
- BTREE_NODE_fake,
- BTREE_NODE_old_extent_overwrite,
- BTREE_NODE_need_rewrite,
-};
-
-BTREE_FLAG(read_in_flight);
-BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
-BTREE_FLAG(need_write);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(just_written);
-BTREE_FLAG(dying);
-BTREE_FLAG(fake);
-BTREE_FLAG(old_extent_overwrite);
-BTREE_FLAG(need_rewrite);
+BTREE_FLAGS()
+#undef x
static inline struct btree_write *btree_current_write(struct btree *b)
{
}
enum btree_node_type {
-#define x(kwd, val, name) BKEY_TYPE_##kwd = val,
+#define x(kwd, val) BKEY_TYPE_##kwd = val,
BCH_BTREE_IDS()
#undef x
- BKEY_TYPE_BTREE,
+ BKEY_TYPE_btree,
};
/* Type of a key in btree @id at level @level: */
static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
{
- return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id;
+ return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
}
/* Type of keys @b contains: */
return __btree_node_type(b->c.level, b->c.btree_id);
}
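+/*
+ * Key types that have trigger functions: transactional triggers run as part
+ * of a btree transaction, memory triggers update in-memory accounting (and
+ * are also run by gc):
+ */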
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
+ ((1U << BKEY_TYPE_extents)| \
+ (1U << BKEY_TYPE_alloc)| \
+ (1U << BKEY_TYPE_inodes)| \
+ (1U << BKEY_TYPE_stripes)| \
+ (1U << BKEY_TYPE_reflink)| \
+ (1U << BKEY_TYPE_btree))
+
+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+ ((1U << BKEY_TYPE_alloc)| \
+ (1U << BKEY_TYPE_inodes)| \
+ (1U << BKEY_TYPE_stripes)| \
+ (1U << BKEY_TYPE_snapshots))
+
+#define BTREE_NODE_TYPE_HAS_TRIGGERS \
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
+ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+
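+/* Btrees whose keys are extents, i.e. span a range of the keyspace: */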
+#define BTREE_ID_IS_EXTENTS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_reflink)| \
+ (1U << BTREE_ID_freespace))
+
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- switch (type) {
- case BKEY_TYPE_EXTENTS:
- case BKEY_TYPE_REFLINK:
- return true;
- default:
- return false;
- }
+ return (1U << type) & BTREE_ID_IS_EXTENTS;
}
-static inline bool btree_node_is_extents(struct btree *b)
-{
- return btree_node_type_is_extents(btree_node_type(b));
-}
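+/* Btrees whose key positions include a snapshot ID: */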
+#define BTREE_ID_HAS_SNAPSHOTS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_inodes)| \
+ (1U << BTREE_ID_dirents)| \
+ (1U << BTREE_ID_xattrs))
+
+#define BTREE_ID_HAS_PTRS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_reflink))
-static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter)
+static inline bool btree_type_has_snapshots(enum btree_id id)
{
- return __btree_node_type(iter->level, iter->btree_id);
+ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
}
-static inline bool btree_iter_is_extents(struct btree_iter *iter)
+static inline bool btree_type_has_ptrs(enum btree_id id)
{
- return btree_node_type_is_extents(btree_iter_key_type(iter));
+ return (1 << id) & BTREE_ID_HAS_PTRS;
}
-#define BTREE_NODE_TYPE_HAS_TRIGGERS \
- ((1U << BKEY_TYPE_EXTENTS)| \
- (1U << BKEY_TYPE_ALLOC)| \
- (1U << BKEY_TYPE_INODES)| \
- (1U << BKEY_TYPE_REFLINK)| \
- (1U << BKEY_TYPE_EC)| \
- (1U << BKEY_TYPE_BTREE))
-
-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
- ((1U << BKEY_TYPE_EXTENTS)| \
- (1U << BKEY_TYPE_INODES)| \
- (1U << BKEY_TYPE_REFLINK))
-
-enum btree_trigger_flags {
- __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
-
- __BTREE_TRIGGER_INSERT,
- __BTREE_TRIGGER_OVERWRITE,
- __BTREE_TRIGGER_OVERWRITE_SPLIT,
-
- __BTREE_TRIGGER_GC,
- __BTREE_TRIGGER_BUCKET_INVALIDATE,
- __BTREE_TRIGGER_NOATOMIC,
-};
-
-#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
-
-#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
-#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT)
-
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
-#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
s8 error;
};
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
enum btree_insert_ret {
BTREE_INSERT_OK,
/* leaf node needs to be split */
BTREE_INSERT_BTREE_NODE_FULL,
- BTREE_INSERT_ENOSPC,
BTREE_INSERT_NEED_MARK_REPLICAS,
BTREE_INSERT_NEED_JOURNAL_RES,
+ BTREE_INSERT_NEED_JOURNAL_RECLAIM,
};
enum btree_gc_coalesce_fail_reason {
btree_next_sib,
};
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
- struct btree *,
- struct btree_node_iter *);
-
#endif /* _BCACHEFS_BTREE_TYPES_H */