-#ifndef _BCACHE_BTREE_TYPES_H
-#define _BCACHE_BTREE_TYPES_H
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_TYPES_H
+#define _BCACHEFS_BTREE_TYPES_H
#include <linux/list.h>
#include <linux/rhashtable.h>
-#include <linux/semaphore.h>
-#include <linux/workqueue.h>
+#include <linux/six.h>
#include "bkey_methods.h"
+#include "buckets_types.h"
#include "journal_types.h"
-#include "six.h"
struct open_bucket;
-struct btree_interior_update;
+struct btree_update;
+struct btree_trans;
#define MAX_BSETS 3U
struct btree_write {
struct journal_entry_pin journal;
- struct closure_waitlist wait;
};
-struct btree {
- /* Hottest entries first */
- struct rhash_head hash;
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+struct btree_alloc {
+ struct open_buckets ob;
+ BKEY_PADDED(k);
+};
+struct btree_bkey_cached_common {
struct six_lock lock;
+ u8 level;
+ u8 btree_id;
+};
+
+struct btree {
+ struct btree_bkey_cached_common c;
+
+ struct rhash_head hash;
+ u64 hash_val;
unsigned long flags;
u16 written;
- u8 level;
- u8 btree_id;
u8 nsets;
u8 nr_key_bits;
struct btree_nr_keys nr;
u16 sib_u64s[2];
u16 whiteout_u64s;
- u16 uncompacted_whiteout_u64s;
u8 page_order;
u8 unpack_fn_len;
* node to point to them: we update the parent in memory immediately,
* but then we must wait until the children have been written out before
* the update to the parent can be written - this is a list of the
- * btree_interior_updates that are blocking this node from being
+ * btree_updates that are blocking this node from being
* written:
*/
struct list_head write_blocked;
* another write - because that write also won't yet be reachable and
* marking it as completed before it's reachable would be incorrect:
*/
- struct btree_interior_update *will_make_reachable;
+ unsigned long will_make_reachable;
- struct open_bucket *ob;
+ struct open_buckets ob;
/* lru list */
struct list_head list;
#ifdef CONFIG_BCACHEFS_DEBUG
bool *expensive_debug_checks;
#endif
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+struct btree_cache {
+ struct rhashtable table;
+ bool table_init_done;
+ /*
+ * We never free a struct btree, except on shutdown - we just put it on
+ * the btree_cache_freed list and reuse it later. This simplifies the
+ * code, and it doesn't cost us much memory as the memory usage is
+ * dominated by buffers that hold the actual btree node data and those
+ * can be freed - and the number of struct btrees allocated is
+ * effectively bounded.
+ *
+ * btree_cache_freeable effectively is a small cache - we use it because
+ * high order page allocations can be rather expensive, and it's quite
+ * common to delete and allocate btree nodes in quick succession. It
+ * should never grow past ~2-3 nodes in practice.
+ */
+ struct mutex lock;
+ struct list_head live;
+ struct list_head freeable;
+ struct list_head freed;
+
+ /* Number of elements in live + freeable lists */
+ unsigned used;
+ unsigned reserve;
+ struct shrinker shrink;
+
+ /*
+ * If we need to allocate memory for a new btree node and that
+ * allocation fails, we can cannibalize another node in the btree cache
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
+ */
+ struct task_struct *alloc_lock;
+ struct closure_waitlist alloc_wait;
+};
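+
+/*
+ * Usage sketch (illustrative, not part of this patch): allocation paths
+ * take the cannibalize lock around btree node allocation, roughly
+ *
+ *   bch2_btree_cache_cannibalize_lock(c, &cl);
+ *   b = bch2_btree_node_mem_alloc(c);
+ *   bch2_btree_cache_cannibalize_unlock(c);
+ *
+ * with ->alloc_lock set to current and other would-be cannibalizers
+ * sleeping on ->alloc_wait. Function names are assumed from btree_cache.h;
+ * treat the exact sequence as a sketch, not a contract.
+ */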
+
+struct btree_node_iter {
+ struct btree_node_iter_set {
+ u16 k, end;
+ } data[MAX_BSETS];
+};
+
+enum btree_iter_type {
+ BTREE_ITER_KEYS,
+ BTREE_ITER_NODES,
+ BTREE_ITER_CACHED,
+};
+
+#define BTREE_ITER_TYPE ((1 << 2) - 1)
+
+/*
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
+ */
+#define BTREE_ITER_SLOTS (1 << 2)
+/*
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
+ * to be doing updates:
+ */
+#define BTREE_ITER_INTENT (1 << 3)
+/*
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
+ */
+#define BTREE_ITER_PREFETCH (1 << 4)
+/*
+ * Indicates that this iterator should not be reused until transaction commit,
+ * either because a pending update references it or because the update depends
+ * on that particular key being locked (e.g. by the str_hash code, for hash
+ * table consistency)
+ */
+#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+#define BTREE_ITER_IS_EXTENTS (1 << 6)
+#define BTREE_ITER_ERROR (1 << 7)
+#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8)
+#define BTREE_ITER_CACHED_NOFILL (1 << 9)
+#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
+
+#define BTREE_ITER_USER_FLAGS \
+ (BTREE_ITER_SLOTS \
+ |BTREE_ITER_INTENT \
+ |BTREE_ITER_PREFETCH \
+ |BTREE_ITER_CACHED_NOFILL \
+ |BTREE_ITER_CACHED_NOCREATE)
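+
+/*
+ * Example (illustrative): an update path would typically combine flags when
+ * getting an iterator, e.g.
+ *
+ *   iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, pos,
+ *                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ *
+ * bch2_trans_get_iter() is assumed from btree_iter.h; note that the low two
+ * bits of ->flags hold the enum btree_iter_type above.
+ */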
+
+enum btree_iter_uptodate {
+ BTREE_ITER_UPTODATE = 0,
+ BTREE_ITER_NEED_PEEK = 1,
+ BTREE_ITER_NEED_RELOCK = 2,
+ BTREE_ITER_NEED_TRAVERSE = 3,
+};
+
+#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
+#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
+#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
+#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
+#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
+#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
+#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
+
+/*
+ * @pos - iterator's current position
+ * @level - current btree depth
+ * @locks_want - btree level below which we start taking intent locks
+ * @nodes_locked - bitmask indicating which nodes in @nodes are locked
+ * @nodes_intent_locked - bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+ struct btree_trans *trans;
+ struct bpos pos;
+ struct bpos pos_after_commit;
+
+ u16 flags;
+ u8 idx;
+
+ enum btree_id btree_id:4;
+ enum btree_iter_uptodate uptodate:4;
+ unsigned level:4,
+ min_depth:4,
+ locks_want:4,
+ nodes_locked:4,
+ nodes_intent_locked:4;
+
+ struct btree_iter_level {
+ struct btree *b;
+ struct btree_node_iter iter;
+ u32 lock_seq;
+ } l[BTREE_MAX_DEPTH];
+
+ /*
+ * Current unpacked key - so that bch2_btree_iter_next()/
+ * bch2_btree_iter_next_slot() can correctly advance pos.
+ */
+ struct bkey k;
+ unsigned long ip_allocated;
+};
+
+static inline enum btree_iter_type
+btree_iter_type(const struct btree_iter *iter)
+{
+ return iter->flags & BTREE_ITER_TYPE;
+}
+
+static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
+{
+ return iter->l + iter->level;
+}
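+
+/*
+ * iter_l(iter)->b is the node the iterator is positioned in at its current
+ * ->level - for a fully traversed BTREE_ITER_KEYS iterator, the leaf.
+ */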
+
+struct btree_key_cache {
+ struct mutex lock;
+ struct rhashtable table;
+ struct list_head freed;
+ struct list_head clean;
+};
+
+struct bkey_cached_key {
+ u32 btree_id;
+ struct bpos pos;
+} __packed;
+
+#define BKEY_CACHED_DIRTY 0
+
+struct bkey_cached {
+ struct btree_bkey_cached_common c;
+
+ unsigned long flags;
+ u8 u64s;
+ bool valid;
+ struct bkey_cached_key key;
+
+ struct rhash_head hash;
+ struct list_head list;
+
+ struct journal_preres res;
+ struct journal_entry_pin journal;
+
+ struct bkey_i *k;
+};
+
+struct btree_insert_entry {
+ unsigned trigger_flags;
+ unsigned trans_triggers_run:1;
+ struct bkey_i *k;
+ struct btree_iter *iter;
+};
+
+#ifndef CONFIG_LOCKDEP
+#define BTREE_ITER_MAX 64
+#else
+#define BTREE_ITER_MAX 32
+#endif
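+
+/*
+ * Lower with lockdep: every live iterator can hold up to BTREE_MAX_DEPTH
+ * node locks, and lockdep can only track a limited number of held locks
+ * per task - presumably why the limit is halved here.
+ */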
+
+struct btree_trans {
+ struct bch_fs *c;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct list_head list;
+ struct btree *locking;
+ unsigned locking_iter_idx;
+ struct bpos locking_pos;
+ u8 locking_btree_id;
+ u8 locking_level;
+ pid_t pid;
+#endif
+ unsigned long ip;
+
+ u64 iters_linked;
+ u64 iters_live;
+ u64 iters_touched;
+
+ u8 nr_iters;
+ u8 nr_updates;
+ u8 nr_updates2;
+ u8 size;
+ unsigned used_mempool:1;
+ unsigned error:1;
+ unsigned nounlock:1;
+ unsigned need_reset:1;
+ unsigned in_traverse_all:1;
+
+ unsigned mem_top;
+ unsigned mem_bytes;
+ void *mem;
+
+ struct btree_iter *iters;
+ struct btree_insert_entry *updates;
+ struct btree_insert_entry *updates2;
+
+ /* update path: */
+ struct jset_entry *extra_journal_entries;
+ unsigned extra_journal_entry_u64s;
+ struct journal_entry_pin *journal_pin;
+
+ struct journal_res journal_res;
+ struct journal_preres journal_preres;
+ u64 *journal_seq;
+ struct disk_reservation *disk_res;
+ unsigned flags;
+ unsigned journal_u64s;
+ unsigned journal_preres_u64s;
+ struct replicas_delta_list *fs_usage_deltas;
+
+ struct btree_iter iters_onstack[2];
+ struct btree_insert_entry updates_onstack[2];
+ struct btree_insert_entry updates2_onstack[2];
};
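+
+/*
+ * Lifecycle sketch (illustrative, API names assumed from btree_iter.h):
+ *
+ *   struct btree_trans trans;
+ *
+ *   bch2_trans_init(&trans, c, 0, 0);
+ *   iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, pos, 0);
+ *   ...
+ *   bch2_trans_exit(&trans);
+ *
+ * Small transactions use the *_onstack arrays above; ->iters/->updates
+ * switch to heap/mempool allocations once those are outgrown.
+ */
+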
#define BTREE_FLAG(flag) \
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
+ BTREE_NODE_dying,
+ BTREE_NODE_fake,
+ BTREE_NODE_old_extent_overwrite,
};
BTREE_FLAG(read_in_flight);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
+BTREE_FLAG(fake);
+BTREE_FLAG(old_extent_overwrite);
static inline struct bset_tree *bset_tree_last(struct btree *b)
{
 return b->set + b->nsets - 1;
}
+static inline void *
+__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
+{
+ return (void *) ((u64 *) b->data + 1 + offset);
+}
+
+static inline u16
+__btree_node_ptr_to_offset(const struct btree *b, const void *p)
+{
+ u16 ret = (u64 *) p - 1 - (u64 *) b->data;
+
+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
+ return ret;
+}
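+
+/*
+ * Offsets are in units of u64s, relative to the u64 following the first
+ * u64 of b->data (hence the +1/-1 above): e.g. a pointer 9 u64s into
+ * b->data round-trips through offset 8, comfortably within a u16.
+ */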
+
static inline struct bset *bset(const struct btree *b,
const struct bset_tree *t)
{
- return (void *) b->data + t->data_offset * sizeof(u64);
+ return __btree_node_offset_to_ptr(b, t->data_offset);
+}
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+ t->end_offset =
+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+ const struct bset *i)
+{
+ t->data_offset = __btree_node_ptr_to_offset(b, i);
+ set_btree_bset_end(b, t);
}
static inline struct bset *btree_bset_first(struct btree *b)
static inline u16
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
{
- size_t ret = (u64 *) k - (u64 *) b->data - 1;
-
- EBUG_ON(ret > U16_MAX);
- return ret;
+ return __btree_node_ptr_to_offset(b, k);
}
static inline struct bkey_packed *
__btree_node_offset_to_key(const struct btree *b, u16 k)
{
- return (void *) ((u64 *) b->data + k + 1);
+ return __btree_node_offset_to_ptr(b, k);
}
-#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
+{
+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
+}
+
+#define btree_bkey_first(_b, _t) \
+({ \
+ EBUG_ON(bset(_b, _t)->start != \
+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
+ \
+ bset(_b, _t)->start; \
+})
#define btree_bkey_last(_b, _t) \
({ \
__btree_node_offset_to_key(_b, (_t)->end_offset); \
})
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+static inline unsigned bset_u64s(struct bset_tree *t)
{
- t->end_offset =
- __btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
- btree_bkey_last(b, t);
+ return t->end_offset - t->data_offset -
+ sizeof(struct bset) / sizeof(u64);
}
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
- const struct bset *i)
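+/*
+ * u64s in this bset occupied by keys that are no longer live;
+ * nr.bset_u64s tracks the live ones:
+ */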
+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
{
- t->data_offset = (u64 *) i - (u64 *) b->data;
-
- EBUG_ON(bset(b, t) != i);
-
- set_btree_bset_end(b, t);
+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
}
static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
 return i - (void *) b->data;
}
-/* Type of keys @b contains: */
-static inline enum bkey_type btree_node_type(struct btree *b)
+enum btree_node_type {
+#define x(kwd, val, name) BKEY_TYPE_##kwd = val,
+ BCH_BTREE_IDS()
+#undef x
+ BKEY_TYPE_BTREE,
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
{
- return b->level ? BKEY_TYPE_BTREE : b->btree_id;
+ return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id;
}
-static inline const struct bkey_ops *btree_node_ops(struct btree *b)
+/* Type of keys @b contains: */
+static inline enum btree_node_type btree_node_type(struct btree *b)
{
- return bch2_bkey_ops[btree_node_type(b)];
+ return __btree_node_type(b->c.level, b->c.btree_id);
}
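+
+/*
+ * E.g. __btree_node_type(0, BTREE_ID_EXTENTS) == BKEY_TYPE_EXTENTS, while
+ * any interior node (level > 0) holds btree pointers - BKEY_TYPE_BTREE -
+ * whatever its btree id.
+ */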
-static inline bool btree_node_has_ptrs(struct btree *b)
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- return btree_type_has_ptrs(btree_node_type(b));
+ switch (type) {
+ case BKEY_TYPE_EXTENTS:
+ case BKEY_TYPE_REFLINK:
+ return true;
+ default:
+ return false;
+ }
}
static inline bool btree_node_is_extents(struct btree *b)
{
- return btree_node_type(b) == BKEY_TYPE_EXTENTS;
+ return btree_node_type_is_extents(btree_node_type(b));
+}
+
+#define BTREE_NODE_TYPE_HAS_TRIGGERS \
+ ((1U << BKEY_TYPE_EXTENTS)| \
+ (1U << BKEY_TYPE_ALLOC)| \
+ (1U << BKEY_TYPE_INODES)| \
+ (1U << BKEY_TYPE_REFLINK)| \
+ (1U << BKEY_TYPE_EC)| \
+ (1U << BKEY_TYPE_BTREE))
+
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
+ ((1U << BKEY_TYPE_EXTENTS)| \
+ (1U << BKEY_TYPE_INODES)| \
+ (1U << BKEY_TYPE_REFLINK))
+
+enum btree_trigger_flags {
+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
+ __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */
+
+ __BTREE_TRIGGER_INSERT,
+ __BTREE_TRIGGER_OVERWRITE,
+ __BTREE_TRIGGER_OVERWRITE_SPLIT,
+
+ __BTREE_TRIGGER_GC,
+ __BTREE_TRIGGER_BUCKET_INVALIDATE,
+ __BTREE_TRIGGER_ALLOC_READ,
+ __BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES)
+
+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
+#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT)
+
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ)
+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
+
+static inline bool btree_node_type_needs_gc(enum btree_node_type type)
+{
+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
}
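+
+/*
+ * Usage sketch (illustrative): trigger flags are passed per update, e.g.
+ *
+ *   bch2_trans_update(&trans, iter, k, BTREE_TRIGGER_NORUN);
+ *
+ * to queue an update whose triggers should not run; that bch2_trans_update()
+ * takes enum btree_trigger_flags is assumed from btree_update.h in this
+ * series.
+ */
+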
struct btree_root {
struct btree *b;
- struct btree_interior_update *as;
-
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
u8 alive;
+ s8 error;
};
/*
* we're holding the write lock and we know what key is about to be overwritten:
*/
-struct btree_iter;
-struct btree_node_iter;
-
-enum extent_insert_hook_ret {
- BTREE_HOOK_DO_INSERT,
- BTREE_HOOK_NO_INSERT,
- BTREE_HOOK_RESTART_TRANS,
-};
-
-struct extent_insert_hook {
- enum extent_insert_hook_ret
- (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
- struct bkey_s_c, const struct bkey_i *);
-};
-
enum btree_insert_ret {
BTREE_INSERT_OK,
- /* extent spanned multiple leaf nodes: have to traverse to next node: */
- BTREE_INSERT_NEED_TRAVERSE,
- /* write lock held for too long */
- BTREE_INSERT_NEED_RESCHED,
/* leaf node needs to be split */
BTREE_INSERT_BTREE_NODE_FULL,
- BTREE_INSERT_JOURNAL_RES_FULL,
BTREE_INSERT_ENOSPC,
- BTREE_INSERT_NEED_GC_LOCK,
+ BTREE_INSERT_NEED_MARK_REPLICAS,
+ BTREE_INSERT_NEED_JOURNAL_RES,
};
enum btree_gc_coalesce_fail_reason {
BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
};
+enum btree_node_sibling {
+ btree_prev_sib,
+ btree_next_sib,
+};
+
typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
struct btree *,
struct btree_node_iter *);
-#endif /* _BCACHE_BTREE_TYPES_H */
+#endif /* _BCACHEFS_BTREE_TYPES_H */