X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_types.h;h=1e4d1fecc6bd336539025e15d7de80b3f50028a0;hb=5ec29da373fa39680dcb5aa41f78b93d85fbb51f;hp=b7af88e058373ccaf298d468024d776ef54b58a4;hpb=e0eb64c84601b2bbae055df809dd21f95f85c034;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index b7af88e..1e4d1fe 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -8,6 +8,7 @@ #include "bkey_methods.h" #include "buckets_types.h" +#include "darray.h" #include "journal_types.h" struct open_bucket; @@ -47,35 +48,34 @@ struct bset_tree { u16 data_offset; u16 aux_data_offset; u16 end_offset; - - struct bpos max_key; }; struct btree_write { struct journal_entry_pin journal; - struct closure_waitlist wait; }; struct btree_alloc { struct open_buckets ob; - BKEY_PADDED(k); + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); }; -struct btree { - /* Hottest entries first */ - struct rhash_head hash; +struct btree_bkey_cached_common { + struct six_lock lock; + u8 level; + u8 btree_id; +}; - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); +struct btree { + struct btree_bkey_cached_common c; - struct six_lock lock; + struct rhash_head hash; + u64 hash_val; unsigned long flags; u16 written; - u8 level; - u8 btree_id; u8 nsets; u8 nr_key_bits; + u16 version_ondisk; struct bkey_format format; @@ -94,9 +94,14 @@ struct btree { struct btree_nr_keys nr; u16 sib_u64s[2]; u16 whiteout_u64s; - u8 page_order; + u8 byte_order; u8 unpack_fn_len; + struct btree_write writes[2]; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + /* * XXX: add a delete sequence number, so when bch2_btree_node_relock() * fails because the lock sequence number has changed - i.e. the @@ -127,12 +132,6 @@ struct btree { /* lru list */ struct list_head list; - - struct btree_write writes[2]; - -#ifdef CONFIG_BCACHEFS_DEBUG - bool *expensive_debug_checks; -#endif }; struct btree_cache { @@ -154,11 +153,13 @@ struct btree_cache { struct mutex lock; struct list_head live; struct list_head freeable; - struct list_head freed; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; + atomic_t dirty; struct shrinker shrink; /* @@ -177,121 +178,254 @@ struct btree_node_iter { } data[MAX_BSETS]; }; -enum btree_iter_type { - BTREE_ITER_KEYS, - BTREE_ITER_NODES, -}; - -#define BTREE_ITER_TYPE ((1 << 2) - 1) - /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -#define BTREE_ITER_SLOTS (1 << 2) +#define BTREE_ITER_SLOTS (1 << 0) +#define BTREE_ITER_ALL_LEVELS (1 << 1) /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 3) +#define BTREE_ITER_INTENT (1 << 2) /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 4) -/* - * Indicates that this iterator should not be reused until transaction commit, - * either because a pending update references it or because the update depends - * on that particular key being locked (e.g. by the str_hash code, for hash - * table consistency) - */ -#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) +#define BTREE_ITER_PREFETCH (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 6) -#define BTREE_ITER_ERROR (1 << 7) -#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) - -enum btree_iter_uptodate { +#define BTREE_ITER_IS_EXTENTS (1 << 4) +#define BTREE_ITER_NOT_EXTENTS (1 << 5) +#define BTREE_ITER_CACHED (1 << 6) +#define BTREE_ITER_CACHED_NOFILL (1 << 7) +#define BTREE_ITER_CACHED_NOCREATE (1 << 8) +#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) +#define BTREE_ITER_WITH_UPDATES (1 << 10) +#define BTREE_ITER_WITH_JOURNAL (1 << 11) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) +#define BTREE_ITER_NOPRESERVE (1 << 15) + +enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_PEEK = 1, - BTREE_ITER_NEED_RELOCK = 2, - BTREE_ITER_NEED_TRAVERSE = 3, + BTREE_ITER_NEED_RELOCK = 1, + BTREE_ITER_NEED_TRAVERSE = 2, }; -/* - * @pos - iterator's current position - * @level - current btree depth - * @locks_want - btree level below which we start taking intent locks - * @nodes_locked - bitmask indicating which nodes in @nodes are locked - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ -struct btree_iter { - struct btree_trans *trans; - struct bpos pos; - struct bpos pos_after_commit; +#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) +#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) +#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) +#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) +#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) +#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) +#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) +#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) - u16 flags; +struct btree_path { u8 idx; + u8 sorted_idx; + u8 ref; + u8 intent_ref; + + /* btree_iter_copy starts here: */ + struct bpos pos; - enum btree_iter_uptodate uptodate:4; enum btree_id btree_id:4; - unsigned level:4, + bool cached:1; + bool preserve:1; + enum btree_path_uptodate uptodate:2; + /* + * When true, failing to relock this path will cause the transaction to + * restart: + */ + bool should_be_locked:1; + unsigned level:3, locks_want:4, nodes_locked:4, nodes_intent_locked:4; - struct btree_iter_level { + struct btree_path_level { struct btree *b; struct btree_node_iter iter; u32 lock_seq; } l[BTREE_MAX_DEPTH]; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +#endif +}; +static inline struct btree_path_level *path_l(struct btree_path *path) +{ + return path->l + path->level; +} + +/* + * @pos - iterator's current position + * @level - current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ +struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; + struct btree_path *update_path; + struct btree_path *key_cache_path; + + enum btree_id btree_id:4; + unsigned min_depth:3; + unsigned advanced:1; + + /* btree_iter_copy starts here: */ + u16 flags; + + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; + + struct bpos pos; + struct bpos pos_after_commit; /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; + + /* BTREE_ITER_WITH_JOURNAL: */ + size_t journal_idx; + struct bpos journal_pos; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +#endif }; -static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_TYPE; -} +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + struct list_head freed; + struct shrinker shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __attribute__((packed, aligned(4))); + +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 + +struct bkey_cached { + struct btree_bkey_cached_common c; + + unsigned long flags; + u16 u64s; + bool valid; + u32 btree_trans_barrier_seq; + struct bkey_cached_key key; + + struct rhash_head hash; + struct list_head list; + + struct journal_preres res; + struct journal_entry_pin journal; + + struct bkey_i *k; +}; struct btree_insert_entry { - unsigned trigger_flags; - unsigned trans_triggers_run:1; + unsigned flags; + u8 bkey_type; + enum btree_id btree_id:8; + u8 level:4; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; + bool key_cache_already_flushed:1; + /* + * @old_k may be a key from the journal; @old_btree_u64s always refers + * to the size of the key being overwritten in the btree: + */ + u8 old_btree_u64s; struct bkey_i *k; - struct btree_iter *iter; + struct btree_path *path; + /* key being overwritten: */ + struct bkey old_k; + const struct bch_val *old_v; + unsigned long ip_allocated; }; +#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 +#else +#define BTREE_ITER_MAX 32 +#endif -struct btree_trans { - struct bch_fs *c; - unsigned long ip; +struct btree_trans_commit_hook; +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); + +struct btree_trans_commit_hook { + btree_trans_commit_hook_fn *fn; + struct btree_trans_commit_hook *next; +}; - u64 iters_linked; - u64 iters_live; - u64 iters_touched; +#define BTREE_TRANS_MEM_MAX (1U << 16) - u8 nr_iters; +struct btree_trans { + struct bch_fs *c; + const char *fn; + struct list_head list; + struct btree *locking; + unsigned locking_path_idx; + struct bpos locking_pos; + u8 locking_btree_id; + u8 locking_level; + u8 locking_lock_type; + struct task_struct *task; + int srcu_idx; + + u8 nr_sorted; u8 nr_updates; - u8 size; - unsigned used_mempool:1; - unsigned error:1; - unsigned nounlock:1; - unsigned need_reset:1; + u8 traverse_all_idx; + bool used_mempool:1; + bool in_traverse_all:1; + bool restarted:1; + bool memory_allocation_failure:1; + bool is_initial_gc:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: + */ + unsigned extra_journal_res; + + u64 paths_allocated; unsigned mem_top; unsigned mem_bytes; void *mem; - struct btree_iter *iters; + u8 sorted[BTREE_ITER_MAX]; + struct btree_path *paths; struct btree_insert_entry *updates; /* update path: */ + struct btree_trans_commit_hook *hooks; + DARRAY(u64) extra_journal_entries; + struct journal_entry_pin *journal_pin; + struct journal_res journal_res; struct journal_preres journal_preres; u64 *journal_seq; @@ -300,12 +434,33 @@ struct btree_trans { unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; +}; - struct btree_iter iters_onstack[2]; - struct btree_insert_entry updates_onstack[2]; +#define BTREE_FLAGS() \ + x(read_in_flight) \ + x(read_error) \ + x(dirty) \ + x(need_write) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(noevict) \ + x(write_idx) \ + x(accessed) \ + x(write_in_flight) \ + x(write_in_flight_inner) \ + x(just_written) \ + x(dying) \ + x(fake) \ + x(need_rewrite) \ + x(never_write) + +enum btree_flags { +#define x(flag) BTREE_NODE_##flag, + BTREE_FLAGS() +#undef x }; -#define BTREE_FLAG(flag) \ +#define x(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ \ @@ -315,33 +470,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ static inline void clear_btree_node_ ## flag(struct btree *b) \ { clear_bit(BTREE_NODE_ ## flag, &b->flags); } -enum btree_flags { - BTREE_NODE_read_in_flight, - BTREE_NODE_read_error, - BTREE_NODE_dirty, - BTREE_NODE_need_write, - BTREE_NODE_noevict, - BTREE_NODE_write_idx, - BTREE_NODE_accessed, - BTREE_NODE_write_in_flight, - BTREE_NODE_just_written, - BTREE_NODE_dying, - BTREE_NODE_fake, - BTREE_NODE_old_extent_overwrite, -}; - -BTREE_FLAG(read_in_flight); -BTREE_FLAG(read_error); -BTREE_FLAG(dirty); -BTREE_FLAG(need_write); -BTREE_FLAG(noevict); -BTREE_FLAG(write_idx); -BTREE_FLAG(accessed); -BTREE_FLAG(write_in_flight); -BTREE_FLAG(just_written); -BTREE_FLAG(dying); -BTREE_FLAG(fake); -BTREE_FLAG(old_extent_overwrite); +BTREE_FLAGS() +#undef x static inline struct btree_write *btree_current_write(struct btree *b) { @@ -453,78 +583,71 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) } enum btree_node_type { -#define x(kwd, val, name) BKEY_TYPE_##kwd = val, +#define x(kwd, val) BKEY_TYPE_##kwd = val, BCH_BTREE_IDS() #undef x - BKEY_TYPE_BTREE, + BKEY_TYPE_btree, }; /* Type of a key in btree @id at level @level: */ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) { - return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; + return level ? BKEY_TYPE_btree : (enum btree_node_type) id; } /* Type of keys @b contains: */ static inline enum btree_node_type btree_node_type(struct btree *b) { - return __btree_node_type(b->level, b->btree_id); + return __btree_node_type(b->c.level, b->c.btree_id); } -static inline bool btree_node_type_is_extents(enum btree_node_type type) -{ - switch (type) { - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_REFLINK: - return true; - default: - return false; - } -} +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)| \ + (1U << BKEY_TYPE_btree)) + +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ + ((1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_snapshots)) -static inline bool btree_node_is_extents(struct btree *b) +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + +#define BTREE_ID_IS_EXTENTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)| \ + (1U << BTREE_ID_freespace)) + +static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return btree_node_type_is_extents(btree_node_type(b)); + return (1U << type) & BTREE_ID_IS_EXTENTS; } -#define BTREE_NODE_TYPE_HAS_TRIGGERS \ - ((1U << BKEY_TYPE_EXTENTS)| \ - (1U << BKEY_TYPE_ALLOC)| \ - (1U << BKEY_TYPE_INODES)| \ - (1U << BKEY_TYPE_REFLINK)| \ - (1U << BKEY_TYPE_EC)| \ - (1U << BKEY_TYPE_BTREE)) +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ + (1U << BTREE_ID_dirents)| \ + (1U << BTREE_ID_xattrs)) -#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - ((1U << BKEY_TYPE_EXTENTS)| \ - (1U << BKEY_TYPE_INODES)| \ - (1U << BKEY_TYPE_REFLINK)) - -enum btree_trigger_flags { - __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ - - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_OVERWRITE_SPLIT, - - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_ALLOC_READ, - __BTREE_TRIGGER_NOATOMIC, -}; +#define BTREE_ID_HAS_PTRS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)) -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) -#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) - -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) +static inline bool btree_type_has_snapshots(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; +} -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) -#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) +static inline bool btree_type_has_ptrs(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_PTRS; +} static inline bool btree_node_type_needs_gc(enum btree_node_type type) { @@ -534,8 +657,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) struct btree_root { struct btree *b; - struct btree_update *as; - /* On disk root - see async splits: */ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; @@ -543,18 +664,13 @@ struct btree_root { s8 error; }; -/* - * Optional hook that will be called just prior to a btree node update, when - * we're holding the write lock and we know what key is about to be overwritten: - */ - enum btree_insert_ret { BTREE_INSERT_OK, /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_MARK_REPLICAS, BTREE_INSERT_NEED_JOURNAL_RES, + BTREE_INSERT_NEED_JOURNAL_RECLAIM, }; enum btree_gc_coalesce_fail_reason { @@ -568,8 +684,4 @@ enum btree_node_sibling { btree_next_sib, }; -typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, - struct btree *, - struct btree_node_iter *); - #endif /* _BCACHEFS_BTREE_TYPES_H */