X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_types.h;h=07c509aaa4ef399bf07d3119eac84a61965c4fbb;hb=5a5a6c25a93e83e0b1e659d7808e7636d0b770ed;hp=18596dc8d7ba9b7f946566d8c935facbe29c8ac1;hpb=17c5215c1c542dd7b6b4f891a0da16d8c98e0591;p=bcachefs-tools-debian

diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 18596dc..07c509a 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _BCACHEFS_BTREE_TYPES_H
 #define _BCACHEFS_BTREE_TYPES_H
 
@@ -5,11 +6,14 @@
 #include <linux/rhashtable.h>
 #include <linux/six.h>
 
-#include "bkey_methods.h"
+//#include "bkey_methods.h"
+#include "buckets_types.h"
+#include "darray.h"
 #include "journal_types.h"
 
 struct open_bucket;
 struct btree_update;
+struct btree_trans;
 
 #define MAX_BSETS		3U
 
@@ -44,35 +48,35 @@ struct bset_tree {
 	u16			data_offset;
 	u16			aux_data_offset;
 	u16			end_offset;
-
-	struct bpos		max_key;
 };
 
 struct btree_write {
 	struct journal_entry_pin	journal;
-	struct closure_waitlist		wait;
 };
 
 struct btree_alloc {
 	struct open_buckets	ob;
-	BKEY_PADDED(k);
+	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
 };
 
-struct btree {
-	/* Hottest entries first */
-	struct rhash_head	hash;
+struct btree_bkey_cached_common {
+	struct six_lock		lock;
+	u8			level;
+	u8			btree_id;
+	bool			cached;
+};
 
-	/* Key/pointer for this btree node */
-	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+struct btree {
+	struct btree_bkey_cached_common c;
 
-	struct six_lock		lock;
+	struct rhash_head	hash;
+	u64			hash_val;
 
 	unsigned long		flags;
 	u16			written;
-	u8			level;
-	u8			btree_id;
 	u8			nsets;
 	u8			nr_key_bits;
+	u16			version_ondisk;
 
 	struct bkey_format	format;
 
@@ -91,10 +95,14 @@ struct btree {
 	struct btree_nr_keys	nr;
 	u16			sib_u64s[2];
 	u16			whiteout_u64s;
-	u16			uncompacted_whiteout_u64s;
-	u8			page_order;
+	u8			byte_order;
 	u8			unpack_fn_len;
 
+	struct btree_write	writes[2];
+
+	/* Key/pointer for this btree node */
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
 	/*
 	 * XXX: add a delete sequence number, so when bch2_btree_node_relock()
 	 * fails because the lock sequence number has changed - i.e. the
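The structural change running through the hunks above is the new struct btree_bkey_cached_common: the six lock, level, btree_id and the new cached flag move out of struct btree into this common header, embedded as b->c. struct bkey_cached, added later in this diff, begins with the same header, so locking code can operate on a struct btree_bkey_cached_common * and recover the real object by dispatching on ->cached, exactly as btree_node_pos() does further down. A standalone sketch of that pattern, with kernel types stubbed out; as_btree_node() is a hypothetical helper, not part of this header:

/* Standalone sketch only: kernel types are stubbed, and as_btree_node()
 * is hypothetical -- it illustrates the container_of() dispatch that the
 * common header enables. */
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint8_t u8;

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct six_lock { int state; };	/* stand-in for <linux/six.h> */

struct btree_bkey_cached_common {
	struct six_lock	lock;
	u8		level;
	u8		btree_id;
	bool		cached;
};

struct btree       { struct btree_bkey_cached_common c; /* ... */ };
struct bkey_cached { struct btree_bkey_cached_common c; /* ... */ };

/* Recover the containing btree node, or NULL for a cached key: */
static inline struct btree *as_btree_node(struct btree_bkey_cached_common *b)
{
	return b->cached ? NULL : container_of(b, struct btree, c);
}

Note also that the node's __BKEY_PADDED key moves to the end of struct btree, presumably keeping the hot lookup fields (lock, flags, bset bookkeeping) together at the front.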
@@ -125,12 +133,6 @@ struct btree {
 
 	/* lru list */
 	struct list_head	list;
-
-	struct btree_write	writes[2];
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-	bool			*expensive_debug_checks;
-#endif
 };
 
 struct btree_cache {
@@ -152,11 +154,23 @@ struct btree_cache {
 	struct mutex		lock;
 	struct list_head	live;
 	struct list_head	freeable;
-	struct list_head	freed;
+	struct list_head	freed_pcpu;
+	struct list_head	freed_nonpcpu;
 
 	/* Number of elements in live + freeable lists */
 	unsigned		used;
 	unsigned		reserve;
+	unsigned		freed;
+	unsigned		not_freed_lock_intent;
+	unsigned		not_freed_lock_write;
+	unsigned		not_freed_dirty;
+	unsigned		not_freed_read_in_flight;
+	unsigned		not_freed_write_in_flight;
+	unsigned		not_freed_noevict;
+	unsigned		not_freed_write_blocked;
+	unsigned		not_freed_will_make_reachable;
+	unsigned		not_freed_access_bit;
+
 	atomic_t		dirty;
 	struct shrinker		shrink;
 
 	/*
@@ -175,31 +189,81 @@ struct btree_node_iter {
 	} data[MAX_BSETS];
 };
 
-enum btree_iter_type {
-	BTREE_ITER_KEYS,
-	BTREE_ITER_SLOTS,
-	BTREE_ITER_NODES,
-};
-
-#define BTREE_ITER_TYPE ((1 << 2) - 1)
-
+/*
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
+ */
+#define BTREE_ITER_SLOTS		(1 << 0)
+#define BTREE_ITER_ALL_LEVELS		(1 << 1)
+/*
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
+ * to be doing updates:
+ */
 #define BTREE_ITER_INTENT		(1 << 2)
+/*
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
+ */
 #define BTREE_ITER_PREFETCH		(1 << 3)
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
  */
 #define BTREE_ITER_IS_EXTENTS		(1 << 4)
-#define BTREE_ITER_ERROR		(1 << 5)
-#define BTREE_ITER_NOUNLOCK		(1 << 6)
-
-enum btree_iter_uptodate {
+#define BTREE_ITER_NOT_EXTENTS		(1 << 5)
+#define BTREE_ITER_CACHED		(1 << 6)
+#define BTREE_ITER_WITH_KEY_CACHE	(1 << 7)
+#define BTREE_ITER_WITH_UPDATES		(1 << 8)
+#define BTREE_ITER_WITH_JOURNAL		(1 << 9)
+#define __BTREE_ITER_ALL_SNAPSHOTS	(1 << 10)
+#define BTREE_ITER_ALL_SNAPSHOTS	(1 << 11)
+#define BTREE_ITER_FILTER_SNAPSHOTS	(1 << 12)
+#define BTREE_ITER_NOPRESERVE		(1 << 13)
+
+enum btree_path_uptodate {
 	BTREE_ITER_UPTODATE	= 0,
-	BTREE_ITER_NEED_PEEK	= 1,
-	BTREE_ITER_NEED_RELOCK	= 2,
-	BTREE_ITER_NEED_TRAVERSE = 3,
+	BTREE_ITER_NEED_RELOCK	= 1,
+	BTREE_ITER_NEED_TRAVERSE = 2,
+};
+
+struct btree_path {
+	u8			idx;
+	u8			sorted_idx;
+	u8			ref;
+	u8			intent_ref;
+
+	/* btree_iter_copy starts here: */
+	struct bpos		pos;
+
+	enum btree_id		btree_id:4;
+	bool			cached:1;
+	bool			preserve:1;
+	enum btree_path_uptodate uptodate:2;
+	/*
+	 * When true, failing to relock this path will cause the transaction to
+	 * restart:
+	 */
+	bool			should_be_locked:1;
+	unsigned		level:3,
+				locks_want:4;
+	u8			nodes_locked;
+
+	struct btree_path_level {
+		struct btree	*b;
+		struct btree_node_iter iter;
+		u32		lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+		u64		lock_taken_time;
+#endif
+	} l[BTREE_MAX_DEPTH];
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned long		ip_allocated;
+#endif
 };
 
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+	return path->l + path->level;
+}
+
 /*
  * @pos - iterator's current position
  * @level - current btree depth
@@ -208,88 +272,233 @@ enum btree_iter_uptodate {
  * @nodes_intent_locked - bitmask indicating which locks are intent locks
  */
 struct btree_iter {
-	struct bch_fs		*c;
-	struct bpos		pos;
+	struct btree_trans	*trans;
+	struct btree_path	*path;
+	struct btree_path	*update_path;
+	struct btree_path	*key_cache_path;
 
-	u8			flags;
-	enum btree_iter_uptodate uptodate:4;
 	enum btree_id		btree_id:4;
-	unsigned		level:4,
-				locks_want:4,
-				nodes_locked:4,
-				nodes_intent_locked:4;
+	unsigned		min_depth:3;
+	unsigned		advanced:1;
 
-	struct btree_iter_level {
-		struct btree	*b;
-		struct btree_node_iter iter;
-		u32		lock_seq;
-	} l[BTREE_MAX_DEPTH];
+	/* btree_iter_copy starts here: */
+	u16			flags;
+
+	/* When we're filtering by snapshot, the snapshot ID we're looking for: */
+	unsigned		snapshot;
 
+	struct bpos		pos;
 	/*
 	 * Current unpacked key - so that bch2_btree_iter_next()/
 	 * bch2_btree_iter_next_slot() can correctly advance pos.
 	 */
 	struct bkey		k;
 
-	/*
-	 * Circular linked list of linked iterators: linked iterators share
-	 * locks (e.g. two linked iterators may have the same node intent
-	 * locked, or read and write locked, at the same time), and insertions
-	 * through one iterator won't invalidate the other linked iterators.
-	 */
+	/* BTREE_ITER_WITH_JOURNAL: */
+	size_t			journal_idx;
+	struct bpos		journal_pos;
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned long		ip_allocated;
+#endif
+};
 
-	/* Must come last: */
-	struct btree_iter	*next;
+struct btree_key_cache_freelist {
+	struct bkey_cached	*objs[16];
+	unsigned		nr;
 };
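The BTREE_ITER_* values above are independent bits — which is why struct btree_iter's flags field grows from the deleted u8 to a u16 in the hunk above — and callers compose them with |. A hedged sketch of a typical lookup: bch2_trans_iter_init(), bch2_btree_iter_peek_slot(), bkey_err() and bch2_trans_iter_exit() are real functions at this revision, but the helper itself is illustrative and omits the transaction-restart handling a real call site needs:

/* Illustrative sketch, not a real call site: restart handling omitted. */
static int lookup_one_slot(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, pos,
			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (!ret) {
		/* k is valid here; BTREE_ITER_SLOTS means a hole comes back
		 * as a synthesized deleted key instead of being skipped */
	}
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}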
 
-#define BTREE_ITER_MAX 8
+struct btree_key_cache {
+	struct mutex		lock;
+	struct rhashtable	table;
+	bool			table_init_done;
+	struct list_head	freed_pcpu;
+	struct list_head	freed_nonpcpu;
+	struct shrinker		shrink;
+	unsigned		shrink_iter;
+	struct btree_key_cache_freelist __percpu *pcpu_freed;
 
-struct deferred_update {
-	struct journal_entry_pin journal;
+	atomic_long_t		nr_freed;
+	atomic_long_t		nr_keys;
+	atomic_long_t		nr_dirty;
+};
 
-	spinlock_t		lock;
-	unsigned		gen;
+struct bkey_cached_key {
+	u32			btree_id;
+	struct bpos		pos;
+} __packed __aligned(4);
+
+#define BKEY_CACHED_ACCESSED		0
+#define BKEY_CACHED_DIRTY		1
+
+struct bkey_cached {
+	struct btree_bkey_cached_common c;
+
+	unsigned long		flags;
+	u16			u64s;
+	bool			valid;
+	u32			btree_trans_barrier_seq;
+	struct bkey_cached_key	key;
+
+	struct rhash_head	hash;
+	struct list_head	list;
 
-	u8			allocated_u64s;
-	enum btree_id		btree_id;
+	struct journal_preres	res;
+	struct journal_entry_pin journal;
 
-	/* must be last: */
-	struct bkey_i		k;
+	struct bkey_i		*k;
 };
 
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+	return !b->cached
+		? container_of(b, struct btree, c)->key.k.p
+		: container_of(b, struct bkey_cached, c)->key.pos;
+}
+
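struct bkey_cached above hangs in the key cache's rhashtable by its rhash_head, keyed on the packed, 4-byte-aligned struct bkey_cached_key (btree_id plus pos). The snippet below only illustrates how such a fixed-size inline key is typically described to rhashtable; the real parameters live in btree_key_cache.c and the variable name here is invented:

#include <linux/rhashtable.h>

/* Illustration only; not the actual table parameters in btree_key_cache.c: */
static const struct rhashtable_params bkey_cached_params_example = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
};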
 struct btree_insert_entry {
+	unsigned		flags;
+	u8			bkey_type;
+	enum btree_id		btree_id:8;
+	u8			level:4;
+	bool			cached:1;
+	bool			insert_trigger_run:1;
+	bool			overwrite_trigger_run:1;
+	bool			key_cache_already_flushed:1;
+	/*
+	 * @old_k may be a key from the journal; @old_btree_u64s always refers
+	 * to the size of the key being overwritten in the btree:
+	 */
+	u8			old_btree_u64s;
 	struct bkey_i		*k;
+	struct btree_path	*path;
+	/* key being overwritten: */
+	struct bkey		old_k;
+	const struct bch_val	*old_v;
+	unsigned long		ip_allocated;
+};
 
-	union {
-		struct btree_iter *iter;
-		struct deferred_update *d;
-	};
+#ifndef CONFIG_LOCKDEP
+#define BTREE_ITER_MAX		64
+#else
+#define BTREE_ITER_MAX		32
+#endif
+
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
 
-	bool			deferred;
+struct btree_trans_commit_hook {
+	btree_trans_commit_hook_fn	*fn;
+	struct btree_trans_commit_hook	*next;
 };
 
+#define BTREE_TRANS_MEM_MAX		(1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS	10000
+
 struct btree_trans {
 	struct bch_fs		*c;
-	size_t			nr_restarts;
+	const char		*fn;
+	struct closure		ref;
+	struct list_head	list;
+	u64			last_begin_time;
+
+	u8			lock_may_not_fail;
+	u8			lock_must_abort;
+	struct btree_bkey_cached_common *locking;
+	struct six_lock_waiter	locking_wait;
 
-	u8			nr_iters;
-	u8			iters_live;
-	u8			iters_linked;
+	int			srcu_idx;
+
+	u8			fn_idx;
+	u8			nr_sorted;
 	u8			nr_updates;
+	u8			traverse_all_idx;
+	bool			used_mempool:1;
+	bool			in_traverse_all:1;
+	bool			memory_allocation_failure:1;
+	bool			is_initial_gc:1;
+	bool			journal_replay_not_finished:1;
+	enum bch_errcode	restarted:16;
+	u32			restart_count;
+	unsigned long		last_restarted_ip;
+
+	/*
+	 * For when bch2_trans_update notices we'll be splitting a compressed
+	 * extent:
+	 */
+	unsigned		extra_journal_res;
+	unsigned		nr_max_paths;
+
+	u64			paths_allocated;
 
 	unsigned		mem_top;
+	unsigned		mem_max;
 	unsigned		mem_bytes;
 	void			*mem;
 
-	struct btree_iter	*iters;
-	u64			iter_ids[BTREE_ITER_MAX];
+	u8			sorted[BTREE_ITER_MAX];
+	struct btree_path	*paths;
+	struct btree_insert_entry *updates;
+
+	/* update path: */
+	struct btree_trans_commit_hook *hooks;
+	DARRAY(u64)		extra_journal_entries;
+	struct journal_entry_pin *journal_pin;
+
+	struct journal_res	journal_res;
+	struct journal_preres	journal_preres;
+	u64			*journal_seq;
+	struct disk_reservation *disk_res;
+	unsigned		flags;
+	unsigned		journal_u64s;
+	unsigned		journal_preres_u64s;
+	struct replicas_delta_list *fs_usage_deltas;
+};
+
+#define BCH_BTREE_WRITE_TYPES()						\
+	x(initial,		0)					\
+	x(init_next_bset,	1)					\
+	x(cache_reclaim,	2)					\
+	x(journal_reclaim,	3)					\
+	x(interior,		4)
+
+enum btree_write_type {
+#define x(t, n)	BTREE_WRITE_##t,
+	BCH_BTREE_WRITE_TYPES()
+#undef x
+	BTREE_WRITE_TYPE_NR,
+};
 
-	struct btree_insert_entry updates[BTREE_ITER_MAX];
+#define BTREE_WRITE_TYPE_MASK	(roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
+#define BTREE_WRITE_TYPE_BITS	ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
+
+#define BTREE_FLAGS()							\
+	x(read_in_flight)						\
+	x(read_error)							\
+	x(dirty)							\
+	x(need_write)							\
+	x(write_blocked)						\
+	x(will_make_reachable)						\
+	x(noevict)							\
+	x(write_idx)							\
+	x(accessed)							\
+	x(write_in_flight)						\
+	x(write_in_flight_inner)					\
+	x(just_written)							\
+	x(dying)							\
+	x(fake)								\
+	x(need_rewrite)							\
+	x(never_write)
 
-	struct btree_iter	iters_onstack[2];
+enum btree_flags {
+	/* First bits for btree node write type */
+	BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
+#define x(flag)	BTREE_NODE_##flag,
+	BTREE_FLAGS()
+#undef x
 };
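Expanding the two x-macros above by hand shows how a node's write type is packed into the low bits of btree->flags. With five write types, roundup_pow_of_two(5) == 8, so BTREE_WRITE_TYPE_BITS == 3 and BTREE_WRITE_TYPE_MASK == 0x7; BTREE_NODE_FLAGS_START == BTREE_WRITE_TYPE_BITS - 1 == 2 then makes the first real flag land on bit 3, just above the write-type field:

/* Hand expansion, for illustration only: */
enum btree_write_type {
	BTREE_WRITE_initial,		/* 0 */
	BTREE_WRITE_init_next_bset,	/* 1 */
	BTREE_WRITE_cache_reclaim,	/* 2 */
	BTREE_WRITE_journal_reclaim,	/* 3 */
	BTREE_WRITE_interior,		/* 4 */
	BTREE_WRITE_TYPE_NR,		/* 5 */
};

enum btree_flags {
	BTREE_NODE_FLAGS_START = 2,	/* == BTREE_WRITE_TYPE_BITS - 1 */
	BTREE_NODE_read_in_flight,	/* bit 3 */
	BTREE_NODE_read_error,		/* bit 4 */
	/* ... remaining BTREE_FLAGS() entries land on bits 5..18 ... */
};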
 
-#define BTREE_FLAG(flag)						\
+#define x(flag)								\
 static inline bool btree_node_ ## flag(struct btree *b)		\
 {	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
 									\
@@ -299,31 +508,8 @@ static inline void set_btree_node_ ## flag(struct btree *b)	\
 static inline void clear_btree_node_ ## flag(struct btree *b)		\
 {	clear_bit(BTREE_NODE_ ## flag, &b->flags); }
 
-enum btree_flags {
-	BTREE_NODE_read_in_flight,
-	BTREE_NODE_read_error,
-	BTREE_NODE_dirty,
-	BTREE_NODE_need_write,
-	BTREE_NODE_noevict,
-	BTREE_NODE_write_idx,
-	BTREE_NODE_accessed,
-	BTREE_NODE_write_in_flight,
-	BTREE_NODE_just_written,
-	BTREE_NODE_dying,
-	BTREE_NODE_fake,
-};
-
-BTREE_FLAG(read_in_flight);
-BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
-BTREE_FLAG(need_write);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(just_written);
-BTREE_FLAG(dying);
-BTREE_FLAG(fake);
+BTREE_FLAGS()
+#undef x
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
@@ -418,58 +604,97 @@ static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
 	__btree_node_offset_to_key(_b, (_t)->end_offset);		\
 })
 
+static inline unsigned bset_u64s(struct bset_tree *t)
+{
+	return t->end_offset - t->data_offset -
+		sizeof(struct bset) / sizeof(u64);
+}
+
+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
+{
+	return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
+}
+
 static inline unsigned bset_byte_offset(struct btree *b, void *i)
 {
 	return i - (void *) b->data;
 }
 
 enum btree_node_type {
-#define x(kwd, val, name) BKEY_TYPE_##kwd = val,
+#define x(kwd, val) BKEY_TYPE_##kwd = val,
 	BCH_BTREE_IDS()
 #undef x
-	BKEY_TYPE_BTREE,
+	BKEY_TYPE_btree,
 };
 
 /* Type of a key in btree @id at level @level: */
 static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
 {
-	return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id;
+	return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
 }
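The x(flag) accessor macro above — which replaces the old BTREE_FLAG() macro and the hand-maintained enum, as the deleted lines show — is instantiated once via BTREE_FLAGS() and emits a test/set/clear triplet per flag. For the dirty flag the expansion is:

/* Expansion of x(dirty), shown for illustration: */
static inline bool btree_node_dirty(struct btree *b)
{	return test_bit(BTREE_NODE_dirty, &b->flags); }

static inline void set_btree_node_dirty(struct btree *b)
{	set_bit(BTREE_NODE_dirty, &b->flags); }

static inline void clear_btree_node_dirty(struct btree *b)
{	clear_bit(BTREE_NODE_dirty, &b->flags); }

Keeping the flag list, the enum and the accessors in a single x-macro means adding a flag is now a one-line change.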
 
 /* Type of keys @b contains: */
 static inline enum btree_node_type btree_node_type(struct btree *b)
 {
-	return __btree_node_type(b->level, b->btree_id);
+	return __btree_node_type(b->c.level, b->c.btree_id);
 }
 
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS		\
+	((1U << BKEY_TYPE_extents)|			\
+	 (1U << BKEY_TYPE_alloc)|			\
+	 (1U << BKEY_TYPE_inodes)|			\
+	 (1U << BKEY_TYPE_stripes)|			\
+	 (1U << BKEY_TYPE_reflink)|			\
+	 (1U << BKEY_TYPE_btree))
+
+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS		\
+	((1U << BKEY_TYPE_alloc)|			\
+	 (1U << BKEY_TYPE_inodes)|			\
+	 (1U << BKEY_TYPE_stripes)|			\
+	 (1U << BKEY_TYPE_snapshots))
+
+#define BTREE_NODE_TYPE_HAS_TRIGGERS			\
+	(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|		\
+	 BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+
+#define BTREE_ID_IS_EXTENTS				\
+	((1U << BTREE_ID_extents)|			\
+	 (1U << BTREE_ID_reflink)|			\
+	 (1U << BTREE_ID_freespace))
+
 static inline bool btree_node_type_is_extents(enum btree_node_type type)
 {
-	return type == BKEY_TYPE_EXTENTS;
+	return (1U << type) & BTREE_ID_IS_EXTENTS;
 }
 
-static inline bool btree_node_is_extents(struct btree *b)
+#define BTREE_ID_HAS_SNAPSHOTS				\
+	((1U << BTREE_ID_extents)|			\
+	 (1U << BTREE_ID_inodes)|			\
+	 (1U << BTREE_ID_dirents)|			\
+	 (1U << BTREE_ID_xattrs))
+
+#define BTREE_ID_HAS_PTRS				\
+	((1U << BTREE_ID_extents)|			\
+	 (1U << BTREE_ID_reflink))
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
 {
-	return btree_node_type_is_extents(btree_node_type(b));
+	return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
+}
+
+static inline bool btree_type_has_ptrs(enum btree_id id)
+{
+	return (1 << id) & BTREE_ID_HAS_PTRS;
 }
 
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
-	switch (type) {
-	case BKEY_TYPE_BTREE:
-	case BKEY_TYPE_EXTENTS:
-	case BKEY_TYPE_INODES:
-	case BKEY_TYPE_EC:
-		return true;
-	default:
-		return false;
-	}
+	return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
 }
 
 struct btree_root {
 	struct btree		*b;
 
-	struct btree_update	*as;
-
 	/* On disk root - see async splits: */
 	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 	u8			level;
@@ -477,20 +702,13 @@ struct btree_root {
 	s8			error;
 };
 
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
 enum btree_insert_ret {
 	BTREE_INSERT_OK,
-	/* extent spanned multiple leaf nodes: have to traverse to next node: */
-	BTREE_INSERT_NEED_TRAVERSE,
 	/* leaf node needs to be split */
 	BTREE_INSERT_BTREE_NODE_FULL,
-	BTREE_INSERT_ENOSPC,
-	BTREE_INSERT_NEED_GC_LOCK,
 	BTREE_INSERT_NEED_MARK_REPLICAS,
+	BTREE_INSERT_NEED_JOURNAL_RES,
+	BTREE_INSERT_NEED_JOURNAL_RECLAIM,
 };
 
 enum btree_gc_coalesce_fail_reason {
@@ -504,8 +722,4 @@ enum btree_node_sibling {
 	btree_next_sib,
 };
 
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
-							struct btree *,
-							struct btree_node_iter *);
-
 #endif /* _BCACHEFS_BTREE_TYPES_H */
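Finally, the restarted and restart_count fields in struct btree_trans encode the discipline this header implies: any btree operation may fail with a transaction-restart error, after which the caller must loop back through bch2_trans_begin(). A closing sketch of that shape — bch2_trans_init(), bch2_trans_begin(), bch2_trans_iter_init(), bch2_btree_iter_traverse(), bch2_trans_update(), bch2_trans_commit(), bch2_err_matches() and bch2_trans_exit() all exist at this revision, but the helper itself is hypothetical and error handling is trimmed:

/* Hypothetical helper: a minimal update loop under the restart
 * discipline above, error paths trimmed for brevity. */
static int set_key_example(struct bch_fs *c, enum btree_id id,
			   struct bkey_i *k)
{
	struct btree_trans trans;
	struct btree_iter iter;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	bch2_trans_iter_init(&trans, &iter, id, k->k.p, BTREE_ITER_INTENT);
	ret =   bch2_btree_iter_traverse(&iter) ?:
		bch2_trans_update(&trans, &iter, k, 0) ?:
		bch2_trans_commit(&trans, NULL, NULL, 0);
	bch2_trans_iter_exit(&trans, &iter);

	/* On any restart-class error, start the whole transaction over: */
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_exit(&trans);
	return ret;
}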