X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_types.h;h=6250f34fe561e58d5c9b258e4cc11cedd065c522;hb=693e278c14efae893a6f051446fbbc0cf5e408cd;hp=cf59f12247413aca9c3a4d788d77d2110bf420c0;hpb=1e574cb1aa07ab3a796c7d6c5501b96f3056ef4d;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index cf59f12..6250f34 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -6,9 +6,12 @@ #include #include -#include "bkey_methods.h" +//#include "bkey_methods.h" #include "buckets_types.h" +#include "darray.h" +#include "errcode.h" #include "journal_types.h" +#include "replicas_types.h" struct open_bucket; struct btree_update; @@ -47,8 +50,6 @@ struct bset_tree { u16 data_offset; u16 aux_data_offset; u16 end_offset; - - struct bpos max_key; }; struct btree_write { @@ -57,13 +58,14 @@ struct btree_write { struct btree_alloc { struct open_buckets ob; - BKEY_PADDED(k); + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_bkey_cached_common { struct six_lock lock; u8 level; u8 btree_id; + bool cached; }; struct btree { @@ -76,6 +78,7 @@ struct btree { u16 written; u8 nsets; u8 nr_key_bits; + u16 version_ondisk; struct bkey_format format; @@ -97,6 +100,11 @@ struct btree { u8 byte_order; u8 unpack_fn_len; + struct btree_write writes[2]; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + /* * XXX: add a delete sequence number, so when bch2_btree_node_relock() * fails because the lock sequence number has changed - i.e. the @@ -127,11 +135,6 @@ struct btree { /* lru list */ struct list_head list; - - struct btree_write writes[2]; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_cache { @@ -153,11 +156,22 @@ struct btree_cache { struct mutex lock; struct list_head live; struct list_head freeable; - struct list_head freed; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; + unsigned freed; + unsigned not_freed_lock_intent; + unsigned not_freed_lock_write; + unsigned not_freed_dirty; + unsigned not_freed_read_in_flight; + unsigned not_freed_write_in_flight; + unsigned not_freed_noevict; + unsigned not_freed_write_blocked; + unsigned not_freed_will_make_reachable; + unsigned not_freed_access_bit; atomic_t dirty; struct shrinker shrink; @@ -177,65 +191,95 @@ struct btree_node_iter { } data[MAX_BSETS]; }; -enum btree_iter_type { - BTREE_ITER_KEYS, - BTREE_ITER_NODES, - BTREE_ITER_CACHED, -}; - -#define BTREE_ITER_TYPE ((1 << 2) - 1) - /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -#define BTREE_ITER_SLOTS (1 << 2) +static const u16 BTREE_ITER_SLOTS = 1 << 0; +static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 3) +static const u16 BTREE_ITER_INTENT = 1 << 2; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 4) -/* - * Indicates that this iterator should not be reused until transaction commit, - * either because a pending update references it or because the update depends - * on that particular key being locked (e.g. by the str_hash code, for hash - * table consistency) - */ -#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) +static const u16 BTREE_ITER_PREFETCH = 1 << 3; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 6) -#define BTREE_ITER_ERROR (1 << 7) -#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) -#define BTREE_ITER_CACHED_NOFILL (1 << 9) -#define BTREE_ITER_CACHED_NOCREATE (1 << 10) - -#define BTREE_ITER_USER_FLAGS \ - (BTREE_ITER_SLOTS \ - |BTREE_ITER_INTENT \ - |BTREE_ITER_PREFETCH \ - |BTREE_ITER_CACHED_NOFILL \ - |BTREE_ITER_CACHED_NOCREATE) - -enum btree_iter_uptodate { +static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4; +static const u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; +static const u16 BTREE_ITER_CACHED = 1 << 6; +static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; +static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8; +static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; +static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; +static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; +static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; +static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; +static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; + +enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_PEEK = 1, - BTREE_ITER_NEED_RELOCK = 2, - BTREE_ITER_NEED_TRAVERSE = 3, + BTREE_ITER_NEED_RELOCK = 1, + BTREE_ITER_NEED_TRAVERSE = 2, }; -#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) -#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) -#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) -#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) -#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) -#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) -#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) +#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) +#define TRACK_PATH_ALLOCATED +#endif + +struct btree_path { + u8 idx; + u8 sorted_idx; + u8 ref; + u8 intent_ref; + + /* btree_iter_copy starts here: */ + struct bpos pos; + + enum btree_id btree_id:5; + bool cached:1; + bool preserve:1; + enum btree_path_uptodate uptodate:2; + /* + * When true, failing to relock this path will cause the transaction to + * restart: + */ + bool should_be_locked:1; + unsigned level:3, + locks_want:3; + u8 nodes_locked; + + struct btree_path_level { + struct btree *b; + struct btree_node_iter iter; + u32 lock_seq; +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + u64 lock_taken_time; +#endif + } l[BTREE_MAX_DEPTH]; +#ifdef TRACK_PATH_ALLOCATED + unsigned long ip_allocated; +#endif +}; + +static inline struct btree_path_level *path_l(struct btree_path *path) +{ + return path->l + path->level; +} + +static inline unsigned long btree_path_ip_allocated(struct btree_path *path) +{ +#ifdef TRACK_PATH_ALLOCATED + return path->ip_allocated; +#else + return _THIS_IP_; +#endif +} /* * @pos - iterator's current position @@ -246,68 +290,59 @@ enum btree_iter_uptodate { */ struct btree_iter { struct btree_trans *trans; - struct bpos pos; - struct bpos pos_after_commit; + struct btree_path *path; + struct btree_path *update_path; + struct btree_path *key_cache_path; - u16 flags; - u8 idx; + enum btree_id btree_id:8; + unsigned min_depth:3; + unsigned advanced:1; - enum btree_id btree_id:4; - enum btree_iter_uptodate uptodate:4; - unsigned level:4, - min_depth:4, - locks_want:4, - nodes_locked:4, - nodes_intent_locked:4; + /* btree_iter_copy starts here: */ + u16 flags; - struct btree_iter_level { - struct btree *b; - struct btree_node_iter iter; - u32 lock_seq; - } l[BTREE_MAX_DEPTH]; + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; + struct bpos pos; /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; + + /* BTREE_ITER_WITH_JOURNAL: */ + size_t journal_idx; + struct bpos journal_pos; +#ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; +#endif }; -static inline enum btree_iter_type -btree_iter_type(const struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_TYPE; -} - -static inline bool btree_iter_is_cached(const struct btree_iter *iter) -{ - return btree_iter_type(iter) == BTREE_ITER_CACHED; -} - -static inline struct btree_iter_level *iter_l(struct btree_iter *iter) -{ - return iter->l + iter->level; -} +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; struct btree_key_cache { struct mutex lock; struct rhashtable table; bool table_init_done; - struct list_head freed; - struct list_head clean; - struct list_head dirty; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; struct shrinker shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; - size_t nr_freed; - size_t nr_keys; - size_t nr_dirty; + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; }; struct bkey_cached_key { u32 btree_id; struct bpos pos; -} __attribute__((packed, aligned(4))); +} __packed __aligned(4); #define BKEY_CACHED_ACCESSED 0 #define BKEY_CACHED_DIRTY 1 @@ -316,7 +351,7 @@ struct bkey_cached { struct btree_bkey_cached_common c; unsigned long flags; - u8 u64s; + u16 u64s; bool valid; u32 btree_trans_barrier_seq; struct bkey_cached_key key; @@ -326,75 +361,164 @@ struct bkey_cached { struct journal_preres res; struct journal_entry_pin journal; + u64 seq; struct bkey_i *k; }; +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) +{ + return !b->cached + ? container_of(b, struct btree, c)->key.k.p + : container_of(b, struct bkey_cached, c)->key.pos; +} + struct btree_insert_entry { - unsigned trigger_flags; - unsigned trans_triggers_run:1; + unsigned flags; + u8 bkey_type; + enum btree_id btree_id:8; + u8 level:4; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; + bool key_cache_already_flushed:1; + /* + * @old_k may be a key from the journal; @old_btree_u64s always refers + * to the size of the key being overwritten in the btree: + */ + u8 old_btree_u64s; struct bkey_i *k; - struct btree_iter *iter; + struct btree_path *path; + /* key being overwritten: */ + struct bkey old_k; + const struct bch_val *old_v; + unsigned long ip_allocated; }; -#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 -#else -#define BTREE_ITER_MAX 32 -#endif + +struct btree_trans_commit_hook; +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); + +struct btree_trans_commit_hook { + btree_trans_commit_hook_fn *fn; + struct btree_trans_commit_hook *next; +}; + +#define BTREE_TRANS_MEM_MAX (1U << 16) + +#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 struct btree_trans { struct bch_fs *c; -#ifdef CONFIG_BCACHEFS_DEBUG + const char *fn; + struct closure ref; struct list_head list; - struct btree *locking; - unsigned locking_iter_idx; - struct bpos locking_pos; - u8 locking_btree_id; - u8 locking_level; - pid_t pid; -#endif - unsigned long ip; - int srcu_idx; + u64 last_begin_time; - u64 iters_linked; - u64 iters_live; - u64 iters_touched; + u8 lock_may_not_fail; + u8 lock_must_abort; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; - u8 nr_iters; + int srcu_idx; + + u8 fn_idx; + u8 nr_sorted; u8 nr_updates; - u8 nr_updates2; - u8 size; - unsigned used_mempool:1; - unsigned error:1; - unsigned nounlock:1; - unsigned need_reset:1; - unsigned in_traverse_all:1; + u8 nr_wb_updates; + u8 wb_updates_size; + bool used_mempool:1; + bool in_traverse_all:1; + bool paths_sorted:1; + bool memory_allocation_failure:1; + bool journal_transaction_names:1; + bool journal_replay_not_finished:1; + bool is_initial_gc:1; + bool notrace_relock_fail:1; + enum bch_errcode restarted:16; + u32 restart_count; + unsigned long last_begin_ip; + unsigned long last_restarted_ip; + unsigned long srcu_lock_time; + + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: + */ + unsigned extra_journal_res; + unsigned nr_max_paths; + + u64 paths_allocated; unsigned mem_top; + unsigned mem_max; unsigned mem_bytes; void *mem; - struct btree_iter *iters; + u8 sorted[BTREE_ITER_MAX + 8]; + struct btree_path *paths; struct btree_insert_entry *updates; - struct btree_insert_entry *updates2; + struct btree_write_buffered_key *wb_updates; /* update path: */ - struct jset_entry *extra_journal_entries; - unsigned extra_journal_entry_u64s; + struct btree_trans_commit_hook *hooks; + darray_u64 extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; - unsigned flags; unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; }; -#define BTREE_FLAG(flag) \ +#define BCH_BTREE_WRITE_TYPES() \ + x(initial, 0) \ + x(init_next_bset, 1) \ + x(cache_reclaim, 2) \ + x(journal_reclaim, 3) \ + x(interior, 4) + +enum btree_write_type { +#define x(t, n) BTREE_WRITE_##t, + BCH_BTREE_WRITE_TYPES() +#undef x + BTREE_WRITE_TYPE_NR, +}; + +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) +#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR)) + +#define BTREE_FLAGS() \ + x(read_in_flight) \ + x(read_error) \ + x(dirty) \ + x(need_write) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(noevict) \ + x(write_idx) \ + x(accessed) \ + x(write_in_flight) \ + x(write_in_flight_inner) \ + x(just_written) \ + x(dying) \ + x(fake) \ + x(need_rewrite) \ + x(never_write) + +enum btree_flags { + /* First bits for btree node write type */ + BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1, +#define x(flag) BTREE_NODE_##flag, + BTREE_FLAGS() +#undef x +}; + +#define x(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ \ @@ -404,34 +528,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ static inline void clear_btree_node_ ## flag(struct btree *b) \ { clear_bit(BTREE_NODE_ ## flag, &b->flags); } -enum btree_flags { - BTREE_NODE_read_in_flight, - BTREE_NODE_read_error, - BTREE_NODE_dirty, - BTREE_NODE_need_write, - BTREE_NODE_noevict, - BTREE_NODE_write_idx, - BTREE_NODE_accessed, - BTREE_NODE_write_in_flight, - BTREE_NODE_just_written, - BTREE_NODE_dying, - BTREE_NODE_fake, - BTREE_NODE_old_extent_overwrite, - BTREE_NODE_need_rewrite, -}; - -BTREE_FLAG(read_in_flight); -BTREE_FLAG(read_error); -BTREE_FLAG(need_write); -BTREE_FLAG(noevict); -BTREE_FLAG(write_idx); -BTREE_FLAG(accessed); -BTREE_FLAG(write_in_flight); -BTREE_FLAG(just_written); -BTREE_FLAG(dying); -BTREE_FLAG(fake); -BTREE_FLAG(old_extent_overwrite); -BTREE_FLAG(need_rewrite); +BTREE_FLAGS() +#undef x static inline struct btree_write *btree_current_write(struct btree *b) { @@ -543,16 +641,16 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) } enum btree_node_type { -#define x(kwd, val, name) BKEY_TYPE_##kwd = val, +#define x(kwd, val) BKEY_TYPE_##kwd = val, BCH_BTREE_IDS() #undef x - BKEY_TYPE_BTREE, + BKEY_TYPE_btree, }; /* Type of a key in btree @id at level @level: */ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) { - return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; + return level ? BKEY_TYPE_btree : (enum btree_node_type) id; } /* Type of keys @b contains: */ @@ -561,68 +659,54 @@ static inline enum btree_node_type btree_node_type(struct btree *b) return __btree_node_type(b->c.level, b->c.btree_id); } +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)| \ + (1U << BKEY_TYPE_btree)) + +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ + ((1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_snapshots)) + +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + +#define BTREE_ID_IS_EXTENTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)| \ + (1U << BTREE_ID_freespace)) + static inline bool btree_node_type_is_extents(enum btree_node_type type) { - switch (type) { - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_REFLINK: - return true; - default: - return false; - } + return (1U << type) & BTREE_ID_IS_EXTENTS; } -static inline bool btree_node_is_extents(struct btree *b) -{ - return btree_node_type_is_extents(btree_node_type(b)); -} +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ + (1U << BTREE_ID_dirents)| \ + (1U << BTREE_ID_xattrs)) -static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) +#define BTREE_ID_HAS_PTRS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)) + +static inline bool btree_type_has_snapshots(enum btree_id id) { - return __btree_node_type(iter->level, iter->btree_id); + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; } -static inline bool btree_iter_is_extents(struct btree_iter *iter) +static inline bool btree_type_has_ptrs(enum btree_id id) { - return btree_node_type_is_extents(btree_iter_key_type(iter)); + return (1 << id) & BTREE_ID_HAS_PTRS; } -#define BTREE_NODE_TYPE_HAS_TRIGGERS \ - ((1U << BKEY_TYPE_EXTENTS)| \ - (1U << BKEY_TYPE_ALLOC)| \ - (1U << BKEY_TYPE_INODES)| \ - (1U << BKEY_TYPE_REFLINK)| \ - (1U << BKEY_TYPE_EC)| \ - (1U << BKEY_TYPE_BTREE)) - -#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - ((1U << BKEY_TYPE_EXTENTS)| \ - (1U << BKEY_TYPE_INODES)| \ - (1U << BKEY_TYPE_EC)| \ - (1U << BKEY_TYPE_REFLINK)) - -enum btree_trigger_flags { - __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_OVERWRITE_SPLIT, - - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_NOATOMIC, -}; - -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) - -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) - -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) - static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); @@ -638,21 +722,6 @@ struct btree_root { s8 error; }; -/* - * Optional hook that will be called just prior to a btree node update, when - * we're holding the write lock and we know what key is about to be overwritten: - */ - -enum btree_insert_ret { - BTREE_INSERT_OK, - /* leaf node needs to be split */ - BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_ENOSPC, - BTREE_INSERT_NEED_MARK_REPLICAS, - BTREE_INSERT_NEED_JOURNAL_RES, - BTREE_INSERT_NEED_JOURNAL_RECLAIM, -}; - enum btree_gc_coalesce_fail_reason { BTREE_GC_COALESCE_FAIL_RESERVE_GET, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, @@ -664,8 +733,4 @@ enum btree_node_sibling { btree_next_sib, }; -typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, - struct btree *, - struct btree_node_iter *); - #endif /* _BCACHEFS_BTREE_TYPES_H */