#ifndef _BCACHEFS_BTREE_TYPES_H
#define _BCACHEFS_BTREE_TYPES_H

#include <linux/list.h>
#include <linux/rhashtable.h>
#include <linux/semaphore.h>
#include <linux/workqueue.h>

#include "bkey_methods.h"
#include "journal_types.h"
#include "six.h"

struct open_bucket;
struct btree_update;

#define MAX_BSETS               3U

struct btree_nr_keys {

        /*
         * Amount of live metadata (i.e. size of node after a compaction) in
         * units of u64s
         */
        u16                     live_u64s;
        u16                     bset_u64s[MAX_BSETS];

        /* live keys only: */
        u16                     packed_keys;
        u16                     unpacked_keys;
};

struct bset_tree {
        /*
         * We construct a binary tree in an array as if the array
         * started at 1, so that things line up on the same cachelines
         * better: see comments in bset.c at cacheline_to_bkey() for
         * details
         */

        /* size of the binary tree and prev array */
        u16                     size;

        /* function of size - precalculated for to_inorder() */
        u16                     extra;

        u16                     data_offset;
        u16                     aux_data_offset;
        u16                     end_offset;

        struct bpos             max_key;
};

struct btree_write {
        struct journal_entry_pin        journal;
        struct closure_waitlist         wait;
};

struct btree_ob_ref {
        u8                      nr;
        u8                      refs[BCH_REPLICAS_MAX];
};

struct btree_alloc {
        struct btree_ob_ref     ob;
        BKEY_PADDED(k);
};

struct btree {
        /* Hottest entries first */
        struct rhash_head       hash;

        /* Key/pointer for this btree node */
        __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);

        struct six_lock         lock;

        unsigned long           flags;
        u16                     written;
        u8                      level;
        u8                      btree_id;
        u8                      nsets;
        u8                      nr_key_bits;

        struct bkey_format      format;

        struct btree_node       *data;
        void                    *aux_data;

        /*
         * Sets of sorted keys - the real btree node - plus a binary search tree
         *
         * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
         * to the memory we have allocated for this btree node. Additionally,
         * set[0]->data points to the entire btree node as it exists on disk.
         */
        struct bset_tree        set[MAX_BSETS];

        struct btree_nr_keys    nr;
        u16                     sib_u64s[2];
        u16                     whiteout_u64s;
        u16                     uncompacted_whiteout_u64s;
        u8                      page_order;
        u8                      unpack_fn_len;

        /*
         * XXX: add a delete sequence number, so when bch2_btree_node_relock()
         * fails because the lock sequence number has changed - i.e. the
         * contents were modified - we can still relock the node if it's still
         * the one we want, without redoing the traversal
         */

        /*
         * For asynchronous splits/interior node updates:
         * When we do a split, we allocate new child nodes and update the parent
         * node to point to them: we update the parent in memory immediately,
         * but then we must wait until the children have been written out before
         * the update to the parent can be written - this is a list of the
         * btree_updates that are blocking this node from being
         * written:
         */
        struct list_head        write_blocked;

        /*
         * Also for asynchronous splits/interior node updates:
         * If a btree node isn't reachable yet, we don't want to kick off
         * another write - because that write also won't yet be reachable and
         * marking it as completed before it's reachable would be incorrect:
         */
        unsigned long           will_make_reachable;

        struct btree_ob_ref     ob;

        /* lru list */
        struct list_head        list;

        struct btree_write      writes[2];

#ifdef CONFIG_BCACHEFS_DEBUG
        bool                    *expensive_debug_checks;
#endif
};

struct btree_cache {
        struct rhashtable       table;
        bool                    table_init_done;
        /*
         * We never free a struct btree, except on shutdown - we just put it on
         * the btree_cache_freed list and reuse it later. This simplifies the
         * code, and it doesn't cost us much memory as the memory usage is
         * dominated by buffers that hold the actual btree node data and those
         * can be freed - and the number of struct btrees allocated is
         * effectively bounded.
         *
         * btree_cache_freeable effectively is a small cache - we use it because
         * high order page allocations can be rather expensive, and it's quite
         * common to delete and allocate btree nodes in quick succession. It
         * should never grow past ~2-3 nodes in practice.
         */
        struct mutex            lock;
        struct list_head        live;
        struct list_head        freeable;
        struct list_head        freed;

        /* Number of elements in live + freeable lists */
        unsigned                used;
        unsigned                reserve;
        struct shrinker         shrink;

        /*
         * If we need to allocate memory for a new btree node and that
         * allocation fails, we can cannibalize another node in the btree cache
         * to satisfy the allocation - lock to guarantee only one thread does
         * this at a time:
         */
        struct task_struct      *alloc_lock;
        struct closure_waitlist alloc_wait;
};

#define BTREE_FLAG(flag)                                                \
static inline bool btree_node_ ## flag(struct btree *b)                 \
{       return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
                                                                        \
static inline void set_btree_node_ ## flag(struct btree *b)             \
{       set_bit(BTREE_NODE_ ## flag, &b->flags); }                      \
                                                                        \
static inline void clear_btree_node_ ## flag(struct btree *b)           \
{       clear_bit(BTREE_NODE_ ## flag, &b->flags); }

enum btree_flags {
        BTREE_NODE_read_in_flight,
        BTREE_NODE_read_error,
        BTREE_NODE_dirty,
        BTREE_NODE_need_write,
        BTREE_NODE_noevict,
        BTREE_NODE_write_idx,
        BTREE_NODE_accessed,
        BTREE_NODE_write_in_flight,
        BTREE_NODE_just_written,
        BTREE_NODE_dying,
        BTREE_NODE_fake,
};

BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
BTREE_FLAG(dirty);
BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
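
/*
 * For illustration: each BTREE_FLAG() invocation above generates three
 * accessors for one bit in btree->flags. BTREE_FLAG(dirty), for example,
 * expands to:
 *
 *      static inline bool btree_node_dirty(struct btree *b)
 *      {       return test_bit(BTREE_NODE_dirty, &b->flags); }
 *
 *      static inline void set_btree_node_dirty(struct btree *b)
 *      {       set_bit(BTREE_NODE_dirty, &b->flags); }
 *
 *      static inline void clear_btree_node_dirty(struct btree *b)
 *      {       clear_bit(BTREE_NODE_dirty, &b->flags); }
 */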

static inline struct btree_write *btree_current_write(struct btree *b)
{
        return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
        return b->writes + (btree_node_write_idx(b) ^ 1);
}
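
/*
 * A btree node has two btree_write slots (writes[2]); the write_idx flag bit
 * selects which one btree_current_write() returns, and btree_prev_write()
 * returns the other. Presumably this lets the journal pin and waitlist for an
 * in-flight write be tracked separately from those for the next write.
 */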

static inline struct bset_tree *bset_tree_last(struct btree *b)
{
        EBUG_ON(!b->nsets);
        return b->set + b->nsets - 1;
}

static inline struct bset *bset(const struct btree *b,
                                const struct bset_tree *t)
{
        return (void *) b->data + t->data_offset * sizeof(u64);
}
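
/*
 * Note: bset_tree stores no pointers into the node; data_offset (and
 * end_offset) are offsets in units of u64s relative to the start of the
 * btree node data, which is why bset() scales by sizeof(u64) to resolve them.
 */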

static inline struct bset *btree_bset_first(struct btree *b)
{
        return bset(b, b->set);
}

static inline struct bset *btree_bset_last(struct btree *b)
{
        return bset(b, bset_tree_last(b));
}

static inline u16
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
{
        size_t ret = (u64 *) k - (u64 *) b->data - 1;

        EBUG_ON(ret > U16_MAX);
        return ret;
}

static inline struct bkey_packed *
__btree_node_offset_to_key(const struct btree *b, u16 k)
{
        return (void *) ((u64 *) b->data + k + 1);
}
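
/*
 * These two helpers are inverses: __btree_node_key_to_offset() encodes a
 * key's position as ((u64 *) k - (u64 *) b->data - 1), and
 * __btree_node_offset_to_key() undoes that, so for any packed key k within
 * the node:
 *
 *      __btree_node_offset_to_key(b, __btree_node_key_to_offset(b, k)) == k
 */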

#define btree_bkey_first(_b, _t)        (bset(_b, _t)->start)

#define btree_bkey_last(_b, _t)                                         \
({                                                                      \
        EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=     \
                vstruct_last(bset(_b, _t)));                            \
                                                                        \
        __btree_node_offset_to_key(_b, (_t)->end_offset);               \
})

static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
{
        t->end_offset =
                __btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
        /* result discarded - called only for its EBUG_ON() consistency check: */
        btree_bkey_last(b, t);
}

static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
                                  const struct bset *i)
{
        t->data_offset = (u64 *) i - (u64 *) b->data;

        EBUG_ON(bset(b, t) != i);

        set_btree_bset_end(b, t);
}

static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
        return i - (void *) b->data;
}

/* Type of keys @b contains: */
static inline enum bkey_type btree_node_type(struct btree *b)
{
        return b->level ? BKEY_TYPE_BTREE : b->btree_id;
}
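
/*
 * Interior nodes (level > 0) always contain btree pointer keys, hence
 * BKEY_TYPE_BTREE; only leaf nodes carry keys of the btree's own type,
 * identified by b->btree_id.
 */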

static inline const struct bkey_ops *btree_node_ops(struct btree *b)
{
        return &bch2_bkey_ops[btree_node_type(b)];
}

static inline bool btree_node_has_ptrs(struct btree *b)
{
        return btree_type_has_ptrs(btree_node_type(b));
}

static inline bool btree_node_is_extents(struct btree *b)
{
        return btree_node_type(b) == BKEY_TYPE_EXTENTS;
}

struct btree_root {
        struct btree            *b;

        struct btree_update     *as;

        /* On disk root - see async splits: */
        __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
        u8                      level;
        u8                      alive;
};

/*
 * Optional hook that will be called just prior to a btree node update, when
 * we're holding the write lock and we know what key is about to be overwritten:
 */

struct btree_iter;
struct btree_node_iter;

enum btree_insert_ret {
        BTREE_INSERT_OK,
        /* extent spanned multiple leaf nodes: have to traverse to next node: */
        BTREE_INSERT_NEED_TRAVERSE,
        /* write lock held for too long */
        BTREE_INSERT_NEED_RESCHED,
        /* leaf node needs to be split */
        BTREE_INSERT_BTREE_NODE_FULL,
        BTREE_INSERT_JOURNAL_RES_FULL,
        BTREE_INSERT_ENOSPC,
        BTREE_INSERT_NEED_GC_LOCK,
};

struct extent_insert_hook {
        enum btree_insert_ret
        (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
              struct bkey_s_c, const struct bkey_i *);
};
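
/*
 * The hook's fn returns one of the btree_insert_ret codes above, so an
 * implementation can influence how the insert proceeds; per the comment
 * above, the struct bkey_s_c argument is presumably the key about to be
 * overwritten and the const struct bkey_i * the key being inserted.
 */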

enum btree_gc_coalesce_fail_reason {
        BTREE_GC_COALESCE_FAIL_RESERVE_GET,
        BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
        BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
};

enum btree_node_sibling {
        btree_prev_sib,
        btree_next_sib,
};

typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
                                                        struct btree *,
                                                        struct btree_node_iter *);

#endif /* _BCACHEFS_BTREE_TYPES_H */