#ifndef _BCACHE_BTREE_INSERT_H
#define _BCACHE_BTREE_INSERT_H

#include "btree_cache.h"
#include "btree_iter.h"

struct bkey_format_state;

#define BTREE_SPLIT_THRESHOLD(c)		(btree_blocks(c) * 3 / 4)

#define BTREE_FOREGROUND_MERGE_THRESHOLD(c)	(btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)			\
	(BTREE_FOREGROUND_MERGE_THRESHOLD(c) +			\
	 (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))

static inline void btree_node_reset_sib_u64s(struct btree *b)
{
	b->sib_u64s[0] = b->nr.live_u64s;
	b->sib_u64s[1] = b->nr.live_u64s;
}

struct btree_reserve {
	struct disk_reservation	disk_res;
	unsigned		nr;
	struct btree		*b[BTREE_RESERVE_MAX];
};

void __bch_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch_btree_node_format_fits(struct cache_set *c, struct btree *,
				struct bkey_format *);

/* Btree node freeing/allocation: */

/*
 * Tracks a btree node that has been (or is about to be) freed in memory, but
 * has _not_ yet been freed on disk (because the write that makes the new
 * node(s) visible and frees the old hasn't completed yet)
 */
struct pending_btree_node_free {
	bool			index_update_done;

	enum btree_id		btree_id;
	unsigned		level;
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};

/*
 * Tracks an in progress split/rewrite of a btree node and the update to the
 * parent node:
 *
 * When we split/rewrite a node, we do all the updates in memory without
 * waiting for any writes to complete - we allocate the new node(s) and update
 * the parent node, possibly recursively up to the root.
 *
 * The end result is that we have one or more new nodes being written -
 * possibly several, if there were multiple splits - and then a write (updating
 * an interior node) which will make all these new nodes visible.
 *
 * Additionally, as we split/rewrite nodes we free the old nodes - but the old
 * nodes can't be freed (their space on disk can't be reclaimed) until the
 * update to the interior node that makes the new node visible completes -
 * until then, the old nodes are still reachable on disk.
 */
struct btree_interior_update {
	struct closure			cl;
	struct cache_set		*c;

	struct list_head		list;

	/* What kind of update are we doing? */
	enum {
		BTREE_INTERIOR_NO_UPDATE,
		BTREE_INTERIOR_UPDATING_NODE,
		BTREE_INTERIOR_UPDATING_ROOT,
		BTREE_INTERIOR_UPDATING_AS,
	} mode;

	/*
	 * BTREE_INTERIOR_UPDATING_NODE:
	 * The update that made the new nodes visible was a regular update to an
	 * existing interior node - @b. We can't write out the update to @b
	 * until the new nodes we created are finished writing, so we block @b
	 * from writing by putting this btree_interior_update on the
	 * @b->write_blocked list with @write_blocked_list:
	 */
	struct btree			*b;
	struct list_head		write_blocked_list;

	/*
	 * BTREE_INTERIOR_UPDATING_AS: the btree node we updated was freed, so
	 * now we're blocking another btree_interior_update:
	 * @parent_as - btree_interior_update that's waiting on our nodes to finish
	 * writing, before it can make new nodes visible on disk
	 * @wait - list of child btree_interior_updates that are waiting on this
	 * btree_interior_update to make all the new nodes visible before they can
	 * free their old btree nodes
	 */
	struct btree_interior_update	*parent_as;
	struct closure_waitlist		wait;

	/*
	 * We may be freeing nodes that were dirty, and thus had journal entries
	 * pinned: we need to transfer the oldest of those pins to the
	 * btree_interior_update operation, and release it when the new node(s)
	 * are all persistent and reachable:
	 */
	struct journal_entry_pin	journal;

	/*
	 * Nodes being freed:
	 * Protected by c->btree_node_pending_free_lock
	 */
	struct pending_btree_node_free	pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
	unsigned			nr_pending;

	/* Only here to reduce stack usage on recursive splits: */
	struct keylist			parent_keys;
	/*
	 * Enough room for btree_split's keys without realloc - btree node
	 * pointers never have crc/compression info, so we only need to account
	 * for the pointers for three keys
	 */
	u64				inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
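
/*
 * Rough lifecycle (an illustrative sketch pieced together from the
 * declarations in this header, not a reference implementation; locking,
 * error handling and the actual key manipulation are omitted):
 *
 *	as = bch_btree_interior_update_alloc(c);
 *	bch_btree_interior_update_will_free_node(c, as, b);
 *	n = btree_node_alloc_replacement(c, b, reserve);
 *	... copy/split keys into @n, add the new pointer(s) to a keylist ...
 *	bch_btree_insert_node(parent, iter, &keys, reserve, as);
 *	bch_btree_node_free_inmem(iter, b);
 *
 * The interior node write that makes @n visible is what eventually lets the
 * pending free of @b complete (and releases the journal pin above).
 */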

#define for_each_pending_btree_node_free(c, as, p)			\
	list_for_each_entry(as, &c->btree_interior_update_list, list)	\
		for (p = as->pending; p < as->pending + as->nr_pending; p++)
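
/*
 * Usage sketch (illustrative; assumes the caller holds whatever lock protects
 * c->btree_interior_update_list and the pending arrays):
 *
 *	struct btree_interior_update *as;
 *	struct pending_btree_node_free *p;
 *
 *	for_each_pending_btree_node_free(c, as, p)
 *		if (!p->index_update_done)
 *			... the old node @p->key points to is still reachable ...
 */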

void bch_btree_node_free_inmem(struct btree_iter *, struct btree *);
void bch_btree_node_free_never_inserted(struct cache_set *, struct btree *);

void btree_open_bucket_put(struct cache_set *c, struct btree *);

struct btree *__btree_node_alloc_replacement(struct cache_set *,
					     struct btree *,
					     struct bkey_format,
					     struct btree_reserve *);
struct btree *btree_node_alloc_replacement(struct cache_set *, struct btree *,
					   struct btree_reserve *);

struct btree_interior_update *
bch_btree_interior_update_alloc(struct cache_set *);

void bch_btree_interior_update_will_free_node(struct cache_set *,
					      struct btree_interior_update *,
					      struct btree *);

void bch_btree_set_root_initial(struct cache_set *, struct btree *,
				struct btree_reserve *);

void bch_btree_reserve_put(struct cache_set *, struct btree_reserve *);
struct btree_reserve *bch_btree_reserve_get(struct cache_set *,
					    struct btree *, unsigned,
					    unsigned, struct closure *);

int bch_btree_root_alloc(struct cache_set *, enum btree_id, struct closure *);

/* Inserting into a given leaf node (last stage of insert): */

bool bch_btree_bset_insert_key(struct btree_iter *, struct btree *,
			       struct btree_node_iter *, struct bkey_i *);
void bch_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
			   struct bkey_i *);

static inline void *btree_data_end(struct cache_set *c, struct btree *b)
{
	return (void *) b->data + btree_bytes(c);
}

static inline struct bkey_packed *unwritten_whiteouts_start(struct cache_set *c,
							    struct btree *b)
{
	return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
}

static inline struct bkey_packed *unwritten_whiteouts_end(struct cache_set *c,
							   struct btree *b)
{
	return btree_data_end(c, b);
}

static inline void *write_block(struct btree *b)
{
	return (void *) b->data + (b->written << 9);
}

static inline bool bset_written(struct btree *b, struct bset *i)
{
	return (void *) i < write_block(b);
}

static inline bool bset_unwritten(struct btree *b, struct bset *i)
{
	return (void *) i > write_block(b);
}
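
/*
 * Layout sketch (derived from the helpers above and below, not authoritative):
 * the written bsets occupy the first b->written 512-byte sectors of b->data and
 * end at write_block(b), where the dirty (unwritten) bset starts. Unwritten
 * whiteouts accumulate at the very end of the buffer, growing down from
 * btree_data_end(c, b), with unwritten_whiteouts_start(c, b) marking their
 * current start. bset_written()/bset_unwritten() simply compare a bset's
 * address against write_block(b).
 */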

static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b,
				       struct bset *i)
{
	return round_up(bset_byte_offset(b, bset_bkey_last(i)),
			block_bytes(c)) >> 9;
}

static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
						   struct btree *b)
{
	struct bset *i = btree_bset_last(b);
	unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64) +
		b->whiteout_u64s +
		b->uncompacted_whiteout_u64s;
	/* btree_node_size is in 512-byte sectors; 512 / sizeof(u64) == 64: */
	unsigned total = c->sb.btree_node_size << 6;

	EBUG_ON(used > total);

	if (bset_written(b, i))
		return 0;

	return total - used;
}

static inline unsigned btree_write_set_buffer(struct btree *b)
{
	/*
	 * Could buffer up larger amounts of keys for btrees with larger keys,
	 * pending benchmarking:
	 */
	return 4 << 10;
}

/* Where a new bset should be started, if we should start one; NULL otherwise: */
static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
						     struct btree *b)
{
	struct bset *i = btree_bset_last(b);
	unsigned offset = max_t(unsigned, b->written << 9,
				bset_byte_offset(b, bset_bkey_last(i)));
	ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
		(offset + sizeof(struct btree_node_entry) +
		 b->whiteout_u64s * sizeof(u64) +
		 b->uncompacted_whiteout_u64s * sizeof(u64));

	EBUG_ON(offset > btree_bytes(c));

	if ((unlikely(bset_written(b, i)) && n > 0) ||
	    (unlikely(__set_bytes(i, le16_to_cpu(i->u64s)) >
		      btree_write_set_buffer(b)) && n > btree_write_set_buffer(b)))
		return (void *) b->data + offset;

	return NULL;
}

/*
 * write lock must be held on @b (else the dirty bset that we were going to
 * insert into could be written out from under us)
 */
static inline bool bch_btree_node_insert_fits(struct cache_set *c,
					      struct btree *b, unsigned u64s)
{
	if (btree_node_is_extents(b)) {
		/*
		 * The insert key might split an existing key
		 * (bch_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
		 */
		u64s += BKEY_EXTENT_U64s_MAX;
	}

	return u64s <= bch_btree_keys_u64s_remaining(c, b);
}
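
/*
 * Typical use (a sketch, not a requirement of the interface): callers check
 * bch_btree_node_insert_fits() with the write lock held on @b before
 * inserting; if it returns false, the node must be split before the insert
 * can proceed.
 */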

static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
				      struct bkey_packed *k)
{
	if (bset_written(b, bset(b, t))) {
		EBUG_ON(b->uncompacted_whiteout_u64s <
			bkeyp_key_u64s(&b->format, k));
		b->uncompacted_whiteout_u64s -=
			bkeyp_key_u64s(&b->format, k);
	}
}

static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
				    struct bkey_packed *k)
{
	if (bset_written(b, bset(b, t))) {
		BUG_ON(!k->needs_whiteout);
		b->uncompacted_whiteout_u64s +=
			bkeyp_key_u64s(&b->format, k);
	}
}

void bch_btree_insert_node(struct btree *, struct btree_iter *,
			   struct keylist *, struct btree_reserve *,
			   struct btree_interior_update *as);

/* Normal update interface: */

struct btree_insert {
	struct cache_set	*c;
	struct disk_reservation *disk_res;
	struct journal_res	journal_res;
	u64			*journal_seq;
	struct extent_insert_hook *hook;
	unsigned		flags;

	unsigned		nr;
	struct btree_insert_entry *entries;
};

struct btree_insert_entry {
	struct btree_iter	*iter;
	struct bkey_i		*k;
	/*
	 * true if entire key was inserted - can only be false for
	 * extents
	 */
	bool			done;
};

int __bch_btree_insert_at(struct btree_insert *);

#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...)   N
#define COUNT_ARGS(...)  _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
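
/* e.g. COUNT_ARGS(a, b, c) expands to 3; supports 1 to 9 arguments. */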

#define BTREE_INSERT_ENTRY(_iter, _k)					\
	((struct btree_insert_entry) {					\
		.iter	= (_iter),					\
		.k	= (_k),						\
	})

/**
 * bch_btree_insert_at - insert one or more keys at iterator positions
 * @iter:	btree iterator
 * @insert_key:	key to insert
 * @disk_res:	disk reservation
 * @hook:	extent insert callback
 *
 * Return values:
 * -EINTR: locking changed, this function should be called again. Only returned
 *  if passed BTREE_INSERT_ATOMIC.
 * -EROFS: cache set read only
 * -EIO: journal or btree node IO error
 */
#define bch_btree_insert_at(_c, _disk_res, _hook,			\
			    _journal_seq, _flags, ...)			\
	__bch_btree_insert_at(&(struct btree_insert) {			\
		.c		= (_c),					\
		.disk_res	= (_disk_res),				\
		.journal_seq	= (_journal_seq),			\
		.hook		= (_hook),				\
		.flags		= (_flags),				\
		.nr		= COUNT_ARGS(__VA_ARGS__),		\
		.entries	= (struct btree_insert_entry[]) {	\
			__VA_ARGS__					\
		}})

/*
 * Don't drop/retake locks: instead return -EINTR if we need to upgrade to
 * intent locks, -EAGAIN if we need to wait on the btree reserve
 */
#define BTREE_INSERT_ATOMIC		(1 << 0)

/* Don't check for -ENOSPC: */
#define BTREE_INSERT_NOFAIL		(1 << 1)

/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE	(1 << 2)

/*
 * Insert is for journal replay: don't get journal reservations, or mark extents
 */
#define BTREE_INSERT_JOURNAL_REPLAY	(1 << 3)
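
/*
 * Example (illustrative only; @iter and @k are hypothetical locals - an
 * initialized btree iterator and bkey_i): insert a single key at @iter's
 * position with no disk reservation, insert hook or journal sequence pointer,
 * retrying the whole operation if -EINTR is returned because of ATOMIC:
 *
 *	ret = bch_btree_insert_at(c, NULL, NULL, NULL,
 *				  BTREE_INSERT_ATOMIC,
 *				  BTREE_INSERT_ENTRY(&iter, &k));
 */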

int bch_btree_insert_list_at(struct btree_iter *, struct keylist *,
			     struct disk_reservation *,
			     struct extent_insert_hook *, u64 *, unsigned);

static inline bool journal_res_insert_fits(struct btree_insert *trans,
					   struct btree_insert_entry *insert)
{
	unsigned u64s = 0;
	struct btree_insert_entry *i;

	/*
	 * If we didn't get a journal reservation, we're in journal replay and
	 * we're not journalling updates:
	 */
	if (!trans->journal_res.ref)
		return true;

	for (i = insert; i < trans->entries + trans->nr; i++)
		u64s += jset_u64s(i->k->k.u64s);

	return u64s <= trans->journal_res.u64s;
}

int bch_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
int bch_btree_insert(struct cache_set *, enum btree_id, struct bkey_i *,
		     struct disk_reservation *,
		     struct extent_insert_hook *, u64 *, int flags);
int bch_btree_update(struct cache_set *, enum btree_id,
		     struct bkey_i *, u64 *);

int bch_btree_delete_range(struct cache_set *, enum btree_id,
			   struct bpos, struct bpos, u64,
			   struct disk_reservation *,
			   struct extent_insert_hook *, u64 *);
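
/*
 * Example (illustrative; @k is a hypothetical, already initialized bkey_i, and
 * BTREE_ID_EXTENTS stands in for whichever btree is being updated): do a
 * standalone insert without managing an iterator, journal sequence pointer or
 * disk reservation:
 *
 *	ret = bch_btree_insert(c, BTREE_ID_EXTENTS, &k,
 *			       NULL, NULL, NULL, 0);
 */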

int bch_btree_node_rewrite(struct btree_iter *, struct btree *, struct closure *);

#endif /* _BCACHE_BTREE_INSERT_H */