1 // SPDX-License-Identifier: GPL-2.0
3 #include "bkey_on_stack.h"
4 #include "btree_update.h"
5 #include "btree_update_interior.h"
9 #include "extent_update.h"
12 * This counts the number of iterators to the alloc & ec btrees we'll need
13 * inserting/removing this extent:
/*
 * bch2_bkey_nr_alloc_ptrs - inspect the extent entries of @k.
 *
 * The visible part of the body iterates over every extent entry of @k and
 * switches on the entry type, with BCH_EXTENT_ENTRY_ptr and
 * BCH_EXTENT_ENTRY_stripe_ptr sharing one case body.
 *
 * NOTE(review): the case body and the return statement are not visible in
 * this excerpt. Presumably each matching entry increments the returned
 * count - confirm against the full file.
 */
15 static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
17 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
18 const union bch_extent_entry *entry;
21 bkey_extent_entry_for_each(ptrs, entry) {
22 switch (__extent_entry_type(entry)) {
/* Plain pointers and stripe pointers are handled by the same case body. */
23 case BCH_EXTENT_ENTRY_ptr:
24 case BCH_EXTENT_ENTRY_stripe_ptr:
/*
 * count_iters_for_insert - add the iterators needed for key @k to *nr_iters.
 *
 * What the visible code does:
 *  - KEY_TYPE_reflink_v (possibly together with case labels that are not
 *    visible in this excerpt): adds bch2_bkey_nr_alloc_ptrs(k) to *nr_iters
 *    and, once *nr_iters has reached max_iters, lowers *end to at most
 *    k.k->p.
 *  - KEY_TYPE_reflink_p: walks BTREE_ID_REFLINK starting at idx + offset and
 *    adds 1 + bch2_bkey_nr_alloc_ptrs(r_k) for every indirect extent found,
 *    lowering *end when the budget is reached.
 *
 * NOTE(review): most of the parameter list, the switch header and the return
 * statements are missing from this excerpt. Judging from the two call sites
 * the arguments are (trans, k, offset, end, nr_iters, max_iters, <bool>);
 * confirm the names and the return convention against the full file.
 */
32 static int count_iters_for_insert(struct btree_trans *trans,
44 case KEY_TYPE_reflink_v:
45 *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
/* Budget reached: the atomic range may not extend past the end of k. */
47 if (*nr_iters >= max_iters) {
48 *end = bpos_min(*end, k.k->p);
53 case KEY_TYPE_reflink_p: {
54 struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
/* idx is used below as an offset into the reflink btree. */
55 u64 idx = le64_to_cpu(p.v->idx);
/* Distance from the start of p to min(*end, end of p). */
56 unsigned sectors = bpos_min(*end, p.k->p).offset -
57 bkey_start_offset(p.k);
58 struct btree_iter *iter;
61 for_each_btree_key(trans, iter,
62 BTREE_ID_REFLINK, POS(0, idx + offset),
63 BTREE_ITER_SLOTS, r_k, ret) {
/*
 * Test for an indirect extent starting at or beyond idx + sectors.
 * NOTE(review): the statement guarded by this test is not visible;
 * presumably it leaves the loop.
 */
64 if (bkey_cmp(bkey_start_pos(r_k.k),
65 POS(0, idx + sectors)) >= 0)
68 *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
70 if (*nr_iters >= max_iters) {
/*
 * Map the end of the indirect extent back into the keyspace of k:
 * start of k plus the distance from idx to the end of r_k.
 */
71 struct bpos pos = bkey_start_pos(k.k);
72 pos.offset += r_k.k->p.offset - idx;
74 *end = bpos_min(*end, pos);
/* Hand the reflink iterator back to the transaction. */
80 bch2_trans_iter_put(trans, iter);
/*
 * Iterator budget used by bch2_extent_atomic_end(): one third of
 * BTREE_ITER_MAX. The key being inserted is counted against half of this
 * value, existing keys against the full value.
 */
88 #define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
/*
 * bch2_extent_atomic_end - compute in *end how far @insert may extend.
 *
 * Visible steps:
 *  1. Traverse @iter and take a private copy of its level 0 node iterator.
 *  2. Start with *end = min(end of @insert, b->key.k.p).
 *  3. Count the iterators needed for @insert itself against a budget of
 *     EXTENT_ITERS_MAX / 2.
 *  4. Walk the keys of the node (see the note on the peek filter below)
 *     and count each one against a budget of EXTENT_ITERS_MAX;
 *     count_iters_for_insert() may lower *end along the way.
 *
 * Returns ret when it is negative, otherwise 0.
 *
 * NOTE(review): the third parameter, the declaration of b, the error checks
 * after each call and the loop exits are not visible in this excerpt.
 */
90 int bch2_extent_atomic_end(struct btree_iter *iter,
91 struct bkey_i *insert,
94 struct btree_trans *trans = iter->trans;
96 struct btree_node_iter node_iter;
97 struct bkey_packed *_k;
98 unsigned nr_iters = 0;
101 ret = bch2_btree_iter_traverse(iter);
/* Work on a copy so that the node iterator inside iter is not advanced. */
106 node_iter = iter->l[0].iter;
/* The insert key must not start before the first position of the node. */
108 BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
110 *end = bpos_min(insert->k.p, b->key.k.p);
112 ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
113 &nr_iters, EXTENT_ITERS_MAX / 2, false);
/*
 * NOTE(review): presumably the KEY_TYPE_discard argument makes the peek
 * skip whiteouts - confirm against bch2_btree_node_iter_peek_filter().
 */
117 while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
118 KEY_TYPE_discard))) {
119 struct bkey unpacked;
120 struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
/*
 * Test for a key starting at or past *end.
 * NOTE(review): the guarded statement is not visible; presumably a break.
 */
123 if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
/* When the insert starts inside k, pass how far into k it starts. */
126 if (bkey_cmp(bkey_start_pos(&insert->k),
127 bkey_start_pos(k.k)) > 0)
128 offset = bkey_start_offset(&insert->k) -
129 bkey_start_offset(k.k);
131 ret = count_iters_for_insert(trans, k, offset, end,
132 &nr_iters, EXTENT_ITERS_MAX, true);
136 bch2_btree_node_iter_advance(&node_iter, b);
/* Positive values of ret are not passed on to the caller. */
139 return ret < 0 ? ret : 0;
/*
 * bch2_extent_trim_atomic - shorten @k to its atomic end.
 *
 * Asks bch2_extent_atomic_end() for the end position and cuts @k back to it.
 * NOTE(review): the local declarations, the check of ret and the return
 * statement are not visible in this excerpt.
 */
142 int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
147 ret = bch2_extent_atomic_end(iter, k, &end);
151 bch2_cut_back(end, k);
/*
 * bch2_extent_is_atomic - test whether @k already ends at its atomic end.
 *
 * Returns nonzero when the end computed by bch2_extent_atomic_end() equals
 * k->k.p, and 0 when it differs.
 * NOTE(review): the handling of a failing bch2_extent_atomic_end() is not
 * visible in this excerpt.
 */
155 int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
160 ret = bch2_extent_atomic_end(iter, k, &end);
164 return !bkey_cmp(end, k->k.p);
/*
 * bch2_extent_can_insert - pre-insert check for an extent update.
 *
 * Peeks at the next key in the leaf using a copy of the node iterator and
 * computes how the insert key overlaps it. When the insert lands in the
 * middle of an existing extent whose compressed sector count is nonzero,
 * a disk reservation is requested. Returns BTREE_INSERT_ENOSPC from inside
 * that reservation switch and BTREE_INSERT_OK otherwise.
 *
 * NOTE(review): the remaining parameters, the bodies of several branches
 * and the other cases of the reservation switch are not visible in this
 * excerpt.
 */
167 enum btree_insert_ret
168 bch2_extent_can_insert(struct btree_trans *trans,
169 struct btree_insert_entry *insert,
172 struct btree_iter_level *l = &insert->iter->l[0];
/* Copy of the node iterator, so peeking does not move the real one. */
173 struct btree_node_iter node_iter = l->iter;
174 enum bch_extent_overlap overlap;
175 struct bkey_packed *_k;
176 struct bkey unpacked;
181 * We avoid creating whiteouts whenever possible when deleting, but
182 * those optimizations mean we may potentially insert two whiteouts
183 * instead of one (when we overlap with the front of one extent and the
186 if (bkey_whiteout(&insert->k->k))
189 _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
/* NOTE(review): presumably taken when the peek above found no key. */
192 return BTREE_INSERT_OK;
194 k = bkey_disassemble(l->b, _k, &unpacked);
196 overlap = bch2_extent_overlap(&insert->k->k, k.k);
198 /* account for having to split existing extent: */
199 if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
/* sectors is assigned inside the condition; zero skips the reservation. */
202 if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
203 (sectors = bch2_bkey_sectors_compressed(k))) {
/* Forward the NOFAIL request of the transaction to the reservation. */
204 int flags = trans->flags & BTREE_INSERT_NOFAIL
205 ? BCH_DISK_RESERVATION_NOFAIL : 0;
207 switch (bch2_disk_reservation_add(trans->c,
213 return BTREE_INSERT_ENOSPC;
219 return BTREE_INSERT_OK;
/*
 * verify_extent_nonoverlapping - debug check run before inserting @insert.
 *
 * The body is compiled only when CONFIG_BCACHEFS_DEBUG is defined and is
 * additionally gated on expensive_debug_checks(c). It looks at the previous
 * and the next key around the insert position (via the _filter helpers
 * with KEY_TYPE_discard) and compares them against @insert:
 *  - the previous key must not end after the start of @insert;
 *  - @insert must not end after the start of the next key.
 * On a violation of the second condition both keys are formatted, the node
 * is dumped and the kernel panics.
 *
 * NOTE(review): the assertion macros wrapping the comparisons, the copy of
 * *_iter into iter and the closing #endif are not visible in this excerpt.
 */
222 static void verify_extent_nonoverlapping(struct bch_fs *c,
224 struct btree_node_iter *_iter,
225 struct bkey_i *insert)
227 #ifdef CONFIG_BCACHEFS_DEBUG
228 struct btree_node_iter iter;
229 struct bkey_packed *k;
232 if (!expensive_debug_checks(c))
/* Key preceding the insert position. */
236 k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
238 (uk = bkey_unpack_key(b, k),
239 bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
/* Key following the insert position. */
242 k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
245 (uk = bkey_unpack_key(b, k),
246 bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
249 (uk = bkey_unpack_key(b, k),
250 bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
/* Render both keys as text for the panic message. */
254 bch2_bkey_to_text(&PBUF(buf1), &insert->k);
255 bch2_bkey_to_text(&PBUF(buf2), &uk);
257 bch2_dump_btree_node(b);
258 panic("insert > next :\n"
/*
 * extent_bset_insert - insert @insert into the last bset of the leaf.
 *
 * @c:      filesystem, used for the space check and the debug checks
 * @iter:   btree iterator; only level 0 is touched
 * @insert: key to insert; must be neither deleted nor of zero size
 *
 * The insert position is the position of the node iterator within the last
 * bset of the node. The node must have room for the key (BUG_ON otherwise).
 * After the insert the node iterator is fixed up for the new key.
 */
268 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
269 struct bkey_i *insert)
271 struct btree_iter_level *l = &iter->l[0];
272 struct bkey_packed *k =
273 bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
/* The caller must have made sure the node has room for the key. */
275 BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
277 EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
278 verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
280 if (debug_check_bkeys(c))
281 bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
283 bch2_bset_insert(l->b, &l->iter, k, insert, 0);
/*
 * NOTE(review): presumably k now addresses the key just inserted, so
 * k->u64s is its size - confirm against bch2_bset_insert().
 */
284 bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
/*
 * extent_squash - trim or drop the existing key k so that it no longer
 * overlaps @insert.
 *
 * @_k is the packed key inside the node and @k its unpacked view; @overlap
 * says how @insert overlaps k and selects one of four cases:
 *  - FRONT:  cut the front of k at the end of @insert;
 *  - BACK:   cut the back of k at the start of @insert;
 *  - ALL:    mark k deleted, removing it from the bset when it lives in the
 *            last bset of the node;
 *  - MIDDLE: keep the tail in k and insert a copy holding the head.
 *
 * NOTE(review): the return type line, the switch header, the declaration of
 * u64s_delta and the break statements are not visible in this excerpt.
 */
288 extent_squash(struct bch_fs *c, struct btree_iter *iter,
289 struct bkey_i *insert,
290 struct bkey_packed *_k, struct bkey_s k,
291 enum bch_extent_overlap overlap)
293 struct btree_iter_level *l = &iter->l[0];
297 case BCH_EXTENT_OVERLAP_FRONT:
298 /* insert overlaps with start of k: */
299 u64s_delta = bch2_cut_front_s(insert->k.p, k);
/* Feed the value returned by the cut into the accounting of the node. */
300 btree_keys_account_val_delta(l->b, _k, u64s_delta);
302 EBUG_ON(bkey_deleted(k.k));
/* Write the modified unpacked key back into the packed key. */
303 extent_save(l->b, _k, k.k);
304 bch2_btree_iter_fix_key_modified(iter, l->b, _k);
307 case BCH_EXTENT_OVERLAP_BACK:
308 /* insert overlaps with end of k: */
309 u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k);
310 btree_keys_account_val_delta(l->b, _k, u64s_delta);
312 EBUG_ON(bkey_deleted(k.k));
313 extent_save(l->b, _k, k.k);
316 * As the auxiliary tree is indexed by the end of the
317 * key and we've just changed the end, update the
320 bch2_bset_fix_invalidated_key(l->b, _k);
321 bch2_btree_node_iter_fix(iter, l->b, &l->iter,
322 _k, _k->u64s, _k->u64s);
325 case BCH_EXTENT_OVERLAP_ALL: {
326 /* The insert key completely covers k, invalidate k */
327 if (!bkey_whiteout(k.k))
328 btree_account_key_drop(l->b, _k);
331 k.k->type = KEY_TYPE_deleted;
333 if (_k >= btree_bset_last(l->b)->start) {
334 unsigned u64s = _k->u64s;
336 bch2_bset_delete(l->b, _k, _k->u64s);
337 bch2_btree_node_iter_fix(iter, l->b, &l->iter,
340 extent_save(l->b, _k, k.k);
341 bch2_btree_iter_fix_key_modified(iter, l->b, _k);
346 case BCH_EXTENT_OVERLAP_MIDDLE: {
347 struct bkey_on_stack split;
349 bkey_on_stack_init(&split);
350 bkey_on_stack_realloc(&split, c, k.k->u64s);
353 * The insert key falls 'in the middle' of k
354 * The insert key splits k in 3:
355 * - start only in k, preserve
356 * - middle common section, invalidate in k
357 * - end only in k, preserve
359 * We update the old key to preserve the start,
360 * insert will be the new common section,
361 * we manually insert the end that we are preserving.
363 * modify k _before_ doing the insert (which will move
366 bkey_reassemble(split.k, k.s_c);
367 split.k->k.needs_whiteout |= bkey_written(l->b, _k);
369 bch2_cut_back(bkey_start_pos(&insert->k), split.k);
370 BUG_ON(bkey_deleted(&split.k->k));
372 u64s_delta = bch2_cut_front_s(insert->k.p, k);
373 btree_keys_account_val_delta(l->b, _k, u64s_delta);
375 BUG_ON(bkey_deleted(k.k));
376 extent_save(l->b, _k, k.k);
377 bch2_btree_iter_fix_key_modified(iter, l->b, _k);
379 extent_bset_insert(c, iter, split.k);
380 bkey_on_stack_exit(&split, c);
387 * bch2_insert_fixup_extent - insert a new extent and deal with overlaps
389 * this may result in not actually doing the insert, or inserting some subset
390 * of the insert key. For cmpxchg operations this is where that logic lives.
392 * All subsets of @insert that need to be inserted are inserted using
393 * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
394 * returns false, setting @iter->pos for the prefix of @insert that actually got
397 * BSET INVARIANTS: this function is responsible for maintaining all the
398 * invariants for bsets of extents in memory. things get really hairy with 0
403 * bkey_start_pos(bkey_next(k)) >= k
404 * or bkey_start_offset(bkey_next(k)) >= k->offset
406 * i.e. strict ordering, no overlapping extents.
408 * multiple bsets (i.e. full btree node):
411 * k.size != 0 ∧ j.size != 0 →
412 * ¬ (k > bkey_start_pos(j) ∧ k < j)
414 * i.e. no two overlapping keys _of nonzero size_
416 * We can't realistically maintain this invariant for zero size keys because of
417 * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
418 * there may be another 0 size key between them in another bset, and it will
419 * thus overlap with the merged key.
421 * In addition, the end of iter->pos indicates how much has been processed.
422 * If the end of iter->pos is not the same as the end of insert, then
423 * key insertion needs to continue/be retried.
/*
 * Implementation notes (based on the code visible in this excerpt):
 *
 *  - deleting is true when the key being inserted is a whiteout. In that
 *    case update_journal and update_btree both start out false and a copy
 *    of the original insert key is kept in whiteout for journalling.
 *  - The main loop visits the keys returned by the filtered peek on the
 *    leaf iterator (called with KEY_TYPE_discard). cur_end is the smaller
 *    of the end of the insert key and the end of the current key.
 *  - update_journal becomes true as soon as a key that is not a whiteout
 *    is seen. Until then the front of both insert and whiteout is cut off
 *    at cur_end and the iterator position is advanced.
 *  - Overlapping keys are resolved by extent_squash().
 *  - At the end the remaining insert key is consumed by cutting its front
 *    at its own end position.
 *
 * NOTE(review): many conditions and loop exits are missing from this
 * excerpt; statement order matters here, so the code is left untouched.
 */
425 void bch2_insert_fixup_extent(struct btree_trans *trans,
426 struct btree_insert_entry *insert_entry)
428 struct bch_fs *c = trans->c;
429 struct btree_iter *iter = insert_entry->iter;
430 struct bkey_i *insert = insert_entry->k;
431 struct btree_iter_level *l = &iter->l[0];
432 struct btree_node_iter node_iter = l->iter;
433 bool deleting = bkey_whiteout(&insert->k);
434 bool update_journal = !deleting;
435 bool update_btree = !deleting;
/* Private copy of the insert key, journalled instead of it when deleting. */
436 struct bkey_i whiteout = *insert;
437 struct bkey_packed *_k;
438 struct bkey unpacked;
/* Leaf level only, nonzero size, iterator positioned at the start of insert. */
440 EBUG_ON(iter->level);
441 EBUG_ON(!insert->k.size);
442 EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
444 while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
445 KEY_TYPE_discard))) {
446 struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
447 struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
448 enum bch_extent_overlap overlap =
449 bch2_extent_overlap(&insert->k, k.k);
/*
 * Test for a key starting at or past the end of insert.
 * NOTE(review): the guarded statement is not visible; presumably a break.
 */
451 if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
454 if (!bkey_whiteout(k.k))
455 update_journal = true;
/* Nothing to journal so far: skip the range up to cur_end entirely. */
457 if (!update_journal) {
458 bch2_cut_front(cur_end, insert);
459 bch2_cut_front(cur_end, &whiteout);
460 bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
465 * When deleting, if possible just do it by switching the type
466 * of the key we're deleting, instead of creating and inserting
471 !bkey_cmp(insert->k.p, k.k->p) &&
472 !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
473 if (!bkey_whiteout(k.k)) {
474 btree_account_key_drop(l->b, _k);
475 _k->type = KEY_TYPE_discard;
476 reserve_whiteout(l->b, _k);
477 bch2_btree_iter_fix_key_modified(iter,
483 if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
484 insert->k.needs_whiteout = true;
489 overlap == BCH_EXTENT_OVERLAP_ALL &&
490 bkey_whiteout(k.k) &&
491 k.k->needs_whiteout) {
492 unreserve_whiteout(l->b, _k);
493 _k->needs_whiteout = false;
496 extent_squash(c, iter, insert, _k, k, overlap);
499 bch2_cut_front(cur_end, insert);
503 if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
504 overlap == BCH_EXTENT_OVERLAP_MIDDLE)
509 bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
513 insert->k.type = KEY_TYPE_discard;
515 EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
517 extent_bset_insert(c, iter, insert);
520 if (update_journal) {
521 struct bkey_i *k = !deleting ? insert : &whiteout;
524 k->k.type = KEY_TYPE_discard;
526 EBUG_ON(bkey_deleted(&k->k) || !k->k.size);
528 bch2_btree_journal_key(trans, iter, k);
531 bch2_cut_front(insert->k.p, insert);