X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;ds=sidebyside;f=libbcachefs%2Fbtree_update_leaf.c;h=46c0a1e7fa2029e65f31ce1adc2827dbbab1dee1;hb=92d34f6ed29e90d48c40a4c31816df805edfe483;hp=c0a84153ecda839b44b93713b6711bc8a0b6d81f;hpb=cc41f52bcc7bcc6ec3a63c10fd2b84bc3e2f6615;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index c0a8415..46c0a1e 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -10,21 +10,36 @@ #include "buckets.h" #include "debug.h" #include "error.h" -#include "extents.h" +#include "extent_update.h" #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" #include "replicas.h" +#include #include #include +static inline bool same_leaf_as_prev(struct btree_trans *trans, + unsigned idx) +{ + return idx && + trans->updates[trans->updates_sorted[idx]].iter->l[0].b == + trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b; +} + +#define trans_for_each_update_sorted(_trans, _i, _iter) \ + for (_iter = 0; \ + _iter < _trans->nr_updates && \ + (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ + _iter++) + inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { bch2_btree_node_lock_write(b, iter); - if (btree_node_just_written(b) && + if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) bch2_btree_iter_reinit_node(iter, b); @@ -36,27 +51,26 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static void btree_trans_lock_write(struct bch_fs *c, struct btree_trans *trans) +static inline void btree_trans_sort_updates(struct btree_trans *trans) { - struct btree_insert_entry *i; + struct btree_insert_entry *l, *r; + unsigned nr = 0, pos; - trans_for_each_update_leaf(trans, i) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); -} + trans_for_each_update(trans, l) { + for (pos = 0; pos < nr; pos++) { + r = trans->updates + trans->updates_sorted[pos]; -static void btree_trans_unlock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; + if (btree_iter_cmp(l->iter, r->iter) <= 0) + break; + } - trans_for_each_update_leaf(trans, i) - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); -} + memmove(&trans->updates_sorted[pos + 1], + &trans->updates_sorted[pos], + (nr - pos) * sizeof(trans->updates_sorted[0])); -static inline int btree_trans_cmp(struct btree_insert_entry l, - struct btree_insert_entry r) -{ - return cmp_int(l.deferred, r.deferred) ?: - btree_iter_cmp(l.iter, r.iter); + trans->updates_sorted[pos] = l - trans->updates; + nr++; + } } /* Inserting into a given leaf node (last stage of insert): */ @@ -90,40 +104,43 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, return true; } - insert->k.needs_whiteout = k->needs_whiteout; - btree_account_key_drop(b, k); - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; + if (bkey_whiteout(&insert->k)) { + unsigned clobber_u64s = k->u64s, new_u64s = k->u64s; + + k->type = KEY_TYPE_deleted; - /* - * If we're deleting, and the key we're deleting doesn't - * need a whiteout (it wasn't overwriting a key that had - * been written to disk) - just delete it: - */ - if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { + if (k->needs_whiteout) { + push_whiteout(iter->trans->c, b, k); + k->needs_whiteout = false; + } + + if (k >= btree_bset_last(b)->start) { bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, - k, clobber_u64s, 0); - bch2_btree_iter_verify(iter, b); - return true; + new_u64s = 0; } + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, new_u64s); + return true; + + } + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; goto overwrite; } + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; k->type = KEY_TYPE_deleted; + /* + * XXX: we should be able to do this without two calls to + * bch2_btree_node_iter_fix: + */ bch2_btree_node_iter_fix(iter, b, node_iter, k, k->u64s, k->u64s); - bch2_btree_iter_verify(iter, b); - - if (bkey_whiteout(&insert->k)) { - reserve_whiteout(b, k); - return true; - } else { - k->needs_whiteout = false; - } } else { /* * Deleting, but the key to delete wasn't found - nothing to do: @@ -138,10 +155,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, k->u64s); - bch2_btree_iter_verify(iter, b); + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, k->u64s); return true; } @@ -233,8 +248,8 @@ static void bch2_insert_fixup_key(struct btree_trans *trans, EBUG_ON(insert->k->k.u64s > bch_btree_keys_u64s_remaining(trans->c, l->b)); - if (bch2_btree_bset_insert_key(iter, l->b, &l->iter, - insert->k)) + if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, + insert->k))) bch2_btree_journal_key(trans, iter, insert->k); } @@ -247,7 +262,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; - int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + struct bset_tree *t = bset_tree_last(b); + int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -257,7 +273,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans, bch2_insert_fixup_extent(trans, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); @@ -271,165 +287,31 @@ static void btree_insert_key_leaf(struct btree_trans *trans, trace_btree_insert_key(c, b, insert->k); } -/* Deferred btree updates: */ - -static void deferred_update_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct deferred_update *d = - container_of(pin, struct deferred_update, journal); - struct journal_preres res = { 0 }; - u64 tmp[32]; - struct bkey_i *k = (void *) tmp; - int ret; - - if (d->allocated_u64s > ARRAY_SIZE(tmp)) { - k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS); - - BUG_ON(!k); /* XXX */ - } - - spin_lock(&d->lock); - if (d->dirty) { - BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s); - - swap(res, d->res); - - BUG_ON(d->k.k.u64s > d->allocated_u64s); - - bkey_copy(k, &d->k); - d->dirty = false; - spin_unlock(&d->lock); - - ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED); - bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), - c, "error flushing deferred btree update: %i", ret); - - spin_lock(&d->lock); - } - - if (!d->dirty) - bch2_journal_pin_drop(j, &d->journal); - spin_unlock(&d->lock); - - bch2_journal_preres_put(j, &res); - if (k != (void *) tmp) - kfree(k); -} - -static void btree_insert_key_deferred(struct btree_trans *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct deferred_update *d = insert->d; - int difference; - - BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); - BUG_ON(insert->k->u64s > d->allocated_u64s); - - __btree_journal_key(trans, d->btree_id, insert->k); - - spin_lock(&d->lock); - BUG_ON(jset_u64s(insert->k->u64s) > - trans->journal_preres.u64s); - - difference = jset_u64s(insert->k->u64s) - d->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - d->res.u64s += difference; - } - - bkey_copy(&d->k, insert->k); - d->dirty = true; - - bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, - deferred_update_flush); - spin_unlock(&d->lock); -} - -void bch2_deferred_update_free(struct bch_fs *c, - struct deferred_update *d) -{ - deferred_update_flush(&c->journal, &d->journal, 0); - - BUG_ON(journal_pin_active(&d->journal)); - - bch2_journal_pin_flush(&c->journal, &d->journal); - kfree(d); -} - -struct deferred_update * -bch2_deferred_update_alloc(struct bch_fs *c, - enum btree_id btree_id, - unsigned u64s) -{ - struct deferred_update *d; - - BUG_ON(u64s > U8_MAX); - - d = kmalloc(offsetof(struct deferred_update, k) + - u64s * sizeof(u64), GFP_NOFS); - BUG_ON(!d); - - memset(d, 0, offsetof(struct deferred_update, k)); - - spin_lock_init(&d->lock); - d->allocated_u64s = u64s; - d->btree_id = btree_id; - - return d; -} - /* Normal update interface: */ static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { struct bch_fs *c = trans->c; - enum btree_id btree_id = !i->deferred - ? i->iter->btree_id - : i->d->btree_id; - - if (!i->deferred) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !(trans->flags & BTREE_INSERT_ATOMIC)); - } + + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !(trans->flags & BTREE_INSERT_ATOMIC)); BUG_ON(debug_check_bkeys(c) && !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); } -static int bch2_trans_journal_preres_get(struct btree_trans *trans) +static noinline int +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned u64s = 0; int ret; - trans_for_each_update(trans, i) - if (i->deferred) - u64s += jset_u64s(i->k->k.u64s); - - if (!u64s) - return 0; - - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, u64s, - JOURNAL_RES_GET_NONBLOCK); - if (ret != -EAGAIN) - return ret; - bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, @@ -445,8 +327,8 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) return 0; } -static int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) +static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) { struct bch_fs *c = trans->c; int ret; @@ -484,107 +366,63 @@ btree_key_can_insert(struct btree_trans *trans, return BTREE_INSERT_OK; } -static int btree_trans_check_can_insert(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static inline void do_btree_insert_one(struct btree_trans *trans, + struct btree_insert_entry *insert) { - struct btree_insert_entry *i; - unsigned u64s = 0; - int ret; - - trans_for_each_update_iter(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = btree_key_can_insert(trans, i, &u64s); - if (ret) { - *stopped_at = i; - return ret; - } - } - - return 0; + btree_insert_key_leaf(trans, insert); } -static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_insert_entry *insert) +static inline bool update_has_trans_triggers(struct btree_insert_entry *i) { - if (likely(!insert->deferred)) - btree_insert_key_leaf(trans, insert); - else - btree_insert_key_deferred(trans, insert); + return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id); } -static inline bool update_triggers_transactional(struct btree_trans *trans, - struct btree_insert_entry *i) +static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) { - return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && - (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES || - i->iter->btree_id == BTREE_ID_REFLINK); + return (BTREE_NODE_TYPE_HAS_TRIGGERS & + ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & + (1U << i->iter->btree_id); } -static inline bool update_has_triggers(struct btree_trans *trans, - struct btree_insert_entry *i) +static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) { - return likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - !i->deferred && - btree_node_type_needs_gc(i->iter->btree_id); + __bch2_btree_iter_unlock(iter); } -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_btree_insert_at(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static noinline void bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; - bool saw_non_marked; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; - int ret; - - trans_for_each_update_iter(trans, i) - BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - - trans_for_each_update_iter(trans, i) - i->marked = false; - - do { - saw_non_marked = false; - trans_for_each_update_iter(trans, i) { - if (i->marked) - continue; - - saw_non_marked = true; - i->marked = true; - - if (update_has_triggers(trans, i) && - update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - if (ret) - goto out_clear_replicas; - } - } - } while (saw_non_marked); + if (unlikely(trans->flags & BTREE_INSERT_NOMARK)) + return; trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); - bch2_btree_trans_verify_locks(trans); + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + mark_flags|BCH_BUCKET_MARK_GC); +} - btree_trans_lock_write(c, trans); +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage = NULL; + struct btree_insert_entry *i; + unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE + ? BCH_BUCKET_MARK_BUCKET_INVALIDATE + : 0; + unsigned iter, u64s = 0; + bool marking = false; + int ret; if (race_fault()) { - ret = -EINTR; trace_trans_restart_fault_inject(trans->ip); - goto out; + return -EINTR; } /* @@ -592,25 +430,28 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * held, otherwise another thread could write the node changing the * amount of space available: */ - ret = btree_trans_check_can_insert(trans, stopped_at); - if (ret) - goto out; - trans_for_each_update_iter(trans, i) { - if (i->deferred || - !btree_node_type_needs_gc(i->iter->btree_id)) - continue; + prefetch(&trans->c->journal.flags); - if (!fs_usage) { - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - } + trans_for_each_update_sorted(trans, i, iter) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, iter)) + u64s = 0; - if (!bch2_bkey_replicas_marked_locked(c, - bkey_i_to_s_c(i->k), true)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto out; + u64s += i->k->k.u64s; + ret = btree_key_can_insert(trans, i, &u64s); + if (ret) { + *stopped_at = i; + return ret; } + + if (btree_node_type_needs_gc(i->iter->btree_id)) + marking = true; + } + + if (marking) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); } /* @@ -618,16 +459,17 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * succeed: */ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - trans->journal_u64s = 0; - - trans_for_each_update(trans, i) - trans->journal_u64s += jset_u64s(i->k->k.u64s); - - ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); + ret = bch2_trans_journal_res_get(trans, + JOURNAL_RES_GET_NONBLOCK); if (ret) - goto out; + goto err; } + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) trans_for_each_update(trans, i) @@ -637,49 +479,146 @@ static inline int do_btree_insert_at(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - trans_for_each_update_iter(trans, i) - if (update_has_triggers(trans, i) && - !update_triggers_transactional(trans, i)) - bch2_mark_update(trans, i, fs_usage, mark_flags); + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && + bch2_replicas_delta_list_apply(c, fs_usage, + trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; + } - if (fs_usage && trans->fs_usage_deltas) - bch2_replicas_delta_list_apply(c, fs_usage, - trans->fs_usage_deltas); + trans_for_each_update(trans, i) + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_nontrans_triggers(i)) + bch2_mark_update(trans, i, fs_usage, mark_flags); - if (fs_usage) + if (marking) bch2_trans_fs_usage_apply(trans, fs_usage); - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - unlikely(c->gc_pos.phase)) - trans_for_each_update_iter(trans, i) - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i, NULL, - mark_flags| - BCH_BUCKET_MARK_GC); + if (unlikely(c->gc_pos.phase)) + bch2_trans_mark_gc(trans); trans_for_each_update(trans, i) do_btree_insert_one(trans, i); -out: - BUG_ON(ret && - (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && - trans->journal_res.ref); - - btree_trans_unlock_write(trans); - - if (fs_usage) { +err: + if (marking) { bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); } - bch2_journal_res_put(&c->journal, &trans->journal_res); -out_clear_replicas: - if (trans->fs_usage_deltas) { - memset(&trans->fs_usage_deltas->fs_usage, 0, - sizeof(trans->fs_usage_deltas->fs_usage)); - trans->fs_usage_deltas->used = 0; + return ret; +} + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_bch2_trans_commit(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct btree_insert_entry *i; + struct btree_iter *iter; + unsigned idx, u64s, journal_preres_u64s = 0; + int ret; + + /* + * note: running triggers will append more updates to the list of + * updates as we're walking it: + */ + trans_for_each_update(trans, i) { + /* we know trans->nounlock won't be set here: */ + if (unlikely(!(i->iter->locks_want < 1 + ? __bch2_btree_iter_upgrade(i->iter, 1) + : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { + trace_trans_restart_upgrade(trans->ip); + return -EINTR; + } + + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_trans_triggers(i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + return ret; + } + } + + u64s = jset_u64s(i->k->k.u64s); + if (0) + journal_preres_u64s += u64s; + trans->journal_u64s += u64s; } - return ret; + ret = bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + journal_preres_u64s); + if (unlikely(ret)) + return ret; + + /* + * Can't be holding any read locks when we go to take write locks: + * + * note - this must be done after bch2_trans_journal_preres_get_cold() + * or anything else that might call bch2_trans_relock(), since that + * would just retake the read locks: + */ + trans_for_each_iter_all(trans, iter) { + if (iter->nodes_locked != iter->nodes_intent_locked) { + EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + EBUG_ON(trans->iters_live & (1ULL << iter->idx)); + bch2_btree_iter_unlock_noinline(iter); + } + } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + trans_for_each_update(trans, i) + btree_insert_entry_checks(trans, i); + bch2_btree_trans_verify_locks(trans); + + /* + * No more updates can be added - sort updates so we can take write + * locks in the correct order: + */ + btree_trans_sort_updates(trans); + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_btree_node_lock_for_insert(trans->c, + i->iter->l[0].b, i->iter); + + ret = bch2_trans_commit_write_locked(trans, stopped_at); + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, + i->iter); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ + bch2_journal_res_put(&trans->c->journal, &trans->journal_res); + + if (unlikely(ret)) + return ret; + + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_foreground_maybe_merge(trans->c, i->iter, + 0, trans->flags); + + trans->nounlock = false; + + trans_for_each_update(trans, i) + bch2_btree_iter_downgrade(i->iter); + + return 0; } static noinline @@ -689,19 +628,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, { struct bch_fs *c = trans->c; unsigned flags = trans->flags; - struct btree_insert_entry *src, *dst; - - src = dst = trans->updates; - - while (src < trans->updates + trans->nr_updates) { - if (!src->triggered) { - *dst = *src; - dst++; - } - src++; - } - - trans->nr_updates = dst - trans->updates; /* * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree @@ -749,7 +675,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); if (ret) return ret; @@ -800,128 +726,81 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret; } -/** - * __bch_btree_insert_at - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -static int __bch2_trans_commit(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; int ret; - trans_for_each_update_iter(trans, i) { - if (!bch2_btree_iter_upgrade(i->iter, 1)) { - trace_trans_restart_upgrade(trans->ip); - ret = -EINTR; - goto err; - } + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + return -EROFS; - ret = btree_iter_err(i->iter); - if (ret) - goto err; - } - - ret = do_btree_insert_at(trans, stopped_at); - if (unlikely(ret)) - goto err; - - if (trans->flags & BTREE_INSERT_NOUNLOCK) - trans->nounlock = true; - - trans_for_each_update_leaf(trans, i) - bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); - - trans->nounlock = false; + bch2_trans_unlock(trans); - trans_for_each_update_iter(trans, i) - bch2_btree_iter_downgrade(i->iter); -err: - /* make sure we didn't drop or screw up locks: */ - bch2_btree_trans_verify_locks(trans); + ret = bch2_fs_read_write_early(c); + if (ret) + return ret; - return ret; + percpu_ref_get(&c->writes); + return 0; } -int bch2_trans_commit(struct btree_trans *trans, - struct disk_reservation *disk_res, - u64 *journal_seq, - unsigned flags) +int __bch2_trans_commit(struct btree_trans *trans) { - struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; - unsigned orig_mem_top = trans->mem_top; + struct btree_iter *iter; + unsigned orig_nr_updates = trans->nr_updates; + unsigned orig_mem_top = trans->mem_top; int ret = 0; if (!trans->nr_updates) goto out_noupdates; /* for the sake of sanity: */ - BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC)); + EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&trans->c->gc_lock); - if (!trans->commit_start) - trans->commit_start = local_clock(); - - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->disk_res = disk_res; - trans->journal_seq = journal_seq; - trans->flags = flags; - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - !percpu_ref_tryget(&c->writes))) { - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) - return -EROFS; - - bch2_trans_unlock(trans); - - ret = bch2_fs_read_write_early(c); + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!percpu_ref_tryget(&trans->c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); if (ret) return ret; + } +retry: + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + trans->journal_u64s = 0; - percpu_ref_get(&c->writes); + ret = do_bch2_trans_commit(trans, &i); - if (!bch2_trans_relock(trans)) { - ret = -EINTR; - goto err; - } + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset(&trans->fs_usage_deltas->memset_start, 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); } -retry: - ret = bch2_trans_journal_preres_get(trans); - if (ret) - goto err; - ret = __bch2_trans_commit(trans, &i); + /* make sure we didn't drop or screw up locks: */ + bch2_btree_trans_verify_locks(trans); + if (ret) goto err; out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&c->writes); + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); out_noupdates: - if (!ret && trans->commit_start) { - bch2_time_stats_update(&c->times[BCH_TIME_btree_update], - trans->commit_start); - trans->commit_start = 0; - } + EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + trans_for_each_iter_all(trans, iter) + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; if (!ret) { - bch2_trans_unlink_iters(trans, ~trans->iters_touched| - trans->iters_unlink_on_commit); + bch2_trans_unlink_iters(trans); trans->iters_touched = 0; } trans->nr_updates = 0; @@ -934,34 +813,13 @@ err: /* can't loop if it was passed in and we changed it: */ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) ret = -EINTR; + if (ret) + goto out; - if (!ret) { - /* free memory used by triggers, they'll be reexecuted: */ - trans->mem_top = orig_mem_top; - goto retry; - } - - goto out; -} - -struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans, - struct btree_insert_entry entry) -{ - struct btree_insert_entry *i; - - BUG_ON(trans->nr_updates >= trans->nr_iters + 4); - - for (i = trans->updates; - i < trans->updates + trans->nr_updates; - i++) - if (btree_trans_cmp(entry, *i) < 0) - break; - - memmove(&i[1], &i[0], - (void *) &trans->updates[trans->nr_updates] - (void *) i); - trans->nr_updates++; - *i = entry; - return i; + /* free updates and memory used by triggers, they'll be reexecuted: */ + trans->nr_updates = orig_nr_updates; + trans->mem_top = orig_mem_top; + goto retry; } /** @@ -987,7 +845,7 @@ retry: iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); + bch2_trans_update(&trans, iter, k); ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); if (ret == -EINTR) @@ -1030,14 +888,14 @@ retry: /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); + bch2_cut_back(end, &delete); ret = bch2_extent_trim_atomic(&delete, iter); if (ret) break; } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); + bch2_trans_update(trans, iter, &delete); ret = bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); @@ -1064,7 +922,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k.k); k.k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); + bch2_trans_update(trans, iter, &k); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE|flags);