git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/btree_update_leaf.c
Update bcachefs sources to fb39031ade bcachefs: bch2_sb_maybe_downgrade(), bch2_sb_up...
[bcachefs-tools-debian] / libbcachefs / btree_update_leaf.c
index de98d7601a0e1dc87211a5cdfb1b8e3d1d5c027f..53219fdcff667b29cf86baeacc19e8fa499b4291 100644 (file)
 #include "recovery.h"
 #include "subvolume.h"
 #include "replicas.h"
+#include "trace.h"
 
 #include <linux/prefetch.h>
 #include <linux/sort.h>
-#include <trace/events/bcachefs.h>
 
 /*
  * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
  * different snapshot:
  */
-struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
 {
        struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
 
@@ -272,8 +272,10 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
 
        bch2_btree_add_journal_pin(c, b, journal_seq);
 
-       if (unlikely(!btree_node_dirty(b)))
+       if (unlikely(!btree_node_dirty(b))) {
+               EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
                set_btree_node_dirty_acct(c, b);
+       }
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
        u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -316,25 +318,11 @@ static noinline int
 bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
                                   unsigned long trace_ip)
 {
-       struct bch_fs *c = trans->c;
-       int ret;
-
-       bch2_trans_unlock(trans);
-
-       ret = bch2_journal_preres_get(&c->journal,
+       return drop_locks_do(trans,
+               bch2_journal_preres_get(&trans->c->journal,
                        &trans->journal_preres,
                        trans->journal_preres_u64s,
-                       (flags & JOURNAL_WATERMARK_MASK));
-       if (ret)
-               return ret;
-
-       ret = bch2_trans_relock(trans);
-       if (ret) {
-               trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0);
-               return ret;
-       }
-
-       return 0;
+                       (flags & BCH_WATERMARK_MASK)));
 }
 
 static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
@@ -421,6 +409,8 @@ static int run_one_mem_trigger(struct btree_trans *trans,
 {
        struct bkey_s_c old = { &i->old_k, i->old_v };
        struct bkey_i *new = i->k;
+       const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+       const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
        int ret;
 
        verify_update_old_key(trans, i);
@@ -431,8 +421,7 @@ static int run_one_mem_trigger(struct btree_trans *trans,
        if (!btree_node_type_needs_gc(i->btree_id))
                return 0;
 
-       if (bch2_bkey_ops[old.k->type].atomic_trigger ==
-           bch2_bkey_ops[i->k->k.type].atomic_trigger &&
+       if (old_ops->atomic_trigger == new_ops->atomic_trigger &&
            ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
                ret   = bch2_mark_key(trans, i->btree_id, i->level,
                                old, bkey_i_to_s_c(new),
@@ -464,6 +453,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
         */
        struct bkey old_k = i->old_k;
        struct bkey_s_c old = { &old_k, i->old_v };
+       const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+       const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
 
        verify_update_old_key(trans, i);
 
@@ -473,8 +464,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
 
        if (!i->insert_trigger_run &&
            !i->overwrite_trigger_run &&
-           bch2_bkey_ops[old.k->type].trans_trigger ==
-           bch2_bkey_ops[i->k->k.type].trans_trigger &&
+           old_ops->trans_trigger == new_ops->trans_trigger &&
            ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
                i->overwrite_trigger_run = true;
                i->insert_trigger_run = true;
@@ -622,14 +612,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 
        prefetch(&trans->c->journal.flags);
 
-       h = trans->hooks;
-       while (h) {
-               ret = h->fn(trans, h);
-               if (ret)
-                       return ret;
-               h = h->next;
-       }
-
        trans_for_each_update(trans, i) {
                /* Multiple inserts might go to same leaf: */
                if (!same_leaf_as_prev(trans, i))
@@ -658,7 +640,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
         */
        if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
                ret = bch2_trans_journal_res_get(trans,
-                               (flags & JOURNAL_WATERMARK_MASK)|
+                               (flags & BCH_WATERMARK_MASK)|
                                JOURNAL_RES_GET_NONBLOCK);
                if (ret)
                        return ret;
@@ -696,6 +678,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                        goto revert_fs_usage;
        }
 
+       h = trans->hooks;
+       while (h) {
+               ret = h->fn(trans, h);
+               if (ret)
+                       goto revert_fs_usage;
+               h = h->next;
+       }
+
        trans_for_each_update(trans, i)
                if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
                        ret = run_one_mem_trigger(trans, i, i->flags);
@@ -868,10 +858,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
        struct printbuf buf = PRINTBUF;
 
        trans_for_each_update(trans, i) {
-               int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
+               enum bkey_invalid_flags invalid_flags = 0;
+
+               if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+                       invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
 
                if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
-                                              i->bkey_type, rw, &buf)))
+                                              i->bkey_type, invalid_flags, &buf)))
                        return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
                btree_insert_entry_checks(trans, i);
        }
@@ -899,7 +892,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
 
        ret = bch2_journal_preres_get(&c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
-                       (flags & JOURNAL_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
+                       (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
        if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
                ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
        if (unlikely(ret))
@@ -961,34 +954,24 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                        trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
                break;
        case -BCH_ERR_btree_insert_need_mark_replicas:
-               bch2_trans_unlock(trans);
-
-               ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas);
-               if (ret)
-                       break;
-
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip);
+               ret = drop_locks_do(trans,
+                       bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
                break;
        case -BCH_ERR_journal_res_get_blocked:
-               bch2_trans_unlock(trans);
-
+               /*
+                * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+                * flag
+                */
                if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
-                   !(flags & JOURNAL_WATERMARK_reserved)) {
+                   (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
                        ret = -BCH_ERR_journal_reclaim_would_deadlock;
                        break;
                }
 
-               ret = bch2_trans_journal_res_get(trans,
-                                       (flags & JOURNAL_WATERMARK_MASK)|
-                                       JOURNAL_RES_GET_CHECK);
-               if (ret)
-                       break;
-
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip);
+               ret = drop_locks_do(trans,
+                       bch2_trans_journal_res_get(trans,
+                                       (flags & BCH_WATERMARK_MASK)|
+                                       JOURNAL_RES_GET_CHECK));
                break;
        case -BCH_ERR_btree_insert_need_journal_reclaim:
                bch2_trans_unlock(trans);
@@ -1001,8 +984,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                        break;
 
                ret = bch2_trans_relock(trans);
-               if (ret)
-                       trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
                break;
        case -BCH_ERR_btree_insert_need_flush_buffer: {
                struct btree_write_buffer *wb = &c->btree_write_buffer;
@@ -1010,20 +991,20 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                ret = 0;
 
                if (wb->state.nr > wb->size * 3 / 4) {
-                       bch2_trans_reset_updates(trans);
                        bch2_trans_unlock(trans);
-
                        mutex_lock(&wb->flush_lock);
 
-                       if (wb->state.nr > wb->size * 3 / 4)
+                       if (wb->state.nr > wb->size * 3 / 4) {
+                               bch2_trans_begin(trans);
                                ret = __bch2_btree_write_buffer_flush(trans,
                                                flags|BTREE_INSERT_NOCHECK_RW, true);
-                       else
+                               if (!ret) {
+                                       trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+                                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+                               }
+                       } else {
                                mutex_unlock(&wb->flush_lock);
-
-                       if (!ret) {
-                               trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
-                               ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+                               ret = bch2_trans_relock(trans);
                        }
                }
                break;
@@ -1053,10 +1034,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
            test_bit(BCH_FS_STARTED, &c->flags))
                return -BCH_ERR_erofs_trans_commit;
 
-       bch2_trans_unlock(trans);
-
-       ret =   bch2_fs_read_write_early(c) ?:
-               bch2_trans_relock(trans);
+       ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
        if (ret)
                return ret;
 
@@ -1251,7 +1229,6 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
                                          struct bpos pos)
 {
        if (!btree_type_has_snapshots(id) ||
-           pos.snapshot == U32_MAX ||
            !snapshot_t(trans->c, pos.snapshot)->children[0])
                return 0;
 
@@ -1268,7 +1245,7 @@ static noinline int extent_front_merge(struct btree_trans *trans,
        struct bkey_i *update;
        int ret;
 
-       update = bch2_bkey_make_mut(trans, k);
+       update = bch2_bkey_make_mut_noupdate(trans, k);
        ret = PTR_ERR_OR_ZERO(update);
        if (ret)
                return ret;
@@ -1343,6 +1320,69 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
 
        return ret;
 }
+
+/*
+ * Walk backwards from @old_pos over all snapshots while keys still compare
+ * equal to @old_pos (per bkey_eq()).  For each such key whose snapshot passes
+ * the bch2_snapshot_is_ancestor() check against old_pos.snapshot and is not
+ * already covered by an entry recorded earlier in this walk, look up the slot
+ * at @new_pos in that key's snapshot; if that slot is empty
+ * (KEY_TYPE_deleted), queue a KEY_TYPE_whiteout update for it.
+ *
+ * Returns 0 on success (including when nothing needed to be done), otherwise
+ * the first error seen from iteration, allocation, the update itself, or
+ * snapshot_list_add().
+ */
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+                                  enum btree_id id,
+                                  struct bpos old_pos,
+                                  struct bpos new_pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter old_iter, new_iter;
+       struct bkey_s_c old_k, new_k;
+       snapshot_id_list s;
+       struct bkey_i *update;
+       /*
+        * Must start at 0: the loop condition short-circuits on a NULL key
+        * before ret is ever assigned.
+        */
+       int ret = 0;
+
+       if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+               return 0;
+
+       darray_init(&s);
+
+       bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+              !(ret = bkey_err(old_k)) &&
+              bkey_eq(old_pos, old_k.k->p)) {
+               struct bpos whiteout_pos =
+                       SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+
+               if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+                   snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+                       continue;
+
+               new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+                                          BTREE_ITER_NOT_EXTENTS|
+                                          BTREE_ITER_INTENT);
+               ret = bkey_err(new_k);
+               if (ret)
+                       break;
+
+               if (new_k.k->type == KEY_TYPE_deleted) {
+                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+                       ret = PTR_ERR_OR_ZERO(update);
+                       if (!ret) {
+                               bkey_init(&update->k);
+                               update->k.p             = whiteout_pos;
+                               update->k.type          = KEY_TYPE_whiteout;
+
+                               ret = bch2_trans_update(trans, &new_iter, update,
+                                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                       }
+               }
+               /* Always release new_iter, including on allocation failure */
+               bch2_trans_iter_exit(trans, &new_iter);
+
+               /* Don't let snapshot_list_add() clobber an update error */
+               if (ret)
+                       break;
+
+               ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &old_iter);
+       darray_exit(&s);
+
+       return ret;
+}
+
 int bch2_trans_update_extent(struct btree_trans *trans,
                             struct btree_iter *orig_iter,
                             struct bkey_i *insert,
@@ -1390,28 +1430,32 @@ int bch2_trans_update_extent(struct btree_trans *trans,
                        trans->extra_journal_res += compressed_sectors;
 
                if (front_split) {
-                       update = bch2_bkey_make_mut(trans, k);
+                       update = bch2_bkey_make_mut_noupdate(trans, k);
                        if ((ret = PTR_ERR_OR_ZERO(update)))
                                goto err;
 
                        bch2_cut_back(start, update);
 
-                       ret = bch2_btree_insert_nonextent(trans, btree_id, update,
-                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+                       ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+                                               k.k->p, update->k.p) ?:
+                               bch2_btree_insert_nonextent(trans, btree_id, update,
+                                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
                        if (ret)
                                goto err;
                }
 
                if (k.k->p.snapshot != insert->k.p.snapshot &&
                    (front_split || back_split)) {
-                       update = bch2_bkey_make_mut(trans, k);
+                       update = bch2_bkey_make_mut_noupdate(trans, k);
                        if ((ret = PTR_ERR_OR_ZERO(update)))
                                goto err;
 
                        bch2_cut_front(start, update);
                        bch2_cut_back(insert->k.p, update);
 
-                       ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+                       ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+                                               k.k->p, update->k.p) ?:
+                               bch2_btree_insert_nonextent(trans, btree_id, update,
                                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
                        if (ret)
                                goto err;
@@ -1426,10 +1470,15 @@ int bch2_trans_update_extent(struct btree_trans *trans,
                        update->k.p = k.k->p;
                        update->k.p.snapshot = insert->k.p.snapshot;
 
-                       if (insert->k.p.snapshot != k.k->p.snapshot ||
-                           (btree_type_has_snapshots(btree_id) &&
-                            need_whiteout_for_snapshot(trans, btree_id, update->k.p)))
+                       if (insert->k.p.snapshot != k.k->p.snapshot) {
                                update->k.type = KEY_TYPE_whiteout;
+                       } else if (btree_type_has_snapshots(btree_id)) {
+                               ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+                               if (ret < 0)
+                                       goto err;
+                               if (ret)
+                                       update->k.type = KEY_TYPE_whiteout;
+                       }
 
                        ret = bch2_btree_insert_nonextent(trans, btree_id, update,
                                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
@@ -1438,7 +1487,7 @@ int bch2_trans_update_extent(struct btree_trans *trans,
                }
 
                if (back_split) {
-                       update = bch2_bkey_make_mut(trans, k);
+                       update = bch2_bkey_make_mut_noupdate(trans, k);
                        if ((ret = PTR_ERR_OR_ZERO(update)))
                                goto err;
 
@@ -1496,21 +1545,31 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
                                            unsigned long ip)
 {
        struct btree_path *btree_path;
+       struct bkey k;
        int ret;
 
-       i->key_cache_already_flushed = true;
-       i->flags |= BTREE_TRIGGER_NORUN;
-
        btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
                                   BTREE_ITER_INTENT, _THIS_IP_);
-
        ret = bch2_btree_path_traverse(trans, btree_path, 0);
        if (ret)
-               goto err;
+               goto out;
+
+       /*
+        * The old key in the insert entry might actually refer to an existing
+        * key in the btree that has been deleted from cache and not yet
+        * flushed. Check for this and skip the flush so we don't run triggers
+        * against a stale key.
+        */
+       bch2_btree_path_peek_slot_exact(btree_path, &k);
+       if (!bkey_deleted(&k))
+               goto out;
+
+       i->key_cache_already_flushed = true;
+       i->flags |= BTREE_TRIGGER_NORUN;
 
        btree_path_set_should_be_locked(btree_path);
        ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip);
-err:
+out:
        bch2_path_put(trans, btree_path, true);
        return ret;
 }
@@ -1591,9 +1650,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa
         * the key cache - but the key has to exist in the btree for that to
         * work:
         */
-       if (path->cached &&
-           bkey_deleted(&i->old_k) &&
-           !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY))
+       if (path->cached && bkey_deleted(&i->old_k))
                return flush_new_cached_update(trans, path, i, flags, ip);
 
        return 0;
@@ -1722,6 +1779,37 @@ int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
        return 0;
 }
 
+/*
+ * Initialise @iter on @btree at POS_MAX (intent), step back to the previous
+ * key, advance the iterator by one and peek the slot there.  That slot is
+ * asserted to be empty (KEY_TYPE_deleted).
+ *
+ * Returns 0 with @iter left initialised on that slot (this path does not call
+ * bch2_trans_iter_exit(), so presumably the caller must).  On any error,
+ * including -BCH_ERR_ENOSPC_btree_slot when the slot lies past @end, @iter is
+ * exited before returning.
+ */
+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+                            enum btree_id btree, struct bpos end)
+{
+       struct bkey_s_c slot;
+       int ret;
+
+       bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+
+       /* Stage 1: find the last existing key */
+       slot = bch2_btree_iter_prev(iter);
+       ret = bkey_err(slot);
+
+       /* Stage 2: move one position past it and look at that slot */
+       if (!ret) {
+               bch2_btree_iter_advance(iter);
+               slot = bch2_btree_iter_peek_slot(iter);
+               ret = bkey_err(slot);
+       }
+
+       /* Stage 3: the slot must be empty and must not lie beyond @end */
+       if (!ret) {
+               BUG_ON(slot.k->type != KEY_TYPE_deleted);
+
+               if (bkey_gt(slot.k->p, end))
+                       ret = -BCH_ERR_ENOSPC_btree_slot;
+       }
+
+       if (ret)
+               bch2_trans_iter_exit(trans, iter);
+       return ret;
+}
+
 void bch2_trans_commit_hook(struct btree_trans *trans,
                            struct btree_trans_commit_hook *h)
 {
@@ -1797,6 +1885,20 @@ int bch2_btree_delete_at(struct btree_trans *trans,
        return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
 }
 
+/*
+ * Queue a buffered update for @pos in @btree using a freshly bkey_init()ed
+ * key, allocated from the transaction via bch2_trans_kmalloc().
+ *
+ * Returns the allocation error if the key could not be allocated, otherwise
+ * whatever bch2_trans_update_buffered() returns.
+ */
+int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+                                 enum btree_id btree, struct bpos pos)
+{
+       struct bkey_i *del = bch2_trans_kmalloc(trans, sizeof(*del));
+       int ret = PTR_ERR_OR_ZERO(del);
+
+       if (ret)
+               return ret;
+
+       bkey_init(&del->k);
+       del->k.p = pos;
+
+       return bch2_trans_update_buffered(trans, btree, del);
+}
+
 int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
                                  struct bpos start, struct bpos end,
                                  unsigned update_flags,
@@ -1919,14 +2021,19 @@ err:
        return ret;
 }
 
-int bch2_trans_log_msg(struct btree_trans *trans, const char *fmt, ...)
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+                 va_list args)
 {
-       va_list args;
        int ret;
 
-       va_start(args, fmt);
-       ret = __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args);
-       va_end(args);
+       if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+               ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+       } else {
+               ret = bch2_trans_do(c, NULL, NULL,
+                       BTREE_INSERT_LAZY_RW|commit_flags,
+                       __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
+       }
 
        return ret;
 }
@@ -1937,16 +2044,22 @@ int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
        int ret;
 
        va_start(args, fmt);
+       ret = __bch2_fs_log_msg(c, 0, fmt, args);
+       va_end(args);
+       return ret;
+}
 
-       if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
-               ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
-       } else {
-               ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
-                       __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
-       }
+/*
+ * Use for logging messages during recovery to enable reserved space and avoid
+ * blocking.
+ */
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+       va_list args;
+       int ret;
 
+       va_start(args, fmt);
+       ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
        va_end(args);
-
        return ret;
-
 }