git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/btree_update_leaf.c
Update bcachefs sources to bca25b802d fixup! bcachefs: Fix bch2_check_discard_freespa...
[bcachefs-tools-debian] / libbcachefs / btree_update_leaf.c
index c93c132dd815d963360dfe5060dd74925e3ba915..b42b83c55c5bf58ba24606e19f6fed54d72fe4fd 100644 (file)
 #include "recovery.h"
 #include "subvolume.h"
 #include "replicas.h"
+#include "trace.h"
 
 #include <linux/prefetch.h>
 #include <linux/sort.h>
-#include <trace/events/bcachefs.h>
 
 /*
  * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
@@ -227,12 +227,12 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        return 0;
 }
 
-static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
        return __btree_node_flush(j, pin, 0, seq);
 }
 
-static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
        return __btree_node_flush(j, pin, 1, seq);
 }
@@ -244,8 +244,8 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
 
        bch2_journal_pin_add(&c->journal, seq, &w->journal,
                             btree_node_write_idx(b) == 0
-                            ? btree_node_flush0
-                            : btree_node_flush1);
+                            ? bch2_btree_node_flush0
+                            : bch2_btree_node_flush1);
 }
 
 /**
@@ -316,25 +316,11 @@ static noinline int
 bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
                                   unsigned long trace_ip)
 {
-       struct bch_fs *c = trans->c;
-       int ret;
-
-       bch2_trans_unlock(trans);
-
-       ret = bch2_journal_preres_get(&c->journal,
+       return drop_locks_do(trans,
+               bch2_journal_preres_get(&trans->c->journal,
                        &trans->journal_preres,
                        trans->journal_preres_u64s,
-                       (flags & JOURNAL_WATERMARK_MASK));
-       if (ret)
-               return ret;
-
-       ret = bch2_trans_relock(trans);
-       if (ret) {
-               trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0);
-               return ret;
-       }
-
-       return 0;
+                       (flags & JOURNAL_WATERMARK_MASK)));
 }
 
 static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
@@ -401,7 +387,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
        if (!new_k) {
                bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
                        bch2_btree_ids[path->btree_id], new_u64s);
-               return -ENOMEM;
+               return -BCH_ERR_ENOMEM_btree_key_cache_insert;
        }
 
        trans_for_each_update(trans, i)
@@ -622,14 +608,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 
        prefetch(&trans->c->journal.flags);
 
-       h = trans->hooks;
-       while (h) {
-               ret = h->fn(trans, h);
-               if (ret)
-                       return ret;
-               h = h->next;
-       }
-
        trans_for_each_update(trans, i) {
                /* Multiple inserts might go to same leaf: */
                if (!same_leaf_as_prev(trans, i))
@@ -696,6 +674,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                        goto revert_fs_usage;
        }
 
+       h = trans->hooks;
+       while (h) {
+               ret = h->fn(trans, h);
+               if (ret)
+                       goto revert_fs_usage;
+               h = h->next;
+       }
+
        trans_for_each_update(trans, i)
                if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
                        ret = run_one_mem_trigger(trans, i, i->flags);
@@ -765,7 +751,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                if (!i->cached)
                        btree_insert_key_leaf(trans, i);
                else if (!i->key_cache_already_flushed)
-                       bch2_btree_insert_key_cached(trans, flags, i->path, i->k);
+                       bch2_btree_insert_key_cached(trans, flags, i);
                else {
                        bch2_btree_key_cache_drop(trans, i->path);
                        btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
@@ -961,34 +947,20 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                        trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
                break;
        case -BCH_ERR_btree_insert_need_mark_replicas:
-               bch2_trans_unlock(trans);
-
-               ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas);
-               if (ret)
-                       break;
-
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip);
+               ret = drop_locks_do(trans,
+                       bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
                break;
        case -BCH_ERR_journal_res_get_blocked:
-               bch2_trans_unlock(trans);
-
                if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
                    !(flags & JOURNAL_WATERMARK_reserved)) {
                        ret = -BCH_ERR_journal_reclaim_would_deadlock;
                        break;
                }
 
-               ret = bch2_trans_journal_res_get(trans,
+               ret = drop_locks_do(trans,
+                       bch2_trans_journal_res_get(trans,
                                        (flags & JOURNAL_WATERMARK_MASK)|
-                                       JOURNAL_RES_GET_CHECK);
-               if (ret)
-                       break;
-
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip);
+                                       JOURNAL_RES_GET_CHECK));
                break;
        case -BCH_ERR_btree_insert_need_journal_reclaim:
                bch2_trans_unlock(trans);
@@ -1001,8 +973,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                        break;
 
                ret = bch2_trans_relock(trans);
-               if (ret)
-                       trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
                break;
        case -BCH_ERR_btree_insert_need_flush_buffer: {
                struct btree_write_buffer *wb = &c->btree_write_buffer;
@@ -1010,20 +980,20 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                ret = 0;
 
                if (wb->state.nr > wb->size * 3 / 4) {
-                       bch2_trans_reset_updates(trans);
                        bch2_trans_unlock(trans);
-
                        mutex_lock(&wb->flush_lock);
 
-                       if (wb->state.nr > wb->size * 3 / 4)
+                       if (wb->state.nr > wb->size * 3 / 4) {
+                               bch2_trans_begin(trans);
                                ret = __bch2_btree_write_buffer_flush(trans,
                                                flags|BTREE_INSERT_NOCHECK_RW, true);
-                       else
+                               if (!ret) {
+                                       trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+                                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+                               }
+                       } else {
                                mutex_unlock(&wb->flush_lock);
-
-                       if (!ret) {
-                               trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
-                               ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+                               ret = bch2_trans_relock(trans);
                        }
                }
                break;
@@ -1053,10 +1023,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
            test_bit(BCH_FS_STARTED, &c->flags))
                return -BCH_ERR_erofs_trans_commit;
 
-       bch2_trans_unlock(trans);
-
-       ret =   bch2_fs_read_write_early(c) ?:
-               bch2_trans_relock(trans);
+       ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
        if (ret)
                return ret;
 
@@ -1268,7 +1235,7 @@ static noinline int extent_front_merge(struct btree_trans *trans,
        struct bkey_i *update;
        int ret;
 
-       update = bch2_bkey_make_mut(trans, k);
+       update = bch2_bkey_make_mut_noupdate(trans, k);
        ret = PTR_ERR_OR_ZERO(update);
        if (ret)
                return ret;
@@ -1343,6 +1310,69 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
 
        return ret;
 }
+
+/*
+ * Insert whiteouts at @new_pos for keys found at @old_pos in other snapshots.
+ *
+ * Walks backwards from @old_pos over all snapshots and, for every key that is
+ * still at @old_pos, passes the bch2_snapshot_is_ancestor() check against
+ * @old_pos.snapshot and has no ancestor already recorded in the local list,
+ * looks up the slot at (@new_pos.inode, @new_pos.offset, <that snapshot>).
+ * If that slot is KEY_TYPE_deleted, a KEY_TYPE_whiteout is queued there.
+ *
+ * Does nothing when bch2_snapshot_has_children() reports no children for
+ * @old_pos.snapshot.
+ *
+ * Returns 0 on success, or the error produced by whichever helper failed.
+ */
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+                                  enum btree_id id,
+                                  struct bpos old_pos,
+                                  struct bpos new_pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter old_iter, new_iter;
+       struct bkey_s_c old_k, new_k;
+       snapshot_id_list s;
+       struct bkey_i *update;
+       /*
+        * Must start at 0: if the very first bch2_btree_iter_prev() returns a
+        * NULL key the loop condition short-circuits before ret is assigned.
+        */
+       int ret = 0;
+
+       if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+               return 0;
+
+       darray_init(&s);
+
+       bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+              !(ret = bkey_err(old_k)) &&
+              bkey_eq(old_pos, old_k.k->p)) {
+               struct bpos whiteout_pos =
+                       SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+
+               if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+                   snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+                       continue;
+
+               /*
+                * NOTE(review): on failure we break without exiting new_iter,
+                * as the original did - confirm bch2_bkey_get_iter() cleans up
+                * the iterator itself when it returns an error.
+                */
+               new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+                                          BTREE_ITER_NOT_EXTENTS|
+                                          BTREE_ITER_INTENT);
+               ret = bkey_err(new_k);
+               if (ret)
+                       break;
+
+               if (new_k.k->type == KEY_TYPE_deleted) {
+                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+                       ret = PTR_ERR_OR_ZERO(update);
+                       if (!ret) {
+                               bkey_init(&update->k);
+                               update->k.p             = whiteout_pos;
+                               update->k.type          = KEY_TYPE_whiteout;
+
+                               ret = bch2_trans_update(trans, &new_iter, update,
+                                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                       }
+               }
+               /* Always release new_iter, including on allocation failure. */
+               bch2_trans_iter_exit(trans, &new_iter);
+
+               /* Don't let snapshot_list_add() overwrite an update error. */
+               if (ret)
+                       break;
+
+               ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &old_iter);
+       darray_exit(&s);
+
+       return ret;
+}
+
 int bch2_trans_update_extent(struct btree_trans *trans,
                             struct btree_iter *orig_iter,
                             struct bkey_i *insert,
@@ -1390,28 +1420,32 @@ int bch2_trans_update_extent(struct btree_trans *trans,
                        trans->extra_journal_res += compressed_sectors;
 
                if (front_split) {
-                       update = bch2_bkey_make_mut(trans, k);
+                       update = bch2_bkey_make_mut_noupdate(trans, k);
                        if ((ret = PTR_ERR_OR_ZERO(update)))
                                goto err;
 
                        bch2_cut_back(start, update);
 
-                       ret = bch2_btree_insert_nonextent(trans, btree_id, update,
-                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+                       ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+                                               k.k->p, update->k.p) ?:
+                               bch2_btree_insert_nonextent(trans, btree_id, update,
+                                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
                        if (ret)
                                goto err;
                }
 
                if (k.k->p.snapshot != insert->k.p.snapshot &&
                    (front_split || back_split)) {
-                       update = bch2_bkey_make_mut(trans, k);
+                       update = bch2_bkey_make_mut_noupdate(trans, k);
                        if ((ret = PTR_ERR_OR_ZERO(update)))
                                goto err;
 
                        bch2_cut_front(start, update);
                        bch2_cut_back(insert->k.p, update);
 
-                       ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+                       ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+                                               k.k->p, update->k.p) ?:
+                               bch2_btree_insert_nonextent(trans, btree_id, update,
                                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
                        if (ret)
                                goto err;
@@ -1426,10 +1460,15 @@ int bch2_trans_update_extent(struct btree_trans *trans,
                        update->k.p = k.k->p;
                        update->k.p.snapshot = insert->k.p.snapshot;
 
-                       if (insert->k.p.snapshot != k.k->p.snapshot ||
-                           (btree_type_has_snapshots(btree_id) &&
-                            need_whiteout_for_snapshot(trans, btree_id, update->k.p)))
+                       if (insert->k.p.snapshot != k.k->p.snapshot) {
                                update->k.type = KEY_TYPE_whiteout;
+                       } else if (btree_type_has_snapshots(btree_id)) {
+                               ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+                               if (ret < 0)
+                                       goto err;
+                               if (ret)
+                                       update->k.type = KEY_TYPE_whiteout;
+                       }
 
                        ret = bch2_btree_insert_nonextent(trans, btree_id, update,
                                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
@@ -1438,7 +1477,7 @@ int bch2_trans_update_extent(struct btree_trans *trans,
                }
 
                if (back_split) {
-                       update = bch2_bkey_make_mut(trans, k);
+                       update = bch2_bkey_make_mut_noupdate(trans, k);
                        if ((ret = PTR_ERR_OR_ZERO(update)))
                                goto err;
 
@@ -1496,21 +1535,31 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
                                            unsigned long ip)
 {
        struct btree_path *btree_path;
+       struct bkey k;
        int ret;
 
-       i->key_cache_already_flushed = true;
-       i->flags |= BTREE_TRIGGER_NORUN;
-
        btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
                                   BTREE_ITER_INTENT, _THIS_IP_);
-
        ret = bch2_btree_path_traverse(trans, btree_path, 0);
        if (ret)
-               goto err;
+               goto out;
+
+       /*
+        * The old key in the insert entry might actually refer to an existing
+        * key in the btree that has been deleted from cache and not yet
+        * flushed. Check for this and skip the flush so we don't run triggers
+        * against a stale key.
+        */
+       bch2_btree_path_peek_slot_exact(btree_path, &k);
+       if (!bkey_deleted(&k))
+               goto out;
+
+       i->key_cache_already_flushed = true;
+       i->flags |= BTREE_TRIGGER_NORUN;
 
        btree_path_set_should_be_locked(btree_path);
        ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip);
-err:
+out:
        bch2_path_put(trans, btree_path, true);
        return ret;
 }
@@ -1591,9 +1640,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa
         * the key cache - but the key has to exist in the btree for that to
         * work:
         */
-       if (path->cached &&
-           bkey_deleted(&i->old_k) &&
-           !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY))
+       if (path->cached && bkey_deleted(&i->old_k))
                return flush_new_cached_update(trans, path, i, flags, ip);
 
        return 0;
@@ -1722,6 +1769,37 @@ int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
        return 0;
 }
 
+/*
+ * Position @iter on the slot just past the last key in @btree.
+ *
+ * The iterator is initialised at POS_MAX, stepped back to the previous key
+ * and then advanced by one position; the slot found there must be
+ * KEY_TYPE_deleted (BUG otherwise).
+ *
+ * Returns 0 with @iter left initialised for the caller, or a negative error
+ * with @iter already exited: either an iterator error, or
+ * -BCH_ERR_ENOSPC_btree_slot when the slot lies beyond @end.
+ */
+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+                            enum btree_id btree, struct bpos end)
+{
+       struct bkey_s_c slot;
+       int ret;
+
+       bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+
+       slot = bch2_btree_iter_prev(iter);
+       ret = bkey_err(slot);
+       if (ret)
+               goto out_exit;
+
+       bch2_btree_iter_advance(iter);
+
+       slot = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(slot);
+       if (ret)
+               goto out_exit;
+
+       BUG_ON(slot.k->type != KEY_TYPE_deleted);
+
+       /* Success: the iterator stays live and is handed to the caller. */
+       if (!bkey_gt(slot.k->p, end))
+               return 0;
+
+       ret = -BCH_ERR_ENOSPC_btree_slot;
+out_exit:
+       bch2_trans_iter_exit(trans, iter);
+       return ret;
+}
+
 void bch2_trans_commit_hook(struct btree_trans *trans,
                            struct btree_trans_commit_hook *h)
 {
@@ -1797,6 +1875,20 @@ int bch2_btree_delete_at(struct btree_trans *trans,
        return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
 }
 
+/*
+ * Queue a freshly initialised key at @pos in @btree through
+ * bch2_trans_update_buffered().
+ *
+ * The key is allocated with bch2_trans_kmalloc() and only bkey_init()'d
+ * before its position is set (presumably leaving it a deleted key - confirm
+ * against bkey_init()).
+ *
+ * Returns the allocation error if the key cannot be allocated, otherwise
+ * whatever bch2_trans_update_buffered() returns.
+ */
+int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+                                 enum btree_id btree, struct bpos pos)
+{
+       struct bkey_i *del = bch2_trans_kmalloc(trans, sizeof(*del));
+       int ret = PTR_ERR_OR_ZERO(del);
+
+       if (ret)
+               return ret;
+
+       bkey_init(&del->k);
+       del->k.p = pos;
+
+       return bch2_trans_update_buffered(trans, btree, del);
+}
+
 int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
                                  struct bpos start, struct bpos end,
                                  unsigned update_flags,
@@ -1891,7 +1983,7 @@ static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list ar
        int ret;
 
        prt_vprintf(&buf, fmt, args);
-       ret = buf.allocation_failure ? -ENOMEM : 0;
+       ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
        if (ret)
                goto err;
 
@@ -1919,14 +2011,19 @@ err:
        return ret;
 }
 
-int bch2_trans_log_msg(struct btree_trans *trans, const char *fmt, ...)
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+                 va_list args)
 {
-       va_list args;
        int ret;
 
-       va_start(args, fmt);
-       ret = __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args);
-       va_end(args);
+       if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+               ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+       } else {
+               ret = bch2_trans_do(c, NULL, NULL,
+                       BTREE_INSERT_LAZY_RW|commit_flags,
+                       __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
+       }
 
        return ret;
 }
@@ -1937,16 +2034,22 @@ int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
        int ret;
 
        va_start(args, fmt);
+       ret = __bch2_fs_log_msg(c, 0, fmt, args);
+       va_end(args);
+       return ret;
+}
 
-       if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
-               ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
-       } else {
-               ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
-                       __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
-       }
+/*
+ * Use for logging messages during recovery to enable reserved space and avoid
+ * blocking.
+ */
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+       va_list args;
+       int ret;
 
+       va_start(args, fmt);
+       ret = __bch2_fs_log_msg(c, JOURNAL_WATERMARK_reserved, fmt, args);
        va_end(args);
-
        return ret;
-
 }